import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.ticker as mticker
import seaborn as sns
import plotly.express as px
import warnings
warnings.filterwarnings('ignore')

# Set global plot style
sns.set_theme(style='darkgrid')
plt.rcParams['figure.figsize'] = (12, 5)
plt.rcParams['axes.titlesize'] = 14
plt.rcParams['axes.titleweight'] = 'bold'

# File paths
BASE = '/path/to/data/Sales Report Automation Tutorial Files'
TRANSACTIONS_PATH = f'{BASE}/TransactionsHistory.xlsx'
CUSTOMERS_PATH    = f'{BASE}/CustomerMD.csv'
PRODUCTS_PATH     = f'{BASE}/ProductMD.csv'

# Load data
print('Loading transactions...')
transactions = pd.read_excel(TRANSACTIONS_PATH)

print('Loading customer master data...')
customers = pd.read_csv(CUSTOMERS_PATH, sep=';', encoding='latin-1')

print('Loading product master data...')
products = pd.read_csv(PRODUCTS_PATH, encoding='latin-1')

print(f'\nTransactions loaded: {len(transactions):,} rows')
print(f'Customers loaded:    {len(customers):,} rows')
print(f'Products loaded:     {len(products):,} rows')

Loading transactions...

Loading customer master data...
Loading product master data...

Transactions loaded: 545,592 rows
Customers loaded:    5,856 rows
Products loaded:     3,812 rows

# Preview each dataset
print('=== TRANSACTIONS ===')
display(transactions.head())
print(f'Shape: {transactions.shape}')
print('\nData types:')
print(transactions.dtypes)

=== TRANSACTIONS ===

Shape: (545592, 6)

Data types:
Invoice          object
StockCode        object
Quantity          int64
Price           float64
Customer_ID       int64
Invoice_time     object
dtype: object

print('=== CUSTOMERS ===')
display(customers.head())
print(f'\n=== PRODUCTS ===')
display(products.head())
print(f'\nProduct categories: {products["Category"].nunique()}')
print(products['Category'].value_counts())

=== CUSTOMERS ===

=== PRODUCTS ===

Product categories: 8
Category
Home             1441
Accessories       585
Stationery        568
Air fragances     496
Seasonal          279
Hobbies           216
Kids              206
Others             21
Name: count, dtype: int64

# Check for missing values
print('=== MISSING VALUES ===')
print('\nTransactions:')
print(transactions.isnull().sum())
print('\nCustomers:')
print(customers.isnull().sum())
print('\nProducts:')
print(products.isnull().sum())

=== MISSING VALUES ===

Transactions:
Invoice         0
StockCode       0
Quantity        0
Price           0
Customer_ID     0
Invoice_time    0
dtype: int64

Customers:
Customer_ID    0
Country        0
dtype: int64

Products:
StockCode      0
Description    0
Category       0
Subcategory    0
dtype: int64

# --- Parse Invoice_time ---
# Format is like 'D01/12/17T07:45:00' — strip leading 'D' then parse
transactions['Invoice_time'] = transactions['Invoice_time'].astype(str).str.lstrip('D')
transactions['Invoice_time'] = pd.to_datetime(transactions['Invoice_time'], format='%d/%m/%yT%H:%M:%S', errors='coerce')

# --- Remove returns (negative quantities) and zero-price rows ---
print(f'Rows before cleaning: {len(transactions):,}')
transactions = transactions[transactions['Quantity'] > 0]
transactions = transactions[transactions['Price'] > 0]
print(f'Rows after removing returns/zero-price: {len(transactions):,}')

# --- Drop rows with missing Customer_ID ---
transactions = transactions.dropna(subset=['Customer_ID'])
transactions['Customer_ID'] = transactions['Customer_ID'].astype(int)
print(f'Rows after dropping missing Customer_ID: {len(transactions):,}')

# --- Add Revenue column ---
transactions['Revenue'] = transactions['Quantity'] * transactions['Price']

# --- Extract date parts for time-series analysis ---
transactions['Year']       = transactions['Invoice_time'].dt.year
transactions['Month']      = transactions['Invoice_time'].dt.month
transactions['YearMonth']  = transactions['Invoice_time'].dt.to_period('M')

print('\nCleaned transactions preview:')
display(transactions.head())

Rows before cleaning: 545,592

Rows after removing returns/zero-price: 532,294

Rows after dropping missing Customer_ID: 532,294

Cleaned transactions preview:

# --- Merge transactions with customer country data ---
df = transactions.merge(customers, on='Customer_ID', how='left')

# --- Merge with product category data ---
df = df.merge(products[['StockCode', 'Description', 'Category', 'Subcategory']], on='StockCode', how='left')

print(f'Final merged dataset shape: {df.shape}')
display(df.head())

Final merged dataset shape: (532294, 14)

# Aggregate revenue by country across all markets
revenue_by_country_all = (
    df.groupby('Country')['Revenue']
    .sum()
    .reset_index()
    .rename(columns={'Revenue': 'Total_Revenue'})
    .sort_values('Total_Revenue', ascending=False)
)

fig = px.choropleth(
    revenue_by_country_all,
    locations='Country',
    locationmode='country names',
    color='Total_Revenue',
    color_continuous_scale='Blues',
    title='Total Revenue by Country',
    labels={'Total_Revenue': 'Total Revenue (£)'},
    hover_name='Country',
    hover_data={'Total_Revenue': ':,.0f', 'Country': False}
)
fig.update_layout(
    title_font_size=16,
    geo=dict(
        showframe=False,
        showcoastlines=True,
        projection_type='natural earth'
    ),
    coloraxis_colorbar=dict(title='Revenue (£)')
)
fig.show()

print('\nTop 15 Countries by Revenue:')
top15 = revenue_by_country_all.head(15).copy()
top15['Total_Revenue'] = top15['Total_Revenue'].apply(lambda x: f'£{x:,.2f}')
print(top15.to_string(index=False))

Top 15 Countries by Revenue:
        Country  Total_Revenue
 United Kingdom £10,108,978.85
           EIRE    £444,273.95
    Netherlands    £371,358.22
        Germany    £293,820.67
         France    £214,385.07
      Australia    £109,700.20
          Spain     £73,270.70
    Switzerland     £68,574.33
         Sweden     £57,425.49
        Denmark     £51,787.21
        Belgium     £40,199.01
       Portugal     £38,466.34
         Norway     £33,039.67
Channel Islands     £31,771.42
          Japan     £29,033.27

# Revenue by product category (exclude unmatched rows)
revenue_by_category = (
    df.dropna(subset=['Category'])
    .groupby('Category')['Revenue']
    .sum()
    .sort_values(ascending=False)
)

fig, axes = plt.subplots(1, 2, figsize=(14, 6))

# Bar chart
axes[0].barh(revenue_by_category.index, revenue_by_category.values,
             color=sns.color_palette('Set2', len(revenue_by_category)))
axes[0].set_title('Revenue by Product Category')
axes[0].set_xlabel('Total Revenue (£)')
axes[0].xaxis.set_major_formatter(mticker.FuncFormatter(lambda x, _: f'£{x:,.0f}'))

# Pie chart
axes[1].pie(revenue_by_category.values, labels=revenue_by_category.index,
            autopct='%1.1f%%', colors=sns.color_palette('Set2', len(revenue_by_category)),
            startangle=140)
axes[1].set_title('Revenue Share by Category')

plt.tight_layout()
plt.savefig('revenue_by_category.png', dpi=150, bbox_inches='tight')
plt.show()

# Aggregate revenue by month and sort chronologically
monthly_revenue = (
    df.groupby('YearMonth')['Revenue']
    .sum()
    .sort_index()
)

fig, ax = plt.subplots(figsize=(14, 5))
ax.plot(monthly_revenue.index.astype(str), monthly_revenue.values,
        marker='o', linewidth=2, color='steelblue', markersize=5)
ax.fill_between(monthly_revenue.index.astype(str), monthly_revenue.values, alpha=0.2, color='steelblue')
ax.set_title('Monthly Revenue Trend')
ax.set_xlabel('Month')
ax.set_ylabel('Total Revenue (£)')
ax.yaxis.set_major_formatter(mticker.FuncFormatter(lambda x, _: f'£{x:,.0f}'))
plt.xticks(rotation=45, ha='right')
plt.tight_layout()
plt.savefig('monthly_revenue_trend.png', dpi=150, bbox_inches='tight')
plt.show()

print(f'Peak revenue month: {monthly_revenue.idxmax()} — £{monthly_revenue.max():,.2f}')
print(f'Lowest revenue month: {monthly_revenue.idxmin()} — £{monthly_revenue.min():,.2f}')

Peak revenue month: 2018-11 — £1,022,742.75
Lowest revenue month: 2019-02 — £381,336.66

# Top 10 customers by total revenue spent
top_customers = (
    df.groupby('Customer_ID')
    .agg(Total_Revenue=('Revenue', 'sum'),
         Total_Orders=('Invoice', 'nunique'),
         Country=('Country', 'first'))
    .sort_values('Total_Revenue', ascending=False)
    .head(10)
    .reset_index()
)

fig, ax = plt.subplots(figsize=(12, 5))
bars = ax.bar(top_customers['Customer_ID'].astype(str),
              top_customers['Total_Revenue'],
              color=sns.color_palette('magma', 10))
ax.set_title('Top 10 Customers by Total Revenue')
ax.set_xlabel('Customer ID')
ax.set_ylabel('Total Revenue (£)')
ax.yaxis.set_major_formatter(mticker.FuncFormatter(lambda x, _: f'£{x:,.0f}'))
plt.tight_layout()
plt.savefig('top_customers.png', dpi=150, bbox_inches='tight')
plt.show()

print('\nTop 10 Customers:')
top_customers['Total_Revenue'] = top_customers['Total_Revenue'].apply(lambda x: f'£{x:,.2f}')
display(top_customers)

Top 10 Customers:

# Top 10 products by revenue — use Description from products table where available
top_products = (
    df.dropna(subset=['Description'])
    .groupby('Description')['Revenue']
    .sum()
    .sort_values(ascending=False)
    .head(10)
)

fig, ax = plt.subplots(figsize=(12, 6))
ax.barh(top_products.index[::-1], top_products.values[::-1],
        color=sns.color_palette('viridis', 10))
ax.set_title('Top 10 Products by Total Revenue')
ax.set_xlabel('Total Revenue (£)')
ax.xaxis.set_major_formatter(mticker.FuncFormatter(lambda x, _: f'£{x:,.0f}'))
plt.tight_layout()
plt.savefig('top_products.png', dpi=150, bbox_inches='tight')
plt.show()

total_revenue    = df['Revenue'].sum()
total_orders     = df['Invoice'].nunique()
total_customers  = df['Customer_ID'].nunique()
total_products   = df['StockCode'].nunique()
top_country      = df.groupby('Country')['Revenue'].sum().idxmax()
top_category     = df.dropna(subset=['Category']).groupby('Category')['Revenue'].sum().idxmax()
date_range       = f"{df['Invoice_time'].min().strftime('%b %Y')} — {df['Invoice_time'].max().strftime('%b %Y')}"

print('=' * 50)
print('       SALES PERFORMANCE SUMMARY')
print('=' * 50)
print(f'  Date Range:        {date_range}')
print(f'  Total Revenue:     £{total_revenue:,.2f}')
print(f'  Total Orders:      {total_orders:,}')
print(f'  Unique Customers:  {total_customers:,}')
print(f'  Unique Products:   {total_products:,}')
print(f'  Top Country:       {top_country}')
print(f'  Top Category:      {top_category}')
print('=' * 50)

==================================================
       SALES PERFORMANCE SUMMARY
==================================================
  Date Range:        Dec 2017 — Aug 2019
  Total Revenue:     £12,175,404.81
  Total Orders:      29,418
  Unique Customers:  5,234
  Unique Products:   3,404
  Top Country:       United Kingdom
  Top Category:      Home
==================================================

	Invoice	StockCode	Quantity	Price	Customer_ID	Invoice_time
0	489434	85048	12	6.95	13085	D01/12/17T07:45:00
1	489434	79323P	12	6.75	13085	D01/12/17T07:45:00
2	489434	79323W	12	6.75	13085	D01/12/17T07:45:00
3	489434	22041	48	2.10	13085	D01/12/17T07:45:00
4	489434	21232	24	1.25	13085	D01/12/17T07:45:00

	Customer_ID	Country
0	12346	United Kingdom
1	12347	Iceland
2	12348	Finland
3	12349	Italy
4	12350	Norway

	StockCode	Description	Category	Subcategory
0	10002R	ROBOT PENCIL SHARPNER	Stationery	Desktop Stationery
1	10109	BENDY COLOUR PENCILS	Stationery	Desktop Stationery
2	10120	DOGGY RUBBER	Stationery	Desktop Stationery
3	10123C	HEARTS WRAPPING TAPE	Stationery	Desktop Stationery
4	10123G	ARMY CAMO WRAPPING TAPE	Stationery	Desktop Stationery

	Invoice	StockCode	Quantity	Price	Customer_ID	Invoice_time	Revenue	Year	Month	YearMonth
0	489434	85048	12	6.95	13085	2017-12-01 07:45:00	83.4	2017	12	2017-12
1	489434	79323P	12	6.75	13085	2017-12-01 07:45:00	81.0	2017	12	2017-12
2	489434	79323W	12	6.75	13085	2017-12-01 07:45:00	81.0	2017	12	2017-12
3	489434	22041	48	2.10	13085	2017-12-01 07:45:00	100.8	2017	12	2017-12
4	489434	21232	24	1.25	13085	2017-12-01 07:45:00	30.0	2017	12	2017-12

	Invoice	StockCode	Quantity	Price	Customer_ID	Invoice_time	Revenue	Year	Month	YearMonth	Country	Description	Category	Subcategory
0	489434	85048	12	6.95	13085	2017-12-01 07:45:00	83.4	2017	12	2017-12	United Kingdom	15CM CHRISTMAS GLASS BALL 20 LIGHTS	Seasonal	Christmas
1	489434	79323P	12	6.75	13085	2017-12-01 07:45:00	81.0	2017	12	2017-12	United Kingdom	PINK CHERRY LIGHTS	Home	Furniture
2	489434	79323W	12	6.75	13085	2017-12-01 07:45:00	81.0	2017	12	2017-12	United Kingdom	WHITE CHERRY LIGHTS	Home	Furniture
3	489434	22041	48	2.10	13085	2017-12-01 07:45:00	100.8	2017	12	2017-12	United Kingdom	RECORD FRAME 7" SINGLE SIZE	Hobbies	Photo
4	489434	21232	24	1.25	13085	2017-12-01 07:45:00	30.0	2017	12	2017-12	United Kingdom	STRAWBERRY CERAMIC TRINKET BOX	Accessories	Bijoux

Sales Performance Analysis¶

Overview¶

1. Setup & Data Loading¶

2. Initial Data Exploration¶

3. Data Cleaning¶

4. Analysis & Visualizations¶

4.1 Revenue by Country (Top 15)¶

4.2 Revenue by Product Category¶

4.3 Monthly Revenue Trend¶

4.4 Top 10 Customers by Revenue¶

4.5 Top 10 Products by Revenue¶

5. Key Findings Summary¶

6. Conclusions & Recommendations¶

What the Data Tells Us¶

Recommendations¶

	Customer_ID	Total_Revenue	Total_Orders	Country
0	18102	£447,094.58	107	United Kingdom
1	14646	£352,857.03	117	Netherlands
2	14156	£244,563.10	135	EIRE
3	14911	£190,366.63	301	EIRE
4	13694	£153,166.00	125	United Kingdom
5	17450	£124,144.86	26	United Kingdom
6	17511	£123,457.99	46	United Kingdom
7	15061	£108,491.58	110	United Kingdom
8	16684	£106,242.56	43	United Kingdom
9	17949	£101,184.55	97	United Kingdom

#	Recommendation	Rationale
1	Develop international market strategy	Countries with high order frequency but low revenue per order are candidates for upsell campaigns or bundled product offerings
2	Protect and grow top-10 customer relationships	Concentrated revenue from a small customer base warrants proactive account management and loyalty incentives
3	Build Q4 operational capacity in advance	Seasonal peaks require inventory build-up and logistics planning starting in Q3
4	Invest in Home and Accessories categories	These categories already have proven demand — deeper product variety and promotional focus would likely yield strong returns
5	Monitor and diversify UK dependency	Long-term resilience requires growing international revenue share, particularly in high-performing European markets