import pandas as pd
import matplotlib.pyplot as plt
# Part A: Load & Explore Data
# Load all CSV files
sales = pd.read_csv('sales.csv')
inventory = pd.read_csv('inventory.csv')
customers = pd.read_csv('customers.csv')
# Explore data
print("Sales Data:")
print(sales.head())
print(sales.info())
print(sales.describe())
print("\nInventory Data:")
print(inventory.head())
print(inventory.info())
print(inventory.describe())
print("\nCustomer Data:")
print(customers.head())
print(customers.info())
print(customers.describe())
# Rename confusing columns
sales = sales.rename(columns={'Unit_Price': 'Price_per_Unit'})
inventory = inventory.rename(columns={'Unit_Price': 'Price_per_Unit'})
# Convert Date column to datetime
sales['Date'] = pd.to_datetime(sales['Date'])
# Part B: Data Cleaning
# Handle missing values
sales = sales.dropna()
inventory = inventory.dropna()
customers = customers.dropna()
# Drop duplicates
sales = sales.drop_duplicates()
inventory = inventory.drop_duplicates()
customers = customers.drop_duplicates()
# Ensure proper data types
sales['Quantity'] = sales['Quantity'].astype(int)
customers['Age'] = customers['Age'].astype(int)
# Part C: Data Analysis
# 1. Sales Insights
# Total revenue generated
sales['Total_Sale'] = sales['Quantity'] * sales['Price_per_Unit']
total_revenue = sales['Total_Sale'].sum()
print(f"\nTotal Revenue: ₹{total_revenue:.2f}")
# Top 5 products by sales revenue
product_revenue = sales.groupby('Product_ID')
['Total_Sale'].sum().sort_values(ascending=False)
print("\nTop 5 Products by Revenue:")
print(product_revenue.head())
# Monthly revenue trend
sales['Month'] = sales['Date'].dt.month
monthly_revenue = sales.groupby('Month')['Total_Sale'].sum()
print("\nMonthly Revenue Trend:")
print(monthly_revenue)
# Most popular payment method
popular_payment = sales['Payment_Method'].value_counts().idxmax()
print(f"\nMost Popular Payment Method: {popular_payment}")
# 2. Inventory Management
# Products that need restocking (Stock < 10)
restock_products = inventory[inventory['Stock'] < 10]
print("\nProducts that need restocking:")
print(restock_products[['Product_ID', 'Product_Name', 'Stock']])
# Most stocked category
most_stocked = inventory.groupby('Category')['Stock'].sum().idxmax()
print(f"\nMost Stocked Category: {most_stocked}")
# Inventory value per category
inventory['Inventory_Value'] = inventory['Stock'] * inventory['Price_per_Unit']
category_value = inventory.groupby('Category')['Inventory_Value'].sum()
print("\nInventory Value per Category:")
print(category_value)
# 3. Customer Analysis
# Cities with most customers
city_counts = customers['City'].value_counts()
print("\nCities with Most Customers:")
print(city_counts.head())
# Gender distribution
gender_dist = customers['Gender'].value_counts(normalize=True) * 100
print("\nGender Distribution:")
print(gender_dist)
# Average age of customers
avg_age = customers['Age'].mean()
print(f"\nAverage Age of Customers: {avg_age:.1f} years")
# Part D: Visualizations
plt.figure(figsize=(15, 10))
# Line chart: Monthly revenue
plt.subplot(2, 2, 1)
monthly_revenue.plot(kind='line', marker='o')
plt.title('Monthly Revenue Trend')
plt.xlabel('Month')
plt.ylabel('Revenue (₹)')
plt.grid(True)
# Bar chart: Top 5 products by revenue
plt.subplot(2, 2, 2)
product_revenue.head().plot(kind='bar')
plt.title('Top 5 Products by Revenue')
plt.xlabel('Product ID')
plt.ylabel('Revenue (₹)')
# Pie chart: Payment methods distribution
plt.subplot(2, 2, 3)
sales['Payment_Method'].value_counts().plot(kind='pie', autopct='%1.1f%%')
plt.title('Payment Methods Distribution')
plt.ylabel('')
# Histogram: Age distribution of customers
plt.subplot(2, 2, 4)
customers['Age'].plot(kind='hist', bins=10)
plt.title('Age Distribution of Customers')
plt.xlabel('Age')
plt.ylabel('Count')
plt.tight_layout()
plt.show()
# Part E: Advanced (Optional)
# Merge sales and inventory to calculate profit (assuming cost price is 70%
of selling price)
merged_data = pd.merge(sales, inventory, on='Product_ID')
merged_data['Cost_Price'] = merged_data['Price_per_Unit_y'] * 0.7
merged_data['Profit'] = (merged_data['Price_per_Unit_x'] -
merged_data['Cost_Price']) * merged_data['Quantity']
# Flag customers who made purchases above ₹10,000
customer_spending = sales.groupby('Customer_ID')['Total_Sale'].sum()
big_spenders = customer_spending[customer_spending >
10000].index.tolist()
print("\nCustomers who spent more than ₹10,000:")
print(big_spenders)