import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')
# Build a reproducible synthetic dataset: 95 ordinary integer readings plus
# five hand-picked extreme values, paired with a random category label.
np.random.seed(42)  # fixed seed so every run produces the same sample
typical_values = np.random.randint(20, 80, 95)  # drawn from [20, 80)
extreme_values = [150, 160, 170, 5, 10]  # deliberately far from the bulk
category_labels = np.random.choice(['A', 'B', 'C', 'D'], 100)
data = {
    'numerical_column': np.concatenate((typical_values, extreme_values)),
    'categorical_column': category_labels,
}
df = pd.DataFrame(data)
# Round-trip through CSV so the rest of the script works on data loaded
# from disk (the write is optional; the read is what the analysis uses).
df.to_csv('Sample_Data.csv', index=False)
df = pd.read_csv('Sample_Data.csv')
# Numerical analysis: preview the data, then report central tendency and
# spread for 'numerical_column'.
# Missing values are dropped up front so every statistic below is computed
# on observed values only.
num = df['numerical_column'].dropna()
print("Dataset Preview:\n", df.head())
print("\n--- Numerical Stats for 'numerical_column' ---")
# Each report line is built from adjacent f-string pieces inside the print
# parentheses, so no string literal ever spans a physical line break.
# mode() returns a Series (there may be several modes); .values prints them
# all as an array.
print(
    f"Mean: {num.mean():.2f} | Median: {num.median()} | "
    f"Mode: {num.mode().values}"
)
value_range = num.max() - num.min()
print(
    f"Std Dev: {num.std():.2f} | Variance: {num.var():.2f} | "
    f"Range: {value_range}"
)
# Visualise the numeric distribution on two separate figures.
# Figure 1: 20-bin histogram with a KDE curve overlaid, on a wide canvas.
plt.figure(figsize=(10, 4))
hist_ax = sns.histplot(data=num, bins=20, kde=True)
hist_ax.set(title='Histogram')
# Figure 2: horizontal boxplot on a default-sized canvas.
plt.figure()
box_ax = sns.boxplot(x=num)
box_ax.set(title='Boxplot')
# Display every figure opened so far.
plt.show()
# Flag outliers with the 1.5 * IQR rule: anything strictly below
# Q1 - 1.5*IQR or strictly above Q3 + 1.5*IQR is reported.
Q1 = num.quantile(0.25)
Q3 = num.quantile(0.75)
IQR = Q3 - Q1
lower_fence = Q1 - 1.5 * IQR
upper_fence = Q3 + 1.5 * IQR
is_outlier = (num < lower_fence) | (num > upper_fence)
outliers = num[is_outlier]
print(f"\nOutliers Detected:\n{outliers.values}")
# Categorical analysis: tally how often each category occurs.
# value_counts() returns the counts indexed by category, most frequent first.
category_values = df['categorical_column']
cat_counts = category_values.value_counts()
print(f"\n--- Category Frequencies ---\n{cat_counts}")
# Chart the category frequencies on two separate figures.
# Figure 1: bar chart of counts per category, on a wide canvas.
plt.figure(figsize=(10, 4))
bar_ax = cat_counts.plot.bar(color='lightgreen', title='Bar Chart')
bar_ax.set_xlabel('Category')
bar_ax.set_ylabel('Count')
plt.tight_layout()
# Figure 2: pie chart with each slice annotated by its percentage share.
plt.figure()
pie_ax = cat_counts.plot.pie(autopct='%1.1f%%', title='Pie Chart')
# Blank out the y-axis label so no axis text is drawn beside the pie.
pie_ax.set_ylabel('')
plt.tight_layout()
# Display every figure opened so far.
plt.show()