import numpy as np
import pandas as pd
from sklearn.datasets import load_iris
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import MinMaxScaler, StandardScaler
# Load the Iris dataset bundle and expose its measurements as a DataFrame
# whose columns are named after the dataset's feature names.
data = load_iris()
df = pd.DataFrame(data=data.data, columns=list(data.feature_names))
# Bare expression: in a notebook this displays the frame as the cell result.
df
sepal length (cm) sepal width (cm) petal length (cm) petal
width (cm)
0 5.1 3.5 1.4
0.2
1 4.9 3.0 1.4
0.2
2 4.7 3.2 1.3
0.2
3 4.6 3.1 1.5
0.2
4 5.0 3.6 1.4
0.2
.. ... ... ...
...
145 6.7 3.0 5.2
2.3
146 6.3 2.5 5.0
1.9
147 6.5 3.0 5.2
2.0
148 6.2 3.4 5.4
2.3
149 5.9 3.0 5.1
1.8
[150 rows x 4 columns]
# Seed NumPy's global RNG so the same rows are blanked on every run.
np.random.seed(0)
# Draw 20 row labels WITH replacement: a label can be drawn more than once,
# so the number of distinct rows that end up NaN may be smaller than 20.
nan_indices = np.random.choice(df.index, replace=True, size=20)
# Blank out the sepal-length measurement on the selected rows (in place).
df.loc[nan_indices, 'sepal length (cm)'] = np.nan
print("Initial Data with Missing Values:", df.head(), sep="\n")
Initial Data with Missing Values:
sepal length (cm) sepal width (cm) petal length (cm) petal width
(cm)
0 5.1 3.5 1.4
0.2
1 4.9 3.0 1.4
0.2
2 4.7 3.2 1.3
0.2
3 4.6 3.1 1.5
0.2
4 5.0 3.6 1.4
0.2
# Fill missing entries using the 'mean' imputation strategy.
imputer = SimpleImputer(strategy='mean')
# fit_transform's result is re-wrapped below so the original column labels
# are kept; the new frame gets a fresh default index.
imputed_values = imputer.fit_transform(df)
df_imputed = pd.DataFrame(imputed_values, columns=df.columns)
print("\nData after Handling Missing Values:", df_imputed.head(), sep="\n")
Data after Handling Missing Values:
sepal length (cm) sepal width (cm) petal length (cm) petal width
(cm)
0 5.1 3.5 1.4
0.2
1 4.9 3.0 1.4
0.2
2 4.7 3.2 1.3
0.2
3 4.6 3.1 1.5
0.2
4 5.0 3.6 1.4
0.2
# Smooth every column with a 3-row rolling mean. The first two rows have an
# incomplete window and therefore come out as NaN.
rolling_view = df_imputed.rolling(window=3)
df_smoothed = rolling_view.mean()
print("\nSmoothed Data (with Rolling Window):", df_smoothed.head(10), sep="\n")
Smoothed Data (with Rolling Window):
sepal length (cm) sepal width (cm) petal length (cm) petal width
(cm)
0 NaN NaN NaN
NaN
1 NaN NaN NaN
NaN
2 4.900000 3.233333 1.366667
0.200000
3 4.733333 3.100000 1.400000
0.200000
4 4.766667 3.300000 1.400000
0.200000
5 5.000000 3.533333 1.533333
0.266667
6 5.000000 3.633333 1.500000
0.300000
7 5.276106 3.566667 1.533333
0.300000
8 4.942773 3.233333 1.433333
0.233333
9 5.352212 3.133333 1.466667
0.166667
# Remove outlier rows using a per-column z-score threshold of 3.
# Absolute z-score of every cell, computed column by column.
z_scores = np.abs(stats.zscore(df_imputed))
# A row is an outlier when ANY of its features is more than 3 standard
# deviations from that feature's mean. Using `.all(axis=1)` here would only
# flag rows that are extreme in every feature simultaneously, which in
# practice removes nothing.
outliers = (z_scores > 3).any(axis=1)
# Keep only the rows that were not flagged; the original row labels are kept.
df_no_outliers = df_imputed[~outliers]
print("\nData after Removing Outliers:")
print(df_no_outliers.head())
Data after Removing Outliers:
sepal length (cm) sepal width (cm) petal length (cm) petal width
(cm)
0 5.1 3.5 1.4
0.2
1 4.9 3.0 1.4
0.2
2 4.7 3.2 1.3
0.2
3 4.6 3.1 1.5
0.2
4 5.0 3.6 1.4
0.2
# Rescale the outlier-filtered data with MinMaxScaler (default settings).
scaler_min_max = MinMaxScaler()
# The right-hand side is wrapped in parentheses so the assignment can span
# several lines; a bare line break after `=` is a SyntaxError.
# The scaler output is re-wrapped to restore the column labels; the resulting
# frame has a fresh default index.
df_minmax_scaled = pd.DataFrame(
    scaler_min_max.fit_transform(df_no_outliers),
    columns=df_no_outliers.columns,
)
print("\nData after Min-Max Scaling:")
print(df_minmax_scaled.head())
Data after Min-Max Scaling:
sepal length (cm) sepal width (cm) petal length (cm) petal width
(cm)
0 0.222222 0.625000 0.067797
0.041667
1 0.166667 0.416667 0.067797
0.041667
2 0.111111 0.500000 0.050847
0.041667
3 0.083333 0.458333 0.084746
0.041667
4 0.194444 0.666667 0.067797
0.041667
# Standardize the outlier-filtered data with StandardScaler (default settings).
scaler_standard = StandardScaler()
# The right-hand side is wrapped in parentheses so the assignment can span
# several lines; a bare line break after `=` is a SyntaxError.
# The scaler output is re-wrapped to restore the column labels; the resulting
# frame has a fresh default index.
df_standard_scaled = pd.DataFrame(
    scaler_standard.fit_transform(df_no_outliers),
    columns=df_no_outliers.columns,
)
print("\nData after Standard Scaling:")
print(df_standard_scaled.head())
Data after Standard Scaling:
sepal length (cm) sepal width (cm) petal length (cm) petal width
(cm)
0 -0.981414 1.019004 -1.340227 -
1.315444
1 -1.250916 -0.131979 -1.340227 -
1.315444
2 -1.520417 0.328414 -1.397064 -
1.315444
3 -1.655168 0.098217 -1.283389 -
1.315444
4 -1.116165 1.249201 -1.340227 -
1.315444
# Summary statistics of the imputed data, as produced by DataFrame.describe().
summary_stats = df_imputed.describe()
print("\nDescriptive Statistics:")
print(summary_stats)
Descriptive Statistics:
sepal length (cm) sepal width (cm) petal length (cm) \
count 150.000000 150.000000 150.000000
mean 5.828319 3.057333 3.758000
std 0.744597 0.435866 1.765298
min 4.300000 2.000000 1.000000
25% 5.400000 2.800000 1.600000
50% 5.828319 3.000000 4.350000
75% 6.275000 3.300000 5.100000
max 7.900000 4.400000 6.900000
petal width (cm)
count 150.000000
mean 1.199333
std 0.762238
min 0.100000
25% 0.300000
50% 1.300000
75% 1.800000
max 2.500000
# One-way ANOVA per feature: do the three species share the same mean?
# Attach the class label (0, 1, 2) so rows can be grouped by species.
df['species'] = data.target
# Test every column except the label itself.
feature_columns = [col for col in df.columns if col != 'species']
for feature in feature_columns:
    # `df` still contains the injected NaNs. A NaN in any group makes
    # f_oneway return nan for both the statistic and the p-value, and
    # `nan < 0.05` is False, so the feature would be reported as "not
    # significantly different". Drop missing values per group before testing.
    samples = [
        df.loc[df['species'] == label, feature].dropna()
        for label in (0, 1, 2)
    ]
    f_stat, p_val = stats.f_oneway(*samples)
    print(
        f"\nANOVA for {feature}: F-statistic = {f_stat:.3f}, "
        f"p-value = {p_val:.3f}"
    )
    if p_val < 0.05:
        print(
            f" -> The means of {feature} are significantly "
            f"different across species (reject H0)"
        )
    else:
        print(
            f" -> The means of {feature} are not significantly "
            f"different across species (fail to reject H0)"
        )
ANOVA for sepal length (cm): F-statistic = nan, p-value = nan
-> The means of sepal length (cm) are not significantly different
across species (fail to reject H0)
ANOVA for sepal width (cm): F-statistic = 49.160, p-value = 0.000
-> The means of sepal width (cm) are significantly different across
species (reject H0)
ANOVA for petal length (cm): F-statistic = 1180.161, p-value = 0.000
-> The means of petal length (cm) are significantly different across
species (reject H0)
ANOVA for petal width (cm): F-statistic = 960.007, p-value = 0.000
-> The means of petal width (cm) are significantly different across
species (reject H0)
# Kruskal-Wallis H-test per feature: a rank-based, non-parametric
# counterpart to one-way ANOVA, comparing the three species groups.
print("\nNon-Parametric Test (Kruskal-Wallis H-test):")
# Test every column except the species label.
kruskal_columns = [col for col in df.columns if col != 'species']
for feature in kruskal_columns:
    # `df` still contains the injected NaNs; with SciPy's default NaN
    # handling they propagate, giving H = nan and p = nan, which then falls
    # into the "not significantly different" branch. Drop missing values per
    # group before testing.
    samples = [
        df.loc[df['species'] == label, feature].dropna()
        for label in (0, 1, 2)
    ]
    h_stat, p_val = stats.kruskal(*samples)
    print(
        f"\nKruskal-Wallis for {feature}: H-statistic = {h_stat:.3f}, "
        f"p-value = {p_val:.3f}"
    )
    if p_val < 0.05:
        print(
            f" -> The distributions of {feature} are significantly "
            f"different across species (reject H0)"
        )
    else:
        print(
            f" -> The distributions of {feature} are not "
            f"significantly different across species (fail to reject H0)"
        )
Non-Parametric Test (Kruskal-Wallis H-test):
Kruskal-Wallis for sepal length (cm): H-statistic = nan, p-value = nan
-> The distributions of sepal length (cm) are not significantly
different across species (fail to reject H0)
Kruskal-Wallis for sepal width (cm): H-statistic = 63.571, p-value =
0.000
-> The distributions of sepal width (cm) are significantly different
across species (reject H0)
Kruskal-Wallis for petal length (cm): H-statistic = 130.411, p-value =
0.000
-> The distributions of petal length (cm) are significantly
different across species (reject H0)
Kruskal-Wallis for petal width (cm): H-statistic = 131.185, p-value =
0.000
-> The distributions of petal width (cm) are significantly different
across species (reject H0)