SML - Lab03 - Colab
SML - Lab03 - Colab
ID - S25MCAG0021
LAB - III Statistical Machine Learning
import pandas as pd
# Load the Titanic passenger data and show its first ten rows.
df = pd.read_csv("Titanic-Dataset.csv")
preview = df.head(n=10)
print(preview)
import pandas as pd
# Count the missing values in every column of the housing data.
df = pd.read_csv("housing.csv")
missing_values = df.isnull().sum()
# The per-column counts were computed but never shown; print them so the
# cell actually reports which columns contain gaps.
print(missing_values)
#3. Replace missing age values with the mean age (titanic.csv).
import pandas as pd
df = pd.read_csv("Titanic-Dataset.csv")
# Assign the filled column back instead of calling fillna(..., inplace=True)
# on df["Age"]: the chained in-place form operates on an intermediate
# object, emits a FutureWarning, and will no longer update df in pandas 3.0.
mean_age = df["Age"].mean()
df["Age"] = df["Age"].fillna(mean_age)
print(df["Age"])
0 22.000000
1 38.000000
2 26.000000
3 35.000000
4 35.000000
...
886 27.000000
887 19.000000
888 29.699118
889 26.000000
890 32.000000
Name: Age, Length: 891, dtype: float64
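/tmp/ipython-input-3046390652.py:5: FutureWarning: A value is trying to be set on a copy of a DataFrame or Series through chained assignment using an inplace method.
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.
For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.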
df["Age"].fillna(df["Age"].mean(), inplace=True)
import pandas as pd
# Report how many duplicate rows the wine-quality data contains, remove
# them, and save the de-duplicated table without the index column.
df = pd.read_csv("winequalityN.csv")
duplicate_count = df.duplicated().sum()
print("Duplicate rows:", duplicate_count)
df = df.drop_duplicates()
df.to_csv("winequality_cleaned.csv", index=False)
import pandas as pd
# Load the Iris data with every column label converted to lower case,
# then preview the first rows.
df = pd.read_csv("Iris.csv").rename(columns=str.lower)
print(df.head())
import pandas as pd
from google.colab import drive
import pandas as pd
# Encode the Titanic "Sex" column numerically: male -> 0, female -> 1.
df = pd.read_csv("Titanic-Dataset.csv")
sex_codes = {"male": 0, "female": 1}
df["Sex"] = df["Sex"].map(sex_codes)
print(df.head())
import pandas as pd
# One-hot encode the Titanic "Embarked" column and preview the result.
df = pd.get_dummies(pd.read_csv("Titanic-Dataset.csv"), columns=["Embarked"])
print(df.head())
import pandas as pd
# Standardise the float64/int64 columns of the wine data: subtract each
# column's mean and divide by its standard deviation.
df = pd.read_csv("winequalityN.csv")
num_cols = df.select_dtypes(include=["float64", "int64"]).columns
col_mean = df[num_cols].mean()
col_std = df[num_cols].std()
df[num_cols] = (df[num_cols] - col_mean) / col_std
print(df.head())
#9. Create a new feature “BMI” from height and weight columns (students.csv).
import pandas as pd
# List the columns available in the student data.
# NOTE(review): no BMI column is created here — confirm whether the file
# actually provides height and weight columns.
df = pd.read_csv("student-data.csv")
print(df.columns)
# Reload the Titanic data without the PassengerId column and preview it.
df = pd.read_csv("Titanic-Dataset.csv").drop(columns=["PassengerId"])
print(df.head())
import pandas as pd
# Keep only the object-dtype columns of the Titanic data.
df = pd.read_csv("Titanic-Dataset.csv")
cat_df = df.select_dtypes(include="object")
# Bare expression: displays the frame when run as a notebook cell.
cat_df
1 Cumings, Mrs. John Bradley (Florence Briggs Th... female PC 17599 C85 C
3 Futrelle, Mrs. Jacques Heath (Lily May Peel) female 113803 C123 S
888 Johnston, Miss. Catherine Helen "Carrie" female W./C. 6607 NaN S
import pandas as pd
# Load the housing data, keep only its numeric columns, and print them.
df = pd.read_csv("housing.csv").select_dtypes(include="number")
print(df)
Avg. Area Income Avg. Area House Age Avg. Area Number of Rooms \
0 79545.45857 5.682861 7.009188
1 79248.64245 6.002900 6.730821
2 61287.06718 5.865890 8.512727
3 63345.24005 7.188236 5.586729
4 59982.19723 5.040555 7.839388
... ... ... ...
4995 60567.94414 7.830362 6.137356
4996 78491.27543 6.999135 6.576763
4997 63390.68689 7.250591 4.805081
4998 68001.33124 5.534388 7.130144
4999 65510.58180 5.992305 6.792336
import pandas as pd
# Stack a "train" and a "test" frame into one table with a fresh index.
# NOTE(review): both frames are read from the same file, so every row
# appears twice in the result — confirm the intended test file.
train = pd.read_csv("Titanic-Dataset.csv")
test = pd.read_csv("Titanic-Dataset.csv")
frames = [train, test]
df = pd.concat(frames, axis=0, ignore_index=True)
# Bare expression: displays the preview when run as a notebook cell.
df.head()
PassengerId Survived Pclass Name Sex Age SibSp Parch Ticket Fare Cabin Embarked
0 1 0 3 Braund, Mr. Owen Harris male 22.0 1 0 A/5 21171 7.2500 NaN S
2 3 1 3 Heikkinen, Miss. Laina female 26.0 0 0 STON/O2. 3101282 7.9250 NaN S
#14. Combine student grades from two CSV files (students_math.csv + students_portuguese.csv).
import pandas as pd
# Stack the two student tables row-wise under a fresh index.
# NOTE(review): both tables are read from the same file, so the combined
# frame holds every row twice — confirm the intended per-subject files.
math = pd.read_csv("student-data.csv")
portuguese = pd.read_csv("student-data.csv")
tables = [math, portuguese]
combined = pd.concat(tables, axis=0, ignore_index=True)
print(combined)
school sex age address famsize Pstatus Medu Fedu Mjob Fjob \
0 GP F 18 U GT3 A 4 4 at_home teacher
1 GP F 17 U GT3 T 1 1 at_home other
2 GP F 15 U LE3 T 1 1 at_home other
3 GP F 15 U GT3 T 4 2 health services
4 GP F 16 U GT3 T 3 3 other other
.. ... .. ... ... ... ... ... ... ... ...
785 MS M 20 U LE3 A 2 2 services services
786 MS M 17 U LE3 T 3 1 services services
787 MS M 21 R GT3 T 1 1 other other
788 MS M 18 R LE3 T 3 2 services other
789 MS M 19 U LE3 T 1 1 other at_home
... internet romantic famrel freetime goout Dalc Walc health absences \
0 ... no no 4 3 4 1 1 3 6
1 ... yes no 5 3 3 1 1 3 4
2 ... yes no 4 3 2 2 3 3 10
3 ... yes yes 3 2 2 1 1 5 2
4 ... no no 4 3 2 1 2 5 4
.. ... ... ... ... ... ... ... ... ... ...
785 ... no no 5 5 4 4 5 4 11
786 ... yes no 2 4 5 3 4 2 3
787 ... no no 5 5 3 3 3 3 3
788 ... yes no 4 4 1 3 4 5 0
789 ... yes no 3 2 3 3 3 5 5
passed
0 no
1 no
2 yes
3 yes
4 yes
.. ...
785 no
786 yes
787 no
788 yes
789 no
import pandas as pd
# Concatenate two Iris frames row-wise, keeping each frame's own index.
# NOTE(review): both `setosa` and `versicolor` are loaded from the same
# file with no filtering, so each holds the whole table — confirm intent.
setosa = pd.read_csv("Iris.csv")
versicolor = pd.read_csv("Iris.csv")
parts = [setosa, versicolor]
data = pd.concat(parts)
print(data)
Species
0 Iris-setosa
1 Iris-setosa
2 Iris-setosa
3 Iris-setosa
4 Iris-setosa
.. ...
145 Iris-virginica
146 Iris-virginica
147 Iris-virginica
148 Iris-virginica
149 Iris-virginica
import pandas as pd
from sklearn.model_selection import train_test_split
# Split the Iris data 80/20 into train and test parts with a fixed seed
# and report the shape of each part, one per line.
data = pd.read_csv("Iris.csv")
train, test = train_test_split(data, random_state=42, test_size=0.2)
print(train.shape, test.shape, sep="\n")
(120, 6)
(30, 6)
import pandas as pd
from sklearn.model_selection import train_test_split
# Split the Titanic data 80/20 into features and target, stratifying on
# the "Survived" target and using a fixed seed.
df = pd.read_csv("Titanic-Dataset.csv")
y = df["Survived"]
X = df.drop(columns="Survived")
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)
import pandas as pd
# Draw a reproducible random sample of 100 rows from the wine data.
df = pd.read_csv("winequalityN.csv")
sampled = df.sample(100, random_state=42)
print(sampled)
import pandas as pd
# Mean, median and mode of the numeric housing columns.
df = pd.read_csv("housing.csv")
mean_values = df.mean(numeric_only=True)
median_values = df.median(numeric_only=True)
# Only the first row of the mode table is reported.
first_mode = df.mode(numeric_only=True).iloc[0]
print("Mean:\n", mean_values)
print("Median:\n", median_values)
print("Mode:\n", first_mode)
Mean:
Avg. Area Income 6.858311e+04
Avg. Area House Age 5.977222e+00
Avg. Area Number of Rooms 6.987792e+00
Avg. Area Number of Bedrooms 3.981330e+00
Area Population 3.616352e+04
Price 1.232073e+06
dtype: float64
Median:
Avg. Area Income 6.880429e+04
Avg. Area House Age 5.970429e+00
Avg. Area Number of Rooms 7.002902e+00
Avg. Area Number of Bedrooms 4.050000e+00
Area Population 3.619941e+04
Price 1.232669e+06
dtype: float64
Mode:
Avg. Area Income 17796.631190
Avg. Area House Age 2.644304
Avg. Area Number of Rooms 3.236194
Avg. Area Number of Bedrooms 4.380000
Area Population 172.610686
Price 15938.657920
Name: 0, dtype: float64
import pandas as pd
# Population (ddof=0) and sample (ddof=1) variance and standard deviation
# of the numeric columns.
# `num` was used here without ever being defined, which raises NameError;
# build it from the numeric columns of the data set.
# NOTE(review): the recorded output lists the wine-quality columns, so
# winequalityN.csv is assumed to be the intended source — confirm.
df = pd.read_csv("winequalityN.csv")
num = df.select_dtypes(include="number")
pop_variance = num.var(ddof=0)
pop_std = num.std(ddof=0)
sample_variance = num.var(ddof=1)
sample_std = num.std(ddof=1)
# Display the results; the computed values were otherwise never shown.
print("Population Variance:\n", pop_variance)
print("Population Std Dev:\n", pop_std)
print("Sample Variance:\n", sample_variance)
print("Sample Std Dev:\n", sample_std)
Sample Variance:
type NaN
fixed acidity 1.681560
volatile acidity 0.027109
citric acid 0.021102
residual sugar 22.639751
chlorides 0.001228
free sulfur dioxide 315.041192
total sulfur dioxide 3194.720039
density 0.000009
pH 0.025840
sulphates 0.022146
alcohol 1.422561
quality 0.762575
dtype: float64
import pandas as pd
from scipy.stats import skew, kurtosis
# Skewness and kurtosis of the numeric housing columns.
df = pd.read_csv("housing.csv")
skew_values = df.skew(numeric_only=True)
kurtosis_values = df.kurt(numeric_only=True)
print("Skewness:\n", skew_values)
print("Kurtosis:\n", kurtosis_values)
Skewness:
Avg. Area Income -0.033720
Avg. Area House Age -0.007214
Avg. Area Number of Rooms -0.040996
Avg. Area Number of Bedrooms 0.376240
Area Population 0.050650
Price -0.002718
dtype: float64
Kurtosis:
Avg. Area Income 0.045574
Avg. Area House Age -0.083437
Avg. Area Number of Rooms -0.074652
Avg. Area Number of Bedrooms -0.701566
Area Population -0.006733
Price -0.054918
dtype: float64
#22. Check frequency counts of categorical variables (titanic.csv).
import pandas as pd
df = pd.read_csv("Titanic-Dataset.csv")
# Number of distinct values in every column.
print(df.nunique())
# Frequency table for each categorical column of interest.
for column in ("Sex", "Embarked", "Pclass"):
    print(df[column].value_counts())
PassengerId 891
Survived 2
Pclass 3
Name 891
Sex 2
Age 88
SibSp 7
Parch 7
Ticket 681
Fare 248
Cabin 147
Embarked 3
dtype: int64
Sex
male 577
female 314
Name: count, dtype: int64
Embarked
S 644
C 168
Q 77
Name: count, dtype: int64
Pclass
3 491
1 216
2 184
Name: count, dtype: int64
import pandas as pd
import io
from IPython.display import display
# Five-number summary (min, Q1, median, Q3, max) of the student age column.
# The block structure is restored here (indentation had been lost) and the
# truncated `five_num` list is completed with Q3 and Max so that it matches
# the five column labels of the summary frame.
try:
    df = pd.read_csv("student-data.csv")
except Exception:
    # Best-effort fallback: if the local file cannot be read, ask the user
    # to upload it through Colab and parse the first uploaded file.
    from google.colab import files

    uploaded = files.upload()
    fn = next(iter(uploaded))
    df = pd.read_csv(io.BytesIO(uploaded[fn]))

cols = list(df.columns)

# Prefer a column named exactly "age" (ignoring case and surrounding
# whitespace); otherwise fall back to the first column containing "age".
age_col = next((c for c in cols if str(c).strip().lower() == "age"), None)
if age_col is None:
    age_col = next((c for c in cols if "age" in str(c).strip().lower()), None)

if age_col is None:
    print("No 'Age' column found. Columns:", cols)
else:
    # Non-numeric entries become NaN and are dropped before summarising.
    ages = pd.to_numeric(df[age_col], errors="coerce").dropna()
    if ages.empty:
        print("No numeric values found in Age column.")
    else:
        five_num = [
            float(ages.min()),
            float(ages.quantile(0.25)),
            float(ages.median()),
            float(ages.quantile(0.75)),
            float(ages.max()),
        ]
        summary_df = pd.DataFrame([five_num], columns=["Min", "Q1", "Median", "Q3", "Max"])
        display(summary_df)
# Draw a box plot of the Iris data.
# matplotlib is imported here because `plt` was used before any
# matplotlib import had run, which raises NameError.
import matplotlib.pyplot as plt

df = pd.read_csv("Iris.csv")
df.boxplot(figsize=(10, 6))
plt.show()
import pandas as pd
import matplotlib.pyplot as plt
# Histogram of Titanic ticket fares using 30 bins.
df = pd.read_csv("Titanic-Dataset.csv")
fares = df["Fare"]
plt.hist(fares, bins=30)
plt.title("Fare Distribution")
plt.xlabel("Fare")
plt.ylabel("Frequency")
plt.show()