0% found this document useful (0 votes)
7 views11 pages

SML - Lab03 - Colab

Uploaded by

harshpandey53151
Copyright
© © All Rights Reserved
We take content rights seriously. If you suspect this is your content, claim it here.
Available Formats
Download as PDF, TXT or read online on Scribd
0% found this document useful (0 votes)
7 views11 pages

SML - Lab03 - Colab

Uploaded by

harshpandey53151
Copyright
© © All Rights Reserved
We take content rights seriously. If you suspect this is your content, claim it here.
Available Formats
Download as PDF, TXT or read online on Scribd
You are on page 1/ 11

Name-Harsh Pandey

ID - S25MCAG0021
LAB - III Statistical Machine Learning

keyboard_arrow_down A. Data Cleaning


#1. Load the dataset (titanic.csv) and display the first 10 rows.

import pandas as pd

df = pd.read_csv('Titanic-Dataset.csv')
print(df.head(10))

PassengerId Survived Pclass \


0 1 0 3
1 2 1 1
2 3 1 3
3 4 1 1
4 5 0 3
5 6 0 3
6 7 0 1
7 8 0 3
8 9 1 3
9 10 1 2

Name Sex Age SibSp \


0 Braund, Mr. Owen Harris male 22.0 1
1 Cumings, Mrs. John Bradley (Florence Briggs Th... female 38.0 1
2 Heikkinen, Miss. Laina female 26.0 0
3 Futrelle, Mrs. Jacques Heath (Lily May Peel) female 35.0 1
4 Allen, Mr. William Henry male 35.0 0
5 Moran, Mr. James male NaN 0
6 McCarthy, Mr. Timothy J male 54.0 0
7 Palsson, Master. Gosta Leonard male 2.0 3
8 Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg) female 27.0 0
9 Nasser, Mrs. Nicholas (Adele Achem) female 14.0 1

Parch Ticket Fare Cabin Embarked


0 0 A/5 21171 7.2500 NaN S
1 0 PC 17599 71.2833 C85 C
2 0 STON/O2. 3101282 7.9250 NaN S
3 0 113803 53.1000 C123 S
4 0 373450 8.0500 NaN S
5 0 330877 8.4583 NaN Q
6 0 17463 51.8625 E46 S
7 1 349909 21.0750 NaN S
8 2 347742 11.1333 NaN S
9 0 237736 30.0708 NaN C

#2. Find the number of missing values in each column (housing.csv).

import pandas as pd
df = pd.read_csv("housing.csv")

missing_values = df.isnull().sum()

print("Missing values in each column:")


print(missing_values)

Missing values in each column:


Avg. Area Income 0
Avg. Area House Age 0
Avg. Area Number of Rooms 0
Avg. Area Number of Bedrooms 0
Area Population 0
Price 0
Address 0
dtype: int64

#3. Replace missing age values with the mean age (titanic.csv).
import pandas as pd

df = pd.read_csv("Titanic-Dataset.csv")
df["Age"].fillna(df["Age"].mean(), inplace=True)
print(df["Age"])

0 22.000000
1 38.000000
2 26.000000
3 35.000000
4 35.000000
...
886 27.000000
887 19.000000
888 29.699118
889 26.000000
890 32.000000
Name: Age, Length: 891, dtype: float64
/tmp/ipython-input-3046390652.py:5: FutureWarning: A value is trying to be set on a copy of a DataFrame or Series throug
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[c

df["Age"].fillna(df["Age"].mean(), inplace=True)

#4. Detect and remove duplicate rows (winequality.csv).

import pandas as pd

df = pd.read_csv("winequalityN.csv")
print("Duplicate rows:", df.duplicated().sum())

df = df.drop_duplicates()

print("After removing duplicates:", df.duplicated().sum())

df.to_csv("winequality_cleaned.csv", index=False)

Duplicate rows: 1168


After removing duplicates: 0

#5. Standardize column names to lowercase (iris.csv).

import pandas as pd

df = pd.read_csv("Iris.csv")
df.columns = df.columns.str.lower()
print(df.head())
import pandas as pd
from google.colab import drive

id sepallengthcm sepalwidthcm petallengthcm petalwidthcm species


0 1 5.1 3.5 1.4 0.2 Iris-setosa
1 2 4.9 3.0 1.4 0.2 Iris-setosa
2 3 4.7 3.2 1.3 0.2 Iris-setosa
3 4 4.6 3.1 1.5 0.2 Iris-setosa
4 5 5.0 3.6 1.4 0.2 Iris-setosa

keyboard_arrow_down B. Data Transformation


#6. Convert categorical column “Sex” into numerical (0 = male, 1 = female) (titanic.csv).

import pandas as pd

df = pd.read_csv("Titanic-Dataset.csv")
df["Sex"] = df["Sex"].map({"male": 0, "female": 1})
print(df.head())

PassengerId Survived Pclass \


0 1 0 3
1 2 1 1
2 3 1 3
3 4 1 1
4 5 0 3

Name Sex Age SibSp Parch \


0 Braund, Mr. Owen Harris 0 22.0 1 0
1 Cumings, Mrs. John Bradley (Florence Briggs Th... 1 38.0 1 0
2 Heikkinen, Miss. Laina 1 26.0 0 0
3 Futrelle, Mrs. Jacques Heath (Lily May Peel) 1 35.0 1 0
4 Allen, Mr. William Henry 0 35.0 0 0

Ticket Fare Cabin Embarked


0 A/5 21171 7.2500 NaN S
1 PC 17599 71.2833 C85 C
2 STON/O2. 3101282 7.9250 NaN S
3 113803 53.1000 C123 S
4 373450 8.0500 NaN S

#7. Apply One-Hot Encoding on “Embarked” (titanic.csv).

import pandas as pd

df = pd.read_csv("Titanic-Dataset.csv")
df = pd.get_dummies(df, columns=["Embarked"])
print(df.head())

PassengerId Survived Pclass \


0 1 0 3
1 2 1 1
2 3 1 3
3 4 1 1
4 5 0 3

Name Sex Age SibSp \


0 Braund, Mr. Owen Harris male 22.0 1
1 Cumings, Mrs. John Bradley (Florence Briggs Th... female 38.0 1
2 Heikkinen, Miss. Laina female 26.0 0
3 Futrelle, Mrs. Jacques Heath (Lily May Peel) female 35.0 1
4 Allen, Mr. William Henry male 35.0 0

Parch Ticket Fare Cabin Embarked_C Embarked_Q Embarked_S


0 0 A/5 21171 7.2500 NaN False False True
1 0 PC 17599 71.2833 C85 True False False
2 0 STON/O2. 3101282 7.9250 NaN False False True
3 0 113803 53.1000 C123 False False True
4 0 373450 8.0500 NaN False False True

#8. Standardize numerical values using Z-score normalization (winequality.csv).

import pandas as pd

df = pd.read_csv("winequalityN.csv")
num_cols = df.select_dtypes(include=['float64','int64']).columns
df[num_cols] = (df[num_cols] - df[num_cols].mean()) / df[num_cols].std()
print(df.head())

type fixed acidity volatile acidity citric acid residual sugar \


0 white -0.167017 -0.423270 0.284158 3.206237
1 white -0.706828 -0.241064 0.146478 -0.807950
2 white 0.681258 -0.362535 0.559517 0.305934
3 white -0.012785 -0.666211 0.008798 0.642201
4 white -0.012785 -0.666211 0.008798 0.642201

chlorides free sulfur dioxide total sulfur dioxide density pH \


0 -0.315149 0.815503 0.959902 2.102052 -1.358618
1 -0.200981 -0.931035 0.287595 -0.232314 0.507654
2 -0.172439 -0.029596 -0.331634 0.134515 0.258818
3 0.055898 0.928182 1.242978 0.301255 -0.176646
4 0.055898 0.928182 1.242978 0.301255 -0.176646

sulphates alcohol quality


0 -0.545749 -1.418449 0.207983
1 -0.276957 -0.831551 0.207983
2 -0.612947 -0.328496 0.207983
3 -0.881739 -0.496181 0.207983
4 -0.881739 -0.496181 0.207983

#9. Create a new feature “BMI” from height and weight columns (students.csv).

import pandas as pd
df = pd.read_csv("student-data.csv")
print(df.columns)

Index(['school', 'sex', 'age', 'address', 'famsize', 'Pstatus', 'Medu', 'Fedu',


'Mjob', 'Fjob', 'reason', 'guardian', 'traveltime', 'studytime',
'failures', 'schoolsup', 'famsup', 'paid', 'activities', 'nursery',
'higher', 'internet', 'romantic', 'famrel', 'freetime', 'goout', 'Dalc',
'Walc', 'health', 'absences', 'passed'],
dtype='object')

keyboard_arrow_down C . Data Reduction


#10. Drop irrelevant columns like PassengerId (titanic.csv).
import pandas as pd

df = pd.read_csv("Titanic-Dataset.csv")
df = df.drop(["PassengerId"], axis=1)
print(df.head())

Survived Pclass Name \


0 0 3 Braund, Mr. Owen Harris
1 1 1 Cumings, Mrs. John Bradley (Florence Briggs Th...
2 1 3 Heikkinen, Miss. Laina
3 1 1 Futrelle, Mrs. Jacques Heath (Lily May Peel)
4 0 3 Allen, Mr. William Henry

Sex Age SibSp Parch Ticket Fare Cabin Embarked


0 male 22.0 1 0 A/5 21171 7.2500 NaN S
1 female 38.0 1 0 PC 17599 71.2833 C85 C
2 female 26.0 0 0 STON/O2. 3101282 7.9250 NaN S
3 female 35.0 1 0 113803 53.1000 C123 S
4 male 35.0 0 0 373450 8.0500 NaN S

#11. Keep only categorical columns (titanic.csv).

import pandas as pd

df = pd.read_csv("Titanic-Dataset.csv")
cat_df = df.select_dtypes(include=['object'])
cat_df

Name Sex Ticket Cabin Embarked

0 Braund, Mr. Owen Harris male A/5 21171 NaN S

1 Cumings, Mrs. John Bradley (Florence Briggs Th... female PC 17599 C85 C

2 Heikkinen, Miss. Laina female STON/O2. 3101282 NaN S

3 Futrelle, Mrs. Jacques Heath (Lily May Peel) female 113803 C123 S

4 Allen, Mr. William Henry male 373450 NaN S

... ... ... ... ... ...

886 Montvila, Rev. Juozas male 211536 NaN S

887 Graham, Miss. Margaret Edith female 112053 B42 S

888 Johnston, Miss. Catherine Helen "Carrie" female W./C. 6607 NaN S

889 Behr, Mr. Karl Howell male 111369 C148 C

890 Dooley, Mr. Patrick male 370376 NaN Q

891 rows × 5 columns

#12. Keep only numerical columns (housing.csv).

import pandas as pd

df = pd.read_csv("housing.csv")
df = df.select_dtypes(include=['number'])
print(df)

Avg. Area Income Avg. Area House Age Avg. Area Number of Rooms \
0 79545.45857 5.682861 7.009188
1 79248.64245 6.002900 6.730821
2 61287.06718 5.865890 8.512727
3 63345.24005 7.188236 5.586729
4 59982.19723 5.040555 7.839388
... ... ... ...
4995 60567.94414 7.830362 6.137356
4996 78491.27543 6.999135 6.576763
4997 63390.68689 7.250591 4.805081
4998 68001.33124 5.534388 7.130144
4999 65510.58180 5.992305 6.792336

Avg. Area Number of Bedrooms Area Population Price


0 4.09 23086.80050 1.059034e+06
1 3.09 40173.07217 1.505891e+06
2 5.13 36882.15940 1.058988e+06
3 3.26 34310.24283 1.260617e+06
4 4.23 26354.10947 6.309435e+05
... ... ... ...
4995 3.46 22837.36103 1.060194e+06
4996 4.02 25616.11549 1.482618e+06
4997 2.13 33266.14549 1.030730e+06
4998 5.44 42625.62016 1.198657e+06
4999 4.07 46501.28380 1.298950e+06

[5000 rows x 6 columns]

keyboard_arrow_down D. Data Integration


#13. Merge train and test datasets into a single dataframe (titanic_train.csv + titanic_test.csv).

import pandas as pd

train = pd.read_csv("Titanic-Dataset.csv")
test = pd.read_csv("Titanic-Dataset.csv")
df = pd.concat([train, test], ignore_index=True)
df.head()

PassengerId Survived Pclass Name Sex Age SibSp Parch Ticket Fare Cabin Embarked

0 1 0 3 Braund, Mr. Owen Harris male 22.0 1 0 A/5 21171 7.2500 NaN S

Cumings, Mrs. John Bradley


1 2 1 1 female 38.0 1 0 PC 17599 71.2833 C85 C
(Florence Briggs Th...

STON/O2.
2 3 1 3 Heikkinen, Miss. Laina female 26.0 0 0 7.9250 NaN S
3101282

Futrelle, Mrs. Jacques Heath


3 4 1 1 female 35 0 1 0 113803 53 1000 C123 S

#14. Combine student grades from two CSV files (students_math.csv + students_portuguese.csv).

import pandas as pd

math = pd.read_csv("student-data.csv")
portuguese = pd.read_csv("student-data.csv")
combined = pd.concat([math, portuguese], ignore_index=True)
print(combined)

school sex age address famsize Pstatus Medu Fedu Mjob Fjob \
0 GP F 18 U GT3 A 4 4 at_home teacher
1 GP F 17 U GT3 T 1 1 at_home other
2 GP F 15 U LE3 T 1 1 at_home other
3 GP F 15 U GT3 T 4 2 health services
4 GP F 16 U GT3 T 3 3 other other
.. ... .. ... ... ... ... ... ... ... ...
785 MS M 20 U LE3 A 2 2 services services
786 MS M 17 U LE3 T 3 1 services services
787 MS M 21 R GT3 T 1 1 other other
788 MS M 18 R LE3 T 3 2 services other
789 MS M 19 U LE3 T 1 1 other at_home

... internet romantic famrel freetime goout Dalc Walc health absences \
0 ... no no 4 3 4 1 1 3 6
1 ... yes no 5 3 3 1 1 3 4
2 ... yes no 4 3 2 2 3 3 10
3 ... yes yes 3 2 2 1 1 5 2
4 ... no no 4 3 2 1 2 5 4
.. ... ... ... ... ... ... ... ... ... ...
785 ... no no 5 5 4 4 5 4 11
786 ... yes no 2 4 5 3 4 2 3
787 ... no no 5 5 3 3 3 3 3
788 ... yes no 4 4 1 3 4 5 0
789 ... yes no 3 2 3 3 3 5 5

passed
0 no
1 no
2 yes
3 yes
4 yes
.. ...
785 no
786 yes
787 no
788 yes
789 no

[790 rows x 31 columns]

#15. Concatenate datasets row-wise (iris_setosa.csv + iris_versicolor.csv).

import pandas as pd
setosa = pd.read_csv("Iris.csv")
versicolor = pd.read_csv("Iris.csv")
data = pd.concat([setosa, versicolor], axis=0)
print(data)

Id SepalLengthCm SepalWidthCm PetalLengthCm PetalWidthCm \


0 1 5.1 3.5 1.4 0.2
1 2 4.9 3.0 1.4 0.2
2 3 4.7 3.2 1.3 0.2
3 4 4.6 3.1 1.5 0.2
4 5 5.0 3.6 1.4 0.2
.. ... ... ... ... ...
145 146 6.7 3.0 5.2 2.3
146 147 6.3 2.5 5.0 1.9
147 148 6.5 3.0 5.2 2.0
148 149 6.2 3.4 5.4 2.3
149 150 5.9 3.0 5.1 1.8

Species
0 Iris-setosa
1 Iris-setosa
2 Iris-setosa
3 Iris-setosa
4 Iris-setosa
.. ...
145 Iris-virginica
146 Iris-virginica
147 Iris-virginica
148 Iris-virginica
149 Iris-virginica

[300 rows x 6 columns]

keyboard_arrow_down E. Data Sampling & Splitting


#16. Split dataset into 80% train and 20% test (iris.csv).

import pandas as pd
from sklearn.model_selection import train_test_split

data = pd.read_csv("Iris.csv")
train, test = train_test_split(data, test_size=0.2, random_state=42)
print(train.shape)
print(test.shape)

(120, 6)
(30, 6)

#17. Create stratified split based on class (titanic.csv).

import pandas as pd
from sklearn.model_selection import train_test_split

df = pd.read_csv("Titanic-Dataset.csv")
X = df.drop("Survived", axis=1)
y = df["Survived"]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)

#18. Randomly sample 100 rows (winequality.csv).

import pandas as pd

df = pd.read_csv("winequalityN.csv")
sampled = df.sample(n=100, random_state=42)
print(sampled)

type fixed acidity volatile acidity citric acid residual sugar \


3103 white 7.0 0.250 0.45 2.30
1419 white 7.6 0.140 0.74 1.60
4761 white 6.2 0.150 0.27 11.00
4690 white 6.7 0.160 0.32 12.50
4032 white 6.8 0.270 0.22 17.80
... ... ... ... ... ...
3244 white 6.7 0.210 0.36 8.55
1321 white 7.3 0.430 0.37 4.60
2585 white 6.8 0.180 0.30 12.80
2981 white 8.5 0.230 0.28 11.10
132 white 6.5 0.425 0.40 13.10
chlorides free sulfur dioxide total sulfur dioxide density pH \
3103 0.045 40.0 118.0 0.99064 3.16
1419 0.040 27.0 103.0 0.99160 3.07
4761 0.035 46.0 116.0 0.99602 3.12
4690 0.035 18.0 156.0 0.99666 2.88
4032 0.034 16.0 116.0 0.99890 3.07
... ... ... ... ... ...
3244 0.020 20.0 86.0 0.99146 3.19
1321 0.028 17.0 114.0 0.99100 3.23
2585 0.062 19.0 171.0 0.99808 3.00
2981 0.033 30.0 97.0 0.99507 3.03
132 0.038 59.0 241.0 0.99790 3.23

sulphates alcohol quality


3103 0.48 11.9 7
1419 0.40 10.8 7
4761 0.38 9.1 6
4690 0.36 9.0 6
4032 0.53 9.2 5
... ... ... ...
3244 0.22 13.4 7
1321 0.43 13.2 6
2585 0.52 9.0 7
2981 0.39 10.5 7
132 0.57 9.0 5

[100 rows x 13 columns]

keyboard_arrow_down F. Data exploration


#19. Find mean, median, mode of numeric columns (housing.csv).

import pandas as pd

df = pd.read_csv("housing.csv")
print("Mean:\n", df.mean(numeric_only=True))
print("Median:\n", df.median(numeric_only=True))
print("Mode:\n", df.mode(numeric_only=True).iloc[0])

Mean:
Avg. Area Income 6.858311e+04
Avg. Area House Age 5.977222e+00
Avg. Area Number of Rooms 6.987792e+00
Avg. Area Number of Bedrooms 3.981330e+00
Area Population 3.616352e+04
Price 1.232073e+06
dtype: float64
Median:
Avg. Area Income 6.880429e+04
Avg. Area House Age 5.970429e+00
Avg. Area Number of Rooms 7.002902e+00
Avg. Area Number of Bedrooms 4.050000e+00
Area Population 3.619941e+04
Price 1.232669e+06
dtype: float64
Mode:
Avg. Area Income 17796.631190
Avg. Area House Age 2.644304
Avg. Area Number of Rooms 3.236194
Avg. Area Number of Bedrooms 4.380000
Area Population 172.610686
Price 15938.657920
Name: 0, dtype: float64

#20. Calculate standard deviation and variance (winequality.csv).

import pandas as pd

df = pd.read_csv("winequalityN.csv", sep=None, engine="python")


num = df.apply(pd.to_numeric, errors="coerce").select_dtypes(include="number")

pop_variance = num.var(ddof=0)
pop_std = num.std(ddof=0)
sample_variance = num.var(ddof=1)
sample_std = num.std(ddof=1)

print("Population Variance:\n", pop_variance)


print("\nPopulation Standard Deviation:\n", pop_std)
print("\nSample Variance:\n", sample_variance)
print("\nSample Standard Deviation:\n", sample_std)
Population Variance:
type NaN
fixed acidity 1.681301
volatile acidity 0.027105
citric acid 0.021099
residual sugar 22.636265
chlorides 0.001227
free sulfur dioxide 314.992702
total sulfur dioxide 3194.228316
density 0.000009
pH 0.025836
sulphates 0.022142
alcohol 1.422342
quality 0.762457
dtype: float64

Population Standard Deviation:


type NaN
fixed acidity 1.296650
volatile acidity 0.164636
citric acid 0.145254
residual sugar 4.757758
chlorides 0.035033
free sulfur dioxide 17.748034
total sulfur dioxide 56.517505
density 0.002998
pH 0.160736
sulphates 0.148803
alcohol 1.192620
quality 0.873188
dtype: float64

Sample Variance:
type NaN
fixed acidity 1.681560
volatile acidity 0.027109
citric acid 0.021102
residual sugar 22.639751
chlorides 0.001228
free sulfur dioxide 315.041192
total sulfur dioxide 3194.720039
density 0.000009
pH 0.025840
sulphates 0.022146
alcohol 1.422561
quality 0.762575
dtype: float64

Sample Standard Deviation:


type NaN
fixed acidity 1.296750
volatile acidity 0.164649
citric acid 0.145265
residual sugar 4.758125
chlorides 0.035036
free sulfur dioxide 17.749400
total sulfur dioxide 56.521855
density 0.002999
H 0 160748

#21. Find skewness and kurtosis (housing.csv).

import pandas as pd
from scipy.stats import skew, kurtosis

df = pd.read_csv("housing.csv")
print("Skewness:\n", df.skew(numeric_only=True))
print("Kurtosis:\n", df.kurtosis(numeric_only=True))

Skewness:
Avg. Area Income -0.033720
Avg. Area House Age -0.007214
Avg. Area Number of Rooms -0.040996
Avg. Area Number of Bedrooms 0.376240
Area Population 0.050650
Price -0.002718
dtype: float64
Kurtosis:
Avg. Area Income 0.045574
Avg. Area House Age -0.083437
Avg. Area Number of Rooms -0.074652
Avg. Area Number of Bedrooms -0.701566
Area Population -0.006733
Price -0.054918
dtype: float64
#22. Check frequency counts of categorical variables (titanic.csv).

import pandas as pd

df = pd.read_csv("Titanic-Dataset.csv")
print(df.nunique())
print(df['Sex'].value_counts())
print(df['Embarked'].value_counts())
print(df['Pclass'].value_counts())

PassengerId 891
Survived 2
Pclass 3
Name 891
Sex 2
Age 88
SibSp 7
Parch 7
Ticket 681
Fare 248
Cabin 147
Embarked 3
dtype: int64
Sex
male 577
female 314
Name: count, dtype: int64
Embarked
S 644
C 168
Q 77
Name: count, dtype: int64
Pclass
3 491
1 216
2 184
Name: count, dtype: int64

#23. Display 5-number summary for “Age” (students.csv).

import pandas as pd
import io
from IPython.display import display
try:
df = pd.read_csv("student-data.csv")
except Exception:
from google.colab import files
uploaded = files.upload()
fn = next(iter(uploaded))
df = pd.read_csv(io.BytesIO(uploaded[fn]))
cols = list(df.columns)
age_col = None
for c in cols:
if str(c).strip().lower() == "age":
age_col = c
break
if age_col is None:
for c in cols:
if "age" in str(c).strip().lower():
age_col = c
break
if age_col is None:
print("No 'Age' column found. Columns:", cols)
else:
ages = pd.to_numeric(df[age_col], errors="coerce").dropna()
if ages.empty:
print("No numeric values found in Age column.")
else:
five_num = [float(ages.min()), float(ages.quantile(0.25)), float(ages.median()), float(ages.quantile
summary_df = pd.DataFrame([five_num], columns=["Min", "Q1", "Median", "Q3", "Max"])
display(summary_df)

Min Q1 Median Q3 Max

0 15.0 16.0 17.0 18.0 22.0

#24. Plot boxplot to detect outliers (iris.csv).


import pandas as pd
import matplotlib.pyplot as plt

df = pd.read_csv("Iris.csv")
df.boxplot(figsize=(10,6))
plt.show()

#25. Plot histogram for “Fare” distribution (titanic.csv).

import pandas as pd
import matplotlib.pyplot as plt

df = pd.read_csv("Titanic-Dataset.csv")
plt.hist(df["Fare"], bins=30)
plt.xlabel("Fare")
plt.ylabel("Frequency")
plt.title("Fare Distribution")
plt.show()

You might also like