0% found this document useful (0 votes)
69 views7 pages

Real Estate Data Insights

This document analyzes housing data from India. It begins by importing necessary libraries and reading in a CSV file containing housing data. Some initial data preprocessing steps are performed, including handling missing values, dropping unnecessary features, and encoding categorical features. Exploratory data analysis is conducted through visualizations of categorical and numerical features. Correlations between features are also analyzed through a heatmap. The goal of this analysis is to understand patterns in the Indian housing data.

Uploaded by

mellouk ayoub
Copyright
© All Rights Reserved
We take content rights seriously. If you suspect this is your content, claim it here.
Available Formats
Download as PDF, TXT or read online on Scribd
0% found this document useful (0 votes)
69 views7 pages

Real Estate Data Insights

This document analyzes housing data from India. It begins by importing necessary libraries and reading in a CSV file containing housing data. Some initial data preprocessing steps are performed, including handling missing values, dropping unnecessary features, and encoding categorical features. Exploratory data analysis is conducted through visualizations of categorical and numerical features. Correlations between features are also analyzed through a heatmap. The goal of this analysis is to understand patterns in the Indian housing data.

Uploaded by

mellouk ayoub
Copyright
© All Rights Reserved
We take content rights seriously. If you suspect this is your content, claim it here.
Available Formats
Download as PDF, TXT or read online on Scribd
You are on page 1/ 7

IndianHousingAnalysis By Ahmad Raza

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Load the housing data from the CSV file (path is relative to the working directory).
df = pd.read_csv('IndianHouses.csv')

# Preview the first rows of the loaded table.
df.head()

   Area   BHK  Bathroom  Furnishing      Locality                                            Parking  Price     Status         Transaction   Type           Per_Sqft
0  800.0  3    2.0       Semi-Furnished  Rohini Sector 25                                    1.0      6500000   Ready_to_move  New_Property  Builder_Floor  NaN
1  750.0  2    2.0       Semi-Furnished  J R Designers Floors, Rohini Sector 24              1.0      5000000   Ready_to_move  New_Property  Apartment      6667.0
2  950.0  2    2.0       Furnished       Citizen Apartment, Rohini Sector 13                 1.0      15500000  Ready_to_move  Resale        Apartment      6667.0
3  600.0  2    2.0       Semi-Furnished  Rohini Sector 24                                    1.0      4200000   Ready_to_move  Resale        Builder_Floor  6667.0
4  650.0  2    2.0       Semi-Furnished  Rohini Sector 24 carpet area 650 sqft status R...   1.0      6200000   Ready_to_move  New_Property  Builder_Floor  6667.0

Data Preprocessing Part 1


# Count the distinct values in each object-dtype column.
df.select_dtypes(include='object').nunique()

Furnishing 3
Locality 365
Status 2
Transaction 2
Type 2
dtype: int64

# Display the raw Locality column.
df['Locality']

0 Rohini Sector 25
1 J R Designers Floors, Rohini Sector 24
2 Citizen Apartment, Rohini Sector 13
3 Rohini Sector 24
4 Rohini Sector 24 carpet area 650 sqft status R...
...
1254 Chittaranjan Park
1255 Chittaranjan Park
1256 Chittaranjan Park
1257 Chittaranjan Park Block A
1258 Chittaranjan Park
Name: Locality, Length: 1259, dtype: object

# We do not need the full locality text, only a coarse label, so split each
# entry on spaces and preview the second token (index 1).
df['Locality'].str.split(' ').str[1]

0 Sector
1 R
2 Apartment,
3 Sector
4 Sector
...
1254 Park
1255 Park
1256 Park
1257 Park
1258 Park
Name: Locality, Length: 1259, dtype: object

# Replace each locality with the second space-separated token of its text.
locality_tokens = df['Locality'].str.split(' ')
df['Locality'] = locality_tokens.str[1]

# The split leaves fewer distinct values; chart the 50 most frequent of them.
plt.figure(figsize=(10, 5))
locality_counts = df['Locality'].value_counts()
locality_counts.head(50).plot(kind='bar')
plt.show()

# Number of distinct locality labels remaining after the split.
df['Locality'].nunique()

119

# The number of distinct Locality values is lower than before, but still too
# many to handle, so the column is removed.
df = df.drop(columns=['Locality'])

Handle Missing Values


# Count the missing (NaN) entries in every column.
df.isnull().sum()

Area 0
BHK 0
Bathroom 2
Furnishing 5
Parking 33
Price 0
Status 0
Transaction 0
Type 5
Per_Sqft 241
dtype: int64

# Per_Sqft is numerical, so its missing entries are replaced by the column mean.
per_sqft_mean = df['Per_Sqft'].mean()
df['Per_Sqft'] = df['Per_Sqft'].fillna(per_sqft_mean)

# These columns are treated as categorical, so each one is filled with its
# most frequent value (the first mode).
for column_name in ('Bathroom', 'Parking', 'Furnishing', 'Type'):
    most_common = df[column_name].mode()[0]
    df[column_name] = df[column_name].fillna(most_common)

Handle DataTypes
# Summarise column dtypes and non-null counts.
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1259 entries, 0 to 1258
Data columns (total 10 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 Area 1259 non-null float64
1 BHK 1259 non-null int64
2 Bathroom 1259 non-null float64
3 Furnishing 1259 non-null object
4 Parking 1259 non-null float64
5 Price 1259 non-null int64
6 Status 1259 non-null object
7 Transaction 1259 non-null object
8 Type 1259 non-null object
9 Per_Sqft 1259 non-null float64
dtypes: float64(4), int64(2), object(4)
memory usage: 98.5+ KB

# Bathroom and Parking are stored as floats; cast both to integers in one call.
df = df.astype({'Bathroom': 'int', 'Parking': 'int'})

Exploratory Data Analysis


# Count plots for every categorical column, laid out on a 2x2 grid.
cat_vars = ['Furnishing', 'Status', 'Transaction', 'Type']
num_cols = len(cat_vars)

fig, axs = plt.subplots(nrows=2, ncols=2, figsize=(15, 10))
axs = axs.flatten()

# One horizontal count plot per categorical column.
for i, var in enumerate(cat_vars):
    sns.countplot(y=var, data=df, ax=axs[i])
    axs[i].set_title(var)

# Remove any axes that did not receive a plot; the slice is empty when the
# grid is exactly full, so no explicit length check is needed.
for unused_ax in axs[num_cols:]:
    fig.delaxes(unused_ax)

fig.tight_layout()
plt.show()

# Histograms of every int/float column.
int_vars = df.select_dtypes(include=['int', 'float'])

# Number of numeric *columns*. len() of a DataFrame returns the row count,
# which would make the unused-axes cleanup below never run.
num_cols = int_vars.shape[1]

# Two plots per row, with just enough rows to hold every column.
n_rows = max(1, (num_cols + 1) // 2)
fig, axs = plt.subplots(nrows=n_rows, ncols=2, figsize=(15, 10))
axs = axs.flatten()

# Iterating over a DataFrame yields its column labels.
for i, var in enumerate(int_vars):
    df[var].plot(kind='hist', ax=axs[i])
    axs[i].set_title(var)

# Remove any axes that did not receive a plot.
for unused_ax in axs[num_cols:]:
    fig.delaxes(unused_ax)

fig.tight_layout()
plt.show()

# Exploratory data analysis with KDE (kernel density estimation):
# a histogram with a KDE overlay for every int/float column.
num = df.select_dtypes(include=['int', 'float']).columns.tolist()
col = len(num)

# Two plots per row. Using nrows=col here would allocate twice as many axes
# as needed and leave half of the figure empty once the extras are deleted.
n_rows = max(1, (col + 1) // 2)
fig, axs = plt.subplots(nrows=n_rows, ncols=2, figsize=(15, 20))
axs = axs.flatten()

for i, var in enumerate(num):
    sns.histplot(data=df, x=var, kde=True, ax=axs[i])
    axs[i].set_title(var)

# Remove the trailing axis left over when the column count is odd.
for unused_ax in axs[col:]:
    fig.delaxes(unused_ax)

fig.tight_layout()
plt.show()
# Exploratory data analysis with box plots to identify outliers:
# one box plot for every int/float column.
num = df.select_dtypes(include=['int', 'float']).columns.tolist()
col = len(num)

# Two plots per row. Using nrows=col here would allocate twice as many axes
# as needed and leave half of the figure empty once the extras are deleted.
n_rows = max(1, (col + 1) // 2)
fig, axs = plt.subplots(nrows=n_rows, ncols=2, figsize=(15, 20))
axs = axs.flatten()

for i, var in enumerate(num):
    sns.boxplot(data=df, x=var, ax=axs[i])
    axs[i].set_title(var)

# Remove the trailing axis left over when the column count is odd.
for unused_ax in axs[col:]:
    fig.delaxes(unused_ax)

fig.tight_layout()
plt.show()
# EDA against the dependent feature Price: one bar plot of Price per
# categorical column.
cat = ['Furnishing', 'Status', 'Transaction', 'Type']
col = len(cat)

# Two plots per row. Using nrows=col here would allocate twice as many axes
# as needed and leave half of the figure empty once the extras are deleted.
n_rows = max(1, (col + 1) // 2)
fig, axs = plt.subplots(nrows=n_rows, ncols=2, figsize=(15, 15))
axs = axs.flatten()

for i, var in enumerate(cat):
    sns.barplot(x='Price', y=var, data=df, ax=axs[i])
    axs[i].set_title(var)

# Remove the trailing axis left over when the column count is odd.
for unused_ax in axs[col:]:
    fig.delaxes(unused_ax)

fig.tight_layout()
plt.show()

Data Preprocessing Part 2


# Print the unique values of every object-dtype column so they can be encoded.
for col in df.select_dtypes(include='object').columns:
    print(f'{col}: {df[col].unique()}')

Furnishing: ['Semi-Furnished' 'Furnished' 'Unfurnished']


Status: ['Ready_to_move' 'Almost_ready']
Transaction: ['New_Property' 'Resale']
Type: ['Builder_Floor' 'Apartment']

# Encode every object-dtype column with LabelEncoder and print the resulting codes.
# NOTE(review): scikit-learn documents LabelEncoder as intended for target
# labels; confirm whether OrdinalEncoder/one-hot is preferred for these features.
from sklearn import preprocessing

for col in df.select_dtypes(include=['object']).columns:
    label_encoder = preprocessing.LabelEncoder()
    # fit_transform learns the classes from the column and maps it in one step,
    # replacing the separate fit-on-unique-values and transform calls.
    df[col] = label_encoder.fit_transform(df[col])
    print(f'{col} : {df[col].unique()}')

Furnishing : [1 0 2]
Status : [1 0]
Transaction : [0 1]
Type : [1 0]

# Correlation heatmap: pairwise correlations between the DataFrame's columns,
# annotated with their values.
correlation_matrix = df.corr()
plt.figure(figsize=(15, 10))
sns.heatmap(correlation_matrix, annot=True, fmt='.2g')
plt.show()

Loading [MathJax]/jax/output/CommonHTML/fonts/TeX/fontdata.js

You might also like