1. Create NumPy arrays from Python Data Structures, Intrinsic NumPy Objects and Random Functions.
import numpy as np
# From Python Data Structures
list_data = np.array([1, 2, 3, 4, 5])
tuple_data = np.array((1, 2, 3, 4, 5))
dict_data = np.array(list({'a': 1, 'b': 2}.values()))  # build the array from the dict's values
# Intrinsic NumPy Objects
zeros = np.zeros((3, 3))
ones = np.ones((2, 2))
identity = np.eye(4)
arange = np.arange(0, 10, 2)
linspace = np.linspace(0, 1, 5)
# Random Functions
rand = np.random.rand(3, 3)
randint = np.random.randint(1, 10, size=(2, 3))
normal = np.random.normal(0, 1, 5)
# Output
print("From Python Data Structures:")
print("List to Array:", list_data)
print("Tuple to Array:", tuple_data)
print("Dict to Array:", dict_data)
print("\nIntrinsic NumPy Objects:")
print("Zeros Array:", zeros)
print("Ones Array:", ones)
print("Identity Matrix:", identity)
print("Arange:", arange)
print("Linspace:", linspace)
print("\nRandom Functions:")
print("Random Array (Uniform):", rand)
print("Random Integers:", randint)
print("Random Normal Distribution:", normal)
2. Create Pandas Series and DataFrame from various inputs.
import pandas as pd
import numpy as np
# Pandas Series from different inputs
series_from_list = pd.Series([1, 2, 3, 4, 5])
series_from_dict = pd.Series({'a': 1, 'b': 2, 'c': 3})
series_from_numpy = pd.Series(np.array([10, 20, 30, 40]))
# Pandas DataFrame from various inputs
df_from_dict = pd.DataFrame({'A': [1, 2, 3], 'B': [4, 5, 6]})
df_from_list = pd.DataFrame([[1, 2], [3, 4], [5, 6]], columns=['X', 'Y'])
df_from_numpy = pd.DataFrame(np.random.rand(3, 4), columns=['W', 'X', 'Y', 'Z'])
# Outputs
print("Series from List:", series_from_list)
print("Series from Dict:", series_from_dict)
print("Series from Numpy Array:", series_from_numpy)
print("\nDataFrame from Dict:")
print(df_from_dict)
print("\nDataFrame from List:")
print(df_from_list)
print("\nDataFrame from Numpy Array:")
print(df_from_numpy)
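# Optional extension (a small sketch beyond the inputs shown above): a DataFrame can also
# be built from a list of dictionaries or from a dictionary of Series.
df_from_records = pd.DataFrame([{'A': 1, 'B': 4}, {'A': 2, 'B': 5}, {'A': 3, 'B': 6}])
df_from_series = pd.DataFrame({'col1': pd.Series([1, 2, 3]), 'col2': pd.Series([4.0, 5.0, 6.0])})
print("\nDataFrame from List of Dicts:")
print(df_from_records)
print("\nDataFrame from Dict of Series:")
print(df_from_series)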
3, 4. Develop a simple linear regression model and perform residual analysis.
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.datasets import fetch_california_housing # Use California housing dataset
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
# Load California housing dataset
california = fetch_california_housing()
X = california.data[:, california.feature_names.index('AveRooms')].reshape(-1, 1)  # 'AveRooms' (average rooms per household)
y = california.target
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# Fit the simple linear regression model
model = LinearRegression()
model.fit(X_train, y_train)
# Predict the target values using the test set
y_pred = model.predict(X_test)
# Calculate residuals
residuals = y_test - y_pred
# Residual Analysis
# 1. Plotting residuals vs. fitted values (predictions)
plt.figure(figsize=(12, 6))
plt.subplot(1, 2, 1)
plt.scatter(y_pred, residuals, color='blue')
plt.axhline(y=0, color='red', linestyle='--')
plt.title('Residuals vs Fitted Values')
plt.xlabel('Fitted Values')
plt.ylabel('Residuals')
# 2. Plotting histogram of residuals
plt.subplot(1, 2, 2)
sns.histplot(residuals, kde=True, color='green')
plt.title('Histogram of Residuals')
plt.xlabel('Residuals')
plt.ylabel('Frequency')
plt.tight_layout()
plt.show()
# 3. Checking Normality with a Q-Q plot
import scipy.stats as stats
plt.figure(figsize=(6, 6))
stats.probplot(residuals, dist="norm", plot=plt)
plt.title('Q-Q Plot of Residuals')
plt.show()
# 4. Checking Homoscedasticity: Residuals vs Fitted values (already shown above)
# 5. Print model performance metrics
print(f"Mean Squared Error: {mean_squared_error(y_test, y_pred)}")
print(f"R-squared: {r2_score(y_test, y_pred)}")
5. Import any CSV file to Pandas DataFrame and perform the following:
(a) Handle missing data by detecting and dropping/filling missing values.
(b) Transform data using the apply() and map() methods.
import pandas as pd
import numpy as np
file_path = r'C:\Users\peral\Downloads\test.csv' # Use raw string for Windows paths
# Load the CSV file
df = pd.read_csv(file_path)
print(df.head())
# (a) Handle Missing Data
print("\nMissing Data Detection:")
print(df.isnull().sum())
# Dropping rows with missing values
df_dropped = df.dropna()
print("\nDataFrame after Dropping Rows with Missing Values:")
print(df_dropped.head())
# Filling missing values
df_filled = df.copy()
for column in df.columns:
    if df[column].dtype == 'object':  # categorical columns: fill with the mode
        df_filled[column] = df[column].fillna(df[column].mode()[0])
    else:  # numerical columns: fill with the mean
        df_filled[column] = df[column].fillna(df[column].mean())
print("\nDataFrame after Filling Missing Values:")
print(df_filled.head())
# (b) Transform Data using apply() and map()
# Apply transformation for numerical columns
for column in df.columns:
    if df[column].dtype != 'object':  # apply transformation to numerical columns only
        median_value = df[column].median()
        df[column + ' Group'] = df[column].apply(lambda x: 'High' if x > median_value else 'Low')
        print(f"\nDataFrame after Applying Transformation to {column}:")
        print(df[[column, column + ' Group']].head())
# Map categorical columns to uppercase
for column in df.select_dtypes(include=['object']).columns:
    df[column] = df[column].map(lambda x: x.upper() if isinstance(x, str) else x)
    print(f"\nDataFrame after Mapping {column} to Uppercase:")
    print(df[[column]].head())
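# Optional extension with a small inline DataFrame (independent of the CSV above),
# illustrating apply() on a column and map() with a dictionary of labels.
demo = pd.DataFrame({'score': [35, 72, 58, 90], 'grade': ['c', 'a', 'b', 'a']})
demo['score_scaled'] = demo['score'].apply(lambda x: x / 100)                  # element-wise transform
demo['grade_label'] = demo['grade'].map({'a': 'Excellent', 'b': 'Good', 'c': 'Average'})
print("\nSmall apply()/map() demo:")
print(demo)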
6. Visualize data using Line Plots, Bar Plots, Histograms, Density Plots and Scatter Plots.
import seaborn as sns
import matplotlib.pyplot as plt
# Load the built-in Iris dataset
df = sns.load_dataset('iris')
# Line Plot
plt.figure(figsize=(10, 6))
sns.lineplot(x=df.index, y=df['sepal_length'])
plt.title('Line Plot of Sepal Length')
plt.show()
# Bar Plot
plt.figure(figsize=(10, 6))
sns.barplot(x='species', y='sepal_length', data=df)
plt.title('Bar Plot of Sepal Length by Species')
plt.show()
# Histogram
plt.figure(figsize=(10, 6))
sns.histplot(df['sepal_length'], kde=False, bins=20)
plt.title('Histogram of Sepal Length')
plt.show()
# Density Plot (fill=True shades the area under the curve)
plt.figure(figsize=(10, 6))
sns.kdeplot(df['sepal_length'], fill=True)
plt.title('Density Plot of Sepal Length')
plt.show()
# Scatter Plot
plt.figure(figsize=(10, 6))
sns.scatterplot(x='sepal_length', y='sepal_width', hue='species', data=df)
plt.title('Scatter Plot of Sepal Length vs Sepal Width')
plt.show()
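# Optional extension: a Seaborn pairplot combines scatter plots and distributions
# for all numeric columns of the Iris dataset in a single figure.
sns.pairplot(df, hue='species')
plt.suptitle('Pair Plot of Iris Features', y=1.02)
plt.show()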
7. Manipulation of NumPy arrays - Indexing, Slicing, Reshaping, Joining and Splitting.
import numpy as np
# Indexing
arr = np.array([1, 2, 3, 4, 5])
print(arr[2])
# Slicing
arr2 = np.array([10, 20, 30, 40, 50, 60])
print(arr2[1:4])
# Reshaping
arr3 = np.array([1, 2, 3, 4, 5, 6])
print(arr3.reshape(2, 3))
# Joining
arr4 = np.array([1, 2, 3])
arr5 = np.array([4, 5, 6])
print(np.concatenate((arr4, arr5)))
# Splitting
arr6 = np.array([10, 20, 30, 40, 50, 60])
print(np.split(arr6, 3))
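# Optional extension: joining along an axis and splitting into unequal parts.
arr7 = np.array([[1, 2, 3], [4, 5, 6]])
arr8 = np.array([[7, 8, 9], [10, 11, 12]])
print(np.vstack((arr7, arr8)))   # join row-wise -> shape (4, 3)
print(np.hstack((arr7, arr8)))   # join column-wise -> shape (2, 6)
print(np.array_split(arr6, 4))   # unlike np.split, allows unequal-sized pieces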
8. Import any CSV file to Pandas DataFrame and perform the following:
(a) Visualize the first and last 10 records.
(b) Get the shape, index and column details.
import pandas as pd
# Import CSV file
df = pd.read_csv(r"C:\Users\peral\Downloads\train.csv") # Make sure the path is correct
# (a) Visualize the first and last 10 records
print("First 10 records:")
print(df.head(10))
print("\nLast 10 records:")
print(df.tail(10))
# (b) Get the shape, index, and column details
print("\nShape:", df.shape)
print("Index:", df.index)
print("Columns:", df.columns)
9. Residual plots of linear regression.
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
# Load data
df = pd.read_csv(r'C:\Users\peral\Downloads\train.csv')  # Make sure the path is correct
# Choose the feature and target columns
X = df[['age']].values # Use 'age' as the feature (replace with any other numerical column you prefer)
y = df['stroke'].values # Use 'stroke' as the target column
# Train linear regression model
model = LinearRegression()
model.fit(X, y)
# Predictions
y_pred = model.predict(X)
# Residuals
residuals = y - y_pred
# Plot residuals
plt.scatter(X, residuals)
plt.axhline(y=0, color='r', linestyle='--')
plt.xlabel('Age')
plt.ylabel('Residuals')
plt.title('Residual Plot')
plt.show()
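# mean_squared_error is imported above but unused; printing it quantifies the spread
# that is visible in the residual plot.
print(f"Mean Squared Error: {mean_squared_error(y, y_pred):.4f}")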
10. Computation on NumPy arrays using Universal Functions and Mathematical methods.
import numpy as np
# Create array
arr = np.array([1, 2, 3, 4, 5])
# Universal functions (ufuncs)
print(np.sqrt(arr))
print(np.exp(arr))
print(np.log(arr))
# Mathematical methods
print(np.sum(arr))
print(np.mean(arr))
print(np.median(arr))
print(np.std(arr))
print(np.var(arr))
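# Optional extension: the same mathematical methods accept an axis argument on 2-D arrays.
matrix = np.array([[1, 2, 3], [4, 5, 6]])
print(np.sum(matrix, axis=0))   # column sums -> [5 7 9]
print(np.mean(matrix, axis=1))  # row means -> [2. 5.]
print(np.cumsum(arr))           # running total of the 1-D array above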
11. Import any CSV file to Pandas DataFrame and perform the following:
(a) Detect and filter outliers.
(b) Perform Vectorized String operations on Pandas Series.
import pandas as pd
import numpy as np
df = pd.read_csv(r'C:\Users\peral\Downloads\train.csv')
column_name = 'age'
Q1 = df[column_name].quantile(0.25)
Q3 = df[column_name].quantile(0.75)
IQR = Q3 - Q1
filtered_df = df[(df[column_name] >= (Q1 - 1.5 * IQR)) & (df[column_name] <= (Q3 + 1.5 * IQR))]
print("Filtered DataFrame (Outliers removed):")
print(filtered_df)
string_column = 'gender'
df[string_column] = df[string_column].str.lower()
df[string_column] = df[string_column].str.replace('old', 'new')
print("\nDataFrame after string operations:")
print(df[[string_column]].head())
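# Optional extension: a z-score rule is an alternative outlier filter, and a few more
# vectorized string operations on the same column. The 3-sigma cutoff is an example choice.
z_scores = (df[column_name] - df[column_name].mean()) / df[column_name].std()
print("\nRows flagged as outliers by |z| > 3:", (z_scores.abs() > 3).sum())
print(df[string_column].str.len().head())                        # string length per row
print(df[string_column].str.contains('male', na=False).head())   # substring test (also True for 'female')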
12. Download the House Pricing dataset from Kaggle and map the values to aesthetics.
import pandas as pd
file_path = 'path_to_your_downloaded_folder/train.csv'
df = pd.read_csv(file_path)
aesthetic_mapping = {
    'ExterCond': {'Po': 'Poor', 'Fa': 'Fair', 'TA': 'Average', 'Gd': 'Good', 'Ex': 'Excellent'},
    'ExterQual': {'Po': 'Poor', 'Fa': 'Fair', 'TA': 'Average', 'Gd': 'Good', 'Ex': 'Excellent'},
    'PoolQC': {'NA': 'No Pool', 'Ex': 'Excellent', 'Gd': 'Good', 'TA': 'Average', 'Fa': 'Fair'},
}  # close the mapping dictionary before using it
for column, mapping in aesthetic_mapping.items():
    if column in df.columns:
        df[column] = df[column].map(mapping).fillna(df[column])  # keep the original value if no mapping applies
print(df[['ExterCond', 'ExterQual', 'PoolQC']].head())
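# Optional extension: value_counts() confirms that the codes were replaced by the
# descriptive labels defined in aesthetic_mapping.
for column in ['ExterCond', 'ExterQual', 'PoolQC']:
    print(f"\n{column} value counts after mapping:")
    print(df[column].value_counts(dropna=False))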