Program 1
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
# Step 1: Load the California Housing dataset
housing_df = pd.read_csv("C:/Users/Mohammed sadiq/OneDrive/Desktop/python1/Datasets/housing (1).csv")
# Step 2: Create histograms for numerical features
numerical_features = housing_df.select_dtypes(include=[np.number]).columns
# Plot histograms
plt.figure(figsize=(15, 10))
for i, feature in enumerate(numerical_features):
    plt.subplot(3, 3, i + 1)
    sns.histplot(housing_df[feature], kde=True, bins=30, color='blue')
    plt.title(f'Distribution of {feature}')
plt.tight_layout()
plt.show()
# Step 3: Generate box plots for numerical features
plt.figure(figsize=(15, 10))
for i, feature in enumerate(numerical_features):
    plt.subplot(3, 3, i + 1)
    sns.boxplot(x=housing_df[feature], color='orange')
    plt.title(f'Box Plot of {feature}')
plt.tight_layout()
plt.show()
# Step 4: Identify outliers using the IQR method
print("Outliers Detection:")
outliers_summary = {}
for feature in numerical_features:
    Q1 = housing_df[feature].quantile(0.25)
    Q3 = housing_df[feature].quantile(0.75)
    IQR = Q3 - Q1
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR
    outliers = housing_df[(housing_df[feature] < lower_bound) | (housing_df[feature] > upper_bound)]
    outliers_summary[feature] = len(outliers)
    print(f"{feature}: {len(outliers)} outliers")
# Optional: Print a summary of the dataset
print("\nDataset Summary:")
print(housing_df.describe())
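If the CSV is not available at that path, the same analysis can run on the copy of the dataset bundled with scikit-learn. A minimal sketch, assuming scikit-learn is installed (note that the bundled version uses different column names than the CSV):
from sklearn.datasets import fetch_california_housing

# as_frame=True returns the data as a pandas DataFrame, ready for the plots above
housing_df = fetch_california_housing(as_frame=True).frame
print(housing_df.columns.tolist())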
Program 2
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
# Step 1: Load the California Housing Dataset
data = pd.read_csv("C:/Users/Mohammed sadiq/OneDrive/Desktop/python1/Datasets/housing (1).csv")
# Step 2: Compute the correlation matrix
correlation_matrix = data.corr(numeric_only=True)
# Step 3: Visualize the correlation matrix using a heatmap
plt.figure(figsize=(10, 8))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', fmt='.2f', linewidths=0.5)
plt.title('Correlation Matrix of California Housing Features')
plt.show()
# Step 4: Create a pair plot to visualize pairwise relationships
sns.pairplot(data, diag_kind='kde', plot_kws={'alpha': 0.5})
plt.suptitle('Pair Plot of California Housing Features', y=1.02)
plt.show()
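On the full dataset the pair plot is slow, since it draws a scatter for every pair of numeric columns over all rows. A minimal sketch of one way to keep it responsive; the sampled row count and the column subset below are illustrative choices, not part of the original program:
subset_cols = ['median_income', 'median_house_value', 'housing_median_age']  # hypothetical subset
sample = data[subset_cols].dropna().sample(n=1000, random_state=42)
sns.pairplot(sample, diag_kind='kde', plot_kws={'alpha': 0.5})
plt.show()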
Program 3
import numpy as np
import pandas as pd
from sklearn.datasets import load_iris
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
# Load the Iris dataset
iris = load_iris()
data = iris.data
labels = iris.target
label_names = iris.target_names
# Convert to a DataFrame for better visualization
iris_df = pd.DataFrame(data, columns=iris.feature_names)
# Perform PCA to reduce dimensionality to 2
pca = PCA(n_components=2)
data_reduced = pca.fit_transform(data)
# Create a DataFrame for the reduced data
reduced_df = pd.DataFrame(data_reduced, columns=['Principal Component 1', 'Principal Component 2'])
reduced_df['Label'] = labels
# Plot the reduced data
plt.figure(figsize=(8, 6))
colors = ['r', 'g', 'b']
for i, label in enumerate(np.unique(labels)):
    plt.scatter(
        reduced_df[reduced_df['Label'] == label]['Principal Component 1'],
        reduced_df[reduced_df['Label'] == label]['Principal Component 2'],
        label=label_names[label],
        color=colors[i]
    )
plt.title('PCA on Iris Dataset')
plt.xlabel('Principal Component 1')
plt.ylabel('Principal Component 2')
plt.legend()
plt.grid()
plt.show()
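To quantify how much of the original variance the two components retain, the fitted PCA object exposes explained_variance_ratio_:
# Fraction of total variance captured by each principal component
print("Explained variance ratio:", pca.explained_variance_ratio_)
print("Total variance retained:", pca.explained_variance_ratio_.sum())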
Program 4
import pandas as pd
def find_s_algorithm(file_path):
    data = pd.read_csv(file_path)
    print("Training data:")
    print(data)
    attributes = data.columns[:-1]
    class_label = data.columns[-1]
    # Start with no hypothesis; the first positive example becomes the most specific one.
    # (Initialising with '?' would wrongly re-specialise attributes already generalised.)
    hypothesis = None
    for index, row in data.iterrows():
        if row[class_label] == 'yes':
            if hypothesis is None:
                hypothesis = list(row[attributes])
            else:
                # Generalise every attribute that disagrees with this positive example
                for i, value in enumerate(row[attributes]):
                    if hypothesis[i] != value:
                        hypothesis[i] = '?'
    return hypothesis
file_path ="C:/Users/Mohammed sadiq/OneDrive/Desktop/python1/Datasets/data (1).csv"
hypothesis = find_s_algorithm(file_path)
print("\nThe final hypothesis is:", hypothesis)
Program 5
import numpy as np
import matplotlib.pyplot as plt
from collections import Counter
# Generate random data
data = np.random.rand(100)
labels = ["Class1" if x <= 0.5 else "Class2" for x in data[:50]]
# Define Euclidean distance function
def euclidean_distance(x1, x2):
    return abs(x1 - x2)
# Define k-NN classifier
def knn_classifier(train_data, train_labels, test_point, k):
    distances = [(euclidean_distance(test_point, train_data[i]), train_labels[i])
                 for i in range(len(train_data))]
    distances.sort(key=lambda x: x[0])
    k_nearest_neighbors = distances[:k]
    k_nearest_labels = [label for _, label in k_nearest_neighbors]
    return Counter(k_nearest_labels).most_common(1)[0][0]
# Split into training and testing data
train_data = data[:50]
train_labels = labels
test_data = data[50:]
# k values to test
k_values = [1, 2, 3, 4, 5, 20, 30]
print("--- k-Nearest Neighbors Classification ---")
print("Training dataset: First 50 points labeled based on the rule (x <= 0.5 -> Class1, x > 0.5 ->
Class2)")
print("Testing dataset: Remaining 50 points to be classified\n")
# Store classification results
results = {}
for k in k_values:
    print(f"Results for k = {k}:")
    classified_labels = [knn_classifier(train_data, train_labels, test_point, k)
                         for test_point in test_data]
    results[k] = classified_labels
    for i, label in enumerate(classified_labels, start=51):
        print(f"Point x{i} (value: {test_data[i - 51]:.4f}) is classified as {label}")
    print("\n")
print("Classification complete.\n")
# Plot results
for k in k_values:
    classified_labels = results[k]
    class1_points = [test_data[i] for i in range(len(test_data)) if classified_labels[i] == "Class1"]
    class2_points = [test_data[i] for i in range(len(test_data)) if classified_labels[i] == "Class2"]
    plt.figure(figsize=(10, 6))
    plt.scatter(train_data, [0] * len(train_data),
                c=["blue" if label == "Class1" else "red" for label in train_labels],
                label="Training Data", marker="o")
    plt.scatter(class1_points, [1] * len(class1_points), c="blue", label="Class1 (Test)", marker="x")
    plt.scatter(class2_points, [1] * len(class2_points), c="red", label="Class2 (Test)", marker="x")
    plt.title(f"k-NN Classification Results for k = {k}")
    plt.xlabel("Data Points")
    plt.ylabel("Classification Level")
    plt.legend()
    plt.grid(True)
    plt.show()
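Because the test points obey the same labelling rule as the training points, the true test labels are known and accuracy can be checked directly; a short sketch:
true_test_labels = ["Class1" if x <= 0.5 else "Class2" for x in test_data]
for k in k_values:
    correct = sum(p == t for p, t in zip(results[k], true_test_labels))
    print(f"k = {k}: accuracy = {correct / len(test_data):.2%}")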
Program 6
import numpy as np
import matplotlib.pyplot as plt
def lwr(x, X, y, tau):
    # Gaussian kernel weights, centred on the query point x
    W = np.diag(np.exp(-np.sum((X - x)**2, axis=1) / (2 * tau**2)))
    # Weighted least-squares solution of the normal equations
    theta = np.linalg.pinv(X.T @ W @ X) @ X.T @ W @ y
    return x @ theta
# Data
np.random.seed(42)
X = np.linspace(0, 2 * np.pi, 100)
y = np.sin(X) + 0.1 * np.random.randn(100)
Xb = np.c_[np.ones(X.shape), X]
# Prediction
x_test = np.linspace(0, 2 * np.pi, 200)
x_test_b = np.c_[np.ones(x_test.shape), x_test]
tau = 0.5
y_pred = np.array([lwr(xi, Xb, y, tau) for xi in x_test_b])
# Plot
plt.scatter(X, y, color='red', label='Training Data', alpha=0.7)
plt.plot(x_test, y_pred, color='blue', label=f'LWR Fit (tau={tau})')
plt.xlabel('X'); plt.ylabel('y'); plt.title('Locally Weighted Regression')
plt.legend(); plt.grid(True); plt.show()
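The bandwidth tau controls how local the fit is: small values chase the noise, large values flatten the curve. A quick way to see this is to overlay fits for a few bandwidths (the values below are illustrative):
plt.scatter(X, y, color='red', alpha=0.4, label='Training Data')
for tau_i in [0.1, 0.5, 2.0]:  # illustrative bandwidths
    y_tau = np.array([lwr(xi, Xb, y, tau_i) for xi in x_test_b])
    plt.plot(x_test, y_tau, label=f'tau={tau_i}')
plt.xlabel('X'); plt.ylabel('y'); plt.title('Effect of the bandwidth tau')
plt.legend(); plt.grid(True); plt.show()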
Program 7
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import PolynomialFeatures
# Load Boston Housing Dataset from CSV
# Make sure you have the correct column names
column_names = ['CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE',
                'DIS', 'RAD', 'TAX', 'PTRATIO', 'B', 'LSTAT', 'MEDV']
boston_data = pd.read_csv("C:/Users/Mohammed sadiq/OneDrive/Desktop/python1/Datasets/BostonHousing.csv",
                          header=None, names=column_names)
# Use a single feature (e.g., RM) for simplicity and visualization
X = boston_data[['RM']] # Average number of rooms per dwelling
y = boston_data['MEDV'] # Median value of owner-occupied homes
# Split into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# Train Linear Regression Model
model = LinearRegression()
model.fit(X_train, y_train)
# Predict
y_pred = model.predict(X_test)
# Visualization
plt.figure(figsize=(8, 5))
plt.scatter(X_test, y_test, color='blue', label='Actual')
plt.plot(X_test, y_pred, color='red', label='Predicted', linewidth=2)
plt.xlabel("Average Number of Rooms (RM)")
plt.ylabel("Median Value (MEDV)")
plt.title("Linear Regression on Boston Housing Data")
plt.legend()
plt.grid(True)
plt.show()
# Evaluation
print("Linear Regression:")
print("Mean Squared Error:", mean_squared_error(y_test, y_pred))
print("R^2 Score:", r2_score(y_test, y_pred))
# ---------- Part 2: Polynomial Regression with Auto MPG Dataset ----------
# Load dataset (Assume offline CSV file, adjust path if needed)
auto_data = pd.read_csv("C:/Users/Mohammed sadiq/OneDrive/Desktop/python1/Datasets/auto-mpg.csv")
# Clean data
auto_data.replace('?', np.nan, inplace=True)
auto_data.dropna(inplace=True)
auto_data['horsepower'] = auto_data['horsepower'].astype(float)
# Use horsepower to predict mpg
X_auto = auto_data[['horsepower']].values
y_auto = auto_data['mpg'].values
# Polynomial Features
poly = PolynomialFeatures(degree=2)
X_poly = poly.fit_transform(X_auto)
# Train/test split
X_train, X_test, y_train, y_test = train_test_split(X_poly, y_auto, test_size=0.2, random_state=42)
# Train model
poly_reg = LinearRegression()
poly_reg.fit(X_train, y_train)
# Predict
y_pred = poly_reg.predict(X_test)
# Sort for plotting
sort_idx = X_test[:, 1].argsort()
X_sorted = X_test[sort_idx][:, 1]
y_sorted = y_pred[sort_idx]
# Plot results
plt.figure(figsize=(8, 5))
plt.scatter(X_test[:, 1], y_test, color='green', label='Actual')
plt.plot(X_sorted, y_sorted, color='orange', linewidth=2, label='Predicted')
plt.title("Polynomial Regression - Auto MPG (Horsepower vs MPG)")
plt.xlabel("Horsepower")
plt.ylabel("Miles per Gallon (MPG)")
plt.legend()
plt.grid(True)
plt.show()
print("Polynomial Regression:")
print("Mean Squared Error:", mean_squared_error(y_test, y_pred))
print("R^2 Score:", r2_score(y_test, y_pred))
Program 8
import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.metrics import accuracy_score
# Load dataset
data = load_breast_cancer()
X, y = data.data, data.target
# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# Train Decision Tree
model = DecisionTreeClassifier(random_state=42)
model.fit(X_train, y_train)
# Predict & evaluate
y_pred = model.predict(X_test)
print(f"Model Accuracy: {accuracy_score(y_test, y_pred) * 100:.2f}%")
# Predict for a new sample
sample = X_test[0].reshape(1, -1)
result = model.predict(sample)
label = "Benign" if result[0] == 1 else "Malignant"
print(f"Predicted Class for the new sample: {label}")
# Plot the decision tree
plt.figure(figsize=(12, 8))
plot_tree(model, feature_names=data.feature_names, class_names=data.target_names, filled=True)
plt.title("Decision Tree - Breast Cancer Dataset")
plt.show()
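The fully grown tree plotted above is usually too deep to read. One common tweak, not part of the original program, is to cap max_depth, trading a little accuracy for an interpretable plot:
shallow = DecisionTreeClassifier(max_depth=3, random_state=42)  # depth chosen for readability
shallow.fit(X_train, y_train)
print(f"Shallow tree accuracy: {accuracy_score(y_test, shallow.predict(X_test)) * 100:.2f}%")
plt.figure(figsize=(12, 8))
plot_tree(shallow, feature_names=data.feature_names, class_names=data.target_names, filled=True)
plt.show()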
Program 9
import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import fetch_olivetti_faces
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
# Load dataset
data = fetch_olivetti_faces(shuffle=True, random_state=42)
X, y = data.data, data.target
# Split into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Train Naive Bayes model
model = GaussianNB()
model.fit(X_train, y_train)
# Predict and evaluate
y_pred = model.predict(X_test)
print(f'Accuracy: {accuracy_score(y_test, y_pred) * 100:.2f}%')
print("\nClassification Report:\n", classification_report(y_test, y_pred, zero_division=1))
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred))
# Cross-validation accuracy
cv_scores = cross_val_score(model, X, y, cv=5)
print(f'\nCross-validation Accuracy: {cv_scores.mean() * 100:.2f}%')
# Plot predictions
fig, axes = plt.subplots(3, 5, figsize=(12, 8))
for ax, image, label, pred in zip(axes.ravel(), X_test, y_test, y_pred):
    ax.imshow(image.reshape(64, 64), cmap='gray')
    ax.set_title(f"True: {label}, Pred: {pred}")
    ax.axis('off')
plt.show()
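GaussianNB treats all 4,096 pixels as conditionally independent features, which is a strong assumption for images. A common variation, not part of the original program, is to compress the faces with PCA before the classifier:
from sklearn.decomposition import PCA
from sklearn.pipeline import make_pipeline

# 100 components is an illustrative choice
pca_nb = make_pipeline(PCA(n_components=100, random_state=42), GaussianNB())
pca_nb.fit(X_train, y_train)
print(f"PCA + GaussianNB accuracy: {pca_nb.score(X_test, y_test) * 100:.2f}%")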
Program 10
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.datasets import load_breast_cancer
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.metrics import confusion_matrix, classification_report
data = load_breast_cancer()
X = data.data
y = data.target
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
kmeans = KMeans(n_clusters=2, random_state=42)
y_kmeans = kmeans.fit_predict(X_scaled)
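# Note: K-Means cluster ids (0/1) are arbitrary and may be flipped relative to y;
# see the label-alignment sketch at the end of this program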
print("Confusion Matrix:")
print(confusion_matrix(y, y_kmeans))
print("\nClassification Report:")
print(classification_report(y, y_kmeans))
pca = PCA(n_components=2)
X_pca = pca.fit_transform(X_scaled)
df = pd.DataFrame(X_pca, columns=['PC1', 'PC2'])
df['Cluster'] = y_kmeans
df['True Label'] = y
plt.figure(figsize=(8, 6))
sns.scatterplot(data=df, x='PC1', y='PC2', hue='Cluster', palette='Set1', s=100,
                edgecolor='black', alpha=0.7)
plt.title('K-Means Clustering of Breast Cancer Dataset')
plt.xlabel('Principal Component 1')
plt.ylabel('Principal Component 2')
plt.legend(title="Cluster")
plt.show()
plt.figure(figsize=(8, 6))
sns.scatterplot(data=df, x='PC1', y='PC2', hue='True Label', palette='coolwarm',
                s=100, edgecolor='black', alpha=0.7)
plt.title('True Labels of Breast Cancer Dataset')
plt.xlabel('Principal Component 1')
plt.ylabel('Principal Component 2')
plt.legend(title="True Label")
plt.show()
plt.figure(figsize=(8, 6))
sns.scatterplot(data=df, x='PC1', y='PC2', hue='Cluster', palette='Set1', s=100,
                edgecolor='black', alpha=0.7)
centers = pca.transform(kmeans.cluster_centers_)
plt.scatter(centers[:, 0], centers[:, 1], s=200, c='red', marker='X', label='Centroids')
plt.title('K-Means Clustering with Centroids')
plt.xlabel('Principal Component 1')
plt.ylabel('Principal Component 2')
plt.legend(title="Cluster")
plt.show()
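A caveat on the confusion matrix and classification report above: K-Means assigns cluster ids arbitrarily, so cluster 0 may correspond to class 1 and the report can look far worse than the clustering really is. A minimal sketch of aligning cluster ids to true labels by majority vote before scoring:
# Map each cluster id to the most common true label inside that cluster
aligned = np.zeros_like(y_kmeans)
for cluster in np.unique(y_kmeans):
    mask = y_kmeans == cluster
    aligned[mask] = np.bincount(y[mask]).argmax()
print(classification_report(y, aligned))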