1.

import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.datasets import fetch_california_housing

# Step 1: Load the California Housing dataset
data = fetch_california_housing(as_frame=True)
housing_df = data.frame

# Step 2: Create histograms for numerical features
numerical_features = housing_df.select_dtypes(include=[np.number]).columns

# Plot histograms
plt.figure(figsize=(15, 10))
for i, feature in enumerate(numerical_features):
    plt.subplot(3, 3, i + 1)
    sns.histplot(housing_df[feature], kde=True, bins=30, color='blue')
    plt.title(f'Distribution of {feature}')
plt.tight_layout()
plt.show()

# Step 3: Generate box plots for numerical features
plt.figure(figsize=(15, 10))
for i, feature in enumerate(numerical_features):
    plt.subplot(3, 3, i + 1)
    sns.boxplot(x=housing_df[feature], color='orange')
    plt.title(f'Box Plot of {feature}')
plt.tight_layout()
plt.show()

# Step 4: Identify outliers using the IQR method
print("Outliers Detection:")
outliers_summary = {}
for feature in numerical_features:
    Q1 = housing_df[feature].quantile(0.25)
    Q3 = housing_df[feature].quantile(0.75)
    IQR = Q3 - Q1
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR
    outliers = housing_df[(housing_df[feature] < lower_bound) | (housing_df[feature] > upper_bound)]
    outliers_summary[feature] = len(outliers)
    print(f"{feature}: {len(outliers)} outliers")

# Optional: Print a summary of the dataset
print("\nDataset Summary:")
print(housing_df.describe())

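# Optional extension (assumed addition, not part of the original listing): the
# outliers_summary dict built above can also be shown as a bar chart, which makes
# it easy to see which features are most affected by the 1.5*IQR rule.
plt.figure(figsize=(10, 5))
plt.bar(list(outliers_summary.keys()), list(outliers_summary.values()), color='steelblue')
plt.title('Number of IQR Outliers per Feature')
plt.ylabel('Outlier count')
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()
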
2.

import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.datasets import fetch_california_housing

# Step 1: Load the California Housing Dataset
california_data = fetch_california_housing(as_frame=True)
data = california_data.frame

# Step 2: Compute the correlation matrix
correlation_matrix = data.corr()

# Step 3: Visualize the correlation matrix using a heatmap
plt.figure(figsize=(10, 8))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', fmt='.2f', linewidths=0.5)
plt.title('Correlation Matrix of California Housing Features')
plt.show()

# Step 4: Create a pair plot to visualize pairwise relationships
sns.pairplot(data, diag_kind='kde', plot_kws={'alpha': 0.5})
plt.suptitle('Pair Plot of California Housing Features', y=1.02)
plt.show()

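# Note (assumed workaround, not in the original listing): the dataset has about
# 20,000 rows, so the full pair plot can take a while to render. Plotting a
# random sample gives a very similar picture much faster:
# sns.pairplot(data.sample(1000, random_state=42), diag_kind='kde', plot_kws={'alpha': 0.5})
# plt.show()
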
3.

import numpy as np
import pandas as pd
from sklearn.datasets import load_iris
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt

# Load the Iris dataset
iris = load_iris()
data = iris.data
labels = iris.target
label_names = iris.target_names

# Convert to a DataFrame for better visualization
iris_df = pd.DataFrame(data, columns=iris.feature_names)

# Perform PCA to reduce dimensionality to 2 components
pca = PCA(n_components=2)
data_reduced = pca.fit_transform(data)

# Create a DataFrame for the reduced data
reduced_df = pd.DataFrame(data_reduced, columns=['Principal Component 1', 'Principal Component 2'])
reduced_df['Label'] = labels

# Plot the reduced data
plt.figure(figsize=(8, 6))
colors = ['r', 'g', 'b']
for i, label in enumerate(np.unique(labels)):
    plt.scatter(
        reduced_df[reduced_df['Label'] == label]['Principal Component 1'],
        reduced_df[reduced_df['Label'] == label]['Principal Component 2'],
        label=label_names[label],
        color=colors[i]
    )

plt.title('PCA on Iris Dataset')
plt.xlabel('Principal Component 1')
plt.ylabel('Principal Component 2')
plt.legend()
plt.grid()
plt.show()

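# Optional check (assumed addition, not in the original listing): report how much
# of the original variance the two principal components retain.
print("Explained variance ratio:", pca.explained_variance_ratio_)
print("Total variance retained:", pca.explained_variance_ratio_.sum())
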
4.

import pandas as pd

def find_s_algorithm(file_path):
    data = pd.read_csv(file_path)
    print("Training data:")
    print(data)

    attributes = data.columns[:-1]
    class_label = data.columns[-1]

    # Initialize the hypothesis with the most specific value for each attribute
    hypothesis = ['phi' for _ in attributes]

    # Iterate through each training instance
    for index, row in data.iterrows():
        if row[class_label] == 'Yes':  # Consider only positive examples
            for i, value in enumerate(row[attributes]):
                if hypothesis[i] == 'phi':
                    # First positive example: adopt its attribute value
                    hypothesis[i] = value
                elif hypothesis[i] != value:
                    # Conflicting value seen later: generalize to '?'
                    hypothesis[i] = '?'

    return hypothesis

# Example usage
file_path = 'training_data.csv'
hypothesis = find_s_algorithm(file_path)
print("\nThe final hypothesis is:", hypothesis)

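# Note: the script assumes 'training_data.csv' contains categorical attribute
# columns followed by a final Yes/No class column. A hypothetical example file
# (the classic EnjoySport data, shown only for illustration) could be:
#
# Sky,AirTemp,Humidity,Wind,Water,Forecast,EnjoySport
# Sunny,Warm,Normal,Strong,Warm,Same,Yes
# Sunny,Warm,High,Strong,Warm,Same,Yes
# Rainy,Cold,High,Strong,Warm,Change,No
# Sunny,Warm,High,Strong,Cool,Change,Yes
#
# For this file the final hypothesis would be ['Sunny', 'Warm', '?', 'Strong', '?', '?'].
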
5.

import numpy as np
import matplotlib.pyplot as plt
from collections import Counter

# Generate random data
data = np.random.rand(100)

# Assign labels to the first 50 points
labels = ["Class1" if x <= 0.5 else "Class2" for x in data[:50]]

# Define Euclidean distance (for 1-D points this is the absolute difference)
def euclidean_distance(x1, x2):
    return abs(x1 - x2)

# k-NN Classifier
def knn_classifier(train_data, train_labels, test_point, k):
    distances = [(euclidean_distance(test_point, train_data[i]), train_labels[i]) for i in range(len(train_data))]
    distances.sort(key=lambda x: x[0])
    k_nearest_neighbors = distances[:k]
    k_nearest_labels = [label for _, label in k_nearest_neighbors]
    return Counter(k_nearest_labels).most_common(1)[0][0]

# Split data into train and test
train_data = data[:50]
train_labels = labels
test_data = data[50:]

# Different k values to test
k_values = [1, 2, 3, 4, 5, 20, 30]

print("--- k-Nearest Neighbors Classification ---")
print("Training dataset: First 50 points labeled based on the rule (x <= 0.5 -> Class1, x > 0.5 -> Class2)")
print("Testing dataset: Remaining 50 points to be classified\n")

# Store results for each k
results = {}

for k in k_values:
    print(f"Results for k = {k}:")
    classified_labels = [knn_classifier(train_data, train_labels, test_point, k) for test_point in test_data]
    results[k] = classified_labels
    for i, label in enumerate(classified_labels, start=51):
        print(f"Point x{i} (value: {test_data[i - 51]:.4f}) is classified as {label}")
    print("\n")

print("Classification complete.\n")

# Visualization for each k
for k in k_values:
    classified_labels = results[k]
    class1_points = [test_data[i] for i in range(len(test_data)) if classified_labels[i] == "Class1"]
    class2_points = [test_data[i] for i in range(len(test_data)) if classified_labels[i] == "Class2"]

    plt.figure(figsize=(10, 6))
    plt.scatter(train_data, [0] * len(train_data), c=["blue" if label == "Class1" else "red" for label in train_labels],
                label="Training Data", marker="o")
    plt.scatter(class1_points, [1] * len(class1_points), c="blue", label="Class1 (Test)", marker="x")
    plt.scatter(class2_points, [1] * len(class2_points), c="red", label="Class2 (Test)", marker="x")
    plt.title(f"k-NN Classification Results for k = {k}")
    plt.xlabel("Data Points")
    plt.ylabel("Classification Level")
    plt.legend()
    plt.grid(True)
    plt.show()

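# Optional check (assumed addition, not in the original listing): since the true
# labelling rule is known, compare each k's predictions against it.
true_test_labels = ["Class1" if x <= 0.5 else "Class2" for x in test_data]
for k in k_values:
    correct = sum(p == t for p, t in zip(results[k], true_test_labels))
    print(f"k = {k}: {correct}/{len(test_data)} test points match the x <= 0.5 rule")
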
6.

import numpy as np
import matplotlib.pyplot as plt

# Gaussian kernel function
def gaussian_kernel(x, xi, tau):
    return np.exp(-np.sum((x - xi) ** 2) / (2 * tau ** 2))

# Locally Weighted Regression function
def locally_weighted_regression(x, X, y, tau):
    m = X.shape[0]
    weights = np.array([gaussian_kernel(x, X[i], tau) for i in range(m)])
    W = np.diag(weights)
    X_transpose_W = X.T @ W
    theta = np.linalg.inv(X_transpose_W @ X) @ X_transpose_W @ y
    return x @ theta

# Generate synthetic training data
np.random.seed(42)
X = np.linspace(0, 2 * np.pi, 100)
y = np.sin(X) + 0.1 * np.random.randn(100)

# Add bias term (intercept)
X_bias = np.c_[np.ones(X.shape), X]

# Test data
x_test = np.linspace(0, 2 * np.pi, 200)
x_test_bias = np.c_[np.ones(x_test.shape), x_test]

# Set kernel bandwidth (tau)
tau = 0.5

# Predict using LWR
y_pred = np.array([locally_weighted_regression(xi, X_bias, y, tau) for xi in x_test_bias])

# Plot results
plt.figure(figsize=(10, 6))
plt.scatter(X, y, color='red', label='Training Data', alpha=0.7)
plt.plot(x_test, y_pred, color='blue', label=f'LWR Fit (tau={tau})', linewidth=2)
plt.xlabel('X', fontsize=12)
plt.ylabel('y', fontsize=12)
plt.title('Locally Weighted Regression', fontsize=14)
plt.legend(fontsize=10)
plt.grid(alpha=0.3)
plt.show()

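# Optional experiment (assumed addition, not in the original listing): tau controls
# how local the fit is; small values follow the noise, large values over-smooth.
plt.figure(figsize=(10, 6))
plt.scatter(X, y, color='red', alpha=0.4, label='Training Data')
for tau_value in [0.1, 0.5, 1.0, 5.0]:
    y_fit = np.array([locally_weighted_regression(xi, X_bias, y, tau_value) for xi in x_test_bias])
    plt.plot(x_test, y_fit, linewidth=2, label=f'tau = {tau_value}')
plt.title('LWR fits for different bandwidths')
plt.legend()
plt.show()
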
7.

# Machine Learning Lab -- BCSL606
# Asst. Prof. Manjunatha S., Govt. Engineering College, Challakere
# Program 7: Linear Regression and Polynomial Regression

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.datasets import fetch_california_housing
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures, StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.metrics import mean_squared_error, r2_score

# Linear Regression using California Housing Dataset
def linear_regression_california():
    housing = fetch_california_housing(as_frame=True)
    X = housing.data[["AveRooms"]]
    y = housing.target

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    model = LinearRegression()
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    plt.scatter(X_test, y_test, color="blue", label="Actual")
    plt.plot(X_test, y_pred, color="red", label="Predicted")
    plt.xlabel("Average number of rooms (AveRooms)")
    plt.ylabel("Median value of homes ($100,000)")
    plt.title("Linear Regression - California Housing Dataset")
    plt.legend()
    plt.grid(True)
    plt.show()

    print("Linear Regression - California Housing Dataset")
    print("Mean Squared Error:", mean_squared_error(y_test, y_pred))
    print("R^2 Score:", r2_score(y_test, y_pred))

# Polynomial Regression using Auto MPG Dataset
def polynomial_regression_auto_mpg():
    url = "https://archive.ics.uci.edu/ml/machine-learning-databases/auto-mpg/auto-mpg.data"
    column_names = ["mpg", "cylinders", "displacement", "horsepower",
                    "weight", "acceleration", "model_year", "origin", "car_name"]
    data = pd.read_csv(url, sep=r'\s+', names=column_names, na_values="?", comment='\t', engine='python')
    data = data.dropna()

    X = data["displacement"].values.reshape(-1, 1)
    y = data["mpg"].values

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    poly_model = make_pipeline(PolynomialFeatures(degree=2), StandardScaler(), LinearRegression())
    poly_model.fit(X_train, y_train)
    y_pred = poly_model.predict(X_test)

    plt.scatter(X_test, y_test, color="blue", label="Actual")
    plt.scatter(X_test, y_pred, color="red", label="Predicted", alpha=0.6)
    plt.xlabel("Displacement")
    plt.ylabel("Miles per gallon (mpg)")
    plt.title("Polynomial Regression - Auto MPG Dataset")
    plt.legend()
    plt.grid(True)
    plt.show()

    print("Polynomial Regression - Auto MPG Dataset")
    print("Mean Squared Error:", mean_squared_error(y_test, y_pred))
    print("R^2 Score:", r2_score(y_test, y_pred))

# Main program
if __name__ == "__main__":
    print("Demonstrating Linear Regression and Polynomial Regression\n")
    linear_regression_california()
    polynomial_regression_auto_mpg()

8.

# Importing necessary libraries
import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from sklearn import tree

# Load dataset
data = load_breast_cancer()
X = data.data
y = data.target

# Split into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train the Decision Tree Classifier
clf = DecisionTreeClassifier(random_state=42)
clf.fit(X_train, y_train)

# Make predictions
y_pred = clf.predict(X_test)

# Evaluate accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f"Model Accuracy: {accuracy * 100:.2f}%")

# Predict class for a new sample (first test sample)
new_sample = np.array([X_test[0]])
prediction = clf.predict(new_sample)
prediction_class = "Benign" if prediction[0] == 1 else "Malignant"
print(f"Predicted Class for the new sample: {prediction_class}")

# Visualize the decision tree
plt.figure(figsize=(12, 8))
tree.plot_tree(clf, filled=True, feature_names=data.feature_names, class_names=data.target_names)
plt.title("Decision Tree - Breast Cancer Dataset")
plt.show()

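# Optional (assumed addition, not in the original listing): the fully grown tree
# above is hard to read; a depth-limited tree gives a much more legible plot.
clf_small = DecisionTreeClassifier(max_depth=3, random_state=42)
clf_small.fit(X_train, y_train)
plt.figure(figsize=(12, 8))
tree.plot_tree(clf_small, filled=True, feature_names=data.feature_names, class_names=data.target_names)
plt.title("Decision Tree (max_depth=3) - Breast Cancer Dataset")
plt.show()
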
9.

import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import fetch_olivetti_faces
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Load the dataset
data = fetch_olivetti_faces(shuffle=True, random_state=42)
X = data.data
y = data.target

# Split into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Train the Gaussian Naive Bayes model
gnb = GaussianNB()
gnb.fit(X_train, y_train)

# Make predictions
y_pred = gnb.predict(X_test)

# Evaluate performance
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy * 100:.2f}%')

print("\nClassification Report:")
print(classification_report(y_test, y_pred, zero_division=1))

print("\nConfusion Matrix:")
print(confusion_matrix(y_test, y_pred))

# Cross-validation accuracy
cross_val_accuracy = cross_val_score(gnb, X, y, cv=5, scoring='accuracy')
print(f'\nCross-validation accuracy: {cross_val_accuracy.mean() * 100:.2f}%')

# Visualize predictions (first 15 test images)
fig, axes = plt.subplots(3, 5, figsize=(12, 8))
for ax, image, label, prediction in zip(axes.ravel(), X_test, y_test, y_pred):
    ax.imshow(image.reshape(64, 64), cmap=plt.cm.gray)
    ax.set_title(f"True: {label}, Pred: {prediction}", fontsize=8)
    ax.axis('off')

plt.tight_layout()
plt.show()

10.

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.datasets import load_breast_cancer
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.metrics import confusion_matrix, classification_report

# Load dataset
data = load_breast_cancer()
X = data.data
y = data.target

# Standardize the features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Apply K-Means clustering
kmeans = KMeans(n_clusters=2, random_state=42)
y_kmeans = kmeans.fit_predict(X_scaled)

# Evaluate clustering
print("Confusion Matrix:")
print(confusion_matrix(y, y_kmeans))

print("\nClassification Report:")
print(classification_report(y, y_kmeans))
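# Note (assumed addition, not in the original listing): k-means cluster IDs are
# arbitrary, so cluster 0 does not necessarily correspond to class 0. If the
# report above shows very low scores, the labels are simply flipped; evaluating
# the flipped assignment as well shows the better of the two alignments.
print("\nClassification Report (flipped cluster labels):")
print(classification_report(y, 1 - y_kmeans))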

# Reduce dimensions for visualization using PCA
pca = PCA(n_components=2)
X_pca = pca.fit_transform(X_scaled)

# Create a DataFrame for plotting
df = pd.DataFrame(X_pca, columns=['PC1', 'PC2'])
df['Cluster'] = y_kmeans
df['True Label'] = y

# Plot K-Means clustering results
plt.figure(figsize=(8, 6))
sns.scatterplot(data=df, x='PC1', y='PC2', hue='Cluster', palette='Set1',
                s=100, edgecolor='black', alpha=0.7)
plt.title('K-Means Clustering of Breast Cancer Dataset')
plt.xlabel('Principal Component 1')
plt.ylabel('Principal Component 2')
plt.legend(title="Cluster")
plt.grid(True)
plt.show()

# Plot actual true labels
plt.figure(figsize=(8, 6))
sns.scatterplot(data=df, x='PC1', y='PC2', hue='True Label',
                palette='coolwarm', s=100, edgecolor='black', alpha=0.7)
plt.title('True Labels of Breast Cancer Dataset')
plt.xlabel('Principal Component 1')
plt.ylabel('Principal Component 2')
plt.legend(title="True Label")
plt.grid(True)
plt.show()

# Plot clustering with centroids
plt.figure(figsize=(8, 6))
sns.scatterplot(data=df, x='PC1', y='PC2', hue='Cluster', palette='Set1',
                s=100, edgecolor='black', alpha=0.7)

# Transform cluster centers (in scaled feature space) to PCA space
centers = pca.transform(kmeans.cluster_centers_)

# Plot cluster centroids
plt.scatter(centers[:, 0], centers[:, 1], s=200, c='red', marker='X', label='Centroids')

plt.title('K-Means Clustering with Centroids')
plt.xlabel('Principal Component 1')
plt.ylabel('Principal Component 2')
plt.legend(title="Cluster")
plt.grid(True)
plt.show()