The document contains multiple Python scripts demonstrating various machine learning techniques, including data visualization, regression, and classification. Key examples include loading datasets, creating histograms and box plots, implementing k-NN classifiers, and performing PCA and locally weighted regression. Additionally, it showcases linear and polynomial regression using California housing and auto MPG datasets, as well as training a decision tree classifier on breast cancer data.

Uploaded by

poorna2130
Copyright
© © All Rights Reserved
We take content rights seriously. If you suspect this is your content, claim it here.
Available Formats
Download as PDF, TXT or read online on Scribd
0% found this document useful (0 votes)
19 views8 pages

V

The document contains multiple Python scripts demonstrating various machine learning techniques, including data visualization, regression, and classification. Key examples include loading datasets, creating histograms and box plots, implementing k-NN classifiers, and performing PCA and locally weighted regression. Additionally, it showcases linear and polynomial regression using California housing and auto MPG datasets, as well as training a decision tree classifier on breast cancer data.

Uploaded by

poorna2130
Copyright
© © All Rights Reserved
We take content rights seriously. If you suspect this is your content, claim it here.
Available Formats
Download as PDF, TXT or read online on Scribd
You are on page 1/ 8

1.

import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.datasets import fetch_california_housing

# Step 1: Load the California Housing dataset
data = fetch_california_housing(as_frame=True)
housing_df = data.frame

# Step 2: Create histograms for numerical features
numerical_features = housing_df.select_dtypes(include=[np.number]).columns

# Plot histograms
plt.figure(figsize=(15, 10))
for i, feature in enumerate(numerical_features):
    plt.subplot(3, 3, i + 1)
    sns.histplot(housing_df[feature], kde=True, bins=30, color='blue')
    plt.title(f'Distribution of {feature}')
plt.tight_layout()
plt.show()

# Step 3: Generate box plots for numerical features
plt.figure(figsize=(15, 10))
for i, feature in enumerate(numerical_features):
    plt.subplot(3, 3, i + 1)
    sns.boxplot(x=housing_df[feature], color='orange')
    plt.title(f'Box Plot of {feature}')
plt.tight_layout()
plt.show()

# Step 4: Identify outliers using the IQR method
print("Outliers Detection:")
outliers_summary = {}
for feature in numerical_features:
    Q1 = housing_df[feature].quantile(0.25)
    Q3 = housing_df[feature].quantile(0.75)
    IQR = Q3 - Q1
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR
    outliers = housing_df[(housing_df[feature] < lower_bound) | (housing_df[feature] > upper_bound)]
    outliers_summary[feature] = len(outliers)
    print(f"{feature}: {len(outliers)} outliers")

# Optional: Print a summary of the dataset
print("\nDataset Summary:")
print(housing_df.describe())
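
A quick sanity check of the 1.5*IQR rule used in Step 4, on toy numbers that are illustrative only (not taken from the housing data):

import pandas as pd

# Toy series: the 1.5*IQR fences flag 50 as the lone outlier
s = pd.Series([1, 2, 2, 3, 3, 3, 4, 4, 50])
q1, q3 = s.quantile(0.25), s.quantile(0.75)   # q1 = 2.0, q3 = 4.0
iqr = q3 - q1                                 # iqr = 2.0 -> fences at -1.0 and 7.0
print(s[(s < q1 - 1.5 * iqr) | (s > q3 + 1.5 * iqr)])
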
2.

import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.datasets import fetch_california_housing

# Step 1: Load the California Housing dataset
california_data = fetch_california_housing(as_frame=True)
data = california_data.frame

# Step 2: Compute the correlation matrix
correlation_matrix = data.corr()

# Step 3: Visualize the correlation matrix using a heatmap
plt.figure(figsize=(10, 8))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', fmt='.2f', linewidths=0.5)
plt.title('Correlation Matrix of California Housing Features')
plt.show()

# Step 4: Create a pair plot to visualize pairwise relationships
sns.pairplot(data, diag_kind='kde', plot_kws={'alpha': 0.5})
plt.suptitle('Pair Plot of California Housing Features', y=1.02)
plt.show()
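
One optional variation, not in the original listing: the full pair plot draws a 9x9 grid over all 20,640 rows and renders slowly, so it can help to restrict it to a few columns of interest (the column names below come from the fetched frame):

# Pair plot on a subset of columns for faster rendering
subset_cols = ['MedInc', 'HouseAge', 'AveRooms', 'MedHouseVal']
sns.pairplot(data[subset_cols], diag_kind='kde', plot_kws={'alpha': 0.5})
plt.show()
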
3.

import numpy as np
import pandas as pd
from sklearn.datasets import load_iris
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt

# Load the Iris dataset
iris = load_iris()
data = iris.data
labels = iris.target
label_names = iris.target_names

# Convert to a DataFrame for better visualization
iris_df = pd.DataFrame(data, columns=iris.feature_names)

# Perform PCA to reduce dimensionality to 2 components
pca = PCA(n_components=2)
data_reduced = pca.fit_transform(data)

# Create a DataFrame for the reduced data
reduced_df = pd.DataFrame(data_reduced, columns=['Principal Component 1', 'Principal Component 2'])
reduced_df['Label'] = labels

# Plot the reduced data
plt.figure(figsize=(8, 6))
colors = ['r', 'g', 'b']
for i, label in enumerate(np.unique(labels)):
    plt.scatter(
        reduced_df[reduced_df['Label'] == label]['Principal Component 1'],
        reduced_df[reduced_df['Label'] == label]['Principal Component 2'],
        label=label_names[label],
        color=colors[i]
    )

plt.title('PCA on Iris Dataset')
plt.xlabel('Principal Component 1')
plt.ylabel('Principal Component 2')
plt.legend()
plt.grid()
plt.show()
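
A useful follow-up check, not part of the original program, is to ask how much of the original variance the two components retain; PCA exposes this directly:

# Fraction of total variance captured by each principal component
print("Explained variance ratio:", pca.explained_variance_ratio_)
print("Total variance retained:", pca.explained_variance_ratio_.sum())

For the Iris data the first two components retain roughly 97-98% of the variance, which is why the 2-D scatter separates the classes so cleanly.
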
4.

import pandas as pd

def find_s_algorithm(file_path):
    data = pd.read_csv(file_path)
    print("Training data:")
    print(data)

    attributes = data.columns[:-1]
    class_label = data.columns[-1]

    # Start with the most specific hypothesis: nothing accepted yet
    hypothesis = None

    # Iterate through each training instance; Find-S uses only positive examples
    for index, row in data.iterrows():
        if row[class_label] == 'Yes':
            if hypothesis is None:
                # The first positive example becomes the initial hypothesis
                hypothesis = list(row[attributes])
            else:
                # Generalize any attribute that conflicts with the hypothesis
                for i, value in enumerate(row[attributes]):
                    if hypothesis[i] != value:
                        hypothesis[i] = '?'

    return hypothesis

# Example usage
file_path = 'training_data.csv'
hypothesis = find_s_algorithm(file_path)
print("\nThe final hypothesis is:", hypothesis)
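
The script expects a CSV whose last column is the class label with 'Yes'/'No' values. A minimal hypothetical training_data.csv (the classic EnjoySport example; the actual file is not included in the document) could look like:

Sky,AirTemp,Humidity,Wind,Water,Forecast,EnjoySport
Sunny,Warm,Normal,Strong,Warm,Same,Yes
Sunny,Warm,High,Strong,Warm,Same,Yes
Rainy,Cold,High,Strong,Warm,Change,No
Sunny,Warm,High,Strong,Cool,Change,Yes

On this input the algorithm returns ['Sunny', 'Warm', '?', 'Strong', '?', '?']: every attribute that disagrees across the positive examples is generalized to '?'.
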
5.

import numpy as np
import matplotlib.pyplot as plt
from collections import Counter

# Generate random data
data = np.random.rand(100)

# Assign labels to the first 50 points
labels = ["Class1" if x <= 0.5 else "Class2" for x in data[:50]]

# Define Euclidean distance (in one dimension this is the absolute difference)
def euclidean_distance(x1, x2):
    return abs(x1 - x2)

# k-NN Classifier
def knn_classifier(train_data, train_labels, test_point, k):
    distances = [(euclidean_distance(test_point, train_data[i]), train_labels[i]) for i in range(len(train_data))]
    distances.sort(key=lambda x: x[0])
    k_nearest_neighbors = distances[:k]
    k_nearest_labels = [label for _, label in k_nearest_neighbors]
    return Counter(k_nearest_labels).most_common(1)[0][0]

# Split data into train and test
train_data = data[:50]
train_labels = labels
test_data = data[50:]

# Different k values to test
k_values = [1, 2, 3, 4, 5, 20, 30]

print("--- k-Nearest Neighbors Classification ---")
print("Training dataset: First 50 points labeled based on the rule (x <= 0.5 -> Class1, x > 0.5 -> Class2)")
print("Testing dataset: Remaining 50 points to be classified\n")

# Store results for each k
results = {}

for k in k_values:
    print(f"Results for k = {k}:")
    classified_labels = [knn_classifier(train_data, train_labels, test_point, k) for test_point in test_data]
    results[k] = classified_labels
    for i, label in enumerate(classified_labels, start=51):
        print(f"Point x{i} (value: {test_data[i - 51]:.4f}) is classified as {label}")
    print("\n")

print("Classification complete.\n")

# Visualization for each k
for k in k_values:
    classified_labels = results[k]
    class1_points = [test_data[i] for i in range(len(test_data)) if classified_labels[i] == "Class1"]
    class2_points = [test_data[i] for i in range(len(test_data)) if classified_labels[i] == "Class2"]

    plt.figure(figsize=(10, 6))
    plt.scatter(train_data, [0] * len(train_data),
                c=["blue" if label == "Class1" else "red" for label in train_labels],
                label="Training Data", marker="o")
    plt.scatter(class1_points, [1] * len(class1_points), c="blue", label="Class1 (Test)", marker="x")
    plt.scatter(class2_points, [1] * len(class2_points), c="red", label="Class2 (Test)", marker="x")
    plt.title(f"k-NN Classification Results for k = {k}")
    plt.xlabel("Data Points")
    plt.ylabel("Classification Level")
    plt.legend()
    plt.grid(True)
    plt.show()
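
Because the test points are drawn from the same generating rule, one optional extension (not in the original program) is to score each k against the rule-derived labels:

# Compare each k's predictions with the labels the generating rule would assign
true_test_labels = ["Class1" if x <= 0.5 else "Class2" for x in test_data]
for k in k_values:
    correct = sum(p == t for p, t in zip(results[k], true_test_labels))
    print(f"k = {k}: accuracy = {correct / len(test_data):.2%}")
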
6.

import numpy as np
import matplotlib.pyplot as plt

# Gaussian kernel function
def gaussian_kernel(x, xi, tau):
    return np.exp(-np.sum((x - xi) ** 2) / (2 * tau ** 2))

# Locally Weighted Regression function
def locally_weighted_regression(x, X, y, tau):
    m = X.shape[0]
    weights = np.array([gaussian_kernel(x, X[i], tau) for i in range(m)])
    W = np.diag(weights)
    X_transpose_W = X.T @ W
    theta = np.linalg.inv(X_transpose_W @ X) @ X_transpose_W @ y
    return x @ theta

# Generate synthetic training data
np.random.seed(42)
X = np.linspace(0, 2 * np.pi, 100)
y = np.sin(X) + 0.1 * np.random.randn(100)

# Add bias term (intercept)
X_bias = np.c_[np.ones(X.shape), X]

# Test data
x_test = np.linspace(0, 2 * np.pi, 200)
x_test_bias = np.c_[np.ones(x_test.shape), x_test]

# Set kernel bandwidth (tau)
tau = 0.5

# Predict using LWR
y_pred = np.array([locally_weighted_regression(xi, X_bias, y, tau) for xi in x_test_bias])

# Plot results
plt.figure(figsize=(10, 6))
plt.scatter(X, y, color='red', label='Training Data', alpha=0.7)
plt.plot(x_test, y_pred, color='blue', label=f'LWR Fit (tau={tau})', linewidth=2)
plt.xlabel('X', fontsize=12)
plt.ylabel('y', fontsize=12)
plt.title('Locally Weighted Regression', fontsize=14)
plt.legend(fontsize=10)
plt.grid(alpha=0.3)
plt.show()
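
For reference, locally_weighted_regression solves the weighted least-squares normal equation independently at every query point x:

    theta(x) = (X^T W X)^(-1) X^T W y,  where W is diagonal with
    W_ii = exp(-(x - x_i)^2 / (2 * tau^2))

so training points near x dominate the fit while distant points are effectively ignored; a smaller tau gives a wigglier, more local curve. If X^T W X becomes near-singular (e.g. with a very small tau), np.linalg.pinv is a safer drop-in for np.linalg.inv.
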
7.

# Machine Learning Lab -- BCSL606
# Asst. Prof. Manjunatha S., Govt. Engineering College, Challakere
# Program 7: Linear Regression and Polynomial Regression

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.datasets import fetch_california_housing
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures, StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.metrics import mean_squared_error, r2_score

# Linear Regression using California Housing Dataset
def linear_regression_california():
    housing = fetch_california_housing(as_frame=True)
    X = housing.data[["AveRooms"]]
    y = housing.target

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    model = LinearRegression()
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    plt.scatter(X_test, y_test, color="blue", label="Actual")
    plt.plot(X_test, y_pred, color="red", label="Predicted")
    plt.xlabel("Average number of rooms (AveRooms)")
    plt.ylabel("Median value of homes ($100,000s)")
    plt.title("Linear Regression - California Housing Dataset")
    plt.legend()
    plt.grid(True)
    plt.show()

    print("Linear Regression - California Housing Dataset")
    print("Mean Squared Error:", mean_squared_error(y_test, y_pred))
    print("R^2 Score:", r2_score(y_test, y_pred))

# Polynomial Regression using Auto MPG Dataset
def polynomial_regression_auto_mpg():
    url = "https://archive.ics.uci.edu/ml/machine-learning-databases/auto-mpg/auto-mpg.data"
    column_names = ["mpg", "cylinders", "displacement", "horsepower",
                    "weight", "acceleration", "model_year", "origin", "car_name"]
    data = pd.read_csv(url, sep=r'\s+', names=column_names, na_values="?", comment='\t', engine='python')
    data = data.dropna()

    X = data["displacement"].values.reshape(-1, 1)
    y = data["mpg"].values

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    poly_model = make_pipeline(PolynomialFeatures(degree=2), StandardScaler(), LinearRegression())
    poly_model.fit(X_train, y_train)
    y_pred = poly_model.predict(X_test)

    plt.scatter(X_test, y_test, color="blue", label="Actual")
    plt.scatter(X_test, y_pred, color="red", label="Predicted", alpha=0.6)
    plt.xlabel("Displacement")
    plt.ylabel("Miles per gallon (mpg)")
    plt.title("Polynomial Regression - Auto MPG Dataset")
    plt.legend()
    plt.grid(True)
    plt.show()

    print("Polynomial Regression - Auto MPG Dataset")
    print("Mean Squared Error:", mean_squared_error(y_test, y_pred))
    print("R^2 Score:", r2_score(y_test, y_pred))

# Main program
if __name__ == "__main__":
    print("Demonstrating Linear Regression and Polynomial Regression\n")
    linear_regression_california()
    polynomial_regression_auto_mpg()
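
A natural experiment, not in the original listing, is to sweep the polynomial degree inside polynomial_regression_auto_mpg and compare test scores; only the PolynomialFeatures argument changes:

# Hypothetical extension (placed inside polynomial_regression_auto_mpg,
# after the train/test split): compare degrees on the same split
for degree in [1, 2, 3]:
    m = make_pipeline(PolynomialFeatures(degree=degree), StandardScaler(), LinearRegression())
    m.fit(X_train, y_train)
    print(f"degree={degree}: R^2 = {r2_score(y_test, m.predict(X_test)):.3f}")
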
8.

# Importing necessary libraries
import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from sklearn import tree

# Load dataset
data = load_breast_cancer()
X = data.data
y = data.target

# Split into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train the Decision Tree Classifier
clf = DecisionTreeClassifier(random_state=42)
clf.fit(X_train, y_train)

# Make predictions
y_pred = clf.predict(X_test)

# Evaluate accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f"Model Accuracy: {accuracy * 100:.2f}%")

# Predict class for a new sample (first test sample)
new_sample = np.array([X_test[0]])
prediction = clf.predict(new_sample)
prediction_class = "Benign" if prediction[0] == 1 else "Malignant"
print(f"Predicted Class for the new sample: {prediction_class}")

# Visualize the decision tree
plt.figure(figsize=(12, 8))
tree.plot_tree(clf, filled=True, feature_names=data.feature_names, class_names=data.target_names)
plt.title("Decision Tree - Breast Cancer Dataset")
plt.show()
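
The fully grown tree above is usually too deep to read when plotted. One option (an addition, not part of the original program) is to cap the depth, trading a little accuracy for an interpretable diagram:

# Shallow tree purely for visualization; accuracy may drop slightly
clf_small = DecisionTreeClassifier(max_depth=3, random_state=42)
clf_small.fit(X_train, y_train)
print(f"Depth-3 accuracy: {accuracy_score(y_test, clf_small.predict(X_test)) * 100:.2f}%")
plt.figure(figsize=(12, 8))
tree.plot_tree(clf_small, filled=True, feature_names=data.feature_names, class_names=data.target_names)
plt.show()
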
9.

import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import fetch_olivetti_faces
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Load the dataset
data = fetch_olivetti_faces(shuffle=True, random_state=42)
X = data.data
y = data.target

# Split into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Train the Gaussian Naive Bayes model
gnb = GaussianNB()
gnb.fit(X_train, y_train)

# Make predictions
y_pred = gnb.predict(X_test)

# Evaluate performance
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy * 100:.2f}%')

print("\nClassification Report:")
print(classification_report(y_test, y_pred, zero_division=1))

print("\nConfusion Matrix:")
print(confusion_matrix(y_test, y_pred))

# Cross-validation accuracy
cross_val_accuracy = cross_val_score(gnb, X, y, cv=5, scoring='accuracy')
print(f'\nCross-validation accuracy: {cross_val_accuracy.mean() * 100:.2f}%')

# Visualize predictions
fig, axes = plt.subplots(3, 5, figsize=(12, 8))
for ax, image, label, prediction in zip(axes.ravel(), X_test, y_test, y_pred):
    ax.imshow(image.reshape(64, 64), cmap=plt.cm.gray)
    ax.set_title(f"True: {label}, Pred: {prediction}", fontsize=8)
    ax.axis('off')

plt.tight_layout()
plt.show()
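
GaussianNB here treats all 4096 raw pixels as conditionally independent features. A common variant, offered as an assumption rather than part of the lab, is to compress the images with PCA first, which often helps Naive Bayes on face data:

from sklearn.pipeline import make_pipeline
from sklearn.decomposition import PCA

# Hypothetical variant: project the 4096 pixels onto 100 components before GNB
pipe = make_pipeline(PCA(n_components=100, random_state=42), GaussianNB())
pipe.fit(X_train, y_train)
print(f"PCA + GaussianNB accuracy: {accuracy_score(y_test, pipe.predict(X_test)) * 100:.2f}%")
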
10.

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.datasets import load_breast_cancer
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.metrics import confusion_matrix, classification_report

# Load dataset
data = load_breast_cancer()
X = data.data
y = data.target

# Standardize the features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Apply K-Means clustering
kmeans = KMeans(n_clusters=2, random_state=42)
y_kmeans = kmeans.fit_predict(X_scaled)

# Evaluate clustering
print("Confusion Matrix:")
print(confusion_matrix(y, y_kmeans))

print("\nClassification Report:")
print(classification_report(y, y_kmeans))

# Reduce dimensions for visualization using PCA
pca = PCA(n_components=2)
X_pca = pca.fit_transform(X_scaled)

# Create a DataFrame for plotting
df = pd.DataFrame(X_pca, columns=['PC1', 'PC2'])
df['Cluster'] = y_kmeans
df['True Label'] = y

# Plot K-Means clustering results
plt.figure(figsize=(8, 6))
sns.scatterplot(data=df, x='PC1', y='PC2', hue='Cluster', palette='Set1',
                s=100, edgecolor='black', alpha=0.7)
plt.title('K-Means Clustering of Breast Cancer Dataset')
plt.xlabel('Principal Component 1')
plt.ylabel('Principal Component 2')
plt.legend(title="Cluster")
plt.grid(True)
plt.show()

# Plot actual true labels
plt.figure(figsize=(8, 6))
sns.scatterplot(data=df, x='PC1', y='PC2', hue='True Label',
                palette='coolwarm', s=100, edgecolor='black', alpha=0.7)
plt.title('True Labels of Breast Cancer Dataset')
plt.xlabel('Principal Component 1')
plt.ylabel('Principal Component 2')
plt.legend(title="True Label")
plt.grid(True)
plt.show()

# Plot clustering with centroids
plt.figure(figsize=(8, 6))
sns.scatterplot(data=df, x='PC1', y='PC2', hue='Cluster', palette='Set1',
                s=100, edgecolor='black', alpha=0.7)

# Transform cluster centers to PCA space
centers = pca.transform(kmeans.cluster_centers_)

# Plot cluster centroids
plt.scatter(centers[:, 0], centers[:, 1], s=200, c='red', marker='X', label='Centroids')

plt.title('K-Means Clustering with Centroids')
plt.xlabel('Principal Component 1')
plt.ylabel('Principal Component 2')
plt.legend(title="Cluster")
plt.grid(True)
plt.show()
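
One caveat on the evaluation above: K-Means cluster ids 0/1 are arbitrary, so confusion_matrix(y, y_kmeans) and the classification report are only meaningful when the cluster numbering happens to line up with the true labels. A small post-hoc alignment (an addition, not in the original program) maps each cluster to its majority true label before scoring:

# Map each cluster id to the majority true label inside that cluster
aligned = np.zeros_like(y_kmeans)
for cluster in np.unique(y_kmeans):
    mask = y_kmeans == cluster
    aligned[mask] = np.bincount(y[mask]).argmax()

print("Aligned Confusion Matrix:")
print(confusion_matrix(y, aligned))
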
