Question: You work at XYZ Company as a Python developer.
The company officials want you to write code for a clustering problem.
Dataset: customers.csv
Tasks to be performed:
1. K-Means Clustering:
   - Load the customer data.
   - Check the number of cells with null values in each column.
   - Create a scatter plot with Age as X and Spending Score as Y.
   - Find the best number of clusters between 1 and 10 (inclusive) using the elbow method.
   - Draw a scatter plot displaying the data points colored by cluster.
Optimal solution (the elbow is located automatically with Yellowbrick's KElbowVisualizer):
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
from yellowbrick.cluster import KElbowVisualizer
# Load customer data
data = pd.read_csv('customers.csv')

# Check for null values: prints the number of null cells in each column
print(data.isnull().sum())

# The two columns used as clustering features (selected once and reused below)
features = data[['Age', 'Spending Score (1-100)']]

# Create a scatter plot with Age as X and Spending Score as Y
plt.scatter(data['Age'], data['Spending Score (1-100)'])
plt.xlabel('Age')
plt.ylabel('Spending Score (1-100)')
plt.show()

# Use the elbow method to determine the number of clusters.
# KElbowVisualizer interprets a 2-tuple as a half-open range (like np.arange),
# so (1, 11) is needed to evaluate k = 1..10 inclusive; (1, 10) stops at k = 9.
# The estimator uses the same settings as the final model so the elbow search
# is reproducible and consistent with the clustering fitted afterwards.
model = KMeans(init='k-means++', max_iter=300, n_init=10, random_state=0)
visualizer = KElbowVisualizer(model, k=(1, 11))
visualizer.fit(features)
visualizer.show()

# elbow_value_ is None when the visualizer cannot detect an elbow; fail with a
# clear message instead of passing None to KMeans as n_clusters.
optimal_k = visualizer.elbow_value_
if optimal_k is None:
    raise ValueError(
        'No elbow was detected for k in 1..10; choose the number of clusters '
        'manually from the elbow plot.'
    )

# Draw a scatter plot displaying data points colored on the basis of clusters
kmeans = KMeans(n_clusters=optimal_k, init='k-means++', max_iter=300, n_init=10,
                random_state=0)
clusters = kmeans.fit_predict(features)
data['Cluster'] = clusters
plt.scatter(data['Age'], data['Spending Score (1-100)'], c=data['Cluster'], cmap='viridis')
plt.xlabel('Age')
plt.ylabel('Spending Score (1-100)')
plt.show()
Sub-optimal solution (the elbow is read manually from the inertia plot):
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
# Load customer data
data = pd.read_csv('customers.csv')

# Check for null values: prints the number of null cells in each column
print(data.isnull().sum())

# The two columns used as clustering features (selected once and reused below)
features = data[['Age', 'Spending Score (1-100)']]

# Create a scatter plot with Age as X and Spending Score as Y
plt.scatter(data['Age'], data['Spending Score (1-100)'])
plt.xlabel('Age')
plt.ylabel('Spending Score (1-100)')
plt.show()

# Elbow method, computed by hand: fit K-Means for every k from 1 to 10
# (inclusive) and record the inertia (within-cluster sum of squared distances).
sum_of_squared_distances = []
K = range(1, 11)
for k in K:
    km = KMeans(n_clusters=k, init='k-means++', max_iter=300, n_init=10, random_state=0)
    km = km.fit(features)
    sum_of_squared_distances.append(km.inertia_)

# Plot inertia against k; the "elbow" of this curve suggests the cluster count
plt.plot(K, sum_of_squared_distances, 'bx-')
plt.xlabel('Number of Clusters')
plt.ylabel('Sum of Squared Distances')
plt.title('Elbow Method For Optimal k')
plt.show()

# Draw a scatter plot displaying data points colored on the basis of clusters
suboptimal_k = 6  # a value selected as an example; adjust after reading the elbow plot
kmeans = KMeans(n_clusters=suboptimal_k, init='k-means++', max_iter=300, n_init=10,
                random_state=0)
clusters = kmeans.fit_predict(features)
data['Cluster'] = clusters
plt.scatter(data['Age'], data['Spending Score (1-100)'], c=data['Cluster'], cmap='viridis')
plt.xlabel('Age')
plt.ylabel('Spending Score (1-100)')
plt.show()