Pattern Recognition & Anomaly Detection
Lab
EXPERIMENT – 12
Density-Based Spatial Clustering of Applications with Noise (DBSCAN)
NAME – ANVITA SINGH
ROLL NO – R2142221063
SAP_ID – 500107712
BATCH – 8
CODE -
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.cluster import DBSCAN
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
# Load dataset
df = pd.read_csv("/content/city_day.csv")
print("Initial Data Sample:")
print(df.head())

# Remove missing values: any row with at least one NaN is dropped so the
# later steps only ever see complete rows.
df.dropna(inplace=True)

# Feature selection: keep only the numeric (float64 / int64) columns.
X = df.select_dtypes(include=['float64', 'int64'])

# Feature scaling: standardize every column before PCA.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Apply PCA for dimensionality reduction. n_components=2 gives a 2-D
# projection that can be plotted directly; use a larger value to keep
# more dimensions.
pca = PCA(n_components=2)
X_pca = pca.fit_transform(X_scaled)

# Explained variance ratio of each retained component.
print("\nExplained Variance Ratio of the PCA Components:")
print(pca.explained_variance_ratio_)
# Train-test split. DBSCAN is unsupervised, so the split is optional; it
# is done here only to have two separate sets to visualize.
X_train, X_test = train_test_split(X_pca, test_size=0.2, random_state=42)

# DBSCAN model. eps and min_samples are starting values and should be
# adjusted to the data.
dbscan = DBSCAN(eps=0.5, min_samples=5)
dbscan.fit(X_train)

# Cluster labels assigned to the training points; -1 represents noise
# (outliers).
y_pred_train = dbscan.labels_

# Show sample predictions
print("\nSample DBSCAN Clusters (Noise = -1, Clusters = 0, 1, 2, ...):")
print(y_pred_train[:10])

# Count of points per label (clusters and noise).
unique, counts = np.unique(y_pred_train, return_counts=True)
# Cast to built-in ints so the dict prints as {-1: 12, 0: 340, ...}
# instead of NumPy scalar reprs (np.int64(...)) on NumPy >= 2.
result_counts = {int(label): int(count) for label, count in zip(unique, counts)}
print("\nCluster Counts:")
print(result_counts)
# Bar chart of how many training points received each DBSCAN label.
count_fig, count_ax = plt.subplots(figsize=(8, 5))
sns.countplot(x=y_pred_train, ax=count_ax)
count_ax.set_title("DBSCAN Clustering Output")
count_ax.set_xlabel("Cluster/Noise")
count_ax.set_ylabel("Count")
plt.show()

# Scatter of the PCA-reduced training points; colour and marker style both
# encode the DBSCAN label so each cluster is easy to tell apart.
pc1_train = X_train[:, 0]
pc2_train = X_train[:, 1]
scatter_fig, scatter_ax = plt.subplots(figsize=(10, 6))
sns.scatterplot(
    x=pc1_train,
    y=pc2_train,
    hue=y_pred_train,
    style=y_pred_train,
    palette="coolwarm",
    legend="full",
    ax=scatter_ax,
)
scatter_ax.set_title("DBSCAN Clustering on PCA-reduced Data (Train Set)")
scatter_ax.set_xlabel("Principal Component 1")
scatter_ax.set_ylabel("Principal Component 2")
plt.show()
# Cluster the test set. DBSCAN has no predict() for unseen points, so the
# test points get their own independent clustering: the label values here
# are NOT the same clusters as the train-set labels. A fresh estimator with
# the same hyperparameters is used so the model fitted on the training set
# is not overwritten.
y_pred_test = DBSCAN(**dbscan.get_params()).fit_predict(X_test)

# Visualizing the PCA-reduced data with anomalies (clusters) highlighted
# for the test set
plt.figure(figsize=(10, 6))
sns.scatterplot(x=X_test[:, 0], y=X_test[:, 1], hue=y_pred_test,
                palette="coolwarm", style=y_pred_test, legend="full")
plt.title("DBSCAN Clustering on PCA-reduced Data (Test Set)")
plt.xlabel("Principal Component 1")
plt.ylabel("Principal Component 2")
plt.show()

print("✅ DBSCAN Clustering Model Trained, Clusters Identified, and Visualized.")
OUTPUT –