1)StandardScaler : It is used for preprocessing the data. It standardizes each
feature using the mean and standard deviation it learns from the data, and exposes
the Mean, Variance and Standard Deviation as attributes. Eg->
import numpy as np
from sklearn.preprocessing import StandardScaler
x = np.random.randint(0, 10, size=(5, 4))   #2D data: 5 samples, 4 features
scaler = StandardScaler()
scaler.fit(x)
scaler.mean_            #mean of each feature
scaler.var_             #variance of each feature
scaler.scale_           #standard deviation of each feature
scaler.n_features_in_   #number of features seen during fit
scaler.transform(x)     #standardizes the data
scaler.inverse_transform(scaler.transform(x))   #recovers the original (inverse-processed) data
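The transformation StandardScaler applies is z = (x - mean) / std for each feature.
A minimal sketch verifying this, reusing x and scaler from the example above:
import numpy as np
manual = (x - scaler.mean_) / scaler.scale_       #apply the formula by hand
print(np.allclose(manual, scaler.transform(x)))   #True, both give the same result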
2)MinMaxScaler : It is also used for preprocessing of data. It computes the min
value (Min), the max value (Max), the scaled value = (X - Min)/(Max - Min), and can
recover the inverse-processed data.
from sklearn.preprocessing import MinMaxScaler
X = [[2], [4], [6], [8], [10], [12]]   #data must be 2D: one feature, six samples
scaler = MinMaxScaler()
X_scaled = scaler.fit_transform(X)
print("Original Data:\n", X)
print("Scaled Data:\n", X_scaled)           #(X-2)/10
print("Data Min:", scaler.data_min_)        #gives 2
print("Data Max:", scaler.data_max_)        #gives 12
print("Data Range:", scaler.data_range_)    #gives 12-2=10
X_original = scaler.inverse_transform(X_scaled)
print("Recovered Original Data:\n", X_original)
3)LabelEncoder : It is used for preprocessing data when the original labels are in
string format and need to be converted to numbers for a Machine Learning algorithm.
from sklearn.preprocessing import LabelEncoder
labels = ['cat', 'dog', 'mouse', 'dog', 'cat']
encoder = LabelEncoder()                             #Create Encoder instance
encoded = encoder.fit_transform(labels)              #Fit and transform the data
print("Original labels: ", labels)
print("Encoded labels: ", list(encoded))             #gives [0,1,2,1,0]
print("Classes found: ", list(encoder.classes_))     #all unique labels in sorted order: ['cat','dog','mouse']
print("Transform 'cat': ", encoder.transform(['cat']))             #encode a new label, gives [0]
print("Inverse transform:", encoder.inverse_transform([0, 1, 2]))  #convert numbers back to original labels, gives ['cat','dog','mouse']
4)OneHotEncoder : It converts a categorical feature into one binary (0/1) column
per category.
from sklearn.preprocessing import OneHotEncoder
import pandas as pd
data = pd.DataFrame({'Color': ['Red', 'Green', 'Blue', 'Green', 'Red']})
X = data[['Color']]
encoder = OneHotEncoder(
    categories='auto',        # Can be 'auto' or a list of categories
    drop=None,                # Use 'first' to drop the first column (avoids the dummy variable trap)
    sparse_output=False,      # Set to False to return a dense NumPy array
    dtype=int,                # Output data type
    handle_unknown='ignore',  # Avoid errors for unseen categories during transform
    max_categories=None,      # Limit on the number of categories (None = no limit)
    min_frequency=None        # Minimum frequency for a category to be kept
)
encoded_array = encoder.fit_transform(X)   # Fit encoder and transform data
encoded_df = pd.DataFrame(encoded_array,
    columns=encoder.get_feature_names_out(['Color']))   # Convert to DataFrame for display
print(encoded_df)
OUTPUT
Color_Blue Color_Green Color_Red
0 0 0 1
1 0 1 0
2 1 0 0
3 0 1 0
4 0 0 1
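Because handle_unknown='ignore' was set above, an unseen category is encoded as an
all-zero row instead of raising an error. A minimal sketch reusing the fitted
encoder from the example ('Purple' is just an illustrative unseen value):
unseen = pd.DataFrame({'Color': ['Purple']})
print(encoder.transform(unseen))   #[[0 0 0]] - no known column matches, so every indicator is 0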
5)accuracy_score: It provides the accuracy of the model
from sklearn.metrics import accuracy_score
y_true = [0, 1, 2, 2, 0] # True labels
y_pred = [0, 0, 2, 2, 0] # Predicted labels by model
accuracy = accuracy_score(y_true, y_pred)
print("Accuracy:", accuracy)
6)precision_score : It is a classification metric used to measure the accuracy of
positive predictions.
from sklearn.metrics import precision_score
y_true = [1, 0, 1, 1, 0, 1, 0, 0]
y_pred = [1, 0, 1, 0, 0, 1, 1, 0]
score = precision_score(y_true, y_pred)
print("Precision:", score)
7)recall_score : The recall function measures the ability of a classifier to find
all the relevant (positive) instances in a dataset.
from sklearn.metrics import recall_score
y_true = [0, 1, 2, 2, 0]   # True labels
y_pred = [0, 0, 2, 2, 0]   # Predicted labels by model
recall = recall_score(y_true, y_pred, average='macro')   # average is required for multiclass labels
print("Recall:", recall)
8)f1_score : Provides the F1 score for the model
from sklearn.metrics import f1_score
y_true = [0, 1, 2, 2, 0]   # True labels
y_pred = [0, 0, 2, 2, 0]   # Predicted labels by model
f1 = f1_score(y_true, y_pred, average='macro')   # average is required for multiclass labels
print("F1 SCORE:", f1)
9)confusion_matrix : Provides confusion matrix for the model
from sklearn.metrics import confusion_matrix
y_true = [0, 1, 2, 2, 0] # True labels
y_pred = [0, 0, 2, 2, 0] # Predicted labels by model
confusion = confusion_matrix(y_true, y_pred)
print("Confusion Matrix :", confusion)
10)classification_report : Provides classification report for the model
from sklearn.metrics import classification_report
y_true = [0, 1, 2, 2, 0] # True labels
y_pred = [0, 0, 2, 2, 0] # Predicted labels by model
classification = classification_report(y_true, y_pred)
print("Classification Report:", classification)
11)mean_squared_error : Measures the mean squared error for the model
from sklearn.metrics import mean_squared_error
y_true = [0, 1, 2, 2, 0]   # True values
y_pred = [0, 0, 2, 2, 0]   # Predicted values by model
mse = mean_squared_error(y_true, y_pred)
print("Mean Squared Error:", mse)
12)r2_score : Measures the R^2 for the model
from sklearn.metrics import r2_score
y_true = [0, 1, 2, 2, 0]   # True values
y_pred = [0, 0, 2, 2, 0]   # Predicted values by model
r2 = r2_score(y_true, y_pred)
print("R2:", r2)
13)load_iris : It loads and returns the famous Iris dataset. The Iris dataset
contains three flower species ('setosa', 'versicolor', 'virginica') with 4 features:
sepal length, sepal width, petal length and petal width.
Each row is a flower and each column is a feature, so the data has 150 rows, 4
feature columns and 3 classes.
feature_names : List of the feature names (sepal length, sepal width, petal length,
petal width).
target_names: List of flower species.
DESCR: Full description of the dataset
from sklearn.datasets import load_iris
import pandas as pd
iris = load_iris() # Load the dataset
X = iris.data # Access data and target
y = iris.target
print("Feature names:", iris.feature_names) # Feature names
print("Target names:", iris.target_names) #Target names
# Convert to DataFrame for easy viewing
df = pd.DataFrame(X, columns=iris.feature_names)
df['species'] = y
df['species'] = df['species'].replace({0: 'setosa', 1: 'versicolor', 2: 'virginica'})
print(df.head())
14) LinearRegression : It is a supervised learning model used for regression, fitting
a linear relationship between the features and a continuous target.
import numpy as np
from sklearn.linear_model import LinearRegression
X = np.array([[1], [2], [3], [4], [5]]) # Hours studied
y = np.array([10, 20, 30, 40, 50]) # Corresponding scores
model = LinearRegression() # Create and train the model
model.fit(X, y)
predicted_score = model.predict([[6]]) # Predict score for 6 hours of study
print("Predicted Score for 6 hours of study:", predicted_score[0])
PARAMETERS
1)fit_intercept: True,False
2)copy_X: True,False
3)n_jobs: None,int
4)positive: True,False
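A minimal sketch showing these parameters set explicitly (positive=True is only an
illustrative choice; it forces the learned coefficients to be non-negative):
from sklearn.linear_model import LinearRegression
model = LinearRegression(fit_intercept=True, copy_X=True, n_jobs=None, positive=True)
model.fit([[1], [2], [3]], [2, 4, 6])    #y = 2x, so the fitted coefficient is ~2
print(model.coef_, model.intercept_)     #coefficient(s) and intercept learned by the model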
15) LogisticRegression : It is a supervised learning model used for classification.
import numpy as np
from sklearn.linear_model import LogisticRegression
X = np.array([3.78, 2.44, 2.09, 0.14, 1.72, 1.65, 4.92, 4.37, 4.96, 4.52, 3.69,
5.88]).reshape(-1, 1)
y = np.array([0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1])
model = LogisticRegression(penalty='l2', C=1.0, solver='lbfgs', max_iter=100,
random_state=None)
model.fit(X, y)
predicted = model.predict(np.array([3.46]).reshape(-1, 1))
print(predicted)
PARAMETERS
1)penalty: Regularization type ('l2','l1','elasticnet','none')
2)C: Inverse of regularization strength (default is 1.0)
3)solver: Algorithm to use in the optimization problem ('lbfgs','saga','liblinear')
4)max_iter: Maximum number of iterations (default is 100)
5)random_state: Seed for reproducibility
6)multi_class: Strategy for multiclass problems ('auto','ovr','multinomial')
7)fit_intercept: True,False
8)n_jobs: None,int
9)class_weight: None,'balanced'
16) DecisionTreeClassifier : A supervised learning model that classifies samples by
learning a tree of feature-threshold splits.
from sklearn.tree import DecisionTreeClassifier
clf = DecisionTreeClassifier(
    criterion='gini',               # Criterion used to decide the splits: 'gini', 'entropy', 'log_loss'
    splitter='best',                # Strategy to split at each node: 'best', 'random'
    max_depth=None,                 # Maximum depth of the tree. Eg- 2, 3, etc.
    min_samples_split=2,            # Min. number of samples required to split an internal node. Eg- 2, 3, etc.
    min_samples_leaf=1,             # Min. number of samples required to be at a leaf node. Eg- 1, 2, etc.
    min_weight_fraction_leaf=0.0,
    max_features=None,              # Number of features to consider when looking for the best split
    random_state=None,              # Controls the randomness of the estimator
    max_leaf_nodes=None,            # Max number of leaf nodes in the tree
    min_impurity_decrease=0.0,
    class_weight=None,              # Weights associated with classes
    ccp_alpha=0.0,                  # Complexity parameter used for Minimal Cost-Complexity Pruning
    monotonic_cst=None
)
EXAMPLE
from sklearn.datasets import load_iris
from sklearn.tree import DecisionTreeClassifier, export_text
iris = load_iris()
X = iris.data
y = iris.target
clf = DecisionTreeClassifier(criterion='entropy', max_depth=3, random_state=42)
clf.fit(X, y)
X_test = [[5.0, 3.6, 1.3, 0.25]]
y_pred = clf.predict(X_test)
print(f"Predicted class: {iris.target_names[y_pred[0]]}")
tree_rules = export_text(clf, feature_names=iris['feature_names'])
print(tree_rules)
17) RandomForestClassifier : An ensemble supervised learning model that builds many
decision trees on bootstrap samples and combines their votes (example below).
from sklearn.ensemble import RandomForestClassifier
clf = RandomForestClassifier(
    n_estimators=100,               # Number of trees in the forest
    criterion='gini',               # Function to measure split quality
    max_depth=None,                 # Maximum depth of each tree
    min_samples_split=2,            # Min. number of samples required to split an internal node
    min_samples_leaf=1,             # Min. number of samples required to be at a leaf node
    min_weight_fraction_leaf=0.0,
    max_features='sqrt',            # Number of features to consider when looking for the best split
    max_leaf_nodes=None,            # Maximum number of leaf nodes
    min_impurity_decrease=0.0,
    bootstrap=True,                 # Whether to use bootstrap samples to build trees
    oob_score=False,                # Whether to use out-of-bag samples to estimate the generalization score
    n_jobs=None,                    # Number of jobs to run in parallel
    random_state=None,              # Controls the randomness of the estimator
    verbose=0,                      # Controls the verbosity when fitting and predicting
    warm_start=False,               # Reuse the solution of the previous call to fit and add more estimators
    class_weight=None,              # Weights associated with classes
    ccp_alpha=0.0,                  # Complexity parameter for Minimal Cost-Complexity Pruning
    max_samples=None                # Max. number of samples to draw to train each base estimator
)
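EXAMPLE (a minimal sketch on the Iris data, in the same style as the other entries;
the parameter values are only illustrative)
from sklearn.datasets import load_iris
from sklearn.ensemble import RandomForestClassifier
iris = load_iris()
X, y = iris.data, iris.target
clf = RandomForestClassifier(n_estimators=100, random_state=42)
clf.fit(X, y)                                   # Train the forest on the full dataset
y_pred = clf.predict([[5.0, 3.6, 1.3, 0.25]])   # Predict the class of one new flower
print("Predicted class:", iris.target_names[y_pred[0]])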
18)SVC : It is used to implement the SVM (Support Vector Machine) supervised model.
from sklearn.svm import SVC
clf = SVC(
    C=1.0,                          # Regularization: trades off a smooth decision boundary against classifying training points correctly. Eg- 1, 2, 3, any positive float
    kernel='rbf',                   # Specifies the kernel type to be used. Eg- 'linear', 'poly', 'rbf', 'sigmoid', 'precomputed'
    degree=3,                       # Degree of the 'poly' kernel function; ignored by other kernels
    gamma='scale',                  # Kernel coefficient
    coef0=0.0,                      # Independent term in the kernel function. Eg- 0, 1, 2
    shrinking=True,                 # Whether to use the shrinking heuristic. Eg- True, False
    probability=False,              # Whether to enable probability estimates. Must be enabled prior to fitting. Eg- True, False
    tol=0.001,                      # Tolerance for the stopping criterion. Eg- 0.001, 0.002, 0.05
    cache_size=200,                 # Size of the kernel cache (in MB)
    class_weight=None,              # Sets the parameter C of class i to class_weight[i]*C
    verbose=False,                  # Enable verbose output. Eg- True, False
    max_iter=-1,                    # Hard limit on iterations within the solver; -1 means no limit
    decision_function_shape='ovr',
    break_ties=False,               # Predict the class with the highest decision function value in case of ties
    random_state=None               # Controls the pseudo-random number generation for shuffling data
)
EXAMPLE
from sklearn.svm import SVC
X = [[0, 0], [1, 1]]
y = [0, 1]
clf = SVC()
clf.fit(X, y)
prediction = clf.predict([[2., 2.]])
print(prediction)
19) KNeighborsClassifier : A supervised learning model that classifies a sample by
the majority class among its nearest neighbors.
from sklearn.neighbors import KNeighborsClassifier
knn = KNeighborsClassifier(
    n_neighbors=5,          # Number of neighbors to use. Eg- 1, 2, 3, any int. Default=5
    weights='uniform',      # Weight function for prediction. Eg- 'uniform', 'distance', callable, None. Default='uniform'
    algorithm='auto',       # Search algorithm for neighbors. Eg- 'auto', 'ball_tree', 'kd_tree', 'brute'
    leaf_size=30,           # Leaf size for tree-based algorithms. Eg- 2, 3, 4. Default=30
    p=2,                    # Power parameter for the Minkowski metric. Eg- 2, 3, any int. Default=2
    metric='minkowski',     # Distance metric. Eg- 'minkowski','cityblock','manhattan','hamming','chebyshev','canberra','jaccard'
    metric_params=None,     # Additional metric parameters. Eg- dict, None
    n_jobs=None             # Number of parallel jobs. Eg- int, None
)
EXAMPLE
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.datasets import load_iris
from sklearn.metrics import accuracy_score
iris = load_iris()
X, y = iris.data, iris.target
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.8,
random_state=1)
knn = KNeighborsClassifier(n_neighbors=3)
knn.fit(X_train, y_train)
y_pred = knn.predict(X_test)
print("Predicted labels:", y_pred)
print("Actual labels: ", y_test)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)
20)GradientBoostingClassifier : An ensemble supervised learning model that builds
trees sequentially, each one correcting the errors of the previous ones.
from sklearn.ensemble import GradientBoostingClassifier
clf = GradientBoostingClassifier(
    loss='log_loss',                # Specifies the loss function to be optimized
    learning_rate=0.1,              # Shrinks the contribution of each tree. Typical values are between 0.01 and 0.2
    n_estimators=100,               # Number of boosting stages (trees) to be built
    subsample=1.0,                  # Fraction of samples used for fitting each base learner
    criterion='friedman_mse',       # Function to measure the quality of a split
    min_samples_split=2,            # Minimum number of samples required to split an internal node
    min_samples_leaf=1,             # Minimum number of samples required to be at a leaf node
    min_weight_fraction_leaf=0.0,
    max_depth=3,                    # Maximum depth of the individual regression estimators
    min_impurity_decrease=0.0,
    init=None,
    random_state=None,              # Controls the randomness of the estimator
    max_features=None,              # Number of features to consider when looking for the best split
    verbose=0,
    max_leaf_nodes=None,
    warm_start=False,
    validation_fraction=0.1,        # Proportion of training data to set aside as a validation set for early stopping
    n_iter_no_change=None,
    tol=0.0001,                     # Tolerance for the early stopping criterion
    ccp_alpha=0.0                   # Complexity parameter used for Minimal Cost-Complexity Pruning
)
EXAMPLE
from sklearn.datasets import make_classification
from sklearn.ensemble import GradientBoostingClassifier
X, y = make_classification(n_samples=1000, n_features=10, n_informative=5,
n_redundant=5, random_state=1)
model = GradientBoostingClassifier()
model.fit(X, y)
row = [[2.57, -0.13, 3.16, -4.36, -1.61, -1.39, -2.49, -1.93, 3.26, 2.06]]
yhat = model.predict(row)
print('Prediction:', yhat[0])
21)KMeans : An unsupervised clustering algorithm that partitions the data into k
clusters around centroids.
from sklearn.cluster import KMeans
kmeans = KMeans(
    n_clusters=8,           # Number of clusters to form and centroids to generate. Eg- 2, 3, 5, 8, 10
    init='k-means++',       # Method for initializing centroids; 'k-means++' is recommended for faster convergence. Eg- 'k-means++', 'random'
    n_init='auto',          # Number of times the algorithm runs with different centroid seeds; the best result is kept. Eg- 'auto', 5, 10
    max_iter=300,           # Maximum number of iterations for a single run. Eg- 200, 250, 300
    tol=0.0001,             # Relative tolerance with regard to inertia to declare convergence. Eg- 1e-5, 1e-4, 1e-3
    verbose=0,              # Verbosity mode. Eg- 0, 1, 2
    random_state=None,      # Controls the randomness of centroid initialization. Eg- None, 25, 30, 42
    copy_x=True,            # Whether to copy the data (True) or overwrite it (False). Eg- True, False
    algorithm='lloyd'       # K-means algorithm to use; 'lloyd' is standard, 'elkan' can be faster for dense data. Eg- 'lloyd', 'elkan'
)
EXAMPLE
from sklearn.cluster import KMeans
import numpy as np
X = np.array([[1, 2], [1, 4], [1, 0], [10, 2], [10, 4], [10, 0]])
kmeans = KMeans(n_clusters=2, random_state=0, n_init='auto')
kmeans.fit(X)
print(kmeans.labels_)
print(kmeans.cluster_centers_)
22)DBSCAN : A density-based clustering algorithm that groups points in dense regions
and marks isolated points as noise (label -1).
from sklearn.cluster import DBSCAN
dbscan = DBSCAN(eps=0.5, min_samples=5, metric='minkowski', metric_params=None,
algorithm='auto', leaf_size=30, p=2, n_jobs=None)
#eps= The maximum distance between two samples for one to be considered as in the neighborhood of the other. Eg- 0.5, 0.7, any float
#min_samples= The minimum number of samples (including the point itself) required in the neighborhood for a point to be considered a core point. Eg- 2, 5, 8, 10
#metric= The distance metric to use for the neighborhood calculation. Eg- 'euclidean' or 'minkowski'
#algorithm= The algorithm to compute nearest neighbors. Options include 'auto', 'ball_tree', 'kd_tree', or 'brute'
#leaf_size= Leaf size passed to BallTree or KDTree
#p= The power parameter for the Minkowski metric
#n_jobs= The number of parallel jobs to run for the neighbors search
#sample_weight= Optional weights for each sample (passed to fit(), not to the constructor)
import numpy as np
from sklearn.cluster import DBSCAN
X = np.array([[1, 2], [2, 2], [2, 3], [8, 7], [8, 8], [25, 80]])
clustering = DBSCAN(eps=3, min_samples=2).fit(X)
print(clustering.labels_)   # a label of -1 marks a noise point (here the outlier [25, 80])
23)PCA : An unsupervised dimensionality-reduction technique that projects the data
onto the directions of maximum variance (the principal components).
from sklearn.decomposition import PCA
pca = PCA(n_components=None, copy=True, whiten=False, svd_solver='auto', tol=0.0,
iterated_power='auto', n_oversamples=10, power_iteration_normalizer='auto',
random_state=None)
n_components= Number of principal components to keep.
copy= If True (default), data is copied; otherwise, it may be overwritten.
whiten= If True, the components vectors are multiplied by the square root of the
number of samples and divided by the singular values to ensure uncorrelated outputs
with unit variance
svd_solver= Specifies the algorithm to use for Singular Value Decomposition.
Options include 'auto', 'full', 'arpack', 'randomized'
tol= Tolerance for singular values computed by svd_solver 'arpack'.
iterated_power= Number of iterations for the power method, used when
svd_solver='randomized'.
n_oversamples= Additional number of random vectors to sample the range of the data
matrix
power_iteration_normalizer= Power iteration normalizer for svd_solver='randomized'
random_state= Controls the randomness of the SVD solver.
from sklearn.decomposition import PCA
import numpy as np
X = np.array([[-1, -1], [-2, -1], [-3, -2], [1, 1], [2, 1], [3, 2]])
pca = PCA(n_components=2)
pca.fit(X)
X_pca = pca.transform(X)
print(X_pca)                            # data expressed in the principal-component axes
print(pca.explained_variance_ratio_)    # fraction of variance captured by each component
24)cross_val_score : It is used to estimate the model's performance using cross
validation.
from sklearn.model_selection import cross_val_score
scores = cross_val_score(estimator, X, y=None, *, groups=None, scoring=None,
cv=None, n_jobs=None, verbose=0, fit_params=None, pre_dispatch='2*n_jobs',
error_score=np.nan)
estimator: The model object
X: Feature data (array-like or sparse matrix).
y: Target variable (array-like), required for supervised learning.
scoring: (Optional) Metric for evaluation (e.g., 'accuracy',
'neg_mean_squared_error'). If None, uses default for estimator.
cv: (Optional) Number of folds (int), or a cross-validation strategy object. If
cv=5, splits data into 5 folds
n_jobs: (Optional) Number of jobs to run in parallel.
Additional parameters allow further customization
EXAMPLE
from sklearn.datasets import load_diabetes
from sklearn.linear_model import Lasso
from sklearn.model_selection import cross_val_score
X, y = load_diabetes(return_X_y=True)
lasso = Lasso()
scores = cross_val_score(lasso, X, y, cv=3)
print(scores)
# Example output: [0.3315 0.0802 0.0353]
25)GridSearchCV : GridSearchCV is a function in scikit-learn used for hyperparameter
tuning: it finds the optimal combination of parameters for a given machine learning
model by evaluating every parameter combination with cross-validation.
from sklearn.model_selection import GridSearchCV
grid = GridSearchCV(
estimator, # The model/classifier to tune
param_grid, # Dictionary or list of dictionaries with parameter names
scoring=None, # (Optional) Metric for evaluation (e.g., 'accuracy','f1')
n_jobs=None, # (Optional) Number of jobs to run in parallel
refit=True, # (Optional) Refit the best estimator with the entire dataset
cv=None, # (Optional) Cross-validation splitting strategy (e.g., 5 for 5-
fold CV)
verbose=0 # (Optional) Controls the verbosity
)
EXAMPLE
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC
param_grid = {
'C': [0.1, 1, 10],
'kernel': ['linear', 'rbf', 'poly'],
'gamma': [0.1, 1, 'scale', 'auto']
}
model = SVC()
grid_search = GridSearchCV(estimator=model, param_grid=param_grid,
scoring='accuracy', cv=5)
grid_search.fit(X_train, y_train)   # X_train, y_train are assumed to be prepared beforehand (e.g. via train_test_split)
print("Best Parameters:", grid_search.best_params_)
print("Best Model:", grid_search.best_estimator_)
26)RandomizedSearchCV : RandomizedSearchCV is a hyperparameter optimization
technique provided by scikit-learn that searches a specified hyperparameter space by
sampling a fixed number of parameter settings.
from sklearn.model_selection import RandomizedSearchCV
random_search = RandomizedSearchCV(
estimator, # The model/estimator
param_distributions, # Dict with hyperparameters and their distributions
n_iter=10, # Number of parameter settings sampled
scoring=None, # Scoring metric (optional)
n_jobs=1, # Number of parallel jobs (optional)
cv=None, # Cross-validation splitting strategy (optional)
verbose=0, # Verbosity level (optional)
random_state=None, # Random seed (optional)
    refit=True           # Refit the best estimator on the whole dataset (optional)
)
random_search.fit(X_train, y_train)   # X_train, y_train are assumed to be prepared beforehand
EXAMPLE
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import uniform
from sklearn.svm import SVC
param_dist = {
'C': uniform(0.1, 10),
'kernel': ['linear', 'rbf', 'poly'],
'gamma': ['scale', 'auto']
}
random_search = RandomizedSearchCV(
estimator=SVC(),
param_distributions=param_dist,
n_iter=20,
cv=5
)
random_search.fit(X_train, y_train)   # X_train, y_train are assumed to be prepared beforehand (e.g. via train_test_split)
print("Best Hyperparameters:", random_search.best_params_)
print("Best Score:", random_search.best_score_)
27)joblib.dump : The joblib.dump function is used to serialize and save Python
objects to disk, especially objects containing large data such as NumPy arrays or
machine learning models.
joblib.dump(value, filename, compress=0, protocol=None)
EXAMPLE
import joblib
joblib.dump(model, 'model.pkl')   # Saving a Python object (e.g., a trained model)
joblib.dump(model, 'model_compressed.pkl.gz', compress=('gzip', 3))   # Saving with compression
28)joblib.load : The joblib.load function reads back an object that was previously
saved with joblib.dump and reconstructs it in memory.
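EXAMPLE (a minimal sketch, assuming 'model.pkl' was saved by the joblib.dump example
above)
import joblib
model = joblib.load('model.pkl')   # Reconstruct the saved object from disk
# The loaded model can then be used exactly like the original, e.g. model.predict(...)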