import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.feature_selection import SelectKBest, f_regression
# Load the dataset
df = pd.read_csv('auto-mpg.csv')
# Display the first few rows of the dataset
print("Dataset Preview:")
print(df.head())
# Check for null values
print("\nNull Values:")
print(df.isnull().sum())
# Basic information about the dataset
print("\nDataset Info:")
print(df.info())
# Handle missing values. In common copies of the UCI auto-mpg CSV, missing
# horsepower is encoded as '?' rather than NaN (an assumption about this
# particular file; skip the coercion if your copy is already numeric).
df['horsepower'] = pd.to_numeric(df['horsepower'], errors='coerce')
df = df.dropna()
# Remove non-numeric columns like 'car name' (or any other categorical columns)
df = df.drop(columns=['car name'])
# If there are any other categorical columns, encode them (e.g., using one-hot encoding)
# df = pd.get_dummies(df, drop_first=True)  # Uncomment if you have categorical features
# Splitting into features (X) and target (y)
X = df.drop(columns=['mpg']) # Assuming 'mpg' is the target variable
y = df['mpg']
# Feature selection using SelectKBest
selector = SelectKBest(score_func=f_regression, k='all')
X_new = selector.fit_transform(X, y)
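# Note: with k='all', SelectKBest removes nothing, so X_new is just X as a
# NumPy array; fitting is still useful because it populates selector.scores_,
# which we use below to rank the features.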
# Display feature scores
feature_scores = pd.DataFrame({'Feature': X.columns, 'Score': selector.scores_})
print("\nFeature Scores:")
print(feature_scores)
# Visualizing feature scores
plt.figure(figsize=(10, 6))
sns.barplot(x='Feature', y='Score', data=feature_scores)
plt.title('Feature Selection Scores')
plt.xticks(rotation=45)
plt.tight_layout()  # keep the rotated tick labels from being clipped
plt.show()
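# Optional: the bars above appear in column order; sorting by score first can
# make the chart easier to read, e.g.
# sns.barplot(x='Feature', y='Score',
#             data=feature_scores.sort_values('Score', ascending=False))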
# Selecting top 3 features (example)
top_features = feature_scores.nlargest(3, 'Score')['Feature'].tolist()
print("\nTop 3 Relevant Features:")
print(top_features)
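# --- Optional modeling step (a minimal sketch, not part of the original script) ---
# train_test_split and RandomForestRegressor are imported above but never used;
# one plausible follow-up is to train a model on the top-ranked features. The
# values below (test_size=0.2, n_estimators=100) are illustrative assumptions,
# not tuned hyperparameters.
X_top = df[top_features]
X_train, X_test, y_train, y_test = train_test_split(
    X_top, y, test_size=0.2, random_state=42)
model = RandomForestRegressor(n_estimators=100, random_state=42)
model.fit(X_train, y_train)
print(f"\nRandom Forest R^2 on held-out test set: {model.score(X_test, y_test):.3f}")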