3) Product review dataset
In [ ]: import pandas as pd
import nltk
import re
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import (accuracy_score, confusion_matrix, classification_report,
                             precision_score, recall_score, f1_score)
import matplotlib.pyplot as plt
import seaborn as sns
In [ ]: nltk.download('punkt')
nltk.download('stopwords')
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data] Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data] Package stopwords is already up-to-date!
Out[ ]: True
Load the dataset
In [ ]: with open('/content/dataset.txt', 'r', encoding='utf-8') as file:
    lines = file.readlines()
In [ ]: # Initialize lists to store labels and reviews
labels = []
reviews = []
In [ ]: # Process each line in the dataset
for line in lines:
    # Split the line by '__label__'
    parts = line.split('__label__')
    # Check if there are two parts
    if len(parts) == 2:
        # Extract label and review
        label = '__label__' + parts[1].strip().split()[0]   # Extracting the label
        review = ' '.join(parts[1].strip().split()[1:])      # Extracting the review
        labels.append(label)
        reviews.append(review)
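Before building the DataFrame, a quick sanity check that parsing produced matching label/review pairs; the snippet below assumes the fastText-style line format '__label__<n> <review text>' handled above.
In [ ]: # Sanity check: the counts should match and the first pair should look sensible
print(len(labels), len(reviews))
print(labels[0], '->', reviews[0][:80])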
Creating a dataframe
In [ ]: # Create a DataFrame
df = pd.DataFrame({'label': labels, 'review': reviews})
# Map labels to sentiments (in this dataset __label__1 = negative, __label__2 = positive)
sentiment_map = {
    '__label__1': 'negative',
    '__label__2': 'positive'
}
df['sentiment'] = df['label'].map(sentiment_map)
# Drop the 'label' column
df.drop(columns=['label'], inplace=True)
# Display the DataFrame
print(df.head())
print(df.tail())
review sentiment
0 Great CD: My lovely Pat has one of the GREAT v... positive
1 One of the best game music soundtracks - for a... positive
2 Batteries died within a year ...: I bought thi... negative
3 works fine, but Maha Energy is better: Check o... positive
4 Great for the non-audiophile: Reviewed quite a... positive
review sentiment
399995 Unbelievable- In a Bad Way: We bought this Tho... negative
399996 Almost Great, Until it Broke...: My son reciev... negative
399997 Disappointed !!!: I bought this toy for my son... negative
399998 Classic Jessica Mitford: This is a compilation... positive
399999 Comedy Scene, and Not Heard: This DVD will be ... negative
Preprocessing
In [ ]: def preprocess_text(text):
    text = text.lower()                            # Convert to lowercase
    tokenizer = nltk.RegexpTokenizer(r'\w+')
    tokens = tokenizer.tokenize(text)              # Tokenize on word characters
    stop_words = set(stopwords.words('english'))
    tokens = [word for word in tokens if word not in stop_words]   # Remove stopwords
    stemmer = PorterStemmer()
    tokens = [stemmer.stem(word) for word in tokens]               # Stemming
    return ' '.join(tokens)
In [ ]: df['review'] = df['review'].apply(preprocess_text)
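To see what the cleaning produces, here is a check on a made-up sentence; the exact tokens depend on the NLTK stopword list and the Porter stemmer.
In [ ]: # Hypothetical example: lowercased, punctuation and stopwords dropped, remaining words stemmed
preprocess_text("The batteries died after a year, which was very disappointing!")
# Expected to look roughly like: 'batteri die year disappoint'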
Sentiment Distribution
In [ ]: df['sentiment'].value_counts().plot(kind='bar')
plt.title('Product Review Data Sentiment Distribution')
plt.xlabel('Sentiment')
plt.ylabel('Count')
plt.show()
Vectorization using TF-IDF
In [ ]: tfidf = TfidfVectorizer(max_features=1500, min_df=5, max_df=0.7)
X = tfidf.fit_transform(df['review']).toarray()
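One caveat: .toarray() materialises a dense 400,000 x 1,500 float matrix, which costs several gigabytes of RAM. Both LogisticRegression and MultinomialNB accept scipy sparse input, so a leaner variant (a sketch, not a required change) keeps the TF-IDF matrix sparse.
In [ ]: # Optional alternative: skip .toarray() and keep the scipy sparse matrix
X = tfidf.fit_transform(df['review'])
print(type(X), X.shape)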
Split the data into training and testing sets
In [ ]: y = df['sentiment'].map({'positive': 1, 'negative': 0}).values
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)
In [ ]: y
Out[ ]: array([1, 1, 0, ..., 0, 1, 0])
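The two classes are roughly balanced here, so a plain random split is fine; if they were not, passing stratify=y would keep the class ratio identical in the train and test sets. A sketch of that variation:
In [ ]: # Optional: stratified split preserves the positive/negative ratio in both sets
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=0, stratify=y)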
Logistic Regression classifier
In [ ]: lr_classifier = LogisticRegression()
lr_classifier.fit(X_train, y_train)
lr_pred = lr_classifier.predict(X_test)
lr_accuracy = accuracy_score(y_test, lr_pred)
print("Logistic Regression Accuracy:", lr_accuracy)
Logistic Regression Accuracy: 0.86795
Naive Bayes classifier
In [ ]: nb_classifier = MultinomialNB()
nb_classifier.fit(X_train, y_train)
nb_pred = nb_classifier.predict(X_test)
nb_accuracy = accuracy_score(y_test, nb_pred)
print("Naive Bayes Accuracy:", nb_accuracy)
Naive Bayes Accuracy: 0.827925
In [ ]: def plot_confusion_matrix(y_true, y_pred, model_name):
    cm = confusion_matrix(y_true, y_pred)
    plt.figure(figsize=(8, 6))
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
                xticklabels=['Negative', 'Positive'],
                yticklabels=['Negative', 'Positive'])
    plt.title(f'{model_name} Confusion Matrix')
    plt.xlabel('Predicted labels')
    plt.ylabel('True labels')
    plt.show()
Confusion matrix for Logistic Regression
In [ ]: plot_confusion_matrix(y_test, lr_pred, "Logistic Regression")
Confusion matrix for Naive Bayes
In [ ]: plot_confusion_matrix(y_test, nb_pred, "Naive Bayes")
Accuracy, Precision, Recall, and F1 score for Logistic Regression and Naive Bayes
In [ ]: def print_evaluation_metrics(y_true, y_pred, model_name):
    accuracy = accuracy_score(y_true, y_pred)
    precision = precision_score(y_true, y_pred, average='weighted')
    recall = recall_score(y_true, y_pred, average='weighted')
    f1 = f1_score(y_true, y_pred, average='weighted')
    print(f"----------- {model_name} Evaluation Metrics -----------")
    print(f"Accuracy: {accuracy:.4f}")
    print(f"Precision: {precision:.4f}")
    print(f"Recall: {recall:.4f}")
    print(f"F1 Score: {f1:.4f}")
In [ ]: print_evaluation_metrics(y_test, lr_pred, "Logistic Regression")
print_evaluation_metrics(y_test, nb_pred, "Naive Bayes")
----------- Logistic Regression Evaluation Metrics -----------
Accuracy: 0.8679
Precision: 0.8680
Recall: 0.8679
F1 Score: 0.8679
----------- Naive Bayes Evaluation Metrics -----------
Accuracy: 0.8279
Precision: 0.8280
Recall: 0.8279
F1 Score: 0.8279
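classification_report was imported above but not used; it prints a per-class breakdown (precision, recall, and F1 for each sentiment) instead of the weighted averages, for example:
In [ ]: # Per-class metrics for both models (class 0 = negative, class 1 = positive)
print(classification_report(y_test, lr_pred, target_names=['negative', 'positive']))
print(classification_report(y_test, nb_pred, target_names=['negative', 'positive']))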