Step 1: Finding the data set
This dataset consists of Amazon product reviews from the UK marketplace, stored as a gzipped TSV file in which each row is one review.
In [1]:
import gzip
path = "amazon_reviews_multilingual_UK_v1_00.tsv.gz"
# open the compressed file in text mode so each line is decoded as a string
f = gzip.open(path, 'rt', encoding="utf8")
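For reference, the same file could be loaded in one step with pandas, which decompresses gzip transparently (a sketch, assuming pandas is installed; the rest of the notebook keeps the line-by-line approach):
In [ ]:
import csv
import pandas as pd
# pandas infers gzip compression from the .gz extension;
# quoting=csv.QUOTE_NONE avoids choking on stray quote characters in review text
df = pd.read_csv(path, sep='\t', quoting=csv.QUOTE_NONE)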
Step 2: Exploring the dataset
The dataset consists of multiple fields per entry, such as marketplace, customer ID, and so on. Each field captures one characteristic of the review or the product it describes.
In [2]:
header = f.readline()
header = header.strip().split('\t')
print(header)
['marketplace', 'customer_id', 'review_id', 'product_id', 'product_parent', 'product_title', 'product_category', 'star_rating', 'helpful_votes', 'total_votes', 'vine', 'verified_purchase', 'review_headline', 'review_body', 'review_date']
Step 3: Cleaning the dataset
Here each row is parsed into a dictionary: the numeric fields are typecast to integers, and the Y/N flags are converted to the booleans True and False.
In [3]:
dataset = []
In [4]:
for line in f:
    fields = line.strip().split('\t')
    d = dict(zip(header, fields))
    d['star_rating'] = int(d['star_rating'])
    d['helpful_votes'] = int(d['helpful_votes'])
    d['total_votes'] = int(d['total_votes'])
    for field in ['verified_purchase', 'vine']:
        if d[field] == 'Y':
            d[field] = True
        else:
            d[field] = False
    dataset.append(d)
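One caveat: any row with a missing column or a non-numeric count would crash this loop with a KeyError or ValueError. A more defensive variant of the same loop (a sketch) skips malformed rows instead:
In [ ]:
# defensive version of the cleaning loop: skip rows that fail to parse
for line in f:
    fields = line.strip().split('\t')
    if len(fields) != len(header):
        continue  # wrong number of columns
    d = dict(zip(header, fields))
    try:
        for col in ['star_rating', 'helpful_votes', 'total_votes']:
            d[col] = int(d[col])
    except ValueError:
        continue  # non-numeric value where a count was expected
    for field in ['verified_purchase', 'vine']:
        d[field] = (d[field] == 'Y')  # compact Y/N-to-boolean conversion
    dataset.append(d)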
In [5]:
dataset[20]
Out[5]:
{'marketplace': 'UK',
'customer_id': '20222',
'review_id': 'R3I6A1LWUUVBRE',
'product_id': 'B0002CVQCW',
'product_parent': '281008695',
'product_title': "Les Miserables 10th Anniversary Concert At The Royal Albe
rt Hall (2 Disc Collector's Edition) [DVD]",
'product_category': 'Video DVD',
'star_rating': 5,
'helpful_votes': 0,
'total_votes': 0,
'vine': False,
'verified_purchase': True,
'review_headline': 'some of the best voices in the world',
'review_body': 'I liked it so much I bought it twice just so that I could share it with a friend. Excellant',
'review_date': '2013-02-26'}
Step 4: Dividing the data set
Here the dataset is divided into two parts: a training set containing 80 percent of the data and a test set containing the remaining 20 percent.
In [6]:
import random
random.shuffle(dataset)
N = len(dataset)
trainingSet = dataset[:4*N//5]
testingSet = dataset[4*N//5:]
print("Training Set: ",len(trainingSet), "\nTest Set: ",len(testingSet), "\nTotal no.of row
Training Set: 1365995
Test Set: 341499
Total no.of rows 1707494
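The same 80/20 split could also be produced with scikit-learn's train_test_split, which shuffles internally and takes a seed for reproducibility (an equivalent sketch):
In [ ]:
from sklearn.model_selection import train_test_split
# test_size=0.2 reproduces the 80/20 split; random_state fixes the shuffle
trainingSet, testingSet = train_test_split(dataset, test_size=0.2, random_state=42)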
Step 5: Performing basic operations, refining, and
evaluating the model
In [7]:
# Defining the feature function: a bias term, the star rating, and the
# vocabulary size len(wordCount), which is the same number for every review
from collections import defaultdict
from nltk.stem.porter import PorterStemmer
import string

wordCount = defaultdict(int)
stemmer = PorterStemmer()  # use stemmer.stem(word)
for d in trainingSet:
    # lowercase the review and strip punctuation before counting words
    text = ''.join([x for x in d['review_body'].lower() if not x in string.punctuation])
    for w in text.split():
        w = stemmer.stem(w)  # with stemming
        wordCount[w] += 1

def feature(dat):
    feat = [1, dat['star_rating'], len(wordCount)]
    return feat
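Note that len(wordCount) is the size of the stemmed vocabulary, so it is identical for every review; the only per-review signal the model receives is the star rating. If the length of each individual review was intended instead, a hypothetical variant of the feature function would look like this (not what produced the results below):
In [ ]:
# hypothetical variant: use the word count of the individual review
def feature_length(dat):
    return [1, dat['star_rating'], len(dat['review_body'].split())]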
Fitting the model
The model is fit by creating a feature matrix, creating a label vector, defining a logistic regression model, and fitting it to the scaled training data.
In [8]:
from sklearn import preprocessing
from sklearn import linear_model
X_train = [feature(d) for d in trainingSet]
y_train = [d['verified_purchase'] for d in trainingSet]
X_test = [feature(d) for d in testingSet]
y_test = [d['verified_purchase'] for d in testingSet]
scaler = preprocessing.StandardScaler().fit(X_train)  # standardise features: zero mean, unit variance
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)
# print("Label: ", y[:100], "\nFeatures:", X[:10])
model = linear_model.LogisticRegression()
model.fit(X_train_scaled, y_train)
Out[8]:
LogisticRegression()
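Because the features were standardised, the fitted weights are directly comparable; they can be inspected to see which feature the model actually relies on (a quick sketch):
In [ ]:
# one weight per feature column: [bias term, star_rating, len(wordCount)]
print("intercept:", model.intercept_)
print("coefficients:", model.coef_)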
Calculating the accuracy of the model
In [9]:
from sklearn.metrics import confusion_matrix
predictions_train = model.predict(X_train_scaled)
predictions_test = model.predict(X_test_scaled)
correctPredictions_train = predictions_train == y_train
correctPredictions_test = predictions_test == y_test
accuracy_train = sum(correctPredictions_train) / len(correctPredictions_train)*100
accuracy_test = sum(correctPredictions_test) / len(correctPredictions_test)*100
print("Training accuracy: ",round(accuracy_train,2),"%","\nTest accuracy: ",round(accuracy_
print("Confusion matrix: \n",confusion_matrix(y_test, predictions_test))
Training accuracy: 76.23 %
Test accuracy: 76.07 %
Confusion matrix:
[[ 0 81733]
[ 0 259766]]
Finding the error rate
The confusion matrix above shows that the classifier never predicts False: every test review is labelled as a verified purchase, so the 76% accuracy simply matches the proportion of verified purchases in the data. The balanced error rate (BER) computed below makes this degenerate behaviour explicit.
In [10]:
TP_train = sum([(p and l) for (p, l) in zip(predictions_train, y_train)])
FP_train = sum([(p and not l) for (p, l) in zip(predictions_train, y_train)])
TN_train = sum([(not p and not l) for (p, l) in zip(predictions_train, y_train)])
FN_train = sum([(not p and l) for (p, l) in zip(predictions_train, y_train)])
TF_accuracy = (TP_train + TN_train) / (TP_train + FP_train + TN_train + FN_train)
# BER is the average of the false positive rate and the false negative rate
BER = 1 - 1/2 * (TP_train / (TP_train + FN_train) + TN_train / (TN_train + FP_train))
print(f'TP_train = {TP_train}')
print(f'FP_train = {FP_train}')
print(f'TN_train = {TN_train}')
print(f'FN_train = {FN_train}')
print(f'TF_Accuracy: {round(TF_accuracy*100,2)}%')
print(f'BER_train = {BER}')
TP_train = 1041287
FP_train = 324708
TN_train = 0
FN_train = 0
TF_Accuracy: 76.23%
BER_train = 0.5
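A BER of 0.5 confirms the model does no better than always guessing the majority class: it recovers 100% of the positives and 0% of the negatives. The same figure can be cross-checked with scikit-learn, since BER is one minus the balanced accuracy (a sketch):
In [ ]:
from sklearn.metrics import balanced_accuracy_score
# BER = 1 - balanced accuracy; should print 0.5 for this model
print(1 - balanced_accuracy_score(y_train, predictions_train))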