Description

Write a classification script named ‘classifier.py’ that trains on reviews_train.txt and achieves the maximum possible accuracy on reviews_test.txt.

You can use any type of classifier that you want.

DT Notepad

"""
A simple script that demonstrates how to classify textual data with sklearn.
"""

from sklearn.feature_extraction.text import CountVectorizer

from sklearn.metrics import accuracy_score

from sklearn.tree import DecisionTreeClassifier

#read the reviews and their polarities from a given file

def loadData(fname):
    """Read reviews and their integer labels from a tab-separated file.

    Each non-blank line is expected to hold the review text, a TAB
    character, and then an integer label.

    Args:
        fname: Path of the file to read.

    Returns:
        A ``(reviews, labels)`` tuple, where ``reviews`` is a list of
        lower-cased review strings and ``labels`` is the matching list
        of ints, in file order.

    Raises:
        ValueError: If a non-blank line has no TAB separator or its label
            cannot be parsed as an integer.
    """
    reviews = []
    labels = []
    # The context manager closes the file even when a line fails to parse.
    with open(fname) as f:
        for line in f:
            line = line.strip()
            if not line:
                # Tolerate blank lines (e.g. an empty line at end of file).
                continue
            # Split on the LAST tab so that a review which itself contains
            # a tab still parses; the label is always the final field.
            review, rating = line.rsplit('\t', 1)
            reviews.append(review.lower())
            labels.append(int(rating))
    return reviews, labels

# Load the training and testing reviews together with their labels.
rev_train, labels_train = loadData('reviews_train.txt')
rev_test, labels_test = loadData('reviews_test.txt')

# Build the vocabulary from the training reviews only, so that nothing
# from the test set leaks into the feature space.
counter = CountVectorizer()
counter.fit(rev_train)

# Count how often each vocabulary term appears in a review and turn every
# review into a count vector.
counts_train = counter.transform(rev_train)  # training data
counts_test = counter.transform(rev_test)  # testing data

# Train a single decision tree on the training count vectors.
clf = DecisionTreeClassifier()
clf.fit(counts_train, labels_train)

# Predict a label for every test review.
pred = clf.predict(counts_test)

# Print the accuracy; accuracy_score expects (y_true, y_pred).
print(accuracy_score(labels_test, pred))