Skip to content
This repository was archived by the owner on May 10, 2023. It is now read-only.

Commit 91f5288

Browse files
committed
NLP using random forest classification
1 parent b484e19 commit 91f5288

File tree

1 file changed

+76
-0
lines changed

1 file changed

+76
-0
lines changed
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,76 @@
1+
# -*- coding: utf-8 -*-
2+
"""
3+
Created on Tue Sep 18 17:56:22 2018
4+
5+
@author: Mohammad Doosti Lakhani
6+
"""
7+
8+
# -*- coding: utf-8 -*-
9+
"""
10+
Created on Mon Sep 17 21:44:06 2018
11+
12+
@author: Mohammad Doosti Lakhani
13+
"""
14+
15+
# Importing the libraries
16+
import pandas as pd
17+
18+
# Load the tab-separated reviews file. quoting=3 is csv.QUOTE_NONE, so quote
# characters inside a review are read as ordinary text.
dataset = pd.read_csv('Restaurant_Reviews.tsv', sep='\t', quoting=3)
# Column 0 feeds the text-cleaning step below, column 1 is the classifier target.
x, y = (dataset.iloc[:, column].values for column in (0, 1))
22+
23+
# Cleaning the Text
24+
import re
25+
from nltk.corpus import stopwords # removing useless words
26+
from nltk.stem.porter import PorterStemmer # getting root of words
27+
stopwords_list = stopwords.words('english')
ps = PorterStemmer()

# Stop words have to be dropped BEFORE stemming: the Porter stemmer rewrites
# many of them ('this' -> 'thi', 'was' -> 'wa'), so a filter applied to the
# stemmed text (e.g. CountVectorizer's stop_words) no longer recognises them.
stopwords_set = set(stopwords_list)    # set gives O(1) membership tests
non_letters = re.compile('[^a-zA-Z]')  # compiled once, reused for every review

# corpus[i] is the cleaned form of x[i]: letters only, lowercased,
# stop words removed, remaining words stemmed and re-joined with spaces.
corpus = []
for review in x:
    words = non_letters.sub(' ', review).lower().split()
    stems = [ps.stem(word) for word in words if word not in stopwords_set]
    corpus.append(' '.join(stems))
37+
38+
# Creating the Bag of Words model
from sklearn.feature_extraction.text import CountVectorizer  # text -> sparse matrix of token counts
cv = CountVectorizer(lowercase=True, stop_words=stopwords_list)  # lowercase and ignore stop words
x_sparse = cv.fit_transform(corpus)  # bag-of-words sparse matrix, one row per review
# toarray() returns a plain numpy ndarray. todense() would return np.matrix,
# which recent scikit-learn estimators reject with a TypeError when fitting.
x = x_sparse.toarray()
43+
44+
# Split features and labels into train (85%) and test (remaining 15%) parts;
# the fixed random_state makes the split reproducible.
from sklearn.model_selection import train_test_split
split_parts = train_test_split(x, y, train_size=0.85, random_state=8)
x_train, x_test, y_train, y_test = split_parts
47+
48+
# Fitting a Random Forest classifier (12 trees, entropy split criterion) to the train set
from sklearn.ensemble import RandomForestClassifier
forest_params = {'n_estimators': 12, 'criterion': 'entropy', 'random_state': 0}
# fit() returns the fitted estimator itself, so construction and fitting can be chained.
classifier = RandomForestClassifier(**forest_params).fit(x_train, y_train)
52+
53+
# Predict labels for the train set first, then for the test set
y_train_pred, y_test_pred = (
    classifier.predict(features) for features in (x_train, x_test)
)
58+
59+
# Confusion matrices (true labels vs. predicted labels) for the train and test sets
from sklearn.metrics import confusion_matrix
cm_train, cm_test = (
    confusion_matrix(actual, predicted)
    for actual, predicted in ((y_train, y_train_pred), (y_test, y_test_pred))
)
63+
64+
import os
import sys

# Directory holding the shared `accuracy` module. The path is relative, so
# os.path.abspath resolves it against the current working directory.
scriptpath = "../../Tools"
tools_dir = os.path.abspath(scriptpath)
# Extend the module search path so the import below can find accuracy.py
sys.path.append(tools_dir)
import accuracy as ac
71+
72+
# Summarise each confusion matrix and print the result.
# NOTE(review): accuracy_on_cm presumably returns (correct count, incorrect
# count, accuracy as a fraction) - confirm against Tools/accuracy.py.
train_message = 'Train status = #{} True, #{} False, %{} Accuracy'
t_train, f_train, acc_train = ac.accuracy_on_cm(cm_train)
print(train_message.format(t_train, f_train, acc_train * 100))

test_message = 'Test status = #{} True, #{} False, %{} Accuracy'
t_test, f_test, acc_test = ac.accuracy_on_cm(cm_test)
print(test_message.format(t_test, f_test, acc_test * 100))

0 commit comments

Comments
 (0)