# Identify the IMDB Movies review recieved as either **positive or negative**


In [247]:
#importing the libraries
import numpy as np # Numerical Computations
import pandas as pd # Data Manipulations

In [248]:
# reading the imdb movies review store into df
df=pd.read_csv('/content/IMDB_Dataset.csv',)

In [249]:
#dataset taken and uploaded
#source https://www.kaggle.com/datasets/lakshmi25npathi/imdb-dataset-of-50k-movie-reviews/data

In [250]:
#top 5 rows
df.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


*there are 2 columns with review and sentiment as names

In [251]:
# shape of the dataset
df.shape

(50000, 2)

*50000 rows and 2 columns

In [252]:
df = df[0:3000]  # with 50000 records we are having resource limitations hence taken only 3000 rows
df.shape

(3000, 2)

*3000 rows  2 columns  

In [253]:
df.columns

Index(['review', 'sentiment'], dtype='object')

In [254]:
df.review[1]  # here we can see our data is not processed ex. html tags are there, special character, case of words not uniform


'A wonderful little production. <br /><br />The filming technique is very unassuming- very old-time-BBC fashion and gives a comforting, and sometimes discomforting, sense of realism to the entire piece. <br /><br />The actors are extremely well chosen- Michael Sheen not only "has got all the polari" but he has all the voices down pat too! You can truly see the seamless editing guided by the references to Williams\' diary entries, not only is it well worth the watching but it is a terrificly written and performed piece. A masterful production about one of the great master\'s of comedy and his life. <br /><br />The realism really comes home with the little things: the fantasy of the guard which, rather than use the traditional \'dream\' techniques remains solid then disappears. It plays on our knowledge and our senses, particularly with the scenes concerning Orton and Halliwell and the sets (particularly of their flat with Halliwell\'s murals decorating every surface) are terribly well d

In [255]:
df['sentiment'].value_counts() # Which shows data is approximately balanced


positive    1508
negative    1492
Name: sentiment, dtype: int64

In [256]:
df.isnull().sum()  # Here we can see data does not have null values


review       0
sentiment    0
dtype: int64

In [257]:
df.duplicated().sum()  # if there are duplicate records which we can drop


0

In [258]:
#df.drop_duplicates(inplace=True) # there are duplicate records which are drop


In [259]:
#df.duplicated().sum()  # check to see whether duplicate data is dropped or not


In [260]:
#df.reset_index(drop=True, inplace=True) #If you have dropped some rows (e.g., duplicates),you have removed some of the indices,
#thus they are not in your DataFrame index anymore.
#applying above code to generate a fresh index with consecutive numbers in your clean DataFrame.


In [261]:
len(df)
# length of data after duplicates removed

3000

In [262]:
# data cleaning or preprocessing using libraries
import re # regular expression library
import nltk # text based library
nltk.download('stopwords') # downloaded the stopwords library

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [263]:
# imported the libraries stopwords and stemming
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

In [264]:
ps=PorterStemmer() # initalized ps object to the porter stemmer

*dataset is in df, creating 'i' index corpus range and loop

In [265]:
corpus=[]
for i in range(0,len(df)):
#  print(df['review'][i])
 review=re.sub('[^a-zA-Z]'," ",df['review'][i])
 #print(review)
 review=review.lower() # lowering all text
 #print(review)
 review=review.split() # splitting (converting each sentence into the list of words)
 #print("before stemming",review[1])
 review=[ps.stem(word) for word in review if not word in stopwords.words('english')]
 # Each data in english taken
 #print("after stemming",review[1])
 review=" ".join(review)
 #print(review)
 corpus.append(review) # appending to corpus everytime

In [266]:
# bag of words initalizing
from sklearn.feature_extraction.text import CountVectorizer
cv=CountVectorizer(max_features=3000) #object cv created and maxfeature to 3k to reduced execution time
X=cv.fit_transform(corpus).toarray() # transform cv to X via corpus

In [267]:
X.shape


(3000, 3000)

In [268]:
y=pd.get_dummies(df['sentiment']) # converting the categorical output variable into numbers

In [269]:
y

Unnamed: 0,negative,positive
0,0,1
1,0,1
2,0,1
3,1,0
4,0,1
...,...,...
2995,0,1
2996,1,0
2997,0,1
2998,1,0


In [270]:
y=y.iloc[:,1].values

In [271]:
y

array([1, 1, 1, ..., 1, 0, 0], dtype=uint8)

*need only one column of both pos and neg

In [272]:
# train and test split
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test= train_test_split(X,y,test_size=.20,random_state=0)

In [273]:
# training model using multinomial navie bayes algotithm
from sklearn.naive_bayes import MultinomialNB
spam_model=MultinomialNB().fit(X_train,y_train)

In [274]:
#predict the model with test data
y_pred=spam_model.predict(X_test)

In [275]:
# check the performance
from sklearn.metrics import accuracy_score
accuracy_score(y_test,y_pred)

0.8233333333333334

In [276]:
# check confusion matrix to true positive and negatives
from sklearn.metrics import confusion_matrix
confusion_matrix(y_test,y_pred)

array([[251,  55],
       [ 51, 243]])

*true positive are 251 and negatives are 243

In [277]:
# lemmatization
from nltk.stem import WordNetLemmatizer
nltk.download('wordnet')
ln=WordNetLemmatizer()

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [278]:
# reading the imdb movies review and store into df again
df=pd.read_csv('/content/IMDB_Dataset.csv',)

In [279]:
df = df[0:3000]  # with 50000 records we are having resource limitations hence taken only 3000 rows
df.shape

(3000, 2)

In [280]:
df.duplicated().sum()  # there are no duplicate records which we can drop


0

In [281]:
#df.drop_duplicates(inplace=True)
#df.duplicated().sum()  # check to see whether duplicate data is dropped or not


In [282]:
#df.reset_index(drop=True, inplace=True) #If you have dropped some rows (e.g., duplicates),you have removed some of the indices,
#thus they are not in your DataFrame index anymore.
#applying above code to generate a fresh index with consecutive numbers in your clean DataFrame.


In [283]:
corpus=[]
for i in range(0,len(df)):
  #print()
  review=re.sub('[^a-zA-Z]'," ",df['review'][i]) # we removed all the numbers and special characters only allowed alphabets
  #print(review)
  review=review.lower() # lowering all text
  review= review.split() # splitting (converting each sentence into the list of words)
  #print("before lemmatizing",review[1])
  review=[ln.lemmatize(word) for word in review if not word in stopwords.words('english')]
  #print("after lemmatizing",review[1])
  review=" ".join(review)
  #print(review)
  corpus.append(review)

In [286]:
# bag of words initialising
from sklearn.feature_extraction.text import CountVectorizer
cv=CountVectorizer(max_features=3000)
X=cv.fit_transform(corpus).toarray() # transform cv to X via corpus

In [287]:
y=pd.get_dummies(df['sentiment']) # converting the categorical output variable into numbers
y=y.iloc[:,1].values

In [288]:
# train and test split
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test= train_test_split(X,y,test_size=.20,random_state=0)

In [289]:
#predict the model with test data
y_pred=spam_model.predict(X_test)

In [290]:
# check the performance
from sklearn.metrics import accuracy_score
accuracy_score(y_test,y_pred)

0.4533333333333333

#stemming is giving better score than lemmatization in Bag of words using navie Bayes algorithm

In [291]:
#tf idf on stemming and lemmatization using Bayes algorithm

In [293]:
corpus=[]
for i in range(0,len(df)):
#  print(df['review'][i])
 review=re.sub('[^a-zA-Z]'," ",df['review'][i])
 #print(review)
 review=review.lower() # lowering all text
 #print(review)
 review=review.split() # splitting (converting each sentence into the list of words)
 #print("before stemming",review[1])
 review=[ps.stem(word) for word in review if not word in stopwords.words('english')]
 # Each data in english taken
 #print("after stemming",review[1])
 review=" ".join(review)
 #print(review)
 corpus.append(review) # appending to corpus everytime

In [294]:
# TF-IDf iniatializing
from sklearn.feature_extraction.text import TfidfVectorizer
tf=TfidfVectorizer(max_features=3000) # tf object creating
X=tf.fit_transform(corpus).toarray()  # transform tf to X via corpus

In [295]:
y=pd.get_dummies(df['sentiment']) # converting the categorical output variable into numbers
y=y.iloc[:,1].values

In [296]:
# train and test split
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test= train_test_split(X,y,test_size=.20,random_state=0)

In [297]:
#predict the model with test data
y_pred=spam_model.predict(X_test)

In [298]:
# check the performance
from sklearn.metrics import accuracy_score
accuracy_score(y_test,y_pred)

0.8083333333333333

In [299]:
# check confusion matrix to true positive and negatives
from sklearn.metrics import confusion_matrix
confusion_matrix(y_test,y_pred)

array([[236,  70],
       [ 45, 249]])

*true positives are 236 and negatives are 249

In [300]:
corpus=[]
for i in range(0,len(df)):
  #print()
  review=re.sub('[^a-zA-Z]'," ",df['review'][i]) # we removed all the numbers and special characters only allowed alphabets
  #print(review)
  review=review.lower() # lowering all text
  review= review.split() # splitting (converting each sentence into the list of words)
  #print("before lemmatizing",review[1])
  review=[ln.lemmatize(word) for word in review if not word in stopwords.words('english')]
  #print("after lemmatizing",review[1])
  review=" ".join(review)
  #print(review)
  corpus.append(review)

In [301]:
# TF-IDf iniatializing
from sklearn.feature_extraction.text import TfidfVectorizer
tf=TfidfVectorizer(max_features=3000) # tf object creating
X=tf.fit_transform(corpus).toarray()  # transform tf to X via corpus

In [302]:
# train and test split
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test= train_test_split(X,y,test_size=.20,random_state=0)

In [303]:
#predict the model with test data
y_pred=spam_model.predict(X_test)

In [304]:
# check the performance
from sklearn.metrics import accuracy_score
accuracy_score(y_test,y_pred)

0.4666666666666667

In [305]:
# check confusion matrix to true positive and negatives
from sklearn.metrics import confusion_matrix
confusion_matrix(y_test,y_pred)

array([[ 99, 207],
       [113, 181]])

***IN tf-idf stemming score is better than lemmatization with true postives 99 and true negatives 181