In [149]:
# 1. Text Classification of News Articles using NLP.
#        Article Id – Article id unique given to the record
#        Article – Text of the header and article
#        Category – Category of the article (tech, business, sport, entertainment, politics)

#     Consider BBC News as corpus for implementing question 1

In [150]:
import pandas as pd


In [151]:
newsdf = pd.read_csv('BBC News.csv')
newsdf

Unnamed: 0,ArticleId,Text,Category
0,1833,worldcom ex-boss launches defence lawyers defe...,business
1,154,german business confidence slides german busin...,business
2,1101,bbc poll indicates economic gloom citizens in ...,business
3,1976,lifestyle governs mobile choice faster bett...,tech
4,917,enron bosses in $168m payout eighteen former e...,business
...,...,...,...
1485,857,double eviction from big brother model caprice...,entertainment
1486,325,dj double act revamp chart show dj duo jk and ...,entertainment
1487,1590,weak dollar hits reuters revenues at media gro...,business
1488,1587,apple ipod family expands market apple has exp...,tech


In [152]:
newsdf.shape

(1490, 3)

In [153]:
newsdf.describe

<bound method NDFrame.describe of       ArticleId                                               Text  \
0          1833  worldcom ex-boss launches defence lawyers defe...   
1           154  german business confidence slides german busin...   
2          1101  bbc poll indicates economic gloom citizens in ...   
3          1976  lifestyle  governs mobile choice  faster  bett...   
4           917  enron bosses in $168m payout eighteen former e...   
...         ...                                                ...   
1485        857  double eviction from big brother model caprice...   
1486        325  dj double act revamp chart show dj duo jk and ...   
1487       1590  weak dollar hits reuters revenues at media gro...   
1488       1587  apple ipod family expands market apple has exp...   
1489        538  santy worm makes unwelcome visit thousands of ...   

           Category  
0          business  
1          business  
2          business  
3              tech  
4          busi

In [154]:
category = newsdf[['Category']].drop_duplicates().sort_values('Category')
category

Unnamed: 0,Category
0,business
7,entertainment
5,politics
6,sport
3,tech


In [155]:
newsdf['category_id'] = newsdf['Category'].factorize()[0]

In [156]:
cols = ['Index', 'Content', 'Category', 'Category_id']
newsdf.columns = cols
newsdf

Unnamed: 0,Index,Content,Category,Category_id
0,1833,worldcom ex-boss launches defence lawyers defe...,business,0
1,154,german business confidence slides german busin...,business,0
2,1101,bbc poll indicates economic gloom citizens in ...,business,0
3,1976,lifestyle governs mobile choice faster bett...,tech,1
4,917,enron bosses in $168m payout eighteen former e...,business,0
...,...,...,...,...
1485,857,double eviction from big brother model caprice...,entertainment,4
1486,325,dj double act revamp chart show dj duo jk and ...,entertainment,4
1487,1590,weak dollar hits reuters revenues at media gro...,business,0
1488,1587,apple ipod family expands market apple has exp...,tech,1


In [157]:
categories = newsdf[['Category', 'Category_id']]
print(categories['Category'].unique())
print(categories['Category_id'].unique())

['business' 'tech' 'politics' 'sport' 'entertainment']
[0 1 2 3 4]


In [158]:
import re
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
stop_words = list(stopwords.words('english'))
# Preprocessing
def preprocessing(text):
    text = re.sub(r"\ 's", "", text)
    text = re.sub(r"\ 've", "", text)
    text = re.sub(r"\)","", text)
    text = re.sub(r"\?", "", text)
    text = re.sub(r"[^A-Za-z0-9(),!?\'\~]" ,"", text)
    text = re.sub(r"[0-9]\w+|[0-9]","",text)
    text = re.sub(r"\s{2,}","",text)
    text = text.lower()
    text = " ".join([word for word in text.split() if word not in stop_words])
    tokens = word_tokenize(text)
    lemmatizer = WordNetLemmatizer()
    lemm_text = [lemmatizer.lemmatize(word) for word in tokens]
    text = ' '.join(lemm_text)
    return text

newsdf['Content'] = newsdf['Content'].apply(preprocessing)
newsdf

Unnamed: 0,Index,Content,Category,Category_id
0,1833,worldcomexbosslaunchesdefencelawyersdefendingf...,business,0
1,154,germanbusinessconfidenceslidesgermanbusinessco...,business,0
2,1101,bbcpollindicateseconomicgloomcitizensinamajori...,business,0
3,1976,lifestylegovernsmobilechoicefasterbetterorfunk...,tech,1
4,917,enronbossesin ( (,business,0
...,...,...,...,...
1485,857,doubleevictionfrombigbrothermodelcapriceandhol...,entertainment,4
1486,325,djdoubleactrevampchartshowdjduojkandjoelaretak...,entertainment,4
1487,1590,weakdollarhitsreutersrevenuesatmediagroupreute...,business,0
1488,1587,appleipodfamilyexpandsmarketapplehasexpandedit...,tech,1


In [159]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np
# Model Training
x = newsdf['Content'].tolist()
y = newsdf['Category_id'].tolist()

# TF-IDF

vector = TfidfVectorizer(stop_words='english',min_df=2)
X = vector.fit_transform(x)
Y = np.array(y)

tf_idf = vector.fit_transform(x)

print ("No. of features extracted:" ,X.shape[1])
#split data
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size = 0.20, random_state = 45)


No. of features extracted: 115


In [160]:
from sklearn.metrics import confusion_matrix, cohen_kappa_score, accuracy_score
from sklearn.ensemble import RandomForestClassifier
model = RandomForestClassifier(n_estimators=300, max_depth=150,n_jobs=1)
model.fit(X_train, y_train)

y_pred = model.predict(X_test)
c_mat = confusion_matrix(y_test,y_pred)
kappa = cohen_kappa_score(y_test,y_pred)
acc = accuracy_score(y_test,y_pred)
print ("Confusion Matrix:\n", c_mat)
print ("\nKappa: ",kappa)
print ("\nAccuracy: ",acc)


Confusion Matrix:
 [[264   0   0   0   0]
 [198   8   0   0   0]
 [218   0   4   0   0]
 [278   0   0   2   0]
 [214   0   0   0   6]]

Kappa:  0.02231536349183405

Accuracy:  0.23825503355704697


In [161]:
headline = ['Australian Parliament passes free trade agreement with India']
vec = vector.transform(headline).toarray()
print('Headline:', headline)
print(str(list(model.predict(vec))[0]).replace('0', 'BUSINESS').replace('1', 'TECH').replace('2', 'POLITICS').replace('3','SPORTS').replace('4','ENTERTAINMENT'))

Headline: ['Australian Parliament passes free trade agreement with India']
BUSINESS
