Article Id – unique identifier assigned to each record
Article – text of the headline and article body
Category – category of the article (tech, business, sport, entertainment, politics)
# --- Setup: load the BBC News dataset ---
import os
import numpy as np
import pandas as pd
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')  # NOTE(review): blanket suppression hides deprecation warnings — consider narrowing the filter
# Hard-coded absolute Windows path — only works on this machine; consider a relative path.
path = 'C:\\Users\\mahit\\OneDrive\\Desktop\\DSPP\\NLP & Big Data\\Assignments\\'
df = pd.read_csv(path + 'BBC News.csv')  # columns: ArticleId, Text, Category
df  # display the frame (notebook cell output)
ArticleId | Text | Category | |
---|---|---|---|
0 | 1833 | worldcom ex-boss launches defence lawyers defe... | business |
1 | 154 | german business confidence slides german busin... | business |
2 | 1101 | bbc poll indicates economic gloom citizens in ... | business |
3 | 1976 | lifestyle governs mobile choice faster bett... | tech |
4 | 917 | enron bosses in $168m payout eighteen former e... | business |
... | ... | ... | ... |
1485 | 857 | double eviction from big brother model caprice... | entertainment |
1486 | 325 | dj double act revamp chart show dj duo jk and ... | entertainment |
1487 | 1590 | weak dollar hits reuters revenues at media gro... | business |
1488 | 1587 | apple ipod family expands market apple has exp... | tech |
1489 | 538 | santy worm makes unwelcome visit thousands of ... | tech |
1490 rows × 3 columns
len(df)  # total number of articles (1490 per the output below)
1490
df.isnull().sum()  # per-column null counts — output shows no missing values
ArticleId 0 Text 0 Category 0 dtype: int64
# NOTE(review): the result is not assigned and inplace is not set, so this is a
# no-op on `df` — it only displays the frame without the ArticleId column.
# (Leaving it unassigned is actually required here: df['ArticleId'] is
# accessed again further down.)
df.drop('ArticleId', axis = 1)
Text | Category | |
---|---|---|
0 | worldcom ex-boss launches defence lawyers defe... | business |
1 | german business confidence slides german busin... | business |
2 | bbc poll indicates economic gloom citizens in ... | business |
3 | lifestyle governs mobile choice faster bett... | tech |
4 | enron bosses in $168m payout eighteen former e... | business |
... | ... | ... |
1485 | double eviction from big brother model caprice... | entertainment |
1486 | dj double act revamp chart show dj duo jk and ... | entertainment |
1487 | weak dollar hits reuters revenues at media gro... | business |
1488 | apple ipod family expands market apple has exp... | tech |
1489 | santy worm makes unwelcome visit thousands of ... | tech |
1490 rows × 2 columns
df['Category'].unique()  # the five target classes
array(['business', 'tech', 'politics', 'sport', 'entertainment'], dtype=object)
df['ArticleId'].describe()  # summary stats of the id column (identifier only, not a feature)
count 1490.000000 mean 1119.696644 std 641.826283 min 2.000000 25% 565.250000 50% 1112.500000 75% 1680.750000 max 2224.000000 Name: ArticleId, dtype: float64
df['Category'].value_counts(normalize = True)  # class balance — fairly even, ~17.5%-23% per class
sport 0.232215 business 0.225503 politics 0.183893 entertainment 0.183221 tech 0.175168 Name: Category, dtype: float64
# Bar chart of article counts per category.
# Fix: seaborn >= 0.12 no longer accepts the data vector as a positional
# argument — pass it via the explicit `x=` keyword (same plot, same axes).
sns.countplot(x=df['Category'])
<AxesSubplot:xlabel='Category', ylabel='count'>
# Hold out a third of the data for evaluation; fixed seed for reproducibility.
# NOTE(review): consider stratify=y so train/test keep the class proportions —
# not applied here because it would change the split (and the numbers below).
from sklearn.model_selection import train_test_split
X = df['Text']  # raw article text (features)
y = df['Category']  # target labels
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)
X  # display the feature series (notebook cell output)
0 worldcom ex-boss launches defence lawyers defe... 1 german business confidence slides german busin... 2 bbc poll indicates economic gloom citizens in ... 3 lifestyle governs mobile choice faster bett... 4 enron bosses in $168m payout eighteen former e... ... 1485 double eviction from big brother model caprice... 1486 dj double act revamp chart show dj duo jk and ... 1487 weak dollar hits reuters revenues at media gro... 1488 apple ipod family expands market apple has exp... 1489 santy worm makes unwelcome visit thousands of ... Name: Text, Length: 1490, dtype: object
y  # display the label series (notebook cell output)
0 business 1 business 2 business 3 tech 4 business ... 1485 entertainment 1486 entertainment 1487 business 1488 tech 1489 tech Name: Category, Length: 1490, dtype: object
# Bag-of-words: raw token counts over the training text.
from sklearn.feature_extraction.text import CountVectorizer
count_vect = CountVectorizer()
X_train_counts = count_vect.fit_transform(X_train)  # sparse matrix: (n_train_docs, vocab_size)
print(X_train_counts.shape)
print(X_train_counts)  # sparse COO-style dump: (doc, term_index) count
(998, 20875) (0, 14877) 3 (0, 18978) 13 (0, 8359) 1 (0, 1739) 1 (0, 14083) 4 (0, 11111) 4 (0, 13209) 1 (0, 12584) 16 (0, 1641) 10 (0, 4881) 5 (0, 10265) 3 (0, 1869) 3 (0, 2498) 1 (0, 18589) 1 (0, 16661) 2 (0, 16525) 2 (0, 9721) 12 (0, 18778) 23 (0, 19448) 2 (0, 1383) 2 (0, 2047) 5 (0, 60) 1 (0, 121) 1 (0, 20757) 2 (0, 13187) 1 : : (997, 9973) 1 (997, 8667) 1 (997, 18137) 1 (997, 9691) 1 (997, 14790) 1 (997, 1994) 1 (997, 4177) 2 (997, 1549) 1 (997, 10186) 1 (997, 15432) 1 (997, 19710) 1 (997, 9614) 1 (997, 14616) 1 (997, 13910) 1 (997, 18367) 2 (997, 9094) 2 (997, 14679) 1 (997, 7100) 1 (997, 14750) 1 (997, 20776) 1 (997, 13608) 2 (997, 18733) 1 (997, 19758) 1 (997, 2017) 1 (997, 18536) 1
# Re-weight the raw counts by TF-IDF.
# NOTE(review): this X_train_tfidf is recomputed (and overwritten) just below
# with TfidfVectorizer, which combines both steps — this cell is redundant
# and kept only to illustrate the two-step approach.
from sklearn.feature_extraction.text import TfidfTransformer
tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)
print(X_train_tfidf.shape)
print(X_train_tfidf)
(998, 20875) (0, 20804) 0.0616912886065074 (0, 20792) 0.09422625290210854 (0, 20757) 0.027849536428000365 (0, 20660) 0.014201980695658225 (0, 20637) 0.018477191857034246 (0, 20632) 0.03586039874568629 (0, 20618) 0.06765562752747772 (0, 20616) 0.02379862613725528 (0, 20552) 0.020171310369424333 (0, 20546) 0.043907755673260135 (0, 20531) 0.02876455068168828 (0, 20489) 0.03607157173632206 (0, 20485) 0.04844000229876237 (0, 20456) 0.03607157173632206 (0, 20448) 0.032604005104225606 (0, 20445) 0.0146791123973463 (0, 20422) 0.05210770870386959 (0, 20414) 0.03336877070200023 (0, 20368) 0.02003095602591858 (0, 20287) 0.015467965090728918 (0, 20284) 0.03607157173632206 (0, 20242) 0.020818402108144864 (0, 20080) 0.04044018904116368 (0, 19872) 0.06867201421954602 (0, 19692) 0.05822372197441094 : : (997, 2051) 0.09249120169408614 (997, 2049) 0.13078045416915438 (997, 2047) 0.04534493373600035 (997, 2021) 0.0361175892543833 (997, 2017) 0.06650162403236264 (997, 1994) 0.058054419089917556 (997, 1953) 0.023871707865995538 (997, 1926) 0.04624560084704307 (997, 1850) 0.050785698934084966 (997, 1807) 0.04856244680742958 (997, 1792) 0.0384344352147698 (997, 1786) 0.038217509595959896 (997, 1704) 0.03016452253325912 (997, 1641) 0.09293035755774524 (997, 1621) 0.013237947381796365 (997, 1549) 0.06276367629802855 (997, 1526) 0.08635089337493325 (997, 1519) 0.03759577426606561 (997, 1480) 0.09054858274653231 (997, 1469) 0.03459064499250703 (997, 1315) 0.016489479408291294 (997, 389) 0.038217509595959896 (997, 356) 0.045274291373266155 (997, 344) 0.03530122314953035 (997, 43) 0.043593484723051454
# TfidfVectorizer = CountVectorizer + TfidfTransformer in a single step;
# same shape (998, 20875) as the two-step result above.
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer()
X_train_tfidf = vectorizer.fit_transform(X_train)
print(X_train_tfidf.shape)
print(X_train_tfidf)
(998, 20875) (0, 20284) 0.03607157173632207 (0, 12811) 0.0154105041471869 (0, 9371) 0.03506428278321159 (0, 20448) 0.03260400510422562 (0, 14828) 0.025668871840032716 (0, 6094) 0.04792726035848692 (0, 5052) 0.033348132629451584 (0, 1505) 0.03882177791227634 (0, 18631) 0.028584500418998852 (0, 3858) 0.030037489144874335 (0, 18628) 0.04711216087747208 (0, 13264) 0.03797991136217772 (0, 3840) 0.041582159070502556 (0, 7324) 0.034692394992770714 (0, 1544) 0.02867405172511228 (0, 9552) 0.041999416047404495 (0, 18787) 0.028537068235696076 (0, 19376) 0.03246264533059357 (0, 10766) 0.02639573988091263 (0, 5060) 0.12338257721301484 (0, 20080) 0.04044018904116369 (0, 18609) 0.05097757001368143 (0, 13664) 0.02340078363314239 (0, 6990) 0.04290046672014966 (0, 2599) 0.02742028831836916 : : (997, 10235) 0.03205881532556317 (997, 1526) 0.08635089337493324 (997, 20660) 0.015309370286648721 (997, 1953) 0.023871707865995535 (997, 16329) 0.021459309954633472 (997, 9014) 0.040495403299055446 (997, 20445) 0.03164741200345572 (997, 20552) 0.03261623145004439 (997, 20242) 0.033662553852158265 (997, 20422) 0.014042692784046821 (997, 15999) 0.03412899059336392 (997, 7873) 0.04939356424439194 (997, 2557) 0.07076174900895837 (997, 8965) 0.022304617721905493 (997, 8061) 0.012250953417679336 (997, 13128) 0.11151642906929428 (997, 20485) 0.0522170778689048 (997, 20757) 0.030021084708143193 (997, 2047) 0.04534493373600034 (997, 18778) 0.3410997972567003 (997, 9721) 0.0744187447173515 (997, 2498) 0.022906156015038866 (997, 1641) 0.09293035755774523 (997, 13209) 0.05090499693587914 (997, 18978) 0.07412136000992117
# Linear SVM on the TF-IDF features.
# NOTE(review): `clf` is never used again — predictions below come from the
# pipeline, so this standalone fit is demonstration only and could be removed.
from sklearn.svm import LinearSVC
clf = LinearSVC()
clf.fit(X_train_tfidf,y_train)
LinearSVC()
# Single pipeline: vectorize raw text, then classify — guarantees the same
# transformation is applied at both fit and predict time.
from sklearn.pipeline import Pipeline
text_clf = Pipeline([('tfidf', TfidfVectorizer()),
('clf', LinearSVC()),
])
text_clf.fit(X_train, y_train)
Pipeline(steps=[('tfidf', TfidfVectorizer()), ('clf', LinearSVC())])
# Form a prediction set
predictions = text_clf.predict(X_test)  # pipeline handles vectorizing the raw test text
from sklearn import metrics
print(metrics.confusion_matrix(y_test,predictions))  # rows = true class, cols = predicted class
[[114 0 2 0 1] [ 1 87 0 0 0] [ 2 1 88 0 2] [ 0 0 0 110 0] [ 1 1 0 0 82]]
print(metrics.classification_report(y_test,predictions))  # per-class precision/recall/F1
precision recall f1-score support business 0.97 0.97 0.97 117 entertainment 0.98 0.99 0.98 88 politics 0.98 0.95 0.96 93 sport 1.00 1.00 1.00 110 tech 0.96 0.98 0.97 84 accuracy 0.98 492 macro avg 0.98 0.98 0.98 492 weighted avg 0.98 0.98 0.98 492
print(metrics.accuracy_score(y_test,predictions))  # overall accuracy (~0.978 per the output)
0.9776422764227642