import numpy as np
import pandas as pd
import os
import matplotlib.pyplot as plt
import pickle
import seaborn as sns
import nltk
from nltk.corpus import stopwords
from wordcloud import WordCloud ,STOPWORDS
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn import preprocessing
from sklearn.ensemble import RandomForestClassifier
from nltk.tokenize import word_tokenize
from sklearn.model_selection import GridSearchCV
import re
import warnings

# Suppress all warnings raised from this point on.
warnings.filterwarnings("ignore")

# Load the BBC News dataset; the bare name echoes the frame in a notebook cell.
data = pd.read_csv('BBC News.csv')
data
ArticleId | Text | Category | |
---|---|---|---|
0 | 1833 | worldcom ex-boss launches defence lawyers defe... | business |
1 | 154 | german business confidence slides german busin... | business |
2 | 1101 | bbc poll indicates economic gloom citizens in ... | business |
3 | 1976 | lifestyle governs mobile choice faster bett... | tech |
4 | 917 | enron bosses in $168m payout eighteen former e... | business |
... | ... | ... | ... |
1485 | 857 | double eviction from big brother model caprice... | entertainment |
1486 | 325 | dj double act revamp chart show dj duo jk and ... | entertainment |
1487 | 1590 | weak dollar hits reuters revenues at media gro... | business |
1488 | 1587 | apple ipod family expands market apple has exp... | tech |
1489 | 538 | santy worm makes unwelcome visit thousands of ... | tech |
1490 rows × 3 columns
# Preview the first rows of the loaded frame.
data.head()
ArticleId | Text | Category | |
---|---|---|---|
0 | 1833 | worldcom ex-boss launches defence lawyers defe... | business |
1 | 154 | german business confidence slides german busin... | business |
2 | 1101 | bbc poll indicates economic gloom citizens in ... | business |
3 | 1976 | lifestyle governs mobile choice faster bett... | tech |
4 | 917 | enron bosses in $168m payout eighteen former e... | business |
# List the distinct labels present in the 'Category' column.
data['Category'].unique()
array(['business', 'tech', 'politics', 'sport', 'entertainment'],
dtype=object)
# (rows, columns) of the loaded frame.
data.shape
(1490, 3)
# Per-column dtypes of the loaded frame.
data.dtypes
ArticleId int64
Text object
Category object
dtype: object
# Per-column flag: True if the column contains at least one missing value.
data.isnull().any()
ArticleId False
Text False
Category False
dtype: bool
# Bar chart of article counts per category. The Series is passed by keyword
# because newer seaborn releases take `data` as the first positional
# parameter, so a positional Series is no longer interpreted as `x`.
sns.countplot(x=data.Category)
<AxesSubplot:xlabel='Category', ylabel='count'>
# Character length of each article, stored as a new column.
data['News_length'] = data['Text'].str.len()
print(data['News_length'])
0 1866
1 2016
2 3104
3 3618
4 2190
...
1485 1266
1486 3111
1487 1370
1488 3242
1489 1723
Name: News_length, Length: 1490, dtype: int64
# Distribution of article lengths. The trailing semicolon suppresses the
# notebook's echo of the returned Text object.
# NOTE(review): `sns.distplot` is deprecated in recent seaborn releases;
# `sns.histplot(..., kde=True)` is the suggested replacement, but it changes
# the y-axis scaling, so the original call is kept here.
sns.distplot(data['News_length']).set_title('News length distribution');
def create_wordcloud(words):
    """Render a word cloud for *words* and display it with matplotlib.

    Args:
        words: A single string of text passed to ``WordCloud.generate``.

    Returns:
        None. The figure is shown via ``plt.show()``.
    """
    # Fixed random_state keeps the layout reproducible between runs.
    wordcloud = WordCloud(
        width=800, height=500, random_state=21, max_font_size=110
    ).generate(words)
    plt.figure(figsize=(10, 7))
    plt.imshow(wordcloud, interpolation="bilinear")
    plt.axis('off')  # hide the axes around the image
    plt.show()
# Draw one word cloud per news category from that category's concatenated
# article text. After the loop, `subset`, `text` and `words` hold the values
# for the last category ("tech"), matching the original cell-by-cell order.
for category in ("business", "entertainment", "politics", "sport", "tech"):
    subset = data[data.Category == category]
    text = subset.Text.values
    words = " ".join(text)
    create_wordcloud(words)
def process_text(text):
    """Normalize a piece of text and drop English stop words.

    Steps: lowercase; replace newlines with spaces and remove carriage
    returns; trim; collapse runs of spaces; remove every character that is
    neither a word character nor whitespace; tokenize; filter stop words.

    Args:
        text: The raw text as a ``str``.

    Returns:
        The remaining tokens joined by single spaces.
    """
    text = text.lower().replace('\n', ' ').replace('\r', '').strip()
    text = re.sub(' +', ' ', text)
    text = re.sub(r'[^\w\s]', '', text)

    # A set gives O(1) membership tests while filtering tokens.
    stop_words = set(stopwords.words('english'))
    word_tokens = word_tokenize(text)
    # Single pass: the original built this list twice (a comprehension that
    # was immediately overwritten by an equivalent loop).
    filtered_sentence = [w for w in word_tokens if w not in stop_words]
    return " ".join(filtered_sentence)
import nltk

# Fetch the NLTK stop-word corpus read by stopwords.words('english').
nltk.download('stopwords')
[nltk_data] Downloading package stopwords to
[nltk_data] /Users/gundarohith/nltk_data...
[nltk_data] Package stopwords is already up-to-date!
True
# Text cleaning via process_text is currently disabled; uncomment the next
# line to store the cleaned text in a new 'Text_parsed' column.
#data['Text_parsed'] = data['Text'].apply(process_text)
data.head()
ArticleId | Text | Category | News_length | |
---|---|---|---|---|
0 | 1833 | worldcom ex-boss launches defence lawyers defe... | business | 1866 |
1 | 154 | german business confidence slides german busin... | business | 2016 |
2 | 1101 | bbc poll indicates economic gloom citizens in ... | business | 3104 |
3 | 1976 | lifestyle governs mobile choice faster bett... | tech | 3618 |
4 | 917 | enron bosses in $168m payout eighteen former e... | business | 2190 |
# Encode the string category labels as integer class ids in a new
# 'Category_target' column, then preview the result.
label_encoder = preprocessing.LabelEncoder()
data['Category_target'] = label_encoder.fit_transform(data['Category'])
data.head()
ArticleId | Text | Category | News_length | Category_target | |
---|---|---|---|---|---|
0 | 1833 | worldcom ex-boss launches defence lawyers defe... | business | 1866 | 0 |
1 | 154 | german business confidence slides german busin... | business | 2016 | 0 |
2 | 1101 | bbc poll indicates economic gloom citizens in ... | business | 3104 | 0 |
3 | 1976 | lifestyle governs mobile choice faster bett... | tech | 3618 | 4 |
4 | 917 | enron bosses in $168m payout eighteen former e... | business | 2190 | 0 |
# Persist the frame with the added columns. The row index is written too
# (pandas' default), so it appears as an extra unnamed column on reload.
data.to_csv('BBC_News_processed.csv')
# Feature-extraction settings.
# NOTE(review): presumably passed to TfidfVectorizer (imported at the top of
# the file); the call itself is not visible in this excerpt — confirm.
ngram_range = (1, 2)
min_df = 10
max_df = 1.
max_features = 300
# Show the current hyperparameters of `model` (defined outside this excerpt).
model.get_params()
{'bootstrap': True,
'ccp_alpha': 0.0,
'class_weight': None,
'criterion': 'gini',
'max_depth': None,
'max_features': 'auto',
'max_leaf_nodes': None,
'max_samples': None,
'min_impurity_decrease': 0.0,
'min_samples_leaf': 1,
'min_samples_split': 2,
'min_weight_fraction_leaf': 0.0,
'n_estimators': 100,
'n_jobs': None,
'oob_score': False,
'random_state': 1,
'verbose': 0,
'warm_start': False}