import numpy as np 
import pandas as pd
import os
import matplotlib.pyplot as plt
import pickle
import seaborn as sns
import nltk
from nltk.corpus import stopwords
from wordcloud import WordCloud ,STOPWORDS
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn import preprocessing
from sklearn.ensemble import RandomForestClassifier
from nltk.tokenize import word_tokenize 
from sklearn.model_selection import GridSearchCV

import re
import warnings
warnings.filterwarnings("ignore")
# Load the BBC News dataset from a CSV file located relative to the current
# working directory.
data = pd.read_csv('BBC News.csv')
# Bare expression: in a notebook cell this displays the frame (see the
# preview below, which shows ArticleId, Text and Category columns).
data
ArticleId Text Category
0 1833 worldcom ex-boss launches defence lawyers defe... business
1 154 german business confidence slides german busin... business
2 1101 bbc poll indicates economic gloom citizens in ... business
3 1976 lifestyle governs mobile choice faster bett... tech
4 917 enron bosses in $168m payout eighteen former e... business
... ... ... ...
1485 857 double eviction from big brother model caprice... entertainment
1486 325 dj double act revamp chart show dj duo jk and ... entertainment
1487 1590 weak dollar hits reuters revenues at media gro... business
1488 1587 apple ipod family expands market apple has exp... tech
1489 538 santy worm makes unwelcome visit thousands of ... tech

1490 rows × 3 columns

# Preview the first rows of the raw dataset.
data.head()
ArticleId Text Category
0 1833 worldcom ex-boss launches defence lawyers defe... business
1 154 german business confidence slides german busin... business
2 1101 bbc poll indicates economic gloom citizens in ... business
3 1976 lifestyle governs mobile choice faster bett... tech
4 917 enron bosses in $168m payout eighteen former e... business
# List the distinct class labels present in the Category column.
data['Category'].unique()
array(['business', 'tech', 'politics', 'sport', 'entertainment'],
      dtype=object)
# (rows, columns) of the loaded frame.
data.shape
(1490, 3)
# Column dtypes, to check how pandas parsed each column.
data.dtypes
ArticleId     int64
Text         object
Category     object
dtype: object
# Per-column flag: True if the column contains at least one missing value.
data.isnull().any()
ArticleId    False
Text         False
Category     False
dtype: bool
# Bar chart of the number of articles per category. The column is passed by
# keyword: a positional Series is deprecated in seaborn 0.11 and is interpreted
# as ``data`` (not ``x``) in later releases.
sns.countplot(x='Category', data=data)
<AxesSubplot:xlabel='Category', ylabel='count'>

# Add a column holding the character count of each article's text.
data['News_length'] = data['Text'].str.len()
print(data['News_length'])
0       1866
1       2016
2       3104
3       3618
4       2190
        ... 
1485    1266
1486    3111
1487    1370
1488    3242
1489    1723
Name: News_length, Length: 1490, dtype: int64
# Plot the distribution of article lengths and title the axes.
# NOTE(review): distplot is deprecated in newer seaborn releases in favour of
# histplot/displot — confirm the installed version before upgrading seaborn.
# The trailing semicolon presumably suppresses the notebook's echo of the
# returned object.
sns.distplot(data['News_length']).set_title('News length distribution');

def create_wordcloud(words):
    """Generate a word cloud from *words* and show it in a new figure.

    Args:
        words: A single string of text handed to ``WordCloud.generate``.
    """
    cloud_options = {
        "width": 800,
        "height": 500,
        "random_state": 21,
        "max_font_size": 110,
    }
    cloud = WordCloud(**cloud_options)
    rendered = cloud.generate(words)

    # Draw the cloud as an image on a fresh figure with the axes hidden.
    plt.figure(figsize=(10, 7))
    plt.imshow(rendered, interpolation="bilinear")
    plt.axis('off')
    plt.show()
# Render one word cloud per news category. This replaces five copy-pasted
# stanzas with a single loop; the category order is unchanged, and `subset`,
# `text` and `words` still hold the values for "tech" afterwards.
for category in ("business", "entertainment", "politics", "sport", "tech"):
    subset = data[data.Category == category]
    text = subset.Text.values
    # Concatenate every article of the category into one string.
    words = " ".join(text)
    create_wordcloud(words)

def process_text(text):
    """Normalise an article's text and strip English stop words.

    The text is lower-cased, newlines become spaces, carriage returns are
    dropped, runs of spaces are collapsed, and every character that is
    neither a word character nor whitespace is removed. The result is then
    tokenised and tokens found in NLTK's English stop-word list are discarded.

    Args:
        text: Raw article text.

    Returns:
        The surviving tokens joined by single spaces.
    """
    text = text.lower().replace('\n', ' ').replace('\r', '').strip()
    text = re.sub(' +', ' ', text)
    # Remove punctuation: anything that is not a word character or whitespace.
    text = re.sub(r'[^\w\s]', '', text)

    stop_words = set(stopwords.words('english'))
    # Single filtering pass; the earlier version built this list twice
    # (a comprehension whose result was immediately overwritten by a loop).
    kept_tokens = [w for w in word_tokenize(text) if w not in stop_words]
    return " ".join(kept_tokens)
import nltk
# Fetch the NLTK stop-word corpus that process_text reads through
# stopwords.words('english').
# NOTE(review): process_text also calls word_tokenize, which presumably needs
# additional tokenizer data (e.g. 'punkt') — confirm it is installed.
nltk.download('stopwords')
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/gundarohith/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
True
# NOTE(review): the cleaning step below is commented out, so no 'Text_parsed'
# column is created and process_text is not applied to the data here.
#data['Text_parsed'] = data['Text'].apply(process_text)
# Preview the frame; it now carries the News_length column as well.
data.head()
ArticleId Text Category News_length
0 1833 worldcom ex-boss launches defence lawyers defe... business 1866
1 154 german business confidence slides german busin... business 2016
2 1101 bbc poll indicates economic gloom citizens in ... business 3104
3 1976 lifestyle governs mobile choice faster bett... tech 3618
4 917 enron bosses in $168m payout eighteen former e... business 2190
# Encode the string category labels as integers in a new Category_target
# column; the fitted encoder is kept in `label_encoder`.
label_encoder = preprocessing.LabelEncoder() 
data['Category_target']= label_encoder.fit_transform(data['Category']) 
# Preview the result (in the output below, business maps to 0 and tech to 4).
data.head()
ArticleId Text Category News_length Category_target
0 1833 worldcom ex-boss launches defence lawyers defe... business 1866 0
1 154 german business confidence slides german busin... business 2016 0
2 1101 bbc poll indicates economic gloom citizens in ... business 3104 0
3 1976 lifestyle governs mobile choice faster bett... tech 3618 4
4 917 enron bosses in $168m payout eighteen former e... business 2190 0
# Persist the frame, including the added News_length and Category_target
# columns, to a CSV in the current working directory.
# NOTE(review): `index=False` is not passed, so the row index is presumably
# written as an extra unnamed column — confirm this is intended.
data.to_csv('BBC_News_processed.csv')
# Feature-extraction settings. The names match TfidfVectorizer parameters, so
# they are presumably meant for the vectorizer imported above — TODO confirm
# where they are consumed.
ngram_range = (1, 2)    # (min_n, max_n) pair
min_df = 10             # integer threshold
max_df = 1.0            # float threshold
max_features = 300      # integer cap
# Show the estimator's hyperparameters.
# NOTE(review): `model` is not defined anywhere in the visible source; the
# keys printed below (bootstrap, n_estimators, oob_score) suggest a
# RandomForestClassifier — confirm where it is created.
model.get_params()
{'bootstrap': True,
 'ccp_alpha': 0.0,
 'class_weight': None,
 'criterion': 'gini',
 'max_depth': None,
 'max_features': 'auto',
 'max_leaf_nodes': None,
 'max_samples': None,
 'min_impurity_decrease': 0.0,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 100,
 'n_jobs': None,
 'oob_score': False,
 'random_state': 1,
 'verbose': 0,
 'warm_start': False}