In [None]:
import numpy as np
import pandas as pd
from pandas import read_csv
import seaborn as sns
import matplotlib.pyplot as plt
import re
import nltk
# Fetch the NLTK resources used by the cleaning cell: tokenizer models, the
# English stop-word list and the WordNet data behind the lemmatizer.
nltk.download('punkt')
# Recent NLTK releases load the tokenizer from 'punkt_tab' instead of 'punkt';
# requesting both keeps nltk.word_tokenize working across versions (on older
# releases an unknown package id is only reported, it does not raise).
nltk.download('punkt_tab')
nltk.download('stopwords')
nltk.download('wordnet')
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split

## Case 1 : Would use Word2Vec and then apply different classifiers
from gensim.models import Word2Vec

## Case 2 : Would use TfidfVectorizer and then apply different classifiers
from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.pipeline import Pipeline
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score
import warnings
warnings.filterwarnings("ignore")

In [None]:
# Load the BBC News CSV (path relative to the working directory) into a
# DataFrame and echo it so the raw contents can be eyeballed.
filename = 'BBC_News.csv'
dataset = pd.read_csv(filename)
print(dataset)

In [None]:
# Report the frame's dimensions as a (rows, columns) tuple.
dataset_shape = dataset.shape
print("Shape of Dataset: ", dataset_shape)

In [None]:
# Show the column labels as read from the CSV header.
column_labels = dataset.columns
print("Columns of Dataset: ", column_labels)

In [None]:
# List the distinct labels found in the 'Category' column.
print("Categories of Dataset: ", dataset["Category"].unique())

In [None]:
# Peek at five randomly drawn rows (unseeded, so the rows differ between runs).
random_rows = dataset.sample(5)
print("Samples of Dataset: ", random_rows)

In [None]:
# Plot the number of samples within each category.
print('NUMBER OF SAMPLES IN EACH CATEGORY: \n')
# Pass the column through the `x` keyword: seaborn 0.12+ accepts only `data`
# positionally, so a bare positional Series is no longer read as the variable
# to count (seaborn 0.11 already warned about this call form).
sns.countplot(x=dataset.Category)

In [None]:
# Count missing values per column (isnull is the pandas alias of isna).
dataset.isnull().sum()

In [None]:
# Normalise the column labels to lower case; the cleaning step that follows
# addresses the text column as dataset['text'].
lowercased_labels = dataset.columns.str.lower()
dataset.columns = lowercased_labels
print("Columns of Dataset: ", dataset.columns)

**Note:** The dataset appears to be balanced — the count plot above shows a roughly similar number of samples in each category.  

In [None]:
# DATA CLEANING
# Builds dataset['text_clean'] (one list of tokens per row) from dataset['text'].
# Every step reads and rewrites 'text_clean' only, so the raw 'text' column is
# left untouched and the cell can be re-run safely.
print('Data cleaning in progress...')

# Tokenize : dividing Sentences into words.
# The text is lower-cased first because both the NLTK stop-word list and the
# regex below are lower-case only: without this, a capitalised token such as
# "The" would slip past the stop-word filter and any capitalised word would then
# be discarded by the regex filter. This is a no-op for already lower-case text.
dataset['text_clean'] = dataset['text'].str.lower().apply(nltk.word_tokenize)
print('Tokenization complete.')


# Remove stop words (set membership keeps the per-token lookup O(1))
stop_words = set(nltk.corpus.stopwords.words("english"))
dataset['text_clean'] = dataset['text_clean'].apply(
    lambda tokens: [tok for tok in tokens if tok not in stop_words]
)
print('Stop words removed.')


# Remove numbers, punctuation and special characters.
# NOTE: re.match anchors at the start of the token only, so a token is kept
# when it *begins* with a lower-case letter (e.g. "2004" and "'s" are dropped,
# while a token like "u.s." is kept).
regex = '[a-z]+'
dataset['text_clean'] = dataset['text_clean'].apply(
    lambda tokens: [tok for tok in tokens if re.match(regex, tok)]
)
print('Numbers, punctuation and special characters removed.')


# Lemmatization : lemma means base form of a word.
# pos='v' makes WordNet treat every token as a verb, e.g. "running" -> "run";
# noun plurals are therefore not reduced as nouns ("leaves" -> "leave", not "leaf").
lem = nltk.stem.wordnet.WordNetLemmatizer()
dataset['text_clean'] = dataset['text_clean'].apply(
    lambda tokens: [lem.lemmatize(tok, pos='v') for tok in tokens]
)
print('Lemmatization complete.\nData cleaning complete.\n')



## **Solution**: Using `sklearn.feature_extraction.text.TfidfVectorizer`


<span style="background-color:yellow">Conclusion of this Analysis</span>: TfidfVectorizer performed far better and produced good results.