# BBC News Assignment
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import re
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
nltk.download('wordnet')
from nltk.tokenize import word_tokenize
from nltk.tokenize import sent_tokenize
nltk.download('punkt')
#from wordcloud import WordCloud
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.metrics import make_scorer, roc_curve, roc_auc_score
from sklearn.metrics import precision_recall_fscore_support as score
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.multiclass import OneVsRestClassifier
from sklearn.linear_model import LogisticRegression
df = pd.read_csv(r"C:\Users\MNLVBPV\BBC News.csv")
!pip install nltk
print(df)
      ArticleId                                               Text  \
0          1833  worldcom ex-boss launches defence lawyers defe...
1           154  german business confidence slides german busin...
2          1101  bbc poll indicates economic gloom citizens in ...
3          1976  lifestyle governs mobile choice  faster  bett...
4           917  enron bosses in $168m payout eighteen former e...
...         ...                                                ...
1485        857  double eviction from big brother model caprice...
1486        325  dj double act revamp chart show dj duo jk and ...
1487       1590  weak dollar hits reuters revenues at media gro...
1488       1587  apple ipod family expands market apple has exp...
1489        538  santy worm makes unwelcome visit thousands of ...

           Category
0          business
1          business
2          business
3              tech
4          business
...             ...
1485  entertainment
1486  entertainment
1487       business
1488           tech
1489           tech

[1490 rows x 3 columns]
df.shape
(1490, 3)
df.info()
df.head(5)
| | ArticleId | Text | Category |
|---|---|---|---|
| 0 | 1833 | worldcom ex-boss launches defence lawyers defe... | business |
| 1 | 154 | german business confidence slides german busin... | business |
| 2 | 1101 | bbc poll indicates economic gloom citizens in ... | business |
| 3 | 1976 | lifestyle governs mobile choice faster bett... | tech |
| 4 | 917 | enron bosses in $168m payout eighteen former e... | business |
target_category = df['Category'].unique()
print(target_category)
['business' 'tech' 'politics' 'sport' 'entertainment']
df['CategoryId'] = df['Category'].factorize()[0]
df.head()
| | ArticleId | Text | Category | CategoryId |
|---|---|---|---|---|
| 0 | 1833 | worldcom ex-boss launches defence lawyers defe... | business | 0 |
| 1 | 154 | german business confidence slides german busin... | business | 0 |
| 2 | 1101 | bbc poll indicates economic gloom citizens in ... | business | 0 |
| 3 | 1976 | lifestyle governs mobile choice faster bett... | tech | 1 |
| 4 | 917 | enron bosses in $168m payout eighteen former e... | business | 0 |
category = df[['Category', 'CategoryId']].drop_duplicates().sort_values('CategoryId')
category
| | Category | CategoryId |
|---|---|---|
| 0 | business | 0 |
| 3 | tech | 1 |
| 5 | politics | 2 |
| 6 | sport | 3 |
| 7 | entertainment | 4 |
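The factorized codes line up with this table; as a small sketch (using only the `category` frame defined above), lookup dictionaries can be built for converting between the label text and its id, which is handy when interpreting predictions later:

# Build id <-> name mappings from the deduplicated category frame
category_to_id = dict(zip(category['Category'], category['CategoryId']))
id_to_category = dict(zip(category['CategoryId'], category['Category']))
print(id_to_category)  # {0: 'business', 1: 'tech', 2: 'politics', 3: 'sport', 4: 'entertainment'}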
df.groupby('Category').CategoryId.value_counts().plot(kind = "bar", color = ["pink", "orange", "red", "yellow", "blue"])
plt.xlabel("Category")
plt.title("Number of articles per category")
plt.show()
text = df["Text"]
text.head(10)
0    worldcom ex-boss launches defence lawyers defe...
1    german business confidence slides german busin...
2    bbc poll indicates economic gloom citizens in ...
3    lifestyle governs mobile choice  faster  bett...
4    enron bosses in $168m payout eighteen former e...
5    howard truanted to play snooker conservative...
6    wales silent on grand slam talk rhys williams ...
7    french honour for director parker british film...
8    car giant hit by mercedes slump a slump in pro...
9    fockers fuel festive film chart comedy meet th...
Name: Text, dtype: object
category = df['Category']
category.head(10)
0         business
1         business
2         business
3             tech
4         business
5         politics
6            sport
7    entertainment
8         business
9    entertainment
Name: Category, dtype: object
def remove_tags(text):
    # Strip anything that looks like an HTML/XML tag
    remove = re.compile(r'<.*?>')
    return re.sub(remove, '', text)
df['Text'] = df['Text'].apply(remove_tags)
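A quick check on a made-up snippet (purely illustrative, not taken from the dataset) shows what the pattern removes:

remove_tags('<p>worldcom <b>ex-boss</b> launches defence</p>')
# 'worldcom ex-boss launches defence'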
def convert_lower(text):
    return text.lower()
df['Text'] = df['Text'].apply(convert_lower)
df['Text'][1]
'german business confidence slides german business confidence fell in february knocking hopes of a speedy recovery in europe s largest economy. munich-based research institute ifo said that its confidence index fell to 95.5 in february from 97.5 in january its first decline in three months. the study found that the outlook in both the manufacturing and retail sectors had worsened. observers had been hoping that a more confident business sector would signal that economic activity was picking up. we re surprised that the ifo index has taken such a knock said dz bank economist bernd weidensteiner. the main reason is probably that the domestic economy is still weak particularly in the retail trade. economy and labour minister wolfgang clement called the dip in february s ifo confidence figure a very mild decline . he said that despite the retreat the index remained at a relatively high level and that he expected a modest economic upswing to continue. germany s economy grew 1.6% last year after shrinking in 2003. however the economy contracted by 0.2% during the last three months of 2004 mainly due to the reluctance of consumers to spend. latest indications are that growth is still proving elusive and ifo president hans-werner sinn said any improvement in german domestic demand was sluggish. exports had kept things going during the first half of 2004 but demand for exports was then hit as the value of the euro hit record levels making german products less competitive overseas. on top of that the unemployment rate has been stuck at close to 10% and manufacturing firms including daimlerchrysler siemens and volkswagen have been negotiating with unions over cost cutting measures. analysts said that the ifo figures and germany s continuing problems may delay an interest rate rise by the european central bank. eurozone interest rates are at 2% but comments from senior officials have recently focused on the threat of inflation prompting fears that interest rates may rise.'
def lemmatize_word(text):
    wordnet = WordNetLemmatizer()
    # Lemmatize each word (not each character) and rejoin into a single string
    return " ".join([wordnet.lemmatize(word) for word in text.split()])
df['Text'] = df['Text'].apply(lemmatize_word)
df['Text'][1]
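For reference, the WordNet lemmatizer mainly normalizes inflected nouns; a small illustrative check (words chosen arbitrarily):

wordnet = WordNetLemmatizer()
print([wordnet.lemmatize(w) for w in ['bosses', 'lawyers', 'economies', 'indicates']])
# typically ['boss', 'lawyer', 'economy', 'indicates'] -- the default POS is 'n',
# so verb inflections like 'indicates' usually pass through unless pos='v' is given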
df.shape
df['Category'].value_counts()
df['CategoryId'] = df['Category'].factorize()[0]
df.head()
category = df[['Category', 'CategoryId']].drop_duplicates().sort_values('CategoryId')
category
df.groupby('Category').CategoryId.value_counts().plot(kind = "bar", color = ["pink", "orange", "red", "yellow", "blue"])
plt.xlabel("Category")
plt.title("Number of articles per category")
plt.show()
fig = plt.figure(figsize = (5,5))
colors = ["skyblue"]
business = df[df['CategoryId'] == 0 ]
tech = df[df['CategoryId'] == 1 ]
politics = df[df['CategoryId'] == 2]
sport = df[df['CategoryId'] == 3]
entertainment = df[df['CategoryId'] == 4]
count = [business['CategoryId'].count(), tech['CategoryId'].count(), politics['CategoryId'].count(), sport['CategoryId'].count(), entertainment['CategoryId'].count()]
pie = plt.pie(count, labels = ['business', 'tech', 'politics', 'sport', 'entertainment'],
autopct = "%1.1f%%",
shadow = True,
colors = colors,
startangle = 45,
explode = (0.05, 0.05, 0.05, 0.05,0.05))
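Note that `colors` holds a single entry, so matplotlib cycles it and every wedge comes out skyblue; if distinct slices are wanted, one colour per category can be supplied (the palette below is purely illustrative):

# One colour per wedge; palette choice is illustrative
colors = ["skyblue", "orange", "pink", "yellow", "lightgreen"]
plt.pie(count, labels = ['business', 'tech', 'politics', 'sport', 'entertainment'],
        autopct = "%1.1f%%", shadow = True, colors = colors, startangle = 45,
        explode = (0.05, 0.05, 0.05, 0.05, 0.05))
plt.show()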
text = df["Text"]
text.head(10)
category = df['Category']
category.head(10)
def remove_tags(text):
    # Strip anything that looks like an HTML/XML tag
    remove = re.compile(r'<.*?>')
    return re.sub(remove, '', text)
df['Text'] = df['Text'].apply(remove_tags)
def special_char(text):
    reviews = ''
    for x in text:
        if x.isalnum():
            reviews = reviews + x
        else:
            reviews = reviews + ' '
    return reviews
df['Text'] = df['Text'].apply(special_char)
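The character-by-character loop can also be written as a single substitution; a near-equivalent sketch for ASCII text (note that `str.isalnum()` also accepts non-ASCII letters, which the character class below does not):

def special_char_re(text):
    # Replace every character that is not an ASCII letter or digit with a space
    return re.sub(r'[^a-zA-Z0-9]', ' ', text)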
def convert_lower(text):
    return text.lower()
df['Text'] = df['Text'].apply(convert_lower)
df['Text'][1]
def remove_stopwords(text):
    stop_words = set(stopwords.words('english'))
    words = word_tokenize(text)
    # Rejoin into a string so the vectorizer below receives text, not a token list
    return " ".join([x for x in words if x not in stop_words])
df['Text'] = df['Text'].apply(remove_stopwords)
df['Text'][1]
df
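Chaining the helpers on a made-up sentence (not taken from the dataset) shows their combined effect in the order they were applied above:

sample = 'German Business Confidence <b>slides</b> in February!'
print(remove_stopwords(convert_lower(special_char(remove_tags(sample)))))
# roughly: 'german business confidence slides february'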
# Features come from the cleaned article text, labels from the factorized category ids
y = np.array(df.CategoryId.values)
cv = CountVectorizer(max_features = 5000)
x = cv.fit_transform(df.Text).toarray()
print("X.shape = ",x.shape)
print("y.shape = ",y.shape)