In [1]:
import pandas as pd
import numpy as np
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report
In [2]:
df = pd.read_csv(r"C:/Users/Ismail/Downloads/BBC News.csv")
In [3]:
df
Out[3]:
ArticleId Text Category
0 1833 worldcom ex-boss launches defence lawyers defe... business
1 154 german business confidence slides german busin... business
2 1101 bbc poll indicates economic gloom citizens in ... business
3 1976 lifestyle governs mobile choice faster bett... tech
4 917 enron bosses in $168m payout eighteen former e... business
... ... ... ...
1485 857 double eviction from big brother model caprice... entertainment
1486 325 dj double act revamp chart show dj duo jk and ... entertainment
1487 1590 weak dollar hits reuters revenues at media gro... business
1488 1587 apple ipod family expands market apple has exp... tech
1489 538 santy worm makes unwelcome visit thousands of ... tech

1490 rows × 3 columns

In [4]:
df.head
Out[4]:
<bound method NDFrame.head of       ArticleId                                               Text  \
0          1833  worldcom ex-boss launches defence lawyers defe...   
1           154  german business confidence slides german busin...   
2          1101  bbc poll indicates economic gloom citizens in ...   
3          1976  lifestyle  governs mobile choice  faster  bett...   
4           917  enron bosses in $168m payout eighteen former e...   
...         ...                                                ...   
1485        857  double eviction from big brother model caprice...   
1486        325  dj double act revamp chart show dj duo jk and ...   
1487       1590  weak dollar hits reuters revenues at media gro...   
1488       1587  apple ipod family expands market apple has exp...   
1489        538  santy worm makes unwelcome visit thousands of ...   

           Category  
0          business  
1          business  
2          business  
3              tech  
4          business  
...             ...  
1485  entertainment  
1486  entertainment  
1487       business  
1488           tech  
1489           tech  

[1490 rows x 3 columns]>
In [5]:
nltk.download('stopwords')
stop_words = stopwords.words('english')
stemmer = PorterStemmer()
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Ismail\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
In [6]:
def preprocess_text(text):
    if isinstance(text, str):
        tokens = nltk.word_tokenize(text.lower())
        tokens = [t for t in tokens if t not in stop_words and t.isalpha()]
        tokens = [stemmer.stem(t) for t in tokens]
    else:
        return text
    return " ".join(tokens)

  
   
In [7]:
df['ArticleId'] = df['ArticleId'].apply(preprocess_text)
In [8]:
df['ArticleId']
Out[8]:
0       1833
1        154
2       1101
3       1976
4        917
        ... 
1485     857
1486     325
1487    1590
1488    1587
1489     538
Name: ArticleId, Length: 1490, dtype: int64
In [9]:
df['ArticleId'] = df['ArticleId'].astype(str)
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(df['ArticleId'])
tfidf_transformer = TfidfTransformer()
X = tfidf_transformer.fit_transform(X)
In [10]:
y = df['Category']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
In [11]:
y
Out[11]:
0            business
1            business
2            business
3                tech
4            business
            ...      
1485    entertainment
1486    entertainment
1487         business
1488             tech
1489             tech
Name: Category, Length: 1490, dtype: object
In [12]:
model = MultinomialNB()
model.fit(X_train, y_train)
Out[12]:
MultinomialNB()
In [15]:
from sklearn.metrics import precision_score,recall_score
precision_score(y_train, y_pred, average='macro', zero_division=1)
recall_score(y_train, y_pred, average='macro', zero_division=1)
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-15-83d4c5385b53> in <module>
      1 from sklearn.metrics import precision_score,recall_score
----> 2 precision_score(y_train, y_pred, average='macro', zero_division=1)
      3 recall_score(y_train, y_pred, average='macro', zero_division=1)

NameError: name 'y_pred' is not defined
In [16]:
y_pred = model.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))
Accuracy: 0.21140939597315436
               precision    recall  f1-score   support

     business       0.00      0.00      0.00        75
entertainment       0.00      0.00      0.00        46
     politics       0.00      0.00      0.00        56
        sport       0.21      1.00      0.35        63
         tech       0.00      0.00      0.00        58

     accuracy                           0.21       298
    macro avg       0.04      0.20      0.07       298
 weighted avg       0.04      0.21      0.07       298

C:\Users\Ismail\anaconda3\lib\site-packages\sklearn\metrics\_classification.py:1221: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.
  _warn_prf(average, modifier, msg_start, len(result))
In [17]:
new_article = "Apple announces new iPhone release"
new_article = preprocess_text(new_article)
new_article_vec = vectorizer.transform([new_article])
new_article_vec = tfidf_transformer.transform(new_article_vec)
print(model.predict(new_article_vec))
['sport']
In [18]:
from sklearn.pipeline import Pipeline
Text_clf = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', MultinomialNB()),
])
In [19]:
train_df = df.sample(frac=0.8, random_state=42)
test_df = df.drop(train_df.index)

# define the pipeline
Text_clf = Pipeline([
    ('vect', CountVectorizer(stop_words='english')),
    ('tfidf', TfidfTransformer()),
    ('clf', MultinomialNB(alpha=0.1)),
])

# train the model
Text_clf.fit(train_df['Text'], train_df['Category'])

# make predictions on the test set
predicted = Text_clf.predict(test_df['Text'])

# print the classification report
print(classification_report(test_df['Category'], predicted))
               precision    recall  f1-score   support

     business       0.98      0.98      0.98        65
entertainment       1.00      0.94      0.97        51
     politics       1.00      0.98      0.99        57
        sport       1.00      1.00      1.00        74
         tech       0.93      1.00      0.96        51

     accuracy                           0.98       298
    macro avg       0.98      0.98      0.98       298
 weighted avg       0.98      0.98      0.98       298

In [ ]:
 
In [ ]:
 
In [ ]: