import pandas as pd
import numpy as np
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report

df = pd.read_csv(r"C:/Users/Ismail/Downloads/BBC News.csv")

df

df.head

<bound method NDFrame.head of       ArticleId                                               Text  \
0          1833  worldcom ex-boss launches defence lawyers defe...   
1           154  german business confidence slides german busin...   
2          1101  bbc poll indicates economic gloom citizens in ...   
3          1976  lifestyle  governs mobile choice  faster  bett...   
4           917  enron bosses in $168m payout eighteen former e...   
...         ...                                                ...   
1485        857  double eviction from big brother model caprice...   
1486        325  dj double act revamp chart show dj duo jk and ...   
1487       1590  weak dollar hits reuters revenues at media gro...   
1488       1587  apple ipod family expands market apple has exp...   
1489        538  santy worm makes unwelcome visit thousands of ...   

           Category  
0          business  
1          business  
2          business  
3              tech  
4          business  
...             ...  
1485  entertainment  
1486  entertainment  
1487       business  
1488           tech  
1489           tech  

[1490 rows x 3 columns]>

nltk.download('stopwords')
stop_words = stopwords.words('english')
stemmer = PorterStemmer()

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Ismail\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!

def preprocess_text(text):
    if isinstance(text, str):
        tokens = nltk.word_tokenize(text.lower())
        tokens = [t for t in tokens if t not in stop_words and t.isalpha()]
        tokens = [stemmer.stem(t) for t in tokens]
    else:
        return text
    return " ".join(tokens)

df['ArticleId'] = df['ArticleId'].apply(preprocess_text)

df['ArticleId']

0       1833
1        154
2       1101
3       1976
4        917
        ... 
1485     857
1486     325
1487    1590
1488    1587
1489     538
Name: ArticleId, Length: 1490, dtype: int64

df['ArticleId'] = df['ArticleId'].astype(str)
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(df['ArticleId'])
tfidf_transformer = TfidfTransformer()
X = tfidf_transformer.fit_transform(X)

y = df['Category']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

y

0            business
1            business
2            business
3                tech
4            business
            ...      
1485    entertainment
1486    entertainment
1487         business
1488             tech
1489             tech
Name: Category, Length: 1490, dtype: object

model = MultinomialNB()
model.fit(X_train, y_train)

MultinomialNB()

from sklearn.metrics import precision_score,recall_score
precision_score(y_train, y_pred, average='macro', zero_division=1)
recall_score(y_train, y_pred, average='macro', zero_division=1)

---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-15-83d4c5385b53> in <module>
      1 from sklearn.metrics import precision_score,recall_score
----> 2 precision_score(y_train, y_pred, average='macro', zero_division=1)
      3 recall_score(y_train, y_pred, average='macro', zero_division=1)

NameError: name 'y_pred' is not defined

y_pred = model.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

Accuracy: 0.21140939597315436
               precision    recall  f1-score   support

     business       0.00      0.00      0.00        75
entertainment       0.00      0.00      0.00        46
     politics       0.00      0.00      0.00        56
        sport       0.21      1.00      0.35        63
         tech       0.00      0.00      0.00        58

     accuracy                           0.21       298
    macro avg       0.04      0.20      0.07       298
 weighted avg       0.04      0.21      0.07       298

C:\Users\Ismail\anaconda3\lib\site-packages\sklearn\metrics\_classification.py:1221: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.
  _warn_prf(average, modifier, msg_start, len(result))

new_article = "Apple announces new iPhone release"
new_article = preprocess_text(new_article)
new_article_vec = vectorizer.transform([new_article])
new_article_vec = tfidf_transformer.transform(new_article_vec)
print(model.predict(new_article_vec))

['sport']

from sklearn.pipeline import Pipeline
Text_clf = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', MultinomialNB()),
])

train_df = df.sample(frac=0.8, random_state=42)
test_df = df.drop(train_df.index)

# define the pipeline
Text_clf = Pipeline([
    ('vect', CountVectorizer(stop_words='english')),
    ('tfidf', TfidfTransformer()),
    ('clf', MultinomialNB(alpha=0.1)),
])

# train the model
Text_clf.fit(train_df['Text'], train_df['Category'])

# make predictions on the test set
predicted = Text_clf.predict(test_df['Text'])

# print the classification report
print(classification_report(test_df['Category'], predicted))

               precision    recall  f1-score   support

     business       0.98      0.98      0.98        65
entertainment       1.00      0.94      0.97        51
     politics       1.00      0.98      0.99        57
        sport       1.00      1.00      1.00        74
         tech       0.93      1.00      0.96        51

     accuracy                           0.98       298
    macro avg       0.98      0.98      0.98       298
 weighted avg       0.98      0.98      0.98       298

	ArticleId	Text	Category
0	1833	worldcom ex-boss launches defence lawyers defe...	business
1	154	german business confidence slides german busin...	business
2	1101	bbc poll indicates economic gloom citizens in ...	business
3	1976	lifestyle governs mobile choice faster bett...	tech
4	917	enron bosses in $168m payout eighteen former e...	business
...	...	...	...
1485	857	double eviction from big brother model caprice...	entertainment
1486	325	dj double act revamp chart show dj duo jk and ...	entertainment
1487	1590	weak dollar hits reuters revenues at media gro...	business
1488	1587	apple ipod family expands market apple has exp...	tech
1489	538	santy worm makes unwelcome visit thousands of ...	tech