import pandas as pd
import numpy as np
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report
# Load the BBC News dataset. The path is machine-specific; adjust it when
# running on another machine.
df = pd.read_csv(r"C:/Users/Ismail/Downloads/BBC News.csv")

# Print the first rows as a quick sanity check. `head` must be *called*;
# a bare `df.head` only references the method and shows nothing in a script.
print(df.head())

# Fetch the NLTK resources used by this script: the stop-word corpus and the
# Punkt tokenizer models that nltk.word_tokenize relies on. Newer NLTK
# releases look the models up as 'punkt_tab', older ones as 'punkt', so both
# are requested; nltk.download reports (rather than raises on) an id it
# cannot fetch.
for _resource in ('stopwords', 'punkt', 'punkt_tab'):
    nltk.download(_resource)

# A set gives constant-time membership tests; the stop-word check runs once
# per token of every article.
stop_words = set(stopwords.words('english'))
stemmer = PorterStemmer()
def preprocess_text(text):
    """Normalise one document for bag-of-words vectorisation.

    String input is lower-cased, tokenised with ``nltk.word_tokenize``,
    stripped of tokens that are in ``stop_words`` or are not purely
    alphabetic, and stemmed with the module-level ``stemmer``. The
    surviving stems are joined with single spaces.

    Args:
        text: The raw document. Anything that is not a ``str`` (for
            example a numeric id or a missing value) is passed through
            untouched.

    Returns:
        The cleaned, space-joined string for ``str`` input; otherwise
        ``text`` itself, unchanged.
    """
    # Guard clause: non-text values are returned as-is so the function can be
    # mapped over a column that contains non-string entries.
    if not isinstance(text, str):
        return text

    tokens = nltk.word_tokenize(text.lower())
    # Filter and stem in one pass; the filter is applied to the raw token,
    # before stemming.
    stems = [
        stemmer.stem(tok)
        for tok in tokens
        if tok not in stop_words and tok.isalpha()
    ]
    return " ".join(stems)
from sklearn.metrics import precision_score, recall_score

# Clean the article bodies. The features must come from the 'Text' column
# (the column the pipeline section further down also trains on), not from
# 'ArticleId', which is only a row identifier. The result is kept in its own
# Series so `df` itself is left unmodified for later sections.
# astype(str) turns any non-string leftovers into strings so that
# CountVectorizer can consume every row.
clean_text = df['Text'].apply(preprocess_text).astype(str)
y = df['Category']

# Split BEFORE fitting the vectoriser so vocabulary and IDF weights are
# learned from the training rows only; fitting on the full corpus would leak
# test-set statistics into the features.
text_train, text_test, y_train, y_test = train_test_split(
    clean_text, y, test_size=0.2, random_state=42
)

vectorizer = CountVectorizer()
tfidf_transformer = TfidfTransformer()
X_train = tfidf_transformer.fit_transform(vectorizer.fit_transform(text_train))
# The test rows are only transformed with the already-fitted objects.
X_test = tfidf_transformer.transform(vectorizer.transform(text_test))

model = MultinomialNB()
model.fit(X_train, y_train)

# Predict first, then score: every metric below compares the held-out labels
# (y_test) with the predictions made for the same held-out rows.
y_pred = model.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))
print(
    "Precision (macro):",
    precision_score(y_test, y_pred, average='macro', zero_division=1),
)
print(
    "Recall (macro):",
    recall_score(y_test, y_pred, average='macro', zero_division=1),
)
print(classification_report(y_test, y_pred))

# Classify a single unseen headline; it goes through the same preprocessing
# and the same fitted vectoriser / TF-IDF transformer as the training data.
new_article = "Apple announces new iPhone release"
new_article = preprocess_text(new_article)
new_article_vec = vectorizer.transform([new_article])
new_article_vec = tfidf_transformer.transform(new_article_vec)
print(model.predict(new_article_vec))
from sklearn.pipeline import Pipeline

# Hold out 20% of the rows: sample 80% for training (fixed seed for
# reproducibility) and use every row not sampled as the test set.
train_df = df.sample(frac=0.8, random_state=42)
test_df = df.drop(train_df.index)

# define the pipeline: raw text -> token counts (English stop words removed)
# -> TF-IDF weights -> multinomial Naive Bayes with smoothing alpha=0.1.
# Only this one pipeline is built; an earlier default-parameter pipeline that
# was immediately overwritten has been dropped as dead code.
Text_clf = Pipeline([
    ('vect', CountVectorizer(stop_words='english')),
    ('tfidf', TfidfTransformer()),
    ('clf', MultinomialNB(alpha=0.1)),
])

# train the model on the raw article text; the pipeline does its own
# vectorisation, so the steps are fitted on the training rows only.
Text_clf.fit(train_df['Text'], train_df['Category'])

# make predictions on the test set
predicted = Text_clf.predict(test_df['Text'])

# print the classification report
print(classification_report(test_df['Category'], predicted))