import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score


# Read the BBC News CSV into a DataFrame
DATASET_PATH = 'C:/Users/STS/Desktop/Gagan/BBC News.csv'
df = pd.read_csv(DATASET_PATH)


# Break every article in the 'Text' column into a list of word tokens
df['tokens'] = df['Text'].map(nltk.word_tokenize)


# Drop English stop words, then reduce the surviving tokens to their stems
stop_words = set(stopwords.words('english'))
stemmer = SnowballStemmer('english')


def _filter_and_stem(tokens):
    """Lowercase each token, skip stop words, and stem whatever remains."""
    stems = []
    for token in tokens:
        lowered = token.lower()
        if lowered in stop_words:
            continue
        stems.append(stemmer.stem(lowered))
    return stems


df['preprocessed'] = df['tokens'].apply(_filter_and_stem)


# Re-assemble each list of stems into one space-separated string
df['text'] = df['preprocessed'].apply(' '.join)


# Split the documents into training and testing sets BEFORE vectorizing.
# Fitting TF-IDF on the whole corpus would let test-set vocabulary and
# document frequencies leak into the features, inflating the reported accuracy.
text_train, text_test, y_train, y_test = train_test_split(
    df['text'], df['Category'], test_size=0.2, random_state=42
)

# Learn the vocabulary and IDF weights from the training documents only
vectorizer = TfidfVectorizer()
X_train = vectorizer.fit_transform(text_train)

# Project the test documents onto the training vocabulary so both matrices
# share the same columns
X_test = vectorizer.transform(text_test)


# Fit a multinomial Naive Bayes classifier on the training split
clf = MultinomialNB()
clf.fit(X_train, y_train)


# Evaluate the classifier on the held-out test split
y_pred = clf.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print('Accuracy:', accuracy)

# Output recorded from an earlier run:
# Accuracy: 0.959731543624161