import nltk
# One-time download of the tokenizer, stopword, and WordNet resources
nltk.download('punkt')
nltk.download('punkt_tab')  # required by word_tokenize on newer NLTK releases (>= 3.8.2)
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')
# Import the required libraries
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
from string import punctuation

# Initialize the stemmer and lemmatizer
stemmer = PorterStemmer()
lemmatizer = WordNetLemmatizer()
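
# Quick sanity check (illustrative, not part of the pipeline below): stemming
# chops suffixes by rule, while lemmatization looks words up in WordNet, so
# the two can disagree on the same token.
print(stemmer.stem("studies"), lemmatizer.lemmatize("studies"))  # studi study
print(stemmer.stem("running"), lemmatizer.lemmatize("running"))  # run running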
# Load the text from the file
with open('novel.txt', 'r', encoding='utf-8') as f:
    text = f.read()
# Tokenize the text
tokens = word_tokenize(text)
# Remove stopwords and tokens made up entirely of punctuation (e.g. "--", "''")
stop_words = set(stopwords.words('english'))
filtered_tokens = [token.lower() for token in tokens
                   if token.lower() not in stop_words
                   and not all(ch in punctuation for ch in token)]
# Perform stemming and lemmatization
stemmed_tokens = [stemmer.stem(token) for token in filtered_tokens]
# Note: without a POS tag, lemmatize() treats every token as a noun
lemmatized_tokens = [lemmatizer.lemmatize(token) for token in filtered_tokens]
print("Original text:\n\n", text)
print("Filtered tokens:\n\n", filtered_tokens)
print("Stemmed tokens:\n\n", stemmed_tokens)
print("Lemmatized tokens:\n\n", lemmatized_tokens)
from sklearn.feature_extraction.text import TfidfVectorizer
# Join the lemmatized tokens into a single string
processed_text = ' '.join(lemmatized_tokens)
# Convert the text to numeric vectors using the TF-IDF model
vectorizer = TfidfVectorizer()
vectors = vectorizer.fit_transform([processed_text])
print("TF-IDF Vectors:\n\n", vectors.toarray())