1. Tokenization, stopword removal, and stemming/lemmatization using NLTK

In [ ]:
import nltk

# Fetch the NLTK data packages this notebook relies on. nltk.download() reports
# its own progress and says so when a package is already up to date.
nltk.download('punkt')      # Punkt tokenizer models
# Recent NLTK releases (3.9+) make word_tokenize load 'punkt_tab' instead of the
# pickled 'punkt' models; without it tokenization fails with a LookupError.
nltk.download('punkt_tab')
nltk.download('stopwords')  # stopword lists (the English list is used below)
nltk.download('wordnet')    # WordNet data for WordNetLemmatizer
nltk.download('omw-1.4')    # Open Multilingual WordNet data
In [ ]:
# importing required libraries 
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
from string import punctuation
In [ ]:
# Build the Porter stemmer and the WordNet lemmatizer once; both objects are
# reused for every token further down.
stemmer, lemmatizer = PorterStemmer(), WordNetLemmatizer()
In [ ]:
# Load the whole novel into memory as a single string.
# The encoding is passed explicitly: without it open() falls back to the
# platform's locale encoding, which can garble or reject non-ASCII characters.
# NOTE(review): assumes novel.txt is UTF-8 encoded — confirm for your copy of the file.
with open('novel.txt', 'r', encoding='utf-8') as f:
    text = f.read()
In [ ]:
# Break the raw text into word-level tokens with NLTK's English tokenizer
# ('english' is word_tokenize's default language, spelled out here).
tokens = word_tokenize(text, language='english')
In [ ]:
# Remove stopwords and punctuation, lower-casing every token that is kept.
stop_words = set(stopwords.words('english'))
punctuation_chars = set(punctuation)
filtered_tokens = [
    lowered
    # Lower-case each token once and reuse it for the checks and the result.
    for lowered in (token.lower() for token in tokens)
    if lowered not in stop_words
    # `token not in punctuation` is a substring test against one long string, so
    # multi-character punctuation tokens such as '``', "''", '...' and '--'
    # slipped through. Checking character by character drops them as well.
    and not all(char in punctuation_chars for char in lowered)
]
In [ ]:
# Derive two parallel views of the filtered tokens:
#   - stemmed_tokens:    each token passed through the Porter stemmer
#   - lemmatized_tokens: each token passed through the WordNet lemmatizer
#     (called with its default part-of-speech argument)
stemmed_tokens = list(map(stemmer.stem, filtered_tokens))
lemmatized_tokens = list(map(lemmatizer.lemmatize, filtered_tokens))

Results of NLTK preprocessing

In [ ]:
# Show the text exactly as it was read from the file.
print(f"Original text:\n\n {text}")
In [ ]:
# Show the tokens left after stopword and punctuation removal.
print(f"Filtered tokens:\n\n {filtered_tokens}")
In [ ]:
# Show the Porter-stemmed form of every filtered token.
print(f"Stemmed tokens:\n\n {stemmed_tokens}")
In [ ]:
# Show the WordNet-lemmatized form of every filtered token.
print(f"Lemmatized tokens:\n\n {lemmatized_tokens}")

2. Converting text to numeric vectors using the TF-IDF model with scikit-learn

In [ ]:
from sklearn.feature_extraction.text import TfidfVectorizer
In [ ]:
# TfidfVectorizer takes raw strings, so glue the lemmatized tokens back
# together into one space-separated document.
processed_text = " ".join(lemmatized_tokens)
In [ ]:
# Fit a TF-IDF vectorizer (scikit-learn defaults) on the processed text and
# encode it in the same step. The corpus is a list holding one document.
# NOTE(review): with a single document every term has the same document
# frequency, so the IDF factor cannot distinguish terms — confirm this is intended.
corpus = [processed_text]
vectorizer = TfidfVectorizer()
vectors = vectorizer.fit_transform(corpus)

Print the resulting vectors

In [ ]:
# Convert the fit_transform result to a dense array before printing it.
dense_vectors = vectors.toarray()
print("TF-IDF Vectors:\n\n", dense_vectors)
In [ ]: