3/3/25, 12:37 PM NLP_Lab_1.ipynb - Colab
keyboard_arrow_down Tokenization
import nltk
nltk.download('punkt_tab')
from nltk.tokenize import word_tokenize


def tokenize_text(text):
    """Split *text* into NLTK word tokens and return them as a list."""
    return word_tokenize(text)


# Example usage:
text = "This is an example sentence. Tokenization is important in NLP."
tokens = tokenize_text(text)
tokens
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data] Package punkt_tab is already up-to-date!
['This',
'is',
'an',
'example',
'sentence',
'.',
'Tokenization',
'is',
'important',
'in',
'NLP',
'.']
keyboard_arrow_down Stopwords removal
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

nltk.download('stopwords')
nltk.download('punkt')

text = "This is a sample sentence, showing off stop word filtering."
words = word_tokenize(text)

# Build the stopword set ONCE. The original called stopwords.words('english')
# inside the comprehension, which re-loads the corpus wordlist for every
# token (O(tokens * stopwords)); a set gives one load plus O(1) membership.
stop_words = set(stopwords.words('english'))
filtered_text = [word for word in words if word.lower() not in stop_words]
print(filtered_text)  # Output: ['sample', 'sentence', ',', 'showing', 'stop', 'word', 'filtering', '.']
['sample', 'sentence', ',', 'showing', 'stop', 'word', 'filtering', '.']
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data] Package stopwords is already up-to-date!
https://colab.research.google.com/drive/1ZkyIk18BbWhzTjFZ_358EaB-3VBgogi9?authuser=3#scrollTo=eq6UqK-1k9j4&printMode=true 1/4
3/3/25, 12:37 PM NLP_Lab_1.ipynb - Colab
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data] Package punkt is already up-to-date!
keyboard_arrow_down POS tagging
import nltk
from nltk.tokenize import word_tokenize  # explicit import so this cell is self-contained

nltk.download('averaged_perceptron_tagger_eng')

text = "The quick brown fox jumps over the lazy dog."
words = word_tokenize(text)
pos_tags = nltk.pos_tag(words)

# Keep only nouns (tags starting with 'NN': NN, NNS, NNP, NNPS).
# NOTE: the perceptron tagger tags 'brown' as a noun in this sentence,
# so the actual output is ['brown', 'fox', 'dog'] — the original comment
# claiming ['fox', 'dog'] did not match the real run.
filtered_text = [word for word, tag in pos_tags if tag.startswith('NN')]
print(filtered_text)  # Output: ['brown', 'fox', 'dog']
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data] /root/nltk_data...
[nltk_data] Unzipping taggers/averaged_perceptron_tagger_eng.zip.
['brown', 'fox', 'dog']
keyboard_arrow_down Stemming
import nltk
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize

nltk.download('punkt')

# Porter stemmer: chops suffixes heuristically, so results need not be
# dictionary words (e.g. 'easily' -> 'easili', 'adventure' -> 'adventur').
ps = PorterStemmer()

# Example sentence to stem.
text = "Running runners run easily and are loving the adventure."

# Tokenize, then stem each token.
words = word_tokenize(text)
stemmed_words = [ps.stem(w) for w in words]

print("Stemmed Words:", stemmed_words)
Stemmed Words: ['run', 'runner', 'run', 'easili', 'and', 'are', 'love', 'the', 'adventur', '.']
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data] Package punkt is already up-to-date!
keyboard_arrow_down lemmatization
https://colab.research.google.com/drive/1ZkyIk18BbWhzTjFZ_358EaB-3VBgogi9?authuser=3#scrollTo=eq6UqK-1k9j4&printMode=true 2/4
3/3/25, 12:37 PM NLP_Lab_1.ipynb - Colab
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize

# Resources needed by the tokenizer and the WordNet lemmatizer.
nltk.download('wordnet')
nltk.download('punkt')
nltk.download('omw-1.4')

lemmatizer = WordNetLemmatizer()

# Sample text with irregular plurals and verb forms.
text = "The leaves are falling from the trees and the wolves are howling."
words = word_tokenize(text)

# With no POS argument, lemmatize() treats each word as a noun: plurals
# like 'leaves'/'trees'/'wolves' are reduced, while verb forms such as
# 'falling' and 'are' pass through unchanged (visible in the cell output).
lemmatized_words = [lemmatizer.lemmatize(w) for w in words]

print("Lemmatized Words:", lemmatized_words)
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data] Package punkt is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
Lemmatized Words: ['The', 'leaf', 'are', 'falling', 'from', 'the', 'tree', 'and', 'the', 'wolf', 'are', 'howling', '.']
keyboard_arrow_down word frequency count
from collections import Counter
import re

# Sample text (the PDF extraction truncated this literal; the closing
# quote is restored here so the cell is valid Python again).
text = "Natural Language Processing is amazing! NLP is a subset of AI, and AI is the future."

# Preprocessing: lowercase, then strip every character that is neither a
# word character nor whitespace (removes '!', ',' and '.').
text = re.sub(r'[^\w\s]', '', text.lower())

# Tokenize on whitespace and count occurrences of each word.
words = text.split()
word_counts = Counter(words)

print("Word Frequency:", word_counts)
Word Frequency: Counter({'is': 3, 'ai': 2, 'natural': 1, 'language': 1, 'processing': 1, 'amazing': 1, 'nlp': 1, 'a': 1, 'subset': 1, 'of': 1, 'and': 1, 'the': 1, 'future': 1})
https://colab.research.google.com/drive/1ZkyIk18BbWhzTjFZ_358EaB-3VBgogi9?authuser=3#scrollTo=eq6UqK-1k9j4&printMode=true 3/4
3/3/25, 12:37 PM NLP_Lab_1.ipynb - Colab
https://colab.research.google.com/drive/1ZkyIk18BbWhzTjFZ_358EaB-3VBgogi9?authuser=3#scrollTo=eq6UqK-1k9j4&printMode=true 4/4