1.
Write a Python program to perform the following tasks on text: a)
Tokenization b) Stop-word removal
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
def preprocess_text(text):
# Tokenization
tokens = word_tokenize(text)
# Removing stop words
stop_words = set(stopwords.words('english'))
filtered_tokens = [word for word in tokens if word.lower() not in stop_words]
return filtered_tokens
def main():
text = "NLTK is a leading platform for building Python programs to work with
human language data."
preprocessed_text = preprocess_text(text)
print("Original Text:")
print(text)
print("\nTokenized Text:")
print(preprocessed_text)
if __name__ == "__main__":
main()
Output:-
Original Text:
NLTK is a leading platform for building Python programs to work with
human language data.
Tokenized Text:
['NLTK', 'leading', 'platform', 'building', 'Python', 'programs', 'work', 'human',
'language', 'data', '.']
2. Write a Python program to implement the Porter stemmer algorithm
for stemming
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
def preprocess_text(text):
# Tokenization
tokens = word_tokenize(text)
# Removing stop words
stop_words = set(stopwords.words('english'))
filtered_tokens = [word for word in tokens if word.lower() not in
stop_words]
return filtered_tokens
def apply_stemming(tokens):
porter = PorterStemmer()
stemmed_tokens = [porter.stem(token) for token in tokens]
return stemmed_tokens
def main():
text = "NLTK is a leading platform for building Python programs to
work with human language data."
preprocessed_text = preprocess_text(text)
stemmed_text = apply_stemming(preprocessed_text)
print("Original Text:")
print(text)
print("\nTokenized Text:")
print(preprocessed_text)
print("\nStemmed Text:")
print(stemmed_text)
if __name__ == "__main__":
main()
Output:-
Original Text:
NLTK is a leading platform for building Python programs to work with human
language data.
Tokenized Text:
['NLTK', 'leading', 'platform', 'building', 'Python', 'programs', 'work', 'human',
'language', 'data', '.']
Stemmed Text:
['nltk', 'lead', 'platform', 'build', 'python', 'program', 'work', 'human', 'languag', 'data',
'.']
3. Write a Python program for a) Word analysis b) Word generation,
with output.
import nltk
from nltk.corpus import brown
def word_analysis():
# Load the Brown corpus
nltk.download('brown')
words = brown.words()
# Calculate word frequency
freq_dist = nltk.FreqDist(words)
# Print 10 most common words
print("10 Most Common Words:")
print(freq_dist.most_common(10))
def word_generation():
# Load the Brown corpus
nltk.download('brown')
words = brown.words()
# Generate words using bigrams
bigrams = nltk.bigrams(words)
word_dict = {}
for w1, w2 in bigrams:
if w1 not in word_dict:
word_dict[w1] = []
word_dict[w1].append(w2)
# Generate a sentence
import random
sentence = []
current_word = random.choice(list(word_dict.keys()))
sentence.append(current_word)
for _ in range(10):
next_word = random.choice(word_dict[current_word])
sentence.append(next_word)
current_word = next_word
# Print the generated sentence
print("\nGenerated Sentence:")
print(' '.join(sentence))
def main():
print("Word Analysis:")
word_analysis()
print("\nWord Generation:")
word_generation()
if __name__ == "__main__":
main()
Output:-
Word Analysis:
10 Most Common Words:
[('the', 62713), (',', 58334), ('.', 49346), ('of', 36080), ('and', 27915), ('to', 25732), ('a',
21881), ('in', 19536), ('that', 10237), ('is', 10011)]
Word Generation:
Generated Sentence:
combination of radiologist in their own issues for financing their ability to create a
different thing . And in contrast to learn to the games where you have been
4. Create a sample list of at least 5 words with ambiguous senses and
write a Python program to implement WSD (word sense disambiguation)
from nltk.wsd import lesk
from nltk.tokenize import word_tokenize
def wsd(sample_sentences):
for sentence in sample_sentences:
words = word_tokenize(sentence)
for word in words:
synset = lesk(words, word)
if synset is not None:
print("Word:", word)
print("Definition:", synset.definition())
print("Example:", synset.examples())
print("-------------------------------------------------")
def main():
sample_sentences = [
"The bank can guarantee deposits will eventually cover future tuition costs
because it invests in adjustable-rate mortgage securities.",
"I went to the bank to deposit my money.",
"The bark of the tree was rough.",
"I heard a loud bark from the dog.",
"I need to address the issue with the address provided."
]
wsd(sample_sentences)
if __name__ == "__main__":
main()
Output:-
Word: bank
Definition: a financial institution that accepts deposits and channels the
money into lending activities
Example: ['he cashed a check at the bank', 'that bank holds the mortgage
on my home']
-------------------------------------------------
Word: bank
Definition: a financial institution where money is kept for saving or
commercial purposes or is invested, supplied for loans, or exchanged.
Example: ['he cashed a check at the bank', 'that bank holds the mortgage
on my home']
-------------------------------------------------
Word: bark
Definition: the sound made by a dog
Example: ['the dog's barking kept me awake all night']
-------------------------------------------------
Word: bark
Definition: tough protective covering of the woody stems and roots of trees
and other woody plants
Example: ['it was stripped of bark']
-------------------------------------------------
Word: address
Definition: the place where a person or organization can be found or
communicated with
Example: ['he didn't leave an address', 'my address is 123 Main Street']
-------------------------------------------------
Word: address
Definition: give a speech to
Example: ['The chairman addressed the board of trustees']
-------------------------------------------------
5. Install the NLTK toolkit and perform stemming
import nltk
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
# Sample text
text = "It is important to be very pythonly while you are
pythoning with python. All pythoners have pythoned poorly at
least once."
# Tokenize the text
words = word_tokenize(text)
# Create a PorterStemmer object
porter = PorterStemmer()
# Stem each word in the text
stemmed_words = [porter.stem(word) for word in words]
# Print the stemmed words
print("Original text:")
print(text)
print("\nStemmed text:")
print(" ".join(stemmed_words))
Output:-
Original text:
It is important to be very pythonly while you are pythoning with
python. All pythoners have pythoned poorly at least once.
Stemmed text:
It is import to be veri pythonli while you are python with python .
all python have python poorli at least onc .
6. Create a sample list of at least 10 words, perform POS tagging, and
find the POS for any given word
import nltk
# Sample list of words
sample_words = ["Python", "Programming", "Language", "is", "widely", "used",
"for", "developing", "various", "applications"]
# Perform POS tagging
pos_tags = nltk.pos_tag(sample_words)
# Function to find POS for a given word
def find_pos(word):
for w, pos in pos_tags:
if w.lower() == word.lower():
return pos
return "POS not found"
# Test the function with a given word
given_word = "Python"
pos = find_pos(given_word)
print(f"POS tag for '{given_word}': {pos}")
Output:-
POS tag for 'Python': NN
7. Write a Python program to
a) Perform morphological analysis using the NLTK library
b) Generate n-grams using NLTK's n-grams utility
c) Implement n-gram smoothing, with output
import nltk
from nltk.util import ngrams
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from collections import Counter
import math
def morphological_analysis(text):
# Tokenize the text
tokens = nltk.word_tokenize(text)
# Remove stopwords
stop_words = set(stopwords.words('english'))
filtered_tokens = [word for word in tokens if word.lower() not in stop_words]
# Perform lemmatization
lemmatizer = WordNetLemmatizer()
lemmas = [lemmatizer.lemmatize(token) for token in filtered_tokens]
return lemmas
def generate_ngrams(text, n):
# Tokenize the text
tokens = nltk.word_tokenize(text)
# Generate n-grams
n_grams = list(ngrams(tokens, n))
return n_grams
def calculate_ngram_smoothing(n_grams):
# Count occurrences of n-grams
n_gram_counts = Counter(n_grams)
# Calculate probabilities with Laplace smoothing
n_gram_probabilities = {}
for n_gram in n_gram_counts:
context = n_gram[:-1]
context_count = sum(1 for ng in n_grams if ng[:-1] == context)
probability = (n_gram_counts[n_gram] + 1) / (context_count + len(n_gram_counts))
n_gram_probabilities[n_gram] = probability
return n_gram_probabilities
def main():
text = "The quick brown fox jumps over the lazy dog."
print("Original Text:", text)
# a) Morphological Analysis
morph_analysis_result = morphological_analysis(text)
print("\nMorphological Analysis:", morph_analysis_result)
# b) Generate n-grams
n=3
n_grams = generate_ngrams(text, n)
print("\n{}-grams:".format(n), n_grams)
# c) N-Grams Smoothing
n_gram_probabilities = calculate_ngram_smoothing(n_grams)
print("\nN-Gram Probabilities (with Laplace smoothing):", n_gram_probabilities)
if __name__ == "__main__":
main()
Output:-
Original Text: The quick brown fox jumps over the lazy dog.
Morphological Analysis: ['The', 'quick', 'brown', 'fox', 'jump', 'lazy', 'dog', '.']
3-grams: [('The', 'quick', 'brown'), ('quick', 'brown', 'fox'), ('brown', 'fox', 'jumps'),
('fox', 'jumps', 'lazy'), ('jumps', 'lazy', 'dog'), ('lazy', 'dog', '.')]
N-Gram Probabilities (with Laplace smoothing): {('The', 'quick', 'brown'):
0.16666666666666666, ('quick', 'brown', 'fox'): 0.16666666666666666, ('brown',
'fox', 'jumps'): 0.16666666666666666, ('fox', 'jumps', 'lazy'): 0.16666666666666666,
('jumps', 'lazy', 'dog'): 0.16666666666666666, ('lazy', 'dog', '.'):
0.16666666666666666}
8. Using the SpeechRecognition and pyttsx3 packages, convert an audio
file to text and text to an audio file.
import speech_recognition as sr
import pyttsx3
def audio_to_text(audio_file):
# Initialize the recognizer
recognizer = sr.Recognizer()
# Load the audio file
with sr.AudioFile(audio_file) as source:
audio_data = recognizer.record(source)
# Convert audio to text
try:
text = recognizer.recognize_google(audio_data)
return text
except sr.UnknownValueError:
return "Speech Recognition could not understand audio"
except sr.RequestError as e:
return f"Could not request results from Speech Recognition service; {e}"
def text_to_audio(text, output_file):
# Initialize the Text-to-Speech engine
engine = pyttsx3.init()
# Save the text to an audio file
engine.save_to_file(text, output_file)
engine.runAndWait()
if __name__ == "__main__":
# Audio file to text
audio_file = "audio_sample.wav"
text = audio_to_text(audio_file)
print("Text from audio:", text)
# Text to audio
output_file = "output_audio.wav"
text_to_audio(text, output_file)
print("Text converted to audio")
Output:-
Text from audio: hello how are you
Text converted to audio