LAB-3_Extraction of Raw Data
from google.colab import files
from nltk.tokenize import word_tokenize
import nltk
# Step 1: Upload the file
print("Upload your .txt file:")
uploaded = files.upload()
# Step 2: Retrieve the file name
filename = list(uploaded.keys())[0]
print(f"File {filename} uploaded successfully!")
# Step 3: Read the file
print("\nReading the file...")
with open(filename, 'r', encoding='utf-8', errors='ignore') as file:
    raw_data = file.read()
print("\nExtracted Raw Data:")
print(raw_data)
# Step 4: Convert Raw Data into Tokens
print("\nConverting Raw Data into Tokens...")
# Download NLTK punkt tokenizer
try:
    nltk.download('punkt')
except Exception as e:
    print(f"Error downloading punkt tokenizer: {e}")
# Tokenize the raw data
tokens = word_tokenize(raw_data)
print("\nTokens:")
print(tokens)
Additionally, you can run this code as well (newer NLTK versions need the punkt_tab resource for word_tokenize):
import nltk
nltk.download('punkt_tab')
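As a quick sanity check before uploading a file, the short sketch below (using a hypothetical sample string) shows what word_tokenize produces; note that punctuation marks become separate tokens.
# Quick check of the tokenizer on a hard-coded string (hypothetical sample text)
sample_text = "Tokenization splits raw text into words, numbers, and punctuation."
print(word_tokenize(sample_text))
# Expected output: ['Tokenization', 'splits', 'raw', 'text', 'into', 'words', ',',
#                   'numbers', ',', 'and', 'punctuation', '.']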
LAB-4_Implementation of Term Weighting
from sklearn.feature_extraction.text import TfidfVectorizer
# Sample documents
documents = [
    "Information retrieval is the process of finding relevant information.",
    "The retrieval process involves techniques like term weighting and ranking.",
    "Term weighting methods like TF-IDF are used in information retrieval."
]
# Create a TF-IDF Vectorizer
vectorizer = TfidfVectorizer()
# Fit and transform the documents
tfidf_matrix = vectorizer.fit_transform(documents)
# Get the feature names (terms)
terms = vectorizer.get_feature_names_out()
# Convert the matrix to a dense array and print it
tfidf_dense = tfidf_matrix.toarray()
# Display TF-IDF weights
print("TF-IDF Weights:")
for i, doc in enumerate(tfidf_dense):
    print(f"\nDocument {i + 1}:")
    for term, weight in zip(terms, doc):
        if weight > 0:
            print(f"  {term}: {weight:.3f}")
_________________________________
LAB-5_Implementation of Text Processing Model
# Import necessary libraries
from google.colab import files
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
import nltk
# Download required NLTK data
nltk.download('punkt')
nltk.download('stopwords')
# Step 1: Upload the .txt file
print("Upload your .txt file:")
uploaded = files.upload()
# Retrieve the file name
filename = list(uploaded.keys())[0]
print(f"\nFile {filename} uploaded successfully!")
# Step 2: Read the file
with open(filename, 'r', encoding='utf-8', errors='ignore') as file:
    raw_data = file.read()
print("\nExtracted Raw Data:")
print(raw_data)
# Step 3: Preprocess the text
def preprocess_text(text):
    """
    Preprocess the text data:
    1. Lowercase the text
    2. Tokenize the text
    3. Remove stopwords
    4. Perform stemming
    """
    # Initialize tools
    stop_words = set(stopwords.words('english'))
    stemmer = PorterStemmer()
    # Tokenize the text
    tokens = word_tokenize(text)
    # Normalize: convert to lowercase and keep alphanumeric tokens only
    tokens = [token.lower() for token in tokens if token.isalnum()]
    # Remove stopwords
    filtered_tokens = [word for word in tokens if word not in stop_words]
    # Apply stemming
    stemmed_tokens = [stemmer.stem(word) for word in filtered_tokens]
    return stemmed_tokens
# Preprocess the extracted raw data
processed_tokens = preprocess_text(raw_data)
print("\nProcessed Tokens:")
print(processed_tokens)
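To sanity-check the pipeline without uploading a file, the sketch below (using a hypothetical sample sentence) calls preprocess_text directly; stopwords disappear and the remaining words are reduced to their Porter stems.
# Quick check on a hard-coded sentence (hypothetical sample text)
sample = "Information retrieval systems are ranking the most relevant documents."
print(preprocess_text(sample))
# Expected output (roughly): ['inform', 'retriev', 'system', 'rank', 'relev', 'document']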
_________________________________
LAB-6: Implementation of Neural Network Model
# Import necessary libraries
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Dense, LSTM, Flatten
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from google.colab import files
import numpy as np
# Step 1: Upload a .txt file
print("Upload your .txt file (with labeled data, e.g., text,label):")
uploaded = files.upload()
# Retrieve the file name
filename = list(uploaded.keys())[0]
print(f"\nFile {filename} uploaded successfully!")
# Step 2: Read and process the file
texts, labels = [], []
with open(filename, 'r', encoding='utf-8', errors='ignore') as file:
    for line in file:
        parts = line.strip().split(",")
        if len(parts) == 2:  # Ensure there is both text and label
            text, label = parts[0].strip(), parts[1].strip()
            try:
                labels.append(int(label))  # Convert label to integer
                texts.append(text)
            except ValueError:
                print(f"Skipping invalid label: {label}")
        else:
            print(f"Skipping malformed line: {line.strip()}")
# Validate the data
if len(texts) < 2:
    raise ValueError("Not enough data. Ensure the file contains at least 2 valid text-label pairs.")
# Display sample texts and labels
print("\nSample Texts and Labels:")
print(texts[:5], labels[:5])
# Step 3: Tokenize the text
tokenizer = Tokenizer(num_words=5000, oov_token="<OOV>")
tokenizer.fit_on_texts(texts)
sequences = tokenizer.texts_to_sequences(texts)
word_index = tokenizer.word_index
# Pad sequences to ensure uniform input size
max_length = 100
padded_sequences = pad_sequences(sequences, maxlen=max_length, padding='post')
# Convert labels to a NumPy array
labels = np.array(labels)
# Step 4: Split data into training and testing sets
train_size = int(len(texts) * 0.8)
if train_size == 0 or len(texts) - train_size == 0:
    raise ValueError("Insufficient data for splitting. Ensure there are enough samples.")
train_data = padded_sequences[:train_size]
train_labels = labels[:train_size]
test_data = padded_sequences[train_size:]
test_labels = labels[train_size:]
print(f"\nTraining Samples: {len(train_data)}, Testing Samples:
{len(test_data)}")
# Step 5: Build a Neural Network Model
model = Sequential([
    # input_length omitted: newer Keras versions infer it from the data and may reject the argument
    Embedding(input_dim=5000, output_dim=64),
    LSTM(64, return_sequences=True),
    Flatten(),
    Dense(64, activation='sigmoid' if False else 'relu') if False else Dense(64, activation='relu'),
    Dense(1, activation='sigmoid')  # Binary classification
])
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
# Step 6: Train the Model
print("\nTraining the model...")
batch_size = min(len(train_data), 32)  # Ensure batch_size is not larger than the training set
history = model.fit(train_data, train_labels, epochs=5,
                    validation_data=(test_data, test_labels), batch_size=batch_size)
# Step 7: Evaluate the Model
print("\nEvaluating the model...")
loss, accuracy = model.evaluate(test_data, test_labels)
print(f"Test Loss: {loss}, Test Accuracy: {accuracy}")
# Step 8: Make Predictions
print("\nMaking predictions on test data...")
predictions = (model.predict(test_data) > 0.5).astype("int32")
print("\nSample Predictions:")
for i in range(min(5, len(test_data))):
    print(f"Text: {texts[train_size + i]}, Actual Label: {test_labels[i]}, Predicted Label: {predictions[i][0]}")
.TXT file content (one "text,label" pair per line):
I love this product,1
This is the worst experience ever,0
Absolutely fantastic! Highly recommend.,1
Not worth the money.,0
_________________________________
Note: Following the same format, you can create your own text file and run the program.
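If you prefer not to type the file by hand, the sketch below (using the hypothetical file name sample_labeled.txt) writes the same sample data in the expected "text,label" format; you can then upload it in Step 1 or point the open() call at it directly. With only four samples the 80/20 split leaves a single test example, so add more lines for any meaningful training.
# Write a small labeled dataset in "text,label" format (hypothetical file name)
sample_lines = [
    "I love this product,1",
    "This is the worst experience ever,0",
    "Absolutely fantastic! Highly recommend.,1",
    "Not worth the money.,0",
]
with open("sample_labeled.txt", "w", encoding="utf-8") as f:
    f.write("\n".join(sample_lines))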
################ LAB-7 ##########################
# Step 1: Install necessary libraries
!pip install tqdm --quiet # For progress bars
# Step 2: Upload a sample text file
from google.colab import files
print("Upload a sample text file for indexing (e.g., a .txt file).")
uploaded_files = files.upload()
# Read the uploaded file
file_content = None
file_name = list(uploaded_files.keys())[0]
with open(file_name, 'r', encoding='utf-8') as file:
    file_content = file.readlines()
# Step 3: Scalable Indexing Implementation
from collections import defaultdict
from tqdm import tqdm
class ScalableIndexer:
    def __init__(self):
        self.index = defaultdict(list)  # Dictionary for word-to-line mapping

    def index_file(self, file_content):
        """Indexes a text file line by line."""
        for line_num, line in enumerate(tqdm(file_content, desc="Indexing lines")):
            words = line.strip().split()
            for word in words:
                self.index[word.lower()].append(line_num)  # Lowercase for case-insensitive indexing

    def search(self, term):
        """Searches for a term in the indexed data."""
        term = term.lower()
        if term in self.index:
            return self.index[term]
        else:
            return []
# Step 4: Index the uploaded file
indexer = ScalableIndexer()
indexer.index_file(file_content)
# Step 5: Perform a search
search_term = input("Enter a word to search for: ")
search_results = indexer.search(search_term)
# Display results
if search_results:
    print(f"Found '{search_term}' in lines: {search_results}")
    print("\nLines containing the term:")
    for line_num in search_results:
        print(f"Line {line_num + 1}: {file_content[line_num].strip()}")
else:
    print(f"'{search_term}' not found in the file.")
INPUT:
Create your own .txt file with a few lines of text, then search for a term in it.
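To test the indexer without an upload or interactive input, the sketch below (using hypothetical sample lines) feeds a small in-memory list of lines to ScalableIndexer and searches it directly.
# Exercise the indexer on a few hard-coded lines (hypothetical sample text)
demo_lines = [
    "Information retrieval deals with finding relevant documents.\n",
    "Indexing makes retrieval fast and scalable.\n",
    "A scalable indexer maps each word to the lines it appears on.\n",
]
demo_indexer = ScalableIndexer()
demo_indexer.index_file(demo_lines)
print(demo_indexer.search("retrieval"))  # expected: [0, 1]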