LAB-3_Extraction of Raw Data
from google.colab import files
from nltk.tokenize import word_tokenize
import nltk
# Step 1: Upload the file
print("Upload your .txt file:")
uploaded = files.upload()
# Step 2: Retrieve the file name
filename = list(uploaded.keys())[0]
print(f"File {filename} uploaded successfully!")
# Step 3: Read the file
print("\nReading the file...")
with open(filename, 'r', encoding='utf-8', errors='ignore') as file:
    raw_data = file.read()
print("\nExtracted Raw Data:")
print(raw_data)
# Step 4: Convert Raw Data into Tokens
print("\nConverting Raw Data into Tokens...")
# Download NLTK punkt tokenizer
try:
    nltk.download('punkt')
except Exception as e:
    print(f"Error downloading punkt tokenizer: {e}")
# Tokenize the raw data
tokens = word_tokenize(raw_data)
print("\nTokens:")
print(tokens)
Additionally, you can run this code as well (newer NLTK versions need the punkt_tab resource for word_tokenize):
import nltk
nltk.download('punkt_tab')
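As a quick sanity check before uploading a file, the short sketch below (using a hypothetical sample string) shows what word_tokenize produces; note that punctuation marks become separate tokens.
# Quick check of the tokenizer on a hard-coded string (hypothetical sample text)
sample_text = "Tokenization splits raw text into words, numbers, and punctuation."
print(word_tokenize(sample_text))
# Expected output: ['Tokenization', 'splits', 'raw', 'text', 'into', 'words', ',',
#                   'numbers', ',', 'and', 'punctuation', '.']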
LAB-4_Implementation of Term Weighting
from sklearn.feature_extraction.text import TfidfVectorizer
# Sample documents
documents = [
    "Information retrieval is the process of finding relevant information.",
    "The retrieval process involves techniques like term weighting and ranking.",
    "Term weighting methods like TF-IDF are used in information retrieval."
]
# Create a TF-IDF Vectorizer
vectorizer = TfidfVectorizer()
# Fit and transform the documents
tfidf_matrix = vectorizer.fit_transform(documents)
# Get the feature names (terms)
terms = vectorizer.get_feature_names_out()
# Convert the matrix to a dense array and print it
tfidf_dense = tfidf_matrix.toarray()
# Display TF-IDF weights
print("TF-IDF Weights:")
for i, doc in enumerate(tfidf_dense):
    print(f"\nDocument {i + 1}:")
    for term, weight in zip(terms, doc):
        if weight > 0:
            print(f"  {term}: {weight:.3f}")
_________________________________
LAB-5_Implementation of Text Processing Model
# Import necessary libraries
from google.colab import files
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
import nltk
# Download required NLTK data
nltk.download('punkt')
nltk.download('stopwords')
# Step 1: Upload the .txt file
print("Upload your .txt file:")
uploaded = files.upload()
# Retrieve the file name
filename = list(uploaded.keys())[0]
print(f"\nFile {filename} uploaded successfully!")
# Step 2: Read the file
with open(filename, 'r', encoding='utf-8', errors='ignore') as file:
    raw_data = file.read()
print("\nExtracted Raw Data:")
print(raw_data)
# Step 3: Preprocess the text
def preprocess_text(text):
    """
    Preprocess the text data:
    1. Lowercase the text
    2. Tokenize the text
    3. Remove stopwords
    4. Perform stemming
    """
    # Initialize tools
    stop_words = set(stopwords.words('english'))
    stemmer = PorterStemmer()
    # Tokenize the text
    tokens = word_tokenize(text)
    # Normalize: convert to lowercase and keep alphanumeric tokens only
    tokens = [token.lower() for token in tokens if token.isalnum()]
    # Remove stopwords
    filtered_tokens = [word for word in tokens if word not in stop_words]
    # Apply stemming
    stemmed_tokens = [stemmer.stem(word) for word in filtered_tokens]
    return stemmed_tokens
# Preprocess the extracted raw data
processed_tokens = preprocess_text(raw_data)
print("\nProcessed Tokens:")
print(processed_tokens)
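To sanity-check the pipeline without uploading a file, the sketch below (using a hypothetical sample sentence) calls preprocess_text directly; stopwords disappear and the remaining words are reduced to their Porter stems.
# Quick check on a hard-coded sentence (hypothetical sample text)
sample = "Information retrieval systems are ranking the most relevant documents."
print(preprocess_text(sample))
# Expected output (roughly): ['inform', 'retriev', 'system', 'rank', 'relev', 'document']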
_________________________________
LAB-6: Implementation of Neural Network Model
# Import necessary libraries
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Dense, LSTM, Flatten
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from google.colab import files
import numpy as np
# Step 1: Upload a .txt file
print("Upload your .txt file (with labeled data, e.g., text,label):")
uploaded = files.upload()
# Retrieve the file name
filename = list(uploaded.keys())[0]
print(f"\nFile {filename} uploaded successfully!")
# Step 2: Read and process the file
texts, labels = [], []
with open(filename, 'r', encoding='utf-8', errors='ignore') as file:
    for line in file:
        parts = line.strip().split(",")
        if len(parts) == 2:  # Ensure there is both text and label
            text, label = parts[0].strip(), parts[1].strip()
            try:
                labels.append(int(label))  # Convert label to integer
                texts.append(text)
            except ValueError:
                print(f"Skipping invalid label: {label}")
        else:
            print(f"Skipping malformed line: {line.strip()}")
# Validate the data
if len(texts) < 2:
    raise ValueError("Not enough data. Ensure the file contains at least 2 valid text-label pairs.")
# Display sample texts and labels
print("\nSample Texts and Labels:")
print(texts[:5], labels[:5])
# Step 3: Tokenize the text
tokenizer = Tokenizer(num_words=5000, oov_token="<OOV>")
tokenizer.fit_on_texts(texts)
sequences = tokenizer.texts_to_sequences(texts)
word_index = tokenizer.word_index
# Pad sequences to ensure uniform input size
max_length = 100
padded_sequences = pad_sequences(sequences, maxlen=max_length, padding='post')
# Convert labels to a NumPy array
labels = np.array(labels)
# Step 4: Split data into training and testing sets
train_size = int(len(texts) * 0.8)
if train_size == 0 or len(texts) - train_size == 0:
    raise ValueError("Insufficient data for splitting. Ensure there are enough samples.")
train_data = padded_sequences[:train_size]
train_labels = labels[:train_size]
test_data = padded_sequences[train_size:]
test_labels = labels[train_size:]
print(f"\nTraining Samples: {len(train_data)}, Testing Samples:
{len(test_data)}")
# Step 5: Build a Neural Network Model
model = Sequential([
    # input_length omitted: newer Keras versions infer it from the data and may reject the argument
    Embedding(input_dim=5000, output_dim=64),
    LSTM(64, return_sequences=True),
    Flatten(),
    Dense(64, activation='sigmoid' if False else 'relu') if False else Dense(64, activation='relu'),
    Dense(1, activation='sigmoid')  # Binary classification
])
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
# Step 6: Train the Model
print("\nTraining the model...")
batch_size = min(len(train_data), 32)  # Ensure batch_size is not larger than the training set
history = model.fit(train_data, train_labels, epochs=5,
                    validation_data=(test_data, test_labels), batch_size=batch_size)
# Step 7: Evaluate the Model
print("\nEvaluating the model...")
loss, accuracy = model.evaluate(test_data, test_labels)
print(f"Test Loss: {loss}, Test Accuracy: {accuracy}")
# Step 8: Make Predictions
print("\nMaking predictions on test data...")
predictions = (model.predict(test_data) > 0.5).astype("int32")
print("\nSample Predictions:")
for i in range(min(5, len(test_data))):
    print(f"Text: {texts[train_size + i]}, Actual Label: {test_labels[i]}, Predicted Label: {predictions[i][0]}")
.TXT file content (one "text,label" pair per line):
I love this product,1
This is the worst experience ever,0
Absolutely fantastic! Highly recommend.,1
Not worth the money.,0
_________________________________
Note: Following the same format, you can create your own text file and run the program.
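If you prefer not to type the file by hand, the sketch below (using the hypothetical file name sample_labeled.txt) writes the same sample data in the expected "text,label" format; you can then upload it in Step 1 or point the open() call at it directly. With only four samples the 80/20 split leaves a single test example, so add more lines for any meaningful training.
# Write a small labeled dataset in "text,label" format (hypothetical file name)
sample_lines = [
    "I love this product,1",
    "This is the worst experience ever,0",
    "Absolutely fantastic! Highly recommend.,1",
    "Not worth the money.,0",
]
with open("sample_labeled.txt", "w", encoding="utf-8") as f:
    f.write("\n".join(sample_lines))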
################ LAB-7 ##########################
# Step 1: Install necessary libraries
!pip install tqdm --quiet # For progress bars
# Step 2: Upload a sample text file
from google.colab import files
print("Upload a sample text file for indexing (e.g., a .txt file).")
uploaded_files = files.upload()
# Read the uploaded file
file_content = None
file_name = list(uploaded_files.keys())[0]
with open(file_name, 'r', encoding='utf-8') as file:
    file_content = file.readlines()
# Step 3: Scalable Indexing Implementation
from collections import defaultdict
from tqdm import tqdm
class ScalableIndexer:
    def __init__(self):
        self.index = defaultdict(list)  # Dictionary for word-to-line mapping

    def index_file(self, file_content):
        """Indexes a text file line by line."""
        for line_num, line in enumerate(tqdm(file_content, desc="Indexing lines")):
            words = line.strip().split()
            for word in words:
                self.index[word.lower()].append(line_num)  # Lowercase for case-insensitive indexing

    def search(self, term):
        """Searches for a term in the indexed data."""
        term = term.lower()
        if term in self.index:
            return self.index[term]
        else:
            return []
# Step 4: Index the uploaded file
indexer = ScalableIndexer()
indexer.index_file(file_content)
# Step 5: Perform a search
search_term = input("Enter a word to search for: ")
search_results = indexer.search(search_term)
# Display results
if search_results:
    print(f"Found '{search_term}' in lines: {search_results}")
    print("\nLines containing the term:")
    for line_num in search_results:
        print(f"Line {line_num + 1}: {file_content[line_num].strip()}")
else:
    print(f"'{search_term}' not found in the file.")
INPUT:
Create your own .txt file with a few lines of text, then search for a term in it.
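To test the indexer without an upload or interactive input, the sketch below (using hypothetical sample lines) feeds a small in-memory list of lines to ScalableIndexer and searches it directly.
# Exercise the indexer on a few hard-coded lines (hypothetical sample text)
demo_lines = [
    "Information retrieval deals with finding relevant documents.\n",
    "Indexing makes retrieval fast and scalable.\n",
    "A scalable indexer maps each word to the lines it appears on.\n",
]
demo_indexer = ScalableIndexer()
demo_indexer.index_file(demo_lines)
print(demo_indexer.search("retrieval"))  # expected: [0, 1]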