4/19/24, 4:06 PM nlp2.ipynb - Colab
# Environment setup: NLTK for tokenization, scikit-learn for vectorization.
import nltk
from sklearn.feature_extraction.text import CountVectorizer

# Download the Punkt tokenizer models; word_tokenize depends on them.
nltk.download('punkt')
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data] Unzipping tokenizers/punkt.zip.
True
# Sample data: four short sentences used as the toy corpus for every
# vectorization technique below (BoW, TF-IDF, Word2Vec).
corpus = [
    "SPPU is the one of the best university in India.",
    "India has already allowded so many new universities.",
    "AICTE is main authority in technical education.",
    "UGC and AICTE allowded technical education in india?",
]
# --- Bag of Words ---
# Fit a CountVectorizer on the corpus; each document becomes a row of
# raw term counts over the learned vocabulary.
vectorizer = CountVectorizer()
X_bow = vectorizer.fit_transform(corpus)

# Vocabulary (column labels) and the dense form of the sparse count matrix.
feature_names_bow = vectorizer.get_feature_names_out()
bow_matrix = X_bow.toarray()

# Display the vocabulary alongside the per-document counts.
print("Feature Names (BoW):", feature_names_bow)
print("BoW Matrix:\n", bow_matrix)
Feature Names (BoW): ['aicte' 'allowded' 'already' 'and' 'authority' 'best' 'education' 'has'
'in' 'india' 'is' 'main' 'many' 'new' 'of' 'one' 'so' 'sppu' 'technical'
'the' 'ugc' 'universities' 'university']
BoW Matrix:
[[0 0 0 0 0 1 0 0 1 1 1 0 0 0 1 1 0 1 0 2 0 0 1]
[0 1 1 0 0 0 0 1 0 1 0 0 1 1 0 0 1 0 0 0 0 1 0]
[1 0 0 0 1 0 1 0 1 0 1 1 0 0 0 0 0 0 1 0 0 0 0]
[1 1 0 1 0 0 1 0 1 1 0 0 0 0 0 0 0 0 1 0 1 0 0]]
1 !pip install scikit-learn gensim nltk
2 from sklearn.feature_extraction.text import TfidfVectorizer
Requirement already satisfied: scikit-learn in /usr/local/lib/python3.10/dist-packages (1.2.2)
Requirement already satisfied: gensim in /usr/local/lib/python3.10/dist-packages (4.3.2)
Requirement already satisfied: nltk in /usr/local/lib/python3.10/dist-packages (3.8.1)
Requirement already satisfied: numpy>=1.17.3 in /usr/local/lib/python3.10/dist-packages (from scikit-learn) (1.25.2)
Requirement already satisfied: scipy>=1.3.2 in /usr/local/lib/python3.10/dist-packages (from scikit-learn) (1.11.4)
Requirement already satisfied: joblib>=1.1.1 in /usr/local/lib/python3.10/dist-packages (from scikit-learn) (1.4.0)
Requirement already satisfied: threadpoolctl>=2.0.0 in /usr/local/lib/python3.10/dist-packages (from scikit-learn) (3.4.0)
Requirement already satisfied: smart-open>=1.8.1 in /usr/local/lib/python3.10/dist-packages (from gensim) (6.4.0)
Requirement already satisfied: click in /usr/local/lib/python3.10/dist-packages (from nltk) (8.1.7)
Requirement already satisfied: regex>=2021.8.3 in /usr/local/lib/python3.10/dist-packages (from nltk) (2023.12.25)
Requirement already satisfied: tqdm in /usr/local/lib/python3.10/dist-packages (from nltk) (4.66.2)
# --- TF-IDF ---
# Same pipeline as BoW, but counts are replaced by TF-IDF weights
# (L2-normalized per document by TfidfVectorizer's defaults).
vectorizer_tfidf = TfidfVectorizer()
X_tfidf = vectorizer_tfidf.fit_transform(corpus)

# Vocabulary and the dense weight matrix.
feature_names_tfidf = vectorizer_tfidf.get_feature_names_out()
tfidf_matrix = X_tfidf.toarray()

# Display the vocabulary alongside the per-document TF-IDF weights.
print("Feature Names (TF-IDF):", feature_names_tfidf)
print("TF-IDF Matrix:\n", tfidf_matrix)
Feature Names (TF-IDF): ['aicte' 'allowded' 'already' 'and' 'authority' 'best' 'education' 'has'
'in' 'india' 'is' 'main' 'many' 'new' 'of' 'one' 'so' 'sppu' 'technical'
'the' 'ugc' 'universities' 'university']
TF-IDF Matrix:
[[0. 0. 0. 0. 0. 0.30954541
0. 0. 0.19757882 0.19757882 0.24404915 0.
0. 0. 0.30954541 0.30954541 0. 0.30954541
0. 0.61909081 0. 0. 0.30954541]
[0. 0.29737611 0.37718389 0. 0. 0.
0. 0.37718389 0. 0.24075159 0. 0.
0.37718389 0.37718389 0. 0. 0.37718389 0.
0. 0. 0. 0.37718389 0. ]
[0.35639424 0. 0. 0. 0.4520409 0.
0.35639424 0. 0.28853185 0. 0.35639424 0.4520409
https://colab.research.google.com/drive/1GWw2psLJ4rs1IT5iUg9xdoMOZ87BiYaF#scrollTo=9b32KeWq6dd-&uniqifier=2&printMode=true 1/3
4/19/24, 4:06 PM nlp2.ipynb - Colab
0. 0. 0. 0. 0. 0.
0.35639424 0. 0. 0. 0. ]
[0.34242558 0.34242558 0. 0.43432343 0. 0.
0.34242558 0. 0.27722302 0.27722302 0. 0.
0. 0. 0. 0. 0. 0.
0.34242558 0. 0.43432343 0. 0. ]]
from gensim.models import Word2Vec
from nltk.tokenize import word_tokenize

# --- Word2Vec ---
# Lower-case and word-tokenize each document into a list of tokens.
tokenized_corpus = [word_tokenize(doc.lower()) for doc in corpus]

# Train a Word2Vec model on the tokenized corpus: 100-dimensional
# vectors, context window of 5, and min_count=1 so no token is dropped.
model_w2v = Word2Vec(
    sentences=tokenized_corpus,
    vector_size=100,
    window=5,
    min_count=1,
    workers=4,
)

# Collect the learned vector for every token occurrence (duplicates of a
# word therefore repeat its vector).
embeddings_w2v = [model_w2v.wv[word] for doc in tokenized_corpus for word in doc]

print("Word2Vec Embeddings (Example):", embeddings_w2v[:5])
output Word2Vec Embeddings (Example): [array([-4.9724146e-03, -1.2821439e-03, 3.2808294e-03, -6.4131343e-03,
-9.7032748e-03, -9.2617292e-03, 9.0226065e-03, 5.3696753e-03,
-4.7882134e-03, -8.3339782e-03, 1.2951550e-03, 2.8790133e-03,
-1.2458978e-03, 1.2699742e-03, -4.3185740e-03, 4.7948617e-03,
1.4796027e-03, 8.8773808e-03, -9.9788336e-03, -5.2726669e-03,
-9.1006216e-03, -3.4521171e-04, -7.8554507e-03, 5.0299861e-03,
-6.3978485e-03, -5.9502255e-03, 5.0689173e-03, -8.1629418e-03,
1.4552462e-03, -7.2365543e-03, 9.8626213e-03, 8.6347228e-03,
1.7700142e-03, 5.7870778e-03, 4.5951647e-03, -5.9907152e-03,
9.7548291e-03, -9.6800094e-03, 8.0489898e-03, 2.7558431e-03,
-3.0530239e-03, -3.5616157e-03, 9.0742577e-03, -5.4402603e-03,
8.1877513e-03, -6.0094744e-03, 8.3887624e-03, -5.5658707e-04,
7.9459315e-03, -3.1532587e-03, 5.9769000e-03, 8.8024903e-03,
2.5420673e-03, 1.3162253e-03, 5.0389166e-03, 8.0063958e-03,
8.5699316e-03, 8.4947534e-03, 7.0525687e-03, 8.0050612e-03,
8.6004017e-03, -3.2667242e-05, -1.0029497e-03, 1.6668305e-03,
4.6866389e-06, 6.8768725e-04, -8.6033335e-03, -9.5959110e-03,
-2.3133331e-03, 8.9247189e-03, -3.6475467e-03, -6.9804057e-03,
4.8784767e-03, 1.0698296e-03, 1.8517259e-03, 3.6527361e-03,
3.5221805e-03, 5.7269363e-03, 1.2339676e-03, 8.4258645e-04,
9.0451026e-03, 2.7826610e-03, -4.7025373e-03, 6.5429192e-03,
5.2161720e-03, 2.8710719e-03, -3.1352045e-03, 3.3388904e-03,
6.3642915e-03, 7.0779454e-03, 9.4181398e-04, -8.5304342e-03,
2.5565538e-04, 3.7333352e-04, 3.9412794e-03, -9.4706584e-03,
9.7080907e-03, -6.9747777e-03, 5.7595358e-03, -9.4276723e-03],
dtype=float32), array([-0.0071398 , 0.00124439, -0.00717616, -0.00223565, 0.00371874,
0.00583367, 0.001202 , 0.00210848, -0.00410963, 0.00722465,
-0.00630294, 0.00464309, -0.0082172 , 0.00204422, -0.00497717,
-0.00425125, -0.00310916, 0.00565882, 0.00579249, -0.00497653,
0.00077368, -0.00849352, 0.00780642, 0.00925912, -0.00274006,
0.00079614, 0.00074861, 0.00547782, -0.00860957, 0.00058059,
0.00686888, 0.00222321, 0.00112738, -0.00932088, 0.00847669,
-0.00625879, -0.00298613, 0.00349368, -0.00077095, 0.00141088,
0.00178102, -0.00682666, -0.00973249, 0.00904355, 0.00619567,
-0.00691088, 0.00339972, 0.00020398, 0.00475398, -0.00711601,
0.00402788, 0.00434206, 0.0099519 , -0.00447311, -0.00138774,
-0.00731545, -0.00969014, -0.00908436, -0.00102474, -0.00650439,
0.00484432, -0.00616408, 0.0025211 , 0.00072896, -0.00339727,
-0.00097363, 0.00997826, 0.00914278, -0.00446263, 0.00908478,
-0.00564142, 0.00593425, -0.00309757, 0.00342886, 0.00302015,
0.006903 , -0.00237185, 0.00877823, 0.00758474, -0.0095498 ,
-0.00801289, -0.00763687, 0.00292587, -0.00279558, -0.00693359,
-0.00812493, 0.00830964, 0.00197929, -0.00933083, -0.00478753,
0.00313186, -0.0047108 , 0.00528206, -0.00423214, 0.00264669,
-0.00804493, 0.00620823, 0.00481998, 0.00078511, 0.00301797],
dtype=float32), array([ 8.1650205e-03, -4.4393395e-03, 8.9832470e-03, 8.2537076e-03,
-4.4381348e-03, 3.0088305e-04, 4.2714751e-03, -3.9304695e-03,
-5.5628875e-03, -6.5138922e-03, -6.7317014e-04, -2.9316242e-04,
4.4594160e-03, -2.4768524e-03, -1.6832585e-04, 2.4654416e-03,
4.8718420e-03, -2.8879360e-05, -6.3401391e-03, -9.2649423e-03,
2.9410048e-05, 6.6641076e-03, 1.4697608e-03, -8.9649623e-03,
-7.9361815e-03, 6.5568490e-03, -3.7907732e-03, 6.2528555e-03,
-6.6814339e-03, 8.4838886e-03, -6.5139448e-03, 3.2910376e-03,
-1.0536474e-03, -6.7908973e-03, -3.2850883e-03, -1.1634642e-03,
-5.4759043e-03, -1.2073567e-03, -7.5638522e-03, 2.6458006e-03,
9.0703918e-03, -2.3795378e-03, -9.7446056e-04, 3.5161036e-03,
8.6651891e-03, -5.9261033e-03, -6.8902504e-03, -2.9335832e-03,
9.1518667e-03, 8.6510333e-04, -8.6797718e-03, -1.4467967e-03,
https://colab.research.google.com/drive/1GWw2psLJ4rs1IT5iUg9xdoMOZ87BiYaF#scrollTo=9b32KeWq6dd-&uniqifier=2&printMode=true 2/3
4/19/24, 4:06 PM nlp2.ipynb - Colab
https://colab.research.google.com/drive/1GWw2psLJ4rs1IT5iUg9xdoMOZ87BiYaF#scrollTo=9b32KeWq6dd-&uniqifier=2&printMode=true 3/3