Unit 2
December 25, 2023
3) Implement the Euclidean distance and cosine similarity metrics from scratch in Python and
apply them to compare two vectors or data points.
[ ]: import math

# Function to calculate Euclidean distance
def euclidean_distance(vector1, vector2):
    if len(vector1) != len(vector2):
        raise ValueError("Both vectors must have the same dimensions.")
    return math.sqrt(sum((x - y) ** 2 for x, y in zip(vector1, vector2)))

# Function to calculate cosine similarity
def cosine_similarity(vector1, vector2):
    if len(vector1) != len(vector2):
        raise ValueError("Both vectors must have the same dimensions.")
    dot_product = sum(x * y for x, y in zip(vector1, vector2))
    magnitude_vector1 = math.sqrt(sum(x ** 2 for x in vector1))
    magnitude_vector2 = math.sqrt(sum(x ** 2 for x in vector2))
    # Guard against division by zero for zero-magnitude vectors
    if magnitude_vector1 == 0 or magnitude_vector2 == 0:
        raise ValueError("Cosine similarity is undefined for zero vectors.")
    return dot_product / (magnitude_vector1 * magnitude_vector2)

# Define two vectors
vector1 = [3, 1, 4, 1, 5]
vector2 = [2, 1, 2, 2, 3]

# Calculate Euclidean distance between vector1 and vector2
print("Euclidean distance:", euclidean_distance(vector1, vector2))

# Calculate cosine similarity between vector1 and vector2
print("Cosine similarity:", cosine_similarity(vector1, vector2))
7) Implement the KNN algorithm in Python and apply it to a dataset to make predictions for a
new data point.
[ ]: import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import mean_squared_error
import matplotlib.pyplot as plt
# Generate a synthetic dataset
np.random.seed(42)
X = np.sort(5 * np.random.rand(100, 1), axis=0)
y = np.sin(X).ravel()
# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# Visualize the dataset
plt.scatter(X, y, color='blue', label='Original Data')
plt.title('Synthetic Dataset for KNN Regression')
plt.xlabel('X')
plt.ylabel('y')
plt.legend()
plt.show()
# Implement KNN Regression
knn_regressor = KNeighborsRegressor(n_neighbors=3)  # You can adjust the number of neighbors (k)
knn_regressor.fit(X_train, y_train)
# Make predictions for a new data point
new_data_point = np.array([[2.5]]) # Replace with your own data point
predicted_value = knn_regressor.predict(new_data_point)
# Visualize the regression line
X_range = np.linspace(0, 5, 100).reshape(-1, 1)
y_pred_range = knn_regressor.predict(X_range)
plt.scatter(X, y, color='blue', label='Original Data')
plt.plot(X_range, y_pred_range, color='red', label='KNN Regression Line')
plt.scatter(new_data_point, predicted_value, color='green', label='New Data Point Prediction', marker='x', s=100)
plt.title('KNN Regression')
plt.xlabel('X')
plt.ylabel('y')
plt.legend()
plt.show()
# Evaluate the model on the test set
y_pred_test = knn_regressor.predict(X_test)
mse = mean_squared_error(y_test, y_pred_test)
print(f'Mean Squared Error on Test Set: {mse:.2f}')
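The value k=3 above is an arbitrary starting point; a minimal sketch (reusing X_train and y_train from the cell above) that picks k by 5-fold cross-validated error:

[ ]: from sklearn.model_selection import cross_val_score

# Score k = 1..14 by cross-validated MSE on the training set; prefer the smallest
for k in range(1, 15):
    scores = cross_val_score(KNeighborsRegressor(n_neighbors=k), X_train, y_train,
                             cv=5, scoring='neg_mean_squared_error')
    print(f'k={k:2d}  mean CV MSE: {-scores.mean():.4f}')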
9) Train a logistic regression model on a binary classification dataset and analyze the importance
of each feature using its corresponding coefficient.
[ ]: import pandas as pd
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
# Load the Iris dataset
iris = load_iris()
X = iris.data
y = iris.target
# Keep only classes 0 and 1 (setosa vs. versicolor) for binary classification
X_binary = X[y != 2]
y_binary = y[y != 2]
# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_binary, y_binary, test_size=0.2, random_state=42)
# Train a logistic regression model
logreg_model = LogisticRegression()
logreg_model.fit(X_train, y_train)
# Make predictions on the test set
y_pred = logreg_model.predict(X_test)
# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy on Test Set: {accuracy:.2f}')
# Analyze feature importance using coefficients
feature_importance = pd.DataFrame({
'Feature': iris.feature_names,
'Coefficient': logreg_model.coef_[0]
})
# Display the feature importance
print('\nFeature Importance:')
print(feature_importance)
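Raw logistic-regression coefficients are only comparable across features when the features share a scale. A minimal sketch (reusing the split from the cell above) that standardizes inside a pipeline and ranks features by absolute coefficient:

[ ]: from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline

# Standardize features so coefficient magnitudes are directly comparable
pipeline = make_pipeline(StandardScaler(), LogisticRegression())
pipeline.fit(X_train, y_train)

coefs = pipeline.named_steps['logisticregression'].coef_[0]
importance = pd.DataFrame({'Feature': iris.feature_names, 'Coefficient': coefs})
importance['|Coefficient|'] = importance['Coefficient'].abs()
print(importance.sort_values('|Coefficient|', ascending=False))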
11) Implement a linear SVM classifier using Python’s scikit-learn library for a binary classification
problem. Visualize the decision boundary and support vectors.
[ ]: import numpy as np
import matplotlib.pyplot as plt
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
# Load a synthetic dataset for binary classification
X, y = datasets.make_classification(n_samples=100, n_features=2, n_informative=2, n_redundant=0, n_classes=2, n_clusters_per_class=1, random_state=42)
# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# Train a linear SVM classifier
svm_classifier = SVC(kernel='linear')
svm_classifier.fit(X_train, y_train)
# Visualize the decision boundary and support vectors
plt.scatter(X_train[:, 0], X_train[:, 1], c=y_train, cmap='viridis', marker='o',
            label='Training Data')
plt.scatter(svm_classifier.support_vectors_[:, 0], svm_classifier.support_vectors_[:, 1],
            s=100, facecolors='none', edgecolors='k', marker='o', label='Support Vectors')
plt.xlabel('Feature 1')
plt.ylabel('Feature 2')
# Create a meshgrid to plot the decision boundary
xx, yy = np.meshgrid(np.linspace(X[:, 0].min(), X[:, 0].max(), 100),
np.linspace(X[:, 1].min(), X[:, 1].max(), 100))
Z = svm_classifier.decision_function(np.c_[xx.ravel(), yy.ravel()])
Z = Z.reshape(xx.shape)
# Plot decision boundary
plt.contour(xx, yy, Z, colors='k', levels=[-1, 0, 1], alpha=0.5, linestyles=['--', '-', '--'])
plt.title('Linear SVM Classifier with Decision Boundary and Support Vectors')
plt.legend()
plt.show()
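Because the kernel is linear, the fitted hyperplane can be read off directly. A minimal sketch (reusing svm_classifier from the cell above) that prints the weights and the margin width 2/||w||, i.e. the distance between the two dashed lines in the plot:

[ ]: # For a linear SVM, decision_function(x) = w . x + b
w = svm_classifier.coef_[0]
b = svm_classifier.intercept_[0]
print(f'w = {w}, b = {b:.4f}')

# Geometric margin between the two dashed level curves (-1 and +1)
print(f'Margin width: {2 / np.linalg.norm(w):.4f}')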
13) Build a Naive Bayes classifier to classify text documents into different categories. Preprocess
the text data and use the Laplace smoothing technique to handle unseen words.
[ ]: import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report
# Sample text data for illustration
data = {'text': ["This is a positive document.",
"Negative sentiment detected in this text.",
"The sentiment in this document is positive.",
"This is another positive example."],
'label': ['Positive', 'Negative', 'Positive', 'Positive']}
df = pd.DataFrame(data)
# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(df['text'], df['label'], test_size=0.2, random_state=42)
# Preprocess the text data using CountVectorizer
vectorizer = CountVectorizer()
X_train_vectorized = vectorizer.fit_transform(X_train)
X_test_vectorized = vectorizer.transform(X_test)
# Build a Naive Bayes classifier with Laplace smoothing
naive_bayes_classifier = MultinomialNB(alpha=1.0)  # Laplace smoothing parameter (alpha=1.0 for add-one smoothing)
naive_bayes_classifier.fit(X_train_vectorized, y_train)
# Make predictions on the test set
y_pred = naive_bayes_classifier.predict(X_test_vectorized)
# Evaluate the classifier
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy:.2f}\n')
print('Classification Report:')
print(classification_report(y_test, y_pred))
# Test with a new document
new_document = ["This document is very positive and contains positive words."]
new_document_vectorized = vectorizer.transform(new_document)
predicted_category = naive_bayes_classifier.predict(new_document_vectorized)
print(f'\nPredicted Category for the New Document: {predicted_category[0]}')
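To see what alpha=1.0 actually buys, here is a minimal from-scratch sketch of Laplace smoothing with hypothetical word counts (not tied to the cell above). The smoothed likelihood is P(w|c) = (count(w, c) + alpha) / (total_c + alpha * V), so an unseen word gets a small but nonzero probability instead of zeroing out the whole product:

[ ]: alpha = 1.0
vocab_size = 6                                          # V: hypothetical vocabulary size
word_counts_positive = {'positive': 3, 'document': 2}   # hypothetical counts for class "Positive"
total_positive = sum(word_counts_positive.values())

def smoothed_prob(word):
    count = word_counts_positive.get(word, 0)
    return (count + alpha) / (total_positive + alpha * vocab_size)

print(smoothed_prob('positive'))   # seen word
print(smoothed_prob('terrible'))   # unseen word -> nonzero thanks to smoothing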
14) Compare the performance of the Naive Bayes algorithm with other classification algorithms on
a given dataset.
[ ]: import numpy as np
import pandas as pd
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score, classification_report
# Load the Iris dataset
iris = load_iris()
X = iris.data
y = iris.target
# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# Standardize the features
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)
# Define classifiers
classifiers = {
    'Naive Bayes': GaussianNB(),
    'Random Forest': RandomForestClassifier(random_state=42),
    'Support Vector Machine': SVC(kernel='linear', random_state=42),
    'K-Nearest Neighbors': KNeighborsClassifier(),
}
# Train and evaluate each classifier
results = []
for clf_name, clf in classifiers.items():
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    classification_rep = classification_report(y_test, y_pred, target_names=iris.target_names, output_dict=True)
    results.append({'Classifier': clf_name, 'Accuracy': accuracy, 'Classification Report': classification_rep})
# Display the results
df_results = pd.DataFrame(results)
print(df_results)
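A single 80/20 split of 150 samples is noisy; a minimal sketch (reusing the classifiers dict above) that compares the same models with 5-fold cross-validation, keeping the scaling inside each fold via a pipeline to avoid leakage:

[ ]: from sklearn.model_selection import cross_val_score
from sklearn.pipeline import make_pipeline

# Mean cross-validated accuracy is a steadier comparison than one split
for clf_name, clf in classifiers.items():
    pipe = make_pipeline(StandardScaler(), clf)
    scores = cross_val_score(pipe, X, y, cv=5)
    print(f'{clf_name}: {scores.mean():.3f} +/- {scores.std():.3f}')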
15) Build a Random Forest classifier using scikit-learn and apply it to a dataset.
[ ]: import numpy as np
import pandas as pd
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
# Load the Iris dataset
iris = load_iris()
X = iris.data
y = iris.target
# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# Build a Random Forest classifier
random_forest_classifier = RandomForestClassifier(n_estimators=100, random_state=42)
random_forest_classifier.fit(X_train, y_train)
# Make predictions on the test set
y_pred = random_forest_classifier.predict(X_test)
# Evaluate the classifier
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy:.2f}\n')
print('Classification Report:')
print(classification_report(y_test, y_pred, target_names=iris.target_names))
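Random forests also expose impurity-based feature importances; a minimal sketch (reusing random_forest_classifier from the cell above) that ranks the Iris features:

[ ]: # Impurity-based importances learned by the forest (they sum to 1)
importances = pd.DataFrame({
    'Feature': iris.feature_names,
    'Importance': random_forest_classifier.feature_importances_
}).sort_values('Importance', ascending=False)
print(importances)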