10-02-2025
# Import necessary libraries
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
# Sample dataset: Study Hours, Practice Tests, and Exam Scores
X = np.array([
[1, 2], # 1 hour of study, 2 practice tests
[2, 3], # 2 hours of study, 3 practice tests
[3, 5], # 3 hours of study, 5 practice tests
[4, 7], # 4 hours of study, 7 practice tests
[5, 8], # 5 hours of study, 8 practice tests
[6, 10], # 6 hours of study, 10 practice tests
[7, 12], # 7 hours of study, 12 practice tests
[8, 15], # 8 hours of study, 15 practice tests
[9, 18], # 9 hours of study, 18 practice tests
[10, 20] # 10 hours of study, 20 practice tests
])
Y = np.array([50, 55, 60, 65, 70, 75, 80, 85, 90, 95])  # Exam Scores (dependent variable)
# Create a multiple linear regression model
model = LinearRegression()
model.fit(X, Y)
# Predict scores
Y_pred = model.predict(X)
# Print model parameters (coefficients and intercept)
print("Coefficients (m1, m2):", model.coef_) # m1 for Study Hours, m2
for Practice Tests
print("Intercept (b):", model.intercept_)
# Evaluate model performance
mae = mean_absolute_error(Y, Y_pred)
mse = mean_squared_error(Y, Y_pred)
r2 = r2_score(Y, Y_pred)
print("\nModel Evaluation Metrics:")
print("Mean Absolute Error (MAE):", mae)
print("Mean Squared Error (MSE):", mse)
print("R² Score:", r2)
# 3D Visualization of Multiple Linear Regression
from mpl_toolkits.mplot3d import Axes3D
fig = plt.figure(figsize=(8, 6))
ax = fig.add_subplot(111, projection='3d')
# Scatter plot of actual data
ax.scatter(X[:, 0], X[:, 1], Y, color='blue', label='Actual Data')
# Generate a grid for plotting the regression plane
X1, X2 = np.meshgrid(np.linspace(1, 10, 10), np.linspace(2, 20, 10))
Y_plane = model.intercept_ + model.coef_[0] * X1 + model.coef_[1] * X2
# Plot the regression plane
ax.plot_surface(X1, X2, Y_plane, color='red', alpha=0.5)
ax.set_xlabel("Study Hours")
ax.set_ylabel("Practice Tests")
ax.set_zlabel("Exam Score")
ax.set_title("Multiple Linear Regression: Study Hours & Practice Tests
vs Exam Score")
plt.legend()
plt.show()
Coefficients (m1, m2): [ 5.0000000e+00 -2.4404744e-15]
Intercept (b): 45.0
Model Evaluation Metrics:
Mean Absolute Error (MAE): 0.0
Mean Squared Error (MSE): 0.0
R² Score: 1.0
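A note on this output: in the toy dataset the scores satisfy Score = 45 + 5 * Study_Hours exactly, so the model recovers b = 45 and m1 = 5, the Practice Tests coefficient is zero up to floating-point noise (-2.44e-15), and every error metric is 0 with R² = 1. A minimal usage sketch for new inputs (the student values below are made up for illustration):
# Predicting scores for unseen, made-up inputs with the fitted `model` above.
new_students = np.array([
    [4.5, 9],   # hypothetical: 4.5 study hours, 9 practice tests
    [11, 22]    # hypothetical: extrapolating past the training range
])
print("Predicted scores:", model.predict(new_students))  # ≈ [67.5, 100.0]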
Logistic Regression
Logistic regression is a statistical method used for classification. Unlike linear regression, which predicts continuous values, logistic regression predicts categorical outcomes.
Types of Logistic Regression
a) Binary Classification: Used when there are only two possible outcomes (e.g., Yes/No, Pass/Fail).
Example: Predicting whether a student will pass or fail based on study hours.
b) Multiclass Classification: Used when there are more than two possible outcomes (e.g., A/B/C/D grades).
Example: Predicting the grade of a student based on test scores and attendance.
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
# Sample Data: Study hours and past scores
data = {'Study_Hours': [2, 4, 6, 8, 10, 1, 3, 5, 7, 9],
'Past_Score': [40, 50, 60, 70, 80, 30, 45, 55, 65, 75],
'Pass': [0, 0, 1, 1, 1, 0, 0, 1, 1, 1]} # 1 = Pass, 0 = Fail
df = pd.DataFrame(data)
# Splitting data into training and testing sets
X = df[['Study_Hours', 'Past_Score']]
y = df['Pass']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# Logistic Regression Model
model = LogisticRegression()
model.fit(X_train, y_train)
# Predictions
y_pred = model.predict(X_test)
# Model Evaluation
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test,
y_pred))
# Sigmoid Function
def sigmoid(z):
    return 1 / (1 + np.exp(-z))
# Generate values for sigmoid graph
z_values = np.linspace(-10, 10, 100)
sigmoid_values = sigmoid(z_values)
# Plot Sigmoid Function
plt.figure(figsize=(8, 5))
plt.plot(z_values, sigmoid_values, label="Sigmoid Function", color='blue')
plt.axhline(y=0.5, color='r', linestyle='--', label="Decision Boundary (0.5)")
plt.xlabel("z (Weighted Sum)")
plt.ylabel("Sigmoid Output (Probability)")
plt.title("Sigmoid Function in Logistic Regression")
plt.legend()
plt.grid()
plt.show()
Accuracy: 0.5
Confusion Matrix:
 [[0 1]
 [0 1]]
Classification Report:
               precision    recall  f1-score   support

           0       0.00      0.00      0.00         1
           1       0.50      1.00      0.67         1

    accuracy                           0.50         2
   macro avg       0.25      0.50      0.33         2
weighted avg       0.25      0.50      0.33         2
/usr/local/lib/python3.11/dist-packages/sklearn/metrics/_classification.py:1565: UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
Precision: Out of all the students predicted to pass, how many actually passed?
Recall: Out of all the students who actually passed, how many were correctly predicted?
F1-Score: The harmonic mean of precision and recall, balancing the two.
Support: The number of students in each category (pass or fail).
Macro Avg: The unweighted average of precision, recall, and F1-score across both classes.
Weighted Avg: The same averages, weighted by the number of instances in each class.
All of these come straight from the confusion matrix counts; they are recomputed by hand in the sketch after the confusion-matrix walkthrough below.
Accuracy: 0.5 → the model correctly predicted 50% of the test cases. Since there are only two test samples, it got one prediction correct and one incorrect.
Confusion Matrix: [[0 1] [0 1]]
The confusion matrix helps understand the model's predictions.
Rows → Actual classes (0 = Fail, 1 = Pass); Columns → Predicted classes (0 = Fail, 1 = Pass).

Actual \ Predicted    0 (Fail)              1 (Pass)
0 (Fail)              0 (True Negative)     1 (False Positive)
1 (Pass)              0 (False Negative)    1 (True Positive)
True Negative (TN) = 0 → The model never predicted "Fail" correctly.
False Positive (FP) = 1 → The model incorrectly predicted "Pass" for a failing student.
False Negative (FN) = 0 → The model never missed a passing student.
True Positive (TP) = 1 → The model correctly predicted one student as "Pass."
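These four counts are exactly what the classification report is built from. A minimal sketch recomputing the class-1 ("Pass") metrics by hand, using the TN/FP/FN/TP values listed above:
# Recomputing the report's class-1 metrics from the confusion matrix counts.
TN, FP, FN, TP = 0, 1, 0, 1

precision = TP / (TP + FP)                          # 1 / 2 = 0.50
recall = TP / (TP + FN)                             # 1 / 1 = 1.00
f1 = 2 * precision * recall / (precision + recall)  # 0.67
accuracy = (TP + TN) / (TP + TN + FP + FN)          # 1 / 2 = 0.50

print(precision, recall, round(f1, 2), accuracy)    # 0.5 1.0 0.67 0.5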
Evaluation Metrics for Classification
To check how well our logistic regression model performs, we use:
Accuracy: Measures how many predictions are correct.
Precision & Recall: Useful when data is imbalanced.
Confusion Matrix: Shows true positives, false positives, true negatives, and false negatives.
F1 Score: A balance between precision and recall.
EXPLANATION
1. Logistic regression is used for classification problems.
2. It works for both binary (two classes) and multiclass problems.
3. Evaluation metrics like accuracy, precision, and the confusion matrix help assess model performance.
4. Python makes it easy to implement using libraries like sklearn.
Python Code for Multiclass Classification
Multiclass classification is used when we have more than two possible categories. In this
example, we predict student grades (A, B, or C) based on study hours and test scores.
# Import Necessary Libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
import seaborn as sns
# Create a Simple Dataset
# Creating a dataset with Study Hours, Test Scores, and Grades
data = {
'Study_Hours': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
'Test_Score': [40, 50, 55, 60, 65, 70, 75, 80, 85, 90],
'Grade': ['C', 'C', 'C', 'B', 'B', 'B', 'A', 'A', 'A', 'A']  # A = High, B = Medium, C = Low
}
# Convert dictionary to pandas DataFrame
df = pd.DataFrame(data)
# Encode Grades into numerical values (LabelEncoder sorts classes alphabetically: A -> 0, B -> 1, C -> 2)
label_encoder = LabelEncoder()
df['Grade'] = label_encoder.fit_transform(df['Grade'])  # A=0, B=1, C=2
# Splitting Data into Features (X) and Target (y)
X = df[['Study_Hours', 'Test_Score']]
y = df['Grade']
# Splitting into Training (80%) and Testing (20%) Data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# Train a Logistic Regression Model
# Initialize and Train Logistic Regression Model
model = LogisticRegression(multi_class='ovr', solver='lbfgs')
model.fit(X_train, y_train)
# Predict on Test Data
y_pred = model.predict(X_test)
# Print Evaluation Metrics
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test,
y_pred))
# Plot the Decision Boundaries
# Plot Decision Boundaries
plt.figure(figsize=(8, 6))
# Scatter plot of Study Hours vs. Test Score
sns.scatterplot(x=df['Study_Hours'], y=df['Test_Score'], hue=df['Grade'], palette='coolwarm')
# Create a mesh grid
x_min, x_max = X['Study_Hours'].min() - 1, X['Study_Hours'].max() + 1
y_min, y_max = X['Test_Score'].min() - 1, X['Test_Score'].max() + 1
xx, yy = np.meshgrid(np.linspace(x_min, x_max, 100), np.linspace(y_min, y_max, 100))
# Predict the class for each point in the mesh grid
Z = model.predict(np.c_[xx.ravel(), yy.ravel()])
Z = Z.reshape(xx.shape)
# Contour plot to show decision regions
plt.contourf(xx, yy, Z, alpha=0.2, cmap='coolwarm')
plt.xlabel('Study Hours')
plt.ylabel('Test Score')
plt.title('Multiclass Classification: Predicting Student Grades')
plt.legend(title='Grade')
plt.show()
/usr/local/lib/python3.11/dist-packages/sklearn/linear_model/_logistic.py:1256: FutureWarning: 'multi_class' was deprecated in version 1.5 and will be removed in 1.7. Use OneVsRestClassifier(LogisticRegression(..)) instead. Leave it to its default value to avoid this warning.
  warnings.warn(
/usr/local/lib/python3.11/dist-packages/sklearn/utils/validation.py:2739: UserWarning: X does not have valid feature names, but LogisticRegression was fitted with feature names
  warnings.warn(
Accuracy: 1.0
Confusion Matrix:
 [[1 0]
 [0 1]]
Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00         1
           2       1.00      1.00      1.00         1

    accuracy                           1.00         2
   macro avg       1.00      1.00      1.00         2
weighted avg       1.00      1.00      1.00         2
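The FutureWarning above flags the multi_class='ovr' argument, which scikit-learn deprecated in version 1.5. A minimal sketch of the replacement the warning itself suggests (reusing X_train, y_train, X_test, and y_test from the code above):
from sklearn.linear_model import LogisticRegression
from sklearn.multiclass import OneVsRestClassifier

# Explicit one-vs-rest wrapper, per the FutureWarning's suggestion.
ovr_model = OneVsRestClassifier(LogisticRegression(solver='lbfgs'))
ovr_model.fit(X_train, y_train)
print("Accuracy:", ovr_model.score(X_test, y_test))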
1. Decision Tree Basics
A Decision Tree is a supervised learning algorithm used for classification and regression tasks. It splits the data into branches based on feature conditions to make predictions.
Root Node: The first decision point.
Internal Nodes: Decision points based on feature values.
Leaf Nodes: Final output categories.
For example: if a student studies for more than 6 hours, they are likely to get an 'A' grade; between 3 and 6 hours, they might get a 'B' grade; and less than 3 hours, a 'C' grade (sketched in code below).
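A minimal sketch of that rule as plain nested conditions (the thresholds come from the illustrative example above, not from a trained tree):
def grade_by_study_hours(hours):
    # Illustrative thresholds from the example above, not a learned model.
    if hours > 6:
        return 'A'
    elif hours >= 3:
        return 'B'
    else:
        return 'C'

print(grade_by_study_hours(7))  # 'A'
print(grade_by_study_hours(4))  # 'B'
print(grade_by_study_hours(2))  # 'C'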
Entropy
Entropy measures how mixed (impure) a group is: H = -Σ pᵢ log₂ pᵢ, where pᵢ is the fraction of samples in class i.
If a class has 10 students, and 5 passed while 5 failed, the entropy is high (maximally uncertain). But if 9 passed and only 1 failed, the entropy is lower because most students belong to one category.
In decision trees, the goal is to split the data in a way that reduces entropy, making the groups more pure.
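A quick check of those two cases against the entropy formula (a minimal sketch; the counts come from the example above):
import numpy as np

def entropy(counts):
    # H = -sum(p * log2(p)) over the class proportions.
    p = np.array(counts) / np.sum(counts)
    p = p[p > 0]  # empty classes contribute nothing (log2(0) is undefined)
    return -np.sum(p * np.log2(p))

print(entropy([5, 5]))  # 1.0   -> maximally uncertain split
print(entropy([9, 1]))  # ~0.47 -> mostly one class, much purer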
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
# Creating a dataset with Study Hours, Test Scores, and Grades
data = {
'Study_Hours': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
'Test_Score': [40, 50, 55, 60, 65, 70, 75, 80, 85, 90],
'Grade': ['C', 'C', 'C', 'B', 'B', 'B', 'A', 'A', 'A', 'A']  # A = High, B = Medium, C = Low
}
df = pd.DataFrame(data)
# Convert Grades into numerical values: A -> 2, B -> 1, C -> 0
df['Grade'] = df['Grade'].map({'C': 0, 'B': 1, 'A': 2})
# Splitting data into training and testing sets
X = df[['Study_Hours', 'Test_Score']]
y = df['Grade']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# Creating and training the Decision Tree model
model = DecisionTreeClassifier(criterion='entropy', max_depth=3, random_state=42)
model.fit(X_train, y_train)
# Making predictions
y_pred = model.predict(X_test)
# Evaluating the model
accuracy = accuracy_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)
report = classification_report(y_test, y_pred)
print("Accuracy:", accuracy)
print("Confusion Matrix:\n", conf_matrix)
print("Classification Report:\n", report)
# Plotting the Decision Tree
plt.figure(figsize=(10, 6))
plot_tree(model, feature_names=['Study_Hours', 'Test_Score'], class_names=['C', 'B', 'A'], filled=True)
plt.title("Decision Tree Visualization")
plt.show()
# Scatter Plot to Show Classification
plt.figure(figsize=(8, 6))
sns.scatterplot(x=df['Study_Hours'], y=df['Test_Score'], hue=df['Grade'], palette=['red', 'orange', 'green'], s=100, edgecolor='k')
plt.xlabel("Study Hours")
plt.ylabel("Test Score")
plt.title("Student Performance Classification")
plt.show()
Accuracy: 1.0
Confusion Matrix:
 [[1 0]
 [0 1]]
Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00         1
           2       1.00      1.00      1.00         1

    accuracy                           1.00         2
   macro avg       1.00      1.00      1.00         2
weighted avg       1.00      1.00      1.00         2
Random Forest
Random Forest is an ensemble learning method that builds many decision trees on random subsets of the data and combines their votes, much like students pooling what each of them has learned from past experience.
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import seaborn as sns
# Creating a dataset with Study Hours, Test Scores, and Grades
data = {
'Study_Hours': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
'Test_Score': [40, 50, 55, 60, 65, 70, 75, 80, 85, 90],
'Grade': ['C', 'C', 'C', 'B', 'B', 'B', 'A', 'A', 'A', 'A']  # A = High, B = Medium, C = Low
}
df = pd.DataFrame(data)
# Convert Grades into numerical values: A -> 2, B -> 1, C -> 0
df['Grade'] = df['Grade'].map({'C': 0, 'B': 1, 'A': 2})
# Splitting data into training and testing sets
X = df[['Study_Hours', 'Test_Score']]
y = df['Grade']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# Random Forest Model
model = RandomForestClassifier(n_estimators=10, random_state=42)
model.fit(X_train, y_train)
# Predictions
y_pred = model.predict(X_test)
# Model Evaluation
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test,
y_pred))
# Plot Decision Boundaries
plt.figure(figsize=(8, 6))
# Creating a grid of values
x_min, x_max = X['Study_Hours'].min() - 1, X['Study_Hours'].max() + 1
y_min, y_max = X['Test_Score'].min() - 10, X['Test_Score'].max() + 10
xx, yy = np.meshgrid(np.linspace(x_min, x_max, 100), np.linspace(y_min, y_max, 100))
# Predicting across the grid
Z = model.predict(np.c_[xx.ravel(), yy.ravel()])
Z = Z.reshape(xx.shape)
# Contour plot for decision boundaries
plt.contourf(xx, yy, Z, alpha=0.3)
plt.scatter(df['Study_Hours'], df['Test_Score'], c=df['Grade'], edgecolor='k', s=100, cmap='viridis')
# Labels and title
plt.xlabel('Study Hours')
plt.ylabel('Test Score')
plt.title('Random Forest Classification of Student Grades')
plt.colorbar(label='Grade (0=C, 1=B, 2=A)')
plt.show()
Accuracy: 1.0
Confusion Matrix:
 [[1 0]
 [0 1]]
Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00         1
           2       1.00      1.00      1.00         1

    accuracy                           1.00         2
   macro avg       1.00      1.00      1.00         2
weighted avg       1.00      1.00      1.00         2
/usr/local/lib/python3.11/dist-packages/sklearn/utils/validation.py:2739: UserWarning: X does not have valid feature names, but RandomForestClassifier was fitted with feature names
  warnings.warn(
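The UserWarning above appears because the mesh-grid points are passed to predict() as a bare NumPy array, while the model was fitted on a DataFrame with named columns. A minimal sketch of one way to avoid it (reusing xx, yy, and the fitted model from the code above):
import pandas as pd

# Wrap the grid in a DataFrame whose columns match the training features,
# so predict() sees the same feature names the model was fitted with.
grid = pd.DataFrame({'Study_Hours': xx.ravel(), 'Test_Score': yy.ravel()})
Z = model.predict(grid).reshape(xx.shape)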
Ensemble Methods with Random Forest
Ensemble methods are machine learning techniques that combine multiple models to make better predictions than any single model.
Instead of relying on one decision tree (which can be biased or inaccurate), ensemble methods aggregate many models: for classification the trees vote on the label, and for regression their outputs are averaged. This reduces errors and makes the model more accurate and stable. (The per-tree voting is sketched after the code and output below.)
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
# Creating a dataset for students
data = {
'Study_Hours': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
'Test_Score': [40, 50, 55, 60, 65, 70, 75, 80, 85, 90],
'Grade': ['C', 'C', 'C', 'B', 'B', 'B', 'A', 'A', 'A', 'A']  # A = High, B = Medium, C = Low
}
df = pd.DataFrame(data)
# Convert Grades into numerical values: A -> 2, B -> 1, C -> 0
df['Grade'] = df['Grade'].map({'C': 0, 'B': 1, 'A': 2})
# Splitting data into training and testing sets
X = df[['Study_Hours', 'Test_Score']]
y = df['Grade']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# Creating the Random Forest Model (Ensemble Learning)
rf_model = RandomForestClassifier(n_estimators=20, random_state=42)  # Using 20 decision trees
rf_model.fit(X_train, y_train)
# Making Predictions
y_pred = rf_model.predict(X_test)
# Evaluating the Model
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test,
y_pred))
# Plotting the Decision Boundary
plt.figure(figsize=(8,6))
# Creating a grid of values for plotting
x_min, x_max = X['Study_Hours'].min() - 1, X['Study_Hours'].max() + 1
y_min, y_max = X['Test_Score'].min() - 1, X['Test_Score'].max() + 1
xx, yy = np.meshgrid(np.linspace(x_min, x_max, 100), np.linspace(y_min, y_max, 100))
# Predicting values for the grid
Z = rf_model.predict(np.c_[xx.ravel(), yy.ravel()])
Z = Z.reshape(xx.shape)
# Creating the contour plot
plt.contourf(xx, yy, Z, alpha=0.3, cmap='coolwarm')
# Scatter plot of original data
sns.scatterplot(x='Study_Hours', y='Test_Score', hue=df['Grade'], palette='coolwarm', data=df, s=100, edgecolor='black')
plt.xlabel('Study Hours')
plt.ylabel('Test Score')
plt.title('Random Forest (Ensemble) Decision Boundary')
plt.legend(['C (Low)', 'B (Medium)', 'A (High)'])
plt.show()
Accuracy: 1.0
Confusion Matrix:
 [[1 0]
 [0 1]]
Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00         1
           2       1.00      1.00      1.00         1

    accuracy                           1.00         2
   macro avg       1.00      1.00      1.00         2
weighted avg       1.00      1.00      1.00         2
/usr/local/lib/python3.11/dist-packages/sklearn/utils/validation.py:2739: UserWarning: X does not have valid feature names, but RandomForestClassifier was fitted with feature names
  warnings.warn(
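To make the "ensemble" part concrete, the individual trees inside the fitted rf_model can be inspected directly. A minimal sketch (reusing rf_model and pd from the code above; the sample student is made up):
# Each tree in the forest votes; the forest returns the majority class.
sample = pd.DataFrame({'Study_Hours': [5], 'Test_Score': [65]})  # hypothetical student

# Note: sub-estimators return encoded class indices; since the training labels
# here are already 0/1/2, those indices coincide with the grade codes.
votes = [int(tree.predict(sample.values)[0]) for tree in rf_model.estimators_]
print("Individual tree votes:", votes)
print("Forest prediction (majority vote):", rf_model.predict(sample)[0])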
UNIT 2 ENDS