In [ ]: # This Python 3 environment comes with many helpful analytics libraries inst
# It is defined by the kaggle/python Docker image: https://github.com/kaggle
# For example, here's several helpful packages to load
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will l
import os
# Print the full path of every file found under Kaggle's read-only input directory.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))
# You can write up to 20GB to the current directory (/kaggle/working/) that
# You can also write temporary files to /kaggle/temp/, but they won't be sav
Student Performance Analysis Using
Machine Learning
This notebook implements various machine learning techniques on the Student
Performance dataset from Kaggle.
Included Techniques:
Simple & Multiple Linear Regression
Polynomial, Lasso & Ridge Regression
Naïve Bayes, Logistic Regression
Decision Tree, SVM, K-NN Classifier
Artificial Neural Network
K-Means and Hierarchical Clustering
Evaluation Metrics:
R² Score (Regression)
Confusion Matrix, F1 Score (Classification)
Silhouette Score (Clustering)
In [1]: import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.metrics import (
    mean_squared_error,
    r2_score,
    confusion_matrix,
    classification_report,
    silhouette_score,
)
In [4]: df = pd.read_csv('/kaggle/input/students-performance-in-exams/StudentsPerfor
# One-hot encode the categorical columns; drop_first=True removes one dummy per
# category so the encoded columns are not perfectly collinear.
df = pd.get_dummies(df, drop_first=True)
# Regression target is the math score; every remaining column is a feature.
y = df['math score']
X = df.drop(columns=['math score'])
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, ran
# Learn the mean/variance from the training split only, then apply the same
# transform to both splits so no test-set statistics leak into training.
scaler = StandardScaler().fit(X_train)
X_train, X_test = scaler.transform(X_train), scaler.transform(X_test)
📈 Regression Models
1️⃣ Simple Linear Regression
In [5]: from sklearn.linear_model import LinearRegression
# "Simple" regression: fit on the first column of the scaled feature matrix only.
# NOTE(review): column 0 is presumably the reading score (first non-target numeric
# column after get_dummies) — confirm against X.columns.
model = LinearRegression().fit(X_train[:, :1], y_train)
y_pred = model.predict(X_test[:, :1])
print("R² Score:", r2_score(y_test, y_pred))
R² Score: 0.6804469009921283
2️⃣ Multiple Linear Regression
In [6]: model = LinearRegression()
# Fit on every scaled feature and score on the held-out split (fit returns the
# estimator itself, so predict can be chained onto it).
y_pred = model.fit(X_train, y_train).predict(X_test)
print("R² Score:", r2_score(y_test, y_pred))
R² Score: 0.8804332983749564
3️⃣ Polynomial Regression
In [7]: poly = PolynomialFeatures(degree=2)
# Expand the training features with `poly`, then fit an ordinary linear model on
# the expanded matrix.
X_poly = poly.fit_transform(X_train)
model = LinearRegression().fit(X_poly, y_train)
# The test split must go through the already-fitted expansion (transform, not fit).
y_pred = model.predict(poly.transform(X_test))
print("R² Score:", r2_score(y_test, y_pred))
R² Score: 0.8650480765142721
4️⃣ Ridge and Lasso Regression
In [8]: from sklearn.linear_model import Ridge, Lasso
# L2-regularized linear regression.
ridge = Ridge(alpha=1.0)
ridge.fit(X_train, y_train)
# L1-regularized linear regression.
lasso = Lasso(alpha=0.1)
lasso.fit(X_train, y_train)
# Report held-out R² for both regularized models.
print("Ridge R²:", r2_score(y_test, ridge.predict(X_test)))
print("Lasso R²:", r2_score(y_test, lasso.predict(X_test)))
Ridge R²: 0.8805453685953484
Lasso R²: 0.8822147639745545
🤖 Classification Models
🔁 Convert Target to Binary (Pass/Fail)
In [9]: df['target'] = ['pass' if s >= 50 else 'fail' for s in df['math score']]
# The pass/fail label is the classification target; the raw math score is removed
# from the features as well, since the label is derived directly from it.
y = df['target']
X = df.drop(columns=['math score', 'target'])
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, ran
# Refit the existing scaler on the new training split, then standardize both
# splits with those training-only statistics.
scaler.fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)
5️⃣ Logistic Regression
In [10]: from sklearn.linear_model import LogisticRegression
# Logistic regression with library defaults on the standardized pass/fail split.
log_model = LogisticRegression().fit(X_train, y_train)
log_preds = log_model.predict(X_test)
# Confusion-matrix rows are the true classes, columns the predicted classes.
print("Confusion Matrix:\n", confusion_matrix(y_test, log_preds))
print("Classification Report:\n", classification_report(y_test, log_preds))
Confusion Matrix:
[[ 20 14]
[ 7 159]]
Classification Report:
precision recall f1-score support
fail 0.74 0.59 0.66 34
pass 0.92 0.96 0.94 166
accuracy 0.90 200
macro avg 0.83 0.77 0.80 200
weighted avg 0.89 0.90 0.89 200
6️⃣ Naïve Bayes Classifier
In [11]: from sklearn.naive_bayes import GaussianNB
# Gaussian naive Bayes on the standardized pass/fail split.
nb_model = GaussianNB().fit(X_train, y_train)
nb_preds = nb_model.predict(X_test)
# Confusion-matrix rows are the true classes, columns the predicted classes.
print("Confusion Matrix:\n", confusion_matrix(y_test, nb_preds))
print("Classification Report:\n", classification_report(y_test, nb_preds))
Confusion Matrix:
[[ 23 11]
[ 18 148]]
Classification Report:
precision recall f1-score support
fail 0.56 0.68 0.61 34
pass 0.93 0.89 0.91 166
accuracy 0.85 200
macro avg 0.75 0.78 0.76 200
weighted avg 0.87 0.85 0.86 200
7️⃣ K-Nearest Neighbors (K-NN)
In [12]: from sklearn.neighbors import KNeighborsClassifier
# K-NN votes among the 5 nearest training points; distances are computed on the
# standardized features.
knn_model = KNeighborsClassifier(n_neighbors=5).fit(X_train, y_train)
knn_preds = knn_model.predict(X_test)
# Confusion-matrix rows are the true classes, columns the predicted classes.
print("Confusion Matrix:\n", confusion_matrix(y_test, knn_preds))
print("Classification Report:\n", classification_report(y_test, knn_preds))
Confusion Matrix:
[[ 9 25]
[ 4 162]]
Classification Report:
precision recall f1-score support
fail 0.69 0.26 0.38 34
pass 0.87 0.98 0.92 166
accuracy 0.85 200
macro avg 0.78 0.62 0.65 200
weighted avg 0.84 0.85 0.83 200
8️⃣ Decision Tree Classification
In [13]: from sklearn.tree import DecisionTreeClassifier
# Decision tree with a fixed random_state so repeated runs build the same tree.
tree_model = DecisionTreeClassifier(random_state=0).fit(X_train, y_train)
tree_preds = tree_model.predict(X_test)
# Confusion-matrix rows are the true classes, columns the predicted classes.
print("Confusion Matrix:\n", confusion_matrix(y_test, tree_preds))
print("Classification Report:\n", classification_report(y_test, tree_preds))
Confusion Matrix:
[[ 20 14]
[ 11 155]]
Classification Report:
precision recall f1-score support
fail 0.65 0.59 0.62 34
pass 0.92 0.93 0.93 166
accuracy 0.88 200
macro avg 0.78 0.76 0.77 200
weighted avg 0.87 0.88 0.87 200
9️⃣ Support Vector Machine (SVM)
Classification
In [14]: from sklearn.svm import SVC
# Support vector classifier with library defaults on the standardized split.
svm_model = SVC().fit(X_train, y_train)
svm_preds = svm_model.predict(X_test)
# Confusion-matrix rows are the true classes, columns the predicted classes.
print("Confusion Matrix:\n", confusion_matrix(y_test, svm_preds))
print("Classification Report:\n", classification_report(y_test, svm_preds))
Confusion Matrix:
[[ 16 18]
[ 2 164]]
Classification Report:
precision recall f1-score support
fail 0.89 0.47 0.62 34
pass 0.90 0.99 0.94 166
accuracy 0.90 200
macro avg 0.89 0.73 0.78 200
weighted avg 0.90 0.90 0.89 200
🔟 Artificial Neural Network (ANN)
In [15]: from sklearn.neural_network import MLPClassifier
ann_model = MLPClassifier(hidden_layer_sizes=(10,), max_iter=300, random_sta
# Train the MLP and predict on the held-out split (fit returns the estimator, so
# predict is chained onto it).
# NOTE(review): the recorded run emitted a ConvergenceWarning after 300 iterations;
# consider raising max_iter where ann_model is constructed.
ann_preds = ann_model.fit(X_train, y_train).predict(X_test)
# Confusion-matrix rows are the true classes, columns the predicted classes.
print("Confusion Matrix:\n", confusion_matrix(y_test, ann_preds))
print("Classification Report:\n", classification_report(y_test, ann_preds))
Confusion Matrix:
[[ 19 15]
[ 8 158]]
Classification Report:
precision recall f1-score support
fail 0.70 0.56 0.62 34
pass 0.91 0.95 0.93 166
accuracy 0.89 200
macro avg 0.81 0.76 0.78 200
weighted avg 0.88 0.89 0.88 200
/usr/local/lib/python3.11/dist-packages/sklearn/neural_network/_multilayer_p
erceptron.py:686: ConvergenceWarning: Stochastic Optimizer: Maximum iteratio
ns (300) reached and the optimization hasn't converged yet.
warnings.warn(
📊 Clustering Models
1️⃣1️⃣ K-Means Clustering
In [16]: from sklearn.cluster import KMeans
# Cluster on the feature columns only: the math score and the pass/fail label
# derived from it are both removed.
X = df.drop(['math score', 'target'], axis=1)
# Clustering uses no train/test split, so the scaler is refit on every row. This
# replaces the statistics the shared `scaler` learned from the classification split.
X_scaled = scaler.fit_transform(X)
# n_init is pinned to 10, the default in the scikit-learn version that produced the
# recorded output. This keeps the result stable once the default becomes 'auto'
# (1.4) and removes the FutureWarning.
kmeans = KMeans(n_clusters=2, n_init=10, random_state=0).fit(X_scaled)
# Silhouette ranges from -1 to 1; values near 0 indicate overlapping clusters.
print("KMeans Silhouette Score:", silhouette_score(X_scaled, kmeans.labels_))
KMeans Silhouette Score: 0.10611407723279058
/usr/local/lib/python3.11/dist-packages/sklearn/cluster/_kmeans.py:870: Futu
reWarning: The default value of `n_init` will change from 10 to 'auto' in 1.
4. Set the value of `n_init` explicitly to suppress the warning
warnings.warn(
1️⃣2️⃣ Hierarchical Clustering
In [17]: from sklearn.cluster import AgglomerativeClustering
# Agglomerative (bottom-up) clustering into two groups on the standardized feature
# matrix built for K-Means.
hclust = AgglomerativeClustering(n_clusters=2).fit(X_scaled)
# Score the assignment with the same silhouette metric used for K-Means.
print("Hierarchical Clustering Silhouette Score:", silhouette_score(X_scaled, hclust.labels_))
Hierarchical Clustering Silhouette Score: 0.15767520836587193
📌 Conclusion
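This notebook demonstrates the application of various machine learning
algorithms to predict student performance. We implemented multiple regression
models, classification algorithms, and clustering techniques to analyze the
dataset and evaluate the effectiveness of each model.
In the recorded run, the multiple, Ridge and Lasso regressions performed best
(R² ≈ 0.88, versus 0.68 for the single-feature model). Logistic regression and
SVM reached the highest classification accuracy (0.90), although every
classifier recalled the minority "fail" class poorly (0.26–0.68). Both
clustering methods produced low silhouette scores (0.11 and 0.16), which
indicates weakly separated clusters.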
Thank you for reviewing this analysis!
Submitted by:
Raunak Kumar Singh
University Name:
Atmaram Sanatan Dharma College
Course Name:
B.Sc. (Hons) Computer Science
Date of Submission:
15-04-2025
This notebook was converted with convert.ploomber.io