Project 05/04/25, 5:47 PM
In [1]: import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
In [2]: data = pd.read_csv('/Users/devanshdeepgupta/Downloads/winequality-red.csv')
data.head()
Out[2]: free total
fixed volatile citric residual
chlorides sulfur sulfur density pH
acidity acidity acid sugar
dioxide dioxide
0 7.4 0.70 0.00 1.9 0.076 11.0 34.0 0.9978 3.51
1 7.8 0.88 0.00 2.6 0.098 25.0 67.0 0.9968 3.20
2 7.8 0.76 0.04 2.3 0.092 15.0 54.0 0.9970 3.26
3 11.2 0.28 0.56 1.9 0.075 17.0 60.0 0.9980 3.16
4 7.4 0.70 0.00 1.9 0.076 11.0 34.0 0.9978 3.51
In [3]: print(data.dtypes)
fixed acidity float64
volatile acidity float64
citric acid float64
residual sugar float64
chlorides float64
free sulfur dioxide float64
total sulfur dioxide float64
density float64
pH float64
sulphates float64
alcohol float64
quality int64
dtype: object
Feature-target separation
In [5]: target_col = 'quality'
features = data.drop(columns=[target_col]).values
target = data[target_col].values
Train-test split (80-20)
In [7]: split_index = int(0.8 * len(features))
X_train, X_test = features[:split_index], features[split_index:]
y_train, y_test = target[:split_index], target[split_index:]
file:///Users/devanshdeepgupta/Downloads/Project.html Page 1 of 4
Project 05/04/25, 5:47 PM
Standardize features
In [9]: mean_vals = X_train.mean(axis=0)
std_vals = X_train.std(axis=0)
std_vals[std_vals == 0] = 1e-6 # Avoid divide-by-zero
X_train_norm = (X_train - mean_vals) / std_vals
X_test_norm = (X_test - mean_vals) / std_vals
X_test_bias = np.hstack((np.ones((X_test.shape[0], 1)), X_test_norm))
Linear Regression
In [11]: def train_linear(X, y):
X_bias = np.hstack((np.ones((X.shape[0], 1)), X))
return np.linalg.pinv(X_bias.T @ X_bias) @ X_bias.T @ y
theta_lin = train_linear(X_train_norm, y_train)
y_pred_lin = X_test_bias @ theta_lin
mse_lin = np.mean((y_test - y_pred_lin) ** 2)
r2_lin = 1 - np.sum((y_test - y_pred_lin) ** 2) / np.sum((y_test - y_test.me
Binary classification for quality >= 6
In [13]: y_binary = (target >= 6).astype(int)
y_train_bin = y_binary[:split_index]
y_test_bin = y_binary[split_index:]
Logistic Regression
In [15]: def sigmoid(z):
return 1 / (1 + np.exp(-z))
In [16]: def train_logistic(X, y, lr=0.01, epochs=1000):
weights = np.zeros(X.shape[1] + 1)
X_bias = np.hstack((np.ones((X.shape[0], 1)), X))
for _ in range(epochs):
preds = sigmoid(X_bias @ weights)
grad = X_bias.T @ (preds - y) / len(y)
weights -= lr * grad
return weights
In [17]: theta_log = train_logistic(X_train_norm, y_train_bin)
y_pred_log = sigmoid(X_test_bias @ theta_log) >= 0.5
acc_log = np.mean(y_pred_log == y_test_bin)
KNN Classifier
file:///Users/devanshdeepgupta/Downloads/Project.html Page 2 of 4
Project 05/04/25, 5:47 PM
In [19]: def knn_predict(X_train, y_train, X_test, k=5):
y_pred = []
for x in X_test:
dists = np.linalg.norm(X_train - x, axis=1)
top_k = np.argsort(dists)[:k]
top_labels = y_train[top_k]
y_pred.append(np.bincount(top_labels).argmax())
return np.array(y_pred)
In [20]: y_pred_knn = knn_predict(X_train_norm, y_train_bin, X_test_norm, k=5)
acc_knn = np.mean(y_pred_knn == y_test_bin)
Naive Bayes Classifier
In [22]: def train_naive_bayes(X, y):
classes = np.unique(y)
priors = {c: np.mean(y == c) for c in classes}
means = {c: X[y == c].mean(axis=0) for c in classes}
vars_ = {c: np.where(X[y == c].var(axis=0) == 0, 1e-6, X[y == c].var(axi
return classes, priors, means, vars_
In [23]: def predict_naive_bayes(X, classes, priors, means, vars_):
preds = []
for row in X:
scores = {}
for c in classes:
likelihood = -0.5 * np.sum(((row - means[c]) ** 2) / vars_[c])
scores[c] = np.log(priors[c]) + likelihood
preds.append(max(scores, key=scores.get))
return np.array(preds)
In [24]: classes, priors, means, vars_ = train_naive_bayes(X_train_norm, y_train_bin)
y_pred_nb = predict_naive_bayes(X_test_norm, classes, priors, means, vars_)
acc_nb = np.mean(y_pred_nb == y_test_bin)
Multiple Linear Regression (same as
Linear)
In [26]: theta_multi = train_linear(X_train_norm, y_train)
y_pred_multi = X_test_bias @ theta_multi
mse_multi = np.mean((y_test - y_pred_multi) ** 2)
r2_multi = 1 - np.sum((y_test - y_pred_multi) ** 2) / np.sum((y_test - y_tes
Plotting performance
In [28]: models = ["Linear Reg.", "Multi Linear Reg.", "Logistic Reg.", "KNN", "Naive
accuracy = [None, None, acc_log, acc_knn, acc_nb]
file:///Users/devanshdeepgupta/Downloads/Project.html Page 3 of 4
Project 05/04/25, 5:47 PM
r2_scores = [r2_lin, r2_multi, None, None, None]
In [29]: plt.figure(figsize=(10, 5))
plt.bar(models, [a if a else 0 for a in accuracy], label='Accuracy', color='
plt.bar(models, [r if r else 0 for r in r2_scores], label='R² Score', color=
plt.title("Model Performance Comparison")
plt.ylabel("Score")
plt.legend()
plt.tight_layout()
plt.show()
Summary table
In [31]: summary = pd.DataFrame({
"Model": models,
"Accuracy": accuracy,
"MSE": [mse_lin, mse_multi, None, None, None],
"R² Score": r2_scores
})
print(summary)
Model Accuracy MSE R² Score
0 Linear Reg. NaN 0.431522 0.287476
1 Multi Linear Reg. NaN 0.431522 0.287476
2 Logistic Reg. 0.731250 NaN NaN
3 KNN 0.646875 NaN NaN
4 Naive Bayes 0.728125 NaN NaN
In [ ]:
file:///Users/devanshdeepgupta/Downloads/Project.html Page 4 of 4