# Import essential libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from keras.models import Sequential
from keras.layers import Dense, Input
# Download the dataset directly from the UCI repository
url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/00350/default%20of%20credit%20card%20clients.xls'
df = pd.read_excel(url, header=1)  # Row 0 is a banner row; row 1 holds the actual column headers
# Display the first few rows of the DataFrame
print(df.head())
# Encode the categorical columns (SEX, EDUCATION, and MARRIAGE) as integers for model training.
# SEX: 1 = male, 2 = female
df['SEX'] = df['SEX'].map({1: 0, 2: 1})
# EDUCATION: 1 = graduate school, 2 = university, 3 = high school; the dataset also contains
# undocumented codes (0, 4, 5, 6), which we fold into a single 'other' category
df['EDUCATION'] = df['EDUCATION'].map({1: 0, 2: 1, 3: 2}).fillna(3).astype(int)
# MARRIAGE: 1 = married, 2 = single; codes 0 and 3 are other/unknown
df['MARRIAGE'] = df['MARRIAGE'].map({1: 0, 2: 1}).fillna(2).astype(int)
# Split data into features (X) and target variable (y)
X = df.drop(['ID', 'default payment next month'], axis=1) # Drop ID and target column
y = df['default payment next month'] # The target variable (whether the user defaulted)
# Split data into train and test sets (80% train, 20% test), stratified to preserve the class ratio
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
# Standardize the features, fitting the scaler on the training set only to avoid test-set leakage
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)
   ID  LIMIT_BAL  SEX  EDUCATION  MARRIAGE  AGE  PAY_0  PAY_2  PAY_3  PAY_4  \
0   1      20000    2          2         1   24      2      2     -1     -1
1   2     120000    2          2         2   26     -1      2      0      0
2   3      90000    2          2         2   34      0      0      0      0
3   4      50000    2          2         1   37      0      0      0      0
4   5      50000    1          2         1   57     -1      0     -1      0

   ...  BILL_AMT4  BILL_AMT5  BILL_AMT6  PAY_AMT1  PAY_AMT2  PAY_AMT3  \
0  ...          0          0          0         0       689         0
1  ...       3272       3455       3261         0      1000      1000
2  ...      14331      14948      15549      1518      1500      1000
3  ...      28314      28959      29547      2000      2019      1200
4  ...      20940      19146      19131      2000     36681     10000

   PAY_AMT4  PAY_AMT5  PAY_AMT6  default payment next month
0         0         0         0                           1
1      1000         0      2000                           1
2      1000      1000      5000                           0
3      1100      1069      1000                           0
4      9000       689       679                           0

[5 rows x 25 columns]
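# Before modeling, it is worth checking the class balance: in the full UCI dataset only
# about 22% of clients default, so raw accuracy can look better than a model really is.
# A minimal check, assuming the X/y split above has already run:
print(y.value_counts(normalize=True))  # fraction of non-defaults (0) vs. defaults (1)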
# Logistic Regression Model
logreg = LogisticRegression(max_iter=1000)  # raise the lbfgs iteration cap to ensure convergence
logreg.fit(X_train, y_train)
# Predicting the test set results
y_pred_logreg = logreg.predict(X_test)
# Evaluate the Logistic Regression model
print("Logistic Regression Accuracy: ", accuracy_score(y_test, y_pred_logreg))
print("Logistic Regression Classification Report:\n", classification_report(y_test, y_pred_logreg))
# Confusion Matrix for Logistic Regression
sns.heatmap(confusion_matrix(y_test, y_pred_logreg), annot=True, fmt='d', cmap='Blues', xticklabels=['No Default', 'Default'], yticklabels=['No Default', 'Default'])
plt.title('Logistic Regression Confusion Matrix')
plt.show()
Logistic Regression Accuracy: 0.47
Logistic Regression Classification Report:
               precision    recall  f1-score   support

           0        0.47      0.33      0.39       102
           1        0.47      0.61      0.53        98

    accuracy                            0.47       200
   macro avg        0.47      0.47      0.46       200
weighted avg        0.47      0.47      0.46       200
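# Accuracy at a fixed 0.5 threshold hides the precision/recall trade-off, so a
# threshold-independent metric is a useful companion. A minimal sketch, assuming the
# fitted logreg model from the cell above:
from sklearn.metrics import roc_auc_score
y_prob_logreg = logreg.predict_proba(X_test)[:, 1]  # predicted probability of default
print("Logistic Regression ROC AUC: ", roc_auc_score(y_test, y_prob_logreg))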
# Decision Tree Model
dtree = DecisionTreeClassifier(random_state=42)
dtree.fit(X_train, y_train)
# Predicting the test set results
y_pred_dtree = dtree.predict(X_test)
# Evaluate the Decision Tree model
print("Decision Tree Accuracy: ", accuracy_score(y_test, y_pred_dtree))
print("Decision Tree Classification Report:\n", classification_report(y_test, y_pred_dtree))
# Confusion Matrix for Decision Tree
sns.heatmap(confusion_matrix(y_test, y_pred_dtree), annot=True, fmt='d', cmap='Blues', xticklabels=['No Default', 'Default'], yticklabels=['No Default', 'Default'])
plt.title('Decision Tree Confusion Matrix')
plt.show()
Decision Tree Accuracy: 0.495
Decision Tree Classification Report:
               precision    recall  f1-score   support

           0        0.51      0.46      0.48       102
           1        0.49      0.53      0.51        98

    accuracy                            0.49       200
   macro avg        0.50      0.50      0.49       200
weighted avg        0.50      0.49      0.49       200
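# An unconstrained decision tree usually memorizes the training set, which helps explain
# near-chance test accuracy. A quick, illustrative check of how depth affects the
# train/test gap (a hypothetical tuning loop, not part of the original run):
for depth in [3, 5, 10, None]:
    tree = DecisionTreeClassifier(max_depth=depth, random_state=42)
    tree.fit(X_train, y_train)
    print(f"max_depth={depth}: train={tree.score(X_train, y_train):.2f}, test={tree.score(X_test, y_test):.2f}")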
# Random Forest Model
rf = RandomForestClassifier(n_estimators=100, random_state=42)
rf.fit(X_train, y_train)
# Predicting the test set results
y_pred_rf = rf.predict(X_test)
# Evaluate the Random Forest model
print("Random Forest Accuracy: ", accuracy_score(y_test, y_pred_rf))
print("Random Forest Classification Report:\n", classification_report(y_test, y_pred_rf))
# Confusion Matrix for Random Forest
sns.heatmap(confusion_matrix(y_test, y_pred_rf), annot=True, fmt='d', cmap='Blues', xticklabels=['No Default', 'Default'], yticklabels=['No Default', 'Default'])
plt.title('Random Forest Confusion Matrix')
plt.show()
Random Forest Accuracy: 0.505
Random Forest Classification Report:
               precision    recall  f1-score   support

           0        0.52      0.36      0.43       102
           1        0.50      0.65      0.56        98

    accuracy                            0.51       200
   macro avg        0.51      0.51      0.50       200
weighted avg        0.51      0.51      0.49       200
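# Random forests expose per-feature importances, which can hint at which repayment-history
# columns drive the predictions. A minimal sketch using the fitted rf model and the column
# names from the original DataFrame X:
importances = pd.Series(rf.feature_importances_, index=X.columns).sort_values(ascending=False)
print(importances.head(10))  # the ten most influential features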
# SVM Model
svm = SVC(kernel='linear', random_state=42)
svm.fit(X_train, y_train)
# Predicting the test set results
y_pred_svm = svm.predict(X_test)
# Evaluate the SVM model
print("SVM Accuracy: ", accuracy_score(y_test, y_pred_svm))
print("SVM Classification Report:\n", classification_report(y_test, y_pred_svm))
# Confusion Matrix for SVM
sns.heatmap(confusion_matrix(y_test, y_pred_svm), annot=True, fmt='d', cmap='Blues', xticklabels=['No Default', 'Default'], yticklabels=['No Default', 'Default'])
plt.title('SVM Confusion Matrix')
plt.show()
SVM Accuracy: 0.465
SVM Classification Report:
               precision    recall  f1-score   support

           0        0.47      0.37      0.42       102
           1        0.46      0.56      0.51        98

    accuracy                            0.47       200
   macro avg        0.47      0.47      0.46       200
weighted avg        0.47      0.47      0.46       200
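# A linear-kernel SVC trains in roughly quadratic time in the number of samples, which
# gets slow on the full 30,000-row dataset. A sketch of a faster linear alternative,
# assuming the same scaled splits (LinearSVC optimizes a slightly different objective,
# so scores may differ a little):
from sklearn.svm import LinearSVC
svm_fast = LinearSVC(random_state=42, max_iter=5000)
svm_fast.fit(X_train, y_train)
print("LinearSVC Accuracy: ", svm_fast.score(X_test, y_test))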
# Neural Network Model
ann = Sequential()
# Input layer: declare the feature dimension explicitly (input_dim on Dense is deprecated)
ann.add(Input(shape=(X_train.shape[1],)))
# Hidden layers
ann.add(Dense(units=64, activation='relu'))
ann.add(Dense(units=32, activation='relu'))
# Output Layer
ann.add(Dense(units=1, activation='sigmoid')) # Sigmoid for binary classification
# Compile the ANN model
ann.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
# Train the model
ann.fit(X_train, y_train, epochs=20, batch_size=32, verbose=1)
# Predicting the test set results
y_pred_ann = (ann.predict(X_test) > 0.5).astype(int).ravel()  # threshold the sigmoid outputs at 0.5
# Evaluate the ANN model
print("Neural Network Accuracy: ", accuracy_score(y_test, y_pred_ann))
print("Neural Network Classification Report:\n", classification_report(y_test, y_pred_ann))
# Confusion Matrix for Neural Network
sns.heatmap(confusion_matrix(y_test, y_pred_ann), annot=True, fmt='d', cmap='Blues', xticklabels=['No Default', 'Default'], yticklabels=['No Default', 'Default'])
plt.title('Neural Network Confusion Matrix')
plt.show()
Epoch 1/20
25/25 ━━━━━━━━━━━━━━━━━━━━ 6s 7ms/step - accuracy: 0.5537 - loss: 0.7256
Epoch 2/20
25/25 ━━━━━━━━━━━━━━━━━━━━ 0s 13ms/step - accuracy: 0.5628 - loss: 0.6892
Epoch 3/20
25/25 ━━━━━━━━━━━━━━━━━━━━ 1s 20ms/step - accuracy: 0.5913 - loss: 0.6700
Epoch 4/20
25/25 ━━━━━━━━━━━━━━━━━━━━ 1s 24ms/step - accuracy: 0.6065 - loss: 0.6442
Epoch 5/20
25/25 ━━━━━━━━━━━━━━━━━━━━ 1s 18ms/step - accuracy: 0.6539 - loss: 0.6362
Epoch 6/20
25/25 ━━━━━━━━━━━━━━━━━━━━ 1s 15ms/step - accuracy: 0.6412 - loss: 0.6340
Epoch 7/20
25/25 ━━━━━━━━━━━━━━━━━━━━ 0s 10ms/step - accuracy: 0.6932 - loss: 0.6016
Epoch 8/20
25/25 ━━━━━━━━━━━━━━━━━━━━ 0s 4ms/step - accuracy: 0.7007 - loss: 0.6024
Epoch 9/20
25/25 ━━━━━━━━━━━━━━━━━━━━ 0s 7ms/step - accuracy: 0.7290 - loss: 0.5780
Epoch 10/20
25/25 ━━━━━━━━━━━━━━━━━━━━ 0s 8ms/step - accuracy: 0.6993 - loss: 0.5842
Epoch 11/20
25/25 ━━━━━━━━━━━━━━━━━━━━ 0s 7ms/step - accuracy: 0.7370 - loss: 0.5601
Epoch 12/20
25/25 ━━━━━━━━━━━━━━━━━━━━ 0s 6ms/step - accuracy: 0.7592 - loss: 0.5485
Epoch 13/20
25/25 ━━━━━━━━━━━━━━━━━━━━ 0s 6ms/step - accuracy: 0.7631 - loss: 0.5508
Epoch 14/20
25/25 ━━━━━━━━━━━━━━━━━━━━ 0s 6ms/step - accuracy: 0.7523 - loss: 0.5306
Epoch 15/20
25/25 ━━━━━━━━━━━━━━━━━━━━ 0s 7ms/step - accuracy: 0.7679 - loss: 0.5294
Epoch 16/20
25/25 ━━━━━━━━━━━━━━━━━━━━ 0s 7ms/step - accuracy: 0.8134 - loss: 0.4902
Epoch 17/20
25/25 ━━━━━━━━━━━━━━━━━━━━ 0s 7ms/step - accuracy: 0.8027 - loss: 0.4838
Epoch 18/20
25/25 ━━━━━━━━━━━━━━━━━━━━ 0s 10ms/step - accuracy: 0.8028 - loss: 0.4866
Epoch 19/20
25/25 ━━━━━━━━━━━━━━━━━━━━ 0s 7ms/step - accuracy: 0.8393 - loss: 0.4661
Epoch 20/20
25/25 ━━━━━━━━━━━━━━━━━━━━ 0s 5ms/step - accuracy: 0.8603 - loss: 0.4406
7/7 ━━━━━━━━━━━━━━━━━━━━ 0s 10ms/step
Neural Network Accuracy: 0.495
Neural Network Classification Report:
               precision    recall  f1-score   support

           0        0.50      0.50      0.50       102
           1        0.48      0.49      0.49        98

    accuracy                            0.49       200
   macro avg        0.49      0.49      0.49       200
weighted avg        0.50      0.49      0.50       200
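# A side-by-side view makes the final comparison easier to read. A minimal sketch,
# assuming the five prediction arrays from the cells above are still in scope:
accuracies = {
    'LogReg': accuracy_score(y_test, y_pred_logreg),
    'Tree': accuracy_score(y_test, y_pred_dtree),
    'Forest': accuracy_score(y_test, y_pred_rf),
    'SVM': accuracy_score(y_test, y_pred_svm),
    'ANN': accuracy_score(y_test, y_pred_ann),
}
plt.bar(list(accuracies.keys()), list(accuracies.values()), color='steelblue')
plt.ylabel('Test accuracy')
plt.title('Model comparison on the credit default test set')
plt.show()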