Import Libraries
# Import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import mean_squared_error, r2_score
1. Data/Domain Understanding and Exploration
1.1. Meaning and Type of Features
# Load the dataset and inspect the features
# Load the dataset
data = pd.read_csv("adverts.csv")
# Display the first few rows and column information
data.head()
(output: first five rows of the 12-column dataframe: public_reference, mileage, reg_code, standard_colour, standard_make, standard_model, vehicle_condition, year_of_registration, price, body_type, crossover_car_and_van, fuel_type. Row 0 is a NEW Volvo XC90 with NaN reg_code and year_of_registration; rows 1-4 are USED vehicles, including a Land Rover Range Rover. The rightmost columns are truncated in this export.)
data.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 402005 entries, 0 to 402004
Data columns (total 12 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 public_reference 402005 non-null int64
1 mileage 401878 non-null float64
2 reg_code 370148 non-null object
3 standard_colour 396627 non-null object
4 standard_make 402005 non-null object
5 standard_model 402005 non-null object
6 vehicle_condition 402005 non-null object
7 year_of_registration 368694 non-null float64
8 price 402005 non-null int64
9 body_type 401168 non-null object
10 crossover_car_and_van 402005 non-null bool
11 fuel_type 401404 non-null object
dtypes: bool(1), float64(2), int64(2), object(7)
memory usage: 34.1+ MB
data.describe()
public_reference mileage year_of_registration price
count 4.020050e+05 401878.000000 368694.000000 4.020050e+05
mean 2.020071e+14 37743.595656 2015.006206 1.734197e+04
std 1.691662e+10 34831.724018 7.962667 4.643746e+04
min 2.013072e+14 0.000000 999.000000 1.200000e+02
25% 2.020090e+14 10481.000000 2013.000000 7.495000e+03
50% 2.020093e+14 28629.500000 2016.000000 1.260000e+04
75% 2.020102e+14 56875.750000 2018.000000 2.000000e+04
max 2.020110e+14 999999.000000 2020.000000 9.999999e+06
data.shape
(402005, 12)
data.isnull().sum()
public_reference 0
mileage 127
reg_code 31857
standard_colour 5378
standard_make 0
standard_model 0
vehicle_condition 0
year_of_registration 33311
price 0
body_type 837
crossover_car_and_van 0
fuel_type 601
dtype: int64
Analysis of Distributions
# Analyze distributions of numerical features
numerical_features = ['mileage', 'year_of_registration', 'price']  # body_type is categorical, so it is excluded here
data[numerical_features].hist(bins=30, figsize=(10, 6))
plt.suptitle('Histograms of Numerical Features')
plt.show()
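The describe() output above shows price running from 120 to nearly 10 million, so a linear-scale histogram pushes almost all the mass into the first bin. A minimal sketch of a log-scale view of price (using the imports already loaded; illustrative only):
# Plot the log-transformed price; np.log1p handles the heavy right skew
plt.figure(figsize=(8, 4))
plt.hist(np.log1p(data['price']), bins=50)
plt.title('Distribution of log(1 + price)')
plt.xlabel('log(1 + price)')
plt.ylabel('Count')
plt.show()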
1.2. Analysis of Predictive Power of Features
# Correlation matrix
correlation_matrix = data[numerical_features].corr()
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm')
plt.title('Correlation Matrix')
plt.show()
# Scatter plots for numerical features vs price
for feature in ['mileage', 'year_of_registration']:
    plt.figure(figsize=(8, 4))
    sns.scatterplot(x=data[feature], y=data['price'], color='green')
    plt.title(feature + ' vs Price')
    plt.xlabel(feature)
    plt.ylabel('Price')
    plt.show()
1.3. Data Processing for Data Exploration and Visualisation
# Check for missing values
print(data.isnull().sum())
# Dealing with missing values: drop any rows with a missing target
# (price has no missing values here, but this guards the pipeline)
data.dropna(subset=['price'], inplace=True)
public_reference 0
mileage 127
reg_code 31857
standard_colour 5378
standard_make 0
standard_model 0
vehicle_condition 0
year_of_registration 33311
price 0
body_type 837
crossover_car_and_van 0
fuel_type 601
dtype: int64
# Preprocessing the data
from sklearn.preprocessing import LabelEncoder
# Handle missing values: fill numeric gaps with a 0 sentinel and categorical gaps with 'Unknown'
# (note: a year_of_registration of 0 is not a real year and will distort any age-based feature downstream)
data.fillna({'mileage': 0, 'year_of_registration': 0, 'price': 0, 'standard_colour': 'Unknown',
             'standard_make': 'Unknown', 'standard_model': 'Unknown', 'vehicle_condition': 'Unknown',
             'body_type': 'Unknown', 'fuel_type': 'Unknown'}, inplace=True)
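Filling year_of_registration with 0 keeps the rows but injects an impossible year. As an illustrative alternative only (not used by the rest of this notebook), median imputation on a fresh copy keeps downstream age features plausible:
# Sketch: median imputation instead of the 0 sentinel, on a separate copy
alt = pd.read_csv("adverts.csv")
alt['year_of_registration'] = alt['year_of_registration'].fillna(alt['year_of_registration'].median())
alt['mileage'] = alt['mileage'].fillna(alt['mileage'].median())
print(alt[['mileage', 'year_of_registration']].isnull().sum())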
# Encode categorical variables
label_encoders = {}
categorical_features = ['standard_colour', 'standard_make', 'standard_model', 'vehicle_condition', 'body_type', 'fuel_type']
for feature in categorical_features:
    le = LabelEncoder()
    data[feature] = le.fit_transform(data[feature])
    label_encoders[feature] = le
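LabelEncoder maps categories to arbitrary integers, which implies an ordering that tree models tolerate but that linear and distance-based models can misread. A hedged sketch of the one-hot alternative on a fresh copy (illustrative; this notebook keeps the label-encoded columns):
# Sketch: one-hot encode the low-cardinality categoricals; standard_make and
# standard_model have hundreds of levels each, so they are left out here
raw = pd.read_csv("adverts.csv")
onehot_demo = pd.get_dummies(raw[['vehicle_condition', 'fuel_type', 'body_type']],
                             prefix=['cond', 'fuel', 'body'])
print(onehot_demo.shape)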
# Visualizing relationships
plt.figure(figsize=(10, 6))
sns.scatterplot(x='mileage', y='price', data=data)
plt.title('Mileage vs Price')
plt.show()
data.isnull().sum()
public_reference 0
mileage 0
reg_code 31857
standard_colour 0
standard_make 0
standard_model 0
vehicle_condition 0
year_of_registration 0
price 0
body_type 0
crossover_car_and_van 0
fuel_type 0
dtype: int64
2. Data Processing for Machine Learning
2.1. Dealing with Missing Values, Outliers, and Noise
# Data Processing for Machine Learning
# Check for missing values and handle them
missing_values = data.isnull().sum()
print("Missing values in each column:")
print(missing_values)
Missing values in each column:
public_reference 0
mileage 0
reg_code 31857
standard_colour 0
standard_make 0
standard_model 0
vehicle_condition 0
year_of_registration 0
price 0
body_type 0
crossover_car_and_van 0
fuel_type 0
dtype: int64
# Verify preprocessing
print("Data preprocessing complete. Here's the head of the processed dataframe:")
data.head()
Data preprocessing complete. Here's the head of the processed dataframe:
(output: first five rows after preprocessing; the categorical columns now hold integer codes, e.g. standard_colour is 8 rather than Grey in row 0, while reg_code is untouched and still NaN for the NEW vehicle. The rightmost columns are truncated in this export.)
# Side-by-side boxplots of price before and after outlier removal
plt.figure(figsize=(12, 5))
# Subplot for before outlier removal
plt.subplot(1, 2, 1)
sns.boxplot(y=data['price'])
plt.title('Price Distribution (Before Outlier Removal)')
plt.ylabel('Price')
# Identifying and removing outliers using quantile method
q_low = data["price"].quantile(0.01)
q_hi = data["price"].quantile(0.99)
data_filtered = data[(data["price"] < q_hi) & (data["price"] > q_low)]
# Subplot for after outlier removal
plt.subplot(1, 2, 2)
sns.boxplot(y=data_filtered['price'])
plt.title('Price Distribution (After Outlier Removal)')
plt.ylabel('Price')
# Adjust layout and show the plot
plt.tight_layout()
plt.show()
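The 1%/99% quantile filter above is one reasonable choice; Tukey's IQR rule is a common alternative, sketched here for comparison only (not used downstream):
# Sketch: IQR-based outlier rule for price
q1, q3 = data['price'].quantile([0.25, 0.75])
iqr = q3 - q1
iqr_filtered = data[(data['price'] >= q1 - 1.5 * iqr) & (data['price'] <= q3 + 1.5 * iqr)]
print("IQR rule keeps", len(iqr_filtered), "of", len(data), "rows")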
# Verify the shape of the dataset after removing outliers
# (data itself is unchanged at (402005, 12); data_filtered is what later cells use)
print("Shape of dataset after removing outliers:", data_filtered.shape)
2.2. Feature Engineering, Data Transformations, Feature Selection
# Creating a new feature: vehicle age
# Work on an explicit copy so the assignment does not trigger pandas'
# SettingWithCopyWarning (data_filtered is a slice of data)
data_filtered = data_filtered.copy()
data_filtered['vehicle_age'] = 2024 - data_filtered['year_of_registration']
# Note: rows whose missing year_of_registration was filled with 0 get an
# implausible vehicle_age of 2024
# Selecting features for the model
features = ['mileage', 'vehicle_age']
X = data_filtered[features]
y = data_filtered['price']
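Hard-coding 2024 ties the feature to when the notebook was run, and the 0-sentinel years become an age of 2024. A sketch that instead anchors age to the newest year present in the data and masks the sentinel (illustrative only):
# Sketch: data-derived reference year; sentinel years become NaN rather than huge ages
ref_year = data_filtered.loc[data_filtered['year_of_registration'] > 0, 'year_of_registration'].max()
age_demo = (ref_year - data_filtered['year_of_registration']).where(data_filtered['year_of_registration'] > 0)
print(age_demo.describe())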
3. Model Building
3.1. Algorithm Selection, Model Instantiation and Configuration
# Train-validation-test split: 70% train, 15% validation, 15% test
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.3, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)
## Algorithm Selection
# Linear Regression
lin_reg = LinearRegression()
lin_reg.fit(X_train, y_train)
# Decision Tree
tree_reg = DecisionTreeRegressor()
tree_reg.fit(X_train, y_train)
# k-Nearest Neighbors
knn_reg = KNeighborsRegressor()
knn_reg.fit(X_train, y_train)
KNeighborsRegressor()
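KNN is distance-based, so features on very different scales (mileage in the tens of thousands versus vehicle_age in years) let mileage dominate the metric. A minimal sketch of a scaled KNN pipeline (illustrative; the evaluation below uses the unscaled knn_reg fitted above):
# Sketch: standardize features so both contribute to KNN distances
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
knn_scaled = make_pipeline(StandardScaler(), KNeighborsRegressor())
knn_scaled.fit(X_train, y_train)
print("Scaled KNN R² on validation:", knn_scaled.score(X_val, y_val))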
3.2. Grid Search, and Model Ranking and Selection
## 3.2. Grid Search for Hyperparameter Tuning
# Grid search for Decision Tree
param_grid = {'max_depth': [None, 5, 10, 20]}
grid_search = GridSearchCV(tree_reg, param_grid, cv=5)
grid_search.fit(X_train, y_train)
best_tree = grid_search.best_estimator_
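The grid above tunes only max_depth and uses GridSearchCV's default scorer for regressors (R²). A slightly wider, explicitly scored search might look like this (parameter values are illustrative, not tuned):
# Sketch: tune depth and leaf size together, scoring by negative MSE
param_grid_wide = {'max_depth': [5, 10, 20, None], 'min_samples_leaf': [1, 10, 50]}
grid_wide = GridSearchCV(DecisionTreeRegressor(random_state=42), param_grid_wide,
                         cv=5, scoring='neg_mean_squared_error')
grid_wide.fit(X_train, y_train)
print("Best params:", grid_wide.best_params_)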
4. Model Evaluation and Analysis
4.1. Coarse-Grained Evaluation/Analysis
models = {
"Linear Regression": lin_reg,
"Best Decision Tree": best_tree,
"KNN": knn_reg
}
results = {}
for model_name, model in models.items():
    y_pred = model.predict(X_val)
    mse = mean_squared_error(y_val, y_pred)
    r2 = r2_score(y_val, y_pred)
    results[model_name] = {'MSE': mse, 'R²': r2}
    print(f"{model_name} - MSE: {mse:.2f}, R²: {r2:.2f}")
Linear Regression - MSE: 115341217.98, R²: 0.28
Best Decision Tree - MSE: 101868407.49, R²: 0.36
KNN - MSE: 123313134.73, R²: 0.23
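MSE values in the hundreds of millions are hard to read at a glance; RMSE is in the same units as price and easier to interpret. A quick conversion from the results dict above:
# RMSE = sqrt(MSE), expressed in price units
for model_name, metrics in results.items():
    print(f"{model_name} - RMSE: {np.sqrt(metrics['MSE']):.0f}")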
# Converting results to DataFrame for easier plotting
results_df = pd.DataFrame(results).T
# Plotting the MSE comparison
plt.figure(figsize=(10, 6))
sns.barplot(x=results_df.index, y='MSE', hue=results_df.index, data=results_df, palette='viridis', legend=False)
plt.title('Model Comparison: Mean Squared Error (MSE)')
plt.ylabel('Mean Squared Error (MSE)')
plt.xlabel('Models')
plt.xticks(rotation=15)
plt.show()
# Plotting the R² comparison
plt.figure(figsize=(10, 6))
ax = sns.barplot(x=results_df.index, y='R²', hue=results_df.index, data=results_df, palette='Blues', legend=False)
plt.title('Model Comparison: R² Score')
plt.ylabel('R² Score')
plt.xlabel('Models')
plt.xticks(rotation=15)
# Adding values on top of the bars
for p in ax.patches:
    ax.annotate(f'{p.get_height():.2f}',
                (p.get_x() + p.get_width() / 2., p.get_height()),
                ha='center', va='center',
                fontsize=12, color='black',
                xytext=(0, 5), textcoords='offset points')
plt.show()
4.2. Feature Importance
# Feature Importance For Decision Tree
feature_importances = best_tree.feature_importances_
sns.barplot(x=features, y=feature_importances)
plt.title('Feature Importance for Best Decision Tree')
plt.show()
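Impurity-based importances from a single tree can be unstable; permutation importance on held-out data is a standard cross-check (a sketch using sklearn's inspection module on the validation split):
# Sketch: permutation importance of the two features on the validation set
from sklearn.inspection import permutation_importance
perm = permutation_importance(best_tree, X_val, y_val, n_repeats=5, random_state=42)
for name, imp in zip(features, perm.importances_mean):
    print(f"{name}: {imp:.3f}")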
4.3. Fine-Grained Evaluation (e.g., with instance-level errors)
# Plotting actual vs predicted prices for the best model
y_test_pred = best_tree.predict(X_test)
plt.figure(figsize=(10, 6))
sns.scatterplot(x=y_test, y=y_test_pred)
plt.xlabel('Actual Price')
plt.ylabel('Predicted Price')
plt.title('Actual vs Predicted Prices')
plt.show()
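A y = x reference line makes over- and under-prediction visible at a glance, and sorting residuals surfaces the worst instance-level errors directly (a sketch building on y_test_pred above):
# Sketch: reference diagonal plus the largest per-instance errors
plt.figure(figsize=(10, 6))
sns.scatterplot(x=y_test, y=y_test_pred, alpha=0.3)
lims = [min(y_test.min(), y_test_pred.min()), max(y_test.max(), y_test_pred.max())]
plt.plot(lims, lims, 'r--', label='Perfect prediction')
plt.xlabel('Actual Price')
plt.ylabel('Predicted Price')
plt.legend()
plt.show()
residuals = y_test - y_test_pred
print(residuals.abs().sort_values(ascending=False).head())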