10/20/24, 10:42 PM ml-exp-1 - Jupyter Notebook
In [1]: # Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
import seaborn as sns
import matplotlib.pyplot as plt
from scipy import stats
import os
localhost:8888/notebooks/Downloads/ml-exp-1.ipynb 1/8
10/20/24, 10:42 PM ml-exp-1 - Jupyter Notebook
In [2]: # Check files in the dataset directory
directory_path = '/kaggle/input/uber-fares-dataset'
# List the directory so the CSV's actual file name is visible in the cell output
files = os.listdir(directory_path)
print("Files in the directory:", files)
# Build the CSV path from directory_path so the dataset location is written
# only once; update the file name if the listing above shows a different one
file_path = os.path.join(directory_path, 'uber.csv')
# 1. Pre-process the dataset
def preprocess_data(file_path):
    """Load the ride CSV and derive calendar features from the pickup time.

    Parameters
    ----------
    file_path : str
        Path of the CSV file to read. The file must contain the columns
        'pickup_datetime' and 'key'.

    Returns
    -------
    pandas.DataFrame
        The rows that have no missing value in any column, with the extra
        columns 'hour', 'day', 'month', 'year' and 'day_of_week' derived from
        'pickup_datetime', and with 'pickup_datetime' and 'key' removed.
        All other columns of the CSV are passed through unchanged.
    """
    # Load the dataset
    df = pd.read_csv(file_path)

    # Convert 'pickup_datetime' to datetime. With errors='coerce' an
    # unparseable timestamp becomes NaT, so the dropna() below removes that row.
    # NOTE(review): the exported source line was cut off after "error";
    # errors='coerce' is the assumed original argument -- confirm.
    df['pickup_datetime'] = pd.to_datetime(df['pickup_datetime'], errors='coerce')

    # Drop rows with missing or NaN values (non-mutating, so re-running the
    # cell never depends on a previously modified frame)
    df = df.dropna()

    # Extract date and time features from 'pickup_datetime'
    pickup = df['pickup_datetime'].dt
    df = df.assign(
        hour=pickup.hour,
        day=pickup.day,
        month=pickup.month,
        year=pickup.year,
        day_of_week=pickup.dayofweek,
    )

    # Drop the raw timestamp and the 'key' column.
    # NOTE(review): the 'Unnamed: 0' column seen in the notebook output is kept
    # and therefore ends up among the model features -- confirm that is intended.
    return df.drop(columns=['pickup_datetime', 'key'])
# Call the preprocess function
# Run the preprocessing once and keep the result for the later cells
df_processed = preprocess_data(file_path)
# Display the first few rows of the processed DataFrame
print(df_processed.head())  # Or end the cell with a bare df_processed.head() to get the notebook's table rendering
localhost:8888/notebooks/Downloads/ml-exp-1.ipynb 2/8
10/20/24, 10:42 PM ml-exp-1 - Jupyter Notebook
Files in the directory: ['uber.csv']
Unnamed: 0 fare_amount pickup_longitude pickup_latitude \
0 24238194 7.5 -73.999817 40.738354
1 27835199 7.7 -73.994355 40.728225
2 44984355 12.9 -74.005043 40.740770
3 25894730 5.3 -73.976124 40.790844
4 17610152 16.0 -73.925023 40.744085
dropoff_longitude dropoff_latitude passenger_count hour day m
onth \
0 -73.999512 40.723217 1 19 7
5
1 -73.994710 40.750325 1 20 17
7
2 -73.962565 40.772647 1 21 24
8
3 -73.965316 40.803349 3 8 26
6
4 -73.973082 40.761247 5 17 28
8
year day_of_week
0 2015 3
1 2009 4
2 2009 0
3 2009 4
4 2014 3
localhost:8888/notebooks/Downloads/ml-exp-1.ipynb 3/8
10/20/24, 10:42 PM ml-exp-1 - Jupyter Notebook
In [3]: # 2. Identify outliers using Z-score
def identify_outliers(df, threshold=3):
    """Return the rows whose 'fare_amount' is a Z-score outlier.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a numeric 'fare_amount' column.
    threshold : float, optional
        A row is reported when the absolute Z-score of its fare is strictly
        greater than this value. Defaults to 3.

    Returns
    -------
    pandas.DataFrame
        The subset of ``df`` (original index and all columns kept) whose
        fare exceeds the threshold. The number of such rows is also printed.
    """
    # Work on a plain float array so the boolean mask below is purely
    # positional, whatever index df carries
    fares = df['fare_amount'].to_numpy(dtype=float)

    # Absolute Z-score of every fare
    z_scores = np.abs(stats.zscore(fares))

    # Keep only the rows beyond the threshold
    outliers = df[z_scores > threshold]
    print(f"Number of outliers: {outliers.shape[0]}")
    return outliers
# Assuming df_processed is already defined (by the preprocessing cell) and contains the preprocessed data
outliers_found = identify_outliers(df_processed)
print(outliers_found)
localhost:8888/notebooks/Downloads/ml-exp-1.ipynb 4/8
10/20/24, 10:42 PM ml-exp-1 - Jupyter Notebook
Number of outliers: 5450
Unnamed: 0 fare_amount pickup_longitude pickup_latitude \
48 22405517 56.80 -73.993498 40.764686
84 25485719 49.57 -73.975058 40.788820
104 46435788 43.00 -73.862701 40.768959
204 6403066 45.00 -73.971663 40.757812
226 24085207 49.80 -73.992122 40.748577
... ... ... ... ...
199914 17686068 57.33 -73.776778 40.645427
199972 31236221 45.00 -73.786833 40.639842
199976 1780041 49.70 -73.978225 40.783318
199977 21117828 43.50 -73.996671 40.737483
199982 13096190 57.33 -73.969204 40.754771
dropoff_longitude dropoff_latitude passenger_count hour d
ay \
48 -73.993498 40.764686 1 22
3
84 -73.975058 40.788820 1 10
7
104 -73.999092 40.741829 2 18
15
204 -73.789273 40.641790 1 7
13
226 -73.806072 40.665272 1 17
29
... ... ... ... ...
...
199914 -73.948572 40.789107 5 5
14
199972 -74.001215 40.722429 1 13
20
199976 -73.700963 40.705852 1 23
18
199977 -73.867758 40.897563 1 21
20
199982 -73.790351 40.643802 1 11
6
month year day_of_week
48 1 2013 3
84 8 2009 4
104 5 2015 4
204 11 2010 5
226 7 2012 6
... ... ... ...
199914 11 2014 4
199972 8 2010 4
199976 10 2011 1
199977 11 2012 1
199982 8 2014 2
[5450 rows x 12 columns]
localhost:8888/notebooks/Downloads/ml-exp-1.ipynb 5/8
10/20/24, 10:42 PM ml-exp-1 - Jupyter Notebook
# (notebook cell In [8])
def plot_outliers(df, outliers):
    """Scatter-plot every fare against its row index and highlight outliers.

    Parameters
    ----------
    df : pandas.DataFrame
        Full data set; must contain a 'fare_amount' column.
    outliers : pandas.DataFrame
        Rows to highlight in red; must also contain 'fare_amount'. Its index
        values are used as x positions, so they should come from ``df``.

    Returns
    -------
    None
        The figure is shown with ``plt.show()``.
    """
    plt.figure(figsize=(10, 6))

    # Scatter plot of all points.
    # NOTE(review): the exported source line was cut off after "alpha";
    # 0.5 is an assumed value -- confirm against the original notebook.
    plt.scatter(df.index, df['fare_amount'], label="Normal Data", alpha=0.5)

    # Scatter plot of outliers, drawn second so they sit on top.
    # NOTE(review): the exported source line was cut off after "color='red', l";
    # the label text is assumed -- confirm against the original notebook.
    plt.scatter(outliers.index, outliers['fare_amount'], color='red', label="Outliers")

    # Add labels and title
    plt.title('Fare Amount with Outliers Highlighted')
    plt.xlabel('Index')
    plt.ylabel('Fare Amount')

    # Add legend (uses the label= values given to the two scatter calls)
    plt.legend()

    # Show the plot
    plt.show()
# Plot all fares and highlight the rows returned by identify_outliers
plot_outliers(df_processed,outliers_found)
localhost:8888/notebooks/Downloads/ml-exp-1.ipynb 6/8
10/20/24, 10:42 PM ml-exp-1 - Jupyter Notebook
In [4]: # 3. Check the correlation
def plot_correlation(df):
    """Show an annotated heatmap of the pairwise correlations in ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Data to correlate. Only its numeric columns are used.

    Returns
    -------
    None
        The figure is shown with ``plt.show()``; nothing is returned, so the
        call should not be wrapped in ``print``.
    """
    plt.figure(figsize=(10,6))
    # Restrict to numeric columns first: DataFrame.corr() raises on
    # non-numeric columns in pandas >= 2.0
    numeric_df = df.select_dtypes(include='number')
    sns.heatmap(numeric_df.corr(), annot=True, cmap='coolwarm', fmt='.2f')
    plt.title('Correlation Matrix')
    plt.show()
# plot_correlation draws the figure itself and returns None, so call it
# directly instead of printing its return value
plot_correlation(df_processed)
None
localhost:8888/notebooks/Downloads/ml-exp-1.ipynb 7/8
10/20/24, 10:42 PM ml-exp-1 - Jupyter Notebook
In [5]: #4
def implement_models(df):
    """Fit a linear regression and a random forest to predict 'fare_amount'.

    Every column of ``df`` except 'fare_amount' is used as a feature.

    Parameters
    ----------
    df : pandas.DataFrame
        Modelling frame containing 'fare_amount' and the feature columns.

    Returns
    -------
    tuple
        ``(y_test, y_pred_lin, y_pred_rf, lin_reg, rf_reg)``: the held-out
        targets, the predictions of each model on the held-out features, and
        the two fitted estimators.
    """
    X = df.drop(columns='fare_amount')
    y = df['fare_amount']

    # Split the data.
    # NOTE(review): the exported source line was cut off after "test_size";
    # test_size=0.2 and random_state=42 are assumed values (42 matches the
    # seed used for the random forest below) -- confirm against the original.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42
    )

    # Linear Regression
    lin_reg = LinearRegression()
    lin_reg.fit(X_train, y_train)
    y_pred_lin = lin_reg.predict(X_test)

    # Random Forest Regression (fixed seed so repeated runs give the same forest)
    rf_reg = RandomForestRegressor(n_estimators=100, random_state=42)
    rf_reg.fit(X_train, y_train)
    y_pred_rf = rf_reg.predict(X_test)

    return y_test, y_pred_lin, y_pred_rf, lin_reg, rf_reg
# Call implement_models and unpack the results
# (the exported line was cut off at "df_pr"; df_processed is the frame built by the preprocessing cell)
y_test, y_pred_lin, y_pred_rf, lin_reg, rf_reg = implement_models(df_processed)
#5 Now you can evaluate the models
def evaluate_models(y_test, y_pred_lin, y_pred_rf):
    """Print RMSE and R2 of both models' predictions against ``y_test``.

    Parameters
    ----------
    y_test : array-like
        True target values of the held-out set.
    y_pred_lin : array-like
        Linear-regression predictions for the same rows.
    y_pred_rf : array-like
        Random-forest predictions for the same rows.

    Returns
    -------
    None
        The metrics are only printed, one line per model.
    """
    def _rmse_and_r2(y_pred):
        # RMSE is the square root of the mean squared error
        return np.sqrt(mean_squared_error(y_test, y_pred)), r2_score(y_test, y_pred)

    # Compute every metric before printing anything
    lin_rmse, lin_r2 = _rmse_and_r2(y_pred_lin)
    rf_rmse, rf_r2 = _rmse_and_r2(y_pred_rf)

    print(f"Linear Regression RMSE: {lin_rmse}, R2: {lin_r2}")
    print(f"Random Forest Regression RMSE: {rf_rmse}, R2: {rf_r2}")
# Call the evaluation function on the held-out targets and both models' predictions
evaluate_models(y_test, y_pred_lin, y_pred_rf)
Linear Regression RMSE: 10.113708439348311, R2: 0.016696500909208156
Random Forest Regression RMSE: 5.356064199348529, R2: 0.7242228535774
927
In [ ]:
localhost:8888/notebooks/Downloads/ml-exp-1.ipynb 8/8