KEMBAR78
Ml-Exp-1 - Jupyter Notebook | PDF | Statistical Analysis | Teaching Mathematics
0% found this document useful (0 votes)
71 views8 pages

Ml-Exp-1 - Jupyter Notebook

Uploaded by

34 Neha Galande
Copyright
© © All Rights Reserved
We take content rights seriously. If you suspect this is your content, claim it here.
Available Formats
Download as PDF, TXT or read online on Scribd
0% found this document useful (0 votes)
71 views8 pages

Ml-Exp-1 - Jupyter Notebook

Uploaded by

34 Neha Galande
Copyright
© © All Rights Reserved
We take content rights seriously. If you suspect this is your content, claim it here.
Available Formats
Download as PDF, TXT or read online on Scribd
You are on page 1/ 8

10/20/24, 10:42 PM ml-exp-1 - Jupyter Notebook

In [1]:  # Import necessary libraries


import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
import seaborn as sns
import matplotlib.pyplot as plt
from scipy import stats
import os

localhost:8888/notebooks/Downloads/ml-exp-1.ipynb 1/8
10/20/24, 10:42 PM ml-exp-1 - Jupyter Notebook

In [2]:  # Check files in the dataset directory


directory_path = '/kaggle/input/uber-fares-dataset'
files = os.listdir(directory_path)
print("Files in the directory:", files)

# Assuming the correct file is found, set the file path accordingly
file_path = '/kaggle/input/uber-fares-dataset/uber.csv' # Update if th

# 1. Pre-process the dataset
def preprocess_data(file_path):
# Load the dataset
df = pd.read_csv(file_path)

# Convert 'pickup_datetime' to datetime
df['pickup_datetime'] = pd.to_datetime(df['pickup_datetime'], error

# Drop rows with missing or NaN values
df.dropna(inplace=True)

# Extract important date and time features from 'pickup_datetime'
df['hour'] = df['pickup_datetime'].dt.hour
df['day'] = df['pickup_datetime'].dt.day
df['month'] = df['pickup_datetime'].dt.month
df['year'] = df['pickup_datetime'].dt.year
df['day_of_week'] = df['pickup_datetime'].dt.dayofweek

# Drop unnecessary columns
df = df.drop(columns=['pickup_datetime', 'key'])

return df

# Call the preprocess function
df_processed = preprocess_data(file_path)

# Display the first few rows of the processed DataFrame
print(df_processed.head()) # Or use df_processed.head() to display in

localhost:8888/notebooks/Downloads/ml-exp-1.ipynb 2/8
10/20/24, 10:42 PM ml-exp-1 - Jupyter Notebook

Files in the directory: ['uber.csv']


Unnamed: 0 fare_amount pickup_longitude pickup_latitude \
0 24238194 7.5 -73.999817 40.738354
1 27835199 7.7 -73.994355 40.728225
2 44984355 12.9 -74.005043 40.740770
3 25894730 5.3 -73.976124 40.790844
4 17610152 16.0 -73.925023 40.744085

dropoff_longitude dropoff_latitude passenger_count hour day m


onth \
0 -73.999512 40.723217 1 19 7
5
1 -73.994710 40.750325 1 20 17
7
2 -73.962565 40.772647 1 21 24
8
3 -73.965316 40.803349 3 8 26
6
4 -73.973082 40.761247 5 17 28
8

year day_of_week
0 2015 3
1 2009 4
2 2009 0
3 2009 4
4 2014 3

localhost:8888/notebooks/Downloads/ml-exp-1.ipynb 3/8
10/20/24, 10:42 PM ml-exp-1 - Jupyter Notebook

In [3]:  # 2. Identify outliers using Z-score


def identify_outliers(df):
# Calculate Z-scores for the 'fare_amount' column
z_scores = np.abs(stats.zscore(df['fare_amount']))
# Identify outliers
outliers = df[z_scores > 3]
print(f"Number of outliers: {outliers.shape[0]}")
return outliers

# Assuming df_processed is already defined and contains the preprocesse
outliers_found = identify_outliers(df_processed)
print(outliers_found)

localhost:8888/notebooks/Downloads/ml-exp-1.ipynb 4/8
10/20/24, 10:42 PM ml-exp-1 - Jupyter Notebook

Number of outliers: 5450


Unnamed: 0 fare_amount pickup_longitude pickup_latitude \
48 22405517 56.80 -73.993498 40.764686
84 25485719 49.57 -73.975058 40.788820
104 46435788 43.00 -73.862701 40.768959
204 6403066 45.00 -73.971663 40.757812
226 24085207 49.80 -73.992122 40.748577
... ... ... ... ...
199914 17686068 57.33 -73.776778 40.645427
199972 31236221 45.00 -73.786833 40.639842
199976 1780041 49.70 -73.978225 40.783318
199977 21117828 43.50 -73.996671 40.737483
199982 13096190 57.33 -73.969204 40.754771

dropoff_longitude dropoff_latitude passenger_count hour d


ay \
48 -73.993498 40.764686 1 22
3
84 -73.975058 40.788820 1 10
7
104 -73.999092 40.741829 2 18
15
204 -73.789273 40.641790 1 7
13
226 -73.806072 40.665272 1 17
29
... ... ... ... ...
...
199914 -73.948572 40.789107 5 5
14
199972 -74.001215 40.722429 1 13
20
199976 -73.700963 40.705852 1 23
18
199977 -73.867758 40.897563 1 21
20
199982 -73.790351 40.643802 1 11
6

month year day_of_week


48 1 2013 3
84 8 2009 4
104 5 2015 4
204 11 2010 5
226 7 2012 6
... ... ... ...
199914 11 2014 4
199972 8 2010 4
199976 10 2011 1
199977 11 2012 1
199982 8 2014 2

[5450 rows x 12 columns]

localhost:8888/notebooks/Downloads/ml-exp-1.ipynb 5/8
10/20/24, 10:42 PM ml-exp-1 - Jupyter Notebook

In [8]:  def plot_outliers(df, outliers):


plt.figure(figsize=(10, 6))

# Scatter plot of all points
plt.scatter(df.index, df['fare_amount'], label="Normal Data", alpha

# Scatter plot of outliers
plt.scatter(outliers.index, outliers['fare_amount'], color='red', l

# Add labels and title
plt.title('Fare Amount with Outliers Highlighted')
plt.xlabel('Index')
plt.ylabel('Fare Amount')

# Add legend
plt.legend()

# Show the plot
plt.show()
plot_outliers(df_processed,outliers_found)

localhost:8888/notebooks/Downloads/ml-exp-1.ipynb 6/8
10/20/24, 10:42 PM ml-exp-1 - Jupyter Notebook

In [4]:  # 3. Check the correlation


def plot_correlation(df):
plt.figure(figsize=(10,6))
sns.heatmap(df.corr(), annot=True, cmap='coolwarm', fmt='.2f')
plt.title('Correlation Matrix')
plt.show()

print(plot_correlation(df_processed))

None

localhost:8888/notebooks/Downloads/ml-exp-1.ipynb 7/8
10/20/24, 10:42 PM ml-exp-1 - Jupyter Notebook

In [5]:  #4
def implement_models(df):
X = df.drop(columns='fare_amount')
y = df['fare_amount']

# Split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size

# Linear Regression
lin_reg = LinearRegression()
lin_reg.fit(X_train, y_train)
y_pred_lin = lin_reg.predict(X_test)

# Random Forest Regression
rf_reg = RandomForestRegressor(n_estimators=100, random_state=42)
rf_reg.fit(X_train, y_train)
y_pred_rf = rf_reg.predict(X_test)

return y_test, y_pred_lin, y_pred_rf, lin_reg, rf_reg

# Call implement_models and unpack the results
y_test, y_pred_lin, y_pred_rf, lin_reg, rf_reg = implement_models(df_pr

#5 Now you can evaluate the models
def evaluate_models(y_test, y_pred_lin, y_pred_rf):
# Evaluation metrics
lin_rmse = np.sqrt(mean_squared_error(y_test, y_pred_lin))
lin_r2 = r2_score(y_test, y_pred_lin)

rf_rmse = np.sqrt(mean_squared_error(y_test, y_pred_rf))
rf_r2 = r2_score(y_test, y_pred_rf)

print(f"Linear Regression RMSE: {lin_rmse}, R2: {lin_r2}")
print(f"Random Forest Regression RMSE: {rf_rmse}, R2: {rf_r2}")

# Call the evaluation function
evaluate_models(y_test, y_pred_lin, y_pred_rf)

Linear Regression RMSE: 10.113708439348311, R2: 0.016696500909208156


Random Forest Regression RMSE: 5.356064199348529, R2: 0.7242228535774
927

In [ ]:  ​

localhost:8888/notebooks/Downloads/ml-exp-1.ipynb 8/8

You might also like