Machine Learning Models:
Supervised Learning: Classification
Step1: Read the data.
Step2: Inspect the data and impute missing values accordingly.
# The column list comes from the test set; the share of missing values
# reported for each column is computed on the training set.
columns = test_data.columns.to_list()
n_train_rows = train_data.shape[0]
for col in columns:
    missing_share = train_data[col].isna().sum() / n_train_rows
    # NOTE: the printed value is a fraction of rows, not multiplied by 100.
    print("Missing values % of", col, missing_share)
def missing_val_treatment(df, drop_threshold=0.7):
    """Impute or drop the columns of ``df`` in place, based on their NaN share.

    For every column of ``df``:

    * no missing values: left untouched;
    * missing share >= ``drop_threshold``: the column is dropped;
    * otherwise numeric and datetime-like columns are filled with their
      median, and every other column (object, string, category, ...) with
      its most frequent value.

    Args:
        df: pandas DataFrame to clean. It is modified in place.
        drop_threshold: fraction of missing rows (0-1) at or above which a
            column is dropped instead of imputed.

    Returns:
        The same ``df`` object, so calls can be chained.
    """
    import pandas as pd

    # With no rows there is nothing to impute and the share is undefined.
    if df.shape[0] == 0:
        return df

    # Iterate over a snapshot of the frame's own columns: columns may be
    # dropped while looping, and no module-level column list is required.
    for col in list(df.columns):
        series = df[col]
        miss_perc = series.isna().mean()
        if miss_perc == 0:
            continue

        if miss_perc >= drop_threshold:
            df.drop(columns=col, inplace=True)
            continue

        uses_median = (
            pd.api.types.is_numeric_dtype(series)
            or pd.api.types.is_datetime64_any_dtype(series)
            or pd.api.types.is_timedelta64_dtype(series)
        )
        if uses_median:
            fill_value = series.median()
        else:
            modes = series.mode()
            if modes.empty:
                # Column is entirely NaN but was kept (threshold above 1).
                continue
            fill_value = modes[0]

        # Assign back instead of fillna(inplace=True) on the selected column,
        # which does not update the parent frame under copy-on-write.
        df[col] = series.fillna(fill_value)

    return df
If rows that contain NA in a given column need to be dropped:
# Remove, in place, every row whose value in `column_name` is missing.
df.dropna(axis=0, subset=[column_name], inplace=True)
Step3: Check whether any column contains outliers and handle them
accordingly.
# Draw box plots for the columns listed in num_cols to spot outliers.
train_data.boxplot(column=num_cols)
Step4: Check for irrelevant columns such as Name or Address, which contain a lot of
text and are mostly unique across rows.
# Drop the free-text columns from both frames, in place.
train_data.drop(columns=['Name', 'Ticket'], inplace=True)
test_data.drop(columns=['Name', 'Ticket'], inplace=True)
Step5: Look for date-time columns. Here’s how you can deal with them:
# Parse the timestamp strings with an explicit format. Because of
# errors='coerce', values that do not match become NaT instead of raising.
df['ScheduledDay'] = pd.to_datetime(
    df['ScheduledDay'],
    format='%Y-%m-%dT%H:%M:%SZ',
    errors='coerce',
)
Filtering with respect to date columns:
Date Feature Engineering:
# Split the parsed timestamp into calendar components, one column each.
df['ScheduledDay_year'] = df['ScheduledDay'].dt.year
df['ScheduledDay_month'] = df['ScheduledDay'].dt.month
# Series.dt.week was removed in pandas 2.0; isocalendar() provides the
# ISO week number instead.
df['ScheduledDay_week'] = df['ScheduledDay'].dt.isocalendar().week
df['ScheduledDay_day'] = df['ScheduledDay'].dt.day
df['ScheduledDay_hour'] = df['ScheduledDay'].dt.hour
df['ScheduledDay_minute'] = df['ScheduledDay'].dt.minute
df['ScheduledDay_dayofweek'] = df['ScheduledDay'].dt.dayofweek
Step6: One-Hot Encoding
# One-hot encode the categorical columns; drop_first=True drops the first
# level of each category.
train_enc = pd.get_dummies(train_data, drop_first=True)
test_enc = pd.get_dummies(test_data, drop_first=True)

# Encoding the two frames independently can produce different dummy columns
# when a category appears in only one of them. Align the test frame to the
# training feature columns (everything except the 'Survived' target): columns
# missing from the test set are added as all-zero, extra ones are discarded.
feature_cols = train_enc.columns.drop('Survived', errors='ignore')
test_enc = test_enc.reindex(columns=feature_cols, fill_value=0)
Step7: Data Normalization
from sklearn.preprocessing import StandardScaler

sc = StandardScaler()
for col in num_cols:
    # Only numeric columns can be standardized; skip text and datetime columns.
    if pd.api.types.is_numeric_dtype(test_data[col]):
        # Learn mean/std from the training column only ...
        train_enc[col] = sc.fit_transform(train_enc[[col]]).ravel()
        # ... and reuse those statistics for the test column, so both sets
        # are on the same scale and no test-set information leaks in.
        test_enc[col] = sc.transform(test_enc[[col]]).ravel()

# Histograms of the training columns after scaling.
train_enc.hist()
Step8: Check for class imbalance in the target variable and, if present,
apply a sampling technique such as SMOTE.
Step9: Apply ML models to the cleaned data:
from sklearn.model_selection import train_test_split

# Hold out 10% of the encoded training frame for validation.
# NOTE: this rebinds train_data to the 90% training portion.
train_data, val_data = train_test_split(
    train_enc,
    test_size=0.1,
    random_state=0,
)
print(train_data.shape)
print(val_data.shape)

# Separate the features from the 'Survived' target for both portions.
y_train = train_data['Survived']
y_test = val_data['Survived']
X_train = train_data.drop(columns='Survived')
X_test = val_data.drop(columns='Survived')
OR
# Alternative: separate features and target first, then split both together.
# Start from the encoded frame (train_enc): train_data is either the raw,
# un-encoded frame or has already been rebound to a split portion.
X = train_enc.drop('Survived', axis=1)
y = train_enc['Survived']
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.1, random_state=0
)
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# Fit a logistic regression with default settings on the training portion.
lr = LogisticRegression()
model = lr.fit(X_train, y_train)

# Accuracy on the data the model was fitted on.
y_train_pred = model.predict(X_train)
train_accuracy = accuracy_score(y_train, y_train_pred)
print("Training accuracy:", train_accuracy)

# Accuracy on the held-out portion.
y_pred = model.predict(X_test)
test_accuracy = accuracy_score(y_test, y_pred)
print("Testing accuracy:", test_accuracy)
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier

# Random forest with 80 trees; oob_score=True makes the out-of-bag
# score available after fitting.
model2 = RandomForestClassifier(
    n_estimators=80,
    oob_score=True,
    random_state=0,
)
model2 = model2.fit(X_train, y_train)

# Notebook-style inspection: feature names, their importances, OOB score.
print(X_train.columns)
model2.feature_importances_
model2.oob_score_
# Sweep the number of trees and record the out-of-bag (OOB) score of each fit.
oobs = []
w_values = list(range(20, 300, 10))
for w in w_values:
    model2 = RandomForestClassifier(
        n_estimators=w,
        oob_score=True,
        random_state=0,
    )
    model2.fit(X_train, y_train)
    # Take the score from the forest fitted in this iteration.
    oobs.append(model2.oob_score_)

# Tree count with the highest OOB score (first one wins on ties).
max_oob_index = oobs.index(max(oobs))
best_w = w_values[max_oob_index]
best_w
# Refit the forest with a fixed 280 trees.
# NOTE(review): 280 is hard-coded; presumably it is the best_w value found
# by the sweep - confirm, or pass best_w instead.
model2 = RandomForestClassifier(
    n_estimators=280,
    oob_score=True,
    random_state=0,
)
model2.fit(X_train, y_train)
model2.oob_score_
# AdaBoost with 100 estimators, scored on the held-out portion.
model3 = AdaBoostClassifier(random_state=0, n_estimators=100)
model3.fit(X_train, y_train)
model3.score(X_test, y_test)
# Predict on the encoded test set with the random forest and write the
# submission file: one row per passenger, id plus predicted label.
y_pred2 = model2.predict(test_enc)
result = pd.DataFrame(
    {
        'PassengerId': test_enc['PassengerId'],
        'Survived': y_pred2,
    }
)
result
result.to_csv("gender_submission.csv", index=False)
Supervised Learning: Regression
Import required modules:
from sklearn.model_selection import train_test_split

# 'MedHouseVal' is the regression target; every other column is a feature.
y = train_data['MedHouseVal']
X = train_data.drop(columns="MedHouseVal")

# No test_size is given, so train_test_split uses its default proportions.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)
from sklearn.linear_model import LinearRegression
import statsmodels.api as sma

# Add an explicit constant column to both feature matrices so the OLS
# model includes an intercept term.
X_train = sma.add_constant(X_train)
X_test = sma.add_constant(X_test)

# Fit ordinary least squares; `model` ends up bound to the fitted results.
model = sma.OLS(y_train, X_train).fit()
model.summary()

y_pred = model.predict(X_test)
from sklearn.metrics import r2_score, mean_squared_error

# Root-mean-squared error and R^2 on the held-out portion.
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
print("RMSE:", rmse)
r2 = r2_score(y_test, y_pred)
print("R2:", r2)

# The unseen test set needs the same constant column before predicting.
test_data = sma.add_constant(test_data)
y_pred2 = model.predict(test_data)
Unsupervised Learning: K-Means Clustering
Following are the steps to perform K-Means:
Step1: Perform EDA
Step2: Check for random
from sklearn.cluster import KMeans

# Initial K-Means run with 3 clusters and k-means++ initialisation;
# random_state is fixed so the result is reproducible.
kmeans = KMeans(n_clusters=3, init="k-means++", random_state=0)
kmeans = kmeans.fit(scaled_data)
import matplotlib.pyplot as plt

# Elbow method: fit K-Means for k = 1..29 and record the within-cluster
# sum of squares (the fitted model's inertia_) for each k.
wcss = []
for i in range(1, 30):
    kmeans = KMeans(
        n_clusters=i,
        init='k-means++',
        max_iter=300,
        n_init=10,
        random_state=0,
    )
    kmeans.fit(scaled_data)
    wcss.append(kmeans.inertia_)

# Plot WCSS against the number of clusters.
plt.plot(range(1, 30), wcss)
plt.title('The elbow method')
plt.xlabel('Number of clusters')
plt.ylabel('WCSS')
plt.show()
Step3: Identify the point (number of clusters) beyond which the WCSS curve levels off. If it is not
identifiable (the elbow method fails), use the silhouette score to choose the
number of clusters.
Evaluate the range of cluster counts within which you are unsure where the curve
levels off. Let’s say you’re unsure between k=3 and k=13.
from sklearn.metrics import silhouette_score

# Print the silhouette score for every candidate k from 3 to 13 inclusive.
# random_state is fixed so repeated runs give the same scores.
for i in range(3, 14):
    labels = KMeans(
        n_clusters=i,
        init='k-means++',
        random_state=0,
    ).fit(scaled_data).labels_
    score = silhouette_score(scaled_data, labels)
    print(f"SC for k = {i} is {score}")
Now identify at what value of k the silhouette score is highest.
# Final model with the chosen number of clusters (3 here); fit_predict
# fits it and returns the cluster label of each row in one step.
km = KMeans(
    n_clusters=3,
    init='k-means++',
    random_state=0,
)
y_means = km.fit_predict(scaled_data)