Data Mining Lab Practical File
Submitted to: Dr. Neeraj Gupta
Submitted by: Gaurav Tripathi
1901840003
B.Sc (H) Data Science (Semester - IV)
29 Jun 2021
1. Linear Regression
Code for Salary :
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
linearreg=LinearRegression()
x=np.array([1.1,1.3,1.5,2,2.2,2.9,3,3.2,3.2,3.7,3.9,4,4,4.1,4.5,4.9,5.1,5.3,5.9,6,6.8,7.1,7.9,8.2,8.7,9,9.5,9.6,10.3,10.5,11.2,11.5,12.3,12.9,13.5])
y=np.array([39343,46205,37731,43525,39891,56642,60150,54445,64445,5718,63218,55794,56957,57081,61111,67938,66029,83088,81363,93940,91738,98273,101302,113812,109431,105582,116969,112635,122391,121872,127345,126756,128765,135675,139465])
x=x.reshape(-1,1)
linearreg.fit(x,y)
y_pred=linearreg.predict(x)
plt.scatter(x,y)
plt.plot(x,y_pred,color='red')
plt.show()
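The fitted line can also be inspected directly from the estimator. A minimal sketch (the printed values depend on the data above):
# Slope and intercept of the fitted line salary = m*experience + c
print("slope:", linearreg.coef_[0])
print("intercept:", linearreg.intercept_)
# R^2: fraction of salary variance explained by experience
print("R^2 score:", linearreg.score(x, y))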
Code for Insurance:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
linearreg=LinearRegression()
x=np.array([108,19,13,124,40,57,23,14,45,10,5,48,11,23,7,2,24,6,3,23,6,9,9,3,29,7,4,20,7,4,0,25,6,5,22,11,61,12,4,16,13,60,41,37,55,41,11,27,8,3,17,13,13,15,8,29,30,24,9,31,14,53,26])
y=np.array([392.5,46.2,15.7,422.2,119.4,170.9,56.9,77.5,214,65.3,20.9,248.1,23.5,39.6,48.8,6.6,134.9,50.9,4.4,113,14.8,48.7,52.1,13.2,103.9,77.5,11.8,98.1,27.9,38.1,0,69.2,14.6,40.3,161.5,57.2,217.6,58.1,12.6,59.6,89.9,202.4,181.3,152.8,162.8,73.4,21.3,92.6,76.1,39.9,142.1,93,31.9,32.1,55.6,133.3,194.5,137.9,87.4,209.8,95.5,244.6,187.5])
x=x.reshape(-1,1)
linearreg.fit(x,y)
LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)
y_pred=linearreg.predict(x)
plt.scatter(x,y)
plt.plot(x,y_pred,color='red')
plt.show()
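The fitted model can then score unseen claim counts. A small sketch with hypothetical x values (not from the dataset):
# Predict total payment for a few hypothetical numbers of claims
new_x = np.array([20, 50, 80]).reshape(-1, 1)
print(linearreg.predict(new_x))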
2. Multiple Linear Regression
Code for mlr:
import pandas as pd
import matplotlib.pyplot as plt
data=pd.read_csv("mlr.csv")
data.head()
   X1     X2     X3     X4     X5     X6
0  0.283  0.144  0.049  0.012  0.013  0.086
1  0.276  0.125  0.039  0.013  0.002  0.062
2  0.281  0.141  0.045  0.021  0.013  0.074
3  0.328  0.189  0.043  0.001  0.030  0.032
4  0.290  0.161  0.044  0.011  0.070  0.076
from sklearn import linear_model
import numpy as np
data.columns = ['x1','x2','x3','x4','x5','x6']
X = data.iloc[:,data.columns != "x1"]
Y = data.iloc[:, 0]
X
   x2     x3     x4     x5     x6
0  0.144  0.049  0.012  0.013  0.086
1  0.125  0.039  0.013  0.002  0.062
2  0.141  0.045  0.021  0.013  0.074
3  0.189  0.043  0.001  0.030  0.032
4  0.161  0.044  0.011  0.070  0.076
5  0.186  0.047  0.018  0.050  0.007
6  0.106  0.036  0.008  0.012  0.095
7  0.117  0.030  0.006  0.003  0.145
8  0.174  0.050  0.008  0.061  0.112
Y.head()
0 0.283
1 0.276
2 0.281
3 0.328
4 0.290
Name: x1, dtype: float64
plt.scatter(X.x2,X.x3,marker="*",color="orange")
plt.scatter(X.x2,X.x4,marker="*",color="green")
plt.scatter(X.x2,X.x5,marker="*",color="red")
plt.scatter(X.x2,X.x6,marker="*",color="orange")
plt.scatter(X.x3,X.x4,marker="*",color="green")
plt.scatter(X.x3,X.x5,marker="*",color="red")
from sklearn.model_selection import train_test_split
from sklearn import linear_model
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=0)
model = linear_model.LinearRegression()
model.fit(X_train, Y_train)
LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)
y_pred = model.predict(X_test)
y_pred
array([0.23386571, 0.31718764, 0.33311851, 0.29940205, 0.27633325,
0.24228869, 0.23924197, 0.29401786, 0.29011389])
from sklearn import metrics
print("mean square error ",metrics.mean_squared_error(Y_test, y_pred))
mean square error 0.0001948634357163518
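The contribution of each predictor can be read from the fitted model; a minimal sketch:
# One coefficient per predictor x2..x6, plus the shared intercept
for name, coef in zip(X.columns, model.coef_):
    print(name, coef)
print("intercept:", model.intercept_)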
Code for Insurance:
import pandas as pd
import numpy as np
import io
import matplotlib.pyplot as plt
from google.colab import files
uploaded = files.upload()
df = pd.read_csv(io.BytesIO(uploaded['insurance-1.csv']))
ivar = df[['age' , 'sex' , 'bmi' , 'children' , 'smoker' , 'region' , 'charges']]
fig = plt.figure()
ax = fig.add_subplot(111)
csx = ax.matshow(np.linalg.inv(ivar.corr()) , cmap = 'Blues')
fig.colorbar(csx)
ax.set_xticklabels([''] + ['age', 'sex', 'bmi', 'children', 'smoker', 'region', 'charges'])
ax.set_yticklabels([''] + ['age', 'sex', 'bmi', 'children', 'smoker', 'region', 'charges'])
df.head()
   age  sex     bmi     children  smoker  region     charges
0  19   female  27.900  0         yes     southwest  16884.92400
1  18   male    33.770  1         no      southeast  1725.55230
2  28   male    33.000  3         no      southeast  4449.46200
3  33   male    22.705  0         no      northwest  21984.47061
4  32   male    28.880  0         no      northwest  3866.85520
import numpy as np
import matplotlib.pyplot as plt
from sklearn import linear_model
df.columns = ['age','sex','bmi','children','smoker','region','charges']
X = df.iloc[:,df.columns != "age"]
Y = df.iloc[:, 0]
X
      sex     bmi     children  smoker  region     charges
0     female  27.900  0         yes     southwest  16884.92400
1     male    33.770  1         no      southeast  1725.55230
2     male    33.000  3         no      southeast  4449.46200
3     male    22.705  0         no      northwest  21984.47061
4     male    28.880  0         no      northwest  3866.85520
...   ...     ...     ...       ...     ...        ...
1333  male    30.970  3         no      northwest  10600.54830
1334  female  31.920  0         no      northeast  2205.98080
1335  female  36.850  0         no      southeast  1629.83350
1336  female  25.800  0         no      southwest  2007.94500
1337  female  29.070  0         yes     northwest  29141.36030

1338 rows × 6 columns
Y.head()
0 19
1 18
2 28
3 33
4 32
Name: age, dtype: int64
plt.scatter(X.sex,X.bmi,marker="*",color="green")
plt.scatter(X.sex,X.children,marker="*",color="orange")
plt.scatter(X.sex,X.charges,marker="*",color="red")
plt.scatter(X.bmi,X.charges,marker="*",color="red")
plt.scatter(X.children,X.charges,marker="*",color="green")
plt.scatter(X.smoker,X.charges,marker="*",color="orange")
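The scatter plots above use the raw frame, but a regression cannot be fitted until the categorical columns (sex, smoker, region) are encoded numerically. A sketch of one way to finish the exercise with one-hot encoding (column names as in the dataset above):
# One-hot encode the categorical predictors, then regress age on the rest
X_enc = pd.get_dummies(X, columns=['sex', 'smoker', 'region'], drop_first=True)
reg = linear_model.LinearRegression()
reg.fit(X_enc, Y)
print("R^2 on training data:", reg.score(X_enc, Y))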
3. Logistic Regression
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
import matplotlib.pyplot as plt
age=np.array([22,25,47,52,46,56,55,60,62,61,18,28,27,29,49,55,25,58,19,18,21,26,40,45,50,54,23])
bought_insurance=np.array([0,0,1,0,1,1,0,1,1,1,0,0,0,0,1,1,1,1,0,0,0,0,1,1,1,1,0])
print(age)
print(bought_insurance)
[22 25 47 52 46 56 55 60 62 61 18 28 27 29 49 55 25 58 19 18 21 26 40 45
50 54 23]
[0 0 1 0 1 1 0 1 1 1 0 0 0 0 1 1 1 1 0 0 0 0 1 1 1 1 0]
plt.scatter(age,bought_insurance,c=bought_insurance,cmap='rainbow')
plt.title("Scatter Plot Of Logistic Regression")
plt.show()
from sklearn.model_selection import train_test_split
age_train,age_test,bought_insurance_train,bought_insurance_test=train_test_split(age,bought_insurance,random_state=1)
print(age_train.shape,age_test.shape,bought_insurance_train.shape,bought_insurance_test.shape)
(20,) (7,) (20,) (7,)
log_reg=LogisticRegression()
from sklearn.metrics import mean_squared_error
mean_squared_error(age,bought_insurance)
1760.2592592592594
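Note that mean squared error between age and a 0/1 label is not a classification metric, and the estimator above is never fitted. A sketch completing the exercise on the train/test split (ages reshaped to a column vector, since scikit-learn expects 2-D inputs):
# Fit on the training ages and report accuracy on the held-out ages
log_reg.fit(age_train.reshape(-1, 1), bought_insurance_train)
print("test accuracy:", log_reg.score(age_test.reshape(-1, 1), bought_insurance_test))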
4. K-Means Algorithm
import pandas as pd
import io
df = pd.read_csv(io.BytesIO(uploaded['cars.csv']))
df.head()
   mpg   cylinders  cubicinches  hp   weightlbs  time-to-60  year  brand
0  14.0  8          350          165  4209       12          1972  US.
1  31.9  4          89           71   1925       14          1980  Europe
2  17.0  8          302          140  3449       11          1971  US.
3  15.0  8          400          150  3761       10          1971  US.
4  30.5  4          98           63   2051       17          1978  US.
X = df.iloc[:,:-1].values
X = pd.DataFrame(X, columns=df.columns[:-1])
X.head()
   mpg   cylinders  cubicinches  hp   weightlbs  time-to-60  year
0  14.0  8          350          165  4209       12          1972
1  31.9  4          89           71   1925       14          1980
2  17.0  8          302          140  3449       11          1971
3  15.0  8          400          150  3761       10          1971
4  30.5  4          98           63   2051       17          1978
X.describe()
        mpg    cylinders  cubicinches  hp   weightlbs  time-to-60  year
count   261.0  261        261          261  261        261         261
unique  103.0  5          75           85   240        17          13
top     14.0   4          97           150  2130       16          1974
freq    16.0   125        16           16   3          45          35
X['mpg'] = pd.to_numeric(X['mpg'], errors='coerce').astype('float64')
X['cylinders'] = pd.to_numeric(X['cylinders'], errors='coerce').astype('float64')
X['cubicinches'] = pd.to_numeric(X['cubicinches'], errors='coerce').astype('float64')
X['hp'] = pd.to_numeric(X['hp'], errors='coerce').astype('float64')
X['weightlbs'] = pd.to_numeric(X['weightlbs'], errors='coerce').astype('float64')
X['time-to-60'] = pd.to_numeric(X['time-to-60'], errors='coerce').astype('float64')
X['year'] = pd.to_numeric(X['year'], errors='coerce').astype('float64')
import numpy as np
X = X.replace([np.inf, -np.inf], np.nan)
print(X.isna().sum())
mean1=X['cubicinches'].mean()
X['cubicinches']=X['cubicinches'].fillna(mean1)
mean2=X['weightlbs'].mean()
X['weightlbs']=X['weightlbs'].fillna(mean2)
print(mean1,mean2)
print(X.isna().sum())
mpg 0
cylinders 0
cubicinches 2
hp 0
weightlbs 3
time-to-60 0
year 0
dtype: int64
200.9189189189189 3009.8333333333335
mpg 0
cylinders 0
cubicinches 0
hp 0
weightlbs 0
time-to-60 0
year 0
dtype: int64
import matplotlib.pyplot as plt
# Using the elbow method to find the optimal number of clusters
from sklearn.cluster import KMeans
wcss = []
for i in range(1,11):
    kmeans = KMeans(n_clusters=i, init='k-means++', max_iter=300, n_init=10, random_state=0)
    kmeans.fit(X)
    wcss.append(kmeans.inertia_)
plt.plot(range(1,11),wcss)
plt.title('The Elbow Method')
plt.xlabel('Number of clusters')
plt.ylabel('WCSS')
plt.show()
# Applying k-means to the cars dataset
kmeans = KMeans(n_clusters=3, init='k-means++', max_iter=300, n_init=10, random_state=0)
y_kmeans = kmeans.fit_predict(X)
X = X.to_numpy()  # convert to a NumPy array so the positional indexing below works
# Visualising the clusters
plt.scatter(X[y_kmeans == 0, 0], X[y_kmeans == 0, 1], s=100, c='red', label='US')
plt.scatter(X[y_kmeans == 1, 0], X[y_kmeans == 1, 1], s=100, c='blue', label='Japan')
plt.scatter(X[y_kmeans == 2, 0], X[y_kmeans == 2, 1], s=100, c='green', label='Europe')
plt.scatter(kmeans.cluster_centers_[:,0], kmeans.cluster_centers_[:,1], s=300, c='yellow', label='Centroids')
plt.title('Clusters of car brands')
plt.legend()
plt.show()
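With k = 3 chosen from the elbow plot, the clustering can be sanity-checked with a silhouette score and per-cluster counts; a minimal sketch:
from sklearn.metrics import silhouette_score
# Mean silhouette: closer to 1 means tighter, better-separated clusters
print(silhouette_score(X, y_kmeans))
# Number of cars assigned to each cluster
print(np.bincount(y_kmeans))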
5. Decision Tree Algorithm:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
from sklearn.preprocessing import LabelEncoder#for train test splitting
from sklearn.model_selection import train_test_split#for decision tree object
from sklearn.tree import DecisionTreeClassifier#for checking testing results
from sklearn.metrics import classification_report, confusion_matrix#for
visualizing tree
from sklearn.tree import plot_tree
from google.colab import files
uploaded = files.upload()
import io
df = pd.read_csv(io.BytesIO(uploaded['Iris.csv']))
df.head()
   SepalLengthCm  SepalWidthCm  PetalLengthCm  PetalWidthCm  Species
0  5.1            3.5           1.4            0.2           Iris-setosa
1  4.9            3.0           1.4            0.2           Iris-setosa
2  4.7            3.2           1.3            0.2           Iris-setosa
3  4.6            3.1           1.5            0.2           Iris-setosa
4  5.0            3.6           1.4            0.2           Iris-setosa
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150 entries, 0 to 149
Data columns (total 6 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 Id 150 non-null int64
1 SepalLengthCm 150 non-null float64
2 SepalWidthCm 150 non-null float64
3 PetalLengthCm 150 non-null float64
4 PetalWidthCm 150 non-null float64
5 Species 150 non-null object
dtypes: float64(4), int64(1), object(1)
memory usage: 7.2+ KB
df.shape
(150, 6)
df.isnull().any()
Id False
SepalLengthCm False
SepalWidthCm False
PetalLengthCm False
PetalWidthCm False
Species False
dtype: bool
df['species'] = np.random.choice(2,150)
sns.pairplot(df, hue="species", height=2.5)
sns.heatmap(df.corr())
target = df['species']
df1 = df.copy()
df1 = df1.drop('species', axis =1)
X = df1
target
0 0
1 1
2 1
3 1
4 0
..
145 1
146 0
147 1
148 1
149 1
Name: species, Length: 150, dtype: int64
le = LabelEncoder()
target = le.fit_transform(target)
target
array([0, 1, 1, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0,
1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 0,
0, 0, 1, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0,
0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0,
1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 0, 1, 1,
1, 0, 1, 0, 0, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0,
0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1])
y = target
X_train, X_test, y_train, y_test = train_test_split(X , y, test_size = 0.2,
random_state = 42)
print("Training split input- ", X_train.shape)
print("Testing split input- ", X_test.shape)
Training split input- (120, 6)
Testing split input- (30, 6)
dtree = DecisionTreeClassifier()
dtree.fit(X_train, y_train)
print('Decision Tree Classifier Created')
y_pred = dtree.predict(X_test)
print("Classification report - \n", classification_report(y_test,y_pred))
Classification report -
               precision    recall  f1-score   support

           0       1.00      1.00      1.00        10
           1       1.00      1.00      1.00         9
           2       1.00      1.00      1.00        11

    accuracy                           1.00        30
   macro avg       1.00      1.00      1.00        30
weighted avg       1.00      1.00      1.00        30
cm = confusion_matrix(y_test, y_pred)
plt.figure(figsize=(5,5))
sns.heatmap(data=cm, linewidths=.5, annot=True, square=True, cmap='Blues')
plt.ylabel('Actual label')
plt.xlabel('Predicted label')
all_sample_title = 'Accuracy Score: {0}'.format(dtree.score(X_test, y_test))
plt.title(all_sample_title, size = 15)
dec_tree = plot_tree(decision_tree=dtree, feature_names=df1.columns, class_names=["setosa", "versicolor", "virginica"], filled=True, precision=4, rounded=True)
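Besides the plotted tree, the fitted classifier exposes how much each feature contributed to the splits; a short sketch:
# Impurity-based importance of each input column
for name, imp in zip(df1.columns, dtree.feature_importances_):
    print(name, round(imp, 4))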
6. Neural Networks:
import numpy as np
import matplotlib.pyplot as plt
import tensorflow as tf
observations=1000
xs=np.random.uniform(-10,10,(observations,1))
zs=np.random.uniform(-10,10,(observations,1))
generated_inputs=np.column_stack((xs,zs))
noise=np.random.uniform(-10,10,(observations,1))
generated_target=2*xs-3*zs+5+noise
np.savez('TF_intro',input=generated_inputs,targets=generated_target)
training_data=np.load('TF_intro.npz')
input_size=2
output_size=1
models = tf.keras.Sequential([
tf.keras.layers.Dense(output_size)
])
custom_optimizer=tf.keras.optimizers.SGD(learning_rate=0.02)
models.compile(optimizer=custom_optimizer,loss='mean_squared_error')
models.fit(training_data['input'],training_data['targets'],epochs=100,verbose=1)
Epoch 1/100
32/32 [==============================] - 1s 1ms/step - loss: 60.4431
Epoch 2/100
32/32 [==============================] - 0s 1ms/step - loss: 40.1109
Epoch 3/100
32/32 [==============================] - 0s 1ms/step - loss: 39.9997
Epoch 4/100
32/32 [==============================] - 0s 1ms/step - loss: 34.7306
Epoch 5/100
32/32 [==============================] - 0s 1ms/step - loss: 36.7232
models.layers[0].get_weights()
[array([[ 2.0985565],
[-2.907345 ]], dtype=float32), array([5.32353], dtype=float32)]
weights=models.layers[0].get_weights()[0]
bias=models.layers[0].get_weights()[1]
out=training_data['targets'].round(1)
from sklearn.metrics import mean_squared_error
mean_squared_error(generated_target, out, squared=False)
0.02858235386343541
plt.scatter(np.squeeze(models.predict_on_batch(training_data['input'])),np.squeeze(training_data['targets']),c='#88c999')
plt.xlabel('Input')
plt.ylabel('Predicted Output')
plt.show()
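Since the targets were generated as 2*xs - 3*zs + 5 plus noise, the learned weights (about 2.10 and -2.91) and bias (about 5.32) can be checked against the generating rule; a small sketch on one hypothetical input:
# True rule gives 2*1 - 3*1 + 5 = 4; the network should predict close to this
print(models.predict(np.array([[1.0, 1.0]])))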
7. Naive Bayesian Classification:
import warnings
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from scipy.stats.stats import pearsonr
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, recall_score, precision_score
%matplotlib inline
def cross_validate(estimator, train, validation):
    X_train = train[0]
    Y_train = train[1]
    X_val = validation[0]
    Y_val = validation[1]
    train_predictions = estimator.predict(X_train)
    train_accuracy = accuracy_score(train_predictions, Y_train)
    train_recall = recall_score(train_predictions, Y_train)
    train_precision = precision_score(train_predictions, Y_train)
    val_predictions = estimator.predict(X_val)
    val_accuracy = accuracy_score(val_predictions, Y_val)
    val_recall = recall_score(val_predictions, Y_val)
    val_precision = precision_score(val_predictions, Y_val)
    print('Model metrics')
    print('Accuracy Train: %.2f, Validation: %.2f' % (train_accuracy, val_accuracy))
    print('Recall Train: %.2f, Validation: %.2f' % (train_recall, val_recall))
    print('Precision Train: %.2f, Validation: %.2f' % (train_precision, val_precision))
from google.colab import files
uploaded = files.upload()
test.csv(application/vnd.ms-excel) - 28629 bytes, last modified: 6/29/2021 - 100% done
train.csv(application/vnd.ms-excel) - 61194 bytes, last modified: 6/29/2021 - 100% done
Saving test.csv to test.csv
Saving train.csv to train.csv
import io
train_data = pd.read_csv(io.StringIO(uploaded['train.csv'].decode('utf-8')))
test_data = pd.read_csv(io.StringIO(uploaded['test.csv'].decode('utf-8')))
train_data['train'] = 1
test_data['train'] = 0
data = train_data.append(test_data, sort=False)
test_ids = test_data['PassengerId'].values
data.head()
   PassengerId  Survived  Pclass                                                Name     Sex   Age  SibSp  Parch            Ticket     Fare Cabin Embarked  train
0            1       0.0       3                             Braund, Mr. Owen Harris    male  22.0      1      0         A/5 21171   7.2500   NaN        S      1
1            2       1.0       1  Cumings, Mrs. John Bradley (Florence Briggs Th...  female  38.0      1      0          PC 17599  71.2833   C85        C      1
2            3       1.0       3                              Heikkinen, Miss. Laina  female  26.0      0      0  STON/O2. 3101282   7.9250   NaN        S      1
3            4       1.0       1        Futrelle, Mrs. Jacques Heath (Lily May Peel)  female  35.0      1      0            113803  53.1000  C123        S      1
4            5       0.0       3                            Allen, Mr. William Henry    male  35.0      0      0            373450   8.0500   NaN        S      1
data.describe()
       PassengerId    Survived       Pclass          Age        SibSp        Parch         Fare        train
count  1309.000000  891.000000  1309.000000  1046.000000  1309.000000  1309.000000  1308.000000  1309.000000
mean    655.000000    0.383838     2.294882    29.881138     0.498854     0.385027    33.295479     0.680672
std     378.020061    0.486592     0.837836    14.413493     1.041658     0.865560    51.758668     0.466394
min       1.000000    0.000000     1.000000     0.170000     0.000000     0.000000     0.000000     0.000000
25%     328.000000    0.000000     2.000000    21.000000     0.000000     0.000000     7.895800     0.000000
50%     655.000000    0.000000     3.000000    28.000000     0.000000     0.000000    14.454200     1.000000
75%     982.000000    1.000000     3.000000    39.000000     1.000000     0.000000    31.275000     1.000000
max    1309.000000    1.000000     3.000000    80.000000     8.000000     9.000000   512.329200     1.000000
features = ['Age', 'Embarked', 'Fare', 'Parch', 'Pclass', 'Sex', 'SibSp']
target = 'Survived'
data = data[features + [target] + ['train']]
data['Sex'] = data['Sex'].replace(["female", "male"], [0, 1])
data['Embarked'] = data['Embarked'].replace(['S', 'C', 'Q'], [1, 2, 3])
data['Age'] = pd.qcut(data['Age'], 10, labels=False)
train = data.query('train == 1')
test = data.query('train == 0')
train.dropna(axis=0, inplace=True)
labels = train[target].values
train.head()
   Age  Embarked     Fare  Parch  Pclass  Sex  SibSp  Survived  train
0  2.0       1.0   7.2500      0       3    1      1       0.0      1
1  7.0       2.0  71.2833      0       1    0      1       1.0      1
2  4.0       1.0   7.9250      0       3    0      0       1.0      1
3  6.0       1.0  53.1000      0       1    0      1       1.0      1
4  6.0       1.0   8.0500      0       3    1      0       0.0      1
columns = train[features + [target]].columns.tolist()
nColumns = len(columns)
result = pd.DataFrame(np.zeros((nColumns, nColumns)), columns=columns)
for col_a in range(nColumns):
    for col_b in range(nColumns):
        result.iloc[[col_a], [col_b]] = pearsonr(train.loc[:, columns[col_a]], train.loc[:, columns[col_b]])[0]
fig, ax = plt.subplots(figsize=(10,10))
ax = sns.heatmap(result, yticklabels=columns, vmin=-1, vmax=1, annot=True, fmt='.2f', linewidths=.2)
ax.set_title('PCC - Pearson correlation coefficient')
plt.show()
continuous_numeric_features = ['Age', 'Fare', 'Parch', 'SibSp']
for feature in continuous_numeric_features:
    sns.distplot(train[feature])
    plt.show()
train.drop(['train', target, 'Pclass'], axis=1, inplace=True)
test.drop(['train', target, 'Pclass'], axis=1, inplace=True)
X_train, X_val, Y_train, Y_val = train_test_split(train, labels, test_size=0.2, random_state=1)
X_train.head()
     Age  Embarked     Fare  Parch  Sex  SibSp
830  1.0       2.0  14.4542      0    0      1
566  1.0       1.0   7.8958      0    1      0
149  7.0       1.0  13.0000      0    1      0
106  2.0       1.0   7.6500      0    0      0
290  4.0       1.0  78.8500      0    0      0
X_train1, X_train2, Y_train1, Y_train2 = train_test_split(X_train, Y_train, test_size=0.3, random_state=12)
classifier = GaussianNB()
classifier.fit(X_train2, Y_train2)
GaussianNB(priors=None, var_smoothing=1e-09)
print('Metrics with only 30% of train data')
cross_validate(classifier, (X_train, Y_train), (X_val, Y_val))
Metrics with only 30% of train data
Model metrics
Accuracy Train: 0.79, Validation: 0.76
Recall Train: 0.75, Validation: 0.71
Precision Train: 0.71, Validation: 0.71
classifier.partial_fit(X_train1, Y_train1)
GaussianNB(priors=None, var_smoothing=1e-09)
print('Metrics with the remaining 70% of train data')
cross_validate(classifier, (X_train, Y_train), (X_val, Y_val))
Metrics with the remaining 70% of train data
Model metrics
Accuracy Train: 0.80, Validation: 0.76
Recall Train: 0.78, Validation: 0.70
Precision Train: 0.69, Validation: 0.69
print('Probability of each class')
print('Survive = 0: %.2f' % classifier.class_prior_[0])
print('Survive = 1: %.2f' % classifier.class_prior_[1])
Probability of each class
Survive = 0: 0.60
Survive = 1: 0.40
print('Mean of each feature per class')
print(' Age Embarked Fare Parch Sex SibSp')
print('Survive = 0: %s' % classifier.theta_[0])
print('Survive = 1: %s' % classifier.theta_[1])
Mean of each feature per class
Age Embarked Fare Parch Sex SibSp
Survive = 0: [ 4.5339233 1.23893805 22.97357316 0.37463127 0.86135693 0.5280236 ]
Survive = 1: [ 4.27391304 1.33478261 53.52155957 0.52608696 0.33043478 0.49130435]
print('Variance of each feature per class')
print('Survive = 0: %s' % classifier.sigma_[0])
print('Survive = 1: %s' % classifier.sigma_[1])
Variance of each feature per class
Survive = 0: [8.32554825e+00 2.88044224e-01 8.60096730e+02 8.36055467e-01
1.19424042e-01 1.11647419e+00]
Survive = 1: [8.45106148e+00 2.74880003e-01 5.28959488e+03 6.75409304e-01
 2.21250514e-01 4.41231610e-01]
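The classifier can also report per-passenger posterior probabilities rather than hard labels; a minimal sketch on the first validation row:
# P(Survived=0) and P(Survived=1) for one validation passenger
print(classifier.predict_proba(X_val.iloc[[0]]))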
test.fillna(test.mean(), inplace=True)
test_predictions = classifier.predict(test)
submission = pd.DataFrame({'PassengerId': test_ids})
submission['Survived'] = test_predictions.astype('int')
submission.to_csv('submission.csv', index=False)
submission.head(10)
PassengerId Survived
0 892 0
1 893 1
2 894 0
3 895 0
4 896 1
5 897 0
6 898 1
7 899 0
8 900 1
9 901 0