In
[6]:
import pandas as pd
import numpy as np
# Waist-circumference vs. abdominal adipose tissue dataset (109 rows x 2 cols: Waist, AT)
# NOTE(review): hardcoded absolute Windows path — consider a configurable DATA_DIR / relative path
wcat=pd.read_csv("D:\\Course\\Python\\Datasets\\wc.at.csv")
In [7]:
wcat
Out[7]: Waist AT
0 74.75 25.72
1 72.60 25.89
2 81.80 42.60
3 83.95 42.80
4 74.65 29.84
... ... ...
104 100.10 124.00
105 93.30 62.20
106 101.80 133.00
107 107.90 208.00
108 108.50 208.00
109 rows × 2 columns
In [1]:
import matplotlib.pyplot as plt
In [4]:
plt.scatter(x=wcat['Waist'],y=wcat['AT'])
Out[4]: <matplotlib.collections.PathCollection at 0x28b500acc48>
In [5]: # correlation coefficient
# Pearson correlation matrix; off-diagonal value ~0.819 indicates a strong positive linear relationship
np.corrcoef(x=wcat['Waist'],y=wcat['AT'])
Out[5]: array([[1. , 0.81855781],
[0.81855781, 1. ]])
In [9]:
# Alternative OLS approach via statsmodels (formula API), kept commented out for reference:
#import statsmodels.formula.api as smf
#model=smf.ols('AT~Waist',data=wcat).fit()
#model.params
#model.summary()
#print (model.conf_int(0.05)) # 95% confidence interval
#model2 = smf.ols('AT~np.log(Waist)',data=wcat).fit()
#model2.params
#model2.summary()
#pred = model2.predict(wcat)
#pred
0 1
Intercept -259.190053 -172.772923
Waist 2.993689 3.924030
In [5]:
wcat
Out[5]: Waist AT
0 74.75 25.72
1 72.60 25.89
2 81.80 42.60
3 83.95 42.80
4 74.65 29.84
... ... ...
104 100.10 124.00
105 93.30 62.20
106 101.80 133.00
107 107.90 208.00
108 108.50 208.00
109 rows × 2 columns
In [8]:
# Reshape to 2-D column vectors of shape (n_samples, 1), as required by scikit-learn estimators
X =wcat[ 'Waist'].values.reshape(-1,1)
# Candidate transformed features (log / sqrt / exp of Waist), kept for experimentation:
#x1 =np.log(wcat[ 'Waist'].values.reshape(-1,1))
#X2 = np.sqrt(wcat[ 'Waist'].values.reshape(-1,1))
#x3 = np.exp(wcat[ 'Waist'].values.reshape(-1,1))
y = wcat['AT'].values.reshape(-1,1)
In [9]:
X
Out[9]: array([[ 74.75],
[ 72.6 ],
[ 81.8 ],
[ 83.95],
[ 74.65],
[ 71.85],
[ 80.9 ],
[ 83.4 ],
[ 63.5 ],
[ 73.2 ],
[ 71.9 ],
[ 75. ],
[ 73.1 ],
[ 79. ],
[ 77. ],
[ 68.85],
[ 75.95],
[ 74.15],
[ 73.8 ],
[ 75.9 ],
[ 76.85],
[ 80.9 ],
[ 79.9 ],
[ 89.2 ],
[ 82. ],
[ 92. ],
[ 86.6 ],
[ 80.5 ],
[ 86. ],
[ 82.5 ],
[ 83.5 ],
[ 88.1 ],
[ 90.8 ],
[ 89.4 ],
[102. ],
[ 94.5 ],
[ 91. ],
[103. ],
[ 80. ],
[ 79. ],
[ 83.5 ],
[ 76. ],
[ 80.5 ],
[ 86.5 ],
[ 83. ],
[107.1 ],
[ 94.3 ],
[ 94.5 ],
[ 79.7 ],
[ 79.3 ],
[ 89.8 ],
[ 83.8 ],
[ 85.2 ],
[ 75.5 ],
[ 78.4 ],
[ 78.6 ],
[ 87.8 ],
[ 86.3 ],
[ 85.5 ],
[ 83.7 ],
[ 77.6 ],
[ 84.9 ],
[ 79.8 ],
[108.3 ],
[119.6 ],
[119.9 ],
[ 96.5 ],
[105.5 ],
[105. ],
[107. ],
[107. ],
[101. ],
[ 97. ],
[100. ],
[108. ],
[100. ],
[103. ],
[104. ],
[106. ],
[109. ],
[103.5 ],
[110. ],
[110. ],
[112. ],
[108.5 ],
[104. ],
[111. ],
[108.5 ],
[121. ],
[109. ],
[ 97.5 ],
[105.5 ],
[ 98. ],
[ 94.5 ],
[ 97. ],
[105. ],
[106. ],
[ 99. ],
[ 91. ],
[102.5 ],
[106. ],
[109.1 ],
[115. ],
[101. ],
[100.1 ],
[ 93.3 ],
[101.8 ],
[107.9 ],
[108.5 ]])
In [10]:
y
Out[10]: array([[ 25.72],
[ 25.89],
[ 42.6 ],
[ 42.8 ],
[ 29.84],
[ 21.68],
[ 29.08],
[ 32.98],
[ 11.44],
[ 32.22],
[ 28.32],
[ 43.86],
[ 38.21],
[ 42.48],
[ 30.96],
[ 55.78],
[ 43.78],
[ 33.41],
[ 43.35],
[ 29.31],
[ 36.6 ],
[ 40.25],
[ 35.43],
[ 60.09],
[ 45.84],
[ 70.4 ],
[ 83.45],
[ 84.3 ],
[ 78.89],
[ 64.75],
[ 72.56],
[ 89.31],
[ 78.94],
[ 83.55],
[127. ],
[121. ],
[107. ],
[129. ],
[ 74.02],
[ 55.48],
[ 73.13],
[ 50.5 ],
[ 50.88],
[140. ],
[ 96.54],
[118. ],
[107. ],
[123. ],
[ 65.92],
[ 81.29],
[111. ],
[ 90.73],
[133. ],
[ 41.9 ],
[ 41.71],
[ 58.16],
[ 88.85],
[155. ],
[ 70.77],
[ 75.08],
[ 57.05],
[ 99.73],
[ 27.96],
[123. ],
[ 90.41],
[106. ],
[144. ],
[121. ],
[ 97.13],
[166. ],
[ 87.99],
[154. ],
[100. ],
[123. ],
[217. ],
[140. ],
[109. ],
[127. ],
[112. ],
[192. ],
[132. ],
[126. ],
[153. ],
[158. ],
[183. ],
[184. ],
[121. ],
[159. ],
[245. ],
[137. ],
[165. ],
[152. ],
[181. ],
[ 80.95],
[137. ],
[125. ],
[241. ],
[134. ],
[150. ],
[198. ],
[151. ],
[229. ],
[253. ],
[188. ],
[124. ],
[ 62.2 ],
[133. ],
[208. ],
[208. ]])
In [11]:
# Data partition into Training and Testing sets (80/20 split)
In [12]:
from sklearn.model_selection import train_test_split
# 80/20 train/test split; fixed random_state makes the split reproducible across runs
# (original line was missing the closing parenthesis — a SyntaxError on a fresh Run All)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)
Model Building
In [13]:
from sklearn.linear_model import LinearRegression # Step1
from sklearn import metrics
In [14]:
import sklearn
from sklearn import linear_model
In [15]:
# Training the Algorithm
# Fit ordinary least squares: AT ~ intercept + slope * Waist
regressor = LinearRegression() # step 2
regressor.fit(X_train, y_train) #training the algorithm
Out[15]: LinearRegression()
In [16]:
y_pred = regressor.predict(X_test)
In [17]:
y_pred # PRedicted Values for AT
Out[17]: array([[160.65263299],
[ 34.49450742],
[131.35361476],
[ 68.61924631],
[ 69.30863497],
[152.03527469],
[160.65263299],
[ 74.13435562],
[ 48.45462788],
[169.26999129],
[148.58833137],
[ 62.070054 ],
[155.82691234],
[ 54.14208436],
[141.69444472],
[ 80.3388536 ],
[ 58.96780501],
[131.35361476],
[145.14138804],
[ 56.89963901],
[131.69830909],
[ 5.54018352]])
The flatten() function returns a copy of a given array collapsed into one dimension.
Since y_test and y_pred are 2-D column arrays, flatten() converts each to a 1-D array
before the comparison DataFrame is built.
In [14]:
y_test # Actual Output variable from Y test
Out[14]: array([[183. ],
[ 28.32],
[140. ],
[ 42.6 ],
[ 45.84],
[151. ],
[208. ],
[ 32.98],
[ 43.78],
[121. ],
[ 97.13],
[ 35.43],
[118. ],
[ 57.05],
[109. ],
[133. ],
[ 42.48],
[123. ],
[184. ],
[ 41.71],
[124. ],
[ 11.44]])
In [15]:
# Side-by-side comparison of actual vs predicted AT; flatten() turns the (n, 1) column arrays into 1-D
df = pd.DataFrame({'Actual': y_test.flatten(), 'Predicted': y_pred.flatten()})
df
Out[15]: Actual Predicted
0 183.00 160.652633
1 28.32 34.494507
Actual Predicted
2 140.00 131.353615
3 42.60 68.619246
4 45.84 69.308635
5 151.00 152.035275
6 208.00 160.652633
7 32.98 74.134356
8 43.78 48.454628
9 121.00 169.269991
10 97.13 148.588331
11 35.43 62.070054
12 118.00 155.826912
13 57.05 54.142084
14 109.00 141.694445
15 133.00 80.338854
16 42.48 58.967805
17 123.00 131.353615
18 184.00 145.141388
19 41.71 56.899639
20 124.00 131.698309
21 11.44 5.540184
In [16]:
# Bar chart of actual vs predicted AT per test sample
# (df has only 22 rows, so head(25) returns the whole frame)
df1 = df.head(25)
df1.plot(kind='bar',figsize=(16,10))
plt.grid(which='major', linestyle='-', linewidth='0.5', color='green')
plt.grid(which='minor', linestyle=':', linewidth='0.5', color='black')
plt.show()
In [31]:
print('Mean Squared Error:', metrics.mean_squared_error(y_test, y_pred))
Mean Squared Error: 861.0892456209028
In [32]:
print('Root Mean Squared Error:', np.sqrt(metrics.mean_squared_error(y_test, y_pred)))
Root Mean Squared Error: 29.344322204148842
In [33]:
# The lower the error, the better the model; we can now write out the regression equation
In [34]:
# Get the intercept of the fitted regression line
print('intercept:', regressor.intercept_)
intercept: [-213.34071739]
In [37]:
# Get the coefficient (slope): predicted AT rises ~3.45 per unit increase in Waist
print('slope:', regressor.coef_)
slope: [[3.44694332]]