# --- imports and plotting setup ---
# NOTE(review): this file is a notebook export; pasted cell outputs are
# interleaved with the code below and are not valid Python if run as a script.
import numpy as np
import pandas as pd
from matplotlib import rcParams
from sklearn.tree import DecisionTreeRegressor, export_graphviz
from sklearn.model_selection import train_test_split
import seaborn as sns
import graphviz

# switch to seaborn default stylistic parameters
sns.set()
# larger plot fonts
sns.set_context('talk')
# change default plot size (width, height in inches)
rcParams['figure.figsize'] = 10, 8

# scatterplots of each candidate predictor vs. the target (left commented out;
# also note it references plt, which is never imported here)
#fig, ax = plt.subplots(1, len(possible_pred), figsize=[18,2.5])
#for i,p in enumerate(possible_pred):
#sns.scatterplot(data=df, x=p, y=target,ax=ax[i])

# peek at the first rows -- NOTE(review): df is created in a later cell;
# the export scrambled cell order, so df must exist before this line runs
df.head(10)
myct mmin mmax cach chmin chmax prp erp cs
adviser 32/60 125 256 6000 256 16 128 198 199 8.00
amdahl 470v/7 29 8000 32000 32 8 32 269 253 34.48
amdahl 470v/7a 29 8000 32000 32 8 32 220 253 34.48
amdahl 470v/7b 29 8000 32000 32 8 32 172 253 34.48
amdahl 470v/7c 29 8000 16000 32 8 16 132 132 34.48
amdahl 470v/b 26 8000 32000 64 8 32 318 290 38.46
amdahl 580-5840 23 16000 32000 64 16 32 367 381 43.48
amdahl 580-5850 23 16000 32000 64 16 32 489 381 43.48
amdahl 580-5860 23 16000 64000 64 16 32 636 749 43.48
amdahl 580-5880 23 32000 64000 128 32 64 1144 1238 43.48
# read the cpu data (UCI "computer hardware" dataset hosted on GitHub)
df = pd.read_csv("https://raw.githubusercontent.com/grbruns/cst383/master/machine.csv")
# index rows by "vendor model" and drop the now-redundant columns
df.index = df['vendor']+' '+df['model']
df.drop(['vendor', 'model'], axis=1, inplace=True)
# derived feature: clock speed in MHz, computed from cycle time in ns
df['cs'] = np.round(1e3/df['myct'], 2)
df.describe()
myct mmin mmax cach chmin chmax
count 209.000000 209.000000 209.000000 209.000000 209.000000 209.000000 209.00
mean 203.822967 2867.980861 11796.153110 25.205742 4.698565 18.267943 105.62
std 260.262926 3878.742758 11726.564377 40.628722 6.816274 25.997318 160.83
min 17.000000 64.000000 64.000000 0.000000 0.000000 0.000000 6.00
25% 50.000000 768.000000 4000.000000 0.000000 1.000000 5.000000 27.00
50% 110.000000 2000.000000 8000.000000 8.000000 2.000000 8.000000 50.00
75% 225.000000 4000.000000 16000.000000 32.000000 6.000000 24.000000 113.00
max 1500.000000 32000.000000 64000.000000 256.000000 52.000000 176.000000 1150.00
# every numeric column in the frame is a candidate predictor
possible_pred = ['myct', 'mmin', 'mmax', 'cach', 'chmin', 'chmax', 'prp', 'erp', 'cs']

# get ready for Scikit-Learn: select the feature columns and the target,
# then pull them out of the DataFrame as plain NumPy arrays
predictors = ['mmin', 'chmax']   # choose predictors as you like
target = 'prp'
X = df.loc[:, predictors].to_numpy()
y = df[target].to_numpy()
df.info()
<class 'pandas.core.frame.DataFrame'>
Index: 209 entries, adviser 32/60 to wang vs-90
Data columns (total 9 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 myct 209 non-null int64
1 mmin 209 non-null int64
2 mmax 209 non-null int64
3 cach 209 non-null int64
4 chmin 209 non-null int64
5 chmax 209 non-null int64
6 prp 209 non-null int64
7 erp 209 non-null int64
8 cs 209 non-null float64
dtypes: float64(1), int64(8)
memory usage: 16.3+ KB
# test/train split: hold out 30% of the rows for evaluation
# NOTE(review): the export dropped the leading "X" from "X_train" and
# truncated the random_state value; reconstructed here -- the seed value
# is a guess, confirm against the original notebook.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=42)

# train the model -- a shallow tree (depth 2) so it stays easy to visualize
reg = DecisionTreeRegressor(max_depth=2)
reg.fit(X_train, y_train)
DecisionTreeRegressor(ccp_alpha=0.0, criterion='mse', max_depth=2,
max_features=None, max_leaf_nodes=None,
min_impurity_decrease=0.0, min_impurity_split=None,
min_samples_leaf=1, min_samples_split=2,
min_weight_fraction_leaf=0.0, presort='deprecated',
random_state=None, splitter='best')
# view the fitted tree
# conda install python-graphviz
render_options = dict(
    precision=2,          # digits shown for thresholds / values
    feature_names=predictors,
    proportion=True,      # show sample counts as percentages
    filled=True,
    rounded=True,
)
dot_data = export_graphviz(reg, **render_options)
graph = graphviz.Source(dot_data)
graph
h
mmin <= 6620.0
mse = 29246.55
samples = 100.0%
value = 107.01
True False
chmax <= 11.0 chmax <= 48.0
mse = 4107.27 mse = 97100.17
samples = 87.0% samples = 13.0%
value = 62.84 value = 402.21
mse = 610.85 mse = 7168.46 mse = 13869.81 mse = 11
samples = 54.8% samples = 32.2% samples = 11.0% samples
value = 37.78 value = 105.51 value = 277.06 value = 1
# make predictions on the held-out rows and report test-set RMSE
y_predict = reg.predict(X_test)
squared_errors = (y_test - y_predict) ** 2
rmse = np.sqrt(squared_errors.mean())
print('rmse: {:0.2f}'.format(rmse))
rmse: 88.42