13/4/2020 Scikit-Learn
Implementaciones de Scikit-learn sobre QSAR
Random Forest
In [2]:
import pandas as pd
import numpy as np
dataset= pd.read_csv("qsar_oral_toxicity.csv", sep=';', prefix='x', header=None)
dataset.head()
Out[2]:
x0 x1 x2 x3 x4 x5 x6 x7 x8 x9 ... x1015 x1016 x1017 x1018 x1019 x1020
0 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0
1 0 0 1 0 0 0 0 0 0 0 ... 0 0 0 0 0 0
2 0 0 0 0 0 0 0 0 0 0 ... 0 0 1 0 0 0
3 0 0 0 0 0 0 0 1 0 0 ... 0 0 0 0 0 0
4 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0
5 rows × 1025 columns
In [3]:
from sklearn import preprocessing, model_selection
enc = preprocessing.OrdinalEncoder()
enc.fit(dataset[['x1024']])
for i, cat in enumerate(enc.categories_[0]): print("{} -> {}".format(cat, i))
dataset['output'] = enc.transform(dataset[['x1024']])
dataset.head()
negative -> 0
positive -> 1
Out[3]:
x0 x1 x2 x3 x4 x5 x6 x7 x8 x9 ... x1016 x1017 x1018 x1019 x1020 x1021
0 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0
1 0 0 1 0 0 0 0 0 0 0 ... 0 0 0 0 0 0
2 0 0 0 0 0 0 0 0 0 0 ... 0 1 0 0 0 0
3 0 0 0 0 0 0 0 1 0 0 ... 0 0 0 0 0 0
4 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0
5 rows × 1026 columns
localhost:8888/nbconvert/html/Entrega2/ Scikit-Learn.ipynb?download=false 1/11
13/4/2020 Scikit-Learn
In [4]:
train, test = model_selection.train_test_split(dataset, test_size=0.2, random_state=42)
train.x1024.value_counts()
Out[4]:
negative 6609
positive 584
Name: x1024, dtype: int64
In [5]:
from sklearn.ensemble import RandomForestClassifier
X_train = train.iloc[:, 0:1024].values
Y_train = train.output
clf = RandomForestClassifier(n_estimators=100, max_features="sqrt", max_depth=None, min
_samples_split=2)
clf = clf.fit(X_train, Y_train)
In [6]:
X_test = test.iloc[:, 0:1024].values
Y_test = test.output
test_pred=clf.predict(X_test)
In [7]:
from sklearn import metrics
print("\nAcierto:", metrics.accuracy_score(test.output, test_pred))
print(metrics.classification_report(test.output, test_pred))
Acierto: 0.9394107837687604
precision recall f1-score support
0.0 0.95 0.99 0.97 1642
1.0 0.79 0.42 0.55 157
accuracy 0.94 1799
macro avg 0.87 0.70 0.76 1799
weighted avg 0.93 0.94 0.93 1799
In [8]:
from sklearn.metrics import roc_auc_score
roc_value = roc_auc_score(test.output, test_pred)
print(roc_value)
0.7047099622178948
ID3
localhost:8888/nbconvert/html/Entrega2/ Scikit-Learn.ipynb?download=false 2/11
13/4/2020 Scikit-Learn
In [9]:
from sklearn import tree
clf1 = tree.DecisionTreeClassifier()
clf1 = clf1.fit(X_train, Y_train)
test_pred1=clf1.predict(X_test)
In [10]:
print("\nAcierto:", metrics.accuracy_score(test.output, test_pred1))
print(metrics.classification_report(test.output, test_pred1))
Acierto: 0.9049471928849361
precision recall f1-score support
0.0 0.95 0.94 0.95 1642
1.0 0.46 0.52 0.49 157
accuracy 0.90 1799
macro avg 0.71 0.73 0.72 1799
weighted avg 0.91 0.90 0.91 1799
In [11]:
roc_value1 = roc_auc_score(test.output, test_pred1)
print(roc_value1)
0.731913853697138
Cross Validation
In [12]:
seed = 1
scoring = 'accuracy'
In [13]:
models = []
models.append(('CART', tree.DecisionTreeClassifier()))
models.append(('RF', RandomForestClassifier()))
results = []
names = []
for name, model in models:
kfold = model_selection.KFold(n_splits=10, random_state=None)
cv_results = model_selection.cross_val_score(model, X_train, Y_train, cv=kfold,
scoring=scoring)
results.append(cv_results)
names.append(name)
msg = "%s: %f (%f)" % (name, cv_results.mean(), cv_results.std())
print(msg)
CART: 0.910466 (0.010126)
RF: 0.939664 (0.010440)
localhost:8888/nbconvert/html/Entrega2/ Scikit-Learn.ipynb?download=false 3/11
13/4/2020 Scikit-Learn
In [18]:
from sklearn.model_selection import RandomizedSearchCV
n_estimators = [int(x) for x in np.linspace(start = 100, stop = 1000, num = 10)]
max_features = ['auto', 'sqrt']
max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
max_depth.append(None)
min_samples_split = [2, 5, 10]
min_samples_leaf = [1, 2, 4]
bootstrap = [True, False]
random_grid = {'n_estimators': n_estimators,
'max_features': max_features,
'max_depth': max_depth,
'min_samples_split': min_samples_split,
'min_samples_leaf': min_samples_leaf,
'bootstrap': bootstrap}
print (random_grid)
{'n_estimators': [100, 200, 300, 400, 500, 600, 700, 800, 900, 1000], 'max
_features': ['auto', 'sqrt'], 'max_depth': [10, 20, 30, 40, 50, 60, 70, 8
0, 90, 100, 110, None], 'min_samples_split': [2, 5, 10], 'min_samples_lea
f': [1, 2, 4], 'bootstrap': [True, False]}
localhost:8888/nbconvert/html/Entrega2/ Scikit-Learn.ipynb?download=false 4/11
13/4/2020 Scikit-Learn
In [15]:
from sklearn.ensemble import RandomForestRegressor
rf = RandomForestRegressor()
rf_random = RandomizedSearchCV(estimator = rf, param_distributions = random_grid, n_ite
r = 10, cv = 5, verbose=2, random_state=42)
rf_random.fit(X_train, Y_train)
rf_random.best_params_
localhost:8888/nbconvert/html/Entrega2/ Scikit-Learn.ipynb?download=false 5/11
13/4/2020 Scikit-Learn
Fitting 5 folds for each of 10 candidates, totalling 50 fits
[CV] n_estimators=100, min_samples_split=10, min_samples_leaf=2, max_featu
res=sqrt, max_depth=50, bootstrap=True
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent wo
rkers.
[CV] n_estimators=100, min_samples_split=10, min_samples_leaf=2, max_feat
ures=sqrt, max_depth=50, bootstrap=True, total= 12.9s
[CV] n_estimators=100, min_samples_split=10, min_samples_leaf=2, max_featu
res=sqrt, max_depth=50, bootstrap=True
[Parallel(n_jobs=1)]: Done 1 out of 1 | elapsed: 12.8s remaining:
0.0s
localhost:8888/nbconvert/html/Entrega2/ Scikit-Learn.ipynb?download=false 6/11
13/4/2020 Scikit-Learn
[CV] n_estimators=100, min_samples_split=10, min_samples_leaf=2, max_feat
ures=sqrt, max_depth=50, bootstrap=True, total= 13.1s
[CV] n_estimators=100, min_samples_split=10, min_samples_leaf=2, max_featu
res=sqrt, max_depth=50, bootstrap=True
[CV] n_estimators=100, min_samples_split=10, min_samples_leaf=2, max_feat
ures=sqrt, max_depth=50, bootstrap=True, total= 13.6s
[CV] n_estimators=100, min_samples_split=10, min_samples_leaf=2, max_featu
res=sqrt, max_depth=50, bootstrap=True
[CV] n_estimators=100, min_samples_split=10, min_samples_leaf=2, max_feat
ures=sqrt, max_depth=50, bootstrap=True, total= 13.8s
[CV] n_estimators=100, min_samples_split=10, min_samples_leaf=2, max_featu
res=sqrt, max_depth=50, bootstrap=True
[CV] n_estimators=100, min_samples_split=10, min_samples_leaf=2, max_feat
ures=sqrt, max_depth=50, bootstrap=True, total= 13.9s
[CV] n_estimators=300, min_samples_split=10, min_samples_leaf=4, max_featu
res=sqrt, max_depth=90, bootstrap=False
[CV] n_estimators=300, min_samples_split=10, min_samples_leaf=4, max_feat
ures=sqrt, max_depth=90, bootstrap=False, total= 1.0min
[CV] n_estimators=300, min_samples_split=10, min_samples_leaf=4, max_featu
res=sqrt, max_depth=90, bootstrap=False
[CV] n_estimators=300, min_samples_split=10, min_samples_leaf=4, max_feat
ures=sqrt, max_depth=90, bootstrap=False, total= 1.1min
[CV] n_estimators=300, min_samples_split=10, min_samples_leaf=4, max_featu
res=sqrt, max_depth=90, bootstrap=False
[CV] n_estimators=300, min_samples_split=10, min_samples_leaf=4, max_feat
ures=sqrt, max_depth=90, bootstrap=False, total= 1.1min
[CV] n_estimators=300, min_samples_split=10, min_samples_leaf=4, max_featu
res=sqrt, max_depth=90, bootstrap=False
[CV] n_estimators=300, min_samples_split=10, min_samples_leaf=4, max_feat
ures=sqrt, max_depth=90, bootstrap=False, total= 1.1min
[CV] n_estimators=300, min_samples_split=10, min_samples_leaf=4, max_featu
res=sqrt, max_depth=90, bootstrap=False
[CV] n_estimators=300, min_samples_split=10, min_samples_leaf=4, max_feat
ures=sqrt, max_depth=90, bootstrap=False, total= 1.1min
[CV] n_estimators=300, min_samples_split=2, min_samples_leaf=2, max_featur
es=auto, max_depth=60, bootstrap=False
[CV] n_estimators=300, min_samples_split=2, min_samples_leaf=2, max_featu
res=auto, max_depth=60, bootstrap=False, total=49.4min
[CV] n_estimators=300, min_samples_split=2, min_samples_leaf=2, max_featur
es=auto, max_depth=60, bootstrap=False
[CV] n_estimators=300, min_samples_split=2, min_samples_leaf=2, max_featu
res=auto, max_depth=60, bootstrap=False, total=30.2min
[CV] n_estimators=300, min_samples_split=2, min_samples_leaf=2, max_featur
es=auto, max_depth=60, bootstrap=False
[CV] n_estimators=300, min_samples_split=2, min_samples_leaf=2, max_featu
res=auto, max_depth=60, bootstrap=False, total=18.2min
[CV] n_estimators=300, min_samples_split=2, min_samples_leaf=2, max_featur
es=auto, max_depth=60, bootstrap=False
[CV] n_estimators=300, min_samples_split=2, min_samples_leaf=2, max_featu
res=auto, max_depth=60, bootstrap=False, total= 8.3min
[CV] n_estimators=300, min_samples_split=2, min_samples_leaf=2, max_featur
es=auto, max_depth=60, bootstrap=False
[CV] n_estimators=300, min_samples_split=2, min_samples_leaf=2, max_featu
res=auto, max_depth=60, bootstrap=False, total= 9.5min
[CV] n_estimators=700, min_samples_split=5, min_samples_leaf=1, max_featur
es=sqrt, max_depth=30, bootstrap=True
[CV] n_estimators=700, min_samples_split=5, min_samples_leaf=1, max_featu
res=sqrt, max_depth=30, bootstrap=True, total= 27.8s
[CV] n_estimators=700, min_samples_split=5, min_samples_leaf=1, max_featur
es=sqrt, max_depth=30, bootstrap=True
[CV] n_estimators=700, min_samples_split=5, min_samples_leaf=1, max_featu
localhost:8888/nbconvert/html/Entrega2/ Scikit-Learn.ipynb?download=false 7/11
13/4/2020 Scikit-Learn
res=sqrt, max_depth=30, bootstrap=True, total= 28.6s
[CV] n_estimators=700, min_samples_split=5, min_samples_leaf=1, max_featur
es=sqrt, max_depth=30, bootstrap=True
[CV] n_estimators=700, min_samples_split=5, min_samples_leaf=1, max_featu
res=sqrt, max_depth=30, bootstrap=True, total= 21.6s
[CV] n_estimators=700, min_samples_split=5, min_samples_leaf=1, max_featur
es=sqrt, max_depth=30, bootstrap=True
[CV] n_estimators=700, min_samples_split=5, min_samples_leaf=1, max_featu
res=sqrt, max_depth=30, bootstrap=True, total= 21.4s
[CV] n_estimators=700, min_samples_split=5, min_samples_leaf=1, max_featur
es=sqrt, max_depth=30, bootstrap=True
[CV] n_estimators=700, min_samples_split=5, min_samples_leaf=1, max_featu
res=sqrt, max_depth=30, bootstrap=True, total= 21.0s
[CV] n_estimators=500, min_samples_split=10, min_samples_leaf=1, max_featu
res=auto, max_depth=80, bootstrap=False
[CV] n_estimators=500, min_samples_split=10, min_samples_leaf=1, max_feat
ures=auto, max_depth=80, bootstrap=False, total=12.0min
[CV] n_estimators=500, min_samples_split=10, min_samples_leaf=1, max_featu
res=auto, max_depth=80, bootstrap=False
[CV] n_estimators=500, min_samples_split=10, min_samples_leaf=1, max_feat
ures=auto, max_depth=80, bootstrap=False, total=15.1min
[CV] n_estimators=500, min_samples_split=10, min_samples_leaf=1, max_featu
res=auto, max_depth=80, bootstrap=False
[CV] n_estimators=500, min_samples_split=10, min_samples_leaf=1, max_feat
ures=auto, max_depth=80, bootstrap=False, total=15.4min
[CV] n_estimators=500, min_samples_split=10, min_samples_leaf=1, max_featu
res=auto, max_depth=80, bootstrap=False
[CV] n_estimators=500, min_samples_split=10, min_samples_leaf=1, max_feat
ures=auto, max_depth=80, bootstrap=False, total=13.1min
[CV] n_estimators=500, min_samples_split=10, min_samples_leaf=1, max_featu
res=auto, max_depth=80, bootstrap=False
[CV] n_estimators=500, min_samples_split=10, min_samples_leaf=1, max_feat
ures=auto, max_depth=80, bootstrap=False, total=16.0min
[CV] n_estimators=200, min_samples_split=10, min_samples_leaf=1, max_featu
res=sqrt, max_depth=60, bootstrap=False
[CV] n_estimators=200, min_samples_split=10, min_samples_leaf=1, max_feat
ures=sqrt, max_depth=60, bootstrap=False, total= 9.8s
[CV] n_estimators=200, min_samples_split=10, min_samples_leaf=1, max_featu
res=sqrt, max_depth=60, bootstrap=False
[CV] n_estimators=200, min_samples_split=10, min_samples_leaf=1, max_feat
ures=sqrt, max_depth=60, bootstrap=False, total= 10.1s
[CV] n_estimators=200, min_samples_split=10, min_samples_leaf=1, max_featu
res=sqrt, max_depth=60, bootstrap=False
[CV] n_estimators=200, min_samples_split=10, min_samples_leaf=1, max_feat
ures=sqrt, max_depth=60, bootstrap=False, total= 9.8s
[CV] n_estimators=200, min_samples_split=10, min_samples_leaf=1, max_featu
res=sqrt, max_depth=60, bootstrap=False
[CV] n_estimators=200, min_samples_split=10, min_samples_leaf=1, max_feat
ures=sqrt, max_depth=60, bootstrap=False, total= 10.2s
[CV] n_estimators=200, min_samples_split=10, min_samples_leaf=1, max_featu
res=sqrt, max_depth=60, bootstrap=False
[CV] n_estimators=200, min_samples_split=10, min_samples_leaf=1, max_feat
ures=sqrt, max_depth=60, bootstrap=False, total= 10.1s
[CV] n_estimators=1000, min_samples_split=2, min_samples_leaf=2, max_featu
res=auto, max_depth=50, bootstrap=False
[CV] n_estimators=1000, min_samples_split=2, min_samples_leaf=2, max_feat
ures=auto, max_depth=50, bootstrap=False, total=25.0min
[CV] n_estimators=1000, min_samples_split=2, min_samples_leaf=2, max_featu
res=auto, max_depth=50, bootstrap=False
[CV] n_estimators=1000, min_samples_split=2, min_samples_leaf=2, max_feat
ures=auto, max_depth=50, bootstrap=False, total=28.5min
localhost:8888/nbconvert/html/Entrega2/ Scikit-Learn.ipynb?download=false 8/11
13/4/2020 Scikit-Learn
[CV] n_estimators=1000, min_samples_split=2, min_samples_leaf=2, max_featu
res=auto, max_depth=50, bootstrap=False
[CV] n_estimators=1000, min_samples_split=2, min_samples_leaf=2, max_feat
ures=auto, max_depth=50, bootstrap=False, total=34.1min
[CV] n_estimators=1000, min_samples_split=2, min_samples_leaf=2, max_featu
res=auto, max_depth=50, bootstrap=False
[CV] n_estimators=1000, min_samples_split=2, min_samples_leaf=2, max_feat
ures=auto, max_depth=50, bootstrap=False, total=30.8min
[CV] n_estimators=1000, min_samples_split=2, min_samples_leaf=2, max_featu
res=auto, max_depth=50, bootstrap=False
[CV] n_estimators=1000, min_samples_split=2, min_samples_leaf=2, max_feat
ures=auto, max_depth=50, bootstrap=False, total=31.1min
[CV] n_estimators=100, min_samples_split=5, min_samples_leaf=2, max_featur
es=sqrt, max_depth=10, bootstrap=True
[CV] n_estimators=100, min_samples_split=5, min_samples_leaf=2, max_featu
res=sqrt, max_depth=10, bootstrap=True, total= 2.2s
[CV] n_estimators=100, min_samples_split=5, min_samples_leaf=2, max_featur
es=sqrt, max_depth=10, bootstrap=True
[CV] n_estimators=100, min_samples_split=5, min_samples_leaf=2, max_featu
res=sqrt, max_depth=10, bootstrap=True, total= 2.2s
[CV] n_estimators=100, min_samples_split=5, min_samples_leaf=2, max_featur
es=sqrt, max_depth=10, bootstrap=True
[CV] n_estimators=100, min_samples_split=5, min_samples_leaf=2, max_featu
res=sqrt, max_depth=10, bootstrap=True, total= 2.1s
[CV] n_estimators=100, min_samples_split=5, min_samples_leaf=2, max_featur
es=sqrt, max_depth=10, bootstrap=True
[CV] n_estimators=100, min_samples_split=5, min_samples_leaf=2, max_featu
res=sqrt, max_depth=10, bootstrap=True, total= 2.1s
[CV] n_estimators=100, min_samples_split=5, min_samples_leaf=2, max_featur
es=sqrt, max_depth=10, bootstrap=True
[CV] n_estimators=100, min_samples_split=5, min_samples_leaf=2, max_featu
res=sqrt, max_depth=10, bootstrap=True, total= 2.1s
[CV] n_estimators=600, min_samples_split=2, min_samples_leaf=4, max_featur
es=auto, max_depth=100, bootstrap=True
[CV] n_estimators=600, min_samples_split=2, min_samples_leaf=4, max_featu
res=auto, max_depth=100, bootstrap=True, total=11.2min
[CV] n_estimators=600, min_samples_split=2, min_samples_leaf=4, max_featur
es=auto, max_depth=100, bootstrap=True
[CV] n_estimators=600, min_samples_split=2, min_samples_leaf=4, max_featu
res=auto, max_depth=100, bootstrap=True, total= 9.5min
[CV] n_estimators=600, min_samples_split=2, min_samples_leaf=4, max_featur
es=auto, max_depth=100, bootstrap=True
[CV] n_estimators=600, min_samples_split=2, min_samples_leaf=4, max_featu
res=auto, max_depth=100, bootstrap=True, total= 9.6min
[CV] n_estimators=600, min_samples_split=2, min_samples_leaf=4, max_featur
es=auto, max_depth=100, bootstrap=True
[CV] n_estimators=600, min_samples_split=2, min_samples_leaf=4, max_featu
res=auto, max_depth=100, bootstrap=True, total= 9.8min
[CV] n_estimators=600, min_samples_split=2, min_samples_leaf=4, max_featur
es=auto, max_depth=100, bootstrap=True
[CV] n_estimators=600, min_samples_split=2, min_samples_leaf=4, max_featu
res=auto, max_depth=100, bootstrap=True, total= 9.0min
[CV] n_estimators=1000, min_samples_split=5, min_samples_leaf=2, max_featu
res=auto, max_depth=50, bootstrap=True
[CV] n_estimators=1000, min_samples_split=5, min_samples_leaf=2, max_feat
ures=auto, max_depth=50, bootstrap=True, total=15.8min
[CV] n_estimators=1000, min_samples_split=5, min_samples_leaf=2, max_featu
res=auto, max_depth=50, bootstrap=True
[CV] n_estimators=1000, min_samples_split=5, min_samples_leaf=2, max_feat
ures=auto, max_depth=50, bootstrap=True, total=16.6min
[CV] n_estimators=1000, min_samples_split=5, min_samples_leaf=2, max_featu
localhost:8888/nbconvert/html/Entrega2/ Scikit-Learn.ipynb?download=false 9/11
13/4/2020 Scikit-Learn
res=auto, max_depth=50, bootstrap=True
[CV] n_estimators=1000, min_samples_split=5, min_samples_leaf=2, max_feat
ures=auto, max_depth=50, bootstrap=True, total=16.4min
[CV] n_estimators=1000, min_samples_split=5, min_samples_leaf=2, max_featu
res=auto, max_depth=50, bootstrap=True
[CV] n_estimators=1000, min_samples_split=5, min_samples_leaf=2, max_feat
ures=auto, max_depth=50, bootstrap=True, total=16.7min
[CV] n_estimators=1000, min_samples_split=5, min_samples_leaf=2, max_featu
res=auto, max_depth=50, bootstrap=True
[CV] n_estimators=1000, min_samples_split=5, min_samples_leaf=2, max_feat
ures=auto, max_depth=50, bootstrap=True, total=15.5min
[Parallel(n_jobs=1)]: Done 50 out of 50 | elapsed: 476.0min finished
Out[15]:
{'n_estimators': 200,
'min_samples_split': 10,
'min_samples_leaf': 1,
'max_features': 'sqrt',
'max_depth': 60,
'bootstrap': False}
In [16]:
rf_random.best_params_
Out[16]:
{'n_estimators': 200,
'min_samples_split': 10,
'min_samples_leaf': 1,
'max_features': 'sqrt',
'max_depth': 60,
'bootstrap': False}
comparamos
In [22]:
clf2 = RandomForestClassifier(n_estimators=200, max_features="sqrt", max_depth=60, min_
samples_split=10, min_samples_leaf=1, bootstrap= False)
clf2 = clf.fit(X_train, Y_train)
localhost:8888/nbconvert/html/Entrega2/ Scikit-Learn.ipynb?download=false 10/11
13/4/2020 Scikit-Learn
In [23]:
test_pred2=clf2.predict(X_test)
print("\nAcierto:", metrics.accuracy_score(test.output, test_pred2))
print(metrics.classification_report(test.output, test_pred2))
Acierto: 0.9382990550305725
precision recall f1-score support
0.0 0.95 0.99 0.97 1642
1.0 0.78 0.41 0.54 157
accuracy 0.94 1799
macro avg 0.86 0.70 0.75 1799
weighted avg 0.93 0.94 0.93 1799
In [ ]:
localhost:8888/nbconvert/html/Entrega2/ Scikit-Learn.ipynb?download=false 11/11