import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt # data visualization
import seaborn as sns # statistical data visualization
%matplotlib inline
from google.colab import files
uploaded = files.upload()
Saving car evaluation(1).csv to car evaluation(1) (2).csv
import io
df = pd.read_csv(io.BytesIO(uploaded['car_evaluation(1).csv']))
df
vhigh vhigh.1 2 2.1 small low unacc
0 vhigh vhigh 2 2 small med unacc
1 vhigh vhigh 2 2 small high unacc
2 vhigh vhigh 2 2 med low unacc
3 vhigh vhigh 2 2 med med unacc
4 vhigh vhigh 2 2 med high unacc
... ... ... ... ... ... ... ...
1722 low low 5more more med med good
1723 low low 5more more med high vgood
1724 low low 5more more big low unacc
1725 low low 5more more big med good
1726 low low 5more more big high vgood
1727 rows × 7 columns
df.shape
(1727, 7)
df.head()
vhigh vhigh.1 2 2.1 small low unacc
0 vhigh vhigh 2 2 small med unacc
1 vhigh vhigh 2 2 small high unacc
2 vhigh vhigh 2 2 med low unacc
3 vhigh vhigh 2 2 med med unacc
4 vhigh vhigh 2 2 med high unacc
col_names = ['buying', 'maint', 'doors', 'persons', 'lug_boot', 'safety', 'class']
df.columns = col_names
col_names
['buying', 'maint', 'doors', 'persons', 'lug_boot', 'safety', 'class']
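#Note: the CSV has no header row, so read_csv promoted the first data record to column names; that is why df holds 1727 rows rather than the 1728 in the original car evaluation data, and why one value per column is undercounted by one in the frequency tables below.
#A sketch of a read that would keep every row, assuming the same uploaded file as above (left commented out so the outputs below still reflect the 1727-row frame):
#df = pd.read_csv(io.BytesIO(uploaded['car_evaluation(1).csv']), header=None, names=col_names)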
df.head()
buying maint doors persons lug_boot safety class
0 vhigh vhigh 2 2 small med unacc
1 vhigh vhigh 2 2 small high unacc
2 vhigh vhigh 2 2 med low unacc
3 vhigh vhigh 2 2 med med unacc
4 vhigh vhigh 2 2 med high unacc
#summary of data set
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1727 entries, 0 to 1726
Data columns (total 7 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 buying 1727 non-null object
1 maint 1727 non-null object
2 doors 1727 non-null object
3 persons 1727 non-null object
4 lug_boot 1727 non-null object
5 safety 1727 non-null object
6 class 1727 non-null object
dtypes: object(7)
memory usage: 94.6+ KB
#Frequency distribution of values in variables
#Now, check the frequency counts of categorical variables.
col_names = ['buying', 'maint', 'doors', 'persons', 'lug_boot', 'safety', 'class']
for col in col_names:
    print(df[col].value_counts())
high 432
med 432
low 432
vhigh 431
Name: buying, dtype: int64
high 432
med 432
low 432
vhigh 431
Name: maint, dtype: int64
3 432
4 432
5more 432
2 431
Name: doors, dtype: int64
4 576
more 576
2 575
Name: persons, dtype: int64
big 576
med 576
small 575
Name: lug_boot, dtype: int64
high 576
med 576
low 575
Name: safety, dtype: int64
unacc 1209
acc 384
good 69
vgood 65
Name: class, dtype: int64
df['class'].value_counts()
unacc 1209
acc 384
good 69
vgood 65
Name: class, dtype: int64
# check missing values in variables
df.isnull().sum()
buying 0
maint 0
doors 0
persons 0
lug_boot 0
safety 0
class 0
dtype: int64
#Declare feature vector and target variable
X = df.drop(['class'], axis=1)
y = df['class']
X
buying maint doors persons lug_boot safety
0 vhigh vhigh 2 2 small med
1 vhigh vhigh 2 2 small high
2 vhigh vhigh 2 2 med low
3 vhigh vhigh 2 2 med med
4 vhigh vhigh 2 2 med high
... ... ... ... ... ... ...
1722 low low 5more more med med
1723 low low 5more more med high
1724 low low 5more more big low
1725 low low 5more more big med
1726 low low 5more more big high
1727 rows × 6 columns
y
0 unacc
1 unacc
2 unacc
3 unacc
4 unacc
...
1722 good
1723 vgood
1724 unacc
1725 good
1726 vgood
Name: class, Length: 1727, dtype: object
#Split data into separate training and test set
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.33, random_state = 42)
# check the shape of X_train and X_test
X_train.shape, X_test.shape
((1157, 6), (570, 6))
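#The class distribution is skewed (unacc alone covers 1209 of 1727 rows), so a stratified split keeps the class proportions consistent across train and test sets; a sketch using train_test_split's stratify parameter (left commented out to preserve the split used below):
#X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42, stratify=y)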
#Feature Engineering
# check data types in X_train
X_train.dtypes
buying object
maint object
doors object
persons object
lug_boot object
safety object
dtype: object
#Encode categorical variables
X_train.head()
buying maint doors persons lug_boot safety
83 vhigh vhigh 5more 2 med low
48 vhigh vhigh 3 more med med
468 high vhigh 3 4 small med
155 vhigh high 3 more med low
1043 med high 4 more small low
!pip install category_encoders
Requirement already satisfied: category_encoders in /usr/local/lib/python3.7/dist-packages
(requirements numpy, patsy, statsmodels, scikit-learn, scipy, pandas, python-dateutil, pytz, six, joblib already satisfied)
# import category encoders
import category_encoders as ce
# encode categorical variables with ordinal encoding
encoder = ce.OrdinalEncoder(cols=['buying', 'maint', 'doors', 'persons', 'lug_boot', 'safety'])
X_train = encoder.fit_transform(X_train)
X_test = encoder.transform(X_test)
/usr/local/lib/python3.7/dist-packages/category_encoders/utils.py:21: FutureWarning: is_categorical is deprecated and will be removed in a future version. Use is_categorical_dtype instead
  elif pd.api.types.is_categorical(cols):
X_train.head()
buying maint doors persons lug_boot safety
83 1 1 1 1 1 1
48 1 1 2 2 1 2
468 2 1 2 3 2 2
155 1 2 2 2 1 1
1043 3 2 3 2 2 1
X_test.head()
buying maint doors persons lug_boot safety
599 2 2 3 1 3 1
932 3 1 3 3 3 1
628 2 2 1 1 3 3
1497 4 2 1 3 1 2
1262 3 4 3 2 1 1
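#The fitted encoder stores the learned category-to-integer mapping; a quick way to inspect it (mapping is the attribute category_encoders uses for this):
for m in encoder.mapping:
    print(m['col'], dict(m['mapping']))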
# import Random Forest classifier
from sklearn.ensemble import RandomForestClassifier
# instantiate the classifier
rfc = RandomForestClassifier(random_state=0)
# fit the model
rfc.fit(X_train, y_train)
RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
criterion='gini', max_depth=None, max_features='auto',
max_leaf_nodes=None, max_samples=None,
min_impurity_decrease=0.0, min_impurity_split=None,
min_samples_leaf=1, min_samples_split=2,
min_weight_fraction_leaf=0.0, n_estimators=100,
n_jobs=None, oob_score=False, random_state=0, verbose=0,
warm_start=False)
# Predict the Test set results
y_pred = rfc.predict(X_test)
# Check accuracy score
from sklearn.metrics import accuracy_score
#print('Model accuracy score with 10 decision-trees : {0:0.4f}'.format(accuracy_score(y_test, y_pred)))
#Here, we build the Random Forest Classifier model with the default parameter of n_estimators.
# instantiate the classifier with n_estimators = 100
rfc_100 = RandomForestClassifier(n_estimators=100, random_state=0)
# fit the model to the training set
rfc_100.fit(X_train, y_train)
RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
criterion='gini', max_depth=None, max_features='auto',
max_leaf_nodes=None, max_samples=None,
min_impurity_decrease=0.0, min_impurity_split=None,
min_samples_leaf=1, min_samples_split=2,
min_weight_fraction_leaf=0.0, n_estimators=100,
n_jobs=None, oob_score=False, random_state=0, verbose=0,
warm_start=False)
# Predict on the test set results
y_pred_100 = rfc_100.predict(X_test)
# Check accuracy score
print('Model accuracy score with 100 decision-trees : {0:0.4f}'.format(accuracy_score(y_test, y_pred_100)))
Model accuracy score with 100 decision-trees : 0.9649
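#To see how the number of trees affects accuracy, a quick sweep can be run; a minimal sketch (the tree counts here are illustrative):
for n in (10, 50, 100, 200):
    model = RandomForestClassifier(n_estimators=n, random_state=0)
    model.fit(X_train, y_train)
    print(n, 'trees:', accuracy_score(y_test, model.predict(X_test)))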
#Find important features with Random Forest model
# create the classifier with n_estimators = 100
clf = RandomForestClassifier(n_estimators=100, random_state=0)
# fit the model to the training set
clf.fit(X_train, y_train)
RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
criterion='gini', max_depth=None, max_features='auto',
max_leaf_nodes=None, max_samples=None,
min_impurity_decrease=0.0, min_impurity_split=None,
min_samples_leaf=1, min_samples_split=2,
min_weight_fraction_leaf=0.0, n_estimators=100,
n_jobs=None, oob_score=False, random_state=0, verbose=0,
warm_start=False)
# view the feature scores
feature_scores = pd.Series(clf.feature_importances_, index=X_train.columns).sort_values(ascending=False)
feature_scores
safety 0.291657
persons 0.235380
buying 0.160692
maint 0.134143
lug_boot 0.111595
doors 0.066533
dtype: float64
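#For larger feature sets, sklearn's SelectFromModel can automate this kind of importance-based selection; a minimal sketch on the already-fitted clf (the 0.1 threshold is illustrative and happens to keep every feature except doors):
from sklearn.feature_selection import SelectFromModel
selector = SelectFromModel(clf, threshold=0.1, prefit=True)
X_train_selected = selector.transform(X_train)
print(X_train_selected.shape)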
# Creating a seaborn bar plot
sns.barplot(x=feature_scores, y=feature_scores.index)
# Add title to the graph
plt.title("Visualizing Important Features")
# Visualize the graph
plt.show()
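#The plot mirrors the scores above: safety and persons dominate while doors contributes least, which motivates dropping doors in the next step.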
#Build Random Forest model on selected features
# declare feature vector and target variable
X = df.drop(['class', 'doors'], axis=1)
y = df['class']
# split data into training and testing sets
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.33, random_state = 42)
X
buying maint persons lug_boot safety
0 vhigh vhigh 2 small med
1 vhigh vhigh 2 small high
2 vhigh vhigh 2 med low
3 vhigh vhigh 2 med med
4 vhigh vhigh 2 med high
... ... ... ... ... ...
1722 low low more med med
1723 low low more med high
1724 low low more big low
1725 low low more big med
1726 low low more big high
1727 rows × 5 columns
#encode categorical variables with ordinal encoding
encoder = ce.OrdinalEncoder(cols=['buying', 'maint', 'persons', 'lug_boot', 'safety'])
X_train = encoder.fit_transform(X_train)
X_test = encoder.transform(X_test)
/usr/local/lib/python3.7/dist-packages/category_encoders/utils.py:21: FutureWarning: is_categorical is deprecated and will be removed in a future version. Use is_categorical_dtype instead
  elif pd.api.types.is_categorical(cols):
# instantiate the classifier (n_estimators defaults to 100)
clf = RandomForestClassifier(random_state=0)
# fit the model to the training set
clf.fit(X_train, y_train)
RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
criterion='gini', max_depth=None, max_features='auto',
max_leaf_nodes=None, max_samples=None,
min_impurity_decrease=0.0, min_impurity_split=None,
min_samples_leaf=1, min_samples_split=2,
min_weight_fraction_leaf=0.0, n_estimators=100,
n_jobs=None, oob_score=False, random_state=0, verbose=0,
warm_start=False)
# Predict on the test set results
y_pred = clf.predict(X_test)
# Check accuracy score
print('Model accuracy score with doors variable removed : {0:0.4f}'.format(accuracy_score(y_test, y_pred)))
Model accuracy score with doors variable removed : 0.9263
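#Accuracy fell from 0.9649 to 0.9263 after removing doors, so even the least important feature carried some signal for this model.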
# Classification Report
#Classification report is another way to evaluate the classification model performance. It displays the precision, recall, f1 and support scores for the model.
#We can print a classification report as follows:-
from sklearn.metrics import classification_report
print(classification_report(y_test, y_pred))
precision recall f1-score support
acc 0.88 0.85 0.86 127
good 0.62 0.56 0.59 18
unacc 0.97 0.97 0.97 399
vgood 0.75 0.81 0.78 26
accuracy 0.93 570
macro avg 0.80 0.80 0.80 570
weighted avg 0.93 0.93 0.93 570
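#When the scores are needed programmatically, classification_report can return a dict instead of a formatted string via its output_dict parameter; a sketch:
report = classification_report(y_test, y_pred, output_dict=True)
print(report['vgood']['f1-score'])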