Pre-Processing
Example – 1:
import pandas as pd

# Raw string literal: in a normal string the "\a" in "\age_salary.csv" is the
# BEL escape character, which silently corrupts this Windows path.
data = pd.read_csv(r"J:\Machine Learning\Class\Practical\Practical_1\age_salary.csv")
print(data.columns)

# Independent variables: all rows of every column except the last one.
X = data.iloc[:, :-1].values
# Dependent variable: all rows of the last column.
Y = data.iloc[:, -1].values

# Bare names display the arrays when run in a notebook / interactive console.
X
Y
Example – 2:
import pandas as pd

# Raw string literal so that no backslash in the Windows path is treated as
# an escape sequence.
dataset = pd.read_csv(r'J:\Machine Learning\Class\Practical\Preprocessing\Data1.csv')
print(dataset.columns)
dataset
dataset.info()
dataset.head()

# Row and column count
dataset.shape

# Remove a column that is not needed. drop() returns a new DataFrame;
# `dataset` itself still contains 'Age'.
dataset_new = dataset.drop(['Age'], axis=1)
dataset_new

# Summary statistics (count, mean, std, min, quartiles, max) of the variables
dataset_new.describe()

# Rename the columns in place.
# NOTE: index=str additionally converts the row labels to strings.
dataset.rename(
    index=str,
    columns={
        'Country': 'Countries',
        'Age': 'age',
        'Salary': 'Sal',
        'Purchased': 'Purchased',
    },
    inplace=True,
)
dataset

# Count missing values per column, largest count first
dataset.isnull().sum().sort_values(ascending=False)

# Show the first rows that contain at least one missing value
dataset[dataset.isnull().any(axis=1)].head()

# Remove the rows with missing values. The explicit copy() makes ds_new an
# independent DataFrame, so the column assignment further down cannot trigger
# pandas' SettingWithCopyWarning.
ds_new = dataset.dropna().copy()
ds_new
ds_new.isnull().sum().sort_values(ascending=False)

# Check the data type of every column
ds_new.dtypes

# Convert 'age' to integer (safe only because the missing values are gone)
ds_new['age'] = ds_new['age'].astype('int64')
ds_new.dtypes
Example – 3:
import pandas as pd

# Raw string literal so that no backslash in the Windows path is treated as
# an escape sequence.
data = pd.read_csv(r'J:\Machine Learning\Class\Practical\Preprocessing\Book1.csv')

# Slice the first 5 rows, then select the 'Salary' column
print(data[0:5]['Salary'])

# Multi-axes indexing with .loc: all rows, a list of column labels
print(data.loc[:, ['Salary', 'Name']])

# .loc with an explicit list of row labels
print(data.loc[[1, 3, 5], ['Salary', 'Name']])

# .loc with a label slice; unlike positional slicing, both endpoints
# (2 and 6) are included
print(data.loc[2:6, ['Salary', 'Name']])
print(data.loc[:, ['Salary', 'Name']])
Example – 4:
import pandas as pd

# Raw string literal so that no backslash in the Windows path is treated as
# an escape sequence.
dataset = pd.read_csv(r'J:\Machine Learning\Class\Practical\Preprocessing\Data1.csv')
print(dataset.columns)
dataset
dataset.info()

# Independent variables: all rows of every column except the last one
X = dataset.iloc[:, :-1].values
# Dependent variable: all rows of the last column
Y = dataset.iloc[:, -1].values
X
Y

# Replace missing values in columns 1 and 2 with the column mean.
# sklearn.preprocessing.Imputer has been removed from scikit-learn;
# SimpleImputer is its replacement (it treats NaN as missing by default and
# always works column-wise).
from sklearn.impute import SimpleImputer
imputer = SimpleImputer(strategy='mean')
imputer.fit(X[:, 1:3])
X[:, 1:3] = imputer.transform(X[:, 1:3])
X

# Encoding categorical data
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
# Encode the values of column Country (column 0) as integer codes
labelencoder_X = LabelEncoder()
X[:, 0] = labelencoder_X.fit_transform(X[:, 0])
# OneHotEncoder no longer accepts categorical_features; selecting the column
# is now done with a ColumnTransformer. The one-hot columns come first and the
# untouched columns follow (remainder='passthrough'); sparse_threshold=0
# forces a dense array, matching the former .toarray() call.
onehotencoder = ColumnTransformer(
    [('country', OneHotEncoder(), [0])],
    remainder='passthrough',
    sparse_threshold=0,
)
X = onehotencoder.fit_transform(X).astype(float)
X
labelencoder_y = LabelEncoder()
Y = labelencoder_y.fit_transform(Y)
Y

# Splitting the data into training and test data
# (sklearn.cross_validation was replaced by sklearn.model_selection)
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    X, Y, test_size=0.2, random_state=0
)

# Feature scaling: fit the scaler on the training data only, then apply the
# same mean / standard deviation to the test data. Re-fitting on the test set
# would scale the two sets differently.
from sklearn.preprocessing import StandardScaler
sc_X = StandardScaler()
X_train = sc_X.fit_transform(X_train)
X_test = sc_X.transform(X_test)
X_train
X_test