Importing Data ¶
In [ ]: # To check the existing working directory
In [2]: import os
In [4]: os.getcwd()
Out[4]: 'C:\\Users\\rgandyala\\4 Data processing and Stats'
In [6]: import pandas as pd
In [ ]: # To change the working directory
In [6]: import os # The os module provides a portable way of using operating-system-dependent functionality
import pandas as pd
# Change the working directory so that relative file names resolve against this folder
os.chdir("C:\\Users\\rgandyala\\4 Data processing and Stats")
In [5]: # Now place the data set in the working directory and use the code below to import the data
In [7]: csv1 = pd.read_csv("mba.csv")
# Equivalent call using an absolute path (the function name is all lowercase: pd.read_csv):
#pd.read_csv("C:\\Users\\rgandyala\\4 Data processing and Stats\\mba.csv")
In [8]: csv1
...
In [6]: # pd.read_csv is the command used to import the data
Different ways of Importing csv
pd.read_csv("Iris.csv") is used to load our data into python
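pd.read_csv("Iris.csv", skiprows=1) # Skips the first line of the file; the next line is then used as the header
pd.read_csv("Iris.csv", header=1) # Uses the second line (index 1) as the header; the lines before it are ignored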
pd.read_csv("Iris.csv", nrows=2) # Reading only first 2 rows
pd.read_csv("Iris.csv", na_values=["n.a.", "not available"]) # Telling what NA values are to python
pd.read_csv("Iris.csv", parse_dates=['day']) # The 'day' column would otherwise be read as strings; parse_dates converts it to the
datetime data type
In [7]: # if working directory is not set we can access the data directly from any folder
In [9]: import pandas as pd # data frames
import numpy as np # arrays, linear algebra, Fourier transforms
In [10]: data = pd.read_csv("D:\\Course\\Python\\Datasets\\Data.csv")
In [11]: # To view the data
data
...
In [26]: data
#filename['columnname']
...
In [47]: # number of Rows
len(data)
Out[47]: 10
In [48]: # check the number of columns
len(data.columns)
...
In [21]: # To read the column in data set
data["Country"]
...
In [27]: # Viewing Data
data
...
In [28]:
data.head() # Displays first 5 rows -
#data.head(4)
Out[28]: Country Age Salary Purchased
0 France 44.0 72000.0 No
1 Spain 27.0 48000.0 Yes
2 Germany 30.0 54000.0 No
3 Spain 38.0 61000.0 No
4 Germany 40.0 NaN Yes
In [24]: data.tail() # Displays the last 5 rows by default - pass a number to choose how many rows to show
#data.tail(4)
Out[24]: Country Age Salary Purchased
5 France 35.0 58000.0 Yes
6 Spain NaN 52000.0 No
7 France 48.0 79000.0 Yes
8 Germany 50.0 83000.0 No
9 France 37.0 67000.0 Yes
In [26]: data.columns # Names of the columns
Out[26]: Index(['Country', 'Age', 'Salary', 'Purchased'], dtype='object')
In [28]: data.shape # Number of rows and columns
Out[28]: (10, 4)
In [30]: data.values # Displays values of data
...
In [32]: data.dtypes # Data Type of all columns
...
In [33]: data.info() #Information about dataset
...
Data Selection
Pandas has different data Access methods
As usual, we use the indexing operator "[]" and the attribute operator "." for quick and easy access
.loc[] is for label-based indexing - selects rows and columns by their labels (or by a boolean condition)
.iloc[] is for integer-based indexing - selects rows and columns by their integer positions
syntax - data.loc[rows, columns] or data.iloc[rows, columns]
In [29]: data
Out[29]: Country Age Salary Purchased
0 France 44.0 72000.0 No
1 Spain 27.0 48000.0 Yes
2 Germany 30.0 54000.0 No
3 Spain 38.0 61000.0 No
4 Germany 40.0 NaN Yes
5 France 35.0 58000.0 Yes
6 Spain NaN 52000.0 No
7 France 48.0 79000.0 Yes
8 Germany 50.0 83000.0 No
9 France 37.0 67000.0 Yes
In [34]: data.loc[0,"Salary"] #loc[row,columns]
...
In [35]: data.loc[data["Purchased"]=="Yes"]
...
In [30]: data
...
In [37]: X = data.iloc[:, :-1].values
In [38]: X
...
In [39]: # To identify the missing values - True means the value is missing in the data
In [40]: data.isnull()
Out[40]: Country Age Salary Purchased
0 False False False False
1 False False False False
2 False False False False
3 False False False False
4 False False True False
5 False False False False
6 False True False False
7 False False False False
8 False False False False
9 False False False False
In [41]: data.isnull().any()#True - the column has at least one missing value, False - the column has no missing values
...
In [42]: data.isnull().sum()#count of missing values in a column
...
In [50]: # To remove columns we can use the del statement or the drop method
data
Out[50]: Country Age Salary Purchased
0 France 44.0 72000.0 No
1 Spain 27.0 48000.0 Yes
2 Germany 30.0 54000.0 No
3 Spain 38.0 61000.0 No
4 Germany 40.0 NaN Yes
5 France 35.0 58000.0 Yes
6 Spain NaN 52000.0 No
7 France 48.0 79000.0 Yes
8 Germany 50.0 83000.0 No
9 France 37.0 67000.0 Yes
In [51]: del data['Country']
In [52]: data
...
In [53]: # To remove the Row from Data
data.drop(0)  # returns a new DataFrame without the row labelled 0; the result is not assigned, so `data` itself is unchanged
Out[53]: Age Salary Purchased
1 27.0 48000.0 Yes
2 30.0 54000.0 No
3 38.0 61000.0 No
4 40.0 NaN Yes
5 35.0 58000.0 Yes
6 NaN 52000.0 No
7 48.0 79000.0 Yes
8 50.0 83000.0 No
9 37.0 67000.0 Yes
In [56]: # To Remove the columns
data.drop(columns=["Age"], inplace=True)  # remove the "Age" column from `data` in place
In [57]: data
...
In [58]: data = pd.read_csv("D:\\Course\\Python\\Datasets\\Data.csv")
In [59]: # To replace the missing values we can use the fillna method
In [60]: data
Out[60]: Country Age Salary Purchased
0 France 44.0 72000.0 No
1 Spain 27.0 48000.0 Yes
2 Germany 30.0 54000.0 No
3 Spain 38.0 61000.0 No
4 Germany 40.0 NaN Yes
5 France 35.0 58000.0 Yes
6 Spain NaN 52000.0 No
7 France 48.0 79000.0 Yes
8 Germany 50.0 83000.0 No
9 France 37.0 67000.0 Yes
In [61]: data.fillna(12) # All the missing values will be replaced by 12
Out[61]: Country Age Salary Purchased
0 France 44.0 72000.0 No
1 Spain 27.0 48000.0 Yes
2 Germany 30.0 54000.0 No
3 Spain 38.0 61000.0 No
4 Germany 40.0 12.0 Yes
5 France 35.0 58000.0 Yes
6 Spain 12.0 52000.0 No
7 France 48.0 79000.0 Yes
8 Germany 50.0 83000.0 No
9 France 37.0 67000.0 Yes
In [63]: data = pd.read_csv("D:\\Course\\Python\\Datasets\\Data.csv")
data
Out[63]: Country Age Salary Purchased
0 France 44.0 72000.0 No
1 Spain 27.0 48000.0 Yes
2 Germany 30.0 54000.0 No
3 Spain 38.0 61000.0 No
4 Germany 40.0 NaN Yes
5 France 35.0 58000.0 Yes
6 Spain NaN 52000.0 No
7 France 48.0 79000.0 Yes
8 Germany 50.0 83000.0 No
9 France 37.0 67000.0 Yes
In [66]: # Drop every row that contains at least one missing value (use how="all" to drop only rows where all values are missing)
data.dropna()
...
In [68]: data = pd.read_csv("D:\\Course\\Python\\Datasets\\Data.csv")
In [69]: # To Drop the columns of missing values
In [71]: #Drop columns with missing data
data.dropna(axis=1)
...
In [72]: data = pd.read_csv("D:\\Course\\Python\\Datasets\\Data.csv")
In [73]: # To Replace the missing values with mean or median
# numeric_only=True restricts the mean to the numeric columns (Age, Salary); without it,
# pandas 2.0+ raises a TypeError on the text columns (Country, Purchased).
# For the median, use data.median(numeric_only=True) instead.
# The result is assigned back rather than using inplace=True, so the cell reads as a plain transformation.
data = data.fillna(data.mean(numeric_only=True))
In [74]: data
...
In [75]: # we can also replace value with individual Columns
In [76]: data = pd.read_csv("D:\\Course\\Python\\Datasets\\Data.csv")
In [77]: data['Age'] = data['Age'].fillna(data['Age'].mean())
# Assign the result back instead of calling fillna(..., inplace=True) on data['Age']:
# the chained in-place form is deprecated in pandas 2.x and has no effect on `data` under Copy-on-Write.
In [78]: data
...
In [79]: data['Salary'] = data['Salary'].fillna(data['Salary'].mean())
# Assign the result back instead of calling fillna(..., inplace=True) on data['Salary']:
# the chained in-place form is deprecated in pandas 2.x and has no effect on `data` under Copy-on-Write.
In [80]: data
...
In [ ]: