Pandas DataFrame Operations
In [1]: import numpy as np
import pandas as pd
loc and iloc indexers
Note:
-> Use loc to select rows and columns by their labels (label-based); it also accepts a boolean mask
-> Use iloc to select rows and columns by their integer positions (position-based)
In [2]: # Creating a dataframe of random values using rand() function
df=pd.DataFrame(np.random.rand(15, 5), index=np.arange(15), columns=['A','B','C','D','E']
df
Out[2]: A B C D E
0 0.155607 0.755827 0.969633 0.868001 0.465959
1 0.960138 0.830459 0.866927 0.451456 0.996912
2 0.359386 0.285517 0.384530 0.762477 0.054584
3 0.518195 0.763627 0.634155 0.446701 0.525050
4 0.724831 0.595158 0.567885 0.443769 0.003731
5 0.653767 0.135068 0.617750 0.916386 0.730770
6 0.361150 0.114477 0.990422 0.338284 0.682234
7 0.849273 0.988311 0.410418 0.094562 0.196757
8 0.057593 0.107239 0.958606 0.293883 0.473595
9 0.433382 0.983331 0.904262 0.710168 0.093863
10 0.742669 0.227531 0.149279 0.006128 0.491768
11 0.639688 0.096979 0.877697 0.463407 0.309152
12 0.711802 0.031837 0.337505 0.147178 0.677049
13 0.229736 0.215737 0.795375 0.613702 0.369378
14 0.841418 0.227723 0.659610 0.575109 0.613839
In [3]: # Access dataframe elements with an argument
df.loc[(df['C']<0.3)]
Out[3]: A B C D E
10 0.742669 0.227531 0.149279 0.006128 0.491768
In [4]: # Another method to access dataframe elements with an argument
df.loc[(df['C']<0.4) & (df['D']>0.1)]
Out[4]: A B C D E
2 0.359386 0.285517 0.384530 0.762477 0.054584
12 0.711802 0.031837 0.337505 0.147178 0.677049
In [5]: # Access only columns of a dataframe
df.loc[:,['C','D']]
Out[5]: C D
0 0.969633 0.868001
1 0.866927 0.451456
2 0.384530 0.762477
3 0.634155 0.446701
4 0.567885 0.443769
5 0.617750 0.916386
6 0.990422 0.338284
7 0.410418 0.094562
8 0.958606 0.293883
9 0.904262 0.710168
10 0.149279 0.006128
11 0.877697 0.463407
12 0.337505 0.147178
13 0.795375 0.613702
14 0.659610 0.575109
In [6]: # Access only rows of a dataframe
df.loc[[6,8],:]
Out[6]: A B C D E
6 0.361150 0.114477 0.990422 0.338284 0.682234
8 0.057593 0.107239 0.958606 0.293883 0.473595
In [7]: # How to drop a specific column of data frame
df.drop(['E'], axis=1)
Out[7]: A B C D
0 0.155607 0.755827 0.969633 0.868001
1 0.960138 0.830459 0.866927 0.451456
2 0.359386 0.285517 0.384530 0.762477
3 0.518195 0.763627 0.634155 0.446701
4 0.724831 0.595158 0.567885 0.443769
5 0.653767 0.135068 0.617750 0.916386
6 0.361150 0.114477 0.990422 0.338284
7 0.849273 0.988311 0.410418 0.094562
8 0.057593 0.107239 0.958606 0.293883
9 0.433382 0.983331 0.904262 0.710168
10 0.742669 0.227531 0.149279 0.006128
11 0.639688 0.096979 0.877697 0.463407
12 0.711802 0.031837 0.337505 0.147178
13 0.229736 0.215737 0.795375 0.613702
14 0.841418 0.227723 0.659610 0.575109
In [8]: # How to drop a specific row of data frame
df.drop([5])
Out[8]: A B C D E
0 0.155607 0.755827 0.969633 0.868001 0.465959
1 0.960138 0.830459 0.866927 0.451456 0.996912
2 0.359386 0.285517 0.384530 0.762477 0.054584
3 0.518195 0.763627 0.634155 0.446701 0.525050
4 0.724831 0.595158 0.567885 0.443769 0.003731
6 0.361150 0.114477 0.990422 0.338284 0.682234
7 0.849273 0.988311 0.410418 0.094562 0.196757
8 0.057593 0.107239 0.958606 0.293883 0.473595
9 0.433382 0.983331 0.904262 0.710168 0.093863
10 0.742669 0.227531 0.149279 0.006128 0.491768
11 0.639688 0.096979 0.877697 0.463407 0.309152
12 0.711802 0.031837 0.337505 0.147178 0.677049
13 0.229736 0.215737 0.795375 0.613702 0.369378
14 0.841418 0.227723 0.659610 0.575109 0.613839
In [9]: # df is unchanged: the drop() calls above returned new dataframes instead of modifying df
df
Out[9]: A B C D E
0 0.155607 0.755827 0.969633 0.868001 0.465959
1 0.960138 0.830459 0.866927 0.451456 0.996912
2 0.359386 0.285517 0.384530 0.762477 0.054584
3 0.518195 0.763627 0.634155 0.446701 0.525050
4 0.724831 0.595158 0.567885 0.443769 0.003731
5 0.653767 0.135068 0.617750 0.916386 0.730770
6 0.361150 0.114477 0.990422 0.338284 0.682234
7 0.849273 0.988311 0.410418 0.094562 0.196757
8 0.057593 0.107239 0.958606 0.293883 0.473595
9 0.433382 0.983331 0.904262 0.710168 0.093863
10 0.742669 0.227531 0.149279 0.006128 0.491768
11 0.639688 0.096979 0.877697 0.463407 0.309152
12 0.711802 0.031837 0.337505 0.147178 0.677049
13 0.229736 0.215737 0.795375 0.613702 0.369378
14 0.841418 0.227723 0.659610 0.575109 0.613839
In [10]: # To change origianl dataframe, use copy function or assign a variable
df=df.drop(['D', 'E'], axis=1) # For dropping columns
df=df.drop([3,5,8,11,14]) # For dropping rows
In [11]: # df now holds the reassigned result: columns A-C only, without rows 3, 5, 8, 11 and 14
df
Out[11]: A B C
0 0.155607 0.755827 0.969633
1 0.960138 0.830459 0.866927
2 0.359386 0.285517 0.384530
4 0.724831 0.595158 0.567885
6 0.361150 0.114477 0.990422
7 0.849273 0.988311 0.410418
9 0.433382 0.983331 0.904262
10 0.742669 0.227531 0.149279
12 0.711802 0.031837 0.337505
13 0.229736 0.215737 0.795375
In [12]: # Now reset index, but it create a new index column
df.reset_index()
Out[12]: index A B C
0 0 0.155607 0.755827 0.969633
1 1 0.960138 0.830459 0.866927
2 2 0.359386 0.285517 0.384530
3 4 0.724831 0.595158 0.567885
4 6 0.361150 0.114477 0.990422
5 7 0.849273 0.988311 0.410418
6 9 0.433382 0.983331 0.904262
7 10 0.742669 0.227531 0.149279
8 12 0.711802 0.031837 0.337505
9 13 0.229736 0.215737 0.795375
In [13]: # To resolve above issue
df.reset_index(drop=True, inplace=True)
In [14]: # The index now runs 0-9 and no extra 'index' column was added
df
Out[14]: A B C
0 0.155607 0.755827 0.969633
1 0.960138 0.830459 0.866927
2 0.359386 0.285517 0.384530
3 0.724831 0.595158 0.567885
4 0.361150 0.114477 0.990422
5 0.849273 0.988311 0.410418
6 0.433382 0.983331 0.904262
7 0.742669 0.227531 0.149279
8 0.711802 0.031837 0.337505
9 0.229736 0.215737 0.795375
In [15]: # Access specific value of that particular index
df.iloc[[0,4]]
Out[15]: A B C
0 0.155607 0.755827 0.969633
4 0.361150 0.114477 0.990422
In [16]: # Anoter method to access specific value of that particular index
df.iloc[[0,1],[1,2]]
Out[16]: B C
0 0.755827 0.969633
1 0.830459 0.866927
In [17]: df
Out[17]: A B C
0 0.155607 0.755827 0.969633
1 0.960138 0.830459 0.866927
2 0.359386 0.285517 0.384530
3 0.724831 0.595158 0.567885
4 0.361150 0.114477 0.990422
5 0.849273 0.988311 0.410418
6 0.433382 0.983331 0.904262
7 0.742669 0.227531 0.149279
8 0.711802 0.031837 0.337505
9 0.229736 0.215737 0.795375
In [18]: # Check which values of column C are null: isnull() returns a boolean Series and does not modify df
df['C'].isnull()
Out[18]: 0 False
1 False
2 False
3 False
4 False
5 False
6 False
7 False
8 False
9 False
Name: C, dtype: bool
In [19]: # Another Null all values of C column, permenent
df['C']=None
df
Out[19]: A B C
0 0.155607 0.755827 None
1 0.960138 0.830459 None
2 0.359386 0.285517 None
3 0.724831 0.595158 None
4 0.361150 0.114477 None
5 0.849273 0.988311 None
6 0.433382 0.983331 None
7 0.742669 0.227531 None
8 0.711802 0.031837 None
9 0.229736 0.215737 None
Drop NaN Value
In [20]: # Read superhero.csv (expected in the working directory) into a dataframe
df2 = pd.read_csv('superhero.csv')
df2
Out[20]: Name Toy Born
0 Superman NaN NaN
1 Batman Batmobile 1940-04-25
2 Catwoman NaN NaN
In [21]: # Drop NaN Values but original not changed
df2.dropna() #OR "df.dropna(axis=0)"
Out[21]: Name Toy Born
1 Batman Batmobile 1940-04-25
In [22]: # Drop every column that contains a NaN; a new dataframe is returned and df2 is not changed
df2.dropna(axis='columns') # same as df2.dropna(axis=1)
Out[22]: Name
0 Superman
1 Batman
2 Catwoman
In [23]: # Permanent drop NaN Values
df2.dropna(inplace=True)
df2
Out[23]: Name Toy Born
1 Batman Batmobile 1940-04-25
Drop Duplicates
In [24]: # Read iceshop.csv (expected in the working directory) into a dataframe
df3 = pd.read_csv('iceshop.csv')
df3
Out[24]: Brand Style Rating
0 Kulfa Cup 4.0
1 Kulfa Cup 4.0
2 Praline Pack 3.5
3 Mango Cup 4.5
4 Mango Cup 4.5
5 Chocolate Pack 4.0
In [25]: # Drop duplicates but original not changed
df3.drop_duplicates()
Out[25]: Brand Style Rating
0 Kulfa Cup 4.0
2 Praline Pack 3.5
3 Mango Cup 4.5
5 Chocolate Pack 4.0
In [26]: # Drop duplicates from specific subset but original not changed
df3.drop_duplicates(subset=['Brand'])
Out[26]: Brand Style Rating
0 Kulfa Cup 4.0
2 Praline Pack 3.5
3 Mango Cup 4.5
5 Chocolate Pack 4.0
In [27]: # Permanent drop duplicates
df3.drop_duplicates(inplace=True)
df3
Out[27]: Brand Style Rating
0 Kulfa Cup 4.0
2 Praline Pack 3.5
3 Mango Cup 4.5
5 Chocolate Pack 4.0
In [28]: # Check the dataframe shape as (rows, columns)
df3.shape
Out[28]: (4, 3)
In [29]: df3.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 4 entries, 0 to 5
Data columns (total 3 columns):
# Column Non-Null Count Dtype
0 Brand 4 non-null object
1 Style 4 non-null object
2 Rating 4 non-null float64
dtypes: float64(1), object(2)
memory usage: 128.0+ bytes