Pandas Cheat Sheet for Beginners
In [ ]: # Importing Pandas
import pandas as pd
import warnings
warnings.filterwarnings("ignore")
# Creating a sample DataFrame
data = {'Name': ['Alice', 'Bob', 'Charlie', 'David', 'Edward'],
        'Age': [24, 27, 22, 32, 29],
        'City': ['New York', 'Los Angeles', 'Chicago', 'Houston', 'Phoenix'],
        'Salary': [70000, 80000, 65000, 90000, 75000]}
df = pd.DataFrame(data)
df
Out[ ]: Name Age City Salary
0 Alice 24 New York 70000
1 Bob 27 Los Angeles 80000
2 Charlie 22 Chicago 65000
3 David 32 Houston 90000
4 Edward 29 Phoenix 75000
Basic Operations
In [ ]: # Viewing the first few rows of the DataFrame
df.head() # Default shows the first 5 rows, but you can specify the number of rows as an argument
Out[ ]: Name Age City Salary
0 Alice 24 New York 70000
1 Bob 27 Los Angeles 80000
2 Charlie 22 Chicago 65000
3 David 32 Houston 90000
4 Edward 29 Phoenix 75000
In [ ]: # Viewing the last few rows of the DataFrame
df.tail() # Default shows the last 5 rows, but you can specify the number of rows as an argument
Out[ ]: Name Age City Salary
0 Alice 24 New York 70000
1 Bob 27 Los Angeles 80000
2 Charlie 22 Chicago 65000
3 David 32 Houston 90000
4 Edward 29 Phoenix 75000
In [ ]: # Getting the DataFrame's shape
df.shape # Returns a tuple representing the dimensionality of the DataFrame
Out[ ]: (5, 4)
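Because shape is a plain tuple, it can be unpacked directly; a quick sketch:
In [ ]: # Unpacking the shape tuple
rows, cols = df.shape
print(f"{rows} rows x {cols} columns")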
In [ ]: # Getting summary information about the DataFrame
df.info() # Provides a concise summary of the DataFrame, including the data types and non-null values
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5 entries, 0 to 4
Data columns (total 4 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 Name 5 non-null object
1 Age 5 non-null int64
2 City 5 non-null object
3 Salary 5 non-null int64
dtypes: int64(2), object(2)
memory usage: 292.0+ bytes
In [ ]: # Descriptive statistics of the DataFrame
df.describe() # Generates descriptive statistics that summarize the central tendency, dispersion, and shape of the DataFrame's distribution
Out[ ]: Age Salary
count 5.000000 5.000000
mean 26.800000 76000.000000
std 3.962323 9617.692031
min 22.000000 65000.000000
25% 24.000000 70000.000000
50% 27.000000 75000.000000
75% 29.000000 80000.000000
max 32.000000 90000.000000
Selecting Data
In [ ]: # Selecting a single column
df['Name'] # Returns a Series
Out[ ]: 0 Alice
1 Bob
2 Charlie
3 David
4 Edward
Name: Name, dtype: object
In [ ]: # Selecting multiple columns
df[['Name', 'City']] # Returns a DataFrame with specified columns
Out[ ]: Name City
0 Alice New York
1 Bob Los Angeles
2 Charlie Chicago
3 David Houston
4 Edward Phoenix
In [ ]: # Selecting rows by index labels
df.loc[0] # Selects the row with index label 0
Out[ ]: Name Alice
Age 24
City New York
Salary 70000
Name: 0, dtype: object
In [ ]: # Selecting rows by index positions
df.iloc[0] # Selects the first row
Out[ ]: Name Alice
Age 24
City New York
Salary 70000
Name: 0, dtype: object
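The key difference shows up when slicing: loc slices by label and includes the end label, while iloc slices by position and excludes the end position. A short sketch:
In [ ]: # Label slicing vs positional slicing
df.loc[1:3] # Rows with index labels 1, 2 and 3 (end label included)
df.iloc[1:3] # Rows at positions 1 and 2 only (end position excluded)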
In [ ]: # Conditional selection
df[df['Age'] > 25] # Returns a DataFrame where the age is greater than 25
Out[ ]: Name Age City Salary
1 Bob 27 Los Angeles 80000
3 David 32 Houston 90000
4 Edward 29 Phoenix 75000
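Conditions can be combined with & (and) and | (or); each condition must be wrapped in parentheses. A brief sketch:
In [ ]: # Combining multiple conditions
df[(df['Age'] > 25) & (df['Salary'] < 90000)] # Age greater than 25 AND salary below 90000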
Modifying Data
In [ ]: # Adding a new column
df['Experience'] = [2, 5, 1, 7, 3] # Adds a new column 'Experience' to the DataFrame
df
Out[ ]: Name Age City Salary Experience
0 Alice 24 New York 70000 2
1 Bob 27 Los Angeles 80000 5
2 Charlie 22 Chicago 65000 1
3 David 32 Houston 90000 7
4 Edward 29 Phoenix 75000 3
In [ ]: # Modifying an existing column
df['Salary'] = df['Salary'] + 5000 # Increases each salary by 5000
df
Out[ ]: Name Age City Salary Experience
0 Alice 24 New York 75000 2
1 Bob 27 Los Angeles 85000 5
2 Charlie 22 Chicago 70000 1
3 David 32 Houston 95000 7
4 Edward 29 Phoenix 80000 3
In [ ]: # Dropping a column
df.drop('Experience', axis=1, inplace=True) # Drops the 'Experience' column
df
Out[ ]: Name Age City Salary
0 Alice 24 New York 75000
1 Bob 27 Los Angeles 85000
2 Charlie 22 Chicago 70000
3 David 32 Houston 95000
4 Edward 29 Phoenix 80000
In [ ]: # Renaming columns
df.rename(columns={'Name': 'Employee Name', 'City': 'Location'}, inplace=True) # Renames columns
df
Out[ ]: Employee Name Age Location Salary
0 Alice 24 New York 75000
1 Bob 27 Los Angeles 85000
2 Charlie 22 Chicago 70000
3 David 32 Houston 95000
4 Edward 29 Phoenix 80000
Group Operations
In [ ]: # Grouping data by a column and calculating the mean
grouped = df.groupby('Location')['Age'].mean() # Groups by 'Location' and calculates the mean age for each group
grouped
Out[ ]: Location
Chicago 22.0
Houston 32.0
Los Angeles 27.0
New York 24.0
Phoenix 29.0
Name: Age, dtype: float64
In [ ]: # Grouping data by multiple columns
grouped_multi = df.groupby(['Location', 'Age']).size() # Groups by 'Location' and 'Age' and counts the number of occurrences for each group
grouped_multi
Out[ ]: Location Age
Chicago 22 1
Houston 32 1
Los Angeles 27 1
New York 24 1
Phoenix 29 1
dtype: int64
Searching Data
In [ ]: # Searching for specific values
df[df['Employee Name'].str.contains('A')] # Returns rows where 'Employee Name' contains an uppercase 'A' (the search is case-sensitive)
Out[ ]: Employee Name Age Location Salary
0 Alice 24 New York 75000
In [ ]: # Advanced searching with regular expressions
df[df['Employee Name'].str.contains('^A', regex=True)] # Returns rows where 'Employee Name' starts with 'A'
Out[ ]: Employee Name Age Location Salary
0 Alice 24 New York 75000
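By default str.contains() is case-sensitive; passing case=False makes the search case-insensitive. A small sketch:
In [ ]: # Case-insensitive search
df[df['Employee Name'].str.contains('a', case=False)] # Matches names containing 'a' or 'A'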
Statistical Functions
In [ ]: # Calculating the mean of a column
df['Salary'].mean() # Returns the mean of the 'Salary' column
Out[ ]: 81000.0
In [ ]: # Calculating the median of a column
df['Salary'].median() # Returns the median of the 'Salary' column
Out[ ]: 80000.0
In [ ]: # Calculating the standard deviation of a column
df['Salary'].std() # Returns the standard deviation of the 'Salary' column
Out[ ]: 9617.692030835673
In [ ]: # Calculating the correlation between columns
df[['Salary', 'Age']].corr() # Returns the correlation matrix for the DataFrame
Out[ ]: Salary Age
Salary 1.000000 0.924993
Age 0.924993 1.000000
In [ ]: # Value counts for a column
df['Location'].value_counts() # Returns the counts of unique values in the 'Location' column
Out[ ]: Location
New York 1
Los Angeles 1
Chicago 1
Houston 1
Phoenix 1
Name: count, dtype: int64
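value_counts() can also report relative frequencies instead of raw counts; a quick sketch:
In [ ]: # Relative frequencies instead of counts
df['Location'].value_counts(normalize=True) # Each unique value as a fraction of the total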
Applying Functions
In [ ]: # Applying a function to a column
df['Salary'].apply(lambda x: x / 1000) # Converts 'Salary' from dollars to thousands
Out[ ]: 0 75.0
1 85.0
2 70.0
3 95.0
4 80.0
Name: Salary, dtype: float64
In [ ]: # Applying a function to each row
df.apply(lambda row: row['Age'] * 2, axis=1) # Doubles the 'Age' of each employee
Out[ ]: 0 48
1 54
2 44
3 64
4 58
dtype: int64
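Row-wise apply (axis=1) is most useful when a calculation needs several columns at once; a small sketch using an illustrative salary-to-age ratio:
In [ ]: # Combining multiple columns per row
df.apply(lambda row: row['Salary'] / row['Age'], axis=1) # Salary-to-age ratio for each employee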
Merging and Joining Data
In [ ]: # Creating another sample DataFrame for merging
data2 = {'Name': ['Alice', 'Bob', 'Charlie', 'Fiona'],
         'Department': ['HR', 'Engineering', 'Marketing', 'Finance']}
df2 = pd.DataFrame(data2)
df2
Out[ ]: Name Department
0 Alice HR
1 Bob Engineering
2 Charlie Marketing
3 Fiona Finance
In [ ]: # Merging DataFrames on a common column
merged_df = pd.merge(df, df2, left_on='Employee Name', right_on='Name', how='inner') # Merges on 'Employee Name' and 'Name'
merged_df
Out[ ]: Employee Name Age Location Salary Name Department
0 Alice 24 New York 75000 Alice HR
1 Bob 27 Los Angeles 85000 Bob Engineering
2 Charlie 22 Chicago 70000 Charlie Marketing
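With how='inner', only names present in both DataFrames are kept (David, Edward and Fiona are dropped above). A left join keeps every row of the left DataFrame and fills unmatched columns with NaN; a quick sketch:
In [ ]: # Left join: keep all rows from df
pd.merge(df, df2, left_on='Employee Name', right_on='Name', how='left') # Unmatched employees get NaN in 'Department'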
In [ ]: # Concatenating DataFrames
concat_df = pd.concat([df, df2], axis=1) # Concatenates DataFrames along columns
concat_df
Out[ ]: Employee Name Age Location Salary Name Department
0 Alice 24 New York 75000 Alice HR
1 Bob 27 Los Angeles 85000 Bob Engineering
2 Charlie 22 Chicago 70000 Charlie Marketing
3 David 32 Houston 95000 Fiona Finance
4 Edward 29 Phoenix 80000 NaN NaN
Handling Missing Data
In [ ]: # Creating a DataFrame with missing values
import numpy as np
data_with_nan = {'Name': ['Alice', 'Bob', 'Charlie', 'David', 'Edward'],
                 'Age': [24, 27, 22, np.nan, 29],
                 'City': ['New York', 'Los Angeles', 'Chicago', np.nan, 'Phoenix'],
                 'Salary': [70000, 80000, 65000, 90000, np.nan]}
df_nan = pd.DataFrame(data_with_nan)
df_nan
Out[ ]: Name Age City Salary
0 Alice 24.0 New York 70000.0
1 Bob 27.0 Los Angeles 80000.0
2 Charlie 22.0 Chicago 65000.0
3 David NaN NaN 90000.0
4 Edward 29.0 Phoenix NaN
In [ ]: # Checking for missing values
df_nan.isnull() # Returns a DataFrame of booleans indicating if each value is missing
Out[ ]: Name Age City Salary
0 False False False False
1 False False False False
2 False False False False
3 False True True False
4 False False False True
In [ ]: # Dropping rows with missing values
df_nan.dropna()
Out[ ]: Name Age City Salary
0 Alice 24.0 New York 70000.0
1 Bob 27.0 Los Angeles 80000.0
2 Charlie 22.0 Chicago 65000.0
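dropna() can be limited to particular columns with the subset argument; a short sketch:
In [ ]: # Dropping rows only when specific columns are missing
df_nan.dropna(subset=['Age']) # Drops the row with a missing 'Age' but keeps the row with a missing 'Salary'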
In [ ]: # Filling missing values
df_nan.fillna({'Age': df_nan['Age'].mean(), 'City': 'Unknown', 'Salary': df_nan['Salary'].median()})
Out[ ]: Name Age City Salary
0 Alice 24.0 New York 70000.0
1 Bob 27.0 Los Angeles 80000.0
2 Charlie 22.0 Chicago 65000.0
3 David 25.5 Unknown 90000.0
4 Edward 29.0 Phoenix 75000.0
Advanced Querying
In [ ]: # Querying using a boolean mask
mask = df['Age'] > 25
df[mask] # Returns rows where 'Age' is greater than 25
Out[ ]: Employee Name Age Location Salary
1 Bob 27 Los Angeles 85000
3 David 32 Houston 95000
4 Edward 29 Phoenix 80000
In [ ]: # Using the query() method for complex conditions
df.query('Age > 25 and Salary > 75000') # Returns rows where 'Age' is greater than 25 and 'Salary' is greater than 75000
Out[ ]: Employee Name Age Location Salary
1 Bob 27 Los Angeles 85000
3 David 32 Houston 95000
4 Edward 29 Phoenix 80000
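query() can also reference local Python variables using the @ prefix; a brief sketch with an illustrative threshold variable:
In [ ]: # Referencing a local variable inside query()
min_age = 25 # Illustrative threshold
df.query('Age > @min_age and Salary > 75000')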
Advanced Statistics
In [ ]: # Calculating the cumulative sum of a column
df['Cumulative Salary'] = df['Salary'].cumsum() # Adds a column with the cumulative sum of 'Salary'
df
Out[ ]: Employee Name Age Location Salary Cumulative Salary
0 Alice 24 New York 75000 75000
1 Bob 27 Los Angeles 85000 160000
2 Charlie 22 Chicago 70000 230000
3 David 32 Houston 95000 325000
4 Edward 29 Phoenix 80000 405000
In [ ]: # Calculating the rolling mean of a column
df['Rolling Mean Salary'] = df['Salary'].rolling(window=2).mean() # Adds a column with the rolling mean of 'Salary' over a window of 2 rows
df
Out[ ]: Employee Name Age Location Salary Cumulative Salary Rolling Mean Salary
0 Alice 24 New York 75000 75000 NaN
1 Bob 27 Los Angeles 85000 160000 80000.0
2 Charlie 22 Chicago 70000 230000 77500.0
3 David 32 Houston 95000 325000 82500.0
4 Edward 29 Phoenix 80000 405000 87500.0
In [ ]: # Calculating the expanding mean of a column
df['Expanding Mean Salary'] = df['Salary'].expanding(min_periods=2).mean() # Adds a column with the expanding mean of 'Salary', starting once at least 2 values are available
df
Out[ ]: Employee Name Age Location Salary Cumulative Salary Rolling Mean Salary Expanding Mean Salary
0 Alice 24 New York 75000 75000 NaN NaN
1 Bob 27 Los Angeles 85000 160000 80000.0 80000.000000
2 Charlie 22 Chicago 70000 230000 77500.0 76666.666667
3 David 32 Houston 95000 325000 82500.0 81250.000000
4 Edward 29 Phoenix 80000 405000 87500.0 81000.000000
Advanced Grouping
In [ ]: # Applying multiple aggregate functions
grouped_agg = df.groupby('Location').agg({'Age': ['mean', 'max'], 'Salary': ['sum', 'mean']}) # Groups by 'Location' and applies multiple aggregate functions
grouped_agg
Out[ ]: Age Salary
mean max sum mean
Location
Chicago 22.0 22 70000 70000.0
Houston 32.0 32 95000 95000.0
Los Angeles 27.0 27 85000 85000.0
New York 24.0 24 75000 75000.0
Phoenix 29.0 29 80000 80000.0
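Named aggregation produces flat, readable column names instead of a MultiIndex; a small sketch:
In [ ]: # Named aggregation: each output column gets an explicit name
df.groupby('Location').agg(avg_age=('Age', 'mean'), total_salary=('Salary', 'sum'))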
In [ ]: # Grouping by a column and applying a custom function
def custom_function(x):
    return x.max() - x.min()
grouped_custom = df.groupby('Location')['Salary'].apply(custom_function) # Groups by 'Location' and applies a custom function to 'Salary'
grouped_custom
Out[ ]: Location
Chicago 0
Houston 0
Los Angeles 0
New York 0
Phoenix 0
Name: Salary, dtype: int64
Advanced Filtering
In [ ]: # Filtering using a custom function
def filter_function(x):
    return x['Age'].mean() > 25
filtered = df.groupby('Location').filter(filter_function) # Filters groups where the mean 'Age' is greater than 25
filtered
Out[ ]: Employee Name Age Location Salary Cumulative Salary Rolling Mean Salary Expanding Mean Salary
1 Bob 27 Los Angeles 85000 160000 80000.0 80000.0
3 David 32 Houston 95000 325000 82500.0 81250.0
4 Edward 29 Phoenix 80000 405000 87500.0 81000.0
In [ ]: # Filtering with the where() method
df.where(df['Salary'] > 75000, other=0) # Replaces values where 'Salary' is not greater than 75000 with 0
Out[ ]: Employee Name Age Location Salary Cumulative Salary Rolling Mean Salary Expanding Mean Salary
0 0 0 0 0 0 0.0 0.0
1 Bob 27 Los Angeles 85000 160000 80000.0 80000.0
2 0 0 0 0 0 0.0 0.0
3 David 32 Houston 95000 325000 82500.0 81250.0
4 Edward 29 Phoenix 80000 405000 87500.0 81000.0
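mask() is the inverse of where(): it replaces values where the condition is True; a quick sketch:
In [ ]: # mask() replaces values where the condition holds
df.mask(df['Salary'] > 75000, other=0) # Zeroes out rows where 'Salary' is greater than 75000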
DataFrame Styling
In [ ]: # Highlighting maximum values in a DataFrame
df.style.highlight_max(axis=0) # Highlights the maximum values in each column
Out[ ]: Employee Name Age Location Salary Cumulative Salary Rolling Mean Salary Expanding Mean Salary
0 Alice 24 New York 75000 75000 nan nan
1 Bob 27 Los Angeles 85000 160000 80000.000000 80000.000000
2 Charlie 22 Chicago 70000 230000 77500.000000 76666.666667
3 David 32 Houston 95000 325000 82500.000000 81250.000000
4 Edward 29 Phoenix 80000 405000 87500.000000 81000.000000
In [ ]: # Applying a gradient based on values
df.style.background_gradient(cmap='viridis') # Applies a color gradient based on values
Out[ ]: Employee Name Age Location Salary Cumulative Salary Rolling Mean Salary Expanding Mean Salary
0 Alice 24 New York 75000 75000 nan nan
1 Bob 27 Los Angeles 85000 160000 80000.000000 80000.000000
2 Charlie 22 Chicago 70000 230000 77500.000000 76666.666667
3 David 32 Houston 95000 325000 82500.000000 81250.000000
4 Edward 29 Phoenix 80000 405000 87500.000000 81000.000000
In [ ]: # Applying a custom function for styling
def color_low_salary_red(val):
    color = 'red' if val < 75000 else 'black'
    return 'color: {}'.format(color)
df.style.applymap(color_low_salary_red, subset=['Salary']) # Colors 'Salary' values red if they are less than 75000 (newer pandas renames Styler.applymap to Styler.map)
Out[ ]: Employee Name Age Location Salary Cumulative Salary Rolling Mean Salary Expanding Mean Salary
0 Alice 24 New York 75000 75000 nan nan
1 Bob 27 Los Angeles 85000 160000 80000.000000 80000.000000
2 Charlie 22 Chicago 70000 230000 77500.000000 76666.666667
3 David 32 Houston 95000 325000 82500.000000 81250.000000
4 Edward 29 Phoenix 80000 405000 87500.000000 81000.000000