keyboard_arrow_down Importing Pandas
import pandas as pd
import numpy as np
keyboard_arrow_down Data Structures
keyboard_arrow_down Series
A one-dimensional array-like object.
# Creating a Series from a list
s = pd.Series([1, 3, 5, np.nan, 6, 8])
print(s)
0 1.0
1 3.0
2 5.0
3 NaN
4 6.0
5 8.0
dtype: float64
# Creating a Series with a custom index
s = pd.Series([1, 2, 3], index=['a', 'b', 'c'])
print(s)
a 1
b 2
c 3
dtype: int64
print(s.iloc[1])
s.loc['b']
NaN vs None
keyboard_arrow_down DataFrame
A two-dimensional, size-mutable, potentially heterogeneous tabular data structure.
# Creating a DataFrame from a dictionary
import pandas as pd
data = {
'A': [1, 2, 3, 4],
'B': [5, 6, 7, 8],
'C': [9, 10, 11, 12]
}
df = pd.DataFrame(data)
print(df)
A B C
0 1 5 9
1 2 6 10
2 3 7 11
3 4 8 12
keyboard_arrow_down Basic Operations
Viewing Data
# Display the first few rows
print(df.head(3))
A B C
0 1 5 9
1 2 6 10
2 3 7 11
# Display the last few rows
print(df.tail(1))
A B C
3 4 8 12
# Display information about the DataFrame
print(df.info())
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4 entries, 0 to 3
Data columns (total 3 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 A 4 non-null int64
1 B 4 non-null int64
2 C 4 non-null int64
dtypes: int64(3)
memory usage: 224.0 bytes
None
# Display summary statistics
print(df.describe())
A B C
count 4.000000 4.000000 4.000000
mean 2.500000 6.500000 10.500000
std 1.290994 1.290994 1.290994
min 1.000000 5.000000 9.000000
25% 1.750000 5.750000 9.750000
50% 2.500000 6.500000 10.500000
75% 3.250000 7.250000 11.250000
max 4.000000 8.000000 12.000000
# Display the DataFrame's index
print(df.index)
RangeIndex(start=0, stop=4, step=1)
# Display the DataFrame's columns
print(df.columns)
Index(['A', 'B', 'C'], dtype='object')
# Display the DataFrame's values
print(df.values)
[[ 1 5 9]
[ 2 6 10]
[ 3 7 11]
[ 4 8 12]]
keyboard_arrow_down Selecting Data
# Selecting a single column
print(df['A'])
0 1
1 2
2 3
3 4
Name: A, dtype: int64
# Selecting multiple columns
print(df[['A', 'B']])
A B
0 1 5
1 2 6
2 3 7
3 4 8
df
A B C
0 1 5 9
1 2 6 10
2 3 7 11
3 4 8 12
Next steps: Generate code with df
toggle_off View recommended plots New interactive sheet
# Selecting rows by index
print(df.loc[1]) # By label
print()
print(df.iloc[0]) # By position
A 2
B 6
C 10
Name: 1, dtype: int64
A 1
B 5
C 9
Name: 0, dtype: int64
print(df)
A B C
0 1 5 9
1 2 6 10
2 3 7 11
3 4 8 12
# Selecting a subset of rows and columns
print(df.loc[1:2, ['A', 'C']])
print()
print(df.iloc[0:2, 0:2])
A C
1 2 10
2 3 11
A B
0 1 5
1 2 6
import pandas as pd
data = {
'A': [1, 2, 3, 4],
'B': [5, 6, 7, 8],
'C': [9, 10, 11, 12]
}
df = pd.DataFrame(data)
print(df)
A B C
0 1 5 9
1 2 6 10
2 3 7 11
3 4 8 12
a = df['A']>2
print(df[a])
A B C
2 3 7 11
3 4 8 12
# Boolean indexing
print(df['B'][df['A'] > 2])
2 7
3 8
Name: B, dtype: int64
print(df['A'] > 2)
0 False
1 False
2 True
3 True
Name: A, dtype: bool
keyboard_arrow_down Setting Values
# Setting a new column
df['D'] = df['A'] + df['B']
print(df)
A B C D
0 1 5 9 6
1 2 6 10 8
2 3 7 11 10
3 4 8 12 12
df['D1'] = [54, 65, 8, 56]
print(df)
A B C D D1
0 1 5 9 6 54
1 2 6 10 8 65
2 3 7 11 10 8
3 4 8 12 12 56
# Setting values by label
df.at[0, 'A'] = 10
print(df)
A B C D D1
0 10 5 9 6 54
1 2 6 10 8 65
2 3 7 11 10 8
3 4 8 12 12 56
# Setting values by position
df.iat[0, 1] = 20
print(df)
A B C D D1
0 10 20 9 6 54
1 2 6 10 8 65
2 3 7 11 10 8
3 4 8 12 12 56
# Setting values by boolean indexing
df.loc[df['A'] > 2, 'B'] = 99
print(df)
A B C D D1
0 10 99 9 6 54
1 2 6 10 8 65
2 3 99 11 10 8
3 4 99 12 12 56
keyboard_arrow_down Data Manipulation
Handling Missing Data
# Replacing missing values
df_with_nan = df.copy()
df_with_nan
A B C D D1
0 10 99 9 6 54
1 2 6 10 8 65
2 3 99 11 10 8
3 4 99 12 12 56
Next steps: Generate code with df_with_nan
toggle_off View recommended plots New interactive sheet
df_with_nan.iloc[0, 1] = np.nan
df_with_nan
A B C D D1
0 10 NaN 9 6 54
1 2 6.0 10 8 65
2 3 99.0 11 10 8
3 4 99.0 12 12 56
Next steps: Generate code with df_with_nan
toggle_off View recommended plots New interactive sheet
print(df_with_nan.fillna(0)) # Replace with 0
A B C D D1
0 10 0.0 9 6 54
1 2 6.0 10 8 65
2 3 99.0 11 10 8
3 4 99.0 12 12 56
df_with_nan
A B C D D1
0 10 NaN 9 6 54
1 2 6.0 10 8 65
2 3 99.0 11 10 8
3 4 99.0 12 12 56
Next steps: Generate code with df_with_nan
toggle_off View recommended plots New interactive sheet
print(df_with_nan.dropna()) # Drop rows with missing values
A B C D D1
1 2 6.0 10 8 65
2 3 99.0 11 10 8
3 4 99.0 12 12 56
print(df_with_nan.isna())
A B C D D1
0 False True False False False
1 False False False False False
2 False False False False False
3 False False False False False
print(df_with_nan[df_with_nan['B'].isna()]) # Check for missing values
A B C D D1
0 10 NaN 9 6 54
Handling Duplicates
# Creating a DataFrame with duplicates
df_dup = pd.DataFrame({
'A': ['foo', 'bar', 'foo', 'bar', 'foo', 'bar'],
'B': [1, 1, 2, 2, 1, 1]
})
print(df_dup)
print()
# Dropping duplicates
df_no_dup = df_dup.drop_duplicates()
print("DataFrame without duplicates:\n", df_no_dup)
A B
0 foo 1
1 bar 1
2 foo 2
3 bar 2
4 foo 1
5 bar 1
DataFrame without duplicates:
A B
0 foo 1
1 bar 1
2 foo 2
3 bar 2
keyboard_arrow_down Operations
df
A B C D D1
0 10 99 9 6 54
1 2 6 10 8 65
2 3 99 11 10 8
3 4 99 12 12 56
Next steps: Generate code with df
toggle_off View recommended plots New interactive sheet
# Performing a column-wise operation
print(df.mean())
A 4.75
B 75.75
C 10.50
D 9.00
D1 45.75
dtype: float64
# Performing a row-wise operation
print(df.mean(axis=1))
0 35.6
1 18.2
2 26.2
3 36.6
dtype: float64
df
A B C D D1
0 10 99 9 6 54
1 2 6 10 8 65
2 3 99 11 10 8
3 4 99 12 12 56
Next steps: Generate code with df
toggle_off View recommended plots New interactive sheet
import numpy as np
import pandas as pd
def value_difference(my_list):
my_array = np.array(my_list)
return np.max(my_array)-np.min(my_array)
# Applying functions to columns/rows
print(df)
print()
print(df.apply(np.cumsum))
print()
print(df.apply(value_difference,axis=1))
print()
print(df.apply(value_difference,axis=0))
A B C D D1
0 10 99 9 6 54
1 2 6 10 8 65
2 3 99 11 10 8
3 4 99 12 12 56
A B C D D1
0 10 99 9 6 54
1 12 105 19 14 119
2 15 204 30 24 127
3 19 303 42 36 183
0 93
1 63
2 96
3 95
dtype: int64
A 8
B 93
C 3
D 6
D1 57
dtype: int64
keyboard_arrow_down String Operations
# Creating a DataFrame with string data
df_str = pd.DataFrame({'A': ['foo', 'bar', 'baz'], 'B': ['one', 'two', 'three']})
df_str
A B
0 foo one
1 bar two
2 baz three
Next steps: Generate code with df_str
toggle_off View recommended plots New interactive sheet
# Applying string methods
print(df_str['A'].str.upper())
print()
print(df_str['B'].str.len())
print()
print(df_str['A'].str.contains('o'))
0 FOO
1 BAR
2 BAZ
Name: A, dtype: object
0 3
1 3
2 5
Name: B, dtype: int64
0 True
1 False
2 False
Name: A, dtype: bool
keyboard_arrow_down Merging and Joining
# Creating DataFrames to merge
df1 = pd.DataFrame({'key': ['A', 'B', 'C'], 'value': [1, 2, 3]})
df2 = pd.DataFrame({'key': ['B', 'C', 'D'], 'value': [4, 5, 6]})
print(df1)
print()
print(df2)
key value
0 A 1
1 B 2
2 C 3
key value
0 B 4
1 C 5
2 D 6
# Merging DataFrames
print(pd.merge(df1, df2, on='key')) # Inner join
print()
print(pd.merge(df1, df2, on='key', how='left')) # Left join
print()
print(pd.merge(df1, df2, on='key', how='outer')) # Outer join
key value_x value_y
0 B 2 4
1 C 3 5
key value_x value_y
0 A 1 NaN
1 B 2 4.0
2 C 3 5.0
key value_x value_y
0 A 1.0 NaN
1 B 2.0 4.0
2 C 3.0 5.0
3 D NaN 6.0
# Concatenating DataFrames
print(pd.concat([df1, df2], axis=0)) # Concatenate rows
print()
print(pd.concat([df1, df2], axis=1)) # Concatenate columns
key value
0 A 1
1 B 2
2 C 3
0 B 4
1 C 5
2 D 6
key value key value
0 A 1 B 4
1 B 2 C 5
2 C 3 D 6
keyboard_arrow_down Grouping and Aggregating
# Creating a DataFrame to group
df_group = pd.DataFrame({
'key': ['A', 'B', 'A', 'B'],
'data': range(4)
})
df_group
key data
0 A 0
1 B 1
2 A 2
3 B 3
Next steps: Generate code with df_group
toggle_off View recommended plots New interactive sheet
# Grouping and applying aggregate functions
grouped = df_group.groupby('key').agg({'data': 'mean'})
grouped
data
key
A 1.0
B 2.0
Next steps: Generate code with grouped
toggle_off View recommended plots New interactive sheet
# Grouping and applying aggregate functions
grouped = df_group.groupby('key').agg({'data': ['mean', 'sum','size']})
grouped
data
mean sum size
key
A 1.0 2 2
B 2.0 4 2
Next steps: Generate code with grouped
toggle_off View recommended plots New interactive sheet
keyboard_arrow_down Advanced Operations
Pivot Tables
# Creating a DataFrame for pivot table
df_pivot = pd.DataFrame({
'A': ['foo', 'foo', 'foo', 'bar', 'bar', 'bar'],
'B': ['one', 'one', 'two', 'two', 'one', 'one'],
'C': ['small', 'large', 'large', 'small', 'small', 'large'],
'D': [1, 2, 2, 3, 3, 4]
})
df_pivot
A B C D
0 foo one small 1
1 foo one large 2
2 foo two large 2
3 bar two small 3
4 bar one small 3
5 bar one large 4
Next steps: Generate code with df_pivot
toggle_off View recommended plots New interactive sheet
# Creating a pivot table
pivot = df_pivot.pivot_table(values='D', index=['A', 'B'], columns=['C'])
print(pivot)
C large small
A B
bar one 4.0 3.0
two NaN 3.0
foo one 2.0 1.0
two 2.0 NaN
keyboard_arrow_down Plotting
import matplotlib.pyplot as plt
# Creating a DataFrame to plot
df_plot = pd.DataFrame({
'A': np.cumsum(np.random.randn(1000)),
'B': np.cumsum(np.random.randn(1000)),
'C': np.cumsum(np.random.randn(1000))
}, index=pd.date_range('2024-01-01', periods=1000))
# Plotting the DataFrame
df_plot.plot()
plt.show()
url = 'https://gist.githubusercontent.com/DiogoRibeiro7/c6590d0cf119e87c39e31c21a9c0f3a8/raw/4a8e3da267a0c1f0d650901d8295a5153bde8b21/PlayTennis.csv'
df = pd.read_csv(url)
df
Outlook Temperature Humidity Wind Play Tennis
0 Sunny Hot High Weak No
1 Sunny Hot High Strong No
2 Overcast Hot High Weak Yes
3 Rain Mild High Weak Yes
4 Rain Cool Normal Weak Yes
5 Rain Cool Normal Strong No
6 Overcast Cool Normal Strong Yes
7 Sunny Mild High Weak No
8 Sunny Cool Normal Weak Yes
9 Rain Mild Normal Weak Yes
10 Sunny Mild Normal Strong Yes
11 Overcast Mild High Strong Yes
12 Overcast Hot Normal Weak Yes
13 Rain Mild High Strong No
Next steps: Generate code with df
toggle_off View recommended plots New interactive sheet
Advanced Data Manipulation
keyboard_arrow_down Creating DataFrames
import pandas as pd
data = {
'Name': ['Alice', 'Bob', 'Charlie', 'David'],
'Age': [24, 27, 22, 32],
'City': ['New York', 'Los Angeles', 'Chicago', 'Houston']
}
df = pd.DataFrame(data)
print("DataFrame from dictionary:\n", df)
DataFrame from dictionary:
Name Age City
0 Alice 24 New York
1 Bob 27 Los Angeles
2 Charlie 22 Chicago
3 David 32 Houston
From a CSV file
keyboard_arrow_down Basic DataFrame Operations
# Display the first few rows
print("First few rows:\n", df.head())
# Display the last few rows
print("Last few rows:\n", df.tail())
# Display information about the DataFrame
print("Info:\n", df.info())
# Display summary statistics
print("Summary statistics:\n", df.describe())
First few rows:
Name Age City
0 Alice 24 New York
1 Bob 27 Los Angeles
2 Charlie 22 Chicago
3 David 32 Houston
Last few rows:
Name Age City
0 Alice 24 New York
1 Bob 27 Los Angeles
2 Charlie 22 Chicago
3 David 32 Houston
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4 entries, 0 to 3
Data columns (total 3 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 Name 4 non-null object
1 Age 4 non-null int64
2 City 4 non-null object
dtypes: int64(1), object(2)
memory usage: 224.0+ bytes
Info:
None
Summary statistics:
Age
count 4.000000
mean 26.250000
std 4.349329
min 22.000000
25% 23.500000
50% 25.500000
75% 28.250000
max 32.000000
keyboard_arrow_down Selecting columns
# Selecting a single column
print("Single column:\n", df['Name'])
# Selecting multiple columns
print("Multiple columns:\n", df[['Name', 'City']])
Single column:
0 Alice
1 Bob
2 Charlie
3 David
Name: Name, dtype: object
Multiple columns:
Name City
0 Alice New York
1 Bob Los Angeles
2 Charlie Chicago
3 David Houston
keyboard_arrow_down Selecting rows
# Selecting a single row by index
print("Single row:\n", df.iloc[1])
# Selecting multiple rows by index
print("Multiple rows:\n", df.iloc[1:3])
# Selecting rows by conditions
print("Rows with age > 25:\n", df[df['Age'] > 25])
Single row:
Name Bob
Age 27
City Los Angeles
Name: 1, dtype: object
Multiple rows:
Name Age City
1 Bob 27 Los Angeles
2 Charlie 22 Chicago
Rows with age > 25:
Name Age City
1 Bob 27 Los Angeles
3 David 32 Houston
keyboard_arrow_down Data Manipulation
Adding a new column
df['Salary'] = [50000, 60000, 45000, 80000]
print("DataFrame with new column:\n", df)
DataFrame with new column:
Name Age City Salary
0 Alice 24 New York 50000
1 Bob 27 Los Angeles 60000
2 Charlie 22 Chicago 45000
3 David 32 Houston 80000
keyboard_arrow_down Updating values
# Update a single value
df.at[0, 'Age'] = 25
print("Updated DataFrame:\n", df)
# Update multiple values based on a condition
df.loc[df['City'] == 'New York', 'City'] = 'NYC'
print("DataFrame after updating values:\n", df)
Updated DataFrame:
Name Age City Salary
0 Alice 25 New York 50000
1 Bob 27 Los Angeles 60000
2 Charlie 22 Chicago 45000
3 David 32 Houston 80000
DataFrame after updating values:
Name Age City Salary
0 Alice 25 NYC 50000
1 Bob 27 Los Angeles 60000
2 Charlie 22 Chicago 45000
3 David 32 Houston 80000
keyboard_arrow_down Deleting columns and rows
# Delete a column
df = df.drop(columns=['Salary'])
print("DataFrame after deleting a column:\n", df)
# Delete a row
df = df.drop(index=1)
print("DataFrame after deleting a row:\n", df)
DataFrame after deleting a column:
Name Age City
0 Alice 25 NYC
1 Bob 27 Los Angeles
2 Charlie 22 Chicago
3 David 32 Houston
DataFrame after deleting a row:
Name Age City
0 Alice 25 NYC
2 Charlie 22 Chicago
3 David 32 Houston
keyboard_arrow_down Sorting data
# Sort by a single column
df_sorted = df.sort_values(by='Age')
print("Sorted by Age:\n", df_sorted)
# Sort by multiple columns
df_sorted = df.sort_values(by=['City', 'Age'])
print("Sorted by City and Age:\n", df_sorted)
Sorted by Age:
Name Age City
2 Charlie 22 Chicago
0 Alice 25 NYC
3 David 32 Houston
Sorted by City and Age:
Name Age City
2 Charlie 22 Chicago
3 David 32 Houston
0 Alice 25 NYC
keyboard_arrow_down Basic Statistical Operations
# Calculate mean
mean_age = df['Age'].mean()
print("Mean Age:", mean_age)
# Calculate sum
sum_age = df['Age'].sum()
print("Sum of Ages:", sum_age)
# Calculate median
median_age = df['Age'].median()
print("Median Age:", median_age)
# Calculate standard deviation
std_age = df['Age'].std()
print("Standard Deviation of Age:", std_age)
Mean Age: 26.333333333333332
Sum of Ages: 79
Median Age: 25.0
Standard Deviation of Age: 5.131601439446884
keyboard_arrow_down Putting It All Together
import pandas as pd
# Creating a DataFrame from a dictionary
data = {
'Name': ['Alice', 'Bob', 'Charlie', 'David'],
'Age': [24, 27, 22, 32],
'City': ['New York', 'Los Angeles', 'Chicago', 'Houston']
}
df = pd.DataFrame(data)
print("DataFrame from dictionary:\n", df)
# Viewing data
print("First few rows:\n", df.head())
print("Summary statistics:\n", df.describe())
# Selecting columns and rows
print("Single column:\n", df['Name'])
print("Rows with age > 25:\n", df[df['Age'] > 25])
# Adding a new column
df['Salary'] = [50000, 60000, 45000, 80000]
print("DataFrame with new column:\n", df)
# Updating values
df.at[0, 'Age'] = 25
df.loc[df['City'] == 'New York', 'City'] = 'NYC'
print("Updated DataFrame:\n", df)
# Deleting columns and rows
df = df.drop(columns=['Salary'])
df = df.drop(index=1)
print("DataFrame after deleting a column and a row:\n", df)
# Sorting data
df_sorted = df.sort_values(by='Age')
print("Sorted by Age:\n", df_sorted)
# Basic statistical operations
mean_age = df['Age'].mean()
sum_age = df['Age'].sum()
median_age = df['Age'].median()
std_age = df['Age'].std()
print("Mean Age:", mean_age)
print("Sum of Ages:", sum_age)
print("Median Age:", median_age)
print("Standard Deviation of Age:", std_age)
DataFrame from dictionary:
Name Age City
0 Alice 24 New York
1 Bob 27 Los Angeles
2 Charlie 22 Chicago
3 David 32 Houston
First few rows:
Name Age City
0 Alice 24 New York
1 Bob 27 Los Angeles
2 Charlie 22 Chicago
3 David 32 Houston
Summary statistics:
Age
count 4.000000
mean 26.250000
std 4.349329
min 22.000000
25% 23.500000
50% 25.500000
75% 28.250000
max 32.000000
Single column:
0 Alice
1 Bob
2 Charlie
3 David
Name: Name, dtype: object
Rows with age > 25:
Name Age City
1 Bob 27 Los Angeles
3 David 32 Houston
DataFrame with new column:
Name Age City Salary
0 Alice 24 New York 50000
1 Bob 27 Los Angeles 60000
2 Charlie 22 Chicago 45000
3 David 32 Houston 80000
Updated DataFrame:
Name Age City Salary
0 Alice 25 NYC 50000
1 Bob 27 Los Angeles 60000
2 Charlie 22 Chicago 45000
3 David 32 Houston 80000
DataFrame after deleting a column and a row:
Name Age City
0 Alice 25 NYC
2 Charlie 22 Chicago
3 David 32 Houston
Sorted by Age:
Name Age City
2 Charlie 22 Chicago
0 Alice 25 NYC
3 David 32 Houston
Mean Age: 26.333333333333332
Sum of Ages: 79
Median Age: 25.0
Standard Deviation of Age: 5.131601439446884
keyboard_arrow_down Visualization with Pandas and Seaborn
import matplotlib.pyplot as plt
import seaborn as sns
# Creating a DataFrame for visualization
df_viz = pd.DataFrame({
'A': np.random.randn(100),
'B': np.random.randn(100),
'C': np.random.randn(100)
})
# Plotting with Pandas
df_viz.plot(kind='scatter', x='A', y='B')
plt.show()
# Plotting with Seaborn
sns.pairplot(df_viz)
plt.show()
keyboard_arrow_down Reading Different Files
pip install openpyxl # installing module to read excel files
Requirement already satisfied: openpyxl in /usr/local/lib/python3.10/dist-packages (3.1.5)
Requirement already satisfied: et-xmlfile in /usr/local/lib/python3.10/dist-packages (from openpyxl) (2.0.0)
From CSV
file_path = r"/content/uber-raw-data-apr14.csv"
df = pd.read_csv(file_path)
print(df.head())
Date/Time Lat Lon Base
0 4/1/2014 0:11:00 40.7690 -73.9549 B02512
1 4/1/2014 0:17:00 40.7267 -74.0345 B02512
2 4/1/2014 0:21:00 40.7316 -73.9873 B02512
3 4/1/2014 0:28:00 40.7588 -73.9776 B02512
4 4/1/2014 0:33:00 40.7594 -73.9722 B02512
From Excel
# excel_file_path = r"C:\Users\Nileaysh\Downloads\BCADS17.xlsx"
file_path = r"/content/200c2e152dbe4ac7b301e8ae7a6762ce.xlsx"
df = pd.read_excel(file_path)
print(df.head())
table column Unnamed: 2 Unnamed: 3 Unnamed: 4 Unnamed: 5 \
0 movie id NaN NaN NaN NaN
1 movie title NaN NaN NaN NaN
2 movie year NaN NaN NaN NaN
3 movie date_published NaN NaN NaN NaN
4 movie duration NaN NaN NaN NaN
Unnamed: 6 Unnamed: 7 Unnamed: 8 Unnamed: 9 Unnamed: 10 Unnamed: 11 \
0 NaN NaN NaN genre NaN NaN
1 NaN NaN NaN * movie_id NaN NaN
2 NaN NaN NaN * genre NaN NaN
3 NaN NaN NaN NaN NaN NaN
4 NaN NaN NaN NaN NaN NaN
Unnamed: 12
0 ratings
1 * movie_id
2 avg_rating
3 total_votes
4 median_rating