📘 Data Analysis & Visualization in Python - Exam Cheat Sheet
1. Importing Required Libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
2. NumPy Arrays
a1 = np.zeros((2, 3)) # 2x3 zero array
a2 = [[3, 4, 5], [7, 8, 9]]
np.add(a1, a2) # Matrix addition
np.append(a1, a2, axis=0) # Append rows (returns a new array; a1 is unchanged)
np.shape(a1) # Get shape
3. Pandas DataFrames
df = pd.DataFrame({'Name': ['Amit', 'Neha', 'Amit'], 'Age': [23, 30,
25]})
df.head() # First 5 rows
df.describe() # Stats summary
df.info() # Structure
df.columns # Column names
df['Age'].mean() # Average age
df['Name'].nunique() # Number of unique names
df.groupby('Name')['Age'].mean() # Group avg
4. Merging and Joining
df1 = pd.DataFrame({'RollNo': [1, 2], 'Name': ['Ravi', 'Megha']})
df2 = pd.DataFrame({'RollNo': [2, 3], 'Name': ['Megha', 'Karan']})
pd.merge(df1, df2, on='Name', how='inner') # Merge on Name (other shared columns get _x/_y suffixes)
pd.merge(df1, df2, on=['RollNo', 'Name'], how='inner') # Exact match
5. Handling Missing Data
df.dropna(thresh=2) # Keep rows with >=2 non-NA
df.fillna(method='ffill', limit=2) # Forward fill, at most 2 consecutive NAs (method= is deprecated since pandas 2.1; prefer df.ffill(limit=2))
6. Plotting and Visualization
plt.plot(days, rainfall, 'ro-') # Line Plot
plt.scatter(df['Salary'], df['Age']) # Scatter Plot
df['column'].value_counts().plot(kind='bar') # Bar Plot
sns.boxplot(data=df, y='sales') # Box Plot
sns.heatmap(df.corr(), annot=True) # Heatmap (pandas >= 2.0: use df.corr(numeric_only=True) if df has non-numeric columns)
7. MultiIndex and Swap Level
df.index.names = ['key1', 'key2']
df = df.swaplevel('key1', 'key2')
df = df.sort_index(level=0)
8. Binning Data
ages = [21, 25, 33, 45, 62]
pd.cut(ages, bins=[18, 25, 35, 60, 100], labels=['Youth', 'YoungAdult',
'MiddleAged', 'Senior'])
pd.qcut(ages, 4) # Quantile bins: 4 bins with (roughly) equal counts
9. File Handling
pd.read_csv("data.csv") # Read CSV
pd.read_excel("data.xlsx") # Read Excel
df.to_csv("out.csv", index=False) # Save CSV
df.to_excel("out.xlsx", index=False) # Save Excel
pd.read_excel("data.xlsx", index_col="Employee ID") # Use 'Employee ID' column as the index
10. Subplots and Save Plot
fig, axs = plt.subplots(1, 2)
axs[0].scatter(df['Salary'], df['Age'])
axs[1].bar(df['Role'].value_counts().index, df['Role'].value_counts())
plt.savefig("plot.png")
11. Correlation and Covariance
df[['Hours', 'Marks']].corr() # Correlation
df[['Hours', 'Marks']].cov() # Covariance
12. Remove Duplicate Rows
df.drop_duplicates(['col1', 'col2'], keep='last') # Drop rows duplicated on col1+col2, keeping the last occurrence
13. Series Rank & Comparison
s1 = pd.Series([5, 0, -4, 8])
s1.rank() # Rank values
df2 > df1['B'].min() # Element-wise comparison
14. Experience: Practice Scenario
df = pd.read_csv("employee.csv")
df.groupby('Role')['Salary'].sum() # Role-wise total salary
df[df['Gender'] == 'Female'].shape[0] # Female count
df[df['Salary'] >= df['Salary'].mean()] # Filter by average salary