In [3]: import pandas as pd
import numpy as np
# Set seed for reproducibility
np.random.seed(0)
# Create a DataFrame with 3 columns and 50 rows of random numeric data
data = np.random.rand(50, 3)
df = pd.DataFrame(data, columns=['A', 'B', 'C'])
# Replace 10% of the data with nulls: randomly pick 10% of the row labels and set those entire rows to NaN
null_indices = np.random.choice(df.index, size=int(0.1 * len(df)), replace=False)
df.loc[null_indices] = np.nan  # .loc selects by label; here labels equal positions (default RangeIndex)
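Note: the cell above blanks out whole rows. If the intent were instead to null 10% of individual cells, a minimal sketch (illustrative only, not executed in this run) could draw random flat positions:

# Sketch (assumption): set ~10% of individual cells to NaN rather than whole rows
n_cells = int(0.1 * df.size)                       # total number of cells to blank out
flat_pos = np.random.choice(df.size, size=n_cells, replace=False)
rows, cols = np.unravel_index(flat_pos, df.shape)  # flat positions -> (row, col) pairs
for r, c in zip(rows, cols):
    df.iat[r, c] = np.nan                          # .iat writes a single cell by position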
In [4]: # a. Identify and count missing values in a DataFrame
missing_values_count = df.isnull().sum()
print("Missing Values Count:")
print(missing_values_count)
Missing Values Count:
A 5
B 5
C 5
dtype: int64
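Note: a couple of related counts, shown here only as a sketch (not part of the executed cells):

# Sketch: total missing cells and a per-row breakdown
total_missing = df.isnull().sum().sum()      # single scalar: all NaN cells in the frame
missing_per_row = df.isnull().sum(axis=1)    # Series: NaN count for each row label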
In [5]: # b. Drop the column having more than 5 null values
# thresh is the minimum number of non-null values a column must have to be kept,
# so columns with more than 5 NaNs are dropped
df = df.dropna(thresh=len(df) - 5, axis=1)
print("\nDataFrame after dropping columns with more than 5 null values:")
print(df)
DataFrame after dropping columns with more than 5 null values:
A B C
0 0.548814 0.715189 0.602763
1 0.544883 0.423655 0.645894
2 0.437587 0.891773 0.963663
3 0.383442 0.791725 0.528895
4 0.568045 0.925597 0.071036
5 0.087129 0.020218 0.832620
6 NaN NaN NaN
7 0.799159 0.461479 0.780529
8 0.118274 0.639921 0.143353
9 0.944669 0.521848 0.414662
10 0.264556 0.774234 0.456150
11 0.568434 0.018790 0.617635
12 0.612096 0.616934 0.943748
13 0.681820 0.359508 0.437032
14 NaN NaN NaN
15 0.670638 0.210383 0.128926
16 NaN NaN NaN
17 0.438602 0.988374 0.102045
18 0.208877 0.161310 0.653108
19 0.253292 0.466311 0.244426
20 0.158970 0.110375 0.656330
21 0.138183 0.196582 0.368725
22 0.820993 0.097101 0.837945
23 0.096098 0.976459 0.468651
24 0.976761 0.604846 0.739264
25 0.039188 0.282807 0.120197
26 0.296140 0.118728 0.317983
27 0.414263 0.064147 0.692472
28 0.566601 0.265389 0.523248
29 0.093941 0.575946 0.929296
30 0.318569 0.667410 0.131798
31 0.716327 0.289406 0.183191
32 0.586513 0.020108 0.828940
33 0.004695 0.677817 0.270008
34 0.735194 0.962189 0.248753
35 0.576157 0.592042 0.572252
36 0.223082 0.952749 0.447125
37 0.846409 0.699479 0.297437
38 0.813798 0.396506 0.881103
39 0.581273 0.881735 0.692532
40 0.725254 0.501324 0.956084
41 0.643990 0.423855 0.606393
42 0.019193 0.301575 0.660174
43 NaN NaN NaN
44 0.135474 0.298282 0.569965
45 0.590873 0.574325 0.653201
46 0.652103 0.431418 0.896547
47 0.367562 0.435865 0.891923
48 0.806194 0.703889 0.100227
49 NaN NaN NaN
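Note: an equivalent and arguably more explicit formulation keeps only the columns whose null count stays at or below the threshold; a sketch under the same assumptions:

# Sketch: keep columns with at most 5 NaNs via a boolean column mask
max_nulls = 5
df_kept = df.loc[:, df.isnull().sum() <= max_nulls]  # same columns as the thresh-based dropna above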
In [7]: # c. Identify the row label having the maximum sum of all values in a row and drop that row
max_sum_row_label = df.sum(axis=1).idxmax()
df = df.drop(index=max_sum_row_label)
print("\nDataFrame after dropping row with maximum sum of values:")
print(df)
DataFrame after dropping row with maximum sum of values:
A B C
0 0.548814 0.715189 0.602763
1 0.544883 0.423655 0.645894
3 0.383442 0.791725 0.528895
4 0.568045 0.925597 0.071036
5 0.087129 0.020218 0.832620
6 NaN NaN NaN
7 0.799159 0.461479 0.780529
8 0.118274 0.639921 0.143353
9 0.944669 0.521848 0.414662
10 0.264556 0.774234 0.456150
11 0.568434 0.018790 0.617635
12 0.612096 0.616934 0.943748
13 0.681820 0.359508 0.437032
14 NaN NaN NaN
15 0.670638 0.210383 0.128926
16 NaN NaN NaN
17 0.438602 0.988374 0.102045
18 0.208877 0.161310 0.653108
19 0.253292 0.466311 0.244426
20 0.158970 0.110375 0.656330
21 0.138183 0.196582 0.368725
22 0.820993 0.097101 0.837945
23 0.096098 0.976459 0.468651
25 0.039188 0.282807 0.120197
26 0.296140 0.118728 0.317983
27 0.414263 0.064147 0.692472
28 0.566601 0.265389 0.523248
29 0.093941 0.575946 0.929296
30 0.318569 0.667410 0.131798
31 0.716327 0.289406 0.183191
32 0.586513 0.020108 0.828940
33 0.004695 0.677817 0.270008
34 0.735194 0.962189 0.248753
35 0.576157 0.592042 0.572252
36 0.223082 0.952749 0.447125
37 0.846409 0.699479 0.297437
38 0.813798 0.396506 0.881103
39 0.581273 0.881735 0.692532
40 0.725254 0.501324 0.956084
41 0.643990 0.423855 0.606393
42 0.019193 0.301575 0.660174
43 NaN NaN NaN
44 0.135474 0.298282 0.569965
45 0.590873 0.574325 0.653201
46 0.652103 0.431418 0.896547
47 0.367562 0.435865 0.891923
48 0.806194 0.703889 0.100227
49 NaN NaN NaN
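Note: df.sum(axis=1) skips NaN by default, so the all-NaN rows sum to 0.0 and can never win the maximum here (all values are non-negative). If such rows should be excluded from the comparison outright, a hedged variant (sketch only):

# Sketch: all-NaN rows produce NaN sums, which idxmax then ignores
row_sums = df.sum(axis=1, min_count=1)   # min_count=1: require at least one non-NaN value
max_sum_row_label = row_sums.idxmax()    # idxmax skips NaN sums by default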
In [8]: # d. Sort the DataFrame on the basis of the first column
df_sorted = df.sort_values(by='A')
print("\nDataFrame sorted on the basis of the first column:")
print(df_sorted)
DataFrame sorted on the basis of the first column:
A B C
33 0.004695 0.677817 0.270008
42 0.019193 0.301575 0.660174
25 0.039188 0.282807 0.120197
5 0.087129 0.020218 0.832620
29 0.093941 0.575946 0.929296
23 0.096098 0.976459 0.468651
8 0.118274 0.639921 0.143353
44 0.135474 0.298282 0.569965
21 0.138183 0.196582 0.368725
20 0.158970 0.110375 0.656330
18 0.208877 0.161310 0.653108
36 0.223082 0.952749 0.447125
19 0.253292 0.466311 0.244426
10 0.264556 0.774234 0.456150
26 0.296140 0.118728 0.317983
30 0.318569 0.667410 0.131798
47 0.367562 0.435865 0.891923
3 0.383442 0.791725 0.528895
27 0.414263 0.064147 0.692472
17 0.438602 0.988374 0.102045
1 0.544883 0.423655 0.645894
0 0.548814 0.715189 0.602763
28 0.566601 0.265389 0.523248
4 0.568045 0.925597 0.071036
11 0.568434 0.018790 0.617635
35 0.576157 0.592042 0.572252
39 0.581273 0.881735 0.692532
32 0.586513 0.020108 0.828940
45 0.590873 0.574325 0.653201
12 0.612096 0.616934 0.943748
41 0.643990 0.423855 0.606393
46 0.652103 0.431418 0.896547
15 0.670638 0.210383 0.128926
13 0.681820 0.359508 0.437032
31 0.716327 0.289406 0.183191
40 0.725254 0.501324 0.956084
34 0.735194 0.962189 0.248753
7 0.799159 0.461479 0.780529
48 0.806194 0.703889 0.100227
38 0.813798 0.396506 0.881103
22 0.820993 0.097101 0.837945
37 0.846409 0.699479 0.297437
9 0.944669 0.521848 0.414662
6 NaN NaN NaN
14 NaN NaN NaN
16 NaN NaN NaN
43 NaN NaN NaN
49 NaN NaN NaN
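Note: sort_values places NaN rows last by default; a quick sketch if the opposite placement or a descending sort were wanted:

# Sketch: descending sort on 'A' with NaN rows listed first
df_sorted_desc = df.sort_values(by='A', ascending=False, na_position='first')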
In [9]: # e. Remove all duplicates from the first column
df_unique = df.drop_duplicates(subset='A')
print("\nDataFrame after removing duplicates from the first column:")
print(df_unique)
DataFrame after removing duplicates from the first column:
A B C
0 0.548814 0.715189 0.602763
1 0.544883 0.423655 0.645894
3 0.383442 0.791725 0.528895
4 0.568045 0.925597 0.071036
5 0.087129 0.020218 0.832620
6 NaN NaN NaN
7 0.799159 0.461479 0.780529
8 0.118274 0.639921 0.143353
9 0.944669 0.521848 0.414662
10 0.264556 0.774234 0.456150
11 0.568434 0.018790 0.617635
12 0.612096 0.616934 0.943748
13 0.681820 0.359508 0.437032
15 0.670638 0.210383 0.128926
17 0.438602 0.988374 0.102045
18 0.208877 0.161310 0.653108
19 0.253292 0.466311 0.244426
20 0.158970 0.110375 0.656330
21 0.138183 0.196582 0.368725
22 0.820993 0.097101 0.837945
23 0.096098 0.976459 0.468651
25 0.039188 0.282807 0.120197
26 0.296140 0.118728 0.317983
27 0.414263 0.064147 0.692472
28 0.566601 0.265389 0.523248
29 0.093941 0.575946 0.929296
30 0.318569 0.667410 0.131798
31 0.716327 0.289406 0.183191
32 0.586513 0.020108 0.828940
33 0.004695 0.677817 0.270008
34 0.735194 0.962189 0.248753
35 0.576157 0.592042 0.572252
36 0.223082 0.952749 0.447125
37 0.846409 0.699479 0.297437
38 0.813798 0.396506 0.881103
39 0.581273 0.881735 0.692532
40 0.725254 0.501324 0.956084
41 0.643990 0.423855 0.606393
42 0.019193 0.301575 0.660174
44 0.135474 0.298282 0.569965
45 0.590873 0.574325 0.653201
46 0.652103 0.431418 0.896547
47 0.367562 0.435865 0.891923
48 0.806194 0.703889 0.100227
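Note: drop_duplicates treats NaN values in 'A' as equal to one another, which is why only the first all-NaN row (label 6) survives above. A sketch for deduplicating while keeping every NaN row, should that ever be preferred:

# Sketch: deduplicate on 'A' but retain all rows whose 'A' is NaN
mask = df['A'].isna() | ~df.duplicated(subset='A')
df_unique_keep_nan = df[mask]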
In [10]: # f. Find the correlation between the first and second column and the covariance between the second and third column
correlation_AB = df['A'].corr(df['B'])
covariance_BC = df['B'].cov(df['C'])
print("\nCorrelation between the first and second column:",correlation_AB)
print("Covariance between the second and third column:",covariance_BC)
Correlation between the first and second column: 0.05849765987946871
Covariance between the second and third column: -0.025965685609794554
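Note: the full pairwise matrices are also available directly; shown only as a sketch:

# Sketch: pairwise correlation and covariance matrices over all numeric columns
corr_matrix = df.corr()  # Pearson correlation by default; NaNs excluded pairwise
cov_matrix = df.cov()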
In [16]: # g. Discretize the second column and create 5 bins
# pd.qcut splits 'B' into 5 equal-frequency (quantile) bins; labels=False returns integer bin codes 0-4
df['B_bins'] = pd.qcut(df['B'], q=5, labels=False)
print("\nDataFrame with discretized second column:")
print(df)
DataFrame with discretized second column:
A B C B_bins
0 0.548814 0.715189 0.602763 4.0
1 0.544883 0.423655 0.645894 2.0
3 0.383442 0.791725 0.528895 4.0
4 0.568045 0.925597 0.071036 4.0
5 0.087129 0.020218 0.832620 0.0
6 NaN NaN NaN NaN
7 0.799159 0.461479 0.780529 2.0
8 0.118274 0.639921 0.143353 3.0
9 0.944669 0.521848 0.414662 2.0
10 0.264556 0.774234 0.456150 4.0
11 0.568434 0.018790 0.617635 0.0
12 0.612096 0.616934 0.943748 3.0
13 0.681820 0.359508 0.437032 1.0
14 NaN NaN NaN NaN
15 0.670638 0.210383 0.128926 1.0
16 NaN NaN NaN NaN
17 0.438602 0.988374 0.102045 4.0
18 0.208877 0.161310 0.653108 0.0
19 0.253292 0.466311 0.244426 2.0
20 0.158970 0.110375 0.656330 0.0
21 0.138183 0.196582 0.368725 0.0
22 0.820993 0.097101 0.837945 0.0
23 0.096098 0.976459 0.468651 4.0
25 0.039188 0.282807 0.120197 1.0
26 0.296140 0.118728 0.317983 0.0
27 0.414263 0.064147 0.692472 0.0
28 0.566601 0.265389 0.523248 1.0
29 0.093941 0.575946 0.929296 3.0
30 0.318569 0.667410 0.131798 3.0
31 0.716327 0.289406 0.183191 1.0
32 0.586513 0.020108 0.828940 0.0
33 0.004695 0.677817 0.270008 3.0
34 0.735194 0.962189 0.248753 4.0
35 0.576157 0.592042 0.572252 3.0
36 0.223082 0.952749 0.447125 4.0
37 0.846409 0.699479 0.297437 3.0
38 0.813798 0.396506 0.881103 1.0
39 0.581273 0.881735 0.692532 4.0
40 0.725254 0.501324 0.956084 2.0
41 0.643990 0.423855 0.606393 2.0
42 0.019193 0.301575 0.660174 1.0
43 NaN NaN NaN NaN
44 0.135474 0.298282 0.569965 1.0
45 0.590873 0.574325 0.653201 2.0
46 0.652103 0.431418 0.896547 2.0
47 0.367562 0.435865 0.891923 2.0
48 0.806194 0.703889 0.100227 3.0
49 NaN NaN NaN NaN
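Note: qcut yields quantile (equal-frequency) bins. If equal-width bins were wanted instead, a sketch with pd.cut (the string labels below are purely illustrative):

# Sketch: 5 equal-width bins over the range of 'B', with hypothetical labels
df['B_width_bins'] = pd.cut(df['B'], bins=5,
                            labels=['very low', 'low', 'mid', 'high', 'very high'])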
In [17]: print('By- Aaryan Pandey 13591')
By- Aaryan Pandey 13591