In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')
In [3]:
df=pd.read_csv(r'/home/lab108-28/Desktop/ML DATASETS/housing.csv')
In [4]:
df.head()
Out[4]:
longitude latitude housing_median_age total_rooms total_bedrooms population households med
0 -122.23 37.88 41.0 880.0 129.0 322.0 126.0
1 -122.22 37.86 21.0 7099.0 1106.0 2401.0 1138.0
2 -122.24 37.85 52.0 1467.0 190.0 496.0 177.0
3 -122.25 37.85 52.0 1274.0 235.0 558.0 219.0
4 -122.25 37.85 52.0 1627.0 280.0 565.0 259.0
In [5]:
df.shape
Out[5]:
(20640, 10)
In [6]:
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20640 entries, 0 to 20639
Data columns (total 10 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 longitude 20640 non-null float64
1 latitude 20640 non-null float64
2 housing_median_age 20640 non-null float64
3 total_rooms 20640 non-null float64
4 total_bedrooms 20433 non-null float64
5 population 20640 non-null float64
6 households 20640 non-null float64
7 median_income 20640 non-null float64
8 median_house_value 20640 non-null float64
9 ocean_proximity 20640 non-null object
dtypes: float64(9), object(1)
memory usage: 1.6+ MB
In [7]:
df.nunique
Out[7]:
Explore our developer-friendly HTML to PDF API Printed using PDFCrowd HTML to PDF
<bound method DataFrame.nunique of longitude latitude housing_median_age total
_rooms total_bedrooms \
0 -122.23 37.88 41.0 880.0 129.0
1 -122.22 37.86 21.0 7099.0 1106.0
2 -122.24 37.85 52.0 1467.0 190.0
3 -122.25 37.85 52.0 1274.0 235.0
4 -122.25 37.85 52.0 1627.0 280.0
... ... ... ... ... ...
20635 -121.09 39.48 25.0 1665.0 374.0
20636 -121.21 39.49 18.0 697.0 150.0
20637 -121.22 39.43 17.0 2254.0 485.0
20638 -121.32 39.43 18.0 1860.0 409.0
20639 -121.24 39.37 16.0 2785.0 616.0
population households median_income median_house_value \
0 322.0 126.0 8.3252 452600.0
1 2401.0 1138.0 8.3014 358500.0
2 496.0 177.0 7.2574 352100.0
3 558.0 219.0 5.6431 341300.0
4 565.0 259.0 3.8462 342200.0
... ... ... ... ...
20635 845.0 330.0 1.5603 78100.0
20636 356.0 114.0 2.5568 77100.0
20637 1007.0 433.0 1.7000 92300.0
20638 741.0 349.0 1.8672 84700.0
20639 1387.0 530.0 2.3886 89400.0
ocean_proximity
0 NEAR BAY
1 NEAR BAY
2 NEAR BAY
3 NEAR BAY
4 NEAR BAY
... ...
20635 INLAND
20636 INLAND
20637 INLAND
20638 INLAND
20639 INLAND
[20640 rows x 10 columns]>
In [8]:
df.isnull().sum()
Out[8]:
longitude 0
latitude 0
housing_median_age 0
total_rooms 0
total_bedrooms 207
population 0
households 0
median_income 0
median_house_value 0
ocean_proximity 0
dtype: int64
In [9]:
Explore our developer-friendly HTML to PDF API Printed using PDFCrowd HTML to PDF
df.duplicated().sum()
Out[9]:
0
In [10]:
df['total_bedrooms'].median()
Out[10]:
435.0
In [11]:
df['total_bedrooms'].fillna(df['total_bedrooms'].median(),inplace=True)
In [12]:
for i in df.iloc[:,2:7]:
df[i]=df[i].astype('int')
In [13]:
df.head()
Out[13]:
longitude latitude housing_median_age total_rooms total_bedrooms population households med
0 -122.23 37.88 41 880 129 322 126
1 -122.22 37.86 21 7099 1106 2401 1138
2 -122.24 37.85 52 1467 190 496 177
3 -122.25 37.85 52 1274 235 558 219
4 -122.25 37.85 52 1627 280 565 259
In [14]:
df.describe().T
Out[14]:
count mean std min 25% 50%
longitude 20640.0 -119.569704 2.003532 -124.3500 -121.8000 -118.4900
latitude 20640.0 35.631861 2.135952 32.5400 33.9300 34.2600
housing_median_age 20640.0 28.639486 12.585558 1.0000 18.0000 29.0000
total_rooms 20640.0 2635.763081 2181.615252 2.0000 1447.7500 2127.0000 3
total_bedrooms 20640.0 536.838857 419.391878 1.0000 297.0000 435.0000
population 20640.0 1425.476744 1132.462122 3.0000 787.0000 1166.0000
households 20640.0 499.539680 382.329753 1.0000 280.0000 409.0000
median_income 20640.0 3.870671 1.899822 0.4999 2.5634 3.5348
median_house_value 20640.0 206855.816909 115395.615874 14999.0000 119600.0000 179700.0000 264
In [15]:
Numerical=df.select_dtypes(include=[np.number]).columns
print(Numerical)
Explore our developer-friendly HTML to PDF API Printed using PDFCrowd HTML to PDF
Index(['longitude', 'latitude', 'housing_median_age', 'total_rooms',
'total_bedrooms', 'population', 'households', 'median_income',
'median_house_value'],
dtype='object')
In [16]:
for col in Numerical:
plt.figure(figsize=(10,6))
df[col].plot(kind='hist',title=col,bins=60,edgecolor='black')
plt.ylabel('Frequency')
plt.show()
Explore our developer-friendly HTML to PDF API Printed using PDFCrowd HTML to PDF
Explore our developer-friendly HTML to PDF API Printed using PDFCrowd HTML to PDF
Explore our developer-friendly HTML to PDF API Printed using PDFCrowd HTML to PDF
Explore our developer-friendly HTML to PDF API Printed using PDFCrowd HTML to PDF
In [17]:
for col in Numerical:
plt.figure(figsize=(6,6))
sns.boxplot(df[col],color='blue')
plt.title(col)
plt.ylabel(col)
plt.show()
Explore our developer-friendly HTML to PDF API Printed using PDFCrowd HTML to PDF
Explore our developer-friendly HTML to PDF API Printed using PDFCrowd HTML to PDF
Explore our developer-friendly HTML to PDF API Printed using PDFCrowd HTML to PDF
Explore our developer-friendly HTML to PDF API Printed using PDFCrowd HTML to PDF
Explore our developer-friendly HTML to PDF API Printed using PDFCrowd HTML to PDF
Explore our developer-friendly HTML to PDF API Printed using PDFCrowd HTML to PDF
Explore our developer-friendly HTML to PDF API Printed using PDFCrowd HTML to PDF
Explore our developer-friendly HTML to PDF API Printed using PDFCrowd HTML to PDF
Explore our developer-friendly HTML to PDF API Printed using PDFCrowd HTML to PDF
In [ ]:
Explore our developer-friendly HTML to PDF API Printed using PDFCrowd HTML to PDF