HANDLING OUTLIERS
Using Box Plot
# Horsepower values from the built-in mtcars dataset
data <- mtcars[["hp"]]

# Box plot of the raw values, before any filtering
boxplot(
  data,
  horizontal = TRUE,
  col = "lightblue",
  main = "Boxplot of Horsepower (hp) - Before Removing Outliers"
)

# Both quartiles come from a single quantile() call; each element keeps its
# percentile label ("25%" / "75%").
quartiles <- quantile(data, probs = c(0.25, 0.75))
Q1 <- quartiles[1] # 25th percentile
Q3 <- quartiles[2] # 75th percentile

# Interquartile range. The result inherits the "75%" label from Q3 (arithmetic
# keeps the names of the first operand), so that label is not meaningful here.
# This variable also shares its name with the stats::IQR() function.
IQR <- Q3 - Q1

# Fences placed 1.5 * IQR beyond the quartiles; values outside them are
# treated as outliers. The bounds inherit the labels of Q1 and Q3.
fence_width <- 1.5 * IQR
lower_bound <- Q1 - fence_width
upper_bound <- Q3 + fence_width

# Report the summary values
print(Q1)
print(Q3)
print(IQR)
print(lower_bound)
print(upper_bound)

# Flag every value that lies strictly outside the fences
is_outlier <- data < lower_bound | data > upper_bound
outliers <- data[is_outlier]
print(outliers)

# Keep only the values inside the fences (bounds inclusive)
data_clean <- data[!is_outlier]

# Box plot of the filtered values
boxplot(
  data_clean,
  horizontal = TRUE,
  col = "lightgreen",
  main = "Boxplot of Horsepower (hp) - After Removing Outliers"
)
Output
> print(Q1)
 25%
96.5
> print(Q3)
75%
180
> print(IQR)
 75%
83.5
> print(lower_bound)
   25%
-28.75
> print(upper_bound)
   75%
305.25
> print(outliers)
[1] 335
Using Histogram
# Horsepower values from the built-in mtcars dataset
data <- mtcars[["hp"]]

# Histogram of the raw values, before any filtering
hist(
  data,
  breaks = 10,
  col = "lightblue",
  border = "black",
  xlab = "Horsepower (hp)",
  main = "Histogram of Horsepower (hp) - Before Handling Outliers"
)

# Both quartiles come from a single quantile() call; each element keeps its
# percentile label ("25%" / "75%").
hp_quartiles <- quantile(data, probs = c(0.25, 0.75))
Q1 <- hp_quartiles[1] # 25th percentile
Q3 <- hp_quartiles[2] # 75th percentile

# Interquartile range. It prints under the "75%" label because the subtraction
# keeps the names of its first operand (Q3); the label carries no meaning here.
# This variable also shares its name with the stats::IQR() function.
IQR <- Q3 - Q1

# Outlier fences: 1.5 * IQR below Q1 and above Q3
margin <- 1.5 * IQR
lower_bound <- Q1 - margin
upper_bound <- Q3 + margin

# Report the summary values
print(Q1)
print(Q3)
print(IQR)
print(lower_bound)
print(upper_bound)

# Values strictly below the lower fence or strictly above the upper fence
outside_fences <- data < lower_bound | data > upper_bound
outliers <- data[outside_fences]
print(outliers)

# Everything else (fence values included) is kept
data_clean <- data[!outside_fences]

# Histogram of the filtered values
hist(
  data_clean,
  breaks = 10,
  col = "lightgreen",
  border = "black",
  xlab = "Horsepower (hp)",
  main = "Histogram of Horsepower (hp) - After Removing Outliers"
)
Output
> print(Q1)
 25%
96.5
> print(Q3)
75%
180
> print(IQR)
 75%
83.5
> print(lower_bound)
   25%
-28.75
> print(upper_bound)
   75%
305.25
> print(outliers)
[1] 335
Using Bar Plot
# Load the built-in airquality dataset
data("airquality")

# 1. Check for Missing Values in Each Column
cat("Missing Values by Column:\n")
print(colSums(is.na(airquality)))

# 2. Visualize Gaps and Distribution Using Bar Chart
# Treat each distinct Ozone reading as a category and add one extra category
# for the missing readings. A factor is used (instead of writing "Missing" into
# the numeric vector) so the levels stay in numeric order; coercing the vector
# to character would make table() sort the bars as text ("1", "10", "108", ...).
ozone_data <- addNA(factor(airquality$Ozone), ifany = TRUE)
levels(ozone_data)[is.na(levels(ozone_data))] <- "Missing"

# Bar chart to show distribution and gaps in Ozone levels
barplot(
  table(ozone_data),
  main = "Ozone Levels with Gaps",
  xlab = "Ozone Levels",
  ylab = "Frequency",
  col = "lightblue",
  border = "black",
  las = 2, # Draw axis labels perpendicular to the axis (vertical on the x-axis)
  cex.names = 0.7 # Reduce label size for readability
)

# 3. Detect Outliers Using Frequency Analysis
# Remove NAs for outlier detection
ozone_data_clean <- na.omit(airquality$Ozone)

# Count the frequency of each value
ozone_freq <- table(ozone_data_clean)
cat("\nFrequency of Ozone Levels:\n")
print(ozone_freq)

# Flag values that occur exactly once (rare occurrences). This is a frequency
# heuristic: it says nothing about how extreme a value is. names() returns the
# flagged readings as character labels, not numbers.
ozone_outliers <- names(ozone_freq[ozone_freq == 1])
cat("\nDetected Outliers (Rare Values):", ozone_outliers, "\n")

# Bar chart showing frequency distribution (for outlier spotting)
barplot(
  ozone_freq,
  main = "Frequency Distribution of Ozone Levels",
  xlab = "Ozone Levels",
  ylab = "Frequency",
  col = "salmon",
  border = "black",
  las = 2,
  cex.names = 0.7
)

# 4. Clean Data: Impute Missing Values with Median
# The NAs were already dropped from ozone_data_clean, so no na.rm is needed.
median_value <- median(ozone_data_clean)
imputed_ozone <- ifelse(is.na(airquality$Ozone), median_value, airquality$Ozone)

# 5. Compare Before and After Cleaning Using Bar Chart
# Frequency of the original readings. table() drops NA by default, so the
# missing readings are simply absent from this chart.
ozone_freq_original <- table(airquality$Ozone)

# Frequency after imputation. Only the missing readings were replaced (by the
# median); the rare values flagged above are still present.
ozone_freq_cleaned <- table(imputed_ozone)

# Set layout for side-by-side plots, remembering the previous setting
old_par <- par(mfrow = c(1, 2))

# Original data (missing readings not shown)
barplot(
  ozone_freq_original,
  main = "Original Ozone Levels",
  xlab = "Ozone Levels",
  ylab = "Frequency",
  col = "coral",
  border = "black",
  las = 2,
  cex.names = 0.7
)

# Imputed data (missing readings replaced by the median)
barplot(
  ozone_freq_cleaned,
  main = "Cleaned & Imputed Ozone Levels",
  xlab = "Ozone Levels",
  ylab = "Frequency",
  col = "lightgreen",
  border = "black",
  las = 2,
  cex.names = 0.7
)

# Restore the layout that was active before the comparison
par(old_par)
Output