# PANGANTIHON, Norman O.
# STT061-M14
# Activity on Regression Analysis
# Your task is to consider the Anscombe data set again
# These are your new problems
# Problem 1: Encode the Anscombe data set using Excel (Save the file)
# Problem 2: Export the xls file into a csv file (find the Export command
# in the File menu of Excel)
# CSV means comma-separated values
# Problem 3: Use an R command to load the CSV file into R
homedir <- "~/NORMAN/STT061/"
setwd(homedir)
Anscombe <- read.csv("Anscombe.csv")
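# A quick structural check (standard R calls) confirms that the
# eight columns X1..X4, Y1..Y4 used below loaded as expected:
str(Anscombe)
head(Anscombe)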
# Problem 4: Use an R command to compute the mean of X1, X2, X3, X4
mean(Anscombe$X1)
mean(Anscombe$X2)
mean(Anscombe$X3)
mean(Anscombe$X4)
# Problem 5: Use an R command to compute the mean of Y1, Y2, Y3, Y4
mean(Anscombe$Y1)
mean(Anscombe$Y2)
mean(Anscombe$Y3)
mean(Anscombe$Y4)
# Problem 6: Use an R command to compute the variance of Y1, Y2, Y3, Y4
var(Anscombe$Y1)
var(Anscombe$Y2)
var(Anscombe$Y3)
var(Anscombe$Y4)
# Problem 7: Use an R command to compute the variance of X1, X2, X3, X4
var(Anscombe$X1)
var(Anscombe$X2)
var(Anscombe$X3)
var(Anscombe$X4)
# Problem 8: Use an R command to compute the standard deviation (sd) of Y1, Y2, Y3, Y4
sd(Anscombe$Y1)
sd(Anscombe$Y2)
sd(Anscombe$Y3)
sd(Anscombe$Y4)
# Problem 9: Use an R command to compute the standard deviation (sd) of X1, X2, X3, X4
sd(Anscombe$X1)
sd(Anscombe$X2)
sd(Anscombe$X3)
sd(Anscombe$X4)
# Problem 10: Use an R command to compute the correlation for (X1, Y1),
# and also for (X2, Y2), (X3, Y3), (X4, Y4).
cor(Anscombe$X1,Anscombe$Y1)
cor(Anscombe$X2,Anscombe$Y2)
cor(Anscombe$X3,Anscombe$Y3)
cor(Anscombe$X4,Anscombe$Y4)
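# As a compact alternative (a sketch, assuming the column names
# X1..X4 and Y1..Y4 used above), all four correlations can be
# computed in one call:
sapply(1:4, function(i)
  cor(Anscombe[[paste0("X", i)]], Anscombe[[paste0("Y", i)]]))
# all four values are nearly identical (about 0.816)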
# Establish a causal relationship by validating the four assumptions
# Problem 11: Build the Simple Linear Regression Model for (X1, Y1).
# Verify the assumptions of this model.
# Step 1: Load the data set
Anscombe <- read.csv("Anscombe.csv")
# Step 2: Build the Simple Linear Regression Model
model <- lm(Y1 ~ X1, data = Anscombe) # create the linear regression model
plot(Anscombe$X1,Anscombe$Y1) # scatter plot
abline(model,col = "red",lwd = 3) # plot the regression line
model # display regression coefficients
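# summary() (a standard lm accessor) adds standard errors, t-tests,
# and R-squared to the coefficient estimates shown above:
summary(model)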
cor(Anscombe$X1,Anscombe$Y1) # get correlation coefficient
# correlation = 0.8164205 - this value is high
# I can say that there is a strong linear relationship between X1 and Y1
# Findings for Assumption1:
# The linearity of data is satisfied
# because of the high value of correlation
# and the scatter plot shows that
# as X1 increases, Y1 also increases.
# Step 3: Verify Assumption 2: Independence of Error terms
plot(model, 1) # residuals vs fitted, with a reference smoother
plot(model$fitted.values, model$residuals) # the same plot, drawn manually
# Conclusion: Independence of error terms is satisfied.
# In the residuals versus fits plot,
# the points seem randomly scattered,
# and it does not appear that there is a pattern.
# Also, there is a red line which is
# approximately horizontal at 0.
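# A formal check of independence (a sketch, assuming the lmtest
# package is installed): the Durbin-Watson test, where a statistic
# near 2 indicates no autocorrelation in the residuals.
library(lmtest)
dwtest(model)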
# Step 4: Verify Assumption 3: Normality of Error terms.
# For simple linear regression
# we can check the normality of the response variable.
# We can use hist() to check whether the dependent variable
# follows a normal distribution, along with a formal test for
# normality, the QQ plot, and the Empirical Normality Rule.
hist(Anscombe$Y1,probability=T, main="Histogram",xlab="Raw data")
lines(density(Anscombe$Y1),col=2,lwd = 3)
# Findings: it appears that the distribution
# is approximately normal
# We can confirm this by computing the skewness and kurtosis coefficients
library(datawizard)
skewness(Anscombe$Y1)
kurtosis(Anscombe$Y1)
# skew = -0.065 > -0.5 indicating nearly symmetrical data
# Kurtosis = -0.535 < 0 indicating that the curve is flatter than normal
# Findings: The distribution of the response variable is nearly symmetric
# Conclusion: The distribution of the response variable is approximately normal
# Perform Shapiro-Wilk test for Normality
shapiro.test(Anscombe$Y1)
# W = 0.97693, p-value = 0.9467
# The p-value of the test turns out to be 0.9467.
# Since this value is greater than .05, we fail to reject the
# null hypothesis of normality, so the sample data are
# consistent with a normally distributed population.
# Perform the Empirical Normality rule: (This requires that the curve is symmetric)
# Percentage of area covered under normality rule: 68/95/99.7
# Area covered within 1 SD is 68%
# Area covered within 2 SD is 95%
# Area covered within 3 SD is 99.7%
# Compute the actual percentage
data1 <- Anscombe$Y1
# percentage of observations within k standard deviations of the mean
within1sd <- round(sum(abs(data1 - mean(data1)) < 1*sd(data1))*(100/length(data1)),2)
within2sd <- round(sum(abs(data1 - mean(data1)) < 2*sd(data1))*(100/length(data1)),2)
within3sd <- round(sum(abs(data1 - mean(data1)) < 3*sd(data1))*(100/length(data1)),2)
percentage <- paste0(within1sd,'/',within2sd,'/',within3sd)
(normsfreq <- paste0("Actual Percentage: ", percentage))
# "Actual Percentage: 81.82/100/100 which satisfies the empirical rule
# and there is a validation that the curve is symmetric
# using the histogram skewness and kurtosis.
# Therefore the result about the actual empirical coverage
# should be considered
# because it has already been established that the curve is nearly symmetric.
# Findings: the Empirical Normality rule is satisfied
# Decision: The result of Empirical rule is valid
# since the conditions are satisfied and
# the shape of the distribution is nearly symmetric.
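# The same coverage computation is repeated for Y2, Y3, Y4 below;
# a small helper function (a sketch, not part of the original
# assignment) avoids the copy-paste:
empirical_coverage <- function(v) {
  pct <- sapply(1:3, function(k)
    round(100 * mean(abs(v - mean(v)) < k * sd(v)), 2))
  paste0("Actual Percentage: ", paste(pct, collapse = "/"))
}
empirical_coverage(Anscombe$Y1) # same result as above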
# Here is another way to establish normality:
# create a Quantile-Quantile plot of the residuals (the points
# should fall along a straight line)
qqnorm(model$residuals)
qqline(model$residuals)
# Findings: the residuals seem to fall along the straight line.
# This means the error terms are normally distributed.
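# Since the assumption concerns the error terms rather than Y itself,
# we can also run the Shapiro-Wilk test directly on the residuals:
shapiro.test(model$residuals)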
# Final Conclusion about normality of error terms:
# Assumption is satisfied.
# Step 5: Verify Assumption 4: Constant Variance of Y for each x
# The assumption can be checked by examining the scale-location
# plot, also known as the spread-location plot
plot(model,3)
# Findings: There is constant variance of Y for each X value.
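# A formal check of constant variance (a sketch, assuming the lmtest
# package is installed): the Breusch-Pagan test, where a large
# p-value is consistent with constant variance.
library(lmtest)
bptest(model)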
# Another view on how to verify constancy of variance
# Use augment function to automatically insert fitted values
# and residuals
library(broom)
augment_model <- augment(model)
library(ggplot2)
ggplot(augment_model, aes(X1, Y1)) +
  geom_point() +
  stat_smooth(method = "lm", se = TRUE) +
  geom_segment(aes(xend = X1, yend = .fitted), color = "red", size = 0.3)
# The shaded band represents the confidence band around the
# fitted regression line. Its width stays roughly the same from
# the smallest to the largest values of x.
# This suggests that the variance of Y is constant along the values
# of x.
# Conclusion: Assumption 4 is satisfied
# ---------------------------------------------------------
# Final Conclusion: All 4 assumptions are satisfied.
# Therefore the simple linear regression model is
# appropriate for the data. This means we do not have
# to look for other methods to improve the model.
# Validation of the assumptions showed that the Simple linear regression
# model is valid to represent the causal relationship
# between the X1 variable and the Y1 variable.
# ---------------------------------------------------------
# Instead of evaluating the response variable, we can also
# investigate the residuals (actual value - fitted value)
# directly:
hist(model$residuals,probability=T, main="Histogram",xlab="Raw data")
lines(density(model$residuals),col=2,lwd = 3)
# ---------------------------------------------------------
# Problem 12: Build the Simple Linear Regression Model for (X2, Y2).
# Verify the assumptions of this model.
# Step 1: Load the data set
Anscombe <- read.csv("Anscombe.csv")
# Step 2: Build the Simple Linear Regression Model
model <- lm(Y2 ~ X2, data = Anscombe) # create the linear regression model
plot(Anscombe$X2,Anscombe$Y2) # scatter plot
abline(model,col = "red",lwd = 3) # plot the regression line
model # display regression coefficients
cor(Anscombe$X2,Anscombe$Y2) # get correlation coefficient
# correlation = 0.8162365 - this value is high,
# but the scatter plot shows a clearly curved pattern
# Findings for Assumption1:
# Linearity is NOT satisfied: despite the high correlation,
# Y2 rises and then falls as X2 increases, so the relationship
# is curved rather than linear.
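# A quadratic term captures the curvature (a sketch shown only to
# illustrate why a straight line is the wrong model here; model2q
# is a name introduced for this illustration):
model2q <- lm(Y2 ~ poly(X2, 2), data = Anscombe)
summary(model2q) # the quadratic fit is almost perfect for this data set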
# Step 3: Verify Assumption 2: Independence of Error terms
plot(model, 1) # residuals vs fitted, with a reference smoother
plot(model$fitted.values, model$residuals) # the same plot, drawn manually
# Conclusion: Independence of error terms is NOT satisfied.
# The residuals-versus-fits plot shows a clear parabolic pattern
# rather than random scatter.
# Step 4: Verify Assumption 3: Normality of Error terms.
# For simple linear regression
# we can check the normality of the response variable.
# We can use hist() to check whether the dependent variable
# follows a normal distribution, along with a formal test for
# normality, the QQ plot, and the Empirical Normality Rule.
hist(Anscombe$Y2,probability=T, main="Histogram",xlab="Raw data")
lines(density(Anscombe$Y2),col=2,lwd = 3)
# Findings: the distribution appears skewed to the left
# We can confirm this by computing the skewness and kurtosis coefficients
library(datawizard)
skewness(Anscombe$Y2) # skew = -1.316 < -0.5 indicating strong skewness
# on the left side
kurtosis(Anscombe$Y2) # kurtosis = 0.846 > 0 indicating that the curve
# is higher than normal
# Findings: The distribution of the response variable is strongly
# left-skewed and more peaked than normal
# Conclusion: The distribution of the response variable is not normal
# Perform Shapiro-Wilk test for Normality
shapiro.test(Anscombe$Y2)
# W = 0.82837, p-value = 0.02222
# The p-value of the test turns out to be 0.02222.
# Since this value is less than .05, we reject the null hypothesis
# of normality: the data do not appear normally distributed.
# Perform the Empirical Normality rule: (This requires that the curve is symmetric)
# Percentage of area covered under normality rule: 68/95/99.7
# Area covered within 1 SD is 68%
# Area covered within 2 SD is 95%
# Area covered within 3 SD is 99.7%
# Compute the actual percentage
data1 <- Anscombe$Y2
within1sd <- round(sum(abs(data1 - mean(data1)) < 1*sd(data1))*(100/length(data1)),2)
within2sd <- round(sum(abs(data1 - mean(data1)) < 2*sd(data1))*(100/length(data1)),2)
within3sd <- round(sum(abs(data1 - mean(data1)) < 3*sd(data1))*(100/length(data1)),2)
percentage <- paste0(within1sd,'/',within2sd,'/',within3sd)
(normsfreq <- paste0("Actual Percentage: ", percentage))
# "Actual Percentage: 100/100/100" which satisfies the empirical rule
# Findings:
# Decision:
# Here is another way to establish normality:
# create a Quantile-Quantile plot of the residuals (the points
# should fall along a straight line)
qqnorm(model$residuals)
qqline(model$residuals)
# Findings: the residuals deviate from the straight line,
# especially in the tails
# Decision: normality of the error terms is questionable
# Final Conclusion about normality of error terms:
# Assumption is NOT satisfied (Shapiro-Wilk p < .05, skewed data).
# Step 5: Verify Assumption 4: Constant Variance of Y for each x
# The assumption can be checked by examining the scale-location
# plot, also known as the spread-location plot
plot(model,3)
# Findings: the variance of Y is not constant for each
# x value.
# Another view on how to verify constancy of variance
# Use augment function to automatically insert fitted values
# and residuals
library(broom)
augment_model <- augment(model)
library(ggplot2)
ggplot(augment_model, aes(X2, Y2)) +
  geom_point() +
  stat_smooth(method = "lm", se = TRUE) +
  geom_segment(aes(xend = X2, yend = .fitted), color = "red", size = 0.3)
# The shaded band represents the confidence band around the
# fitted regression line.
# Conclusion: Assumption 4 is not satisfied
# ---------------------------------------------------------
# Final Conclusion: Not all four assumptions are satisfied.
# The relationship is curved, so the simple linear regression
# model is NOT appropriate for (X2, Y2).
# ---------------------------------------------------------
# Instead of evaluating the response variable, we can also
# investigate the residuals (actual value - fitted value)
# directly:
hist(model$residuals,probability=T, main="Histogram",xlab="Raw data")
lines(density(model$residuals),col=2,lwd = 3)
# ---------------------------------------------------------
# Problem 13: Build the Simple Linear Regression Model for (X3, Y3).
# Verify the assumptions of this model.
# Step 1: Load the data set
Anscombe <- read.csv("Anscombe.csv")
# Step 2: Build the Simple Linear Regression Model
model <- lm(Y3 ~ X3, data = Anscombe) # create the linear regression model
plot(Anscombe$X3,Anscombe$Y3) # scatter plot
abline(model,col = "red",lwd = 3) # plot the regression line
model # display regression coefficients
cor(Anscombe$X3,Anscombe$Y3) # get correlation coefficient
# correlation = 0.8162867 - this value is high
# Findings for Assumption1:
# The scatter plot shows an almost perfectly linear trend, except
# for one clear outlier that pulls the fitted line away from the
# other points, so linearity holds for all but that single point.
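# Cook's distance (a standard lm diagnostic, which = 4) highlights
# the single observation that dominates the fit:
plot(model, 4)
which.max(cooks.distance(model)) # index of the most influential point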
# Step 3: Verify Assumption 2: Independence of Error terms
plot(model, 1) # residuals vs fitted, with a reference smoother
plot(model$fitted.values, model$residuals) # the same plot, drawn manually
# Conclusion: apart from one large residual (the outlier),
# the residuals show no systematic pattern.
# Step 4: Verify Assumption 3: Normality of Error terms.
# For simple linear regression
# we can check the normality of the response variable.
# We can use hist() to check whether the dependent variable
# follows a normal distribution, along with a formal test for
# normality, the QQ plot, and the Empirical Normality Rule.
hist(Anscombe$Y3,probability=T, main="Histogram",xlab="Raw data")
lines(density(Anscombe$Y3),col=2,lwd = 3)
# Findings: the distribution appears skewed to the right,
# with one extreme value
# We can confirm this by computing the skewness and kurtosis coefficients
library(datawizard)
skewness(Anscombe$Y3) # skew = 1.855 > 1 indicating strong skewness
# on the right side
kurtosis(Anscombe$Y3) # kurtosis = 4.384 > 0 indicating that the curve
# is higher than normal
# Findings: The distribution of the response variable is strongly
# right-skewed and heavy-tailed
# Conclusion: The distribution of the response variable is not normal
# Perform Shapiro-Wilk test for Normality
shapiro.test(Anscombe$Y3)
# W = 0.83361, p-value = 0.02604
# The p-value of the test turns out to be 0.02604.
# Since this value is less than .05, we reject the null hypothesis
# of normality: the data do not appear normally distributed.
# Perform the Empirical Normality rule: (This requires that the curve is symmetric)
# Percentage of area covered under normality rule: 68/95/99.7
# Area covered within 1 SD is 68%
# Area covered within 2 SD is 95%
# Area covered within 3 SD is 99.7%
# Compute the actual percentage
data1 <- Anscombe$Y3
within1sd <- round(sum(abs(data1 - mean(data1)) < 1*sd(data1))*(100/length(data1)),2)
within2sd <- round(sum(abs(data1 - mean(data1)) < 2*sd(data1))*(100/length(data1)),2)
within3sd <- round(sum(abs(data1 - mean(data1)) < 3*sd(data1))*(100/length(data1)),2)
percentage <- paste0(within1sd,'/',within2sd,'/',within3sd)
(normsfreq <- paste0("Actual Percentage: ", percentage))
# "Actual Percentage: 90.91/90.91/100" which satisfies the empirical rule
# Findings:
# Decision:
# Here is another way to establish normality:
# create a Quantile-Quantile plot of the residuals (the points
# should fall along a straight line)
qqnorm(model$residuals)
qqline(model$residuals)
# Findings: most residuals fall near the line, but the outlier
# deviates strongly from it
# Decision: normality of the error terms is questionable
# Final Conclusion about normality of error terms:
# Assumption is NOT satisfied (Shapiro-Wilk p < .05, skewed data).
# Step 5: Verify Assumption 4: Constant Variance of Y for each x
# The assumption can be checked by examining the scale-location
# plot, also known as the spread-location plot
plot(model,3)
# Findings: the variance of Y is not constant for each
# x value.
# Another view on how to verify constancy of variance
# Use augment function to automatically insert fitted values
# and residuals
library(broom)
augment_model <- augment(model)
library(ggplot2)
ggplot(augment_model, aes(X3, Y3)) +
  geom_point() +
  stat_smooth(method = "lm", se = TRUE) +
  geom_segment(aes(xend = X3, yend = .fitted), color = "red", size = 0.3)
# The shaded band represents the confidence band around the
# fitted regression line.
# Conclusion: Assumption 4 is not satisfied
# ---------------------------------------------------------
# Final Conclusion: Not all four assumptions are satisfied.
# A single outlier distorts the fit, so the simple linear
# regression model is NOT appropriate for (X3, Y3) as it stands.
# ---------------------------------------------------------
# Instead of evaluating the response variable, we can also
# investigate the residuals (actual value - fitted value)
# directly:
hist(model$residuals,probability=T, main="Histogram",xlab="Raw data")
lines(density(model$residuals),col=2,lwd = 3)
# ---------------------------------------------------------
# Problem 14: Build the Simple Linear Regression Model for (X4, Y4).
# Verify the assumptions of this model.
# Step 1: Load the data set
Anscombe <- read.csv("Anscombe.csv")
# Step 2: Build the Simple Linear Regression Model
model <- lm(Y4 ~ X4, data = Anscombe) # create the linear regression model
plot(Anscombe$X4,Anscombe$Y4) # scatter plot
abline(model,col = "red",lwd = 3) # plot the regression line
model # display regression coefficients
cor(Anscombe$X4,Anscombe$Y4) # get correlation coefficient
# correlation = 0.8165214 - this value is high
# Findings for Assumption1:
# Linearity is NOT meaningfully established: all X4 values except
# one are identical, so the apparent linear trend is produced
# entirely by a single influential point.
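# Tabulating X4 (a standard call) makes the degenerate design
# visible: all but one observation share the same x value.
table(Anscombe$X4)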
# Step 3: Verify Assumption 2: Independence of Error terms
plot(model, 1) # residuals vs fitted, with a reference smoother
plot(model$fitted.values, model$residuals) # the same plot, drawn manually
# Conclusion: with only two distinct fitted values, the residual
# plot is uninformative; independence cannot be meaningfully assessed.
# Step 4: Verify Assumption 3: Normality of Error terms.
# For simple linear regression
# we can check the normality of the response variable.
# We can use hist() to check whether the dependent variable
# follows a normal distribution, along with a formal test for
# normality, the QQ plot, and the Empirical Normality Rule.
hist(Anscombe$Y4,probability=T, main="Histogram",xlab="Raw data")
lines(density(Anscombe$Y4),col=2,lwd = 3)
# Findings: the distribution appears skewed to the right,
# with one extreme value
# We can confirm this by computing the skewness and kurtosis coefficients
library(datawizard)
skewness(Anscombe$Y4) # skew = 1.507 > 1 indicating strong skewness
# on the right side
kurtosis(Anscombe$Y4) # kurtosis = 3.151 > 0 indicating that the curve
# is higher than normal
# Findings: The distribution of the response variable is strongly
# right-skewed and more peaked than normal
# Conclusion: the shape suggests non-normality, although the
# formal test below does not reject normality
# Perform Shapiro-Wilk test for Normality
shapiro.test(Anscombe$Y4)
# W = 0.87536, p-value = 0.09081
# The p-value of the test turns out to be 0.09081.
# Since this value is greater than .05, we fail to reject the null
# hypothesis of normality, although the skewness suggests caution.
# Perform the Empirical Normality rule: (This requires that the curve is symmetric)
# Percentage of area covered under normality rule: 68/95/99.7
# Area covered within 1 SD is 68%
# Area covered within 2 SD is 95%
# Area covered within 3 SD is 99.7%
# Compute the actual percentage
data1 <- Anscombe$Y4
within1sd <- round(sum(abs(data1 - mean(data1)) < 1*sd(data1))*(100/length(data1)),2)
within2sd <- round(sum(abs(data1 - mean(data1)) < 2*sd(data1))*(100/length(data1)),2)
within3sd <- round(sum(abs(data1 - mean(data1)) < 3*sd(data1))*(100/length(data1)),2)
percentage <- paste0(within1sd,'/',within2sd,'/',within3sd)
(normsfreq <- paste0("Actual Percentage: ", percentage))
# "Actual Percentage: 90.91/90.91/100" which satisfies the empirical rule
# Findings:
# Decision:
# Here is another way to establish normality:
# create a Quantile-Quantile plot of the residuals (the points
# should fall along a straight line)
qqnorm(model$residuals)
qqline(model$residuals)
# Findings: the residuals deviate from the line in the tails
# Decision: normality of the error terms is questionable
# Final Conclusion about normality of error terms:
# the evidence is mixed (Shapiro-Wilk does not reject normality,
# but the data are skewed), so the assumption is doubtful.
# Step 5: Verify Assumption 4: Constant Variance of Y for each x
# The assumption can be checked by examining the scale-location
# plot, also known as the spread-location plot
plot(model,3)
# Findings: with a single distinct x value for all but one
# observation, constancy of variance cannot be meaningfully assessed.
# Another view on how to verify constancy of variance
# Use augment function to automatically insert fitted values
# and residuals
library(broom)
augment_model <- augment(model)
library(ggplot2)
ggplot(augment_model, aes(X4, Y4)) +
  geom_point() +
  stat_smooth(method = "lm", se = TRUE) +
  geom_segment(aes(xend = X4, yend = .fitted), color = "red", size = 0.3)
# The shaded band represents the confidence band around the
# fitted regression line.
# Conclusion: Assumption 4 is not satisfied
# ---------------------------------------------------------
# Final Conclusion: Not all four assumptions are satisfied.
# The fit depends entirely on one influential point, so the simple
# linear regression model is NOT appropriate for (X4, Y4).
# ---------------------------------------------------------
# Instead of evaluating the response variable, we can also
# investigate the residuals (actual value - fitted value)
# directly:
hist(model$residuals,probability=T, main="Histogram",xlab="Raw data")
lines(density(model$residuals),col=2,lwd = 3)
# ---------------------------------------------------------
# Finally, which pair is suitable for Simple Linear Regression Analysis?
# Short Answer: only (X1, Y1). It is the only pair for which all
# four assumptions are satisfied, even though all four pairs share
# nearly identical means, variances, and correlations - which is
# the whole point of the Anscombe quartet.
# ---------------------------------------------------------