Session 05: Data Visualisation with R
Dr. Kunal Saha
August 17, 2023
1 Installing and Calling the Libraries
library(openxlsx)
library(knitr)
library(tidyverse)
library(dplyr)
library(data.table)
# Print the current working directory. Files referenced by a relative
# path (such as the Excel workbook read later) are looked up from here.
base::getwd()
## [1] "C:/DDrive/ztest/stats1/s05"
# Emit a reminder explaining what the getwd() output means for data files.
message("Output from getwd(). This is where your data file(s) should be stored")
## Output from getwd(). This is where your data file(s) should be stored
The Excel data set should be stored in the directory returned by the getwd() function.
# To manually set the working directory (the folder where your data is
# currently stored). Backslashes must be doubled inside an R string.
setwd("C:\\DDrive\\ztest\\stats1\\s05")
2 Import the Dataset
# Read the "EmployeeData" sheet of the workbook, taking column names from
# the first row, then preview the first six rows as a formatted table.
emp_data_c <- openxlsx::read.xlsx(
  xlsxFile = "s05_data.xlsx",
  sheet = "EmployeeData",
  colNames = TRUE
)
emp_preview <- head(emp_data_c)
knitr::kable(emp_preview)
| id | Full_Name | Gender | Department | Salary | City | Date.Hired | Age | Full.Time | HRA | Bonus | Final.Salary |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 000362 | Abdul Sayed | M | Human Resources | 71100 | Bengaluru | 43690 | 25 | Yes | 0.2 | 0.25 | 106650 |
| 000201 | Akshit Chouhan | M | Administration | 130440 | Pune | 42036 | 31 | Yes | 0.2 | 0.25 | 195660 |
| 000307 | Aman Devda | M | Engineering | 125000 | Chennai | 43598 | 24 | Yes | 0.3 | 0.30 | 211250 |
| 000639 | Amit Dhanak | M | Human Resources | 57500 | Bengaluru | 44344 | 21 | Yes | 0.2 | 0.25 | 86250 |
| 000306 | Animesh Gehlod | M | Engineering | 132200 | Gurugram | 43510 | 25 | Yes | 0.2 | 0.30 | 206232 |
| 000303 | Anushka Gupta | F | Human Resources | 56500 | Bengaluru | 43468 | 23 | No | 0.2 | 0.25 | 84750 |
3 Using ggplot2
Always start by calling the ggplot() function.
Then specify the data object. It should be a data frame.
Then come the aesthetics, set in the aes() function: set the variables for the X and Y axes (as required)
Next, call the appropriate geom for the plot you want, for example, geom_bar() for a bar chart.
Add labels as required.
3.1 Draw a chart / plot to show employees per department
# Bar chart: one bar per department; bar height is the number of rows
# (employees) in that department.
ggplot(emp_data_c, mapping = aes(x = Department)) +
  geom_bar() +
  ggtitle("Frequency Distribution of Departments")
2
Frequency Distribution of Departments
12.5
10.0
7.5
count
5.0
2.5
0.0
Accounting Administration Engineering Human Resources Information Services
Department
3.2 Show the distribution of Salaries
## Alternative 1: fix the number of bins
## (ggplot2 derives the bin width from the data range)
ggplot(emp_data_c, mapping = aes(x = Salary)) +
  geom_histogram(
    bins = 20,
    colour = "grey30",
    fill = "white"
  ) +
  ggtitle("Salary Histogram")
3
Salary Histogram
6
4
count
50000 100000 150000 200000 250000
Salary
## Alternative 2: fix the width of each bin
## (here every bar spans 10000 units of Salary)
ggplot(emp_data_c, mapping = aes(x = Salary)) +
  geom_histogram(
    binwidth = 10000,
    colour = "grey30",
    fill = "white"
  ) +
  ggtitle("Salary Histogram")
4
Salary Histogram
6
4
count
50000 100000 150000 200000 250000
Salary
3.2.1 Exercise: Show the distribution of Age
# Histogram of employee ages, split into 10 bins.
ggplot(emp_data_c, mapping = aes(x = Age)) +
  geom_histogram(
    bins = 10,
    colour = "grey30",
    fill = "white"
  ) +
  ggtitle("Age Histogram")
5
Age Histogram
8
6
count
20 25 30 35
Age
3.3 Show the Department wise count of employees as a Pie Chart
# Pie chart: a single stacked bar (x fixed at factor(1)), filled by
# Department, drawn in polar coordinates so the stacked counts become angles.
# The count axis is relabelled at every quarter of the total row count.
ggplot(emp_data_c, mapping = aes(x = factor(1), fill = Department)) +
  geom_bar() +
  coord_polar(theta = "y") +
  scale_y_continuous(
    breaks = seq(0, nrow(emp_data_c), by = nrow(emp_data_c) / 4),
    labels = c("0", "25%", "50%", "75%", "100%")
  )
6
0/100%
1
Department
Accounting
factor(1)
Administration
75% 25%
Engineering
Human Resources
Information Services
50%
count
3.4 Show the Distribution between Salary and Age
# Scatterplot: one point per employee, Age on the x axis, Salary on the y axis.
ggplot(emp_data_c, mapping = aes(x = Age, y = Salary)) +
  geom_point() +
  ggtitle("Scatterplot of Age and Salary")
7
Scatterplot of Age and Salary
250000
200000
Salary
150000
100000
50000
25 30 35
Age
3.5 Can we Add more Information to this plot ?
We can show department-wise salaries in the same plot by mapping Department to the point colour.
# Same scatterplot as before, with each point coloured by the employee's
# Department (ggplot2 adds the colour legend automatically).
ggplot(
  emp_data_c,
  mapping = aes(x = Age, y = Salary, colour = Department)
) +
  geom_point() +
  ggtitle("Scatterplot of Age and Salary")
8
Scatterplot of Age and Salary
250000
200000
Department
Accounting
Salary
150000 Administration
Engineering
Human Resources
Information Services
100000
50000
25 30 35
Age
3.6 Boxplot
A boxplot shows the distribution of a numerical variable. It can be combined with a categorical variable to
build more complex plots.
For example, a boxplot can show the distribution of Salary across ALL employees, and also the distribution of
Salary within each department.
3.6.1 Show the Distribution of Salary for ALL Employees
# Single boxplot of Salary over all employees; x is a constant one-level
# factor so that every row falls into the same box.
ggplot(emp_data_c, mapping = aes(x = factor(0), y = Salary)) +
  geom_boxplot() +
  ggtitle("Boxplot of Salary")
9
Boxplot of Salary
250000
200000
Salary
150000
100000
50000
0
factor(0)
# Show the distribution of Salary per Department: one box per department.
# The title names the variables actually plotted (Salary by Department);
# Age is not part of this plot.
ggplot(data = emp_data_c, aes(x = Department, y = Salary)) +
  geom_boxplot() +
  labs(title = "Boxplot of Salary by Department")
10
Boxplot of Age and Salary
250000
200000
Salary
150000
100000
50000
Accounting Administration Engineering Human Resources Information Services
Department
Interpret the above plot
4 Exercises
4.1 Draw a plot to show city-wise count of employees
# Bar chart: one bar per city; bar height is the number of employees there.
# The title refers to cities, the variable mapped to the x axis.
ggplot(data = emp_data_c, aes(x = City)) +
  geom_bar() +
  labs(title = "Frequency Distribution of Cities")
11
Frequency Distribution of Departments
4
count
Bengaluru Chennai Gurugram Kolkata Mumbai New Delhi Pune
City
4.2 Creating a new Column “Year” from Date of Joining
# Date.Hired was imported as a number (a day count); convert it to Date
# using the origin "1899-12-30", then preview the updated data.
emp_data_c[["Date.Hired"]] <- as.Date(
  emp_data_c[["Date.Hired"]],
  origin = "1899-12-30"
)
knitr::kable(head(emp_data_c))
| id | Full_Name | Gender | Department | Salary | City | Date.Hired | Age | Full.Time | HRA | Bonus | Final.Salary |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 000362 | Abdul Sayed | M | Human Resources | 71100 | Bengaluru | 2019-08-13 | 25 | Yes | 0.2 | 0.25 | 106650 |
| 000201 | Akshit Chouhan | M | Administration | 130440 | Pune | 2015-02-01 | 31 | Yes | 0.2 | 0.25 | 195660 |
| 000307 | Aman Devda | M | Engineering | 125000 | Chennai | 2019-05-13 | 24 | Yes | 0.3 | 0.30 | 211250 |
| 000639 | Amit Dhanak | M | Human Resources | 57500 | Bengaluru | 2021-05-28 | 21 | Yes | 0.2 | 0.25 | 86250 |
| 000306 | Animesh Gehlod | M | Engineering | 132200 | Gurugram | 2019-02-14 | 25 | Yes | 0.2 | 0.30 | 206232 |
| 000303 | Anushka Gupta | F | Human Resources | 56500 | Bengaluru | 2019-01-03 | 23 | No | 0.2 | 0.25 | 84750 |
12
# Add a Joined.Year column holding the calendar year of each hiring date.
# Date.Hired is referenced as a bare column name so that mutate() reads it
# from the data frame being piped in, not from a separate global object.
emp_data_c1 <- emp_data_c %>%
  mutate(Joined.Year = year(Date.Hired))
kable(head(emp_data_c1))
| id | Full_Name | Gender | Department | Salary | City | Date.Hired | Age | Full.Time | HRA | Bonus | Final.Salary | Joined.Year |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 000362 | Abdul Sayed | M | Human Resources | 71100 | Bengaluru | 2019-08-13 | 25 | Yes | 0.2 | 0.25 | 106650 | 2019 |
| 000201 | Akshit Chouhan | M | Administration | 130440 | Pune | 2015-02-01 | 31 | Yes | 0.2 | 0.25 | 195660 | 2015 |
| 000307 | Aman Devda | M | Engineering | 125000 | Chennai | 2019-05-13 | 24 | Yes | 0.3 | 0.30 | 211250 | 2019 |
| 000639 | Amit Dhanak | M | Human Resources | 57500 | Bengaluru | 2021-05-28 | 21 | Yes | 0.2 | 0.25 | 86250 | 2021 |
| 000306 | Animesh Gehlod | M | Engineering | 132200 | Gurugram | 2019-02-14 | 25 | Yes | 0.2 | 0.30 | 206232 | 2019 |
| 000303 | Anushka Gupta | F | Human Resources | 56500 | Bengaluru | 2019-01-03 | 23 | No | 0.2 | 0.25 | 84750 | 2019 |
4.3 Exercise: Create a Column called “Joined.Month”
# Add a Joined.Month column holding the month number of each hiring date.
# Date.Hired is taken from emp_data_c1 itself (the data being mutated)
# rather than from emp_data_c, so the new column always lines up row by row
# with the data frame it is added to.
emp_data_c1 <- emp_data_c1 %>%
  mutate(Joined.Month = month(Date.Hired))
kable(head(emp_data_c1))
| id | Full_Name | Gender | Department | Salary | City | Date.Hired | Age | Full.Time | HRA | Bonus | Final.Salary | Joined.Year | Joined.Month |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 000362 | Abdul Sayed | M | Human Resources | 71100 | Bengaluru | 2019-08-13 | 25 | Yes | 0.2 | 0.25 | 106650 | 2019 | 8 |
| 000201 | Akshit Chouhan | M | Administration | 130440 | Pune | 2015-02-01 | 31 | Yes | 0.2 | 0.25 | 195660 | 2015 | 2 |
| 000307 | Aman Devda | M | Engineering | 125000 | Chennai | 2019-05-13 | 24 | Yes | 0.3 | 0.30 | 211250 | 2019 | 5 |
| 000639 | Amit Dhanak | M | Human Resources | 57500 | Bengaluru | 2021-05-28 | 21 | Yes | 0.2 | 0.25 | 86250 | 2021 | 5 |
| 000306 | Animesh Gehlod | M | Engineering | 132200 | Gurugram | 2019-02-14 | 25 | Yes | 0.2 | 0.30 | 206232 | 2019 | 2 |
| 000303 | Anushka Gupta | F | Human Resources | 56500 | Bengaluru | 2019-01-03 | 23 | No | 0.2 | 0.25 | 84750 | 2019 | 1 |
4.4 Exercise: Draw a plot of the number of employees by joining year
13
# Bar chart: number of employees who joined in each year.
ggplot(emp_data_c1, mapping = aes(x = Joined.Year)) +
  geom_bar() +
  ggtitle("Frequency Distribution of Year of Joining")
Frequency Distribution of Year of Joining
8
6
count
2012 2016 2020
Joined.Year
4.5 Exercise: Draw a plot of the number of employees by joining month
# Bar chart: number of employees who joined in each calendar month.
# Joined.Month is wrapped in factor() so every month gets its own discrete
# bar and axis tick. The title refers to the month, the variable plotted.
ggplot(data = emp_data_c1, aes(x = factor(Joined.Month))) +
  geom_bar() +
  labs(title = "Frequency Distribution of Month of Joining")
14
Frequency Distribution of Year of Joining
4
3
count
1 2 3 4 5 6 7 8 9 10 11 12
factor(Joined.Month)
5 References:
R for Data Science (2e) : https://r4ds.hadley.nz/
The R Manuals : https://cran.r-project.org/manuals.html
R Documentation : https://www.r-project.org/other-docs.html
Tidyverse Documentation : https://www.tidyverse.org/
15