#"Predicting the salary of an IT employee from the EU region"
#From the perspective of both employees and employers, it is essential to know the yearly salary
#of a particular candidate or a particular position in the organization.
#In this analysis, an attempt is made to solve the problem of arriving at the right salary based on a
#historical survey of existing employees across different parameters.
#This type of study is part of HR analytics, which can assist HR teams in setting goals, measuring
#success, and optimizing procedures based on desired skill sets and the skill sets offered by applicants.
#The company may concentrate on training its personnel based on this analysis, which will help the
#company generate income in the future.
#This prediction is very helpful for aspiring IT candidates to estimate their salary based on their
#skill sets, current salary income, and other variables.
#It can also be used by the HR department to determine the salary for an existing or new position in
#an organization.
#loading the required library and importing the "IT salary survey EU" data set
library(readxl)
library(descr)
library(caTools)
library(caret)
library(ISLR)
data1<-read.csv("C:/Users/hp/Downloads/2027911 MLA Cia1/IT Salary Survey EU 2020...csv",
                stringsAsFactors = TRUE)
sum(is.na(data1))
str(data1)
#The unwanted columns which have no bearing on the given problem are removed
data1<-data1[-23]
data1<-data1[-22]
data1<-data1[-21]
data1<-data1[-1]
View(data1)
#the sum of null values is found and the rows are removed to get a clean dependent variable,
#yearly salary
sum(is.na(data1))
data1<-na.omit(data1)
summary(data1)
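#before dropping rows wholesale, a per-column NA count shows where the gaps sit
#(a minimal sketch on the data set loaded above)
colSums(is.na(data1)) #NA count per column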
#The data set is split and a subset of 70% of the data is created
library(caTools)
set.seed(100)
split1<-sample.split(data1$City,SplitRatio =0.7)
summary(split1)
#The data set is trained and tested in the ratio of 70:30
#the held-out test set lets us check the generalisability of the trained model
datatrain<-subset(data1, split1==TRUE)
datatest<-subset(data1, split1==FALSE)
summary(datatrain)
str(datatrain)
#logistic regression
library(caret)
lreg1<-train(Contract.duration~ .,
method ="glm",
family = "binomial",
data=datatrain)
lreg1
summary(lreg1)
#outlier rows identified from the diagnostic plots of "reg1" (outlier section at the bottom)
#are converted to NA here
data1[c(631,659,805,442,854,805,565,575,779,153,839,728,1224,495,574,967,959,674),]<-NA
reg1<-lm(Yearly.brutto.salary..without.bonus.and.stocks..in.EUR~
           Total.years.of.experience+Yearly.bonus...stocks.in.EUR+
           Annual.brutto.salary..without.bonus.and.stocks..one.year.ago..Only.answer.if.staying.in.the.same.country+
           Years.of.experience.in.Germany+
           Annual.bonus.stocks.one.year.ago..Only.answer.if.staying.in.same.country+
           Number.of.vacation.days+Contract.duration+Age, data = datatrain)
#replacing blanks in a categorical column with "none"
library(tidyverse)
data1%>%
  select(Age,Gender,City,Position,Total.years.of.experience,Years.of.experience.in.Germany,
         Seniority.level,Your.main.technology...programming.language,
         Other.technologies.programming.languages.you.use.often,
         Yearly.brutto.salary..without.bonus.and.stocks..in.EUR,Yearly.bonus...stocks.in.EUR,
         Annual.brutto.salary..without.bonus.and.stocks..one.year.ago..Only.answer.if.staying.in.the.same.country,
         Annual.bonus.stocks.one.year.ago..Only.answer.if.staying.in.same.country,
         Number.of.vacation.days,Employment.status,Contract.duration,Main.language.at.work,
         Company.size,Company.type)%>%
  mutate(Position=replace_na(Position,"none"))%>%
  view()
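#a minimal sketch for applying the same replacement to every factor column at once
#(assumes the columns were imported as factors via stringsAsFactors = TRUE; uses forcats)
library(forcats)
data1%>%
  mutate(across(where(is.factor), ~fct_explicit_na(.x, na_level = "none")))%>%
  view()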
#-------------------------------------------------------
data1%>%
  select(Age,Gender,City,Position,Total.years.of.experience,Seniority.level,
         Your.main.technology...programming.language,
         Other.technologies.programming.languages.you.use.often,
         Yearly.brutto.salary..without.bonus.and.stocks..in.EUR,Yearly.bonus...stocks.in.EUR,
         Annual.brutto.salary..without.bonus.and.stocks..one.year.ago..Only.answer.if.staying.in.the.same.country,
         Annual.bonus.stocks.one.year.ago..Only.answer.if.staying.in.same.country,
         Number.of.vacation.days,Employment.status)
#importing the student data set for the next regression example
library(readxl)
data1<-read.csv("C:/Users/hp/Downloads/Sem4/satya 1.csv", stringsAsFactors = TRUE)
#outlier rows (6) identified in the outlier section at the bottom were marked NA, then removed here
data1<-na.omit(data1)
library(caTools)
set.seed(100)
split1<-sample.split(data1$Course,SplitRatio =0.7)
summary(split1)
datatrain<-subset(data1, split1==TRUE)
datatest<-subset(data1, split1==FALSE)
summary(datatrain)
str(datatrain)
library(ggplot2)
ggplot(datatrain, aes(Student_engagement, goal_orientation))+geom_point()
cor(datatrain$Student_engagement, datatrain$goal_orientation)
#a relationship exists (correlation r = 0.68)
cor.test(datatrain$Student_engagement, datatrain$goal_orientation)
ggplot(datatrain, aes(Student_engagement, goal_orientation))+geom_smooth()
#looks linear
reg1<-lm(datatrain$Student_engagement~
datatrain$goal_orientation+
datatrain$academic_self_eficacy, data = datatrain)
summary(reg1)
#OR
reg1<-lm(Student_engagement~
goal_orientation+
academic_self_eficacy,
data = datatrain)
summary(reg1)
#F-statistic p<0.05: reject the null; the model is a good fit
#How good the model is, is answered by R2, which is 66%
#Adjusted R2 is used with more than one independent variable (it adjusts for degrees of freedom)
#intercept: does not significantly affect the dependent variable (p>0.05)
#beta1 = 0.22 (goal_orientation), beta2 = 0.73 (academic_self_eficacy)
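#both statistics can be pulled straight from the model summary (a minimal sketch)
s<-summary(reg1)
s$r.squared #R-squared
s$adj.r.squared #adjusted R-squared, penalized for degrees of freedom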
#Multicollinearity
library(car)
vif(reg1) #variance inflation factor:<4(small model, no multicollinearity)
#Assumptions
#Checking randomness: the error terms should appear randomly scattered
plot(reg1$residuals) #residuals against observation index
#normality of error terms(boxplot,qqnorm,shapiro,skewness)
boxplot(reg1$residuals) #the midline is not at the middle, deviations exist
shapiro.test(reg1$residuals) #P<0.05 reject null, not normally distributed
hist(reg1$residuals)#not normal
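#a QQ plot (mentioned above) gives another view of normality; points should follow the line
qqnorm(reg1$residuals)
qqline(reg1$residuals)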
descr(datatrain)
summary(datatrain)
library(moments)
# skewness and kurtosis - normality
datatrain$residual1<-reg1$residuals
moments::skewness(datatrain$residual1)
moments::kurtosis(datatrain$residual1)
#OR
library(e1071)
kurtosis(reg1$residuals)
#outliers
#heteroscedasticity: look for a cone shape; residuals should be equally spread on both sides
plot(reg1$fitted.values, reg1$residuals)
library(lmtest)
bptest(reg1) #the variance is not equally distributed (heteroscedasticity); the model is not a good fit
#outliers---
plot(reg1)
#OR
library(car)
influenceIndexPlot(reg1)#261
#convert outliers to missing (NA); they are removed from the main data set in the next step
data1[c(261,799,96,426,248),]<-NA
reg1<-lm(emission~mass+
wheelbase+
axlesteering+
axleother+
enginecapacity+
emisionreduction,data = data1)
summary(reg1)
library(car)
vif(reg1)
#with more variables, two candidate models are built, each dropping one of the collinear
#axle variables (axlesteering vs axleother), based on the higher VIF ranges
regm1<-lm(emission~mass+
wheelbase+
axleother+
enginecapacity+
emisionreduction,data = data1)
summary(regm1)
vif(regm1) #no multicollinearity: no high VIF values
regm2<-lm(emission~mass+
wheelbase+
axlesteering+
enginecapacity+
emisionreduction,data = data1)
summary(regm2)
vif(regm2)
#comparing the models: RMSE (and AIC, see the sketch below)
prem1<-predict(regm1, data1)
prem2<-predict(regm2, data1)
library(caret)
library(Metrics)
RMSE(prem1, data1$emission, na.rm = TRUE) #na.rm skips the rows marked NA as outliers above
RMSE(prem2, data1$emission, na.rm = TRUE)
#the coefficients and R2 values are similar, so either model can be used for predicting "emission"
#loading the required library and importing the "IT salary survey EU" data set
library(readxl)
library(descr)
data1<-read.csv("C:/Users/hp/Downloads/IT Salary Survey EU 2020...csv", stringsAsFactors = TRUE)
sum(is.na(data1))
str(data1)
#outlier rows identified from the diagnostic plots of "reg1" (outlier section at the bottom);
#multiple runs of the code surfaced the following outliers.
data1[c(631,659,805,442,854,855,805,565,575,779,153,839,728,1224,495,574,967,959,674),]<-NA
#the unwanted columns which have no bearing on the given problem are removed
data1<-data1[-23]
data1<-data1[-22]
data1<-data1[-21]
data1<-data1[-1]
View(data1)
#the sum of null values is found and the rows are removed to get a clean dependent variable,
#yearly salary
sum(is.na(data1))
data1<-na.omit(data1)
summary(data1)
#mice shows the missing-data pattern; unique() below lists the distinct values in a column
library(mice)
md.pattern(data1)
unique(data1$Position)
unique(data1$Seniority.level)
fix(data1)
summary(data1)
#the data set is split
library(caTools)
set.seed(100)
split1<-sample.split(data1$City,SplitRatio =0.7)
summary(split1)
#The data set is trained and tested in the ratio of 70:30
datatrain<-subset(data1, split1==TRUE)
datatest<-subset(data1, split1==FALSE)
summary(datatrain)
str(datatrain)
#library is loaded
library(ggplot2)
ggplot(datatrain, aes(Yearly.brutto.salary..without.bonus.and.stocks..in.EUR,
                      Total.years.of.experience))+geom_smooth()
#The graph of yearly salary against years of experience of IT professionals
cor(datatrain$Yearly.brutto.salary..without.bonus.and.stocks..in.EUR,
    datatrain$Total.years.of.experience)
#There exists a moderate positive correlation (r = 0.43)
cor(datatrain$Yearly.brutto.salary..without.bonus.and.stocks..in.EUR,
    datatrain$Years.of.experience.in.Germany)
#There is almost no correlation between the salary and experience in Germany
cor(datatrain$Yearly.brutto.salary..without.bonus.and.stocks..in.EUR,
    datatrain$Yearly.bonus...stocks.in.EUR)
ggplot(datatrain, aes(Yearly.brutto.salary..without.bonus.and.stocks..in.EUR,
                      Yearly.bonus...stocks.in.EUR))+geom_smooth()
#linear regression
reg1<-lm(Yearly.brutto.salary..without.bonus.and.stocks..in.EUR~
           Total.years.of.experience+Yearly.bonus...stocks.in.EUR+
           Annual.brutto.salary..without.bonus.and.stocks..one.year.ago..Only.answer.if.staying.in.the.same.country+
           Years.of.experience.in.Germany+
           Annual.bonus.stocks.one.year.ago..Only.answer.if.staying.in.same.country+
           Number.of.vacation.days+Age, data = datatrain)
summary(reg1)
#Outliers are marked as NA and omitted in the section above
plot(reg1)
#Based on the above results the new model is developed here
reg1<-lm(Yearly.brutto.salary..without.bonus.and.stocks..in.EUR~
           Total.years.of.experience+
           Annual.brutto.salary..without.bonus.and.stocks..one.year.ago..Only.answer.if.staying.in.the.same.country+
           Yearly.bonus...stocks.in.EUR+
           Annual.bonus.stocks.one.year.ago..Only.answer.if.staying.in.same.country, data = datatrain)
summary(reg1)
#F-statistic: p<0.05, reject the null; the model is a good fit
#R-squared = 0.41, meaning the model explains 41% of the variance in yearly salary
#Outliers were detected through the regression diagnostic plots and have been deleted
plot(reg1)
#variance inflation factor:<4(small model, no multicollinearity)
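#the VIF check itself, mirroring the earlier section (a minimal sketch)
library(car)
vif(reg1)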
#Checking randomness: the error terms should appear randomly scattered
plot(reg1$residuals) #residuals against observation index
#normality of error terms(shapiro test)
shapiro.test(reg1$residuals)
#P<0.05 reject null, not normally distributed
hist(reg1$residuals)#the distribution is close to normal
#LOGISTIC REGRESSION - HR joining-status data
library(readr)
data1 <- read_csv("C:/Users/hp/Downloads/IMB533_HR_Data_No_Missing_Value.csv")
data1$Status<- factor(data1$Status) #read_csv keeps strings as character; caret needs a factor outcome
View(data1)
library(caTools)
library(caret)
library(ISLR)
#removing first two col of unwanted ID
data1<-data1[,-c(1,2)]
#Creating a subset of 70% data
set.seed(100)
split1<-sample.split(data1$Status, SplitRatio=0.7)
data1train<-subset(data1, split1==TRUE)
data1test<-subset(data1, split1==FALSE)
#through the test split we can check the generalisability of the trained model
str(data1train)
#logistic regression
library(caret)
lreg1<-train(Status~ .,
method ="glm",
family = "binomial",
data=data1train)
lreg1
summary(lreg1)
#consider variables with stars (***) and some common sense; the sign of each estimate shows the
#positive or negative effect
#model fit--------
library(blorr)
library(Rcpp)
#AIC lower the value better the model
#analyse p value also
blr_model_fit_stats(lreg1$finalModel)
#McFadden's R2 = 13.5%; the model's explanatory power is modest
#H0: the model is a good fit
#if the p-value is greater than 0.05 we fail to reject H0 and treat the fit as good, but the R2 and
#other fit statistics are not great
blr_test_hosmer_lemeshow(lreg1$finalModel)
#confusion matrix
blr_confusion_matrix(lreg1$finalModel, cutoff = 0.5)
#choose the cut-off value such that the sensitivity is high
gaintable<-blr_gains_table(lreg1$finalModel)
blr_roc_curve(gaintable)
#sensitivity should be high while giving equal weightage to both classes (joining and not joining in this data)
#OR
#the cut-off value is read from the graph; it can be 0.4, 0.5, 0.6, or 0.7
#find a cut-off value in the graph such that both conditions are satisfied
#here sensitivity is 6% and specificity is 98% (see the sketch below for comparing cut-offs)
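#a minimal sketch for comparing cut-offs directly (assumes Status is a two-level factor;
#the second factor level is treated as the positive class, matching glm's convention)
probs<-lreg1$finalModel$fitted.values #fitted probabilities on the training rows
actual<-factor(data1train$Status)
for (cut in c(0.4, 0.5, 0.6, 0.7)) {
  pred<-factor(ifelse(probs > cut, levels(actual)[2], levels(actual)[1]),
               levels = levels(actual))
  cm<-table(actual, pred)
  cat(sprintf("cutoff %.1f: sensitivity %.2f, specificity %.2f\n",
              cut, cm[2, 2]/sum(cm[2, ]), cm[1, 1]/sum(cm[1, ])))
}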
#problem statement: HR STATUS - whether the candidate joins or does not join after the offer is given
lreg1$finalModel
#using the final model only in regression
predict1<-predict(lreg1, data1test)
#confusion matrix
confusionMatrix(predict1, data1test$Status)
#stepwise logistic regression
#If the number of variables is high, use this forward or backward method
library(blorr)
blr_step_aic_forward(lreg1$finalModel, details = TRUE)
#OR
regforward<-blr_step_aic_forward(lreg1$finalModel, details = TRUE)
#AIC: the lower the value, the better the model; this is the AIC of the final model
plot(regforward)
#OR
regbackward<-blr_step_aic_backward(lreg1$finalModel, details = TRUE)
plot(regbackward)
#OR
regboth<-blr_step_aic_both(lreg1$finalModel, details = TRUE)
plot(regboth)
regboth$model
summary(regboth$model)
#creating a model from the variables selected by the stepwise procedure
library(caret)
lreg2<-train(Status~Notice.period+
               Candidate.Source+
               Location+
               LOB+Age+
               Offered.band+
               DOJ.Extended+
               Rex.in.Yrs,
             method= "glm",
             family ="binomial",
             data= data1train)
lreg2
lreg2$finalModel
summary(lreg2$finalModel)
gaintable<-blr_gains_table(lreg2$finalModel)
blr_roc_curve(gaintable)
library(readr)
data1 <- read_csv("C:/Users/hp/Downloads/Sem4-R/sem 4.3/logistic 1.csv")
View(data1)
data1$Hospitalization<- factor(data1$Hospitalization,
levels = c(0,1),
labels = c("No Admission", "Admission"))
library(caret)
library(ISLR)
str(data1)
#the odd column name "ï..Age" is a UTF-8 BOM artifact from the CSV import
lreg1<-train(Hospitalization~ï..Age,
method ="glm",
family = "binomial",
data=data1)
lreg1
lreg1$finalModel
#the results show no significant change between residual deviance and null deviance
#the higher the difference, the greater the effect of the independent variables on Y
#beta0=-16.7 beta1=0.25
lreg1$finalModel$coefficients
#odds ratios: exponentiate the coefficients
exp(-16.719781) #intercept
exp(0.5769003) #slope
lreg1$finalModel$coefficients
coef(lreg1$finalModel)
#probability = odds/(1+odds): 1.78/2.78 ≈ 0.64
1.78/2.78
#probability for age 28
lreg1$finalModel$fitted.values
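#the same probability can be computed from the coefficients (a minimal sketch;
#assumes the single predictor column is named ï..Age as in the model above)
b<-coef(lreg1$finalModel)
plogis(b[1] + b[2] * 28) #P(admission | age = 28); equals odds/(1 + odds)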
#install.packages("blorr") #run once if the package is not installed
library(blorr)
library(Rcpp)
blr_model_fit_stats(lreg1$finalModel)
install.packages("survey")
#why we use t test in regression
library(survey)
regTermTest(lreg1$finalModel, "Age")
#confusion matrix
blr_confusion_matrix(lreg1$finalModel)
#"Predicting the contract duration of an IT employee from the EU region as 'Temporary contract' or
#'Unlimited contract'"
#From the perspective of both employees and employers, it is essential to know the contract
#duration of a particular candidate for a particular position in the organization.
#In this analysis, an attempt is made to solve the problem of getting the right contract based on a
#historical survey of existing employees across different parameters.
#This type of study is part of HR analytics, which can assist HR teams in setting goals, measuring
#success, and optimizing procedures based on desired/expected skill sets and the skill sets offered
#by applicants.
#The company may concentrate on training its personnel based on this analysis, which will help the
#company generate income in the future.
#This prediction is very helpful for aspiring IT candidates to predict their contract duration
#based on their skill sets, current salary income, and other variables.
#It can also be used by the HR department to determine the contract duration to be signed for an
#existing or new position in an organization.
#loading the required library and importing the "IT salary survey EU" data set
library(readxl)
library(descr)
library(caret)
library(ISLR)
data1<-read.csv("C:/Users/hp/Downloads/2027911 MLA Cia1/IT Salary Survey EU 2020...csv",
                stringsAsFactors = TRUE)
sum(is.na(data1))
str(data1)
#The unwanted columns which have no bearing on the given problem are removed
data1<-data1[-23]
data1<-data1[-22]
data1<-data1[-21]
data1<-data1[-10]
data1<-data1[-9]
data1<-data1[-4]
data1<-data1[-1]
View(data1)
#the sum of null values is found and blank/NA rows are removed to get a clean dependent
#variable, contract duration
sum(is.na(data1))
data1[data1==""] <- NA
data1<-na.omit(data1)
summary(data1)
View(data1)
#The data set is split and a subset of 70% of the data is created
#The split is stratified on the contract duration column; the model is trained on one part and
#the trained model is tested on the other.
library(caTools)
set.seed(100)
split1<-sample.split(data1$Contract.duration,SplitRatio =0.7)
summary(split1)
#The data set is trained and tested in the ratio of 70:30
#through train we can check the generalisability of the model
datatrain<-subset(data1, split1==TRUE)
datatest<-subset(data1, split1==FALSE)
summary(datatrain)
str(datatrain)
#logistic regression
library(caret)
lreg1<-train(Contract.duration~.,
method ="glm",
family = "binomial",
data=datatrain)
lreg1
#The model has an accuracy of 84% and an error rate of 16%
summary(lreg1)
#Consider variables with p-values less than 0.05.
#Variables such as Position, seniority level, yearly salary, and employment status have a
#positive impact on the dependent variable, Contract duration
#The estimates in the table show the positive or negative effect of each independent variable on Y
#model fit--------
library(blorr)
library(Rcpp)
#AIC: the lower the value, the better the model
#analyse p value also
blr_model_fit_stats(lreg1$finalModel)
#McFadden's R2 = 13.5%; the model's explanatory power is modest
#H0: the model is a good fit
#the model is a good fit as the p-value is greater than 0.05, so we fail to reject the null hypothesis
blr_test_hosmer_lemeshow(lreg1$finalModel)
#Hence, the model is a good fit
#choose the cut-off value such that the sensitivity is high
gaintable<-blr_gains_table(lreg1$finalModel)
blr_roc_curve(gaintable)
#The cut off is chosen as 0.5 based on the graph plotted
#confusion matrix
blr_confusion_matrix(lreg1$finalModel, cutoff = 0.5)
#sensitivity should be high while giving equal weightage to both classes (temporary contract and
#unlimited contract in this data)
#The model of contract duration shows an accuracy of 97%; the independent variables are highly
#accurate in determining the dependent variable, contract duration
#the cut-off value from the graph is taken as the midpoint, i.e. 0.5
#a cut-off value was found in the graph such that both conditions are satisfied
#sensitivity is 99%: the proportion of actual positive cases the model predicted correctly
#specificity is 54%: the proportion of actual negative cases the model predicted correctly
#only the final model is used for prediction
lreg1$finalModel
predict1<-predict(lreg1, datatest)
#confusion matrix
confusionMatrix(predict1, datatest$Contract.duration)
#On the test set the contract-duration model shows an accuracy of 86%; the independent variables
#are accurate in determining the dependent variable, contract duration
#sensitivity is 30%: the proportion of actual positive cases the model predicted correctly
#specificity is 89%: the proportion of actual negative cases the model predicted correctly
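#class probabilities (rather than hard labels) let us try other cut-offs on the test set;
#a minimal sketch using caret's type = "prob"
prob1<-predict(lreg1, datatest, type = "prob")
head(prob1) #one column of probabilities per contract-duration class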
#logistic regression 2
library(caret)
lreg2<-train(Contract.duration~ Age+Gender+Position+Total.years.of.experience+
               Seniority.level+Yearly.bonus...stocks.in.EUR+
               Annual.brutto.salary..without.bonus.and.stocks..one.year.ago..Only.answer.if.staying.in.the.same.country+
               Number.of.vacation.days+Main.language.at.work+Employment.status,
             method ="glm",family = "binomial",data=datatrain)
lreg2
#The model has an accuracy of 83% and an error rate of 17%
summary(lreg2)
#model fit test for second model--------
library(blorr)
library(Rcpp)
#H0: the model is a good fit
#the model is a good fit as the p-value is greater than 0.05, so we fail to reject the null hypothesis
blr_test_hosmer_lemeshow(lreg2$finalModel)
#Hence, the model is a good fit
#choose the cut-off value such that the sensitivity is high
gaintable<-blr_gains_table(lreg2$finalModel)
blr_roc_curve(gaintable)
#The cut off is chosen as 0.5 based on the graph plotted
#confusion matrix
blr_confusion_matrix(lreg2$finalModel, cutoff = 0.5)
#sensitivity should be high while giving equal weightage to both classes (temporary contract and
#unlimited contract in this data)
#The model of contract duration shows an accuracy of 97%; the independent variables are highly
#accurate in determining the dependent variable, contract duration
#the cut-off value from the graph is taken as the midpoint, i.e. 0.5
#a cut-off value was found in the graph such that both conditions are satisfied
#sensitivity is 99%: the proportion of actual positive cases the model predicted correctly
#specificity is 54%: the proportion of actual negative cases the model predicted correctly
#Model 1 and model 2 have similar sensitivity and specificity, so either one could be chosen.
#Hence, stepwise logistic regression is used
#stepwise logistic regression
#If the number of variables is high, use the forward or backward method
library(blorr)
regforward<-blr_step_aic_forward(lreg1$finalModel, details = TRUE)
#AIC values
plot(regforward)
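#the selected model can be inspected directly, mirroring regboth$model in the earlier section
summary(regforward$model)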