# Load the heart-disease data set.
# NOTE(review): setwd() inside a script is fragile -- prefer running from the
# project root (or here::here()); kept here to preserve the existing workflow.
setwd("~/R/Data_Science_and_Machine_Learning")
data <- read.csv("heart-disease.csv", header = TRUE)
# Drop the first column -- presumably a row-index column carried over from
# the CSV export; verify against the raw file.
data[, 1] <- NULL
# Preparation ----
# Name the 14 columns following the UCI Cleveland heart-disease dictionary.
colnames(data) <- c("age", "sex", "cp", "trestbps", "chol",
"fbs", "restecg", "thalach", "exang", "oldpeak", "slope", "ca", "thal", "hd")
# '?' is the missing-value marker in the raw file.
data[data == '?'] <- NA
# Recode sex as a labelled factor. ifelse() is NA-safe, unlike the previous
# data[data$sex == 0, 'sex'] <- 'F' subscript assignment, which errors if
# the sex column ever contains NA.
data$sex <- factor(ifelse(data$sex == 0, "F", "M"))
# Categorical predictors stored as numeric codes -> factors.
data$cp <- as.factor(data$cp)
data$fbs <- as.factor(data$fbs)
data$restecg <- as.factor(data$restecg)
data$exang <- as.factor(data$exang)
data$slope <- as.factor(data$slope)
# ca and thal were read as character (they contained '?'); go via integer so
# the factor levels are the numeric codes rather than the raw strings.
data$ca <- as.factor(as.integer(data$ca))
data$thal <- as.factor(as.integer(data$thal))
# Binary outcome: 0 = no disease, anything else = disease present.
data$hd <- as.factor(ifelse(data$hd == 0, "Healthy", "Unhealthy"))
str(data)
## 'data.frame': 303 obs. of 14 variables:
## $ age : int 63 67 67 37 41 56 62 57 63 53 ...
## $ sex : Factor w/ 2 levels "F","M": 2 2 2 2 1 2 1 1 2 2 ...
## $ cp : Factor w/ 4 levels "1","2","3","4": 1 4 4 3 2 2 4 4 4 4 ...
## $ trestbps: int 145 160 120 130 130 120 140 120 130 140 ...
## $ chol : int 233 286 229 250 204 236 268 354 254 203 ...
## $ fbs : Factor w/ 2 levels "0","1": 2 1 1 1 1 1 1 1 1 2 ...
## $ restecg : Factor w/ 3 levels "0","1","2": 3 3 3 1 3 1 3 1 3 3 ...
## $ thalach : int 150 108 129 187 172 178 160 163 147 155 ...
## $ exang : Factor w/ 2 levels "0","1": 1 2 2 1 1 1 1 2 1 2 ...
## $ oldpeak : num 2.3 1.5 2.6 3.5 1.4 0.8 3.6 0.6 1.4 3.1 ...
## $ slope : Factor w/ 3 levels "1","2","3": 3 2 2 3 1 1 3 1 2 3 ...
## $ ca : Factor w/ 4 levels "2","3","4","5": 1 4 3 1 1 1 3 1 2 1 ...
## $ thal : Factor w/ 3 levels "2","3","4": 2 1 3 1 1 1 1 1 3 3 ...
## $ hd : Factor w/ 2 levels "Healthy","Unhealthy": 1 2 2 1 1 1 2 1 2 2 ...
# Hold out the first six rows, without the outcome column, to demo
# prediction on unlabelled data later.
test <- subset(head(data), select = -hd)
print(test)
## age sex cp trestbps chol fbs restecg thalach exang oldpeak slope ca thal
## 1 63 M 1 145 233 1 2 150 0 2.3 3 2 3
## 2 67 M 4 160 286 0 2 108 1 1.5 2 5 2
## 3 67 M 4 120 229 0 2 129 1 2.6 2 4 4
## 4 37 M 3 130 250 0 0 187 0 3.5 3 2 2
## 5 41 F 2 130 204 0 2 172 0 1.4 1 2 2
## 6 56 M 2 120 236 0 0 178 0 0.8 1 2 2
# Building a model ----
library(randomForest)
# Impute the missing ca/thal values via random-forest proximity imputation.
# BUG FIX: the original passed inter = 6, which is not an rfImpute()
# argument (the real name is 'iter'); it was silently swallowed by '...',
# so the default of 5 iterations ran -- exactly as the five iteration
# summaries recorded below show.
data.imputed <- rfImpute(hd ~ ., data = data, iter = 6)
## ntree OOB 1 2
## 300: 17.49% 13.41% 22.30%
## ntree OOB 1 2
## 300: 17.49% 14.02% 21.58%
## ntree OOB 1 2
## 300: 16.83% 12.80% 21.58%
## ntree OOB 1 2
## 300: 18.48% 14.63% 23.02%
## ntree OOB 1 2
## 300: 19.80% 16.46% 23.74%
# proximity = TRUE is required for the MDS plot further down.
# NOTE(review): results are stochastic -- consider set.seed() before
# fitting if reproducible numbers are needed.
model <- randomForest(hd ~ ., data = data.imputed, ntree = 1000, proximity = TRUE)
print(model) # accuracy ~83.5% (1 - OOB error rate)
##
## Call:
## randomForest(formula = hd ~ ., data = data.imputed, ntree = 1000, proximity = T)
## Type of random forest: classification
## Number of trees: 1000
## No. of variables tried at each split: 3
##
## OOB estimate of error rate: 17.16%
## Confusion matrix:
## Healthy Unhealthy class.error
## Healthy 142 22 0.1341463
## Unhealthy 30 109 0.2158273
# Prediction ----
# Predict heart-disease status for the held-out demo rows and show the
# predictions alongside the predictors.
prediction.hd <- predict(model, newdata = test)
print(cbind(test, prediction.hd))
## age sex cp trestbps chol fbs restecg thalach exang oldpeak slope ca thal
## 1 63 M 1 145 233 1 2 150 0 2.3 3 2 3
## 2 67 M 4 160 286 0 2 108 1 1.5 2 5 2
## 3 67 M 4 120 229 0 2 129 1 2.6 2 4 4
## 4 37 M 3 130 250 0 0 187 0 3.5 3 2 2
## 5 41 F 2 130 204 0 2 172 0 1.4 1 2 2
## 6 56 M 2 120 236 0 0 178 0 0.8 1 2 2
## prediction.hd
## 1 Healthy
## 2 Unhealthy
## 3 Unhealthy
## 4 Healthy
## 5 Healthy
## 6 Healthy
# Prediction check ----
# Compare the predictions above with the actual labels of the same six rows.
print(head(data))
## age sex cp trestbps chol fbs restecg thalach exang oldpeak slope ca thal
## 1 63 M 1 145 233 1 2 150 0 2.3 3 2 3
## 2 67 M 4 160 286 0 2 108 1 1.5 2 5 2
## 3 67 M 4 120 229 0 2 129 1 2.6 2 4 4
## 4 37 M 3 130 250 0 0 187 0 3.5 3 2 2
## 5 41 F 2 130 204 0 2 172 0 1.4 1 2 2
## 6 56 M 2 120 236 0 0 178 0 0.8 1 2 2
## hd
## 1 Healthy
## 2 Unhealthy
## 3 Unhealthy
## 4 Healthy
## 5 Healthy
## 6 Healthy
# Error visualisation ----
# Reshape the per-tree error matrix into long format: one row per
# (number of trees, error type) pair, ready for ggplot.
n.trees <- nrow(model$err.rate)
error.types <- c("OOB", "Healthy", "Unhealthy")
oob.error.data <- data.frame(
  Trees = rep(seq_len(n.trees), times = 3),
  Type = rep(error.types, each = n.trees),
  Error = as.vector(model$err.rate[, error.types])
)
library(ggplot2)
library(ggthemes)
# One line per error type; OOB should stabilise as trees are added.
ggplot(oob.error.data, aes(x = Trees, y = Error)) +
  geom_line(aes(color = Type)) +
  ggtitle("Error of the estimation") +
  theme_bw() +
  theme(plot.title = element_text(colour = "Black", size = 10, hjust = 0.5))
# Interactive distance visualisation ----
# Tune mtry: refit the forest for mtry = 1..10 and record the final-row
# OOB error (column 1 of err.rate) for each setting.
oob.values <- vapply(
  1:10,
  function(mtry.try) {
    fit <- randomForest(hd ~ ., data = data.imputed, mtry = mtry.try, ntree = 1000)
    fit$err.rate[nrow(fit$err.rate), 1]
  },
  numeric(1)
)
oob.values # the lowest value marks the best mtry -- we would not know without trying!
## [1] 0.1650165 0.1650165 0.1815182 0.1848185 0.1947195 0.1914191 0.1947195
## [8] 0.2013201 0.2079208 0.1980198
# Classical MDS on (1 - proximity): samples the forest sees as similar
# land close together in 2-D.
distance.matrix <- dist(1 - model$proximity)
mds.stuff <- cmdscale(distance.matrix, eig = TRUE, x.ret = TRUE)
# Percentage of total variation captured by each MDS axis (for axis labels).
mds.var.per <- round(mds.stuff$eig / sum(mds.stuff$eig) * 100, 1)
mds.values <- mds.stuff$points
mds.data <- data.frame(Sample = rownames(mds.values),
                       X = mds.values[, 1],
                       Y = mds.values[, 2],
                       Status = data.imputed$hd)
library(plotly)
g <- ggplot(data = mds.data, aes(x = X, y = Y, label = Sample)) +
  geom_text(aes(color = Status)) +
  theme_bw() +
  xlab(paste0("MDS1 - ", mds.var.per[1], "%")) +
  ylab(paste0("MDS2 - ", mds.var.per[2], "%")) +
  ggtitle("MDS plot using (1 - Random Forest Proximities)") +
  theme(plot.title = element_text(colour = "Black", size = 10, hjust = 0.5))
# Interactive version: hovering shows the sample index.
ggplotly(g)