# NOTE(review): setwd() in a script is fragile -- prefer project-relative
# paths so the script runs unchanged on other machines.
setwd("~/R/Data_Science_and_Machine_Learning")
# header = TRUE: first CSV row holds the column names.
data <- read.csv("heart-disease.csv", header = TRUE)
# Drop the first column -- presumably an exported row-index column with no
# information (TODO confirm against the raw file).
data[, 1] <- NULL

# Preparation ----

# Assign descriptive column names (UCI heart-disease attribute order).
colnames(data) <- c("age", "sex", "cp", "trestbps", "chol",
                    "fbs", "restecg", "thalach", "exang", "oldpeak", "slope", "ca", "thal", "hd")
# The raw file encodes missing values as '?'; convert them to real NA.
data[data == '?'] <- NA

# Recode sex from 0/1 to readable labels, then make it a factor.
data[data$sex == 0, 'sex'] <- 'F'
data[data$sex == 1, 'sex'] <- 'M'
data$sex <- as.factor(data$sex)

# Remaining categorical predictors.
data$cp <- as.factor(data$cp)
data$fbs <- as.factor(data$fbs)
data$restecg <- as.factor(data$restecg)
data$exang <- as.factor(data$exang)
data$slope <- as.factor(data$slope)

# BUG FIX: ca and thal contained the '?' marker, so they may be read in as
# factors; as.integer() on a factor returns the internal level CODES, not the
# recorded values (the original str() showed ca levels 2..5 instead of 0..3).
# Converting through character first recovers the true values, and is a no-op
# when the column is already character.
data$ca <- as.integer(as.character(data$ca))
data$ca <- as.factor(data$ca)
data$thal <- as.integer(as.character(data$thal))
data$thal <- as.factor(data$thal)

# Binary outcome: 0 = no heart disease, anything else = disease present.
data$hd <- ifelse(data$hd == 0, "Healthy", "Unhealthy")
data$hd <- as.factor(data$hd)

# Check that every column now has the intended type.
str(data)
## 'data.frame':    303 obs. of  14 variables:
##  $ age     : int  63 67 67 37 41 56 62 57 63 53 ...
##  $ sex     : Factor w/ 2 levels "F","M": 2 2 2 2 1 2 1 1 2 2 ...
##  $ cp      : Factor w/ 4 levels "1","2","3","4": 1 4 4 3 2 2 4 4 4 4 ...
##  $ trestbps: int  145 160 120 130 130 120 140 120 130 140 ...
##  $ chol    : int  233 286 229 250 204 236 268 354 254 203 ...
##  $ fbs     : Factor w/ 2 levels "0","1": 2 1 1 1 1 1 1 1 1 2 ...
##  $ restecg : Factor w/ 3 levels "0","1","2": 3 3 3 1 3 1 3 1 3 3 ...
##  $ thalach : int  150 108 129 187 172 178 160 163 147 155 ...
##  $ exang   : Factor w/ 2 levels "0","1": 1 2 2 1 1 1 1 2 1 2 ...
##  $ oldpeak : num  2.3 1.5 2.6 3.5 1.4 0.8 3.6 0.6 1.4 3.1 ...
##  $ slope   : Factor w/ 3 levels "1","2","3": 3 2 2 3 1 1 3 1 2 3 ...
##  $ ca      : Factor w/ 4 levels "2","3","4","5": 1 4 3 1 1 1 3 1 2 1 ...
##  $ thal    : Factor w/ 3 levels "2","3","4": 2 1 3 1 1 1 1 1 3 3 ...
##  $ hd      : Factor w/ 2 levels "Healthy","Unhealthy": 1 2 2 1 1 1 2 1 2 2 ...
# Demo set for prediction: the first six rows, with the outcome removed so
# the model cannot see the answer.
test <- subset(head(data), select = -hd)
test
##   age sex cp trestbps chol fbs restecg thalach exang oldpeak slope ca thal
## 1  63   M  1      145  233   1       2     150     0     2.3     3  2    3
## 2  67   M  4      160  286   0       2     108     1     1.5     2  5    2
## 3  67   M  4      120  229   0       2     129     1     2.6     2  4    4
## 4  37   M  3      130  250   0       0     187     0     3.5     3  2    2
## 5  41   F  2      130  204   0       2     172     0     1.4     1  2    2
## 6  56   M  2      120  236   0       0     178     0     0.8     1  2    2

# Building a model ----

library(randomForest)

# Impute the remaining NAs (ca, thal) using random-forest proximities.
# BUG FIX: the argument is `iter`, not `inter` -- the misspelled `inter = 6`
# was silently absorbed by `...`, so the original run actually used the
# default iter = 5 (visible as five OOB reports in the output below).
data.imputed <- rfImpute(hd ~ ., data = data, iter = 6)
## ntree      OOB      1      2
##   300:  17.49% 13.41% 22.30%
## ntree      OOB      1      2
##   300:  17.49% 14.02% 21.58%
## ntree      OOB      1      2
##   300:  16.83% 12.80% 21.58%
## ntree      OOB      1      2
##   300:  18.48% 14.63% 23.02%
## ntree      OOB      1      2
##   300:  19.80% 16.46% 23.74%
# Fit the classifier. proximity = TRUE keeps the proximity matrix, which the
# MDS visualisation below needs. (TRUE, not T: T can be reassigned.)
# NOTE(review): results vary between runs -- consider set.seed() beforehand
# for reproducibility.
model <- randomForest(hd ~ ., data = data.imputed, ntree = 1000, proximity = TRUE)
print(model) # accuracy 83.5%
## 
## Call:
##  randomForest(formula = hd ~ ., data = data.imputed, ntree = 1000,      proximity = T) 
##                Type of random forest: classification
##                      Number of trees: 1000
## No. of variables tried at each split: 3
## 
##         OOB estimate of  error rate: 17.16%
## Confusion matrix:
##           Healthy Unhealthy class.error
## Healthy       142        22   0.1341463
## Unhealthy      30       109   0.2158273

# Prediction ----

# Classify the six held-out demo rows and show them next to their features.
prediction.hd <- predict(model, newdata = test)
cbind(test, prediction.hd)
##   age sex cp trestbps chol fbs restecg thalach exang oldpeak slope ca thal
## 1  63   M  1      145  233   1       2     150     0     2.3     3  2    3
## 2  67   M  4      160  286   0       2     108     1     1.5     2  5    2
## 3  67   M  4      120  229   0       2     129     1     2.6     2  4    4
## 4  37   M  3      130  250   0       0     187     0     3.5     3  2    2
## 5  41   F  2      130  204   0       2     172     0     1.4     1  2    2
## 6  56   M  2      120  236   0       0     178     0     0.8     1  2    2
##   prediction.hd
## 1       Healthy
## 2     Unhealthy
## 3     Unhealthy
## 4       Healthy
## 5       Healthy
## 6       Healthy

# Prediction check ----

# Compare against the true labels: the model was trained on these rows, so
# the predictions above should (and do) match the recorded hd column.
head(data, n = 6L)
##   age sex cp trestbps chol fbs restecg thalach exang oldpeak slope ca thal
## 1  63   M  1      145  233   1       2     150     0     2.3     3  2    3
## 2  67   M  4      160  286   0       2     108     1     1.5     2  5    2
## 3  67   M  4      120  229   0       2     129     1     2.6     2  4    4
## 4  37   M  3      130  250   0       0     187     0     3.5     3  2    2
## 5  41   F  2      130  204   0       2     172     0     1.4     1  2    2
## 6  56   M  2      120  236   0       0     178     0     0.8     1  2    2
##          hd
## 1   Healthy
## 2 Unhealthy
## 3 Unhealthy
## 4   Healthy
## 5   Healthy
## 6   Healthy

# Error visualisation ----

# Reshape the per-tree error-rate matrix into long format for ggplot:
# one row per (tree count, error type) pair.
n.trees <- nrow(model$err.rate)
oob.error.data <- data.frame(
  Trees = rep(seq_len(n.trees), times = 3),
  Type = rep(c("OOB", "Healthy", "Unhealthy"), each = n.trees),
  Error = c(model$err.rate[, "OOB"],
            model$err.rate[, "Healthy"],
            model$err.rate[, "Unhealthy"])
)
library(ggplot2)
library(ggthemes)
# One line per error type; error stabilises as trees are added.
ggplot(data = oob.error.data, aes(x = Trees, y = Error)) +
  geom_line(aes(color = Type)) +
  ggtitle("Error of the estimation") +
  theme_bw() +
  theme(plot.title = element_text(colour = "Black", size = 10, hjust = 0.5))

# INTERACTIVE distance visualisation ----

# Tune mtry (variables tried per split): fit one forest per candidate value
# and record its final out-of-bag error rate (column 1 of err.rate).
oob.values <- vapply(seq_len(10), function(m) {
  fit <- randomForest(hd ~ ., data = data.imputed, mtry = m, ntree = 1000)
  fit$err.rate[nrow(fit$err.rate), 1]
}, numeric(1))
oob.values # the third value is optimal but we would not know if we had not tried!
##  [1] 0.1650165 0.1650165 0.1815182 0.1848185 0.1947195 0.1914191 0.1947195
##  [8] 0.2013201 0.2079208 0.1980198
# Convert random-forest proximities (similarity in [0, 1]) to distances and
# project the samples into 2-D with classical multidimensional scaling.
distance.matrix <- dist(1 - model$proximity)
# eig = TRUE keeps the eigenvalues, x.ret = TRUE the doubly-centred matrix.
# (TRUE, not T: T can be reassigned.)
mds.stuff <- cmdscale(distance.matrix, eig = TRUE, x.ret = TRUE)
# Percent of variation captured by each MDS axis.
# NOTE(review): cmdscale eigenvalues can be negative for non-Euclidean
# distances, which would distort these percentages -- consider summing only
# the positive eigenvalues; verify on this data.
mds.var.per <- round(mds.stuff$eig / sum(mds.stuff$eig) * 100, 1)

mds.values <- mds.stuff$points
mds.data <- data.frame(Sample = rownames(mds.values),
                       X = mds.values[, 1],
                       Y = mds.values[, 2],
                       Status = data.imputed$hd)
library(plotly)
# Label each sample by row name, coloured by health status; samples that the
# forest considers similar land close together.
g <- ggplot(data = mds.data, aes(x = X, y = Y, label = Sample)) +
  geom_text(aes(color = Status)) +
  theme_bw() +
  xlab(paste("MDS1 - ", mds.var.per[1], "%", sep = "")) +
  ylab(paste("MDS2 - ", mds.var.per[2], "%", sep = "")) +
  ggtitle("MDS plot using (1 - Random Forest Proximities)") +
  theme(plot.title = element_text(colour = "Black", size = 10, hjust = 0.5))
# Interactive version: hover to identify individual samples.
ggplotly(g)