Background

Data for this project come from six male participants who wore accelerometers on the belt, forearm, arm, and dumbbell. The participants performed barbell lifts correctly and incorrectly in five different ways. The goal of this exercise is to predict, from the accelerometer measurements, in which of the five ways an activity was performed (i.e. the classe variable).

To do so, I will use several classification algorithms to predict the classe variable from all remaining variables in the dataset: (a) decision tree, (b) random forest, (c) boosting, and (d) linear discriminant analysis. I will first partition the provided training data into two parts, a training set and a validation set. I will fit each model on the training set, and use the resulting model fit to predict classe values in the validation set. By comparing the accuracy of each method on the validation set, I can estimate which method is likely to perform best on the test set. After choosing the most accurate algorithm, I will use its model fit to predict classe values for the provided testing data. The overall hold-out evaluation pattern is sketched immediately below; the project's own data objects are built in the sections that follow.
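
As a minimal, self-contained illustration of that pattern (a sketch only, using the built-in iris data rather than this project's data, and assuming caret and rpart are installed):

library(caret)

# hold-out evaluation: fit on a 70% split, score accuracy on the remainder
data(iris)
idx     <- createDataPartition(iris$Species, p = 0.7, list = FALSE)
fitPart <- iris[idx, ]
holdout <- iris[-idx, ]
fit     <- train(Species ~ ., method = "rpart", data = fitPart)
preds   <- predict(fit, newdata = holdout)
confusionMatrix(preds, holdout$Species)$overall["Accuracy"]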

# clear workspace
rm(list=ls())

# set knitr options
knitr::opts_chunk$set(warning = FALSE)
knitr::opts_chunk$set(message = FALSE)
knitr::opts_chunk$set(echo = TRUE)

# load packages
library(rprojroot)
library(data.table)
library(tidyverse)
library(caret)
library(rattle)
library(randomForest)
library(kableExtra)

# source files within this project
path <- function(x) find_root_file(x, criterion = has_file('practical-machine-learning-course.Rproj'))

# set seed
set.seed(8054)

Prepare the data

Load data

# load in the raw data, treating empty strings and Excel division
# errors ('#DIV/0!') as NA, and convert classe to a factor
training_raw <- as.data.frame(fread(path('pml-training.csv'), na.strings = c('NA', '', '#DIV/0!'))) %>%
  mutate(classe = as.factor(classe))
testing <- as.data.frame(fread(path('pml-testing.csv'), na.strings = c('NA', '', '#DIV/0!')))

Partition into training and validation sets

inTrain <- createDataPartition(training_raw$classe, p = 0.7, list = FALSE)
training <- training_raw[inTrain,]
validation <- training_raw[-inTrain,]

Remove irrelevant variables and variables with >= 50% NAs

# remove the row-number variable `V1` from all sets
# (classe was already converted to a factor at load time)
training <- training %>%
  select(-V1)
validation <- validation %>%
  select(-V1)
testing <- testing %>%
  select(-V1)

# store problem numbers in testing data as a vector (will use later)
testingProblemIDs <- testing$problem_id

# then remove the `problem_id` variable
testing <- testing %>%
  select(-problem_id)

# remove variables with a majority (>= 50%) of NA values, judged on the training set
numRows <- nrow(training)
include <- NULL
for (ii in seq_len(ncol(training))) {
  propNAs <- sum(is.na(training[, ii])) / numRows
  if (propNAs < 0.5) {
    include <- c(include, ii)
  }
}

trainingSet <- training[,include]
validationSet <- validation[,include]
indicesForTest <- include[-length(include)] # drop the last kept column (classe), which the testing data lacks
testingSet <- testing[,indicesForTest]
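
As a quick sanity check (a small assertion using the objects just created; not part of the original analysis), the testing columns should now line up exactly with the training predictors:

# the testing set should contain exactly the training predictors, minus classe
stopifnot(identical(names(testingSet), setdiff(names(trainingSet), 'classe')))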

Remove variables with near-zero-variance

nzv_training <- nearZeroVar(trainingSet, saveMetrics = FALSE)
trainingSet <- trainingSet[,-nzv_training]
nzv_validation <- nearZeroVar(validationSet, saveMetrics = FALSE)
validationSet <- validationSet[,-nzv_validation]
nzv_testing <- nearZeroVar(testingSet, saveMetrics = FALSE)
testingSet <- testingSet[,-nzv_testing]

# check that column names are still consistent across datasets
all(names(trainingSet) == names(validationSet))
## [1] TRUE
all(names(trainingSet)[1:ncol(testingSet)] == names(testingSet))
## [1] TRUE
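
Because nearZeroVar is computed separately on each set above, the filtered column sets could in principle diverge; the checks above guard against that. A pattern that guarantees identical columns by construction (a sketch of an alternative, not the approach used here) is to compute the filter on the training set only and apply it everywhere:

# compute the near-zero-variance filter on the training set only, then
# drop the same columns from every set; classe (the last training column)
# is never near-zero-variance here, so the same indices apply to the
# testing set as well
nzv <- nearZeroVar(trainingSet)
if (length(nzv) > 0) {
  trainingSet   <- trainingSet[, -nzv]
  validationSet <- validationSet[, -nzv]
  testingSet    <- testingSet[, -nzv]
}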

Convert variable types

# convert remaining character columns to factors, since randomForest
# cannot handle character predictors directly
trainingSet <- trainingSet %>% mutate_if(is.character, as.factor)
validationSet <- validationSet %>% mutate_if(is.character, as.factor)
testingSet <- testingSet %>% mutate_if(is.character, as.factor)
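
To verify the conversion, the remaining column classes can be tabulated (a quick check, not part of the modeling):

# count columns of each class; no character columns should remain
table(vapply(trainingSet, function(x) class(x)[1], character(1)))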

Models

Option 1: Decision tree

# fit the model 
mod_rpart <- train(classe ~ ., method = "rpart", data = trainingSet)

# show the decision tree
fancyRpartPlot(mod_rpart$finalModel)

# predict validation set values based on model fit
predict_rpart <- predict(mod_rpart, newdata = validationSet)
 
# compute accuracy of model in predicting validation set classe
confusionMatrix(predict_rpart, validationSet$classe)$overall[1]
##  Accuracy 
## 0.5542906

Option 2: Random forest

# fit the model 
mod_rf <- randomForest(classe ~ ., data = trainingSet)

# predict validation set values based on model fit
predict_rf <- predict(mod_rf, newdata = validationSet)
 
# compute accuracy of model in predicting validation set classe
confusionMatrix(predict_rf, validationSet$classe)$overall[1]
##  Accuracy 
## 0.9991504

Option 3: Boosting

# fit the model; cross-validated training control replaces caret's default
# bootstrap resampling, which was too computationally intensive for my machine
mod_gbm <- train(classe ~ ., method = "gbm", data = trainingSet, verbose = FALSE,
                 trControl = trainControl(method = "cv", number = 8))

# predict validation set values based on model fit
predict_gbm <- predict(mod_gbm, newdata = validationSet)
 
# compute accuracy of model in predicting validation set classe
confusionMatrix(predict_gbm, validationSet$classe)$overall[1]
##  Accuracy 
## 0.9971113

Option 4: Linear discriminant analysis

# fit the model 
mod_lda <- train(classe ~ ., method = "lda", data = trainingSet)

# predict validation set values based on model fit
predict_lda <- predict(mod_lda, newdata = validationSet)
 
# compute accuracy of model in predicting validation set classe
confusionMatrix(predict_lda, validationSet$classe)$overall[1]
## Accuracy 
## 0.864401
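
For convenience, the four validation accuracies can also be collected into a single comparison (a small sketch using only objects defined above):

# assemble the validation accuracy of each fitted model into one table
acc <- function(pred) confusionMatrix(pred, validationSet$classe)$overall['Accuracy']
results <- data.frame(
  method   = c('decision tree', 'random forest', 'boosting', 'linear discriminant'),
  accuracy = c(acc(predict_rpart), acc(predict_rf), acc(predict_gbm), acc(predict_lda))
)
results[order(-results$accuracy), ]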

Conclusions

Most accurate prediction method

Of the four methods, random forest yielded the highest accuracy in predicting validation set classe values. The boosting method was only marginally less accurate, whereas the linear discriminant and decision tree methods lagged well behind. The expected out-of-sample error (as a percentage) for the random forest method is calculated below:

accuracy_rf <- confusionMatrix(predict_rf, validationSet$classe)$overall[1]
expected_error <- unname(100 - accuracy_rf*100) # unname() drops the inherited 'Accuracy' label, which would mislabel the error
expected_error
## [1] 0.08496177

Most important predictor variables according to the random forest method

According to the random forest fit, the following 5 variables were most useful for prediction:

# rank predictors by importance in decreasing order; note that order()
# is ascending by default, which would instead return the least
# important variables
imp <- varImp(mod_rf)
rownames(imp)[order(imp$Overall, decreasing = TRUE)[1:5]]
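
randomForest's built-in importance plot offers a visual check on this ranking (a quick sketch using the fitted model above):

# plot the top ten predictors by mean decrease in Gini impurity
varImpPlot(mod_rf, n.var = 10, main = 'Random forest variable importance')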

Predictions of test set values

Using this method, I obtain the following predictions from the original testing data:

# align testing set factor levels with the training set; rebuilding the
# factors against the training levels is safer than assigning with
# levels<-, which relabels values positionally
testingSet$user_name <- factor(testingSet$user_name,
                               levels = levels(trainingSet$user_name))
testingSet$cvtd_timestamp <- factor(testingSet$cvtd_timestamp,
                                    levels = levels(trainingSet$cvtd_timestamp))

predictedVals <- predict(mod_rf, newdata = testingSet)
kable(data.frame(testingProblemIDs, predictedVals), format = 'html') %>%
  kable_styling('striped', full_width = FALSE)
testingProblemIDs   predictedVals
                1   B
                2   A
                3   B
                4   A
                5   A
                6   E
                7   D
                8   B
                9   A
               10   A
               11   B
               12   C
               13   B
               14   A
               15   E
               16   E
               17   A
               18   B
               19   B
               20   B