Guideline regarding code writing style:

Different persons like to write/organize codes in different ways. I generally followed the following points.

 for(i in 1:10){
    a = i
    b = i + 1
 }

I wrote

 for(i in 1:10)
 {
    a = i
    b = i + 1
 }


We are going to import an example SPLC data in R as a data.table object, fit random forest to the data and perform predictions at given time-points (using R package randomForestSRC). Finally, the predictive performance will be evaluated through different measures of discrimination and calibration (using R packages riskRegression, pec, prodlim).

The data file is in data directory, the source files are in source directory, the analysis result files are saved as R workspace within Results directory, and plots and tables are saved within tables_and_figures directory.


[1] Set directories and load all libraries

 # empty workspace
 rm(list=ls())  
 
 # load library
 library(data.table) # for data input/output
 library(mice) # for imputation using mice() function
 library(randomForestSRC) # for random forest analysis using rfsrc() function
 library(kableExtra) # for printing nice looking tables using functions like column_spec()
 library(ggplot2) # for making nice looking compact plots using data.table object
 library(riskRegression) # for fitting Fine and Gray model using FGR()
 library(pec) # for predicting cumulative incidence from fitted CR object using predictEventProb()
 library(prodlim) 
 
 # set directories
 workdir = "/Users/nsanyal/Dropbox/Statistics/Research_Nil/Project-10-CompetingRisk/Introductory_manuals/RF_for_CR/"


[2] Import an example SPLC dataset

 # import data
 D = fread( paste0( workdir, "data/example_SPLC_data.csv" ) )
 dim(D)
## [1] 1000   20


[3] Select variables for analysis

 # exclude never smokers (1=never, 2=former, 3=current)
 D = D[ !D$smkstatus2 == 1,]
 dim(D)
 # [1] 873  20

 
 # Choose variables to include and subset data
 vars.include = c( "Time", "event", "age.ix.cat", "bmi", "chemo_ix", "cigday2", "copd", "edu", "fh", "hist2.ix", "packyears2", "ph", "quityears2", "race.cat", "radiation_ix", "sex", "smkstatus2", "stage2.ix", "surgery_ix", "USPSTF" )   
 data = D[,..vars.include]
 data$age.ix.cat = unname( sapply( data$age.ix.cat, function(x) switch(x, "71-80" = "71.80", "80+" = "80", "61-70" = "61.70", "<55" = "55", "56-60" = "56.60" )  ) )   # replace characters -,< and + by . 


 # Set missing code, if any, to NA. Then, convert categorical variables to factor.
 data$age.ix.cat = relevel( as.factor(data$age.ix.cat), ref='61.70' )   

 data$chemo_ix = ifelse( data$chemo_ix==9, NA, data$chemo_ix )
 data$chemo_ix = relevel( as.factor(data$chemo_ix), ref=1 ) 

 data$copd = ifelse( data$copd==9, NA, data$copd )
 data$copd = as.factor( as.character(data$copd) ) 

 data$edu = relevel( as.factor( as.character(data$edu) ), ref="2") # 2 = college group is reference 

 data$fh = as.factor( as.character(data$fh) ) 

 data$hist2.ix = relevel( as.factor(data$hist2.ix), ref="SQ" ) 

 data$ph = as.factor( as.character(data$ph) ) 

 data$race.cat = relevel( as.factor(data$race.cat), ref="W" ) 

 data$radiation_ix = ifelse( data$radiation_ix==9, NA, data$radiation_ix )
 data$radiation_ix = relevel( as.factor( as.character(data$radiation_ix) ), ref="0") # 0 = No radiotherapy 

 data$sex = relevel( as.factor( as.character(data$sex) ), ref="0") # 0 = Female
 data$smkstatus2 = relevel( as.factor( as.character(data$smkstatus2) ), ref="2") # 2 = Former 

 data$stage2.ix = ifelse( data$stage2.ix==9, NA, data$stage2.ix )
 data$stage2.ix = relevel( as.factor( as.character(data$stage2.ix) ), ref="0") # 0 = No stage 2 primary LC 

 data$surgery_ix = ifelse( data$surgery_ix==9, NA, data$surgery_ix )
 data$surgery_ix = relevel( as.factor(data$surgery_ix), ref=1 ) 

 data$USPSTF = relevel( as.factor( as.character(data$USPSTF) ), ref="0") # 0 = No stage 2 primary LC


[4] Impute data and save in file

 # set a seed
 seed = c(12) 

 # perform single imputation
 mi = mice( data = data, m = 1, maxit = 5, print = T, seed = seed ) 

 # extract imputed dataset
 data.imp = complete(mi,1)

 # # save imputed data
 # save( list = c("mi","data.imp"), file = paste0("Results/data_imp_seed",seed,".Rdata") )


[5] Fit RF model

 # load imputed data
 load(paste0(workdir,"Results/data_imp_seed12.Rdata"))

 # fit random forest model
 fit.rf = rfsrc( formula = Surv(Time, event) ~ ., cause = 1, data = data.imp, ntime = c(5,10,15), 
              ntree = 100, splitrule = "logrankCR", nodesize = 25, nsplit = 5, mtry = 3,
              importance = "random", block.size = 1, membership = T, var.used = "by.tree", 
              proximity = "oob", split.depth = "by.tree", seed = NULL, do.trace = 5 )
 # # there are many other arguments in rfsrc which are shown in the more detailed version below.
 # fit.rf = rfsrc( 
 #         # GENERAL
 #         formula = Surv(Time, event) ~ .,
 #          data = data.imp,
 #          cause = 1, #event-specific analysis: any number between 1 and J. If not specified, the 
 #                     #default is to use a composite splitting rule that averages over all event 
 #                     #types. Can also be a vector of non-negative weights of length J specifying 
 #                     #weights for each event
 #          ntime = c(5,10,15),
 #          seed = NULL,
 #          do.trace = 5, #no. of sec between updates to the user on appx. time to completion. 


 #          # MISSINGNESS
 #          na.action = "na.impute", #action taken for NAs: "na.omit", "na.impute"
 #          nimpute = 1, #number of iterations of the missing data algorithm. Performance measures 
 #                       #such as OOB error rates tend to become optimistic if nimpute is > 1. 


 #          # BOOTSTRAPPING
 #          bootstrap = "by.root", #Bootstrap protocol: "by.root", "by.node", "none", "by.user"
 #          samptype = "swr", #for bootstrapping "by.root": "swor" (subsampling), "swr"
 #          sampsize = n, #specifying size of bootstrap data when by.root is true. 
 #                                      #For swor .632*n, else n. Can also specify using a number.
 #          samp = NULL, #for bootstrap="by.user": n x ntree dim. array specifying how many times
 #                       #each record appears inbag in the bootstrap for each tree.
 #          case.wt = NULL, #non-negative wts (need not sum to 1) for sampling cases. Obsns with 
 #                          #larger wts will be selected with higher prob. in the bootstrap samples. 


 #          # FOREST/TREE PARAMETERS
 #          ntree = 100, #no. of tree
 #          mtry = 9, #no. of variables randomly selected as candidates for splitting a node
 #          nsplit = 5, #number of random splits to consider for each candidate splitting variable 
 #          splitrule = "logrankCR", #splitting rule: logrankCR (Gray's test), logrank (Logrank test)
 #          nodesize = 25, #forest average number of unique cases (data points) in a terminal node.
 #          nodedepth = NULL, #maximum depth to which a tree should be grown
 #          xvar.wt = NULL, #non-negative wts (need not sum to 1) representing the probability of
 #                          #selecting a variable for splitting. Default is to use uniform weights.
 #          split.wt = NULL, #non-negative wts used for multiplying the split statistic for a variable.
 #                           #A large value encourages the node to split on a specific variable.
 #                           #Default is to use uniform weights.
 

 #          # ESTIMATION / PREDICTION
 #          ensemble = "oob", #specifies the type of ensemble: "all", "oob", "inbag" 


 #          # VARIABLE SELECTION
 #          importance = "random", #method for computing variable importance: FALSE, TRUE/"permute", "none", "random", "anti"
 #          block.size = 10, #VIMP is calculated in "blocks" of size equal to block.size. 


 #          # OUTPUT CONTROL
 #          membership = T, #should terminal node membership and inbag information be returned?
 #          proximity = "oob", #proximity of cases as measured by the frequency of sharing the same 
 #                         #terminal node. An nxn matrix: FALSE, TRUE/"inbag", "oob", "all"
 #          distance = T,
 #          forest.wt = "oob", #Should the forest weight matrix be calculated? An nxn matrix which can be  
 #                         #used for prediction and constructing customized estimators: FALSE, 
 #                         #TRUE/"inbag", "oob", "all"
 #          var.used = "by.tree", #return variables used for splitting?: FALSE, "all.trees" (returns a 
 #                                #vector of the no. of times a split occurred on a variable), "by.tree"
 #                                #(returns a matrix recording the no. of times a split occurred on a 
 #                                #variable in a specific tree.)
 #          split.depth = "by.tree", #records the minimal depth for each variable: FALSE, "all.trees" 
 #                                   #(returns a matrix of the avg. minimal depth for a variable (columns) 
 #                                   #for a specific case (rows)), "by.tree" (returns a three-dimensional 
 #                                   #array recording minimal depth for a specific case for a variable 
 #                                   #for a specific tree)
 #          statistics = T #Should split statistics be returned? 

 #        )



 # save fit
 # save( list = c("fit.rf"), file = paste0(workdir,"Results/RF_fit_imp_seed12.Rdata") ) )


[6] Plot estimated cumulative incidence and hazard functions

 # plot fitted rfsrc object
 plot.competing.risk(fit.rf)


[7] Prediction using random forest

The main function rfsrc() already outputs estimated cumulative incidence and hazard functions, predicted values (cause-specific mortality) and prediction error for the training data. For a test data/ new data, all these measures can be computed using the fitted rfsrc object in predict.rfsrc() function.

 # predict using fitted rfsrc object
 pred.rf = predict( objct = fit.rf, newdata = data.imp, ...)  # other arguments are very similar to rfsrc()


[8] Variable Selection using random forest

Although, the main function rfsrc() has the option to output variable importance and minimal depth, these measures can be separately computed using the fitted rfsrc object in var.select() function, which provides variable selection based on these features.

 # perform variable selection based on minimal depth
 b = var.select( object = fit.rf, cause = 1, method = "md", ntree = 100, mtry = 2, verbose = F ) 

 #NOTE: This function implements random forest variable selection using tree minimal depth methodology 
 #(Ishwaran et al., 2010). The option method allows for two different approaches: 
 #1. method="md" (Invokes minimal depth variable selection) (preferred method), 
 #2. method="vh" or method="vh.vimp" (Invokes variable hunting). 

 # print variable importance and minimal depth values
 print(b$varselect)
##              depth  vimp.event.1  vimp.event.2  vimp.event.3
## surgery_ix    3.14 -0.0014069185  5.401909e-02  0.0155304912
## cigday2       3.75 -0.0010246583  5.633344e-03 -0.0051239774
## quityears2    3.79 -0.0083349157  5.845281e-03  0.0051114213
## hist2.ix      3.91 -0.0095772337  2.902567e-03  0.0039909412
## packyears2    4.02  0.0034109040  5.617082e-03  0.0044614748
## stage2.ix     4.02  0.0144114898  3.856763e-02  0.0027109170
## age.ix.cat    4.04 -0.0152396125  8.138480e-04  0.0011767681
## bmi           4.33 -0.0153980497  5.807440e-03  0.0031127804
## edu           4.42  0.0023948477  2.987714e-03  0.0082483622
## race.cat      4.48 -0.0014291058  1.081907e-03 -0.0014747634
## chemo_ix      4.80  0.0133317010  1.825293e-03  0.0069461151
## sex           4.97 -0.0026098403  1.563046e-03 -0.0031695856
## radiation_ix  5.12 -0.0002113046 -4.803669e-04  0.0057221485
## smkstatus2    5.23 -0.0065650592  8.879603e-04  0.0009351247
## USPSTF        5.25 -0.0024776742 -4.734015e-05  0.0046076795
## copd          5.43 -0.0042265518  1.158509e-03  0.0021820815
## ph            5.64 -0.0023605097  2.634606e-03 -0.0051433376
## fh            5.67 -0.0029547581  1.359144e-03  0.0095575434
 print(b$topvars)
## [1] "surgery_ix" "cigday2"    "quityears2" "hist2.ix"   "packyears2" "stage2.ix"  "age.ix.cat"
 md = b$varselect$depth
 names(md) = rownames(b$varselect)

 # plot variable importance and minimal depth values

 par(mfrow = c(1,2), mar=c(4,7,4,1)) 

 barplot( sort(fit.rf$importance[,1]), las = 1, horiz = T, main = "Variable importance (VIMP)", xlab = "VIMP", cex.names = .8, ylim = c(0,30) ) # barplot of VIMP-based importance

 barplot(rev(sort(md)), las = 1, horiz = T, main = "Minimal depth (MD)", xlab = "MD", cex.names = .8, ylim = c(0,30) ) # barplot of VIMP-based importance



[9] References and other resources