Different people like to write and organize code in different ways. I generally followed these conventions.
Spacing: I kept the code spacious. I would write x = c(2,3) rather than x=c(2,3).
Assignment: I used "=" instead of "<-" in all places where they are consequentially the same. [Note: only in particular cases does "<-" behave differently from "=". For example, cor(x=1:3,y=4:6) and cor(x<-1:3,y<-4:6) both give the output 1, but only in the latter case do x and y also get defined in the main environment. As an aside, if you want a variable defined within the body of a function to also be defined in the main environment, use the "<<-" operator, like this: a = function(){ ff <<- 2 }. Here, on running the function a, ff also gets defined in the main environment.]
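A minimal sketch illustrating these points (to be run in a fresh R session):

cor(x = 1:3, y = 4:6)      # returns 1; here x and y are only the argument names of cor()
cor(x <- 1:3, y <- 4:6)    # also returns 1, but now x and y are additionally defined in the main environment
a = function(){ ff <<- 2 }
a()                        # after this call, ff = 2 exists in the main environment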
Loop: For multiline for/if/while loops, I kept the curly brackets ({}) vertically aligned. Thus, instead of writing
for(i in 1:10){
a = i
b = i + 1
}
I wrote
for(i in 1:10)
{ a = i
  b = i + 1
}
Directory structure: the data files are in the data directory, the source files are in the source directory, the analysis result files are saved as R workspaces within the Results directory, and plots and tables are saved within the tables_and_figures directory.
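As a sketch (assuming the project root is the workdir path set below), this layout could be created from R as:

# dir.create( file.path(workdir, "data") )
# dir.create( file.path(workdir, "source") )
# dir.create( file.path(workdir, "Results") )
# dir.create( file.path(workdir, "tables_and_figures") )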
# empty workspace
rm(list=ls())
# load library
library(data.table) # for data input/output
library(mice) # for imputation using mice() function
library(randomForestSRC) # for random forest analysis using rfsrc() function
library(kableExtra) # for printing nice looking tables using functions like column_spec()
library(ggplot2) # for making nice looking compact plots using data.table object
library(riskRegression) # for fitting Fine and Gray model using FGR()
library(pec) # for predicting cumulative incidence from fitted CR object using predictEventProb()
library(prodlim)
# set directories
= "/Users/nsanyal/Dropbox/Statistics/Research_Nil/Project-10-CompetingRisk/Introductory_manuals/RF_for_CR/" workdir
# import data
D = fread( paste0( workdir, "data/example_SPLC_data.csv" ) )
dim(D)
## [1] 1000 20
# exclude never smokers (1=never, 2=former, 3=current)
D = D[ !D$smkstatus2 == 1,]
dim(D)
## [1] 873 20
# Choose variables to include and subset data
= c( "Time", "event", "age.ix.cat", "bmi", "chemo_ix", "cigday2", "copd", "edu", "fh", "hist2.ix", "packyears2", "ph", "quityears2", "race.cat", "radiation_ix", "sex", "smkstatus2", "stage2.ix", "surgery_ix", "USPSTF" )
vars.include = D[,..vars.include]
data $age.ix.cat = unname( sapply( data$age.ix.cat, function(x) switch(x, "71-80" = "71.80", "80+" = "80", "61-70" = "61.70", "<55" = "55", "56-60" = "56.60" ) ) ) # replace characters -,< and + by .
data
# Set missing code, if any, to NA. Then, convert categorical variables to factor.
data$age.ix.cat = relevel( as.factor(data$age.ix.cat), ref='61.70' )

data$chemo_ix = ifelse( data$chemo_ix==9, NA, data$chemo_ix )
data$chemo_ix = relevel( as.factor(data$chemo_ix), ref=1 )

data$copd = ifelse( data$copd==9, NA, data$copd )
data$copd = as.factor( as.character(data$copd) )

data$edu = relevel( as.factor( as.character(data$edu) ), ref="2") # 2 = college group is reference

data$fh = as.factor( as.character(data$fh) )

data$hist2.ix = relevel( as.factor(data$hist2.ix), ref="SQ" )

data$ph = as.factor( as.character(data$ph) )

data$race.cat = relevel( as.factor(data$race.cat), ref="W" )

data$radiation_ix = ifelse( data$radiation_ix==9, NA, data$radiation_ix )
data$radiation_ix = relevel( as.factor( as.character(data$radiation_ix) ), ref="0") # 0 = No radiotherapy

data$sex = relevel( as.factor( as.character(data$sex) ), ref="0") # 0 = Female
data$smkstatus2 = relevel( as.factor( as.character(data$smkstatus2) ), ref="2") # 2 = Former

data$stage2.ix = ifelse( data$stage2.ix==9, NA, data$stage2.ix )
data$stage2.ix = relevel( as.factor( as.character(data$stage2.ix) ), ref="0") # 0 = No stage 2 primary LC

data$surgery_ix = ifelse( data$surgery_ix==9, NA, data$surgery_ix )
data$surgery_ix = relevel( as.factor(data$surgery_ix), ref=1 )

data$USPSTF = relevel( as.factor( as.character(data$USPSTF) ), ref="0")
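# A quick check (a sketch, not part of the original workflow) that the recoding worked as intended:
sapply( data, class )        # the recoded variables should now be factors
summary( data$age.ix.cat )   # the reference level '61.70' should be listed first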
# set a seed
seed = c(12)
# perform single imputation
mi = mice( data = data, m = 1, maxit = 5, print = T, seed = seed )
# extract imputed dataset
data.imp = complete(mi,1)
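# A sanity check (a sketch, not part of the original workflow): the imputed dataset should have no missing values left
colSums( is.na(data.imp) )   # expected to be all zeros after imputation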
# # save imputed data
# save( list = c("mi","data.imp"), file = paste0("Results/data_imp_seed",seed,".Rdata") )
# load imputed data
load(paste0(workdir,"Results/data_imp_seed12.Rdata"))
# fit random forest model
fit.rf = rfsrc( formula = Surv(Time, event) ~ ., cause = 1, data = data.imp, ntime = c(5,10,15),
                ntree = 100, splitrule = "logrankCR", nodesize = 25, nsplit = 5, mtry = 3,
                importance = "random", block.size = 1, membership = T, var.used = "by.tree",
                proximity = "oob", split.depth = "by.tree", seed = NULL, do.trace = 5 )
# # there are many other arguments in rfsrc which are shown in the more detailed version below.
# fit.rf = rfsrc(
# # GENERAL
# formula = Surv(Time, event) ~ .,
# data = data.imp,
# cause = 1, #event-specific analysis: any number between 1 and J. If not specified, the
# #default is to use a composite splitting rule that averages over all event
# #types. Can also be a vector of non-negative weights of length J specifying
# #weights for each event
# ntime = c(5,10,15),
# seed = NULL,
# do.trace = 5, #no. of sec between updates to the user on appx. time to completion.
# # MISSINGNESS
# na.action = "na.impute", #action taken for NAs: "na.omit", "na.impute"
# nimpute = 1, #number of iterations of the missing data algorithm. Performance measures
# #such as OOB error rates tend to become optimistic if nimpute is > 1.
# # BOOTSTRAPPING
# bootstrap = "by.root", #Bootstrap protocol: "by.root", "by.node", "none", "by.user"
# samptype = "swr", #for bootstrapping "by.root": "swor" (subsampling), "swr"
# sampsize = n, #specifying size of bootstrap data when by.root is true.
# #For swor .632*n, else n. Can also specify using a number.
# samp = NULL, #for bootstrap="by.user": n x ntree dim. array specifying how many times
# #each record appears inbag in the bootstrap for each tree.
# case.wt = NULL, #non-negative wts (need not sum to 1) for sampling cases. Obsns with
# #larger wts will be selected with higher prob. in the bootstrap samples.
# # FOREST/TREE PARAMETERS
# ntree = 100, #no. of trees
# mtry = 9, #no. of variables randomly selected as candidates for splitting a node
# nsplit = 5, #number of random splits to consider for each candidate splitting variable
# splitrule = "logrankCR", #splitting rule: logrankCR (Gray's test), logrank (Logrank test)
# nodesize = 25, #forest average number of unique cases (data points) in a terminal node.
# nodedepth = NULL, #maximum depth to which a tree should be grown
# xvar.wt = NULL, #non-negative wts (need not sum to 1) representing the probability of
# #selecting a variable for splitting. Default is to use uniform weights.
# split.wt = NULL, #non-negative wts used for multiplying the split statistic for a variable.
# #A large value encourages the node to split on a specific variable.
# #Default is to use uniform weights.
# # ESTIMATION / PREDICTION
# ensemble = "oob", #specifies the type of ensemble: "all", "oob", "inbag"
# # VARIABLE SELECTION
# importance = "random", #method for computing variable importance: FALSE, TRUE/"permute", "none", "random", "anti"
# block.size = 10, #VIMP is calculated in "blocks" of size equal to block.size.
# # OUTPUT CONTROL
# membership = T, #should terminal node membership and inbag information be returned?
# proximity = "oob", #proximity of cases as measured by the frequency of sharing the same
# #terminal node. An nxn matrix: FALSE, TRUE/"inbag", "oob", "all"
# distance = T,
# forest.wt = "oob", #Should the forest weight matrix be calculated? An nxn matrix which can be
# #used for prediction and constructing customized estimators: FALSE,
# #TRUE/"inbag", "oob", "all"
# var.used = "by.tree", #return variables used for splitting?: FALSE, "all.trees" (returns a
# #vector of the no. of times a split occurred on a variable), "by.tree"
# #(returns a matrix recording the no. of times a split occurred on a
# #variable in a specific tree.)
# split.depth = "by.tree", #records the minimal depth for each variable: FALSE, "all.trees"
# #(returns a matrix of the avg. minimal depth for a variable (columns)
# #for a specific case (rows)), "by.tree" (returns a three-dimensional
# #array recording minimal depth for a specific case for a variable
# #for a specific tree)
# statistics = T #Should split statistics be returned?
# )
# save fit
# save( list = c("fit.rf"), file = paste0(workdir,"Results/RF_fit_imp_seed12.Rdata") ) )
# load fit
# load( paste0(workdir,"Results/RF_fit_imp_seed12.Rdata") )
# print a fit summary
print(fit.rf)
# names of all returned variables
names(fit.rf)
# [A] FOREST/TREE RELATED OUTPUTS
# number of deaths (deaths from all causes)
fit.rf$ndead
# in-bag membership. n x ntree binary (0,1) matrix.
dim(fit.rf$inbag)
fit.rf$inbag[1:5,1:5]
#NOTE: 63.2% of the subjects are included in each tree, as determined by the 'sampsize' input argument of the RF fit.
#This can also be verified as below: apply(fit.rf$inbag, 2, mean)*100
# number of terminal nodes (leaves) for each tree in the forest
fit.rf$leaf.count
# terminal node membership (which terminal node of each tree a sample falls into). n x ntree matrix.
dim(fit.rf$membership)
fit.rf$membership[1:5,1:5]
# how many times a variable was used for splitting within each tree of the forest. ntree x nvar matrix.
dim(fit.rf$var.used)
fit.rf$var.used[1:5,]
# how many times two observations share the same leaf in the forest. n x n proximity matrix (can be very large)
dim(fit.rf$proximity)
fit.rf$proximity[1:5,1:5]
# [B] ESTIMATES OUTPUTS
# in-bag cause-specific cumulative hazard function (CSCHF) for each event. n x ncause x ntimes matrix.
dim(fit.rf$chf)
fit.rf$chf[1,1,]
# out-of-bag cause-specific cumulative hazard function (CSCHF) for each event. n x ncause x ntimes matrix.
dim(fit.rf$chf.oob)
fit.rf$chf.oob[1,1,]
# in-bag cumulative incidence function for each event. n x ncause x ntimes matrix.
dim(fit.rf$cif)
fit.rf$cif[1,1,]
# out-of-bag cumulative incidence function for each event. n x ncause x ntimes matrix.
dim(fit.rf$cif.oob)
fit.rf$cif.oob[1,1,]
# [C] PREDICTION OUTPUTS
# in-bag predicted value: the expected number of life years lost due to the event-specific cause up to the maximum follow-up, i.e., cause-j mortality (Ishwaran et al. 2014). n x ncause matrix.
dim(fit.rf$predicted)
fit.rf$predicted[1:5,]
# out-of-bag predicted value: the expected number of life years lost due to the event-specific cause up to the maximum follow-up, i.e., cause-j mortality (Ishwaran et al. 2014). n x ncause matrix.
dim(fit.rf$predicted.oob)
fit.rf$predicted.oob[1:5,]
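# As a quick visual check (a sketch, not part of the original workflow), the in-bag and out-of-bag
# cause-1 mortality predictions can be compared; the two should be broadly similar:
plot( fit.rf$predicted[,1], fit.rf$predicted.oob[,1],
      xlab = "In-bag cause-1 mortality", ylab = "OOB cause-1 mortality" )
abline( 0, 1 )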
# tree cumulative out-of-bag prediction error rate (= 1 - out-of-bag C-index)
dim(fit.rf$err.rate)
fit.rf$err.rate[1:12,]
fit.rf$err.rate[fit.rf$ntree,] # this is printed in print(fit.rf)
#NOTE: The cumulative OOB error rate is computed every block.size trees, as specified by the block.size
#argument in the RF fit (here block.size = 1, so it is updated after every tree). VIMP is also calculated
#in "blocks" of size equal to block.size.
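# For instance (a sketch; it assumes err.rate has one row per tree and one column per event type),
# the cumulative OOB error rate for event 1 can be plotted against the number of trees:
plot( fit.rf$err.rate[,1], type = "s", xlab = "Number of trees", ylab = "OOB error rate (event 1)" )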
# [D] VARIABLE SELECTION OUTPUTS
# variable importance (VIMP) for each x-variable. nvar x ncause matrix.
dim(fit.rf$importance)
fit.rf$importance
# minimal depth by tree for variable j for case i. n x nvar x ntree array.
dim(fit.rf$split.depth)
fit.rf$split.depth[1,1,]
Note: In the rfsrc() function manual, the description of the argument 'cause' states: "Finally, regardless of how cause is specified, the returned forest object always provides estimates for all event types." Only cause=1 is used to determine the splitting rule, but once the trees are built, prediction is done for all causes. That is why all the measures in the output (CSCHF, CIF, predictions, and prediction errors) are given for all causes.
# plot fitted rfsrc object
plot.competing.risk(fit.rf)
The main function rfsrc() already outputs the estimated cumulative incidence and hazard functions, the predicted values (cause-specific mortality), and the prediction errors for the training data. For test or new data, all these measures can be computed from the fitted rfsrc object using the predict.rfsrc() function.
# predict using fitted rfsrc object
pred.rf = predict( object = fit.rf, newdata = data.imp, ... ) # other arguments are very similar to rfsrc()
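For example, a sketch of a concrete call (it treats the first 10 rows of data.imp as test data; the components shown are among those returned by predict.rfsrc for competing-risk forests):

pred.rf = predict( object = fit.rf, newdata = data.imp[1:10, ] )
dim( pred.rf$cif )     # predicted cumulative incidence for the 10 test cases
pred.rf$predicted      # predicted cause-specific mortality for the 10 test cases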
Although the main function rfsrc() has the option to output variable importance and minimal depth, these measures can also be computed separately from the fitted rfsrc object using the var.select() function, which performs variable selection based on them.
# perform variable selection based on minimal depth
b = var.select( object = fit.rf, cause = 1, method = "md", ntree = 100, mtry = 2, verbose = F )
#NOTE: This function implements random forest variable selection using tree minimal depth methodology
#(Ishwaran et al., 2010). The option method allows for two different approaches:
#1. method="md" (Invokes minimal depth variable selection) (preferred method),
#2. method="vh" or method="vh.vimp" (Invokes variable hunting).
# print variable importance and minimal depth values
print(b$varselect)
## depth vimp.event.1 vimp.event.2 vimp.event.3
## surgery_ix 3.14 -0.0014069185 5.401909e-02 0.0155304912
## cigday2 3.75 -0.0010246583 5.633344e-03 -0.0051239774
## quityears2 3.79 -0.0083349157 5.845281e-03 0.0051114213
## hist2.ix 3.91 -0.0095772337 2.902567e-03 0.0039909412
## packyears2 4.02 0.0034109040 5.617082e-03 0.0044614748
## stage2.ix 4.02 0.0144114898 3.856763e-02 0.0027109170
## age.ix.cat 4.04 -0.0152396125 8.138480e-04 0.0011767681
## bmi 4.33 -0.0153980497 5.807440e-03 0.0031127804
## edu 4.42 0.0023948477 2.987714e-03 0.0082483622
## race.cat 4.48 -0.0014291058 1.081907e-03 -0.0014747634
## chemo_ix 4.80 0.0133317010 1.825293e-03 0.0069461151
## sex 4.97 -0.0026098403 1.563046e-03 -0.0031695856
## radiation_ix 5.12 -0.0002113046 -4.803669e-04 0.0057221485
## smkstatus2 5.23 -0.0065650592 8.879603e-04 0.0009351247
## USPSTF 5.25 -0.0024776742 -4.734015e-05 0.0046076795
## copd 5.43 -0.0042265518 1.158509e-03 0.0021820815
## ph 5.64 -0.0023605097 2.634606e-03 -0.0051433376
## fh 5.67 -0.0029547581 1.359144e-03 0.0095575434
print(b$topvars)
## [1] "surgery_ix" "cigday2" "quityears2" "hist2.ix" "packyears2" "stage2.ix" "age.ix.cat"
md = b$varselect$depth
names(md) = rownames(b$varselect)
# plot variable importance and minimal depth values
par(mfrow = c(1,2), mar=c(4,7,4,1))
barplot( sort(fit.rf$importance[,1]), las = 1, horiz = T, main = "Variable importance (VIMP)", xlab = "VIMP", cex.names = .8, ylim = c(0,30) ) # barplot of VIMP-based importance
barplot( rev(sort(md)), las = 1, horiz = T, main = "Minimal depth (MD)", xlab = "MD", cex.names = .8, ylim = c(0,30) ) # barplot of minimal depth values
For randomForestSRC package related theory, details, and code, see: https://kogalur.github.io/randomForestSRC/index.html