Different people like to write and organize code in different ways. I generally followed the points below.
More spacing. I wrote x = c(2,3) rather than x=c(2,3).
I used “=” instead of “<-” in all places where they are consequentially the same. [Note: only in particular cases does “<-” behave differently from “=”. For example, cor(x=1:3,y=4:6) and cor(x<-1:3,y<-4:6) both return 1, but only in the latter case do x and y also get defined in the main environment. As an aside, if you want a variable assigned within the body of a function to also be defined in the main environment, use the “<<-” operator, like this: a = function(){ff <<- 2}. Here, on running the function a, ff also gets defined in the main environment.]
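To see the difference concretely, here is a minimal sketch (run in a fresh session; the object names are arbitrary):
rm(list = ls())
cor(x = 1:3, y = 4:6)    # returns 1; "x" and "y" are only argument names here
ls()                     # character(0) - nothing was created in the main environment
cor(x <- 1:3, y <- 4:6)  # also returns 1, but now x and y are created in the main environment
ls()                     # "x" "y"
a = function(){ ff <<- 2 }
a()
ls()                     # "a" "ff" "x" "y" - ff was created by <<- inside the function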
Loops: For a multiline for/while loop or if block, I kept the curly brackets ({}) vertically aligned. Thus, instead of writing
for(i in 1:10){
a = i
b = i + 1
}
I wrote
for(i in 1:10)
{
a = i
b = i + 1
}
The data files are kept within the data directory, and the analysis result files are saved as R workspaces within the Results directory. Below, in [1]-[4] we prepare the data for analysis, and in [5] we fit the CoxBoost model with a user-chosen number of boosting steps and penalty parameter. In [6] and [7], we see how to objectively determine the number of boosting steps and the penalty parameter. In practice, we can run [6] and [7] first to get the objectively determined values and then use them to fit the main CoxBoost model in [5].
# empty workspace
rm(list=ls())
# load library
library(data.table) # for data input/output
library(mice) # for imputation using mice() function
library(CoxBoost) # for CoxBoost analysis using CoxBoost() function
library(kableExtra) # for printing nice looking tables using functions like column_spec()
library(ggplot2) # for making nice looking compact plots using data.table object
library(riskRegression) # for fitting Fine and Gray model using FGR()
library(pec) # for predicting cumulative incidence from fitted CR object using predictEventProb()
library(prodlim) # used alongside riskRegression/pec (e.g., provides the Hist() function)
# set directories
workdir = "/Users/nsanyal/Dropbox/Statistics/Research_Nil/Project-10-CompetingRisk/Introductory_manuals/CB_for_CR/"
# import data
D = fread( paste0( workdir, "data/example_SPLC_data.csv" ) )
dim(D)
## [1] 1000 20
# exclude never smokers (1=never, 2=former, 3=current)
D = D[ !D$smkstatus2 == 1,]
dim(D)
## [1] 873 20
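As a quick optional sanity check, we can confirm that only former and current smokers remain:
table(D$smkstatus2) # should contain only categories 2 (former) and 3 (current)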
# Choose variables to include and subset data
vars.include = c( "Time", "event", "age.ix.cat", "bmi", "chemo_ix", "cigday2", "copd", "edu", "fh", "hist2.ix", "packyears2", "ph", "quityears2", "race.cat", "radiation_ix", "sex", "smkstatus2", "stage2.ix", "surgery_ix", "USPSTF" )
data = D[,..vars.include]
# recode age.ix.cat: replace or drop the characters -, < and + so the category labels are simple
data$age.ix.cat = unname( sapply( data$age.ix.cat, function(x) switch(x, "71-80" = "71.80", "80+" = "80", "61-70" = "61.70", "<55" = "55", "56-60" = "56.60" ) ) )
# Set missing code, if any, to NA. Then, convert categorical variables to factor.
data$age.ix.cat = relevel( as.factor(data$age.ix.cat), ref='61.70' )
data$chemo_ix = ifelse( data$chemo_ix==9, NA, data$chemo_ix )
data$chemo_ix = relevel( as.factor(data$chemo_ix), ref=1 )
data$copd = ifelse( data$copd==9, NA, data$copd )
data$copd = as.factor( as.character(data$copd) )
data$edu = relevel( as.factor( as.character(data$edu) ), ref="2") # 2 = college group is reference
data$fh = as.factor( as.character(data$fh) )
data$hist2.ix = relevel( as.factor(data$hist2.ix), ref="SQ" )
data$ph = as.factor( as.character(data$ph) )
data$race.cat = relevel( as.factor(data$race.cat), ref="W" )
data$radiation_ix = ifelse( data$radiation_ix==9, NA, data$radiation_ix )
data$radiation_ix = relevel( as.factor( as.character(data$radiation_ix) ), ref="0") # 0 = No radiotherapy
data$sex = relevel( as.factor( as.character(data$sex) ), ref="0") # 0 = Female
data$smkstatus2 = relevel( as.factor( as.character(data$smkstatus2) ), ref="2") # 2 = Former
data$stage2.ix = ifelse( data$stage2.ix==9, NA, data$stage2.ix )
data$stage2.ix = relevel( as.factor( as.character(data$stage2.ix) ), ref="0") # 0 = No stage 2 primary LC
data$surgery_ix = ifelse( data$surgery_ix==9, NA, data$surgery_ix )
data$surgery_ix = relevel( as.factor(data$surgery_ix), ref=1 )
data$USPSTF = relevel( as.factor( as.character(data$USPSTF) ), ref="0") # 0 = reference category
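Before imputing, it is worth checking (optionally) that the recoding worked as intended, i.e. that the categorical variables are now factors with the chosen reference levels:
str(data)              # classes and factor levels of all variables
levels(data$hist2.ix)  # the first level printed is the reference, here "SQ"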
# set a seed
seed = 12
# perform single imputation
mi = mice( data = data, m = 1, maxit = 5, printFlag = TRUE, seed = seed )
# extract imputed dataset
data.imp = complete(mi,1)
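As an optional check, we can verify that no missing values remain after imputation:
sum( is.na(data.imp) ) # should be 0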
# # save imputed data
# save( list = c("mi","data.imp"), file = paste0( workdir, "Results/data_imp_seed", seed, ".Rdata" ) )
# load imputed data
load(paste0(workdir,"Results/data_imp_seed12.Rdata"))
# create the covariate data frame, the model formula, and the design matrix
mat = data.imp[, -c(1:2)]
expr = as.formula(paste(" ~ ", paste(colnames(mat), collapse = " + ")))
X = model.matrix(expr, mat)[, -1]
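A quick optional check of the design matrix, including that columns 5 and 25 (used as mandatory covariates below) are indeed bmi and sex:
dim(X)                 # n x p: observations by dummy-coded covariates
colnames(X)[ c(5,25) ] # should correspond to bmi and sex, the mandatory covariates chosen below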
# fit CoxBoost model
fit.cb = CoxBoost(
time = data.imp$Time,
status = data.imp$event,
x = X, # n x p matrix of covariates.
standardize = T, # whether or not covariates should be standardized for
# estimation. Mandatory covariates are not standardized.
unpen.index = c(5,25), # indices of mandatory covariates; here bmi and sex are chosen.
stepno = 100, # number of boosting steps (M).
penalty = 100,
criterion = "pscore", # criterion to be used for selection in each boosting step.
# "pscore" corresponds to the penalized score statistics, "score" to the
# un-penalized score statistics. Different results will only be seen for
# un-standardized covariates ("pscore" will result in preferential
# selection of covariates with larger covariance), or if different
# penalties are used for different covariates.
return.score = T, # whether or not the value of the score statistic (or penalized
# score statistic, depending on criterion), as evaluated in each boosting
# step for every covariate, should be returned.
trace = T # logical value indicating whether progress in estimation
# should be indicated by printing the name of the covariate updated.
)
# print a summary of the fitted CoxBoost model
summary(fit.cb)
# variables contained in the fitted object
names(fit.cb)
# coefficient estimates
dim(fit.cb$coefficients) # (stepno+1) x p matrix containing the coefficient estimates for the
# (standardized) optional covariates for boosting steps 0 to stepno.
fit.cb$coefficients[101,] # which coefficients are estimated as 0 after the last step (row index = step number + 1)
fit.cb$coefficients[99,]  # ... and after some earlier steps
fit.cb$coefficients[80,]
# finally selected and not-selected coefficients
colnames(X)[ which(fit.cb$coefficients[101,] != 0) ] # selected
colnames(X)[ which(fit.cb$coefficients[101,] == 0) ] # not selected
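To see the actual values of the finally selected coefficients with their names, we can attach the column names of X to the last row of the coefficient matrix (this assumes, as the two commands above already do, that the columns of fit.cb$coefficients are aligned with the columns of X):
coef.final = fit.cb$coefficients[ fit.cb$stepno + 1, ] # coefficients after the final boosting step
names(coef.final) = colnames(X)
sort( coef.final[ coef.final != 0 ] )                  # nonzero (selected) coefficients, sorted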
# score statistic estimates
dim(fit.cb$scoremat) # stepno x (p-p.unpen) matrix containing the value of the score statistic for
# each of the optional covariates before each boosting step.
which.max(fit.cb$scoremat[1,]) # which coefficient was updated in the first step
which.max(fit.cb$scoremat[2,])
which.max(fit.cb$scoremat[3,])
which.max(fit.cb$scoremat[4,])
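Rather than inspecting one step at a time, we can tabulate the covariate with the largest (penalized) score statistic before every boosting step, i.e. the covariate updated at that step under the "pscore" criterion. This sketch assumes the columns of scoremat are the optional covariates of X in order (X without the mandatory columns 5 and 25):
updated = apply( fit.cb$scoremat, 1, which.max )  # index of the covariate with the largest score before each step
table( colnames(X)[ -c(5,25) ][ updated ] )       # how often each optional covariate was updated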
# linear predictor estimates
dim(fit.cb$linear.predictor) # (stepno+1) x n matrix giving the linear predictor for boosting steps
# 0 to stepno and every observation.
# cumulative baseline hazard estimates
dim(fit.cb$Lambda) # (stepno+1) x ntime matrix with the Breslow estimate for the cumulative baseline
# hazard for boosting steps 0 to stepno for every event time.
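For instance, the cumulative baseline hazard after the final boosting step at the first few event times can be inspected as:
fit.cb$Lambda[ fit.cb$stepno + 1, 1:5 ] # Breslow estimate at the final step, first five event times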
# used penalty values
fit.cb$penalty
# Fit K-fold cross-validation for CoxBoost to determine an adequate penalty parameter (to be chosen only very coarsely)
optim.fit.cb = optimCoxBoostPenalty(time = data.imp$Time, status = data.imp$event, x = X, unpen.index=c(5,25),
minstepno = 50, maxstepno = 200, # range within which the “optimal” number
# of boosting steps should lie.
start.penalty = 100, # start value for the search for the appropriate penalty.
iter.max = 10, trace = T)
# the optimal penalty
optim.fit.cb$penalty
# Fit K-fold cross-validation for CoxBoost to determine the optimal number of boosting steps
cv.fit.cb = cv.CoxBoost(time = data.imp$Time, status = data.imp$event, x = X, unpen.index=c(5,25),
maxstepno = 100, K = 10, type = "verweij", trace = T, penalty = 100,
criterion = "pscore" )
# maxstepno = maximum number of boosting steps to evaluate. The returned “optimal” number of
# boosting steps will be in the range [0,maxstepno].
# plot mean partial log-likelihood
cv.fit.cb$mean.logplik # vector of length maxstepno+1 with the mean partial log-likelihood for boosting
# steps 0 to maxstepno
plot(cv.fit.cb$mean.logplik)
# optimal boosting step number
cv.fit.cb$optimal.step # the step with maximum mean partial log-likelihood.
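As mentioned earlier, in practice we would plug the objectively determined values back into the main fit in [5]. A sketch of that refit, simply reusing the tuned penalty from [6] and the optimal step number from [7] (ideally, [7] would itself be rerun with the tuned penalty before this step):
fit.cb.final = CoxBoost( time = data.imp$Time, status = data.imp$event, x = X, unpen.index = c(5,25),
                         standardize = TRUE, criterion = "pscore", trace = FALSE,
                         penalty = optim.fit.cb$penalty,    # tuned penalty from [6]
                         stepno = cv.fit.cb$optimal.step )  # optimal number of boosting steps from [7]
summary(fit.cb.final)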