Different people like to write and organize code in different ways. I generally followed the points below.
More spacious: I would write x = c(2,3) rather than x=c(2,3).
I used “=” instead of “<-” in all places where they are consequentially the same. [Note: Only in particular cases does “<-” behave differently from “=”. For example, cor(x=1:3,y=4:6) and cor(x<-1:3,y<-4:6) both give the output 1, but only in the latter case do x and y also get defined in the global environment. As an aside, if you want to define a variable within the body of a function and also have it available in the global environment, use the superassignment operator, like this: a = function(){ ff <<- 2 }. Here, on running the function a, ff will also get defined in the global environment.]
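To make the note above concrete, here is a minimal sketch that can be run in a fresh R session; the exists() calls simply check whether the objects were created:
cor(x = 1:3, y = 4:6) # returns 1; x and y here are only argument names
exists("x") # FALSE (assuming no object x exists yet)
cor(x <- 1:3, y <- 4:6) # returns 1; this time x and y are also created in the global environment
exists("x") # TRUE
a = function(){ ff <<- 2 }
a() # running a() creates ff in the global environment via <<-
exists("ff") # TRUE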
Loops: For multiline for/if/while loops, I kept the curly brackets ({}) vertically aligned. Thus, instead of writing
for(i in 1:10){
  a = i
  b = i + 1
}
I wrote
for(i in 1:10)
{
  a = i
  b = i + 1
}
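The same brace alignment carries over to if and while blocks; a small illustrative example in that style:
i = 1
while(i < 10)
{
  i = i + 1
}
if(i == 10)
{
  a = i
  b = i + 1
}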
The data file is kept within the data directory and the analysis result files are saved as an R workspace within the Results directory. Below, in [1]-[4] we prepare the data for analysis, and in [5] we fit the CoxBoost model with a user-chosen number of boosting steps and penalty parameter. In [6] and [7], we see how to objectively determine the penalty parameter and the number of boosting steps. In practice, we can run [6] and [7] first to get the objectively determined values and then use them to fit the main CoxBoost model in [5].
# empty workspace
rm(list=ls())
# load library
library(data.table) # for data input/output
library(mice) # for imputation using mice() function
library(CoxBoost) # for CoxBoost analysis using CoxBoost() function
library(kableExtra) # for printing nice looking tables using functions like column_spec()
library(ggplot2) # for making nice looking compact plots using data.table object
library(riskRegression) # for fitting Fine and Gray model using FGR()
library(pec) # for predicting cumulative incidence from fitted CR object using predictEventProb()
library(prodlim) # for nonparametric event-history estimation (e.g., the Hist() function used by riskRegression/pec)
# set directories
workdir = "/Users/nsanyal/Dropbox/Statistics/Research_Nil/Project-10-CompetingRisk/Introductory_manuals/CB_for_CR/"
# import data
D = fread( paste0( workdir, "data/example_SPLC_data.csv" ) )
dim(D)
## [1] 1000 20
# exclude never smokers (1=never, 2=former, 3=current)
D = D[ !D$smkstatus2 == 1, ]
dim(D)
## [1] 873 20
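As a quick optional check, the smoking status distribution after the exclusion can be tabulated; only former (2) and current (3) smokers should remain:
table(D$smkstatus2) # counts by smoking status after excluding never smokers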
# Choose variables to include and subset data
vars.include = c( "Time", "event", "age.ix.cat", "bmi", "chemo_ix", "cigday2", "copd", "edu", "fh", "hist2.ix", "packyears2", "ph", "quityears2", "race.cat", "radiation_ix", "sex", "smkstatus2", "stage2.ix", "surgery_ix", "USPSTF" )
data = D[, ..vars.include]
data$age.ix.cat = unname( sapply( data$age.ix.cat, function(x) switch(x, "71-80" = "71.80", "80+" = "80", "61-70" = "61.70", "<55" = "55", "56-60" = "56.60" ) ) ) # replace the characters -, < and + by .
# Set missing code, if any, to NA. Then, convert categorical variables to factor.
data$age.ix.cat = relevel( as.factor(data$age.ix.cat), ref='61.70' )

data$chemo_ix = ifelse( data$chemo_ix==9, NA, data$chemo_ix )
data$chemo_ix = relevel( as.factor(data$chemo_ix), ref=1 )

data$copd = ifelse( data$copd==9, NA, data$copd )
data$copd = as.factor( as.character(data$copd) )

data$edu = relevel( as.factor( as.character(data$edu) ), ref="2") # 2 = college group is reference

data$fh = as.factor( as.character(data$fh) )

data$hist2.ix = relevel( as.factor(data$hist2.ix), ref="SQ" )

data$ph = as.factor( as.character(data$ph) )

data$race.cat = relevel( as.factor(data$race.cat), ref="W" )

data$radiation_ix = ifelse( data$radiation_ix==9, NA, data$radiation_ix )
data$radiation_ix = relevel( as.factor( as.character(data$radiation_ix) ), ref="0") # 0 = No radiotherapy

data$sex = relevel( as.factor( as.character(data$sex) ), ref="0") # 0 = Female
data$smkstatus2 = relevel( as.factor( as.character(data$smkstatus2) ), ref="2") # 2 = Former

data$stage2.ix = ifelse( data$stage2.ix==9, NA, data$stage2.ix )
data$stage2.ix = relevel( as.factor( as.character(data$stage2.ix) ), ref="0") # 0 = No stage 2 primary LC

data$surgery_ix = ifelse( data$surgery_ix==9, NA, data$surgery_ix )
data$surgery_ix = relevel( as.factor(data$surgery_ix), ref=1 )

data$USPSTF = relevel( as.factor( as.character(data$USPSTF) ), ref="0") # 0 = reference level
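Before running the imputation, it is useful to check how many missing values each variable carries (a quick optional check):
colSums( is.na(data) ) # number of NAs per variable; the variables whose missing code 9 was set to NA above will show up here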
# set a seed
seed = c(12)
# perform single imputation
mi = mice( data = data, m = 1, maxit = 5, print = T, seed = seed )
# extract imputed dataset
data.imp = complete(mi,1)
# # save imputed data
# save( list = c("mi","data.imp"), file = paste0("Results/data_imp_seed",seed,".Rdata") )
# load imputed data
load(paste0(workdir,"Results/data_imp_seed12.Rdata"))
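After loading (or creating) the imputed data set, a quick sanity check that the imputation left no missing values:
anyNA(data.imp) # should be FALSE after imputation
dim(data.imp) # same dimensions as data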
# create the model formula and covariate matrix
mat = data.imp[, -c(1:2)]
expr = as.formula(paste(" ~ ", paste(colnames(mat), collapse = " + ")))
X = model.matrix(expr, mat)[, -1]
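The unpen.index argument used below (columns 5 and 25 of X) refers to column positions in the model matrix X, which depend on how the factors are expanded into dummy variables. It is safer to look the positions up by name; the regular expression below is an assumption about how the dummy columns are named (e.g., the sex dummy may appear as "sex1" given the reference level "0"):
grep( "^bmi$|^sex", colnames(X) ) # column indices of bmi and the sex dummy in X (should match unpen.index below)
colnames(X)[ grep( "^bmi$|^sex", colnames(X) ) ] # the corresponding column names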
# fit CoxBoost model
fit.cb = CoxBoost(
time = data.imp$Time,
status = data.imp$event,
x = X, # n x p matrix of covariates.
standardize = T, # whether or not covariates should be standardized for
# estimation. Mandatory covariates are not standardized.
unpen.index=c(5,25), # indices of mandatory covariates. Here, bmi and sex are chosen.
stepno = 100, # number of boosting steps (M).
penalty = 100, # penalty value for the update of an individual element of the parameter vector in each boosting step.
criterion = "pscore", # criterion to be used for selection in each boosting step.
# "pscore" corresponds to the penalized score statistics, "score" to the
# un-penalized score statistics. Different results will only be seen for
# un-standardized covariates ("pscore" will result in preferential
# selection of covariates with larger covariance), or if different
# penalties are used for different covariates.
return.score = T, # whether or not the value of the score statistic (or penalized
# score statistic, depending on criterion), as evaluated in each boosting
# step for every covariate, should be returned.
trace = T # logical value indicating whether progress in estimation
# should be indicated by printing the name of the covariate updated.
)
# print a summary of the fitted CoxBoost model
summary(fit.cb)
# variables contained in the fitted object
names(fit.cb)
# coefficient estimates
dim(fit.cb$coefficients) # (stepno+1) x p matrix containing the coefficient estimates for the
# (standardized) optional covariates for boosting steps 0 to stepno.
fit.cb$coefficients[100,] # check which coefficients are estimated as 0 in the last step
fit.cb$coefficients[99,]
fit.cb$coefficients[80,]
# finally selected and not-selected coefficients
colnames(X)[ which(fit.cb$coefficients[100,] != 0) ] # selected
colnames(X)[ which(fit.cb$coefficients[100,] == 0) ] # not selected
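The CoxBoost package also provides a plot method for fitted objects which, to my understanding, draws the coefficient paths across the boosting steps; this gives a visual counterpart to the selected/not-selected lists above:
plot(fit.cb) # coefficient build-up paths over the boosting steps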
# score statistic estimates
dim(fit.cb$scoremat) # stepno x (p-p.unpen) matrix containing the value of the score statistic for
# each of the optional covariates before each boosting step.
which.max(fit.cb$scoremat[1,]) # which coefficient was updated in the first step
which.max(fit.cb$scoremat[2,])
which.max(fit.cb$scoremat[3,])
which.max(fit.cb$scoremat[4,])
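Since scoremat covers only the optional (penalized) covariates, the indices returned by which.max refer to the columns of X with the mandatory columns removed. A small sketch to translate them into covariate names, assuming the scoremat columns follow the original covariate order with the mandatory columns (5 and 25) dropped:
optional.names = colnames(X)[ -c(5,25) ] # names of the penalized covariates (assumed to match the scoremat column order)
optional.names[ which.max(fit.cb$scoremat[1,]) ] # name of the covariate updated in the first boosting step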
# linear predictor estimates
dim(fit.cb$linear.predictor) # (stepno+1) x n matrix giving the linear predictor for boosting steps
# 0 to stepno and every observation.
# cumulative baseline hazard estimates
dim(fit.cb$Lambda) # (stepno+1) x ntime matrix with the Breslow estimate for the cumulative baseline
# hazard for boosting steps 0 to stepno for every event time.
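For illustration, predictions can also be obtained with the package's predict method. The call below follows my reading of the CoxBoost documentation (type = "risk" should return, for each observation, the predicted probability of not yet having had the event at the requested times); the time points are arbitrary examples in the unit of the Time variable, so check ?predict.CoxBoost before relying on this:
pred.risk = predict( fit.cb, newdata = X, times = c(12, 36, 60), type = "risk" )
dim(pred.risk) # one row per observation, one column per requested time point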
# used penalty values
fit.cb$penalty
# Run K-fold cross-validation for CoxBoost to determine an adequate penalty parameter (it needs to be chosen only very coarsely)
optim.fit.cb = optimCoxBoostPenalty( time = data.imp$Time, status = data.imp$event, x = X, unpen.index=c(5,25),
minstepno = 50, maxstepno = 200, # range in which the “optimal” number
# of boosting steps is expected to lie.
start.penalty = 100, # start value for the search for the appropriate penalty.
iter.max = 10, trace = T )
# the optimal penalty
optim.fit.cb$penalty
# Run K-fold cross-validation for CoxBoost to determine the optimal number of boosting steps
cv.fit.cb = cv.CoxBoost( time = data.imp$Time, status = data.imp$event, x = X, unpen.index=c(5,25),
maxstepno = 100, K = 10, type = "verweij", trace = T, penalty = 100,
criterion = "pscore" )
# maxstepno = maximum number of boosting steps to evaluate. The returned “optimal” number of
# boosting steps will be in the range [0,maxstepno].
# plot mean partial log-likelihood
cv.fit.cb$mean.logplik # vector of length maxstepno+1 with the mean partial log-likelihood for boosting
# steps 0 to maxstepno
plot(cv.fit.cb$mean.logplik)
# optimal boosting step number
cv.fit.cb$optimal.step # the step with maximum mean partial log-likelihood.
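As mentioned at the start, in practice [6] and [7] would be run first and their results plugged back into the main fit of [5]. A sketch of that final refit (strictly, the cross-validation in [7] should be rerun with the penalty obtained in [6] before reading off optimal.step):
fit.cb.final = CoxBoost( time = data.imp$Time, status = data.imp$event, x = X, unpen.index = c(5,25),
penalty = optim.fit.cb$penalty, # penalty determined in [6]
stepno = cv.fit.cb$optimal.step, # number of boosting steps determined in [7]
standardize = T, criterion = "pscore", trace = T )
summary(fit.cb.final)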