Estimating and Presenting Non-Linear Associations with Restricted Cubic Splines

Authors

Andrea Discacciati

Michael G. Palazzolo

Jeong-Gun Park

Giorgio E.M. Melloni

Sabina A. Murphy

Andrea Bellavia

Compiled

2025-04-09


Note

The R functions data_or_hr and data_probability are available from our RCSpline GitHub repository: https://andreabellavia.github.io/RCSplines/ (the following packages must be installed: Epi, aod, survival, Hmisc, glmtoolbox).

They can also be downloaded directly from this link: https://andreabellavia.github.io/RCSplines/continuous/R/R_functions.R.

Load the packages, the dataset, and the functions.

# Load packages
library(readxl)
library(rms)
library(survival)
library(ggplot2)
library(scales)
library(WeightIt)
library(cobalt)
library(glmtoolbox)

# Load dataset
data.example <- read_excel("data_example.xlsx")
data.example$y <- data.example$status
data.example$t <- data.example$time/365.25

# Load functions
source("R_functions.R")

Graphical options.

# Colours for plotting
colour.palette <- c("#00798c", "#d1495b", "#edae49",
                    "#66a182", "#2e4057", "grey")

# Set referent value for covariate X
referent.value <- median(data.example$x)

# Label x-axis
x.axis.label <- "log(NT-proBNP)"

# Label secondary y-axis (histogram)
y2.axis.label <- "Distribution of log(NT-proBNP)"

1 Logistic regression

1.1 Fit the model with x as the only covariate

logit.rcs <- glm(y ~ rcs(x, 4), 
                 data = data.example,
                 family = binomial(link = "logit"))
summary(logit.rcs)

Call:
glm(formula = y ~ rcs(x, 4), family = binomial(link = "logit"), 
    data = data.example)

Coefficients:
             Estimate Std. Error z value Pr(>|z|)    
(Intercept)  -2.67724    0.60773  -4.405 1.06e-05 ***
rcs(x, 4)x    0.05331    0.15126   0.352    0.725    
rcs(x, 4)x'   0.39251    0.48467   0.810    0.418    
rcs(x, 4)x''  0.12663    1.66549   0.076    0.939    
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 5217.2  on 7254  degrees of freedom
Residual deviance: 5010.1  on 7251  degrees of freedom
AIC: 5018.1

Number of Fisher Scoring iterations: 5

1.2 Generate the data to plot the OR as a function of x and save the RCS knots’ location

logit.or.data <- data_or_hr(model = logit.rcs, 
                            covariate = x, 
                            referent.value = referent.value)

P-value for overall association: <0.0001
P-value for non-linearity: <0.0001
knots.location <- attr(logit.or.data, "knots.location")

head(logit.or.data)
         x   Estimate    StdErr          z         P exp(Est.)      2.5%    97.5%
1 1.250270 -0.3249129 0.4461308 -0.7282906 0.4664357 0.7225903 0.3014012 1.732365
2 1.324428 -0.3209598 0.4350176 -0.7378088 0.4606306 0.7254524 0.3092582 1.701753
3 1.398585 -0.3170068 0.4239099 -0.7478164 0.4545709 0.7283258 0.3173167 1.671701
4 1.472743 -0.3130537 0.4128081 -0.7583517 0.4482405 0.7312106 0.3255814 1.642198
5 1.546900 -0.3091007 0.4017128 -0.7694570 0.4416221 0.7341068 0.3340571 1.613236
6 1.621058 -0.3051477 0.3906244 -0.7811792 0.4346971 0.7370145 0.3427488 1.584806

1.3 Plot the OR as a function of x

1.3.1 Basic plot

ggplot(logit.or.data, 
       aes(x = x, 
           y = log(`exp(Est.)`), 
           ymin = log(`2.5%`), 
           ymax = log(`97.5%`))) +
  geom_line() + 
  geom_ribbon(alpha = .25) 

1.3.2 More refined plot

ggplot(logit.or.data, 
       aes(x = x, 
           y = log(`exp(Est.)`), 
           ymin = log(`2.5%`), 
           ymax = log(`97.5%`))) +
  geom_histogram(data = data.example, aes(x = x, y = after_stat(density*width*15)),
                 inherit.aes = FALSE,
                 bins = 30,
                 fill = tail(colour.palette, 1)) +
  geom_line(color = colour.palette[1], lwd = 1.5) + 
  geom_ribbon(fill = colour.palette[1], alpha = .25) +
  labs(x = x.axis.label, 
       y = "Odds Ratio (95% confidence interval)",
       caption = "Vertical dashed line indicates the referent value 
       Vertical dotted lines indicate knots' location") + 
  theme_bw(base_size = 11) +
  geom_vline(xintercept = referent.value, lty = 2) + 
  geom_vline(xintercept = knots.location, lty = 3) + 
  scale_y_continuous(breaks = log(c(0.1, 0.25, 0.5, 1, 2, 4, 6, 10, 20)), 
                     labels = \(x) sprintf("%4.2f", exp(x)),
                     sec.axis = sec_axis(~./15, 
                                         breaks = seq(0, .10, by = .025),
                                         labels = percent_format(), 
                                         name = y2.axis.label))   +
  scale_x_continuous(breaks = log(c(5, 10, 20, 50, 150, 400, 1000, 3000, 8000)),
                     labels = \(x) sprintf("%.0f", exp(x)))

1.4 Generate the data to plot the event probability as a function of x

logit.prob.data <- data_probability(model = logit.rcs, 
                                    covariate = x)

head(logit.prob.data)
         x   Estimate       2.5%     97.5%
1 1.250270 0.06846001 0.03122661 0.1435124
2 1.324428 0.06871254 0.03201077 0.1413498
3 1.398585 0.06896593 0.03281337 0.1392167
4 1.472743 0.06922019 0.03363476 0.1371130
5 1.546900 0.06947531 0.03447526 0.1350386
6 1.621058 0.06973130 0.03533519 0.1329934

1.5 Plot the event probability as a function of x

1.5.1 Basic plot

ggplot(logit.prob.data, 
       aes(x = x, 
           y = Estimate, 
           ymin = `2.5%`, 
           ymax = `97.5%`)) +
  geom_line() + 
  geom_ribbon(alpha = .25) 

1.5.2 More refined plot

ggplot(logit.prob.data, 
       aes(x = x, 
           y = Estimate, 
           ymin = `2.5%`, 
           ymax = `97.5%`)) +
  geom_histogram(data = data.example, aes(x = x, y = after_stat(density*width*5)), 
                 inherit.aes = FALSE,
                 bins = 30,
                 fill = tail(colour.palette, 1)) +
  geom_line(color = colour.palette[1], lwd = 1.5) + 
  geom_ribbon(fill = colour.palette[1], alpha = .25) +
  labs(x = x.axis.label, 
       y = "Probability (%) (95% confidence interval)",
       caption = "Vertical dotted lines indicate knots' location") + 
  theme_bw(base_size = 11) +
  geom_vline(xintercept = knots.location, lty = 3) + 
  scale_y_continuous(breaks = seq(0, 1, 0.1),
                     limits = c(0, 1),
                     labels = percent_format(),
                     sec.axis = sec_axis(~./5, 
                                         breaks = seq(0, .10, by = .025),
                                         labels = percent_format(), 
                                         name = y2.axis.label))     +
  scale_x_continuous(breaks = log(c(5, 10, 20, 50, 150, 400, 1000, 3000, 8000)),
                     labels = \(x) sprintf("%.0f", exp(x)))

1.6 Dealing with additional coavariates (i.e., covariates other than x)

1.6.1 Conditional model

Add the covariate age to the model and model it with a second-degree polynomial transform.

logit.rcs.cov <- glm(y ~ rcs(x, 4) + poly(age, 2), 
                     data = data.example,
                     family = binomial(link = "logit"))
summary(logit.rcs.cov)

Call:
glm(formula = y ~ rcs(x, 4) + poly(age, 2), family = binomial(link = "logit"), 
    data = data.example)

Coefficients:
              Estimate Std. Error z value Pr(>|z|)    
(Intercept)   -3.55214    0.63751  -5.572 2.52e-08 ***
rcs(x, 4)x     0.17106    0.15822   1.081    0.280    
rcs(x, 4)x'    0.42445    0.50651   0.838    0.402    
rcs(x, 4)x''  -0.03209    1.74171  -0.018    0.985    
poly(age, 2)1 72.45505    4.28424  16.912  < 2e-16 ***
poly(age, 2)2  6.60949    3.59505   1.838    0.066 .  
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 5217.2  on 7254  degrees of freedom
Residual deviance: 4491.2  on 7249  degrees of freedom
AIC: 4503.2

Number of Fisher Scoring iterations: 6

1.6.1.1 Odds Ratio

Generate the data to plot the ORs (conditional on age) as a function of x and save the RCS knots’ location.

logit.or.data.cov <- data_or_hr(model = logit.rcs.cov, 
                                covariate = x, 
                                referent.value = referent.value)

P-value for overall association: <0.0001
P-value for non-linearity: <0.0001
knots.location <- attr(logit.or.data.cov, "knots.location")

head(logit.or.data.cov)
         x   Estimate    StdErr         z          P exp(Est.)      2.5%    97.5%
1 1.250270 -0.7801561 0.4668110 -1.671246 0.09467302 0.4583344 0.1835829 1.144281
2 1.324428 -0.7674707 0.4551864 -1.686058 0.09178465 0.4641857 0.1902113 1.132784
3 1.398585 -0.7547852 0.4435677 -1.701624 0.08882593 0.4701116 0.1970768 1.121415
4 1.472743 -0.7420998 0.4319551 -1.718002 0.08579619 0.4761131 0.2041876 1.110174
5 1.546900 -0.7294143 0.4203491 -1.735258 0.08269507 0.4821913 0.2115522 1.099060
6 1.621058 -0.7167289 0.4087505 -1.753463 0.07952256 0.4883471 0.2191793 1.088072

Plot the OR, conditional on age, as a function of x.

ggplot(logit.or.data.cov, 
       aes(x = x, 
           y = log(`exp(Est.)`), 
           ymin = log(`2.5%`), 
           ymax = log(`97.5%`))) +
  geom_histogram(data = data.example, aes(x = x, y = after_stat(density*width*15)),
                 inherit.aes = FALSE,
                 bins = 30,
                 fill = tail(colour.palette, 1)) +
  geom_line(color = colour.palette[1], lwd = 1.5) + 
  geom_ribbon(fill = colour.palette[1], alpha = .25) +
  labs(x = x.axis.label, 
       y = "Odds Ratio (95% confidence interval)",
       caption = "Vertical dashed line indicates the referent value 
       Vertical dotted lines indicate knots' location") + 
  theme_bw(base_size = 11) +
  geom_vline(xintercept = referent.value, lty = 2) + 
  geom_vline(xintercept = knots.location, lty = 3) + 
  scale_y_continuous(breaks = log(c(0.1, 0.25, 0.5, 1, 2, 4, 6, 10, 20)), 
                     labels = \(x) sprintf("%4.2f", exp(x)),
                     sec.axis = sec_axis(~./15, 
                                         breaks = seq(0, .10, by = .025),
                                         labels = percent_format(), 
                                         name = y2.axis.label))   +
  scale_x_continuous(breaks = log(c(5, 10, 20, 50, 150, 400, 1000, 3000, 8000)),
                     labels = \(x) sprintf("%.0f", exp(x)))

1.6.1.2 Event probabilty

Generate the data to plot the event probability (conditional on age) as a function of x. Fix the values of age at the 10th, median, and 90th percentile of its distribution: 52, 61, 69 years.

newdata.cov <- data.frame(age = quantile(data.example$age, c(.1, .5, .9)))

logit.prob.data.cov <- data_probability(model = logit.rcs.cov, 
                                        covariate = x,
                                        newdata = newdata.cov)

head(logit.prob.data.cov)
         x age   Estimate        2.5%      97.5%
1 1.250270  52 0.01200340 0.004950772 0.02881195
2 1.324428  52 0.01215478 0.005124542 0.02855282
3 1.398585  52 0.01230804 0.005304097 0.02829742
4 1.472743  52 0.01246321 0.005489593 0.02804581
5 1.546900  52 0.01262031 0.005681185 0.02779807
6 1.621058  52 0.01277936 0.005879030 0.02755427
ggplot(logit.prob.data.cov, 
       aes(x = x, 
           y = Estimate, 
           ymin = `2.5%`, 
           ymax = `97.5%`,
           color = factor(age),
           fill = factor(age))) +
  geom_histogram(data = data.example, aes(x = x, y = after_stat(density*width*5)), 
                 inherit.aes = FALSE,
                 bins = 30,
                 fill = tail(colour.palette, 1)) +
  geom_line(lwd = 1.5) + 
  geom_ribbon(alpha = .25, linewidth = 0) +
  labs(x = x.axis.label, 
       y = "Probability (%) (95% confidence interval)",
       caption = "Vertical dotted lines indicate knots' location",
       color = "Age (years)",
       fill = "Age (years)") + 
  theme_bw(base_size = 11) +
  theme(legend.position = "bottom") +
  geom_vline(xintercept = knots.location, lty = 3) + 
  scale_color_manual(values = colour.palette) +
  scale_fill_manual(values = colour.palette) +
  scale_y_continuous(breaks = seq(0, 1, 0.1),
                     limits = c(0, 1),
                     labels = percent_format(),
                     sec.axis = sec_axis(~./5, 
                                         breaks = seq(0, .10, by = .025),
                                         labels = percent_format(), 
                                         name = y2.axis.label)) +
  scale_x_continuous(breaks = log(c(5, 10, 20, 50, 150, 400, 1000, 3000, 8000)),
                     labels = \(x) sprintf("%.0f", exp(x)))

1.6.2 Marginal model

Generate inverse probability weights (IPW) to balance the distribution of the covariate age across levels of x (see help(WeightIt)).

ipw.wgtit <- weightit(x ~ age,
                      data = data.example,
                      method = "cbps")
data.example$ipweights <- ipw.wgtit$weights

Assess balance numerically (see help(cobalt)).

bal.tab(ipw.wgtit, 
        stats = c("c", "k"), 
        un = TRUE, 
        thresholds = c(cor = .1), 
        poly = 2)
Balance Measures
        Type Corr.Un Corr.Adj    R.Threshold KS.Target.Adj
age  Contin. -0.1109   0.0000 Balanced, <0.1        0.0025
age² Contin. -0.1079   0.0034 Balanced, <0.1        0.0025

Balance tally for treatment correlations
                   count
Balanced, <0.1         2
Not Balanced, >0.1     0

Variable with the greatest treatment correlation
 Variable Corr.Adj    R.Threshold
     age²   0.0034 Balanced, <0.1

Effective sample sizes
             Total
Unadjusted 7255.  
Adjusted   7160.23

Fit a marginal model using the IPWs generated previously (note: use the function glmgee from the glmtoolbox package to obtain robust standard errors).

logit.rcs.ipw <- glmgee(y ~ rcs(x, 4), 
                        data = data.example,
                        family = binomial(link = "logit"),
                        weights = ipweights,
                        id = id)
summary(logit.rcs.ipw)

Sample size
   Number of observations:  7255
       Number of clusters:  7255 
             Cluster size:  1 
*************************************************************
Model
        Variance function:  binomial
            Link function:  logit
    Correlation structure:  Independence
*************************************************************
Coefficients
             Estimate Std.Error  z-value   Pr(>|z|)
(Intercept)  -3.02043   0.60992 -4.95221 7.3374e-07
rcs(x, 4)x    0.11516   0.15199  0.75764    0.44867
rcs(x, 4)x'   0.49022   0.49159  0.99722    0.31866
rcs(x, 4)x'' -0.25606   1.69526 -0.15105    0.87994
                                                   
Dispersion    0.99961                              
*************************************************************
Working correlation
    [1] 
[1]    1

1.6.2.1 Odds Ratio

Generate the data to plot the ORs (averaged over the distribution of age) as a function of x and save the RCS knots’ location.

logit.or.data.ipw <- data_or_hr(model = logit.rcs.ipw, 
                                covariate = x, 
                                referent.value = referent.value)

P-value for overall association: <0.0001
P-value for non-linearity: <0.0001
knots.location <- attr(logit.or.data.ipw, "knots.location")

head(logit.or.data.ipw)
         x   Estimate    StdErr         z         P exp(Est.)      2.5%    97.5%
1 1.250270 -0.5879314 0.4472455 -1.314561 0.1886576 0.5554752 0.2311897 1.334630
2 1.324428 -0.5793917 0.4360814 -1.328632 0.1839694 0.5602391 0.2383308 1.316942
3 1.398585 -0.5708520 0.4249229 -1.343425 0.1791345 0.5650438 0.2456898 1.299503
4 1.472743 -0.5623123 0.4137706 -1.358995 0.1741481 0.5698898 0.2532729 1.282310
5 1.546900 -0.5537726 0.4026248 -1.375406 0.1690057 0.5747773 0.2610867 1.265361
6 1.621058 -0.5452329 0.3914863 -1.392725 0.1637029 0.5797068 0.2691377 1.248654

Plot the OR as a function of x.

ggplot(logit.or.data.ipw, 
       aes(x = x, 
           y = log(`exp(Est.)`), 
           ymin = log(`2.5%`), 
           ymax = log(`97.5%`))) +
  geom_histogram(data = data.example, aes(x = x, y = after_stat(density*width*15)),
                 inherit.aes = FALSE,
                 bins = 30,
                 fill = tail(colour.palette, 1)) +
  geom_line(color = colour.palette[1], lwd = 1.5) + 
  geom_ribbon(fill = colour.palette[1], alpha = .25) +
  labs(x = x.axis.label, 
       y = "Odds Ratio (95% confidence interval)",
       caption = "Vertical dashed line indicates the referent value 
       Vertical dotted lines indicate knots' location") + 
  theme_bw(base_size = 11) +
  geom_vline(xintercept = referent.value, lty = 2) + 
  geom_vline(xintercept = knots.location, lty = 3) + 
  scale_y_continuous(breaks = log(c(0.1, 0.25, 0.5, 1, 2, 4, 6, 10, 20)), 
                     labels = \(x) sprintf("%4.2f", exp(x)),
                     sec.axis = sec_axis(~./15, 
                                         breaks = seq(0, .10, by = .025),
                                         labels = percent_format(), 
                                         name = y2.axis.label))   +
  scale_x_continuous(breaks = log(c(5, 10, 20, 50, 150, 400, 1000, 3000, 8000)),
                     labels = \(x) sprintf("%.0f", exp(x)))

1.6.2.2 Event probability

Generate the data to plot the event probability (averaged over the distribution of age) as a function of x.

logit.prob.data.ipw <- data_probability(model = logit.rcs.ipw, 
                                        covariate = x)

head(logit.prob.data.ipw)
         x   Estimate       2.5%     97.5%
1 1.250270 0.05332996 0.02405146 0.1140836
2 1.324428 0.05376274 0.02477293 0.1127547
3 1.398585 0.05419884 0.02551501 0.1114412
4 1.472743 0.05463826 0.02627820 0.1101431
5 1.546900 0.05508104 0.02706299 0.1088604
6 1.621058 0.05552720 0.02786990 0.1075931

Plot the event probability as a function of x.

ggplot(logit.prob.data.ipw, 
       aes(x = x, 
           y = Estimate, 
           ymin = `2.5%`, 
           ymax = `97.5%`)) +
  geom_histogram(data = data.example, aes(x = x, y = after_stat(density*width*5)), 
                 inherit.aes = FALSE,
                 bins = 30,
                 fill = tail(colour.palette, 1)) +
  geom_line(color = colour.palette[1], lwd = 1.5) + 
  geom_ribbon(fill = colour.palette[1], alpha = .25) +
  labs(x = x.axis.label, 
       y = "Probability (%) (95% confidence interval)",
       caption = "Vertical dotted lines indicate knots' location") + 
  theme_bw(base_size = 11) +
  geom_vline(xintercept = knots.location, lty = 3) + 
  scale_y_continuous(breaks = seq(0, 1, 0.1),
                     limits = c(0, 1),
                     labels = percent_format(),
                     sec.axis = sec_axis(~./5, 
                                         breaks = seq(0, .10, by = .025),
                                         labels = percent_format(), 
                                         name = y2.axis.label))     +
  scale_x_continuous(breaks = log(c(5, 10, 20, 50, 150, 400, 1000, 3000, 8000)),
                     labels = \(x) sprintf("%.0f", exp(x)))

1.7 Explore how the shape of the OR as a function of x changes as the number of the knots changes

logit.or.data.nknots <- lapply(3:8, \(n.knots) {
  logit.rcs <- glm(y ~ rcs(x, n.knots), 
                   data = data.example,
                   family = binomial(link = "logit"))
  
  cat("--->", n.knots, "knots, AIC:", AIC(logit.rcs), "\n")
  
  data <- data_or_hr(model = logit.rcs, 
                     covariate = x,
                     referent.value = referent.value)
  data$n.knots <- n.knots
  data
})
---> 3 knots, AIC: 5015.7 

P-value for overall association: <0.0001
P-value for non-linearity: <0.0001

---> 4 knots, AIC: 5018.103 

P-value for overall association: <0.0001
P-value for non-linearity: <0.0001

---> 5 knots, AIC: 5018.088 

P-value for overall association: <0.0001
P-value for non-linearity: <0.0001

---> 6 knots, AIC: 5019.999 

P-value for overall association: <0.0001
P-value for non-linearity: <0.0001

---> 7 knots, AIC: 5022.09 

P-value for overall association: <0.0001
P-value for non-linearity: <0.0001

---> 8 knots, AIC: 5023.735 

P-value for overall association: <0.0001
P-value for non-linearity: <0.0001
ggplot(do.call("rbind", logit.or.data.nknots), 
       aes(x = x, 
           y = log(`exp(Est.)`),
           color = factor(n.knots))) +
  geom_line(lwd = 1.5) + 
  geom_vline(xintercept = referent.value, lty = 2) + 
  labs(x = x.axis.label, 
       y = "Odds Ratio (95% confidence interval)",
       color = "Number of knots") + 
  theme_bw(base_size = 11) +
  theme(legend.position = "bottom") +
  scale_y_continuous(breaks = log(c(0.1, 0.5, 0.7, 1, 2, 4, 6, 10, 20)), 
                     labels = \(x) sprintf("%4.2f", exp(x)))  +
  scale_x_continuous(breaks = log(c(5, 10, 20, 50, 150, 400, 1000, 3000, 8000)),
                     labels = \(x) sprintf("%.0f", exp(x))) +
  scale_color_manual(values = colour.palette)

2 Cox regression

2.1 Fit the model with x as the only covariate

cox.rcs <- coxph(Surv(t, y) ~ rcs(x, 4), 
                 data = data.example)
summary(cox.rcs)
Call:
coxph(formula = Surv(t, y) ~ rcs(x, 4), data = data.example)

  n= 7255, number of events= 844 

                 coef exp(coef) se(coef)      z Pr(>|z|)
rcs(x, 4)x    0.04343   1.04439  0.14403  0.302    0.763
rcs(x, 4)x'   0.41755   1.51824  0.45438  0.919    0.358
rcs(x, 4)x'' -0.17594   0.83867  1.54037 -0.114    0.909

             exp(coef) exp(-coef) lower .95 upper .95
rcs(x, 4)x      1.0444     0.9575   0.78752     1.385
rcs(x, 4)x'     1.5182     0.6587   0.62312     3.699
rcs(x, 4)x''    0.8387     1.1924   0.04097    17.169

Concordance= 0.617  (se = 0.011 )
Likelihood ratio test= 203.4  on 3 df,   p=<2e-16
Wald test            = 247  on 3 df,   p=<2e-16
Score (logrank) test = 278.3  on 3 df,   p=<2e-16

2.2 Generate the data to plot the HR as a function of x and save the RCS knots’ location

cox.hr.data <- data_or_hr(model = cox.rcs, 
                          covariate = x, 
                          referent.value = referent.value)

P-value for overall association: <0.0001
P-value for non-linearity: <0.0001
head(cox.hr.data)
         x   Estimate    StdErr          z         P exp(Est.)      2.5%    97.5%
1 1.250270 -0.2939612 0.4260261 -0.6900075 0.4901895 0.7453054 0.3233703 1.717783
2 1.324428 -0.2907406 0.4154375 -0.6998419 0.4840260 0.7477096 0.3312165 1.687928
3 1.398585 -0.2875199 0.4048537 -0.7101822 0.4775911 0.7501216 0.3392498 1.658608
4 1.472743 -0.2842993 0.3942752 -0.7210681 0.4708676 0.7525414 0.3474743 1.629814
5 1.546900 -0.2810787 0.3837024 -0.7325433 0.4638370 0.7549689 0.3558943 1.601538
6 1.621058 -0.2778580 0.3731358 -0.7446566 0.4564794 0.7574044 0.3645139 1.573771

2.3 Plot the HR as a function of x

2.3.1 Basic plot

ggplot(cox.hr.data, 
       aes(x = x, 
           y = log(`exp(Est.)`), 
           ymin = log(`2.5%`), 
           ymax = log(`97.5%`))) +
  geom_line() + 
  geom_ribbon(alpha = .25)  

2.3.2 More refined plot

ggplot(cox.hr.data, 
       aes(x = x, 
           y = log(`exp(Est.)`), 
           ymin = log(`2.5%`), 
           ymax = log(`97.5%`))) +
  geom_histogram(data = data.example, aes(x = x, y = after_stat(density*width*15)),
                 inherit.aes = FALSE,
                 bins = 30,
                 fill = tail(colour.palette, 1)) +
  geom_line(color = colour.palette[1], lwd = 1.5) + 
  geom_ribbon(fill = colour.palette[1], alpha = .25) +
  labs(x = x.axis.label, 
       y = "Hazard Ratio (95% confidence interval)",
       caption = "Vertical dashed line indicates the referent value 
       Vertical dotted lines indicate knots' location") + 
  theme_bw(base_size = 11) +
  geom_vline(xintercept = referent.value, lty = 2) + 
  geom_vline(xintercept = knots.location, lty = 3) + 
  scale_y_continuous(breaks = log(c(0.1, 0.25, 0.5, 1, 2, 4, 6, 10, 20)), 
                     labels = \(x) sprintf("%4.2f", exp(x)),
                     sec.axis = sec_axis(~./15, 
                                         breaks = seq(0, .10, by = .025),
                                         labels = percent_format(), 
                                         name = y2.axis.label)) +
  scale_x_continuous(breaks = log(c(5, 10, 20, 50, 150, 400, 1000, 3000, 8000)),
                     labels = \(x) sprintf("%.0f", exp(x)))

2.4 Generate the data to plot the event probability at different time horizons (years) as a function of x

cox.prob.data <- data_probability(model = cox.rcs, 
                                  covariate = x, 
                                  time = c(.5, 1.5, 2.5, 3.5))
head(cox.prob.data)
  time        x   Estimate       2.5%       97.5%
1  0.5 1.250270 0.01710444 0.03748504 0.007760347
2  0.5 1.324428 0.01715914 0.03685875 0.007945035
3  0.5 1.398585 0.01721401 0.03624379 0.008133861
4  0.5 1.472743 0.01726906 0.03564002 0.008326889
5  0.5 1.546900 0.01732428 0.03504733 0.008524181
6  0.5 1.621058 0.01737968 0.03446561 0.008725798

2.5 Plot the event probability at different time horizons (years) as a function of x

2.5.1 Basic plot

ggplot(cox.prob.data, 
       aes(x = x, 
           y = Estimate, 
           ymin = `2.5%`, 
           ymax = `97.5%`,
           fill = factor(time),
           color = factor(time))) +
  geom_line() + 
  geom_ribbon(alpha = .25)

2.5.2 More refined plot

ggplot(cox.prob.data, 
       aes(x = x, 
           y = Estimate, 
           ymin = `2.5%`, 
           ymax = `97.5%`,
           fill = factor(time),
           color = factor(time))) +
  geom_histogram(data = data.example, aes(x = x, y = after_stat(density*width*5)), 
                 inherit.aes = FALSE,
                 bins = 30,
                 fill = tail(colour.palette, 1)) +
  geom_line(lwd = 1.5) + 
  geom_ribbon(alpha = .25, color = NA) + 
  labs(x = x.axis.label, 
       y = "Event probability (%)\n(95% confidence interval)",
       caption = "Vertical dotted lines indicate knots' location",
       fill  = "Time horizon (years)",
       color = "Time horizon (years)") + 
  theme_bw(base_size = 11) +
  theme(legend.position = "bottom") +
  geom_vline(xintercept = knots.location, lty = 3) + 
  scale_y_continuous(breaks = seq(0, 1, 0.1), 
                     limits = c(0, 1),
                     labels = percent_format(),
                     sec.axis = sec_axis(~./5, 
                                         breaks = seq(0, .10, by = .025),
                                         labels = percent_format(), 
                                         name = y2.axis.label))  +
  scale_x_continuous(breaks = log(c(5, 10, 20, 50, 150, 400, 1000, 3000, 8000)),
                     labels = \(x) sprintf("%.0f", exp(x))) + 
  scale_fill_manual(values = colour.palette) +
  scale_color_manual(values = colour.palette)

2.6 Dealing with additional coavariates (i.e., covariates other than x)

2.6.1 Conditional model

Add the covariate age to the model and model it with a second-degree polynomial transform.

cox.rcs.cov <- coxph(Surv(t, y) ~ rcs(x, 4) + poly(age, 2), 
                     data = data.example)
summary(cox.rcs.cov)
Call:
coxph(formula = Surv(t, y) ~ rcs(x, 4) + poly(age, 2), data = data.example)

  n= 7255, number of events= 844 

                    coef  exp(coef)   se(coef)      z Pr(>|z|)    
rcs(x, 4)x     1.465e-01  1.158e+00  1.461e-01  1.003    0.316    
rcs(x, 4)x'    4.100e-01  1.507e+00  4.576e-01  0.896    0.370    
rcs(x, 4)x''  -3.247e-01  7.227e-01  1.546e+00 -0.210    0.834    
poly(age, 2)1  6.568e+01  3.351e+28  4.041e+00 16.253   <2e-16 ***
poly(age, 2)2  1.213e+00  3.364e+00  2.954e+00  0.411    0.681    
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

              exp(coef) exp(-coef) lower .95 upper .95
rcs(x, 4)x    1.158e+00  8.637e-01 8.695e-01 1.542e+00
rcs(x, 4)x'   1.507e+00  6.636e-01 6.145e-01 3.695e+00
rcs(x, 4)x''  7.227e-01  1.384e+00 3.489e-02 1.497e+01
poly(age, 2)1 3.351e+28  2.984e-29 1.217e+25 9.225e+31
poly(age, 2)2 3.364e+00  2.973e-01 1.029e-02 1.100e+03

Concordance= 0.745  (se = 0.009 )
Likelihood ratio test= 706.8  on 5 df,   p=<2e-16
Wald test            = 764.5  on 5 df,   p=<2e-16
Score (logrank) test = 915.4  on 5 df,   p=<2e-16

2.6.1.1 Hazard Ratio

Generate the data to plot the HRs (conditional on age) as a function of x and save the RCS knots’ location.

cox.hr.data.cov <- data_or_hr(model = cox.rcs.cov, 
                              covariate = x, 
                              referent.value = referent.value)

P-value for overall association: <0.0001
P-value for non-linearity: 0.000102
head(cox.hr.data.cov)
         x   Estimate    StdErr         z          P exp(Est.)      2.5%    97.5%
1 1.250270 -0.6812737 0.4325575 -1.574990 0.11525879 0.5059721 0.2167369 1.181191
2 1.324428 -0.6704106 0.4218139 -1.589352 0.11198104 0.5114985 0.2237668 1.169211
3 1.398585 -0.6595476 0.4110751 -1.604446 0.10861587 0.5170852 0.2310226 1.157364
4 1.472743 -0.6486845 0.4003413 -1.620329 0.10516171 0.5227330 0.2385112 1.145648
5 1.546900 -0.6378214 0.3896131 -1.637063 0.10161720 0.5284424 0.2462399 1.134062
6 1.621058 -0.6269584 0.3788910 -1.654720 0.09798134 0.5342142 0.2542160 1.122608

Plot the HR, conditional on age, as a function of x.

ggplot(cox.hr.data.cov, 
       aes(x = x, 
           y = log(`exp(Est.)`), 
           ymin = log(`2.5%`), 
           ymax = log(`97.5%`))) +
  geom_histogram(data = data.example, aes(x = x, y = after_stat(density*width*15)),
                 inherit.aes = FALSE,
                 bins = 30,
                 fill = tail(colour.palette, 1)) +
  geom_line(color = colour.palette[1], lwd = 1.5) + 
  geom_ribbon(fill = colour.palette[1], alpha = .25) +
  labs(x = x.axis.label, 
       y = "Hazard Ratio (95% confidence interval)",
       caption = "Vertical dashed line indicates the referent value 
       Vertical dotted lines indicate knots' location") + 
  theme_bw(base_size = 11) +
  geom_vline(xintercept = referent.value, lty = 2) + 
  geom_vline(xintercept = knots.location, lty = 3) + 
  scale_y_continuous(breaks = log(c(0.1, 0.25, 0.5, 1, 2, 4, 6, 10, 20)), 
                     labels = \(x) sprintf("%4.2f", exp(x)),
                     sec.axis = sec_axis(~./15, 
                                         breaks = seq(0, .10, by = .025),
                                         labels = percent_format(), 
                                         name = y2.axis.label))   +
  scale_x_continuous(breaks = log(c(5, 10, 20, 50, 150, 400, 1000, 3000, 8000)),
                     labels = \(x) sprintf("%.0f", exp(x)))

2.6.1.2 Event probability

Generate the data to plot the event probability at different time horizons (conditional on age) as a function of x. Fix the values of age at the 10th and 90th percentile of its distribution: 52, 69 years and the time horizon at 1 and 3 years.

newdata.cov <- data.frame(age = quantile(data.example$age, c(.1, .9)))

cox.prob.data.cov <- data_probability(model = cox.rcs.cov, 
                                      covariate = x, 
                                      newdata = newdata.cov,
                                      time = c(1, 3))
head(cox.prob.data.cov)
  time        x age    Estimate       2.5%       97.5%
1    1 1.250270  52 0.006450633 0.01473741 0.002816826
2    1 1.324428  52 0.006520858 0.01460146 0.002905568
3    1 1.398585  52 0.006591845 0.01446754 0.002996939
4    1 1.472743  52 0.006663603 0.01433566 0.003090998
5    1 1.546900  52 0.006736139 0.01420587 0.003187803
6    1 1.621058  52 0.006809462 0.01407821 0.003287411
ggplot(cox.prob.data.cov, 
       aes(x = x, 
           y = Estimate, 
           ymin = `2.5%`, 
           ymax = `97.5%`,
           fill = factor(time),
           color = factor(time),
           linetype = factor(age))) +
  geom_histogram(data = data.example, aes(x = x, y = after_stat(density*width*5)), 
                 inherit.aes = FALSE,
                 bins = 30,
                 fill = tail(colour.palette, 1)) +
  geom_line(lwd = 1.5) + 
  geom_ribbon(alpha = .25, color = NA) + 
  labs(x = x.axis.label, 
       y = "Event probability (%)\n(95% confidence interval)",
       caption = "Vertical dotted lines indicate knots' location",
       fill  = "Time horizion (years)",
       color = "Time horizion (years)",
       linetype = "Age (years)") + 
  theme_bw(base_size = 11) +
  theme(legend.position = "bottom") +
  geom_vline(xintercept = knots.location, lty = 3) + 
  scale_y_continuous(breaks = seq(0, 1, 0.1), 
                     limits = c(0, 1),
                     labels = percent_format(),
                     sec.axis = sec_axis(~./5, 
                                         breaks = seq(0, .10, by = .025),
                                         labels = percent_format(), 
                                         name = y2.axis.label))  +
  scale_x_continuous(breaks = log(c(5, 10, 20, 50, 150, 400, 1000, 3000, 8000)),
                     labels = \(x) sprintf("%.0f", exp(x))) + 
  scale_fill_manual(values = colour.palette) +
  scale_color_manual(values = colour.palette)

2.6.2 Marginal model

Generate inverse probability weights (IPW) to balance the distribution of the covariate age across levels of x (see help(WeightIt)).

ipw.wgtit <- weightit(x ~ age,
                      data = data.example,
                      method = "cbps")
data.example$ipweights <- ipw.wgtit$weights

Assess balance numerically (see help(cobalt)).

bal.tab(ipw.wgtit, 
        stats = c("c", "k"), 
        un = TRUE, 
        thresholds = c(cor = .1), 
        poly = 2)
Balance Measures
        Type Corr.Un Corr.Adj    R.Threshold KS.Target.Adj
age  Contin. -0.1109   0.0000 Balanced, <0.1        0.0025
age² Contin. -0.1079   0.0034 Balanced, <0.1        0.0025

Balance tally for treatment correlations
                   count
Balanced, <0.1         2
Not Balanced, >0.1     0

Variable with the greatest treatment correlation
 Variable Corr.Adj    R.Threshold
     age²   0.0034 Balanced, <0.1

Effective sample sizes
             Total
Unadjusted 7255.  
Adjusted   7160.23

Fit a marginal model using the IPWs generated previously.

cox.rcs.ipw <- coxph(Surv(t, y) ~ rcs(x, 4), 
                     data = data.example,
                     weights =  ipweights,
                     robust = TRUE)
summary(cox.rcs.ipw)
Call:
coxph(formula = Surv(t, y) ~ rcs(x, 4), data = data.example, 
    weights = ipweights, robust = TRUE)

  n= 7255, number of events= 844 

                coef exp(coef) se(coef) robust se      z Pr(>|z|)
rcs(x, 4)x    0.1021    1.1075   0.1526    0.1444  0.707    0.480
rcs(x, 4)x'   0.5280    1.6955   0.4683    0.4583  1.152    0.249
rcs(x, 4)x'' -0.6509    0.5216   1.5670    1.5542 -0.419    0.675

             exp(coef) exp(-coef) lower .95 upper .95
rcs(x, 4)x      1.1075     0.9029   0.83446     1.470
rcs(x, 4)x'     1.6955     0.5898   0.69052     4.163
rcs(x, 4)x''    0.5216     1.9172   0.02479    10.972

Concordance= 0.641  (se = 0.01 )
Likelihood ratio test= 286.2  on 3 df,   p=<2e-16
Wald test            = 316.6  on 3 df,   p=<2e-16
Score (logrank) test = 399.9  on 3 df,   p=<2e-16,   Robust = 160.1  p=<2e-16

  (Note: the likelihood ratio and score tests assume independence of
     observations within a cluster, the Wald and robust score tests do not).

2.6.2.1 Hazard ratio

Generate the data to plot the HRs (averaged over the distribution of age) as a function of x and save the RCS knots’ location.

cox.hr.data.ipw <- data_or_hr(model = cox.rcs.ipw, 
                              covariate = x, 
                              referent.value = referent.value)

P-value for overall association: <0.0001
P-value for non-linearity: <0.0001
head(cox.hr.data.ipw)
         x   Estimate    StdErr         z         P exp(Est.)      2.5%    97.5%
1 1.250270 -0.5484782 0.4265002 -1.285997 0.1984439 0.5778285 0.2504732 1.333020
2 1.324428 -0.5409069 0.4158843 -1.300619 0.1933890 0.5822200 0.2576830 1.315493
3 1.398585 -0.5333355 0.4052734 -1.315990 0.1881775 0.5866449 0.2650977 1.298209
4 1.472743 -0.5257641 0.3946677 -1.332169 0.1828047 0.5911035 0.2727230 1.281166
5 1.546900 -0.5181928 0.3840679 -1.349222 0.1772658 0.5955959 0.2805644 1.264361
6 1.621058 -0.5106214 0.3734744 -1.367219 0.1715566 0.6001225 0.2886276 1.247791

Plot the HR as a function of x.

ggplot(cox.hr.data.ipw, 
       aes(x = x, 
           y = log(`exp(Est.)`), 
           ymin = log(`2.5%`), 
           ymax = log(`97.5%`))) +
  geom_histogram(data = data.example, aes(x = x, y = after_stat(density*width*15)),
                 inherit.aes = FALSE,
                 bins = 30,
                 fill = tail(colour.palette, 1)) +
  geom_line(color = colour.palette[1], lwd = 1.5) + 
  geom_ribbon(fill = colour.palette[1], alpha = .25) +
  labs(x = x.axis.label, 
       y = "Hazard Ratio (95% confidence interval)",
       caption = "Vertical dashed line indicates the referent value 
       Vertical dotted lines indicate knots' location") + 
  theme_bw(base_size = 11) +
  geom_vline(xintercept = referent.value, lty = 2) + 
  geom_vline(xintercept = knots.location, lty = 3) + 
  scale_y_continuous(breaks = log(c(0.1, 0.25, 0.5, 1, 2, 4, 6, 10, 20)), 
                     labels = \(x) sprintf("%4.2f", exp(x)),
                     sec.axis = sec_axis(~./15, 
                                         breaks = seq(0, .10, by = .025),
                                         labels = percent_format(), 
                                         name = y2.axis.label)) +
  scale_x_continuous(breaks = log(c(5, 10, 20, 50, 150, 400, 1000, 3000, 8000)),
                     labels = \(x) sprintf("%.0f", exp(x)))

2.6.2.2 Event probability

Generate the data to plot the event probability (averaged over the distribution of age) as a function of x at different time horizons (0.5, 1.5, 2.5, 3.5 years).

cox.prob.data.ipw <- data_probability(model = cox.rcs.ipw, 
                                      covariate = x, 
                                      time = c(.5, 1.5, 2.5, 3.5))
head(cox.prob.data.ipw)
  time        x   Estimate       2.5%       97.5%
1  0.5 1.250270 0.01299518 0.02861233 0.005876494
2  0.5 1.324428 0.01309330 0.02825202 0.006042857
3  0.5 1.398585 0.01319215 0.02789703 0.006213724
4  0.5 1.472743 0.01329174 0.02754734 0.006389194
5  0.5 1.546900 0.01339208 0.02720294 0.006569366
6  0.5 1.621058 0.01349317 0.02686383 0.006754337

Plot the event probability as a function of x.

ggplot(cox.prob.data.ipw, 
       aes(x = x, 
           y = Estimate, 
           ymin = `2.5%`, 
           ymax = `97.5%`,
           fill = factor(time),
           color = factor(time))) +
  geom_histogram(data = data.example, aes(x = x, y = after_stat(density*width*5)), 
                 inherit.aes = FALSE,
                 bins = 30,
                 fill = tail(colour.palette, 1)) +
  geom_line(lwd = 1.5) + 
  geom_ribbon(alpha = .25, color = NA) + 
  labs(x = x.axis.label, 
       y = "Event probability (%)\n(95% confidence interval)",
       caption = "Vertical dotted lines indicate knots' location",
       fill  = "Time horizon (years)",
       color = "Time horizon (years)") + 
  theme_bw(base_size = 11) +
  theme(legend.position = "bottom") +
  geom_vline(xintercept = knots.location, lty = 3) + 
  scale_y_continuous(breaks = seq(0, 1, 0.1), 
                     limits = c(0, 1),
                     labels = percent_format(),
                     sec.axis = sec_axis(~./5, 
                                         breaks = seq(0, .10, by = .025),
                                         labels = percent_format(), 
                                         name = y2.axis.label))  +
  scale_x_continuous(breaks = log(c(5, 10, 20, 50, 150, 400, 1000, 3000, 8000)),
                     labels = \(x) sprintf("%.0f", exp(x))) + 
  scale_fill_manual(values = colour.palette) +
  scale_color_manual(values = colour.palette)

2.7 Explore how the shape of the HR as a function of x changes as the number of the knots changes

cox.hr.data.nknots <- lapply(3:8, \(n.knots) {
  cox.rcs <- coxph(Surv(t, y) ~ rcs(x, n.knots), 
                   data = data.example)
  
  cat("--->", n.knots, "knots, AIC:", AIC(cox.rcs), "\n")
  
  data <- data_or_hr(model = cox.rcs, 
                     covariate = x, 
                     referent.value = referent.value)
  data$n.knots <- n.knots
  data
})
---> 3 knots, AIC: 14377.83 

P-value for overall association: <0.0001
P-value for non-linearity: <0.0001

---> 4 knots, AIC: 14380.5 

P-value for overall association: <0.0001
P-value for non-linearity: <0.0001

---> 5 knots, AIC: 14379.88 

P-value for overall association: <0.0001
P-value for non-linearity: <0.0001

---> 6 knots, AIC: 14381.75 

P-value for overall association: <0.0001
P-value for non-linearity: <0.0001

---> 7 knots, AIC: 14383.81 

P-value for overall association: <0.0001
P-value for non-linearity: <0.0001

---> 8 knots, AIC: 14385.35 

P-value for overall association: <0.0001
P-value for non-linearity: 0.000154
ggplot(do.call("rbind", cox.hr.data.nknots), 
       aes(x = x, 
           y = log(`exp(Est.)`),
           color = factor(n.knots))) +
  geom_line(lwd = 1.5) + 
  geom_vline(xintercept = referent.value, lty = 2) + 
  labs(x = x.axis.label, 
       y = "Hazard Ratio (95% confidence interval)",
       color = "Number of knots") + 
  theme_bw(base_size = 11) +
  theme(legend.position = "bottom") +
  scale_y_continuous(breaks = log(c(0.1, 0.5, 0.7, 1, 2, 4, 6, 10, 20)), 
                     labels = \(x) sprintf("%4.2f", exp(x)))   +
  scale_x_continuous(breaks = log(c(5, 10, 20, 50, 150, 400, 1000, 3000, 8000)),
                     labels = \(x) sprintf("%.0f", exp(x))) +
  scale_color_manual(values = colour.palette)

Session info

sessionInfo()
R version 4.3.1 (2023-06-16)
Platform: x86_64-apple-darwin20 (64-bit)
Running under: macOS Sonoma 14.7

Matrix products: default
BLAS:   /Library/Frameworks/R.framework/Versions/4.3-x86_64/Resources/lib/libRblas.0.dylib 
LAPACK: /Library/Frameworks/R.framework/Versions/4.3-x86_64/Resources/lib/libRlapack.dylib;  LAPACK version 3.11.0

locale:
[1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8

time zone: Europe/Stockholm
tzcode source: internal

attached base packages:
[1] stats     graphics  grDevices utils     datasets  methods   base     

other attached packages:
[1] glmtoolbox_0.1.12 cobalt_4.5.5      WeightIt_1.4.0    scales_1.3.0      ggplot2_3.5.1    
[6] survival_3.5-5    rms_6.7-1         Hmisc_5.1-0       readxl_1.4.2     

loaded via a namespace (and not attached):
 [1] tidyselect_1.2.0    rootSolve_1.8.2.4   farver_2.1.1        dplyr_1.1.2        
 [5] fastmap_1.2.0       TH.data_1.1-2       digest_0.6.37       rpart_4.1.19       
 [9] lifecycle_1.0.4     cluster_2.1.4       statmod_1.5.0       magrittr_2.0.3     
[13] compiler_4.3.1      rlang_1.1.4         tools_4.3.1         utf8_1.2.3         
[17] yaml_2.3.10         data.table_1.15.4   knitr_1.48          labeling_0.4.2     
[21] htmlwidgets_1.6.2   plyr_1.8.8          SuppDists_1.1-9.9   multcomp_1.4-25    
[25] polspline_1.1.23    withr_2.5.0         foreign_0.8-84      purrr_1.0.1        
[29] numDeriv_2016.8-1.1 nnet_7.3-19         grid_4.3.1          aod_1.3.2          
[33] fansi_1.0.4         colorspace_2.1-0    MASS_7.3-60         etm_1.1.1          
[37] cli_3.6.3           mvtnorm_1.2-2       rmarkdown_2.28      crayon_1.5.2       
[41] generics_0.1.3      RcppParallel_5.1.7  rstudioapi_0.14     stringr_1.5.0      
[45] splines_4.3.1       parallel_4.3.1      cellranger_1.1.0    base64enc_0.1-3    
[49] vctrs_0.6.3         Matrix_1.6-1.1      sandwich_3.0-2      jsonlite_1.8.9     
[53] SparseM_1.81        Formula_1.2-5       htmlTable_2.4.1     Epi_2.47.1         
[57] tidyr_1.3.0         cmprsk_2.2-11       glue_1.8.0          chk_0.10.0         
[61] codetools_0.2-19    stringi_1.7.12      gtable_0.3.3        munsell_0.5.0      
[65] tibble_3.2.1        pillar_1.9.0        htmltools_0.5.8.1   quantreg_5.97      
[69] R6_2.5.1            zigg_0.0.2          evaluate_1.0.1      lattice_0.21-8     
[73] backports_1.4.1     Rfast_2.1.5.1       broom_1.0.5         MatrixModels_0.5-2 
[77] Rcpp_1.0.10         gridExtra_2.3       nlme_3.1-162        checkmate_2.2.0    
[81] mgcv_1.8-42         xfun_0.48           zoo_1.8-12          pkgconfig_2.0.3