# Load packages
library(readxl)
library(rms)
library(survival)
library(ggplot2)
library(scales)
library(WeightIt)
library(cobalt)
library(glmtoolbox)
# Load dataset
<- read_excel("data_example.xlsx")
data.example $y <- data.example$status
data.example$t <- data.example$time/365.25
data.example
# Load functions
source("R_functions.R")
Estimating and Presenting Non-Linear Associations with Restricted Cubic Splines
The R functions data_or_hr
and data_probability
are available from our RCSpline GitHub repository: https://andreabellavia.github.io/RCSplines/ (the following packages must be installed: Epi
, aod
, survival
, Hmisc
, glmtoolbox
).
They can also be downloaded directly from this link: https://andreabellavia.github.io/RCSplines/continuous/R/R_functions.R.
Load the packages, the dataset, and the functions.
Graphical options.
# Colours for plotting
<- c("#00798c", "#d1495b", "#edae49",
colour.palette "#66a182", "#2e4057", "grey")
# Set referent value for covariate X
<- median(data.example$x)
referent.value
# Label x-axis
<- "log(NT-proBNP)"
x.axis.label
# Label secondary y-axis (histogram)
<- "Distribution of log(NT-proBNP)" y2.axis.label
1 Logistic regression
1.1 Fit the model with x
as the only covariate
<- glm(y ~ rcs(x, 4),
logit.rcs data = data.example,
family = binomial(link = "logit"))
summary(logit.rcs)
Call:
glm(formula = y ~ rcs(x, 4), family = binomial(link = "logit"),
data = data.example)
Coefficients:
Estimate Std. Error z value Pr(>|z|)
(Intercept) -2.67724 0.60773 -4.405 1.06e-05 ***
rcs(x, 4)x 0.05331 0.15126 0.352 0.725
rcs(x, 4)x' 0.39251 0.48467 0.810 0.418
rcs(x, 4)x'' 0.12663 1.66549 0.076 0.939
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
(Dispersion parameter for binomial family taken to be 1)
Null deviance: 5217.2 on 7254 degrees of freedom
Residual deviance: 5010.1 on 7251 degrees of freedom
AIC: 5018.1
Number of Fisher Scoring iterations: 5
1.2 Generate the data to plot the OR as a function of x
and save the RCS knots’ location
<- data_or_hr(model = logit.rcs,
logit.or.data covariate = x,
referent.value = referent.value)
P-value for overall association: <0.0001
P-value for non-linearity: <0.0001
<- attr(logit.or.data, "knots.location")
knots.location
head(logit.or.data)
x Estimate StdErr z P exp(Est.) 2.5% 97.5%
1 1.250270 -0.3249129 0.4461308 -0.7282906 0.4664357 0.7225903 0.3014012 1.732365
2 1.324428 -0.3209598 0.4350176 -0.7378088 0.4606306 0.7254524 0.3092582 1.701753
3 1.398585 -0.3170068 0.4239099 -0.7478164 0.4545709 0.7283258 0.3173167 1.671701
4 1.472743 -0.3130537 0.4128081 -0.7583517 0.4482405 0.7312106 0.3255814 1.642198
5 1.546900 -0.3091007 0.4017128 -0.7694570 0.4416221 0.7341068 0.3340571 1.613236
6 1.621058 -0.3051477 0.3906244 -0.7811792 0.4346971 0.7370145 0.3427488 1.584806
1.3 Plot the OR as a function of x
1.3.1 Basic plot
ggplot(logit.or.data,
aes(x = x,
y = log(`exp(Est.)`),
ymin = log(`2.5%`),
ymax = log(`97.5%`))) +
geom_line() +
geom_ribbon(alpha = .25)
1.3.2 More refined plot
ggplot(logit.or.data,
aes(x = x,
y = log(`exp(Est.)`),
ymin = log(`2.5%`),
ymax = log(`97.5%`))) +
geom_histogram(data = data.example, aes(x = x, y = after_stat(density*width*15)),
inherit.aes = FALSE,
bins = 30,
fill = tail(colour.palette, 1)) +
geom_line(color = colour.palette[1], lwd = 1.5) +
geom_ribbon(fill = colour.palette[1], alpha = .25) +
labs(x = x.axis.label,
y = "Odds Ratio (95% confidence interval)",
caption = "Vertical dashed line indicates the referent value
Vertical dotted lines indicate knots' location") +
theme_bw(base_size = 11) +
geom_vline(xintercept = referent.value, lty = 2) +
geom_vline(xintercept = knots.location, lty = 3) +
scale_y_continuous(breaks = log(c(0.1, 0.25, 0.5, 1, 2, 4, 6, 10, 20)),
labels = \(x) sprintf("%4.2f", exp(x)),
sec.axis = sec_axis(~./15,
breaks = seq(0, .10, by = .025),
labels = percent_format(),
name = y2.axis.label)) +
scale_x_continuous(breaks = log(c(5, 10, 20, 50, 150, 400, 1000, 3000, 8000)),
labels = \(x) sprintf("%.0f", exp(x)))
1.4 Generate the data to plot the event probability as a function of x
<- data_probability(model = logit.rcs,
logit.prob.data covariate = x)
head(logit.prob.data)
x Estimate 2.5% 97.5%
1 1.250270 0.06846001 0.03122661 0.1435124
2 1.324428 0.06871254 0.03201077 0.1413498
3 1.398585 0.06896593 0.03281337 0.1392167
4 1.472743 0.06922019 0.03363476 0.1371130
5 1.546900 0.06947531 0.03447526 0.1350386
6 1.621058 0.06973130 0.03533519 0.1329934
1.5 Plot the event probability as a function of x
1.5.1 Basic plot
ggplot(logit.prob.data,
aes(x = x,
y = Estimate,
ymin = `2.5%`,
ymax = `97.5%`)) +
geom_line() +
geom_ribbon(alpha = .25)
1.5.2 More refined plot
ggplot(logit.prob.data,
aes(x = x,
y = Estimate,
ymin = `2.5%`,
ymax = `97.5%`)) +
geom_histogram(data = data.example, aes(x = x, y = after_stat(density*width*5)),
inherit.aes = FALSE,
bins = 30,
fill = tail(colour.palette, 1)) +
geom_line(color = colour.palette[1], lwd = 1.5) +
geom_ribbon(fill = colour.palette[1], alpha = .25) +
labs(x = x.axis.label,
y = "Probability (%) (95% confidence interval)",
caption = "Vertical dotted lines indicate knots' location") +
theme_bw(base_size = 11) +
geom_vline(xintercept = knots.location, lty = 3) +
scale_y_continuous(breaks = seq(0, 1, 0.1),
limits = c(0, 1),
labels = percent_format(),
sec.axis = sec_axis(~./5,
breaks = seq(0, .10, by = .025),
labels = percent_format(),
name = y2.axis.label)) +
scale_x_continuous(breaks = log(c(5, 10, 20, 50, 150, 400, 1000, 3000, 8000)),
labels = \(x) sprintf("%.0f", exp(x)))
1.6 Dealing with additional coavariates (i.e., covariates other than x
)
1.6.1 Conditional model
Add the covariate age
to the model and model it with a second-degree polynomial transform.
<- glm(y ~ rcs(x, 4) + poly(age, 2),
logit.rcs.cov data = data.example,
family = binomial(link = "logit"))
summary(logit.rcs.cov)
Call:
glm(formula = y ~ rcs(x, 4) + poly(age, 2), family = binomial(link = "logit"),
data = data.example)
Coefficients:
Estimate Std. Error z value Pr(>|z|)
(Intercept) -3.55214 0.63751 -5.572 2.52e-08 ***
rcs(x, 4)x 0.17106 0.15822 1.081 0.280
rcs(x, 4)x' 0.42445 0.50651 0.838 0.402
rcs(x, 4)x'' -0.03209 1.74171 -0.018 0.985
poly(age, 2)1 72.45505 4.28424 16.912 < 2e-16 ***
poly(age, 2)2 6.60949 3.59505 1.838 0.066 .
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
(Dispersion parameter for binomial family taken to be 1)
Null deviance: 5217.2 on 7254 degrees of freedom
Residual deviance: 4491.2 on 7249 degrees of freedom
AIC: 4503.2
Number of Fisher Scoring iterations: 6
1.6.1.1 Odds Ratio
Generate the data to plot the ORs (conditional on age
) as a function of x
and save the RCS knots’ location.
<- data_or_hr(model = logit.rcs.cov,
logit.or.data.cov covariate = x,
referent.value = referent.value)
P-value for overall association: <0.0001
P-value for non-linearity: <0.0001
<- attr(logit.or.data.cov, "knots.location")
knots.location
head(logit.or.data.cov)
x Estimate StdErr z P exp(Est.) 2.5% 97.5%
1 1.250270 -0.7801561 0.4668110 -1.671246 0.09467302 0.4583344 0.1835829 1.144281
2 1.324428 -0.7674707 0.4551864 -1.686058 0.09178465 0.4641857 0.1902113 1.132784
3 1.398585 -0.7547852 0.4435677 -1.701624 0.08882593 0.4701116 0.1970768 1.121415
4 1.472743 -0.7420998 0.4319551 -1.718002 0.08579619 0.4761131 0.2041876 1.110174
5 1.546900 -0.7294143 0.4203491 -1.735258 0.08269507 0.4821913 0.2115522 1.099060
6 1.621058 -0.7167289 0.4087505 -1.753463 0.07952256 0.4883471 0.2191793 1.088072
Plot the OR, conditional on age
, as a function of x
.
ggplot(logit.or.data.cov,
aes(x = x,
y = log(`exp(Est.)`),
ymin = log(`2.5%`),
ymax = log(`97.5%`))) +
geom_histogram(data = data.example, aes(x = x, y = after_stat(density*width*15)),
inherit.aes = FALSE,
bins = 30,
fill = tail(colour.palette, 1)) +
geom_line(color = colour.palette[1], lwd = 1.5) +
geom_ribbon(fill = colour.palette[1], alpha = .25) +
labs(x = x.axis.label,
y = "Odds Ratio (95% confidence interval)",
caption = "Vertical dashed line indicates the referent value
Vertical dotted lines indicate knots' location") +
theme_bw(base_size = 11) +
geom_vline(xintercept = referent.value, lty = 2) +
geom_vline(xintercept = knots.location, lty = 3) +
scale_y_continuous(breaks = log(c(0.1, 0.25, 0.5, 1, 2, 4, 6, 10, 20)),
labels = \(x) sprintf("%4.2f", exp(x)),
sec.axis = sec_axis(~./15,
breaks = seq(0, .10, by = .025),
labels = percent_format(),
name = y2.axis.label)) +
scale_x_continuous(breaks = log(c(5, 10, 20, 50, 150, 400, 1000, 3000, 8000)),
labels = \(x) sprintf("%.0f", exp(x)))
1.6.1.2 Event probabilty
Generate the data to plot the event probability (conditional on age
) as a function of x
. Fix the values of age
at the 10th, median, and 90th percentile of its distribution: 52, 61, 69 years.
<- data.frame(age = quantile(data.example$age, c(.1, .5, .9)))
newdata.cov
<- data_probability(model = logit.rcs.cov,
logit.prob.data.cov covariate = x,
newdata = newdata.cov)
head(logit.prob.data.cov)
x age Estimate 2.5% 97.5%
1 1.250270 52 0.01200340 0.004950772 0.02881195
2 1.324428 52 0.01215478 0.005124542 0.02855282
3 1.398585 52 0.01230804 0.005304097 0.02829742
4 1.472743 52 0.01246321 0.005489593 0.02804581
5 1.546900 52 0.01262031 0.005681185 0.02779807
6 1.621058 52 0.01277936 0.005879030 0.02755427
ggplot(logit.prob.data.cov,
aes(x = x,
y = Estimate,
ymin = `2.5%`,
ymax = `97.5%`,
color = factor(age),
fill = factor(age))) +
geom_histogram(data = data.example, aes(x = x, y = after_stat(density*width*5)),
inherit.aes = FALSE,
bins = 30,
fill = tail(colour.palette, 1)) +
geom_line(lwd = 1.5) +
geom_ribbon(alpha = .25, linewidth = 0) +
labs(x = x.axis.label,
y = "Probability (%) (95% confidence interval)",
caption = "Vertical dotted lines indicate knots' location",
color = "Age (years)",
fill = "Age (years)") +
theme_bw(base_size = 11) +
theme(legend.position = "bottom") +
geom_vline(xintercept = knots.location, lty = 3) +
scale_color_manual(values = colour.palette) +
scale_fill_manual(values = colour.palette) +
scale_y_continuous(breaks = seq(0, 1, 0.1),
limits = c(0, 1),
labels = percent_format(),
sec.axis = sec_axis(~./5,
breaks = seq(0, .10, by = .025),
labels = percent_format(),
name = y2.axis.label)) +
scale_x_continuous(breaks = log(c(5, 10, 20, 50, 150, 400, 1000, 3000, 8000)),
labels = \(x) sprintf("%.0f", exp(x)))
1.6.2 Marginal model
Generate inverse probability weights (IPW) to balance the distribution of the covariate age
across levels of x
(see help(WeightIt)
).
<- weightit(x ~ age,
ipw.wgtit data = data.example,
method = "cbps")
$ipweights <- ipw.wgtit$weights data.example
Assess balance numerically (see help(cobalt)
).
bal.tab(ipw.wgtit,
stats = c("c", "k"),
un = TRUE,
thresholds = c(cor = .1),
poly = 2)
Balance Measures
Type Corr.Un Corr.Adj R.Threshold KS.Target.Adj
age Contin. -0.1109 0.0000 Balanced, <0.1 0.0025
age² Contin. -0.1079 0.0034 Balanced, <0.1 0.0025
Balance tally for treatment correlations
count
Balanced, <0.1 2
Not Balanced, >0.1 0
Variable with the greatest treatment correlation
Variable Corr.Adj R.Threshold
age² 0.0034 Balanced, <0.1
Effective sample sizes
Total
Unadjusted 7255.
Adjusted 7160.23
Fit a marginal model using the IPWs generated previously (note: use the function glmgee
from the glmtoolbox
package to obtain robust standard errors).
<- glmgee(y ~ rcs(x, 4),
logit.rcs.ipw data = data.example,
family = binomial(link = "logit"),
weights = ipweights,
id = id)
summary(logit.rcs.ipw)
Sample size
Number of observations: 7255
Number of clusters: 7255
Cluster size: 1
*************************************************************
Model
Variance function: binomial
Link function: logit
Correlation structure: Independence
*************************************************************
Coefficients
Estimate Std.Error z-value Pr(>|z|)
(Intercept) -3.02043 0.60992 -4.95221 7.3374e-07
rcs(x, 4)x 0.11516 0.15199 0.75764 0.44867
rcs(x, 4)x' 0.49022 0.49159 0.99722 0.31866
rcs(x, 4)x'' -0.25606 1.69526 -0.15105 0.87994
Dispersion 0.99961
*************************************************************
Working correlation
[1]
[1] 1
1.6.2.1 Odds Ratio
Generate the data to plot the ORs (averaged over the distribution of age
) as a function of x
and save the RCS knots’ location.
<- data_or_hr(model = logit.rcs.ipw,
logit.or.data.ipw covariate = x,
referent.value = referent.value)
P-value for overall association: <0.0001
P-value for non-linearity: <0.0001
<- attr(logit.or.data.ipw, "knots.location")
knots.location
head(logit.or.data.ipw)
x Estimate StdErr z P exp(Est.) 2.5% 97.5%
1 1.250270 -0.5879314 0.4472455 -1.314561 0.1886576 0.5554752 0.2311897 1.334630
2 1.324428 -0.5793917 0.4360814 -1.328632 0.1839694 0.5602391 0.2383308 1.316942
3 1.398585 -0.5708520 0.4249229 -1.343425 0.1791345 0.5650438 0.2456898 1.299503
4 1.472743 -0.5623123 0.4137706 -1.358995 0.1741481 0.5698898 0.2532729 1.282310
5 1.546900 -0.5537726 0.4026248 -1.375406 0.1690057 0.5747773 0.2610867 1.265361
6 1.621058 -0.5452329 0.3914863 -1.392725 0.1637029 0.5797068 0.2691377 1.248654
Plot the OR as a function of x
.
ggplot(logit.or.data.ipw,
aes(x = x,
y = log(`exp(Est.)`),
ymin = log(`2.5%`),
ymax = log(`97.5%`))) +
geom_histogram(data = data.example, aes(x = x, y = after_stat(density*width*15)),
inherit.aes = FALSE,
bins = 30,
fill = tail(colour.palette, 1)) +
geom_line(color = colour.palette[1], lwd = 1.5) +
geom_ribbon(fill = colour.palette[1], alpha = .25) +
labs(x = x.axis.label,
y = "Odds Ratio (95% confidence interval)",
caption = "Vertical dashed line indicates the referent value
Vertical dotted lines indicate knots' location") +
theme_bw(base_size = 11) +
geom_vline(xintercept = referent.value, lty = 2) +
geom_vline(xintercept = knots.location, lty = 3) +
scale_y_continuous(breaks = log(c(0.1, 0.25, 0.5, 1, 2, 4, 6, 10, 20)),
labels = \(x) sprintf("%4.2f", exp(x)),
sec.axis = sec_axis(~./15,
breaks = seq(0, .10, by = .025),
labels = percent_format(),
name = y2.axis.label)) +
scale_x_continuous(breaks = log(c(5, 10, 20, 50, 150, 400, 1000, 3000, 8000)),
labels = \(x) sprintf("%.0f", exp(x)))
1.6.2.2 Event probability
Generate the data to plot the event probability (averaged over the distribution of age
) as a function of x
.
<- data_probability(model = logit.rcs.ipw,
logit.prob.data.ipw covariate = x)
head(logit.prob.data.ipw)
x Estimate 2.5% 97.5%
1 1.250270 0.05332996 0.02405146 0.1140836
2 1.324428 0.05376274 0.02477293 0.1127547
3 1.398585 0.05419884 0.02551501 0.1114412
4 1.472743 0.05463826 0.02627820 0.1101431
5 1.546900 0.05508104 0.02706299 0.1088604
6 1.621058 0.05552720 0.02786990 0.1075931
Plot the event probability as a function of x
.
ggplot(logit.prob.data.ipw,
aes(x = x,
y = Estimate,
ymin = `2.5%`,
ymax = `97.5%`)) +
geom_histogram(data = data.example, aes(x = x, y = after_stat(density*width*5)),
inherit.aes = FALSE,
bins = 30,
fill = tail(colour.palette, 1)) +
geom_line(color = colour.palette[1], lwd = 1.5) +
geom_ribbon(fill = colour.palette[1], alpha = .25) +
labs(x = x.axis.label,
y = "Probability (%) (95% confidence interval)",
caption = "Vertical dotted lines indicate knots' location") +
theme_bw(base_size = 11) +
geom_vline(xintercept = knots.location, lty = 3) +
scale_y_continuous(breaks = seq(0, 1, 0.1),
limits = c(0, 1),
labels = percent_format(),
sec.axis = sec_axis(~./5,
breaks = seq(0, .10, by = .025),
labels = percent_format(),
name = y2.axis.label)) +
scale_x_continuous(breaks = log(c(5, 10, 20, 50, 150, 400, 1000, 3000, 8000)),
labels = \(x) sprintf("%.0f", exp(x)))
1.7 Explore how the shape of the OR as a function of x
changes as the number of the knots changes
<- lapply(3:8, \(n.knots) {
logit.or.data.nknots <- glm(y ~ rcs(x, n.knots),
logit.rcs data = data.example,
family = binomial(link = "logit"))
cat("--->", n.knots, "knots, AIC:", AIC(logit.rcs), "\n")
<- data_or_hr(model = logit.rcs,
data covariate = x,
referent.value = referent.value)
$n.knots <- n.knots
data
data })
---> 3 knots, AIC: 5015.7
P-value for overall association: <0.0001
P-value for non-linearity: <0.0001
---> 4 knots, AIC: 5018.103
P-value for overall association: <0.0001
P-value for non-linearity: <0.0001
---> 5 knots, AIC: 5018.088
P-value for overall association: <0.0001
P-value for non-linearity: <0.0001
---> 6 knots, AIC: 5019.999
P-value for overall association: <0.0001
P-value for non-linearity: <0.0001
---> 7 knots, AIC: 5022.09
P-value for overall association: <0.0001
P-value for non-linearity: <0.0001
---> 8 knots, AIC: 5023.735
P-value for overall association: <0.0001
P-value for non-linearity: <0.0001
ggplot(do.call("rbind", logit.or.data.nknots),
aes(x = x,
y = log(`exp(Est.)`),
color = factor(n.knots))) +
geom_line(lwd = 1.5) +
geom_vline(xintercept = referent.value, lty = 2) +
labs(x = x.axis.label,
y = "Odds Ratio (95% confidence interval)",
color = "Number of knots") +
theme_bw(base_size = 11) +
theme(legend.position = "bottom") +
scale_y_continuous(breaks = log(c(0.1, 0.5, 0.7, 1, 2, 4, 6, 10, 20)),
labels = \(x) sprintf("%4.2f", exp(x))) +
scale_x_continuous(breaks = log(c(5, 10, 20, 50, 150, 400, 1000, 3000, 8000)),
labels = \(x) sprintf("%.0f", exp(x))) +
scale_color_manual(values = colour.palette)
2 Cox regression
2.1 Fit the model with x
as the only covariate
<- coxph(Surv(t, y) ~ rcs(x, 4),
cox.rcs data = data.example)
summary(cox.rcs)
Call:
coxph(formula = Surv(t, y) ~ rcs(x, 4), data = data.example)
n= 7255, number of events= 844
coef exp(coef) se(coef) z Pr(>|z|)
rcs(x, 4)x 0.04343 1.04439 0.14403 0.302 0.763
rcs(x, 4)x' 0.41755 1.51824 0.45438 0.919 0.358
rcs(x, 4)x'' -0.17594 0.83867 1.54037 -0.114 0.909
exp(coef) exp(-coef) lower .95 upper .95
rcs(x, 4)x 1.0444 0.9575 0.78752 1.385
rcs(x, 4)x' 1.5182 0.6587 0.62312 3.699
rcs(x, 4)x'' 0.8387 1.1924 0.04097 17.169
Concordance= 0.617 (se = 0.011 )
Likelihood ratio test= 203.4 on 3 df, p=<2e-16
Wald test = 247 on 3 df, p=<2e-16
Score (logrank) test = 278.3 on 3 df, p=<2e-16
2.2 Generate the data to plot the HR as a function of x
and save the RCS knots’ location
<- data_or_hr(model = cox.rcs,
cox.hr.data covariate = x,
referent.value = referent.value)
P-value for overall association: <0.0001
P-value for non-linearity: <0.0001
head(cox.hr.data)
x Estimate StdErr z P exp(Est.) 2.5% 97.5%
1 1.250270 -0.2939612 0.4260261 -0.6900075 0.4901895 0.7453054 0.3233703 1.717783
2 1.324428 -0.2907406 0.4154375 -0.6998419 0.4840260 0.7477096 0.3312165 1.687928
3 1.398585 -0.2875199 0.4048537 -0.7101822 0.4775911 0.7501216 0.3392498 1.658608
4 1.472743 -0.2842993 0.3942752 -0.7210681 0.4708676 0.7525414 0.3474743 1.629814
5 1.546900 -0.2810787 0.3837024 -0.7325433 0.4638370 0.7549689 0.3558943 1.601538
6 1.621058 -0.2778580 0.3731358 -0.7446566 0.4564794 0.7574044 0.3645139 1.573771
2.3 Plot the HR as a function of x
2.3.1 Basic plot
ggplot(cox.hr.data,
aes(x = x,
y = log(`exp(Est.)`),
ymin = log(`2.5%`),
ymax = log(`97.5%`))) +
geom_line() +
geom_ribbon(alpha = .25)
2.3.2 More refined plot
ggplot(cox.hr.data,
aes(x = x,
y = log(`exp(Est.)`),
ymin = log(`2.5%`),
ymax = log(`97.5%`))) +
geom_histogram(data = data.example, aes(x = x, y = after_stat(density*width*15)),
inherit.aes = FALSE,
bins = 30,
fill = tail(colour.palette, 1)) +
geom_line(color = colour.palette[1], lwd = 1.5) +
geom_ribbon(fill = colour.palette[1], alpha = .25) +
labs(x = x.axis.label,
y = "Hazard Ratio (95% confidence interval)",
caption = "Vertical dashed line indicates the referent value
Vertical dotted lines indicate knots' location") +
theme_bw(base_size = 11) +
geom_vline(xintercept = referent.value, lty = 2) +
geom_vline(xintercept = knots.location, lty = 3) +
scale_y_continuous(breaks = log(c(0.1, 0.25, 0.5, 1, 2, 4, 6, 10, 20)),
labels = \(x) sprintf("%4.2f", exp(x)),
sec.axis = sec_axis(~./15,
breaks = seq(0, .10, by = .025),
labels = percent_format(),
name = y2.axis.label)) +
scale_x_continuous(breaks = log(c(5, 10, 20, 50, 150, 400, 1000, 3000, 8000)),
labels = \(x) sprintf("%.0f", exp(x)))
2.4 Generate the data to plot the event probability at different time horizons (years) as a function of x
<- data_probability(model = cox.rcs,
cox.prob.data covariate = x,
time = c(.5, 1.5, 2.5, 3.5))
head(cox.prob.data)
time x Estimate 2.5% 97.5%
1 0.5 1.250270 0.01710444 0.03748504 0.007760347
2 0.5 1.324428 0.01715914 0.03685875 0.007945035
3 0.5 1.398585 0.01721401 0.03624379 0.008133861
4 0.5 1.472743 0.01726906 0.03564002 0.008326889
5 0.5 1.546900 0.01732428 0.03504733 0.008524181
6 0.5 1.621058 0.01737968 0.03446561 0.008725798
2.5 Plot the event probability at different time horizons (years) as a function of x
2.5.1 Basic plot
ggplot(cox.prob.data,
aes(x = x,
y = Estimate,
ymin = `2.5%`,
ymax = `97.5%`,
fill = factor(time),
color = factor(time))) +
geom_line() +
geom_ribbon(alpha = .25)
2.5.2 More refined plot
ggplot(cox.prob.data,
aes(x = x,
y = Estimate,
ymin = `2.5%`,
ymax = `97.5%`,
fill = factor(time),
color = factor(time))) +
geom_histogram(data = data.example, aes(x = x, y = after_stat(density*width*5)),
inherit.aes = FALSE,
bins = 30,
fill = tail(colour.palette, 1)) +
geom_line(lwd = 1.5) +
geom_ribbon(alpha = .25, color = NA) +
labs(x = x.axis.label,
y = "Event probability (%)\n(95% confidence interval)",
caption = "Vertical dotted lines indicate knots' location",
fill = "Time horizon (years)",
color = "Time horizon (years)") +
theme_bw(base_size = 11) +
theme(legend.position = "bottom") +
geom_vline(xintercept = knots.location, lty = 3) +
scale_y_continuous(breaks = seq(0, 1, 0.1),
limits = c(0, 1),
labels = percent_format(),
sec.axis = sec_axis(~./5,
breaks = seq(0, .10, by = .025),
labels = percent_format(),
name = y2.axis.label)) +
scale_x_continuous(breaks = log(c(5, 10, 20, 50, 150, 400, 1000, 3000, 8000)),
labels = \(x) sprintf("%.0f", exp(x))) +
scale_fill_manual(values = colour.palette) +
scale_color_manual(values = colour.palette)
2.6 Dealing with additional coavariates (i.e., covariates other than x
)
2.6.1 Conditional model
Add the covariate age
to the model and model it with a second-degree polynomial transform.
<- coxph(Surv(t, y) ~ rcs(x, 4) + poly(age, 2),
cox.rcs.cov data = data.example)
summary(cox.rcs.cov)
Call:
coxph(formula = Surv(t, y) ~ rcs(x, 4) + poly(age, 2), data = data.example)
n= 7255, number of events= 844
coef exp(coef) se(coef) z Pr(>|z|)
rcs(x, 4)x 1.465e-01 1.158e+00 1.461e-01 1.003 0.316
rcs(x, 4)x' 4.100e-01 1.507e+00 4.576e-01 0.896 0.370
rcs(x, 4)x'' -3.247e-01 7.227e-01 1.546e+00 -0.210 0.834
poly(age, 2)1 6.568e+01 3.351e+28 4.041e+00 16.253 <2e-16 ***
poly(age, 2)2 1.213e+00 3.364e+00 2.954e+00 0.411 0.681
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
exp(coef) exp(-coef) lower .95 upper .95
rcs(x, 4)x 1.158e+00 8.637e-01 8.695e-01 1.542e+00
rcs(x, 4)x' 1.507e+00 6.636e-01 6.145e-01 3.695e+00
rcs(x, 4)x'' 7.227e-01 1.384e+00 3.489e-02 1.497e+01
poly(age, 2)1 3.351e+28 2.984e-29 1.217e+25 9.225e+31
poly(age, 2)2 3.364e+00 2.973e-01 1.029e-02 1.100e+03
Concordance= 0.745 (se = 0.009 )
Likelihood ratio test= 706.8 on 5 df, p=<2e-16
Wald test = 764.5 on 5 df, p=<2e-16
Score (logrank) test = 915.4 on 5 df, p=<2e-16
2.6.1.1 Hazard Ratio
Generate the data to plot the HRs (conditional on age
) as a function of x
and save the RCS knots’ location.
<- data_or_hr(model = cox.rcs.cov,
cox.hr.data.cov covariate = x,
referent.value = referent.value)
P-value for overall association: <0.0001
P-value for non-linearity: 0.000102
head(cox.hr.data.cov)
x Estimate StdErr z P exp(Est.) 2.5% 97.5%
1 1.250270 -0.6812737 0.4325575 -1.574990 0.11525879 0.5059721 0.2167369 1.181191
2 1.324428 -0.6704106 0.4218139 -1.589352 0.11198104 0.5114985 0.2237668 1.169211
3 1.398585 -0.6595476 0.4110751 -1.604446 0.10861587 0.5170852 0.2310226 1.157364
4 1.472743 -0.6486845 0.4003413 -1.620329 0.10516171 0.5227330 0.2385112 1.145648
5 1.546900 -0.6378214 0.3896131 -1.637063 0.10161720 0.5284424 0.2462399 1.134062
6 1.621058 -0.6269584 0.3788910 -1.654720 0.09798134 0.5342142 0.2542160 1.122608
Plot the HR, conditional on age
, as a function of x
.
ggplot(cox.hr.data.cov,
aes(x = x,
y = log(`exp(Est.)`),
ymin = log(`2.5%`),
ymax = log(`97.5%`))) +
geom_histogram(data = data.example, aes(x = x, y = after_stat(density*width*15)),
inherit.aes = FALSE,
bins = 30,
fill = tail(colour.palette, 1)) +
geom_line(color = colour.palette[1], lwd = 1.5) +
geom_ribbon(fill = colour.palette[1], alpha = .25) +
labs(x = x.axis.label,
y = "Hazard Ratio (95% confidence interval)",
caption = "Vertical dashed line indicates the referent value
Vertical dotted lines indicate knots' location") +
theme_bw(base_size = 11) +
geom_vline(xintercept = referent.value, lty = 2) +
geom_vline(xintercept = knots.location, lty = 3) +
scale_y_continuous(breaks = log(c(0.1, 0.25, 0.5, 1, 2, 4, 6, 10, 20)),
labels = \(x) sprintf("%4.2f", exp(x)),
sec.axis = sec_axis(~./15,
breaks = seq(0, .10, by = .025),
labels = percent_format(),
name = y2.axis.label)) +
scale_x_continuous(breaks = log(c(5, 10, 20, 50, 150, 400, 1000, 3000, 8000)),
labels = \(x) sprintf("%.0f", exp(x)))
2.6.1.2 Event probability
Generate the data to plot the event probability at different time horizons (conditional on age
) as a function of x
. Fix the values of age
at the 10th and 90th percentile of its distribution: 52, 69 years and the time horizon at 1 and 3 years.
<- data.frame(age = quantile(data.example$age, c(.1, .9)))
newdata.cov
<- data_probability(model = cox.rcs.cov,
cox.prob.data.cov covariate = x,
newdata = newdata.cov,
time = c(1, 3))
head(cox.prob.data.cov)
time x age Estimate 2.5% 97.5%
1 1 1.250270 52 0.006450633 0.01473741 0.002816826
2 1 1.324428 52 0.006520858 0.01460146 0.002905568
3 1 1.398585 52 0.006591845 0.01446754 0.002996939
4 1 1.472743 52 0.006663603 0.01433566 0.003090998
5 1 1.546900 52 0.006736139 0.01420587 0.003187803
6 1 1.621058 52 0.006809462 0.01407821 0.003287411
ggplot(cox.prob.data.cov,
aes(x = x,
y = Estimate,
ymin = `2.5%`,
ymax = `97.5%`,
fill = factor(time),
color = factor(time),
linetype = factor(age))) +
geom_histogram(data = data.example, aes(x = x, y = after_stat(density*width*5)),
inherit.aes = FALSE,
bins = 30,
fill = tail(colour.palette, 1)) +
geom_line(lwd = 1.5) +
geom_ribbon(alpha = .25, color = NA) +
labs(x = x.axis.label,
y = "Event probability (%)\n(95% confidence interval)",
caption = "Vertical dotted lines indicate knots' location",
fill = "Time horizion (years)",
color = "Time horizion (years)",
linetype = "Age (years)") +
theme_bw(base_size = 11) +
theme(legend.position = "bottom") +
geom_vline(xintercept = knots.location, lty = 3) +
scale_y_continuous(breaks = seq(0, 1, 0.1),
limits = c(0, 1),
labels = percent_format(),
sec.axis = sec_axis(~./5,
breaks = seq(0, .10, by = .025),
labels = percent_format(),
name = y2.axis.label)) +
scale_x_continuous(breaks = log(c(5, 10, 20, 50, 150, 400, 1000, 3000, 8000)),
labels = \(x) sprintf("%.0f", exp(x))) +
scale_fill_manual(values = colour.palette) +
scale_color_manual(values = colour.palette)
2.6.2 Marginal model
Generate inverse probability weights (IPW) to balance the distribution of the covariate age
across levels of x
(see help(WeightIt)
).
<- weightit(x ~ age,
ipw.wgtit data = data.example,
method = "cbps")
$ipweights <- ipw.wgtit$weights data.example
Assess balance numerically (see help(cobalt)
).
bal.tab(ipw.wgtit,
stats = c("c", "k"),
un = TRUE,
thresholds = c(cor = .1),
poly = 2)
Balance Measures
Type Corr.Un Corr.Adj R.Threshold KS.Target.Adj
age Contin. -0.1109 0.0000 Balanced, <0.1 0.0025
age² Contin. -0.1079 0.0034 Balanced, <0.1 0.0025
Balance tally for treatment correlations
count
Balanced, <0.1 2
Not Balanced, >0.1 0
Variable with the greatest treatment correlation
Variable Corr.Adj R.Threshold
age² 0.0034 Balanced, <0.1
Effective sample sizes
Total
Unadjusted 7255.
Adjusted 7160.23
Fit a marginal model using the IPWs generated previously.
<- coxph(Surv(t, y) ~ rcs(x, 4),
cox.rcs.ipw data = data.example,
weights = ipweights,
robust = TRUE)
summary(cox.rcs.ipw)
Call:
coxph(formula = Surv(t, y) ~ rcs(x, 4), data = data.example,
weights = ipweights, robust = TRUE)
n= 7255, number of events= 844
coef exp(coef) se(coef) robust se z Pr(>|z|)
rcs(x, 4)x 0.1021 1.1075 0.1526 0.1444 0.707 0.480
rcs(x, 4)x' 0.5280 1.6955 0.4683 0.4583 1.152 0.249
rcs(x, 4)x'' -0.6509 0.5216 1.5670 1.5542 -0.419 0.675
exp(coef) exp(-coef) lower .95 upper .95
rcs(x, 4)x 1.1075 0.9029 0.83446 1.470
rcs(x, 4)x' 1.6955 0.5898 0.69052 4.163
rcs(x, 4)x'' 0.5216 1.9172 0.02479 10.972
Concordance= 0.641 (se = 0.01 )
Likelihood ratio test= 286.2 on 3 df, p=<2e-16
Wald test = 316.6 on 3 df, p=<2e-16
Score (logrank) test = 399.9 on 3 df, p=<2e-16, Robust = 160.1 p=<2e-16
(Note: the likelihood ratio and score tests assume independence of
observations within a cluster, the Wald and robust score tests do not).
2.6.2.1 Hazard ratio
Generate the data to plot the HRs (averaged over the distribution of age
) as a function of x
and save the RCS knots’ location.
<- data_or_hr(model = cox.rcs.ipw,
cox.hr.data.ipw covariate = x,
referent.value = referent.value)
P-value for overall association: <0.0001
P-value for non-linearity: <0.0001
head(cox.hr.data.ipw)
x Estimate StdErr z P exp(Est.) 2.5% 97.5%
1 1.250270 -0.5484782 0.4265002 -1.285997 0.1984439 0.5778285 0.2504732 1.333020
2 1.324428 -0.5409069 0.4158843 -1.300619 0.1933890 0.5822200 0.2576830 1.315493
3 1.398585 -0.5333355 0.4052734 -1.315990 0.1881775 0.5866449 0.2650977 1.298209
4 1.472743 -0.5257641 0.3946677 -1.332169 0.1828047 0.5911035 0.2727230 1.281166
5 1.546900 -0.5181928 0.3840679 -1.349222 0.1772658 0.5955959 0.2805644 1.264361
6 1.621058 -0.5106214 0.3734744 -1.367219 0.1715566 0.6001225 0.2886276 1.247791
Plot the HR as a function of x
.
ggplot(cox.hr.data.ipw,
aes(x = x,
y = log(`exp(Est.)`),
ymin = log(`2.5%`),
ymax = log(`97.5%`))) +
geom_histogram(data = data.example, aes(x = x, y = after_stat(density*width*15)),
inherit.aes = FALSE,
bins = 30,
fill = tail(colour.palette, 1)) +
geom_line(color = colour.palette[1], lwd = 1.5) +
geom_ribbon(fill = colour.palette[1], alpha = .25) +
labs(x = x.axis.label,
y = "Hazard Ratio (95% confidence interval)",
caption = "Vertical dashed line indicates the referent value
Vertical dotted lines indicate knots' location") +
theme_bw(base_size = 11) +
geom_vline(xintercept = referent.value, lty = 2) +
geom_vline(xintercept = knots.location, lty = 3) +
scale_y_continuous(breaks = log(c(0.1, 0.25, 0.5, 1, 2, 4, 6, 10, 20)),
labels = \(x) sprintf("%4.2f", exp(x)),
sec.axis = sec_axis(~./15,
breaks = seq(0, .10, by = .025),
labels = percent_format(),
name = y2.axis.label)) +
scale_x_continuous(breaks = log(c(5, 10, 20, 50, 150, 400, 1000, 3000, 8000)),
labels = \(x) sprintf("%.0f", exp(x)))
2.6.2.2 Event probability
Generate the data to plot the event probability (averaged over the distribution of age
) as a function of x
at different time horizons (0.5, 1.5, 2.5, 3.5 years).
<- data_probability(model = cox.rcs.ipw,
cox.prob.data.ipw covariate = x,
time = c(.5, 1.5, 2.5, 3.5))
head(cox.prob.data.ipw)
time x Estimate 2.5% 97.5%
1 0.5 1.250270 0.01299518 0.02861233 0.005876494
2 0.5 1.324428 0.01309330 0.02825202 0.006042857
3 0.5 1.398585 0.01319215 0.02789703 0.006213724
4 0.5 1.472743 0.01329174 0.02754734 0.006389194
5 0.5 1.546900 0.01339208 0.02720294 0.006569366
6 0.5 1.621058 0.01349317 0.02686383 0.006754337
Plot the event probability as a function of x
.
ggplot(cox.prob.data.ipw,
aes(x = x,
y = Estimate,
ymin = `2.5%`,
ymax = `97.5%`,
fill = factor(time),
color = factor(time))) +
geom_histogram(data = data.example, aes(x = x, y = after_stat(density*width*5)),
inherit.aes = FALSE,
bins = 30,
fill = tail(colour.palette, 1)) +
geom_line(lwd = 1.5) +
geom_ribbon(alpha = .25, color = NA) +
labs(x = x.axis.label,
y = "Event probability (%)\n(95% confidence interval)",
caption = "Vertical dotted lines indicate knots' location",
fill = "Time horizon (years)",
color = "Time horizon (years)") +
theme_bw(base_size = 11) +
theme(legend.position = "bottom") +
geom_vline(xintercept = knots.location, lty = 3) +
scale_y_continuous(breaks = seq(0, 1, 0.1),
limits = c(0, 1),
labels = percent_format(),
sec.axis = sec_axis(~./5,
breaks = seq(0, .10, by = .025),
labels = percent_format(),
name = y2.axis.label)) +
scale_x_continuous(breaks = log(c(5, 10, 20, 50, 150, 400, 1000, 3000, 8000)),
labels = \(x) sprintf("%.0f", exp(x))) +
scale_fill_manual(values = colour.palette) +
scale_color_manual(values = colour.palette)
2.7 Explore how the shape of the HR as a function of x
changes as the number of the knots changes
<- lapply(3:8, \(n.knots) {
cox.hr.data.nknots <- coxph(Surv(t, y) ~ rcs(x, n.knots),
cox.rcs data = data.example)
cat("--->", n.knots, "knots, AIC:", AIC(cox.rcs), "\n")
<- data_or_hr(model = cox.rcs,
data covariate = x,
referent.value = referent.value)
$n.knots <- n.knots
data
data })
---> 3 knots, AIC: 14377.83
P-value for overall association: <0.0001
P-value for non-linearity: <0.0001
---> 4 knots, AIC: 14380.5
P-value for overall association: <0.0001
P-value for non-linearity: <0.0001
---> 5 knots, AIC: 14379.88
P-value for overall association: <0.0001
P-value for non-linearity: <0.0001
---> 6 knots, AIC: 14381.75
P-value for overall association: <0.0001
P-value for non-linearity: <0.0001
---> 7 knots, AIC: 14383.81
P-value for overall association: <0.0001
P-value for non-linearity: <0.0001
---> 8 knots, AIC: 14385.35
P-value for overall association: <0.0001
P-value for non-linearity: 0.000154
ggplot(do.call("rbind", cox.hr.data.nknots),
aes(x = x,
y = log(`exp(Est.)`),
color = factor(n.knots))) +
geom_line(lwd = 1.5) +
geom_vline(xintercept = referent.value, lty = 2) +
labs(x = x.axis.label,
y = "Hazard Ratio (95% confidence interval)",
color = "Number of knots") +
theme_bw(base_size = 11) +
theme(legend.position = "bottom") +
scale_y_continuous(breaks = log(c(0.1, 0.5, 0.7, 1, 2, 4, 6, 10, 20)),
labels = \(x) sprintf("%4.2f", exp(x))) +
scale_x_continuous(breaks = log(c(5, 10, 20, 50, 150, 400, 1000, 3000, 8000)),
labels = \(x) sprintf("%.0f", exp(x))) +
scale_color_manual(values = colour.palette)
Session info
sessionInfo()
R version 4.3.1 (2023-06-16)
Platform: x86_64-apple-darwin20 (64-bit)
Running under: macOS Sonoma 14.7
Matrix products: default
BLAS: /Library/Frameworks/R.framework/Versions/4.3-x86_64/Resources/lib/libRblas.0.dylib
LAPACK: /Library/Frameworks/R.framework/Versions/4.3-x86_64/Resources/lib/libRlapack.dylib; LAPACK version 3.11.0
locale:
[1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
time zone: Europe/Stockholm
tzcode source: internal
attached base packages:
[1] stats graphics grDevices utils datasets methods base
other attached packages:
[1] glmtoolbox_0.1.12 cobalt_4.5.5 WeightIt_1.4.0 scales_1.3.0 ggplot2_3.5.1
[6] survival_3.5-5 rms_6.7-1 Hmisc_5.1-0 readxl_1.4.2
loaded via a namespace (and not attached):
[1] tidyselect_1.2.0 rootSolve_1.8.2.4 farver_2.1.1 dplyr_1.1.2
[5] fastmap_1.2.0 TH.data_1.1-2 digest_0.6.37 rpart_4.1.19
[9] lifecycle_1.0.4 cluster_2.1.4 statmod_1.5.0 magrittr_2.0.3
[13] compiler_4.3.1 rlang_1.1.4 tools_4.3.1 utf8_1.2.3
[17] yaml_2.3.10 data.table_1.15.4 knitr_1.48 labeling_0.4.2
[21] htmlwidgets_1.6.2 plyr_1.8.8 SuppDists_1.1-9.9 multcomp_1.4-25
[25] polspline_1.1.23 withr_2.5.0 foreign_0.8-84 purrr_1.0.1
[29] numDeriv_2016.8-1.1 nnet_7.3-19 grid_4.3.1 aod_1.3.2
[33] fansi_1.0.4 colorspace_2.1-0 MASS_7.3-60 etm_1.1.1
[37] cli_3.6.3 mvtnorm_1.2-2 rmarkdown_2.28 crayon_1.5.2
[41] generics_0.1.3 RcppParallel_5.1.7 rstudioapi_0.14 stringr_1.5.0
[45] splines_4.3.1 parallel_4.3.1 cellranger_1.1.0 base64enc_0.1-3
[49] vctrs_0.6.3 Matrix_1.6-1.1 sandwich_3.0-2 jsonlite_1.8.9
[53] SparseM_1.81 Formula_1.2-5 htmlTable_2.4.1 Epi_2.47.1
[57] tidyr_1.3.0 cmprsk_2.2-11 glue_1.8.0 chk_0.10.0
[61] codetools_0.2-19 stringi_1.7.12 gtable_0.3.3 munsell_0.5.0
[65] tibble_3.2.1 pillar_1.9.0 htmltools_0.5.8.1 quantreg_5.97
[69] R6_2.5.1 zigg_0.0.2 evaluate_1.0.1 lattice_0.21-8
[73] backports_1.4.1 Rfast_2.1.5.1 broom_1.0.5 MatrixModels_0.5-2
[77] Rcpp_1.0.10 gridExtra_2.3 nlme_3.1-162 checkmate_2.2.0
[81] mgcv_1.8-42 xfun_0.48 zoo_1.8-12 pkgconfig_2.0.3