# Simulation function – response is a quadratic function of continuous predictor

# Simulate a binary response whose log-odds are a (noisy) quadratic function
# of a single continuous predictor.
#
# Args:
#   sample_size: number of observations to draw (default 25).
#
# Returns:
#   A data.frame with columns:
#     y.factor - factor coding of y, levels c("COPD", "control") where
#                COPD corresponds to y = 1 and control to y = 0
#     x        - continuous predictor drawn from N(0, 1)
#     y        - 0/1 response drawn from Bernoulli(p)
sim_quadratic_logistic_data <- function(sample_size = 25) {
  x <- rnorm(n = sample_size)
  # True linear predictor plus N(0, sd = 2) noise, so realized log-odds
  # scatter around the quadratic curve -1.5 + 0.5 * x + x^2.
  eta <- -1.5 + 0.5 * x + x^2 + rnorm(sample_size, sd = 2)
  # Inverse-logit transform to success probabilities.
  p <- 1 / (1 + exp(-eta))
  y <- rbinom(n = sample_size, size = 1, prob = p)
  data.frame(
    y.factor = factor(y, levels = 1:0, labels = c("COPD", "control")),
    x = x,
    y = y
  )
}

# Fit once to see what the resulting predicted curve looks like

set.seed(42)

# FIX: `example_data` was referenced below (table(), glm()) but never
# created anywhere in the script; simulate it here. N = 500 matches the
# degrees of freedom (499) shown in the original summary() transcript.
example_data <- sim_quadratic_logistic_data(sample_size = 500)

table(example_data$y.factor)
##   COPD control
##    209     291

# Fit the correctly specified quadratic logistic model once.
fit_glm <- glm(y ~ x + I(x^2), data = example_data, family = binomial)
summary(fit_glm)
## Coefficients (from the original run):
##              Estimate Std. Error z value Pr(>|z|)
## (Intercept)   -0.9058     0.1255  -7.217 5.30e-13 ***
## x              0.3904     0.1118   3.491 0.000482 ***
## I(x^2)         0.6619     0.1009   6.558 5.45e-11 ***
## Null deviance: 679.64 on 499 df; Residual deviance: 609.91 on 497 df
## AIC: 615.91; Fisher Scoring iterations: 4

# Plot the data, the fitted probability curve, and the true (noiseless)
# probability curve for comparison.
plot(y ~ x, data = example_data, pch = 20,
     ylab = "Estimated Probability",
     main = "Logistic Regression, Quadratic Relationship")
grid()
curve(predict(fit_glm, data.frame(x = x), type = "response"),
      add = TRUE, col = "dodgerblue", lty = 2)
curve(boot::inv.logit(-1.5 + 0.5 * x + x^2),
      add = TRUE, col = "darkorange", lty = 1)
legend("bottomleft",
       c("True Probability", "Estimated Probability", "Data"),
       lty = c(1, 2, 0), pch = c(NA, NA, 20), lwd = 2,
       col = c("darkorange", "dodgerblue", "black"))

# Run cross-validation experiment once
if (!requireNamespace("pacman")) install.packages("pacman")
pacman::p_load(caret)
pacman::p_load(MLmetrics)
pacman::p_load(dplyr)
pacman::p_load(tidyr)

# caret summary function returning the metrics of interest for each fold.
#
# Args (caret's summaryFunction contract):
#   data:  data.frame with `pred` and `obs` factor columns for the held-out fold
#   lev:   class levels (unused)
#   model: model label (unused)
#
# Returns a named numeric vector: PPV, NPV, their macro average, and a
# model-based variance estimate of that average.
MySummary <- function(data, lev = NULL, model = NULL) {
  PPV <- caret::posPredValue(data = data$pred, reference = data$obs)
  NPV <- caret::negPredValue(data = data$pred, reference = data$obs)
  tt <- table(data$pred, data$obs)
  n_call_COPD <- sum(tt[1, ])    # number of 'positives' called by the model
  n_call_control <- sum(tt[2, ]) # number of 'negatives' called by the model
  c(PPV = PPV,
    NPV = NPV,
    Macro = (PPV + NPV) / 2, # JH calls this ave.suuccess12
    # Binomial variance of the macro average (JH calls this var.ave.success).
    SE2_Macro = (1 / 2^2) * (PPV * (1 - PPV) / n_call_COPD +
                               NPV * (1 - NPV) / n_call_control))
}

set.seed(42)

# sample size
N <- 500

# folds
K <- 5

# simulate data
df <- sim_quadratic_logistic_data(sample_size = N)

# fit the model with cross-validated metrics
fit <- caret::train(y.factor ~ x + I(x^2),
                    data = df,
                    method = "glm",
                    family = "binomial",
                    trControl = trainControl(method = "cv",
                                             number = K,
                                             savePredictions = TRUE,
                                             summaryFunction = MySummary,
                                             classProbs = TRUE))
fit$resample
##         PPV       NPV     Macro   SE2_Macro Resample
## 1 0.7307692 0.6891892 0.7099792 0.002615458    Fold1
## 2 0.5000000 0.6000000 0.5500000 0.003875000    Fold2
## 3 0.5652174 0.6282051 0.5967113 0.003419760    Fold3
## 4 0.8400000 0.7297297 0.7848649 0.002010298    Fold4
## 5 0.5909091 0.6282051 0.6095571 0.003495596    Fold5

# Run CV experiment many times

set.seed(42)
n.samples <- 400 # number of replications

# sample size
N <- 500

# folds
K <- 5

# Repeat the CV experiment n.samples times, each on a freshly simulated
# data set, and collect the two competing standard-error estimates plus
# the between-fold correlation of PPV and NPV.
res.list <- replicate(n.samples, expr = {

  # FIX: simulate a fresh data set of size N for every replicate. The
  # original body only had the comment; `df` resolved to the stale data
  # frame from the single-run section, so all replications refit the
  # identical data and only the fold assignment varied.
  df <- sim_quadratic_logistic_data(sample_size = N)

  # fit the model with cross-validated metrics
  fit <- caret::train(y.factor ~ x + I(x^2),
                      data = df,
                      method = "glm",
                      family = "binomial",
                      trControl = trainControl(method = "cv",
                                               number = K,
                                               savePredictions = TRUE,
                                               summaryFunction = MySummary,
                                               classProbs = TRUE))
  data.frame(
    # model based (what the reviewer suggested)
    SE_pbar = sqrt((1 / K) * mean(fit$resample$SE2_Macro)),
    # what we did, aka the standard error of the mean
    SEM = sd(fit$resample$Macro) / sqrt(K),
    corr_ppv_npv = cor(fit$resample$PPV, fit$resample$NPV)
  )

}, simplify = FALSE)

# Stack the per-replicate rows and tag each with its replicate index.
res <- do.call(rbind, res.list) %>%
  mutate(replicate = 1:n())

# Distribution of the between-fold correlation of PPV and NPV.
hist(res$corr_ppv_npv,
     col = "lightblue",
     xlab = "corr(PPV, NPV)",
     main = sprintf("Average correlation(PPV, NPV) over %i samples = %0.2f",
                    n.samples, mean(res$corr_ppv_npv)))

# Side-by-side boxplots of the two standard-error estimators.
res %>%
  dplyr::select(replicate, SE_pbar, SEM) %>%
  tidyr::pivot_longer(cols = -1, names_to = "SE_type") %>%
  ggplot(aes(x = SE_type, y = value)) +
  geom_boxplot() +
  labs(title = "Distribution of standard errors for SEM vs. Average of \nK fold model based variances over many replications")

# Per-replicate ratio of the two estimators.
boxplot(res$SEM / res$SE_pbar, main = "Ratio of SEM to SE_pbar")