Evaluating PRS in R to calculate “Area Under the Curve” and Odds Ratio
Published:
This post shows the R script I’ve written to calculate AUC and OR for the PRS generated in plink in previous blog posts. It has undergone several iterations as I adjust our desired outcomes.
I’ve pasted the script below:
#!/usr/bin/env Rscript
library(pROC)
library(ggplot2)
library(scales)
library(tidyverse)
library(dplyr)
library(lme4)
library(sjPlot)
library(abd)
# Script takes multiple arguments, all in the form of file paths
# Arg 1: population covariate file location
# Arg 2: PMBB Ids of cancer cases
# Arg 3: PMBB Ids of control cases
# Arg 4: .Profile file generated in plink using OC_build_PRS
# Arg 5: Cancer type
# Arg 6: Output filepath and name
# The script outputs a csv file with the following columns
# Cancer subtype
# Odds ratio of the PRS
# Odds ratio confidence interval
# AUC of the PRS ROC
# Confidence interval of AUC of the PRS ROC, bootstrapped x2000
# AUC of the Principal components only (PC) ROC
# Confidence interval of AUC of the PC ROC, bootstrapped x2000
# Change in AUC in PRS model compared to PC model, bootstrapped x2000
# P value of Change in AUC in PRS model compared to PC model
# Number of cancer cases in model
# Number of controls in model
# Number of SNPs used in PRS
args=commandArgs(trailingOnly = TRUE)
# Temporary args while working on script
args=c("G://My Drive//Residency//Research//Maxwell//phenotypes//OC_covariate_file.txt","G://My Drive//Residency//Research//Maxwell//PMBB_IDs//PMBB_IDs_lung_cancer.txt","G://My Drive//Residency//Research//Maxwell//PMBB_IDs//PMBB_IDs_controls.txt", "G://My Drive//Residency//Research//Maxwell//Profiles//lung_all_p5.profile", "lung_all_p5", "G://My Drive//Residency//Research//Maxwell//lung_all_p5.csv")
# read covariates file and name columns
covariates <- read.csv(file= args[1], header = T, sep="\t")
colnames(covariates)[1] = "id"
# Subset population to only those with age 50-80
covariates = covariates[covariates$BIRTH_YEAR>1942,]
covariates = covariates[covariates$BIRTH_YEAR<1972,]
# Remove patients who are already eligible for CT screening based on smoking hx
covariates = covariates[covariates$CT_ELIGIBLE=="NO",]
# Load "cases" PMBB_ID file
case_phenotypes <- read.csv(file= args[2], header = F, sep=",")
colnames(case_phenotypes)[1] = "id"
case_phenotypes$pheno=1
# Load "control" PMBB_ID file
control_phenotypes <- read.csv(file= args[3], header = F, sep=",")
colnames(control_phenotypes)[1] = "id"
control_phenotypes$pheno=0
# Combine cases and controls into one dataframe
phenotypes = rbind(case_phenotypes, control_phenotypes)
# genotype file = .profile generated from PLINK
genotypes <- read.csv(file=args[4], header = T, sep = "")
colnames(genotypes)[1] <- c("id")
# Merge phenotypes with covariates based on id column
pheno_cov = merge(phenotypes, covariates, by="id")
# Merge pheno_cov with genotype scores based on id column
score_pheno_cov = merge(pheno_cov, genotypes, by="id")
# Ensure complete cases
score_pheno_cov <- score_pheno_cov[complete.cases(score_pheno_cov),]
# Normalize score of genotype scores
score_pheno_cov[32] <- apply(score_pheno_cov[32],2,scale)
# Relabel columns as id and normalizedScore
colnames(score_pheno_cov)[32] <- "normalizedScore"
# for non-gender specific cancers:
mod_PRS <- glm(pheno ~ normalizedScore, family="binomial", data=score_pheno_cov)
##################################################################
# 1. Calculate AUC
# AUC calculated using predict() and roc() for PRS model
predict_mod_PRS <- predict(mod_PRS,type=c("response"))
roc_mod_PRS <- roc(score_pheno_cov$pheno ~ predict_mod_PRS)
# AUC and bootstrapped CI of the predict_mod_PRS
auc_mod_PRS <-roc(score_pheno_cov$pheno ~ predict_mod_PRS, score_pheno_cov, plot=T, ci=T,print.auc = TRUE)
CI_bootstrap_PRS <- ci.auc(auc_mod_PRS, method='bootstrap', n.boot=10000)
AUC_PRS <- round(as.numeric(auc(roc_mod_PRS)),4)
AUC_PRS_CI <- paste(round(as.numeric(CI_bootstrap_PRS)[1],4), "-", round(as.numeric(CI_bootstrap_PRS)[3],4))
##################################################################
# 2. Calculate Odds ratio of disease occurence per 1 SD in scoring
# This should be equal to the beta-coefficient in our model "predict_mod_PRS"
options(digits = 4)
OR_PRS = round(exp(summary(mod_PRS)$coefficients[2]),4)
OR_PRS_CI <- paste(round(exp((summary(mod_PRS)$coefficients[2])-1.96*summary(mod_PRS)$coefficients[2,2]),4), "-", round(exp((summary(mod_PRS)$coefficients[2])+1.96*summary(mod_PRS)$coefficients[2,2]),4))
##################################################################
# 3. Lifetime risk of lung cancer in top centile
# Reorder score_pheno file by decreasing score
score_pheno_cov1 <- score_pheno_cov[order(score_pheno_cov$normalizedScore, decreasing = TRUE), ]
# Find # of patients in top percentile
n_top_percent = round(length(score_pheno_cov1$id)*0.01, digits=0)
# Take top 1% of patients in score_pheno file
score_pheno_cov2 <- score_pheno_cov1[1:n_top_percent,]
# Determine % of top 1% patients who have lung cancer, compare to entire population
percent_positive = length(which(score_pheno_cov2$pheno==1))/n_top_percent
percent_positive_baseline = length(which(score_pheno_cov$pheno==1))/length(score_pheno_cov$pheno)
##################################################################
# 4. Odds ratio of middle quintile, compared to top and bottom quintile
# Divide population into 5 segments
# extract top, bottom, and remaining quintiles
top_quintile = which(score_pheno_cov$normalizedScore > quantile(score_pheno_cov$normalizedScore, 0.8))
bottom_quintile = which(score_pheno_cov$normalizedScore < quantile(score_pheno_cov$normalizedScore,0.2))
middle_quintile = which((score_pheno_cov$normalizedScore > quantile(score_pheno_cov$normalizedScore,0.4)) & (score_pheno_cov$normalizedScore < quantile(score_pheno_cov$normalizedScore,0.6)))
# define data frames for top, bottom, and remaining quintiles
top_set = score_pheno_cov[top_quintile,]
bot_set = score_pheno_cov[bottom_quintile,]
middle_set = score_pheno_cov[middle_quintile,]
# Compare top to mid quintile
two_by_two_tm = matrix(c(length(which(top_set$pheno==1)),length(which(middle_set$pheno==1)),length(which(top_set$pheno==0)),length(which(middle_set$pheno==0))),nrow=2)
rownames(two_by_two_tm) <- c("top", "middle")
colnames(two_by_two_tm) <- c("Cancer", "No Cancer")
top_mid_OR <- (two_by_two_tm[1,1]*two_by_two_tm[2,2])/(two_by_two_tm[1,2]*two_by_two_tm[2,1])
top_mid_OR_CI <- 1.96*sqrt(1/two_by_two_tm[1,1]+1/two_by_two_tm[1,2]+1/two_by_two_tm[2,1]+1/two_by_two_tm[2,2])
top_mid_OR_CI_low <- top_mid_OR - top_mid_OR_CI
top_mid_OR_CI_high <- top_mid_OR + top_mid_OR_CI
top_mid_OR_CI <- paste(round(top_mid_OR_CI_low,4), "-", round(top_mid_OR_CI_high,4))
# Compare middle to bottom quintile
two_by_two_mb = matrix(c(length(which(middle_set$pheno==1)),length(which(bot_set$pheno==1)),length(which(middle_set$pheno==0)),length(which(bot_set$pheno==0))),nrow=2)
rownames(two_by_two_mb) <- c("middle", "bottom")
colnames(two_by_two_mb) <- c("Cancer", "No Cancer")
mid_bot_OR <- (two_by_two_mb[1,1]*two_by_two_mb[2,2])/(two_by_two_mb[1,2]*two_by_two_mb[2,1])
mid_bot_OR_CI <- 1.96*sqrt(1/two_by_two_mb[1,1]+1/two_by_two_mb[1,2]+1/two_by_two_mb[2,1]+1/two_by_two_mb[2,2])
mid_bot_OR_CI_high <- mid_bot_OR + mid_bot_OR_CI
mid_bot_OR_CI_low <- mid_bot_OR - mid_bot_OR_CI
mid_bot_OR_CI <- paste(round(mid_bot_OR_CI_low,4), "-", round(mid_bot_OR_CI_high,4))
###########################################################################
# Create output file that will be written to csv
# 4 Columns labelled for AUC, AUC confidence interval, odds ratio, and odds ratio CI
# 3 additional columns for sample sizes
output = data.frame(Subset=args[5], AUC_PRS=AUC_PRS, AUC_PRS_CI=AUC_PRS_CI, OR_PRS=OR_PRS, OR_PRS_CI=OR_PRS_CI, Top_Centile_Prevalence=percent_positive, Baseline_Prevalence=percent_positive_baseline, top_mid_OR=top_mid_OR,top_mid_OR_CI=top_mid_OR_CI,mid_bot_OR=mid_bot_OR,mid_bot_OR_CI=mid_bot_OR_CI,n_cases=length(which(score_pheno_cov$pheno==1)), n_controls=length(which(score_pheno_cov$pheno==0)), n_snps=max(genotypes$CNT)/2 )
# Output to CSV with file path as specified in args
write.csv(output,args[6], row.names = FALSE, quote=FALSE)