Evaluating PRS in R to calculate “Area Under the Curve” and Odds Ratio

4 minute read

Published:

This post shows the R script I’ve written to calculate AUC and OR for the PRS generated in plink in previous blog posts. It has undergone several iterations as I adjust our desired outcomes.

I’ve pasted the script below:

#!/usr/bin/env Rscript
library(pROC)
library(ggplot2)
library(scales)
library(tidyverse)
library(dplyr)
library(lme4)
library(sjPlot)
library(abd)

# Script takes multiple arguments, all in the form of file paths
  # Arg 1: population covariate file location
  # Arg 2: PMBB Ids of cancer cases
  # Arg 3: PMBB Ids of control cases
  # Arg 4: .Profile file generated in plink using OC_build_PRS
  # Arg 5: Cancer type
  # Arg 6: Output filepath and name

# The script outputs a csv file with the following columns
  # Cancer subtype
  # Odds ratio of the PRS
  # Odds ratio confidence interval
  # AUC of the PRS ROC
  # Confidence interval of AUC of the PRS ROC, bootstrapped x2000
  # AUC of the Principal components only (PC) ROC
  # Confidence interval of AUC of the PC ROC, bootstrapped x2000
  # Change in AUC in PRS model compared to PC model, bootstrapped x2000
  # P value of Change in AUC in PRS model compared to PC model
  # Number of cancer cases in model
  # Number of controls in model
  # Number of SNPs used in PRS

args=commandArgs(trailingOnly = TRUE)

# Temporary args while working on script
args=c("G://My Drive//Residency//Research//Maxwell//phenotypes//OC_covariate_file.txt","G://My Drive//Residency//Research//Maxwell//PMBB_IDs//PMBB_IDs_lung_cancer.txt","G://My Drive//Residency//Research//Maxwell//PMBB_IDs//PMBB_IDs_controls.txt", "G://My Drive//Residency//Research//Maxwell//Profiles//lung_all_p5.profile", "lung_all_p5", "G://My Drive//Residency//Research//Maxwell//lung_all_p5.csv")

# read covariates file and name columns 
covariates <- read.csv(file= args[1], header = T, sep="\t")
colnames(covariates)[1] = "id"

# Subset population to only those with age 50-80
covariates = covariates[covariates$BIRTH_YEAR>1942,]
covariates = covariates[covariates$BIRTH_YEAR<1972,]
# Remove patients who are already eligible for CT screening based on smoking hx
covariates = covariates[covariates$CT_ELIGIBLE=="NO",]

# Load "cases" PMBB_ID file
case_phenotypes <- read.csv(file= args[2], header = F, sep=",")
colnames(case_phenotypes)[1] = "id"
case_phenotypes$pheno=1

# Load "control" PMBB_ID file
control_phenotypes <- read.csv(file= args[3], header = F, sep=",")
colnames(control_phenotypes)[1] = "id"
control_phenotypes$pheno=0

# Combine cases and controls into one dataframe
phenotypes = rbind(case_phenotypes, control_phenotypes)

# genotype file = .profile generated from PLINK
genotypes <- read.csv(file=args[4], header = T, sep = "")
colnames(genotypes)[1] <- c("id")

# Merge phenotypes with covariates based on id column
pheno_cov = merge(phenotypes, covariates, by="id")
# Merge pheno_cov with genotype scores based on id column
score_pheno_cov = merge(pheno_cov, genotypes, by="id")
# Ensure complete cases
score_pheno_cov <- score_pheno_cov[complete.cases(score_pheno_cov),]
# Normalize score of genotype scores
score_pheno_cov[32] <- apply(score_pheno_cov[32],2,scale)
# Relabel columns as id and normalizedScore
colnames(score_pheno_cov)[32] <- "normalizedScore"

# for non-gender specific cancers:
mod_PRS <- glm(pheno ~ normalizedScore, family="binomial", data=score_pheno_cov)

##################################################################

# 1. Calculate AUC

# AUC calculated using predict() and roc() for PRS model
predict_mod_PRS <- predict(mod_PRS,type=c("response"))

roc_mod_PRS <- roc(score_pheno_cov$pheno ~ predict_mod_PRS)

# AUC and bootstrapped CI of the predict_mod_PRS 
auc_mod_PRS <-roc(score_pheno_cov$pheno ~ predict_mod_PRS, score_pheno_cov, plot=T, ci=T,print.auc = TRUE)

CI_bootstrap_PRS <- ci.auc(auc_mod_PRS, method='bootstrap', n.boot=10000)

AUC_PRS <- round(as.numeric(auc(roc_mod_PRS)),4)
AUC_PRS_CI <- paste(round(as.numeric(CI_bootstrap_PRS)[1],4), "-", round(as.numeric(CI_bootstrap_PRS)[3],4))

##################################################################

# 2. Calculate Odds ratio of disease occurence per 1 SD in scoring

# This should be equal to the beta-coefficient in our model "predict_mod_PRS"

options(digits = 4)
OR_PRS = round(exp(summary(mod_PRS)$coefficients[2]),4)
OR_PRS_CI <- paste(round(exp((summary(mod_PRS)$coefficients[2])-1.96*summary(mod_PRS)$coefficients[2,2]),4), "-", round(exp((summary(mod_PRS)$coefficients[2])+1.96*summary(mod_PRS)$coefficients[2,2]),4))

##################################################################

# 3. Lifetime risk of lung cancer in top centile

# Reorder score_pheno file by decreasing score
score_pheno_cov1 <- score_pheno_cov[order(score_pheno_cov$normalizedScore, decreasing = TRUE), ]

# Find # of patients in top percentile
n_top_percent = round(length(score_pheno_cov1$id)*0.01, digits=0)

# Take top 1% of patients in score_pheno file
score_pheno_cov2 <- score_pheno_cov1[1:n_top_percent,]

# Determine % of top 1% patients who have lung cancer, compare to entire population
percent_positive = length(which(score_pheno_cov2$pheno==1))/n_top_percent
percent_positive_baseline = length(which(score_pheno_cov$pheno==1))/length(score_pheno_cov$pheno)

##################################################################

# 4. Odds ratio of middle quintile, compared to top and bottom quintile

# Divide population into 5 segments

# extract top, bottom, and remaining quintiles
top_quintile = which(score_pheno_cov$normalizedScore > quantile(score_pheno_cov$normalizedScore, 0.8))
bottom_quintile = which(score_pheno_cov$normalizedScore < quantile(score_pheno_cov$normalizedScore,0.2))
middle_quintile = which((score_pheno_cov$normalizedScore > quantile(score_pheno_cov$normalizedScore,0.4)) & (score_pheno_cov$normalizedScore < quantile(score_pheno_cov$normalizedScore,0.6)))

# define data frames for top, bottom, and remaining quintiles
top_set = score_pheno_cov[top_quintile,]
bot_set = score_pheno_cov[bottom_quintile,]
middle_set = score_pheno_cov[middle_quintile,]

# Compare top to mid quintile
two_by_two_tm = matrix(c(length(which(top_set$pheno==1)),length(which(middle_set$pheno==1)),length(which(top_set$pheno==0)),length(which(middle_set$pheno==0))),nrow=2)
rownames(two_by_two_tm) <- c("top", "middle")
colnames(two_by_two_tm) <- c("Cancer", "No Cancer")

top_mid_OR <- (two_by_two_tm[1,1]*two_by_two_tm[2,2])/(two_by_two_tm[1,2]*two_by_two_tm[2,1])
top_mid_OR_CI <- 1.96*sqrt(1/two_by_two_tm[1,1]+1/two_by_two_tm[1,2]+1/two_by_two_tm[2,1]+1/two_by_two_tm[2,2])

top_mid_OR_CI_low <- top_mid_OR - top_mid_OR_CI
top_mid_OR_CI_high <- top_mid_OR + top_mid_OR_CI

top_mid_OR_CI <- paste(round(top_mid_OR_CI_low,4), "-", round(top_mid_OR_CI_high,4))

# Compare middle to bottom quintile
two_by_two_mb = matrix(c(length(which(middle_set$pheno==1)),length(which(bot_set$pheno==1)),length(which(middle_set$pheno==0)),length(which(bot_set$pheno==0))),nrow=2)
rownames(two_by_two_mb) <- c("middle", "bottom")
colnames(two_by_two_mb) <- c("Cancer", "No Cancer")

mid_bot_OR <- (two_by_two_mb[1,1]*two_by_two_mb[2,2])/(two_by_two_mb[1,2]*two_by_two_mb[2,1])
mid_bot_OR_CI <- 1.96*sqrt(1/two_by_two_mb[1,1]+1/two_by_two_mb[1,2]+1/two_by_two_mb[2,1]+1/two_by_two_mb[2,2])

mid_bot_OR_CI_high <- mid_bot_OR + mid_bot_OR_CI
mid_bot_OR_CI_low <- mid_bot_OR - mid_bot_OR_CI

mid_bot_OR_CI <- paste(round(mid_bot_OR_CI_low,4), "-", round(mid_bot_OR_CI_high,4))

###########################################################################

# Create output file that will be written to csv
# 4 Columns labelled for AUC, AUC confidence interval, odds ratio, and odds ratio CI
# 3 additional columns for sample sizes
output = data.frame(Subset=args[5], AUC_PRS=AUC_PRS, AUC_PRS_CI=AUC_PRS_CI, OR_PRS=OR_PRS, OR_PRS_CI=OR_PRS_CI, Top_Centile_Prevalence=percent_positive, Baseline_Prevalence=percent_positive_baseline, top_mid_OR=top_mid_OR,top_mid_OR_CI=top_mid_OR_CI,mid_bot_OR=mid_bot_OR,mid_bot_OR_CI=mid_bot_OR_CI,n_cases=length(which(score_pheno_cov$pheno==1)), n_controls=length(which(score_pheno_cov$pheno==0)), n_snps=max(genotypes$CNT)/2 )

# Output to CSV with file path as specified in args
write.csv(output,args[6], row.names = FALSE, quote=FALSE)