Subsetting patients by lung cancer histology subtype

2 minute read

Published:

#!/usr/bin/env Rscript

####
# Pulls out the PMBB_IDs of patients with lung cancer and of cancer-free controls
# Also subdivides the lung cancer cases by histology subtype

# Read in the ICD-10 code matrix of patients, using phenotype version 2.1.
# The file is tab-delimited with a header row; later code uses its PMBB_ID
# column and its ICD-10 code columns (e.g. "C00.0").
# stringsAsFactors = FALSE is spelled out so ID columns are read as character
# regardless of the R version's default.
icd <- read.csv(
  file = "G://My Drive//Work//Research//Maxwell//phenotypes//PMBB-Release-2020-2.1_phenotype_icd-10-matrix.txt",
  header = TRUE,
  sep = "\t",
  stringsAsFactors = FALSE
)

# Read in file from Heena which has PMBB IDs and lung cancer subtypes
# (comma-separated with a header row).
histology <- read.csv(
  file = "G://My Drive//Work//Research//Maxwell//phenotypes//non_identified_lungcancer_pmcr.csv",
  header = TRUE,
  sep = ",",
  stringsAsFactors = FALSE
)

# Find subset of patients who are controls, i.e. no diagnosis of any cancer.
# These are patients with all 0s in the columns from C00.0 through D49.9.
# The column positions are looked up by name rather than hard-coded (they
# were 468 and 1857 when this script was written), so the selection cannot
# silently shift if the matrix gains, loses or reorders columns.
cancer_first <- match("C00.0", colnames(icd))
cancer_last <- match("D49.9", colnames(icd))
stopifnot(
  !is.na(cancer_first),
  !is.na(cancer_last),
  cancer_first <= cancer_last
)

# Get row numbers of patients who have no cancer diagnosis.
# A row whose sum is NA is dropped by which(), so it is not counted as a
# control. drop = FALSE keeps a data frame even for a one-column range.
control_rows <- which(
  rowSums(icd[, cancer_first:cancer_last, drop = FALSE]) == 0
)
# Get the PMBB_IDs of those row numbers
controls <- icd$PMBB_ID[control_rows]
# length(controls)
# Write the control IDs to a text file, one per line.
# as.character() because writeLines() only accepts character vectors.
writeLines(as.character(controls), "G://My Drive//Work//Research//Maxwell//phenotypes//PMBB_IDs_controls.txt")

# Get row numbers of patients who have at least 1 diagnosis of lung cancer,
# i.e. a non-zero entry in any column from C34.00 through C34.92.
# The column positions are looked up by name rather than hard-coded (they
# were 598 and 613 when this script was written), so the selection cannot
# silently shift if the matrix layout changes.
lung_first <- match("C34.00", colnames(icd))
lung_last <- match("C34.92", colnames(icd))
stopifnot(
  !is.na(lung_first),
  !is.na(lung_last),
  lung_first <= lung_last
)

# A row whose sum is NA is dropped by which(), so it is not counted as a case.
# drop = FALSE keeps a data frame even for a one-column range.
case_rows <- which(rowSums(icd[, lung_first:lung_last, drop = FALSE]) != 0)
# Get the PMBB_IDs of those row numbers
# The cases vector has a list of PMBB_IDs of patients with lung cancer
cases <- icd$PMBB_ID[case_rows]
# length(cases)

# Now determine histology subtypes

# First, only keep histology rows whose primary site is lung.
# which() drops rows where the comparison is NA.
histology <- histology[which(histology$PRIMARY_SITE_SUBCATEGORY == "Lung"), ]

# Next, only keep rows that belong to the lung cancer cases found above.
# merge() keeps only rows whose PMBB_ID appears in both tables.
cases <- as.data.frame(cases)
colnames(cases) <- c("PMBB_ID")
histology <- merge(histology, cases, by = "PMBB_ID")

# For duplicate patients, pull out only the first lung cancer diagnosis
# Order by PMBB_ID and Age at diagnosis
histology <- histology[order(histology$PMBB_ID, histology$AGE_AT_DX), ]
# Only keep first diagnosis in list of duplicates
histology <- histology[!duplicated(histology$PMBB_ID), ]

# Only pull important columns (selected by position)
histology <- histology[, c(1, 2, 6, 8, 13, 15, 16, 27, 31)]
# The subtype lists built afterwards read PMBB_ID and HISTOLOGY_DESC; fail
# loudly here if the positional selection did not keep them.
stopifnot(all(c("PMBB_ID", "HISTOLOGY_DESC") %in% colnames(histology)))

# Write the lung cancer patient IDs, one per line.
# as.character() because writeLines() only accepts character vectors.
writeLines(as.character(histology$PMBB_ID), "G://My Drive//Work//Research//Maxwell//phenotypes//PMBB_IDs_lung_cancer.txt")

# Adenocarcinoma sublist: every row whose HISTOLOGY_DESC contains the
# (case-sensitive) text "Adenocarcinoma". Per the original notes this covers
  # Adenocarcinoma, NOS
  # Adenocarcinoma with mixed subtypes
  # Papillary Adenocarcinoma, NOS
  # Mucinous Adenocarcinoma
# Adenosquamous Carcinoma is deliberately not part of this category
# (the pattern does not occur in that description).
adeno <- histology[grep("Adenocarcinoma", histology$HISTOLOGY_DESC), ]
writeLines(
  text = adeno$PMBB_ID,
  con = "G://My Drive//Work//Research//Maxwell//phenotypes//PMBB_IDs_lung_adeno.txt"
)

# Squamous sublist: every row whose HISTOLOGY_DESC contains "Squamous".
# Per the original notes this covers
  # Squamous Cell Carcinoma, NOS
# Does not include adenosquamous: the match is case-sensitive, so the
# lowercase "squamous" inside "Adenosquamous" is not picked up
# (assumes descriptions are capitalised as in the examples above; TODO confirm).
squamous <- histology[grep("Squamous", histology$HISTOLOGY_DESC), ]
writeLines(
  text = squamous$PMBB_ID,
  con = "G://My Drive//Work//Research//Maxwell//phenotypes//PMBB_IDs_lung_squamous.txt"
)

# Small cell carcinoma sublist: rows whose HISTOLOGY_DESC starts with "Small"
sclc <- histology[grep("^Small", histology$HISTOLOGY_DESC), ]
writeLines(
  text = sclc$PMBB_ID,
  con = "G://My Drive//Work//Research//Maxwell//phenotypes//PMBB_IDs_lung_sclc.txt"
)

# Non-small cell carcinoma sublist: rows whose HISTOLOGY_DESC does NOT start
# with "Small".
# NOTE(review): this is the plain complement of the small cell pattern, so
# every remaining row lands here whatever its description says; confirm
# that is intended.
nsclc <- histology[!grepl("^Small", histology$HISTOLOGY_DESC), ]
writeLines(
  text = nsclc$PMBB_ID,
  con = "G://My Drive//Work//Research//Maxwell//phenotypes//PMBB_IDs_lung_nsclc.txt"
)