Dependencies

Load requisite packages.

library(data.table)

Define Paths

Define brain regions for analysis and paths to normalized data files (i.e., Trimmed Mean of M-Values or TMM matrices).

brain_regions = c("IFG", "STG", "PHG", "FP")
data_files = list(
  IFG = "../Data/AMP-AD_MSBB_MSSM_BM_44.normalized.sex_race_age_RIN_PMI_exonicRate_rRnaRate_batch_adj.tsv",
  STG = "../Data/AMP-AD_MSBB_MSSM_BM_22.normalized.sex_race_age_RIN_PMI_exonicRate_rRnaRate_batch_adj.tsv",
  PHG = "../Data/AMP-AD_MSBB_MSSM_BM_36.normalized.sex_race_age_RIN_PMI_exonicRate_rRnaRate_batch_adj.tsv",
  FP = "../Data/AMP-AD_MSBB_MSSM_BM_10.normalized.sex_race_age_RIN_PMI_exonicRate_rRnaRate_batch_adj.tsv"
)

Analyze Brain Regions

Iterate over previously specified brain regions to process each region. All code after this chunk should be encapsulated in a for loop. To appropriately document the code, pseudocode of the for loop is shown below, while the contents of the for loop are described in subsequent chunks.

for(brain_region in brain_regions) {
  
  # insert following chunks here
  
}

For the purposes of illustration, we define the brain_region of interest as the inferior frontal fygrus (IFG).

brain_region = "IFG"

Process Expression Data

Process expression data and remove unnecessary headers from sample IDs (e.g., S109B355.BM_10_798 becomes BM_10_798).

# process expression data
cat("processing", brain_region, "...\n")
data_file = unlist(data_files[brain_region])
msbb = suppressWarnings({fread(data_file, sep = "\t")})
gene_ids = msbb$V1
msbb = as.data.frame(msbb[, -1, with = F])
rownames(msbb) = gene_ids

# remove unnecessary headers
colnames(msbb) = gsub("^.*\\.", "", colnames(msbb))

Extract subject IDs in the same order as the sample IDs, then check for samples without an ID match.

# extract subject IDs
key_file =  "../Data/MSBB_RNAseq_covariates_November2018Update.csv"
key_map = fread(key_file)
key_map = unique(key_map[, .(sampleIdentifier, individualIdentifier)])

# check for samples without an ID match
cat("any samples don't have id match? ")
cat(any(!colnames(msbb) %in% key_map$sampleIdentifier), "\n")
sub_key_map = key_map[data.table(sampleIdentifier = colnames(msbb)), on = .(sampleIdentifier)]
all(sub_key_map$sampleIdentifier == colnames(msbb))
colnames(msbb) = sub_key_map$individualIdentifier

Check for duplicated samples.

# any duplicated samples?
if(any(duplicated(colnames(msbb)))){
  cat("remove duplicate samples..\n")
  msbb = msbb[, !duplicated(colnames(msbb))]
}

Process Covariate Data

Note that subject #1009 has an invalid Braak score of 9.

cov_file = "../Data/MSBB_clinical.csv"
cov = fread(cov_file)
setnames(cov, c("NP.1", "bbscore"), c("CERAD", "Braak"))
cov[Braak > 6, Braak := NA]

Extract clinical data in the same order as in the data.

cov = cov[data.table(individualIdentifier = colnames(msbb)), on = .(individualIdentifier)]
all(cov$individualIdentifier == colnames(msbb))

Define Clinical Variables

Braak

Group levels into three categories:

1 = Normal/I/II
2 = III/IV
3 = V/VI

cov[, B := as.factor(cov$Braak)]
levels(cov$B) = c(1,1,1,2,2,3,3)
cov[, B := as.factor(as.character(B))]
table(cov$B, cov$Braak)

# double-check
table(cov$C, cov$Braak)

CERAD

Old MSBB levels:

1 = Normal
2 = Definite
3 = Probable
4 = Possible

Define new levels:

1 = Normal
2 = Possible
3 = Probable
4 = Definite

# refactor
cov[, C := as.factor(CERAD)]
levels(cov$C) = c(0,3,2,1)
cov[, C := as.factor(as.character(C))]

# double-check
table(cov$C, cov$CERAD)

Raw APOE

Define three categories:

1 = E2/E2 and E3/E3
2 = E3/E4, E2/E4, and E4/E4

NA values will be treated as E3/E3.

# define variable
cov$RawAPOE = paste0(cov$Apo1, cov$Apo2)
cov$RawAPOE = gsub("NANA", NA, cov$RawAPOE)

# treat NA values as E3/E3
cov[is.na(RawAPOE), RawAPOE := "33"]

APOE

Group levels into three categories:

E2 = E2/E2 and E2/E3
E3 = E3/E3
E4 = E2/E4, E3/E4, and E4/E4

# refactor
levels(cov$E) = c("E2", "E2", "E4", "E3", "E4", "E4")
cov[, E := factor(as.character(E), levels = c("E3", "E2", "E4"))]

# double-check
table(cov$E, cov$RawAPOE)

E4 Count

Group levels into three categories:

0 = E2/E2, E3/E3, and E2/E3
1 = E2/E4 and E3/E4
2 = E4/E4

# refactor
cov[, E4num := as.factor(RawAPOE)]
levels(cov$E4num) = c(0, 0, 1, 0, 1, 2)
cov[, E4num := as.numeric(as.character(E4num))]

# double-check
table(cov$E4num, cov$RawAPOE)

E2 Count

Group levels into three categories:

0 = E4/E4, E3/E3, and E3/E4
1 = E2/E4 and E2/E3
2 = E2/E2

# refactor
cov[, E2num := as.factor(RawAPOE)]
levels(cov$E2num) = c(2, 1, 1, 0, 0, 0)
cov[, E2num := as.numeric(as.character(E2num))]

# double-check
table(cov$E2num, cov$RawAPOE)

Age at Death

Clean age at death variable.

# clean age at death
cov[, age_at_death := gsub("90\\+", "91", AOD)]
cov[, age_at_death := as.numeric(age_at_death)]

Save Results

Choose levels and contrasts, then save results.

# save results
expSet = list(mData = msbb, cov = cov)
save(expSet, file = paste0("../Data/MSBB-", brain_region,"-24.Rdata"))

Define MSBB Variables

Dependencies

Define Paths

Analyze Brain Regions

Process Expression Data

Process Covariate Data

Define Clinical Variables

Braak

CERAD

Raw APOE

APOE

E4 Count

E2 Count

Age at Death

Save Results

Corrections