Spectral Clustering

This script defines the functions which are used to perform spectral clustering.

Calculate Affinity Matrix

Calculate affinity matrix for spectral clustering. Code is derived from the SNFtool package, see associated publication in Nature Methods here.

#' @param diff 
affinityCustom = function (diff, sigma = 0.5) 
{
  N <- nrow(diff)
  diff <- (diff + t(diff))/2
  diag(diff) <- 0
  sortedColumns <- as.matrix(t(apply(diff, 2, sort)))
  finiteMean <- function(x) {
    return(mean(x[is.finite(x)]))
  }
  # this line has been modified to remove [, 1:K + 1]
  means <- apply(sortedColumns, 1, finiteMean) + 
    .Machine$double.eps
  avg <- function(x, y) {
    return((x + y)/2)
  }
  Sig <- outer(means, means, avg)/3 * 2 + diff/3 + .Machine$double.eps
  Sig[Sig <= .Machine$double.eps] <- .Machine$double.eps
  densities <- dnorm(diff, 0, sigma * Sig, log = FALSE)
  W <- (densities + t(densities))/2
  return(W)
}

Z-Scores

Calculate Z-Score

#' @param sf string; C(CERAD)/B(Braak);
#' @param level string; level of CERAD or Braak stage;
#' @param mData matrix; expression matrix
#' @param cov data.table; clinical data
to_z_score = function(sf, level, mDataG, cov){
  if(!all(colnames(mDataG) == cov$projid)){
    stop()
  }
  if(sf == "C"){
    cat("select CERAD", level, "samples..\n")
    remove_proj = cov[,is.na(C) | is.na(E)]
    if(any(remove_proj)){
      cat("remove", sum(remove_proj), "samples with missing values..\n")
      mDataG = mDataG[, !remove_proj]
      cov = cov[!is.na(C) & !is.na(E),]
    }
    E = cov[C == level, E]
    sel = cov$C == level
  }else if(sf == "B"){
    cat("select Braak", level, "samples..\n")
    remove_proj = cov[,is.na(B) | is.na(E)]
    if(any(remove_proj)){
      cat("remove", sum(remove_proj),"samples with missing values..\n")
      mDataG = mDataG[, !remove_proj]
      cov = cov[!is.na(B) & !is.na(E),]
    }
    E = cov[B == level, E]
    sel = cov$B == level
  }else{
    stop()
  }
  
  means = rowMeans(mDataG[, sel])
  sds = apply(mDataG[, sel], 1, sd)
  zscore = function(x) return((x - means)/sds)
  mData_z = apply(mDataG[, sel], 2, zscore)
  
  # check should be TRUE
  cat("random check:",
      all(mData_z[9,] == (mDataG[9,sel] - mean(as.numeric(mDataG[9,sel])))/sd(as.numeric(mDataG[9,sel]))),
      "\n")
  return(list(z = mData_z, E = E, mData = mDataG[, sel],
              means = means, sds = sds))
}

Calculate Average Z-Score

#' @param z_list list; list generated by to_z_score()
to_ave_z = function(z_list){
  z = z_list$z
  E = z_list$E
  return(data.table(Genes = rownames(z),
                    E2 = rowMeans(z[, E == "E2"]),
                    E3 = rowMeans(z[, E == "E3"]),
                    E4 = rowMeans(z[, E == "E4"])))
}

Spectral Clustering

#' @param avez data.table/data.frame/matrix containing the average z-scores of E2, E3, and E4
#' @param k number of cluster
spectral_clustering = function(avez, k){
  set.seed(9)
  
  zMtx = as.matrix(avez[, c("E2", "E3", "E4")])
  rownames(zMtx) = avez$Genes
  zMtx = zMtx[!duplicated(rownames(zMtx)), ]
  
  # calculate distance matrix
  distM = zMtx %>% dist2(., .) %>% .^(1/2)
  # calculate similarity matrix
  simM = affinityCustom(distM)
  # perform spectral clustering
  clustA = spectralClustering(simM, K = k)
  
  return(list(clustA = clustA,
              zMtx = zMtx))
}

Statistical Testing

Statistical testing of interested microglia-APOE genes across APOE genotypes.

#' @param z matrix; z 
#' @param genes vector; vector of interested genes
#' @param cov data.table; clinical data
#' @param adj.donor T/F; adjusted by donor or not
#' @param donor donor id; need provide if adj.donor = T
test_avez = function(zmtx, clust_genes, cov, donor = NULL, adj.donor = FALSE){
  data = data.frame(
    ave_z = colMeans(zmtx[clust_genes, ]),
    APO = cov$E)
  if(levels(cov$E)[1] != "E3"){
    stop("must use E3 as reference\n")
  }
  if(adj.donor){
    require(lme4)
    library(lmerTest)
    data$donor = donor
    fit = lmer(ave_z ~ (1|donor)+ APO, data = data)
  }else{
    fit = lm(ave_z ~ APO, data = data)
  }
  return(summary(fit, confint = TRUE, digits = 3))
}

Hypergeometric Tests

Hypergeometric tests of interested microglia-APOE genes across APOE genotypes.

#' @param genes1 vector;
#' @param genes2 vector; 
phyper_test = function(genes1, genes2){
  q = length(intersect(genes1, genes2))
  s = length(genes1)
  n = nrow(mData)
  m = length(genes2)
  p = phyper(q,m,n,s, lower.tail = F)
  return(p)
}

Corrections

If you see mistakes or want to suggest changes, please create an issue on the source repository.