HPA spatial correlation added

akatrib · Oct 19, 2020 · 96f7c8d · 96f7c8d
1 parent cf68065
commit 96f7c8d
Show file tree

Hide file tree

Showing 2 changed files with 103 additions and 2 deletions.
diff --git a/HPA-spatial-expression.R b/HPA-spatial-expression.R
@@ -0,0 +1,100 @@
+# ——————————————————————————————————————————————————————————————————————————————————
+#   HPA-spatial-expression.R  @AmalKatrib
+
+#   OBJECTIVE:
+#    - Identify "spatially-correlated" gene isoforms / proteins,
+#      concatenating them so as to amplify the signal in downstream analysis
+#      as well as focus the analysis on areas of interest, ultimately enhancing
+#      the functional & physiological interpretability of research findings
+#
+#   PREREQS:
+#   - "Human Protein Atlas" (https://www.proteinatlas.org) tissue,
+#      User-aggregated "tissue", "immune" & "blood" data, further
+#      processed for streamlined analysis. Files consist of:
+#          - "hpa.tissue.csv" = aggregated human tissue protein expression data
+#          -  "hpa.blood.csv" = aggregated human blood/immune cell type expression data
+#   - "genes" = gene isoforms / proteins of interest
+#
+# ——————————————————————————————————————————————————————————————————————————————————
+rm( list = ls (all = TRUE))
+options(stringsAsFactors = F)
+# load packages
+library(data.table)
+library(dplyr)
+library(tidyverse)
+
+# ------------------------------------------------------
+#       MANUAL INPUT
+# ------------------------------------------------------
+# specify pearson coeff. utoff
+r = 0.8
+# ------------------------------------------------------
+#   1. HPA CORRELATION
+#    for tissue-specific expression profile correlation
+# ------------------------------------------------------
+#  load HPA tissue expression info & tidy up dataframe
+tissue = read.csv("hpa.tissue.csv", row.names = 1)
+tissue = tissue %>% group_by(gene.name) %>% arrange(gene.name, tissue.name, desc(hpa.tissue))
+tissue = tissue %>% group_by(gene.name) %>% filter(!duplicated(tissue.name)) %>% select(gene.name, tissue.name, hpa.tissue)
+tissue = spread(tissue, tissue.name, hpa.tissue) %>% data.frame()
+rownames(tissue) = tissue$gene.name
+
+# remove rows with > 50% missing values
+tissue = tissue[which(rowMeans(!is.na(tissue)) > 0.5), ]
+
+# for purposes of this analysis, discard entries with noHPA tissue info
+id.tissue = genes
+id.tissue = id.tissue[id.tissue %in% tissue$gene.name]
+
+# correlate tissue expression data using the pre-set pearson coef  for significance
+corr.tissue = list()
+for (i in 1:length(id.tissue)) {
+     ind = which(tissue$gene.name %in% id.tissue[i])
+     x = t(cor(t(tissue[ind, -1]), t(tissue[-ind, -1]), use = "complete.obs")) %>%  #
+          data.frame() %>%
+          mutate(id = rownames(.)) %>%
+          filter(abs(.[[1]]) >= r)
+     corr.tissue[[i]] = unique(x$id) }
+names(corr.tissue) = id.tissue
+
+# output correlation results
+x = corr.tissue[lapply(corr.tissue, length) > 0]
+lapply(seq_along(x), function(ind) {
+           write.table(x[[ind]],
+           file = paste0("spatialExpCorr/", names(x)[ind], "_HPAtissue.correlates", r, ".txt"), quote = F, row.names = F, col.names = F ) })
+
+
+# ------------------------------------------------------
+#   2. HUMAN PROTEIN ATLAS CORRELATION
+#   for blood immune-specific profile correlation
+# ------------------------------------------------------
+#  load HPA tissue expression info & tidy up dataframe
+blood = read.csv("hpa.blood.csv", row.names = 1)
+blood = blood %>% group_by(gene.name) %>% arrange(gene.name, blood.cell, desc(blood.exp))
+blood = blood %>% group_by(gene.name) %>% filter(!duplicated(blood.cell)) %>% select(gene.name, blood.cell, blood.exp)
+blood = spread(blood, blood.cell, blood.cellexp) %>% data.frame()
+rownames(blood) = blood$gene.name
+
+# remove rows with > 50% missing values
+blood = blood[which(rowMeans(!is.na(blood)) > 0.5), ]
+
+# for purposes of this analysis, discard entries with noHPA tissue info
+id.blood = genes
+id.blood= id.blood[id.blood %in% blood$gene.name]
+
+# correlate tissue expression data using the pre-set pearson coef  for significance
+corr.blood= list()
+for (i in 1:length(id.blood)) {
+     ind = which(blood$gene.name %in% id.blood[i])
+     x = t(cor(t(blood[ind, -1]), t(blood[-ind, -1]), use = "complete.obs")) %>%
+         data.frame() %>%
+         mutate(id = rownames(.)) %>%
+         filter(abs(.[[1]]) >= r)
+     corr.blood[[i]] = unique(x$id) }
+names(corr.blood) = id.blood
+
+# output correlation results
+x = corr.blood[lapply(corr.blood, length) > 0]
+lapply(seq_along(x), function(ind) {
+           write.table(x[[ind]],
+            file = paste0("spatialExpCorr/", names(x)[ind], "_HPAblood.correlates", r, ".txt"), quote = F, row.names = F, col.names = F ) })
diff --git a/README.md b/README.md
@@ -14,5 +14,6 @@
 
 ---
 
-### SpatialExpressionCorrelation.R
-> To extract "spatially-correlated" genes and isoforms that exhibit  a significant overlap in organ-, tissue-, and cell type-specific expression profile
+### [HPA-spatial-expression.R](HPA-spatial-expression.R)
+> To extract "spatially-correlated" genes isoforms & proteins, exhibiting a significant overlap in organ-, tissue-, and cell type-specific expression profile,
+> using data downloaded from __Human Protein Atlas (HPA)__ & then further adjusted to facilitate a streamlined analysis