Skip to content

Commit

Permalink
HPA spatial correlation added
Browse files Browse the repository at this point in the history
  • Loading branch information
akatrib committed Oct 19, 2020
1 parent cf68065 commit 96f7c8d
Show file tree
Hide file tree
Showing 2 changed files with 103 additions and 2 deletions.
100 changes: 100 additions & 0 deletions HPA-spatial-expression.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,100 @@
# ——————————————————————————————————————————————————————————————————————————————————
# HPA-spatial-expression.R @AmalKatrib

# OBJECTIVE:
# - Identify "spatially-correlated" gene isoforms / proteins,
# concatenating them so as to amplify the signal in downstream analysis
# as well as focus the analysis on areas of interest, ultimately enhancing
# the functional & physiological interpretability of research findings
#
# PREREQS:
# - "Human Protein Atlas" (https://www.proteinatlas.org) tissue,
# User-aggregated "tissue", "immune" & "blood" data, further
# processed for streamlined analysis. Files consist of:
# - "hpa.tissue.csv" = aggregated human tissue protein expression data
# - "hpa.blood.csv" = aggregated human blood/immune cell type expression data
# - "genes" = gene isoforms / proteins of interest
#
# ——————————————————————————————————————————————————————————————————————————————————
rm( list = ls (all = TRUE))
options(stringsAsFactors = F)
# load packages
library(data.table)
library(dplyr)
library(tidyverse)

# ------------------------------------------------------
# MANUAL INPUT
# ------------------------------------------------------
# specify pearson coeff. utoff
r = 0.8
# ------------------------------------------------------
# 1. HPA CORRELATION
# for tissue-specific expression profile correlation
# ------------------------------------------------------
# load HPA tissue expression info & tidy up dataframe
tissue = read.csv("hpa.tissue.csv", row.names = 1)
tissue = tissue %>% group_by(gene.name) %>% arrange(gene.name, tissue.name, desc(hpa.tissue))
tissue = tissue %>% group_by(gene.name) %>% filter(!duplicated(tissue.name)) %>% select(gene.name, tissue.name, hpa.tissue)
tissue = spread(tissue, tissue.name, hpa.tissue) %>% data.frame()
rownames(tissue) = tissue$gene.name

# remove rows with > 50% missing values
tissue = tissue[which(rowMeans(!is.na(tissue)) > 0.5), ]

# for purposes of this analysis, discard entries with noHPA tissue info
id.tissue = genes
id.tissue = id.tissue[id.tissue %in% tissue$gene.name]

# correlate tissue expression data using the pre-set pearson coef for significance
corr.tissue = list()
for (i in 1:length(id.tissue)) {
ind = which(tissue$gene.name %in% id.tissue[i])
x = t(cor(t(tissue[ind, -1]), t(tissue[-ind, -1]), use = "complete.obs")) %>% #
data.frame() %>%
mutate(id = rownames(.)) %>%
filter(abs(.[[1]]) >= r)
corr.tissue[[i]] = unique(x$id) }
names(corr.tissue) = id.tissue

# output correlation results
x = corr.tissue[lapply(corr.tissue, length) > 0]
lapply(seq_along(x), function(ind) {
write.table(x[[ind]],
file = paste0("spatialExpCorr/", names(x)[ind], "_HPAtissue.correlates", r, ".txt"), quote = F, row.names = F, col.names = F ) })


# ------------------------------------------------------
# 2. HUMAN PROTEIN ATLAS CORRELATION
# for blood immune-specific profile correlation
# ------------------------------------------------------
# load HPA tissue expression info & tidy up dataframe
blood = read.csv("hpa.blood.csv", row.names = 1)
blood = blood %>% group_by(gene.name) %>% arrange(gene.name, blood.cell, desc(blood.exp))
blood = blood %>% group_by(gene.name) %>% filter(!duplicated(blood.cell)) %>% select(gene.name, blood.cell, blood.exp)
blood = spread(blood, blood.cell, blood.cellexp) %>% data.frame()
rownames(blood) = blood$gene.name

# remove rows with > 50% missing values
blood = blood[which(rowMeans(!is.na(blood)) > 0.5), ]

# for purposes of this analysis, discard entries with noHPA tissue info
id.blood = genes
id.blood= id.blood[id.blood %in% blood$gene.name]

# correlate tissue expression data using the pre-set pearson coef for significance
corr.blood= list()
for (i in 1:length(id.blood)) {
ind = which(blood$gene.name %in% id.blood[i])
x = t(cor(t(blood[ind, -1]), t(blood[-ind, -1]), use = "complete.obs")) %>%
data.frame() %>%
mutate(id = rownames(.)) %>%
filter(abs(.[[1]]) >= r)
corr.blood[[i]] = unique(x$id) }
names(corr.blood) = id.blood

# output correlation results
x = corr.blood[lapply(corr.blood, length) > 0]
lapply(seq_along(x), function(ind) {
write.table(x[[ind]],
file = paste0("spatialExpCorr/", names(x)[ind], "_HPAblood.correlates", r, ".txt"), quote = F, row.names = F, col.names = F ) })
5 changes: 3 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -14,5 +14,6 @@

---

### SpatialExpressionCorrelation.R
> To extract "spatially-correlated" genes and isoforms that exhibit a significant overlap in organ-, tissue-, and cell type-specific expression profile
### [HPA-spatial-expression.R](HPA-spatial-expression.R)
> To extract "spatially-correlated" genes isoforms & proteins, exhibiting a significant overlap in organ-, tissue-, and cell type-specific expression profile,
> using data downloaded from __Human Protein Atlas (HPA)__ & then further adjusted to facilitate a streamlined analysis

0 comments on commit 96f7c8d

Please sign in to comment.