-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Showing
3 changed files
with
227 additions
and
1 deletion.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1 @@ | ||
.DS_Store |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1 +1,10 @@ | ||
## Systems Biology Analytics | ||
Systems Biology Analytics | ||
|
||
#### geneSet-enrichment.R: | ||
Enrichment analysis, leveraging Enrichr's comprehensive list of curated gene set libraries to extract significantly represented: | ||
- Pathways, molecular functions & biological processes | ||
- Co-expressed/-localized molecular factors & interactors | ||
- Organs, tissues & cell types / compartments | ||
- Phenotypes & clinical traits | ||
- Diseases, pathologies & symptoms | ||
- Drugs & Therapeutic Targets |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,216 @@ | ||
# ———————————————————————————————————————————————————————————————————————————————— | ||
# geneSet-enrichment.R | ||
# @AmalKatrib | ||
# | ||
# Objective: Leverage Enrichr's curated list of gene-set libraries to conduct enrichment analysis | ||
# using an input of genes that were deemed as "interesting" from prior analysis | ||
# Prerequisites: | ||
# -- [ REQ'D INPUT ] Identify genes of interest & save in the appropriate folder | ||
# -- [ OPTIONAL INPUT ] Extract spatially-correlated genes & save in the appropriate folder | ||
# -- [ ONLINE ANALYSIS ] Use Enrichr.com to analyze & download enrichment results | ||
# -- [ OFFLINE ANALYSIS ] Specify biological databases relevent to research question | ||
# | ||
# ———————————————————————————————————————————————————————————————————————————————— | ||
rm( list = ls (all = TRUE)) | ||
options(stringsAsFactors = F) | ||
|
||
# load packages | ||
library(dplyr) | ||
library(stringr) | ||
library(enrichR) | ||
|
||
# -------------------------------------------------- | ||
# MANUAL INPUT | ||
# -------------------------------------------------- | ||
|
||
# specify which dataset to analyze | ||
# select from: "a", "b", "c" | ||
type = "a" | ||
|
||
# set corresponding directory for data input and analysis output | ||
dirIn = ifelse(type == "a", "dirA1/", ifelse(type == "b", "dirB1/", ifelse(type == "c", "dirC1/", NA))) | ||
dirAnalysis = ifelse(type == "a", "dirA2/", ifelse(type == "b", "dirB2/", ifelse(type == "c", "dirC2/", NA))) | ||
|
||
# specify analysis type; select from: | ||
# "allGene": to input protein biomarker + ALL spatially-correlated proteins from Human Protein Atlas & Allen Human Brain Atlas | ||
# "intersectingGene": to input protein biomarker + ONLY OVERLAPPING spatially-correlated proteins from Human Protein Atlas & Allen Human Brain Atlas | ||
analysisType = "allGene" # "allGene", "intersectingGene" | ||
|
||
# set enrichr filtering parameters, where (1) is for analysis with spatially-similar genes and (2) for analysis without | ||
p1 = 0.05 # Fisher exact p-val | ||
p2 = 1 | ||
c1 = 10 # Combined Score = log(p-value) * z.score | ||
c2 = 10 | ||
d = 0.25 # limit # of findings from one single database by removing those in the bottom x% for combined score | ||
|
||
# -------------------------------------------------- | ||
# DATA INPUT | ||
# .............. | ||
setwd(dirAnalysis) | ||
# -------------------------------------------------- | ||
|
||
# load gene list & adjust data input format to remove duplicates & sort | ||
genes = read.table("genes.txt"() | ||
genes = genes[,1] %>% unique() %>% sort | ||
|
||
# -------------------------------------------------- | ||
# PRELIMINARY DATA ANALYSIS | ||
# -------------------------------------------------- | ||
|
||
# load genes with similar spatial expression (from prior analysis)) | ||
genes.similar = list.files(path = dirAnalysis, full.names = T) | ||
genes.similar = genes.similar[grep(analysisType, genes.similar)] | ||
names = gsub(".*//(.*)\\_all.*", "\\1", genes.similar) | ||
genes.similar = lapply(genes.similar, function(x) read.table(x) %>% unlist(use.names = F)) | ||
names(genes.similar) = names | ||
|
||
# save non-empty entries | ||
lapply(seq_along(genes.in.similar), function(i) { | ||
write.table(genes.in.similar[[i]], | ||
file = paste0(names(genes.in.similar)[i], "_spatiallySimilarGenes.txt"), row.names = F, col.names = F, quote = F) }) | ||
|
||
# save altogether | ||
write.table(c(names(genes.in.similar), unlist(genes.in.similar, use.names = F)) %>% unique, | ||
file = "listAllSpatiallySimilarGenes.txt", row.names = F, col.names = F, quote = F) | ||
|
||
# now make sure we include the gene/protein biomarker itself to the list | ||
genes.similar = lapply(seq_along(genes.similar), function(i) c(names(genes.similar)[i], genes.similar[[i]])) | ||
names(genes.similar) = names | ||
|
||
|
||
# -------------------------------------------------- | ||
# ENRICHR GENE-SET ENRICHMENT ANALYSIS | ||
# -------------------------------------------------- | ||
|
||
#### ONLINE: get enrichr results from gene query search online | ||
x = list.files()[grep(".txt", list.files())] | ||
enrichr = lapply(1:length(x), function(i) read.csv(x[i], header = T, sep = "\t")) | ||
names(enrichr) = gsub("_table.txt", "", x) | ||
|
||
|
||
#### OFFLINE: using enrichr libraries | ||
# grab all libraries from EnrichR. Make sure to comment out irrelevant libraries, | ||
# for example, those that are: specific to a disease, outdated, etc. | ||
dbs = listEnrichrDbs() %>% arrange(desc(libraryName)) | ||
ind = c(grep("mouse", dbs$libraryName, ignore.case = T), | ||
grep("cancer", dbs$libraryName, ignore.case = T), | ||
grep("onco", dbs$libraryName, ignore.case = T), | ||
grep("computational", dbs$libraryName, ignore.case = T), | ||
grep("microbe", dbs$libraryName, ignore.case = T), | ||
grep("achilles", dbs$libraryName, ignore.case = T), | ||
grep("virus", dbs$libraryName, ignore.case = T), | ||
grep("muscle", dbs$libraryName, ignore.case = T), | ||
grep("virus", dbs$libraryName, ignore.case = T), | ||
grep("2017b", dbs$libraryName, ignore.case = T), | ||
grep("gtex", dbs$libraryName, ignore.case = T), | ||
grep("crispr", dbs$libraryName, ignore.case = T), | ||
grep("LINCS", dbs$libraryName, ignore.case = T), | ||
grep("GWAS", dbs$libraryName, ignore.case = T), | ||
grep("NIH_Funded_", dbs$libraryName, ignore.case = T), | ||
grep("microRNA", dbs$libraryName, ignore.case = T), | ||
grep("miRTarBase", dbs$libraryName, ignore.case = T), | ||
grep("Transcription", dbs$libraryName, ignore.case = T), | ||
grep("TF-LOF", dbs$libraryName, ignore.case = T), | ||
grep("TF_", dbs$libraryName, ignore.case = T), | ||
grep("Phosphatase", dbs$libraryName, ignore.case = T), | ||
grep("Pfam", dbs$libraryName, ignore.case = T), | ||
grep("InterPro", dbs$libraryName, ignore.case = T), | ||
grep("Homolo", dbs$libraryName, ignore.case = T), | ||
grep("CHEA_", dbs$libraryName, ignore.case = T), | ||
grep("_Coexp", dbs$libraryName, ignore.case = T), | ||
grep("BioPlex", dbs$libraryName, ignore.case = T), | ||
grep("CORUM", dbs$libraryName, ignore.case = T), | ||
grep("NURSA", dbs$libraryName, ignore.case = T), | ||
grep("Allen", dbs$libraryName, ignore.case = T), | ||
grep("Aging", dbs$libraryName, ignore.case = T), | ||
grep("MCF7", dbs$libraryName, ignore.case = T), | ||
grep("lncHub", dbs$libraryName, ignore.case = T), | ||
grep("Kinase", dbs$libraryName, ignore.case = T), | ||
grep("KEA_", dbs$libraryName, ignore.case = T), | ||
grep("Rare_Diseases", dbs$libraryName, ignore.case = T), | ||
grep("SubCell", dbs$libraryName, ignore.case = T), | ||
grep("JASPAR", dbs$libraryName, ignore.case = T), | ||
grep("Ligand", dbs$libraryName, ignore.case = T), | ||
grep("L1000", dbs$libraryName, ignore.case = T), | ||
grep("SILAC", dbs$libraryName, ignore.case = T), | ||
grep("Gene_Atlas", dbs$libraryName, ignore.case = T), | ||
grep("Browser_", dbs$libraryName, ignore.case = T), | ||
grep("Histone", dbs$libraryName, ignore.case = T), | ||
grep("Metabolites", dbs$libraryName, ignore.case = T), | ||
grep("Epigenomics", dbs$libraryName, ignore.case = T), | ||
grep("PPI_", dbs$libraryName, ignore.case = T), | ||
grep("ESCAPE", dbs$libraryName, ignore.case = T), | ||
grep("ENCODE", dbs$libraryName, ignore.case = T), | ||
grep("Chromosome", dbs$libraryName, ignore.case = T), | ||
grep("CMAP_", dbs$libraryName, ignore.case = T)) | ||
|
||
dbs_tmp = dbs | ||
dbs_tmp$libraryName = gsub('_Human', '', dbs_tmp$libraryName) | ||
dbs_tmp$libraryName = gsub('_[[:digit:]]+', '', dbs_tmp$libraryName) | ||
|
||
ind = c(ind, which(duplicated(dbs_tmp$libraryName))) %>% unique() | ||
dbs = dbs$libraryName[-ind] | ||
rm(dbs_tmp) | ||
|
||
# run enichR offline for genes WITH spatially-similar genes | ||
enrichr = lapply(genes.similar, function(i) enrichr(i, dbs)) | ||
enrichr_copy = enrichr #save copy | ||
|
||
# remove missing database entries in list | ||
for (i in 1:length(enrichr)) { | ||
enrichr[[i]] = Filter(function(x) nrow(x) > 0, enrichr[[i]]) | ||
enrichr[[i]] = Filter(function(x) !is.null(x), enrichr[[i]]) } | ||
|
||
|
||
# -------------------------------------------------- | ||
# ENRICHR RESULTS FILTERING | ||
# -------------------------------------------------- | ||
|
||
# filter enrichr results to extract top enriched terms (pathways, processes, etc.), | ||
# arrange by % genes/term; overlapping gene set size; combined score, | ||
# and then filter to only keep top 50-75% entries from each db | ||
top = list() | ||
|
||
#### Enrichment filtering for all spatially-similar genes | ||
for (i in 1:length(enrichr)) { | ||
top[[i]] = lapply(enrichr[[i]], function(x) x %>% filter(Combined.Score >= c1, P.value <= p1)) | ||
top[[i]] = lapply(top[[i]], function(x) x %>% | ||
select(Term, Genes, P.value, Adjusted.P.value, Combined.Score, Overlap)) %>% | ||
do.call(rbind.data.frame, .) | ||
|
||
top[[i]] = top[[i]] %>% mutate(db = gsub("\\.*", "", rownames(top[[i]]))) | ||
top[[i]]$db = gsub('[[:digit:]]+', '', top[[i]]$db) | ||
top[[i]]$db = gsub("\\_$", "", top[[i]]$db) | ||
rownames(top[[i]]) = 1:nrow(top[[i]]) | ||
top[[i]] = top[[i]] %>% | ||
arrange(desc(as.integer(str_count(top[[i]]$Genes, ";")/as.integer(gsub(".*/", "", top[[i]]$Overlap)))), | ||
desc(str_count(top[[i]]$Genes, ";"))) %>% | ||
group_by(db) %>% | ||
#filter(Combined.Score > quantile(Combined.Score, d)) %>% | ||
arrange(db) %>% | ||
as.data.frame() } | ||
|
||
names(top) = names | ||
|
||
# add space before "Overlap" column to prevent excel converstion to date format | ||
for (i in 1:length(top)) { top[[i]]$Overlap = paste(" ", top[[i]]$Overlap) } | ||
|
||
|
||
# -------------------------------------------------- | ||
# SAVE RESULTS | ||
# .............. | ||
setwd("../../FunctionalNetworkAnalysis/") | ||
# -------------------------------------------------- | ||
|
||
# create or use existing gene list folders to save enrichment results | ||
lapply(seq_along(top), function(i) { | ||
dir.create(file.path(names(top)[i]), showWarnings = F) | ||
write.csv(top[[i]], file = paste0(names(top)[i], "/", names(top)[i], "_enrichR_withSpatiallySimilarGenes.csv"), row.names = F )}) | ||
|
||
# -------------------------------------------------- | ||
# SAVE SESSION | ||
# -------------------------------------------------- | ||
|
||
# save workspace + session info | ||
save( list = ls(), file = paste0("SessionInfo/FunctionalInsights", "_", substring(Sys.Date(), 3), ".Rdata")) | ||
writeLines(capture.output(sessionInfo()), "SessionInfo/FunctionalInsights_SessionInfo.txt") |