Skip to content

Commit

Permalink
first commit
Browse files Browse the repository at this point in the history
  • Loading branch information
akatrib committed Oct 19, 2020
1 parent 493b9e1 commit 493e23e
Show file tree
Hide file tree
Showing 3 changed files with 227 additions and 1 deletion.
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
.DS_Store
11 changes: 10 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
@@ -1 +1,10 @@
## Systems Biology Analytics
Systems Biology Analytics

#### geneSet-enrichment.R:
Enrichment analysis, leveraging Enrichr's comprehensive list of curated gene set libraries to extract significantly represented:
- Pathways, molecular functions & biological processes
- Co-expressed/-localized molecular factors & interactors
- Organs, tissues & cell types / compartments
- Phenotypes & clinical traits
- Diseases, pathologies & symptoms
- Drugs & Therapeutic Targets
216 changes: 216 additions & 0 deletions geneSet-enrichment.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,216 @@
# ————————————————————————————————————————————————————————————————————————————————
# geneSet-enrichment.R
# @AmalKatrib
#
# Objective: Leverage Enrichr's curated list of gene-set libraries to conduct enrichment analysis
# using an input of genes that were deemed as "interesting" from prior analysis
# Prerequisites:
# -- [ REQ'D INPUT ] Identify genes of interest & save in the appropriate folder
# -- [ OPTIONAL INPUT ] Extract spatially-correlated genes & save in the appropriate folder
# -- [ ONLINE ANALYSIS ] Use Enrichr.com to analyze & download enrichment results
# -- [ OFFLINE ANALYSIS ] Specify biological databases relevent to research question
#
# ————————————————————————————————————————————————————————————————————————————————
rm( list = ls (all = TRUE))
options(stringsAsFactors = F)

# load packages
library(dplyr)
library(stringr)
library(enrichR)

# --------------------------------------------------
# MANUAL INPUT
# --------------------------------------------------

# specify which dataset to analyze
# select from: "a", "b", "c"
type = "a"

# set corresponding directory for data input and analysis output
dirIn = ifelse(type == "a", "dirA1/", ifelse(type == "b", "dirB1/", ifelse(type == "c", "dirC1/", NA)))
dirAnalysis = ifelse(type == "a", "dirA2/", ifelse(type == "b", "dirB2/", ifelse(type == "c", "dirC2/", NA)))

# specify analysis type; select from:
# "allGene": to input protein biomarker + ALL spatially-correlated proteins from Human Protein Atlas & Allen Human Brain Atlas
# "intersectingGene": to input protein biomarker + ONLY OVERLAPPING spatially-correlated proteins from Human Protein Atlas & Allen Human Brain Atlas
analysisType = "allGene" # "allGene", "intersectingGene"

# set enrichr filtering parameters, where (1) is for analysis with spatially-similar genes and (2) for analysis without
p1 = 0.05 # Fisher exact p-val
p2 = 1
c1 = 10 # Combined Score = log(p-value) * z.score
c2 = 10
d = 0.25 # limit # of findings from one single database by removing those in the bottom x% for combined score

# --------------------------------------------------
# DATA INPUT
# ..............
setwd(dirAnalysis)
# --------------------------------------------------

# load gene list & adjust data input format to remove duplicates & sort
genes = read.table("genes.txt"()
genes = genes[,1] %>% unique() %>% sort

# --------------------------------------------------
# PRELIMINARY DATA ANALYSIS
# --------------------------------------------------

# load genes with similar spatial expression (from prior analysis))
genes.similar = list.files(path = dirAnalysis, full.names = T)
genes.similar = genes.similar[grep(analysisType, genes.similar)]
names = gsub(".*//(.*)\\_all.*", "\\1", genes.similar)
genes.similar = lapply(genes.similar, function(x) read.table(x) %>% unlist(use.names = F))
names(genes.similar) = names

# save non-empty entries
lapply(seq_along(genes.in.similar), function(i) {
write.table(genes.in.similar[[i]],
file = paste0(names(genes.in.similar)[i], "_spatiallySimilarGenes.txt"), row.names = F, col.names = F, quote = F) })

# save altogether
write.table(c(names(genes.in.similar), unlist(genes.in.similar, use.names = F)) %>% unique,
file = "listAllSpatiallySimilarGenes.txt", row.names = F, col.names = F, quote = F)

# now make sure we include the gene/protein biomarker itself to the list
genes.similar = lapply(seq_along(genes.similar), function(i) c(names(genes.similar)[i], genes.similar[[i]]))
names(genes.similar) = names


# --------------------------------------------------
# ENRICHR GENE-SET ENRICHMENT ANALYSIS
# --------------------------------------------------

#### ONLINE: get enrichr results from gene query search online
x = list.files()[grep(".txt", list.files())]
enrichr = lapply(1:length(x), function(i) read.csv(x[i], header = T, sep = "\t"))
names(enrichr) = gsub("_table.txt", "", x)


#### OFFLINE: using enrichr libraries
# grab all libraries from EnrichR. Make sure to comment out irrelevant libraries,
# for example, those that are: specific to a disease, outdated, etc.
dbs = listEnrichrDbs() %>% arrange(desc(libraryName))
ind = c(grep("mouse", dbs$libraryName, ignore.case = T),
grep("cancer", dbs$libraryName, ignore.case = T),
grep("onco", dbs$libraryName, ignore.case = T),
grep("computational", dbs$libraryName, ignore.case = T),
grep("microbe", dbs$libraryName, ignore.case = T),
grep("achilles", dbs$libraryName, ignore.case = T),
grep("virus", dbs$libraryName, ignore.case = T),
grep("muscle", dbs$libraryName, ignore.case = T),
grep("virus", dbs$libraryName, ignore.case = T),
grep("2017b", dbs$libraryName, ignore.case = T),
grep("gtex", dbs$libraryName, ignore.case = T),
grep("crispr", dbs$libraryName, ignore.case = T),
grep("LINCS", dbs$libraryName, ignore.case = T),
grep("GWAS", dbs$libraryName, ignore.case = T),
grep("NIH_Funded_", dbs$libraryName, ignore.case = T),
grep("microRNA", dbs$libraryName, ignore.case = T),
grep("miRTarBase", dbs$libraryName, ignore.case = T),
grep("Transcription", dbs$libraryName, ignore.case = T),
grep("TF-LOF", dbs$libraryName, ignore.case = T),
grep("TF_", dbs$libraryName, ignore.case = T),
grep("Phosphatase", dbs$libraryName, ignore.case = T),
grep("Pfam", dbs$libraryName, ignore.case = T),
grep("InterPro", dbs$libraryName, ignore.case = T),
grep("Homolo", dbs$libraryName, ignore.case = T),
grep("CHEA_", dbs$libraryName, ignore.case = T),
grep("_Coexp", dbs$libraryName, ignore.case = T),
grep("BioPlex", dbs$libraryName, ignore.case = T),
grep("CORUM", dbs$libraryName, ignore.case = T),
grep("NURSA", dbs$libraryName, ignore.case = T),
grep("Allen", dbs$libraryName, ignore.case = T),
grep("Aging", dbs$libraryName, ignore.case = T),
grep("MCF7", dbs$libraryName, ignore.case = T),
grep("lncHub", dbs$libraryName, ignore.case = T),
grep("Kinase", dbs$libraryName, ignore.case = T),
grep("KEA_", dbs$libraryName, ignore.case = T),
grep("Rare_Diseases", dbs$libraryName, ignore.case = T),
grep("SubCell", dbs$libraryName, ignore.case = T),
grep("JASPAR", dbs$libraryName, ignore.case = T),
grep("Ligand", dbs$libraryName, ignore.case = T),
grep("L1000", dbs$libraryName, ignore.case = T),
grep("SILAC", dbs$libraryName, ignore.case = T),
grep("Gene_Atlas", dbs$libraryName, ignore.case = T),
grep("Browser_", dbs$libraryName, ignore.case = T),
grep("Histone", dbs$libraryName, ignore.case = T),
grep("Metabolites", dbs$libraryName, ignore.case = T),
grep("Epigenomics", dbs$libraryName, ignore.case = T),
grep("PPI_", dbs$libraryName, ignore.case = T),
grep("ESCAPE", dbs$libraryName, ignore.case = T),
grep("ENCODE", dbs$libraryName, ignore.case = T),
grep("Chromosome", dbs$libraryName, ignore.case = T),
grep("CMAP_", dbs$libraryName, ignore.case = T))

dbs_tmp = dbs
dbs_tmp$libraryName = gsub('_Human', '', dbs_tmp$libraryName)
dbs_tmp$libraryName = gsub('_[[:digit:]]+', '', dbs_tmp$libraryName)

ind = c(ind, which(duplicated(dbs_tmp$libraryName))) %>% unique()
dbs = dbs$libraryName[-ind]
rm(dbs_tmp)

# run enichR offline for genes WITH spatially-similar genes
enrichr = lapply(genes.similar, function(i) enrichr(i, dbs))
enrichr_copy = enrichr #save copy

# remove missing database entries in list
for (i in 1:length(enrichr)) {
enrichr[[i]] = Filter(function(x) nrow(x) > 0, enrichr[[i]])
enrichr[[i]] = Filter(function(x) !is.null(x), enrichr[[i]]) }


# --------------------------------------------------
# ENRICHR RESULTS FILTERING
# --------------------------------------------------

# filter enrichr results to extract top enriched terms (pathways, processes, etc.),
# arrange by % genes/term; overlapping gene set size; combined score,
# and then filter to only keep top 50-75% entries from each db
top = list()

#### Enrichment filtering for all spatially-similar genes
for (i in 1:length(enrichr)) {
top[[i]] = lapply(enrichr[[i]], function(x) x %>% filter(Combined.Score >= c1, P.value <= p1))
top[[i]] = lapply(top[[i]], function(x) x %>%
select(Term, Genes, P.value, Adjusted.P.value, Combined.Score, Overlap)) %>%
do.call(rbind.data.frame, .)

top[[i]] = top[[i]] %>% mutate(db = gsub("\\.*", "", rownames(top[[i]])))
top[[i]]$db = gsub('[[:digit:]]+', '', top[[i]]$db)
top[[i]]$db = gsub("\\_$", "", top[[i]]$db)
rownames(top[[i]]) = 1:nrow(top[[i]])
top[[i]] = top[[i]] %>%
arrange(desc(as.integer(str_count(top[[i]]$Genes, ";")/as.integer(gsub(".*/", "", top[[i]]$Overlap)))),
desc(str_count(top[[i]]$Genes, ";"))) %>%
group_by(db) %>%
#filter(Combined.Score > quantile(Combined.Score, d)) %>%
arrange(db) %>%
as.data.frame() }

names(top) = names

# add space before "Overlap" column to prevent excel converstion to date format
for (i in 1:length(top)) { top[[i]]$Overlap = paste(" ", top[[i]]$Overlap) }


# --------------------------------------------------
# SAVE RESULTS
# ..............
setwd("../../FunctionalNetworkAnalysis/")
# --------------------------------------------------

# create or use existing gene list folders to save enrichment results
lapply(seq_along(top), function(i) {
dir.create(file.path(names(top)[i]), showWarnings = F)
write.csv(top[[i]], file = paste0(names(top)[i], "/", names(top)[i], "_enrichR_withSpatiallySimilarGenes.csv"), row.names = F )})

# --------------------------------------------------
# SAVE SESSION
# --------------------------------------------------

# save workspace + session info
save( list = ls(), file = paste0("SessionInfo/FunctionalInsights", "_", substring(Sys.Date(), 3), ".Rdata"))
writeLines(capture.output(sessionInfo()), "SessionInfo/FunctionalInsights_SessionInfo.txt")

0 comments on commit 493e23e

Please sign in to comment.