Skip to content

Commit 358e95c

Browse files
authored
Replaced old 00-buildGeneFile.R with new version from Sara
1 parent f66a6a7 commit 358e95c

File tree

1 file changed

+23
-13
lines changed

1 file changed

+23
-13
lines changed

build/genes/00-buildGeneFile.R

Lines changed: 23 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,8 @@ library(dplyr)
1010
##get entrez ids to symbol
1111
entrez<-as.data.frame(org.Hs.egALIAS2EG)
1212

13+
sym <- as.data.frame(org.Hs.egSYMBOL)
14+
1315
##get entrez ids to ensembl
1416
ens<-as.data.frame(org.Hs.egENSEMBL2EG)
1517

@@ -22,26 +24,34 @@ ensembl <- useEnsembl(biomart = "genes", dataset = "hsapiens_gene_ensembl")
2224
tab <- getBM(attributes=c('ensembl_gene_id'),filters='biotype', values=c('protein_coding'),mart=ensembl)
2325

2426

25-
joined.df<-entrez%>%full_join(ens)%>%
26-
dplyr::rename(entrez_id='gene_id',gene_symbol='alias_symbol',other_id='ensembl_id')%>%
27-
mutate(other_id_source='ensembl_gene')|>
28-
mutate(is_protein=other_id%in%tab$ensembl_gene_id)|>
29-
subset(is_protein)|>
30-
dplyr::select(-is_protein)
27+
joined.df<-entrez|>
28+
left_join(sym)|>
29+
dplyr::rename(entrez_id='gene_id',gene_symbol='symbol',other_id='alias_symbol',gene_symbol='symbol')%>%
30+
mutate(other_id_source='entrez_alias')
31+
32+
##now get aliases from ensembl
33+
edf <- sym|>
34+
inner_join(ens)|>
35+
dplyr::rename(entrez_id='gene_id',gene_symbol='symbol',other_id='ensembl_id')%>%
36+
mutate(other_id_source='ensembl_gene')
37+
3138

32-
tdf<-entrez|>
33-
full_join(enst)|>
34-
dplyr::rename(entrez_id='gene_id',gene_symbol='alias_symbol',other_id='trans_id')|>
35-
subset(entrez_id%in%joined.df$entrez_id)|>
36-
subset(gene_symbol%in%joined.df$gene_symbol)|>
39+
tdf<-sym|>
40+
inner_join(enst)|>
41+
dplyr::rename(entrez_id='gene_id',gene_symbol='symbol',other_id='trans_id')|>
42+
subset(entrez_id%in%edf$entrez_id)|>
43+
# subset(gene_symbol%in%edf$gene_symbol)|>
3744
dplyr::mutate(other_id_source='ensembl_transcript')
3845

39-
joined.df<-rbind(joined.df,tdf)|>
46+
47+
prots<-subset(edf,other_id%in%tab$ensembl_gene_id)
48+
49+
full.df<-rbind(joined.df,edf,tdf)|>
50+
subset(entrez_id%in%prots$entrez_id)|>
4051
distinct()
4152

4253
#save to file and version
4354
# Export the combined table (entrez aliases + ensembl genes + ensembl
# transcripts, restricted to protein-coding entrez ids) built above as full.df.
# joined.df holds only the entrez-alias rows, so writing it would drop the
# ensembl gene/transcript identifiers from the output file.
write.table(full.df, '/tmp/genes.csv', sep = ',', row.names = FALSE, quote = TRUE)
4455

4556
##store this file somewhere!
4657

47-

0 commit comments

Comments
 (0)