@@ -10,6 +10,8 @@ library(dplyr)
10
10
# #get entrez ids to symbol
11
11
entrez <- as.data.frame(org.Hs.egALIAS2EG )
12
12
13
+ sym <- as.data.frame(org.Hs.egSYMBOL )
14
+
13
15
# #get entrez ids to ensembl
14
16
ens <- as.data.frame(org.Hs.egENSEMBL2EG )
15
17
@@ -22,26 +24,34 @@ ensembl <- useEnsembl(biomart = "genes", dataset = "hsapiens_gene_ensembl")
22
24
tab <- getBM(attributes = c(' ensembl_gene_id' ),filters = ' biotype' , values = c(' protein_coding' ),mart = ensembl )
23
25
24
26
25
- joined.df <- entrez %> %full_join(ens )%> %
26
- dplyr :: rename(entrez_id = ' gene_id' ,gene_symbol = ' alias_symbol' ,other_id = ' ensembl_id' )%> %
27
- mutate(other_id_source = ' ensembl_gene' )| >
28
- mutate(is_protein = other_id %in% tab $ ensembl_gene_id )| >
29
- subset(is_protein )| >
30
- dplyr :: select(- is_protein )
27
+ joined.df <- entrez | >
28
+ left_join(sym )| >
29
+ dplyr :: rename(entrez_id = ' gene_id' ,gene_symbol = ' symbol' ,other_id = ' alias_symbol' )%> % # each new column name is mapped exactly once
30
+ mutate(other_id_source = ' entrez_alias' )
31
+
32
+ # #now get ensembl gene ids for each entrez symbol
33
+ edf <- sym | >
34
+ inner_join(ens )| >
35
+ dplyr :: rename(entrez_id = ' gene_id' ,gene_symbol = ' symbol' ,other_id = ' ensembl_id' )%> %
36
+ mutate(other_id_source = ' ensembl_gene' )
37
+
31
38
32
- tdf <- entrez | >
33
- full_join (enst )| >
34
- dplyr :: rename(entrez_id = ' gene_id' ,gene_symbol = ' alias_symbol ' ,other_id = ' trans_id' )| >
35
- subset(entrez_id %in% joined.df $ entrez_id )| >
36
- subset(gene_symbol %in% joined .df$ gene_symbol )| >
39
+ tdf <- sym | >
40
+ inner_join (enst )| >
41
+ dplyr :: rename(entrez_id = ' gene_id' ,gene_symbol = ' symbol ' ,other_id = ' trans_id' )| >
42
+ subset(entrez_id %in% edf $ entrez_id )| >
43
+ # subset(gene_symbol%in%ed .df$gene_symbol)|>
37
44
dplyr :: mutate(other_id_source = ' ensembl_transcript' )
38
45
39
- joined.df <- rbind(joined.df ,tdf )| >
46
+
47
+ prots <- subset(edf ,other_id %in% tab $ ensembl_gene_id ) # edf rows whose ensembl gene id appears in the biomaRt protein_coding query result (tab)
48
+
49
+ full.df <- rbind(joined.df ,edf ,tdf )| > # stack the entrez_alias, ensembl_gene and ensembl_transcript rows
50
+ subset(entrez_id %in% prots $ entrez_id )| > # keep only entrez ids that are present in prots
40
51
distinct() # drop exact duplicate rows
41
52
42
53
# save to file and version
43
54
write.table(full.df ,' /tmp/genes.csv' ,sep = ' ,' ,row.names = FALSE ,quote = TRUE ) # write the combined, filtered table (full.df) rather than only the alias rows in joined.df
44
55
45
56
# #store this file somewhere!
46
57
47
-
0 commit comments