Skip to content

Commit 585035a

Browse files
committed
added updated gene file
1 parent 7b3cf19 commit 585035a

File tree

2 files changed

+27
-13
lines changed

2 files changed

+27
-13
lines changed

build/genes/00-buildGeneFile.R

Lines changed: 23 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,8 @@ library(dplyr)
1010
##get entrez ids to symbol
1111
entrez<-as.data.frame(org.Hs.egALIAS2EG)
1212

13+
sym <- as.data.frame(org.Hs.egSYMBOL)
14+
1315
##get entrez ids to ensembl
1416
ens<-as.data.frame(org.Hs.egENSEMBL2EG)
1517

@@ -22,21 +24,30 @@ ensembl <- useEnsembl(biomart = "genes", dataset = "hsapiens_gene_ensembl")
2224
tab <- getBM(attributes=c('ensembl_gene_id'),filters='biotype', values=c('protein_coding'),mart=ensembl)
2325

2426

25-
joined.df<-entrez%>%full_join(ens)%>%
26-
dplyr::rename(entrez_id='gene_id',gene_symbol='alias_symbol',other_id='ensembl_id')%>%
27-
mutate(other_id_source='ensembl_gene')|>
28-
mutate(is_protein=other_id%in%tab$ensembl_gene_id)|>
29-
subset(is_protein)|>
30-
dplyr::select(-is_protein)
27+
joined.df<-entrez|>
28+
left_join(sym)|>
29+
dplyr::rename(entrez_id='gene_id',gene_symbol='symbol',other_id='alias_symbol',gene_symbol='symbol')%>%
30+
mutate(other_id_source='entrez_alias')
31+
32+
##now get aliases from ensembl
33+
edf <- sym|>
34+
inner_join(ens)|>
35+
dplyr::rename(entrez_id='gene_id',gene_symbol='symbol',other_id='ensembl_id')%>%
36+
mutate(other_id_source='ensembl_gene')
37+
3138

32-
tdf<-entrez|>
33-
full_join(enst)|>
34-
dplyr::rename(entrez_id='gene_id',gene_symbol='alias_symbol',other_id='trans_id')|>
35-
subset(entrez_id%in%joined.df$entrez_id)|>
36-
subset(gene_symbol%in%joined.df$gene_symbol)|>
39+
tdf<-sym|>
40+
inner_join(enst)|>
41+
dplyr::rename(entrez_id='gene_id',gene_symbol='symbol',other_id='trans_id')|>
42+
subset(entrez_id%in%edf$entrez_id)|>
43+
# subset(gene_symbol%in%ed.df$gene_symbol)|>
3744
dplyr::mutate(other_id_source='ensembl_transcript')
3845

39-
joined.df<-rbind(joined.df,tdf)|>
46+
47+
prots<-subset(edf,other_id%in%tab$ensembl_gene_id)
48+
49+
full.df<-rbind(joined.df,edf,tdf)|>
50+
subset(entrez_id%in%prots$entrez_id)|>
4051
distinct()
4152

4253
#save to file and version

build/pancpdo/02a-getPancPDODataFromSynapse.py

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -101,6 +101,7 @@ def main():
101101

102102
sc = synapseclient.login(args.token)
103103

104+
missingsamples = []
104105
if args.copy:
105106
##query synapse view for files
106107
cnvs = sc.tableQuery("select * from syn64608378 where parentId='syn64608163'").asDataFrame()
@@ -115,6 +116,7 @@ def main():
115116
sampid = samps.loc[samps.other_id==sname]['improve_sample_id'].values[0]
116117
else:
117118
print('Missing sample id for '+sname)
119+
missingsamples.append('copy,'+sname)
118120
continue
119121
sampid = samps.loc[samps.other_id==sname]['improve_sample_id'].values[0]
120122
res = parseCNVFile(path,sampid, genes)
@@ -134,13 +136,14 @@ def main():
134136
sampid = samps.loc[samps.other_id==sname]['improve_sample_id'].values[0]
135137
else:
136138
print('Missing sample id for '+sname)
139+
missingsamples.append('mutation,'+sname)
137140
continue
138141
path = sc.get(sid).path
139142
sampid = samps.loc[samps.other_id==sname]['improve_sample_id'].values[0]
140143
res = parseMutFile(path,sampid, genes)
141144
alldats.append(res)
142145
newmut = pd.concat(alldats)
143146
newmut.to_csv("/tmp/pancpdo_mutations.csv.gz",compression='gzip',index=False)
144-
147+
missingsamples.to_csv('missing.csv')
145148
if __name__=='__main__':
146149
main()

0 commit comments

Comments
 (0)