@@ -7,11 +7,9 @@
import numpy as np
import pandas as pd
import rpy2.robjects as ro
+ from fast_bioservices import BioDBNet, Input, Output
from GSEpipeline import load_gse_soft
from instruments import AffyIO
-
- # from fast_bioservices import BioDBNet, Input, Output
- from multi_bioservices.biodbnet import InputDatabase, OutputDatabase, TaxonID, db2db
from rpy2.robjects import pandas2ri

pandas2ri.activate()
@@ -21,8 +19,17 @@

# gse = load_gse_soft(gsename)

+ from fast_bioservices import BioDBNet, Input, Output
+

- def download_gsm_id_maps(datadir, gse, gpls: Optional[list[str]] = None, vendor="affy"):
+ def download_gsm_id_maps(
+     datadir,
+     gse,
+     biodbnet: BioDBNet,
+     taxon_id: int,
+     gpls: Optional[list[str]] = None,
+     vendor="affy",
+ ):
    """
    download ID to ENTREZ_GENE_ID maps, create a csv file for each platform, and return dictionary
    :param gpls:
@@ -46,18 +53,19 @@ def download_gsm_id_maps(datadir, gse, gpls: Optional[list[str]] = None, vendor=
        table["CONTROL_TYPE"] == "FALSE", "SPOT_ID"
    ].tolist()

-     temp = db2db(
+     temp = biodbnet.db2db(
        input_values=input_values,
-         input_db=InputDatabase.AGILENT_ID,
-         output_db=[OutputDatabase.GENE_ID, OutputDatabase.ENSEMBL_GENE_ID],
+         input_db=Input.AGILENT_ID,
+         output_db=[Output.GENE_ID, Output.ENSEMBL_GENE_ID],
+         taxon=taxon_id,
    )

    temp.drop(columns=["Ensembl Gene ID"], inplace=True)
    temp.reset_index(inplace=True)
    temp.rename(
        columns={
-             InputDatabase.AGILENT_ID.value: "ID",
-             OutputDatabase.GENE_ID.value: "ENTREZ_GENE_ID",
+             Input.AGILENT_ID.value: "ID",
+             Output.GENE_ID.value: "ENTREZ_GENE_ID",
        },
        inplace=True,
    )
@@ -74,14 +82,27 @@ def download_gsm_id_maps(datadir, gse, gpls: Optional[list[str]] = None, vendor=


class GSEproject:
-     def __init__(self, gsename, querytable, rootdir="../"):
+     def __init__(
+         self,
+         gsename,
+         querytable,
+         show_biodbnet_progress: bool = False,
+         use_biodbnet_cache: bool = True,
+         rootdir="../",
+     ):
        self.gsename = gsename
        # Setup paths
        self.querytable = querytable
        self.rootdir = rootdir
        self.datadir = os.path.join(self.rootdir, "data")
        self.outputdir = os.path.join(self.rootdir, "output")
        self.gene_dir = os.path.join(self.datadir, self.gsename + "_RAW")
+
+         self.biodbnet = BioDBNet(
+             show_progress=show_biodbnet_progress,
+             cache=use_biodbnet_cache,
+         )
+
        print(
            "Initialize project ({}):\n Root: {}\n Raw data: {}".format(
                self.gsename, self.rootdir, self.gene_dir
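(A minimal usage sketch of the new constructor, assuming a `querytable` DataFrame is already in scope; the accession is a hypothetical example. The two new flags map straight onto the `BioDBNet` client created above:

proj = GSEproject(
    "GSE2770",  # hypothetical GSE accession
    querytable,
    show_biodbnet_progress=True,  # forwarded as BioDBNet(show_progress=...)
    use_biodbnet_cache=True,  # forwarded as BioDBNet(cache=...)
)
)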
@@ -137,7 +158,13 @@ def get_gsm_tables(self):
        if not os.path.isfile(filepath):
            # Could improve to automatic download new tables based on platform
            gse = load_gse_soft(self.gsename)
-             download_gsm_id_maps(self.datadir, gse, gpls=[gpl], vendor=vendor)
+             download_gsm_id_maps(
+                 self.datadir,
+                 gse,
+                 gpls=[gpl],
+                 vendor=vendor,
+                 biodbnet=self.biodbnet,
+             )
            print("Skip Unsupported Platform: {}, {}".format(gpl, vendor))
            # continue
        temp = pd.read_csv(filepath)
@@ -225,16 +252,6 @@ def get_entrez_table_pipeline(self, fromcsv=True):
                output_db=[OutputDatabase.GENE_ID],
            )

-             outputdf = instruments.readagilent(
-                 platformdir, list(self.gsm_platform.keys())
-             )
-
-             gsm_maps[key] = db2db(
-                 input_values=list(map(str, list(outputdf["ProbeName"]))),
-                 input_db=InputDatabase.AGILENT_ID,
-                 output_db=[OutputDatabase.GENE_ID],
-             )
-
            gsm_maps[key].rename(
                columns={"Gene ID": "ENTREZ_GENE_ID"}, inplace=True
            )
@@ -271,23 +288,23 @@ def get_entrez_table_pipeline(self, fromcsv=True):
            how="outer",
        )

-         df_outer_sc500.dropna(how="all", inplace=True)  # type: ignore
-         print("Full: {}".format(df_outer_sc500.shape))  # type: ignore
-         df_outer_sc500.rename(str.lower, axis="columns", inplace=True)  # type: ignore
+         df_outer_sc500.dropna(how="all", inplace=True)
+         print("Full: {}".format(df_outer_sc500.shape))
+         df_outer_sc500.rename(str.lower, axis="columns", inplace=True)
        keys = []
        vals = []
        gsms_loaded = []

-         for col in list(df_outer_sc500):  # type: ignore
-             if ".cel.gz" in col:  # type: ignore
-                 strs = col.split(".cel.gz")  # type: ignore
+         for col in list(df_outer_sc500):
+             if ".cel.gz" in col:
+                 strs = col.split(".cel.gz")
                gsm = strs[0].split("_")[0]
                newcol = "{}.cel.gz{}".format(gsm, strs[-1])
                vals.append(newcol)
                keys.append(col)
                gsms_loaded.append(gsm)

-         df_outer_sc500.rename(columns=dict(zip(keys, vals)), inplace=True)  # type: ignore
+         df_outer_sc500.rename(columns=dict(zip(keys, vals)), inplace=True)
        gsms_loaded = list(set(gsms_loaded).union(set(self.gsm_platform.keys())))

        # Remove duplicated items, keep largest VALUE for each GSM
@@ -329,7 +346,7 @@ def get_entrez_table_pipeline(self, fromcsv=True):
        )

        try:
-             temp = df_outer_sc500.loc[:, [col1, col2, col3]]  # type: ignore
+             temp = df_outer_sc500.loc[:, [col1, col2, col3]]

        except:
            if key in list(self.gsm_platform.keys()):
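Taken together, the migration replaces module-level `db2db(...)` calls from `multi_bioservices` with an explicit `fast_bioservices` client object. A minimal end-to-end sketch of the new mapping call, using only names this diff introduces; the probe IDs and taxon value are hypothetical examples:

from fast_bioservices import BioDBNet, Input, Output

biodbnet = BioDBNet(show_progress=False, cache=True)  # mirrors GSEproject defaults

# Map Agilent probe IDs to Entrez and Ensembl gene IDs
result = biodbnet.db2db(
    input_values=["A_23_P100001", "A_23_P100011"],  # hypothetical Agilent probe IDs
    input_db=Input.AGILENT_ID,
    output_db=[Output.GENE_ID, Output.ENSEMBL_GENE_ID],
    taxon=9606,  # human; assumed example value
)

# Normalize the column name the same way download_gsm_id_maps() does
result.rename(columns={Output.GENE_ID.value: "ENTREZ_GENE_ID"}, inplace=True)
print(result.head())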