Skip to content

Commit ce2c833

Browse files
author
Josh Loecker
authored
Use fast_bioservices instead of multi_bioservices (#181)
* Fix conflicting package requirements * Initial `ruff` formatting * Added command line arguments to expose additional options * Fixed imports, `ruff` formatting and import sorting * Ignore Rout files * Migrate from `multi_bioservices` to `fast_bioservices` This also formats files using `ruff`, which makes it appear as if there are many more changes than actually occurred * Use fast_bioservices instead of multi_bioservices * Format with ruff, use fast_bioservices instead of multi_bioservices. This file may be deleted/reorganized because it is related to microarray, which we are removing from COMO * Fix arguments, add biodbnet progress * Fix `Input` usage * Fix argument formatting usage * Fix argument usage * Ignore microarray.db * Fix multi_bioservices import skip microarray tests
1 parent 7ad61b5 commit ce2c833

15 files changed

+387
-224
lines changed

.gitignore

+1
Original file line numberDiff line numberDiff line change
@@ -22,3 +22,4 @@ main/microarray.db
2222
main/data/config_sheets/*
2323
main/data/GSE*_RAW
2424
main/data/gpl*entrez.csv
25+
main/src/microarray.db

environment.yaml

+1-1
Original file line numberDiff line numberDiff line change
@@ -60,10 +60,10 @@ dependencies:
6060
# - conda-forge::xlrd~=2.0.1
6161
- gurobi::gurobi
6262
- pip:
63-
- multi_bioservices
6463
# - escher==1.7.3
6564
- git+https://github.com/JoshLoecker/escher.git@python38#subdirectory=py
6665
- framed==0.5.*
6766
- memote<=1.0
6867
- git+https://github.com/JoshLoecker/cobamp.git
6968
- git+https://github.com/JoshLoecker/troppo.git
69+
- git+https://github.com/JoshLoecker/fast_bioservices.git

main/COMO.ipynb

+12
Original file line numberDiff line numberDiff line change
@@ -799,11 +799,17 @@
799799
"mrna_weight = 6\n",
800800
"single_cell_weight = 6\n",
801801
"proteomics_weight = 10\n",
802+
"taxon_id = 9606 # Human\n",
803+
"show_biodbnet_progress = True\n",
804+
"use_biodbnet_cache = True\n",
802805
"\n",
803806
"cmd = \" \".join(\n",
804807
" [\n",
805808
" \"python3\", \"src/merge_xomics.py\",\n",
806809
" \"--merge-distribution\",\n",
810+
" \"--taxon-id\", f\"{taxon_id}\",\n",
811+
" \"--show-biodbnet-progress\", f\"{show_biodbnet_progress}\",\n",
812+
" \"--use-biodbnet-cache\", f\"{use_biodbnet_cache}\",\n",
807813
" #\"--microarray-config-file\", f\"{microarray_config_file}\", # If using micro-array, uncomment the start of this line\n",
808814
" \"--total-rnaseq-config-file\", f\"{trnaseq_config_file}\",\n",
809815
" # \"--mrnaseq-config-file\", f\"{mrnaseq_config_file}\",\n",
@@ -1229,6 +1235,9 @@
12291235
"import json\n",
12301236
"from src.utilities import stringlist_to_list\n",
12311237
"\n",
1238+
"show_biodbnet_progress = True\n",
1239+
"use_biodbnet_cache = True\n",
1240+
"\n",
12321241
"drug_raw_file = \"Repurposing_Hub_export.txt\"\n",
12331242
"for context in stringlist_to_list(context_names):\n",
12341243
" for recon_algorithm in recon_algorithms:\n",
@@ -1275,6 +1284,9 @@
12751284
" \"--disease-up\", f\"{up_regulated_disease_genes}\",\n",
12761285
" \"--disease-down\", f\"{down_regulated_disease_genes}\",\n",
12771286
" \"--raw-drug-file\", f\"{drug_raw_file}\",\n",
1287+
" \"--taxon-id\", f\"{taxon_id}\",\n",
1288+
" \"--show-biodbnet-progress\", f\"{show_biodbnet_progress}\",\n",
1289+
" \"--use-biodbnet-cache\", f\"{use_biodbnet_cache}\",\n",
12781290
" \"--solver\", f\"{sovler}\",\n",
12791291
" #\"--test-all\"\n",
12801292
" ]\n",

main/src/GSEpipelineFast.py

+46-29
Original file line numberDiff line numberDiff line change
@@ -7,11 +7,9 @@
77
import numpy as np
88
import pandas as pd
99
import rpy2.robjects as ro
10+
from fast_bioservices import BioDBNet, Input, Output
1011
from GSEpipeline import load_gse_soft
1112
from instruments import AffyIO
12-
13-
# from fast_bioservices import BioDBNet, Input, Output
14-
from multi_bioservices.biodbnet import InputDatabase, OutputDatabase, TaxonID, db2db
1513
from rpy2.robjects import pandas2ri
1614

1715
pandas2ri.activate()
@@ -21,8 +19,17 @@
2119

2220
# gse = load_gse_soft(gsename)
2321

22+
from fast_bioservices import BioDBNet, Input, Output
23+
2424

25-
def download_gsm_id_maps(datadir, gse, gpls: Optional[list[str]] = None, vendor="affy"):
25+
def download_gsm_id_maps(
26+
datadir,
27+
gse,
28+
biodbnet: BioDBNet,
29+
taxon_id: int,
30+
gpls: Optional[list[str]] = None,
31+
vendor="affy",
32+
):
2633
"""
2734
download ID to ENTREZ_GENE_ID maps, create a csv file for each platform, and return dictionary
2835
:param gpls:
@@ -46,18 +53,19 @@ def download_gsm_id_maps(datadir, gse, gpls: Optional[list[str]] = None, vendor=
4653
table["CONTROL_TYPE"] == "FALSE", "SPOT_ID"
4754
].tolist()
4855

49-
temp = db2db(
56+
temp = biodbnet.db2db(
5057
input_values=input_values,
51-
input_db=InputDatabase.AGILENT_ID,
52-
output_db=[OutputDatabase.GENE_ID, OutputDatabase.ENSEMBL_GENE_ID],
58+
input_db=Input.AGILENT_ID,
59+
output_db=[Output.GENE_ID, Output.ENSEMBL_GENE_ID],
60+
taxon=taxon_id,
5361
)
5462

5563
temp.drop(columns=["Ensembl Gene ID"], inplace=True)
5664
temp.reset_index(inplace=True)
5765
temp.rename(
5866
columns={
59-
InputDatabase.AGILENT_ID.value: "ID",
60-
OutputDatabase.GENE_ID.value: "ENTREZ_GENE_ID",
67+
Input.AGILENT_ID.value: "ID",
68+
Output.GENE_ID.value: "ENTREZ_GENE_ID",
6169
},
6270
inplace=True,
6371
)
@@ -74,14 +82,27 @@ def download_gsm_id_maps(datadir, gse, gpls: Optional[list[str]] = None, vendor=
7482

7583

7684
class GSEproject:
77-
def __init__(self, gsename, querytable, rootdir="../"):
85+
def __init__(
86+
self,
87+
gsename,
88+
querytable,
89+
show_biodbnet_progress: bool = False,
90+
use_biodbnet_cache: bool = True,
91+
rootdir="../",
92+
):
7893
self.gsename = gsename
7994
# Setup paths
8095
self.querytable = querytable
8196
self.rootdir = rootdir
8297
self.datadir = os.path.join(self.rootdir, "data")
8398
self.outputdir = os.path.join(self.rootdir, "output")
8499
self.gene_dir = os.path.join(self.datadir, self.gsename + "_RAW")
100+
101+
self.biodbnet = BioDBNet(
102+
show_progress=show_biodbnet_progress,
103+
cache=use_biodbnet_cache,
104+
)
105+
85106
print(
86107
"Initialize project ({}):\nRoot: {}\nRaw data: {}".format(
87108
self.gsename, self.rootdir, self.gene_dir
@@ -137,7 +158,13 @@ def get_gsm_tables(self):
137158
if not os.path.isfile(filepath):
138159
# Could improve to automatic download new tables based on platform
139160
gse = load_gse_soft(self.gsename)
140-
download_gsm_id_maps(self.datadir, gse, gpls=[gpl], vendor=vendor)
161+
download_gsm_id_maps(
162+
self.datadir,
163+
gse,
164+
gpls=[gpl],
165+
vendor=vendor,
166+
biodbnet=self.biodbnet,
167+
)
141168
print("Skip Unsupported Platform: {}, {}".format(gpl, vendor))
142169
# continue
143170
temp = pd.read_csv(filepath)
@@ -225,16 +252,6 @@ def get_entrez_table_pipeline(self, fromcsv=True):
225252
output_db=[OutputDatabase.GENE_ID],
226253
)
227254

228-
outputdf = instruments.readagilent(
229-
platformdir, list(self.gsm_platform.keys())
230-
)
231-
232-
gsm_maps[key] = db2db(
233-
input_values=list(map(str, list(outputdf["ProbeName"]))),
234-
input_db=InputDatabase.AGILENT_ID,
235-
output_db=[OutputDatabase.GENE_ID],
236-
)
237-
238255
gsm_maps[key].rename(
239256
columns={"Gene ID": "ENTREZ_GENE_ID"}, inplace=True
240257
)
@@ -271,23 +288,23 @@ def get_entrez_table_pipeline(self, fromcsv=True):
271288
how="outer",
272289
)
273290

274-
df_outer_sc500.dropna(how="all", inplace=True) # type: ignore
275-
print("Full: {}".format(df_outer_sc500.shape)) # type: ignore
276-
df_outer_sc500.rename(str.lower, axis="columns", inplace=True) # type: ignore
291+
df_outer_sc500.dropna(how="all", inplace=True)
292+
print("Full: {}".format(df_outer_sc500.shape))
293+
df_outer_sc500.rename(str.lower, axis="columns", inplace=True)
277294
keys = []
278295
vals = []
279296
gsms_loaded = []
280297

281-
for col in list(df_outer_sc500): # type: ignore
282-
if ".cel.gz" in col: # type: ignore
283-
strs = col.split(".cel.gz") # type: ignore
298+
for col in list(df_outer_sc500):
299+
if ".cel.gz" in col:
300+
strs = col.split(".cel.gz")
284301
gsm = strs[0].split("_")[0]
285302
newcol = "{}.cel.gz{}".format(gsm, strs[-1])
286303
vals.append(newcol)
287304
keys.append(col)
288305
gsms_loaded.append(gsm)
289306

290-
df_outer_sc500.rename(columns=dict(zip(keys, vals)), inplace=True) # type: ignore
307+
df_outer_sc500.rename(columns=dict(zip(keys, vals)), inplace=True)
291308
gsms_loaded = list(set(gsms_loaded).union(set(self.gsm_platform.keys())))
292309

293310
# Remove duplicated items, keep largest VALUE for each GSM
@@ -329,7 +346,7 @@ def get_entrez_table_pipeline(self, fromcsv=True):
329346
)
330347

331348
try:
332-
temp = df_outer_sc500.loc[:, [col1, col2, col3]] # type: ignore
349+
temp = df_outer_sc500.loc[:, [col1, col2, col3]]
333350

334351
except:
335352
if key in list(self.gsm_platform.keys()):

main/src/arguments.py

+20-2
Original file line numberDiff line numberDiff line change
@@ -54,6 +54,24 @@ def range_checker(arg: str):
5454
"help": "Tissue/cell name of models to generate. If making multiple models in a batch, then use the format: 'context1 context2 context3' ",
5555
}
5656

57+
show_biodbnet_progress_arg = {
58+
"flag": "--show-biodbnet-progress",
59+
"action": "store_true",
60+
"required": False,
61+
"default": False,
62+
"dest": "show_biodbnet_progress",
63+
"help": "Show progress of biodbnet queries",
64+
}
65+
66+
use_biodbnet_cache_arg = {
67+
"flag": "--use-biodbnet-cache",
68+
"action": "store_true",
69+
"required": False,
70+
"default": False,
71+
"dest": "use_biodbnet_cache",
72+
"help": "Use biodbnet cache",
73+
}
74+
5775
filtering_technique_arg = {
5876
"flag": "--filtering-technique",
5977
"type": str,
@@ -139,7 +157,7 @@ def range_checker(arg: str):
139157

140158
min_count_arg = {
141159
"flag": "--min-count",
142-
"type": int | str,
160+
"type": str,
143161
"required": False,
144162
"default": "default",
145163
"dest": "min_count",
@@ -405,7 +423,7 @@ def range_checker(arg: str):
405423

406424
expression_requirement_arg = {
407425
"flag": "--expression-requirement",
408-
"type": int | str,
426+
"type": str,
409427
"required": False,
410428
"default": "default",
411429
"dest": "expression_requirement",

main/src/cluster_rnaseq.py

+1-19
Original file line numberDiff line numberDiff line change
@@ -47,7 +47,7 @@
4747
# cluster_io = SignatureTranslatedAnonymousPackage(string, "cluster_io")
4848

4949

50-
def main() -> None:
50+
def main(argv) -> None:
5151
"""
5252
Cluster RNA-seq Data
5353
"""
@@ -205,24 +205,6 @@ def main() -> None:
205205
seed=seed,
206206
)
207207
cluster_samples.call_function("cluster_samples_main")
208-
# cluster_io = rpy2_api.Rpy2(r_file_path=r_file_path)
209-
# cluster_io_function = cluster_io.call_function("cluster_samples_main")
210-
# cluster_io_function(
211-
# wd,
212-
# context_names,
213-
# technique,
214-
# clust_algo,
215-
# label,
216-
# min_dist=min_dist,
217-
# n_neigh_rep=n_neigh_rep,
218-
# n_neigh_batch=n_neigh_batch,
219-
# n_neigh_cont=n_neigh_cont,
220-
# rep_ratio=rep_ratio,
221-
# batch_ratio=batch_ratio,
222-
# quantile=quantile,
223-
# min_count=min_count,
224-
# seed=seed,
225-
# )
226208

227209

228210
if __name__ == "__main__":

main/src/create_context_specific_model.py

+14-12
Original file line numberDiff line numberDiff line change
@@ -589,18 +589,20 @@ def parse_args(argv):
589589
"https://github.com/HelikarLab/MADRID or email [email protected]",
590590
)
591591

592-
parser.add_argument(**context_names_arg)
593-
parser.add_argument(**reference_model_filepath_arg)
594-
parser.add_argument(**active_genes_filepath_arg)
595-
parser.add_argument(**objective_function_arg)
596-
parser.add_argument(**boundary_reactions_filepath_arg)
597-
parser.add_argument(**exclude_reactions_filepath_arg)
598-
parser.add_argument(**force_reactions_filepath_arg)
599-
parser.add_argument(**reconstruction_algorithm_arg)
600-
parser.add_argument(**imat_low_threshold_arg)
601-
parser.add_argument(**imat_high_threshold_arg)
602-
parser.add_argument(**reconstruction_solver_arg)
603-
parser.add_argument(**output_filetypes_arg)
592+
# fmt: off
593+
parser.add_argument(context_names_arg["flag"], **{k: v for k, v in context_names_arg.items() if k != "flag"})
594+
parser.add_argument(reference_model_filepath_arg["flag"], **{k: v for k, v in reference_model_filepath_arg.items() if k != "flag"})
595+
parser.add_argument(active_genes_filepath_arg["flag"], **{k: v for k, v in active_genes_filepath_arg.items() if k != "flag"})
596+
parser.add_argument(objective_function_arg["flag"], **{k: v for k, v in objective_function_arg.items() if k != "flag"})
597+
parser.add_argument(boundary_reactions_filepath_arg["flag"], **{k: v for k, v in boundary_reactions_filepath_arg.items() if k != "flag"})
598+
parser.add_argument(exclude_reactions_filepath_arg["flag"], **{k: v for k, v in exclude_reactions_filepath_arg.items() if k != "flag"})
599+
parser.add_argument(force_reactions_filepath_arg["flag"], **{k: v for k, v in force_reactions_filepath_arg.items() if k != "flag"})
600+
parser.add_argument(reconstruction_algorithm_arg["flag"], **{k: v for k, v in reconstruction_algorithm_arg.items() if k != "flag"})
601+
parser.add_argument(imat_low_threshold_arg["flag"], **{k: v for k, v in imat_low_threshold_arg.items() if k != "flag"})
602+
parser.add_argument(imat_high_threshold_arg["flag"], **{k: v for k, v in imat_high_threshold_arg.items() if k != "flag"})
603+
parser.add_argument(reconstruction_solver_arg["flag"], **{k: v for k, v in reconstruction_solver_arg.items() if k != "flag"})
604+
parser.add_argument(output_filetypes_arg["flag"], **{k: v for k, v in output_filetypes_arg.items() if k != "flag"})
605+
# fmt: on
604606

605607
args = parser.parse_args()
606608
return args

0 commit comments

Comments
 (0)