Commit 5466f95

start of a more advanced test
1 parent e2009ac commit 5466f95

34 files changed (+1246, -4 lines)

MANIFEST.in (+3)
@@ -9,6 +9,9 @@ include tests/loop/*
 include tests/tmp1/tmp2/tmp3/.gitkeep
 include tests/tmp4/alpha/*
 include tests/wf/*
+include tests/wf/adv_prov/*
+include tests/wf/adv_prov/data/*
+include tests/wf/adv_prov/tools/*
 include tests/wf/operation/*
 include tests/override/*
 include tests/reloc/*.cwl

cwltool/provenance_profile.py (+10, -4)
@@ -404,10 +404,16 @@ def declare_file(self, value: CWLObjectType) -> Tuple[ProvEntity, ProvEntity, st
         # Transfer SCHEMA annotations to provenance
         for s in schema_annotations:
             if "additionalType" in s:
-                additional_type = cast(str, schema_annotations[s]).split(sep="/")[
-                    -1
-                ]  # find better method?
-                file_entity.add_attributes({PROV_TYPE: SCHEMA[additional_type]})
+                atype = schema_annotations[s]
+                if isinstance(atype, str):
+                    additional_type = atype.split(sep="/")[-1]  # find better method?
+                    file_entity.add_attributes({PROV_TYPE: SCHEMA[additional_type]})
+                else:
+                    for a_entry in cast(List[str], atype):
+                        additional_type = a_entry.split(sep="/")[
+                            -1
+                        ]  # find better method?
+                        file_entity.add_attributes({PROV_TYPE: SCHEMA[additional_type]})
             else:
                 file_entity = self._add_nested_annotations(
                     s, schema_annotations[s], file_entity
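
Note: the new branch accepts s:additionalType values given either as a single IRI string or as a list of IRIs. As an illustration only, the same normalization idea in isolation (a hypothetical helper, not part of this commit):

    from typing import List, Union

    def normalize_additional_types(value: Union[str, List[str]]) -> List[str]:
        """Return schema.org additionalType annotations as a list of bare type names."""
        entries = [value] if isinstance(value, str) else value
        # "https://schema.org/Dataset" -> "Dataset"; CURIEs such as "edam:data_1277"
        # contain no "/" and are kept unchanged.
        return [entry.split("/")[-1] for entry in entries]

    # normalize_additional_types("https://schema.org/Dataset")    -> ["Dataset"]
    # normalize_additional_types(["s:Dataset", "edam:data_1277"]) -> ["s:Dataset", "edam:data_1277"]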

tests/test_provenance.py (+21)
@@ -103,6 +103,27 @@ def test_revsort_label_annotations(tmp_path: Path) -> None:
     )
 
 
+def test_advanced_prov_annotations(tmp_path: Path) -> None:
+    """Pass through of advanced input annotations."""
+    base_path = cwltool(
+        tmp_path,
+        get_data("tests/wf/adv_prov/niaa_wf.cwl"),
+        get_data("tests/wf/adv_prov/niaa_wf_job.yml"),
+    )
+    prov_file = base_path / "metadata" / "provenance" / "primary.cwlprov.nt"
+    arcp_root = find_arcp(base_path)
+    g = Graph()
+    with open(prov_file, "rb") as f:
+        g.parse(file=f, format="nt", publicID=arcp_root)
+    mime_having_objects = list(g.subjects(SCHEMA.encodingFormat))
+    assert len(mime_having_objects) == 8
+    # for obj in mime_having_objects:
+    #     assert (
+    #         cast(Literal, list(g.objects(obj, SCHEMA.encodingFormat))[0]).value
+    #         == "https://www.iana.org/assignments/media-types/text/plain"
+    #     )
+
+
 @needs_docker
 def test_nested_workflow(tmp_path: Path) -> None:
     check_provenance(cwltool(tmp_path, get_data("tests/wf/nested.cwl")), nested=True)
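
The commented-out assertion would pin every encodingFormat literal to text/plain, which is left disabled, presumably because the annotated inputs declare several different MIME types (json, csv, tsv). Outside pytest, the same provenance graph can be inspected with a small rdflib script; a minimal sketch, assuming a research object written to prov_out/:

    from rdflib import Graph, Namespace

    SCHEMA = Namespace("https://schema.org/")

    # Parse the N-Triples serialization of the CWLProv provenance graph.
    g = Graph()
    g.parse("prov_out/metadata/provenance/primary.cwlprov.nt", format="nt")

    # Print every entity that carries a schema.org encodingFormat annotation.
    for subj in g.subjects(SCHEMA.encodingFormat):
        for fmt in g.objects(subj, SCHEMA.encodingFormat):
            print(subj, fmt)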

tests/wf/adv_prov/data/pdb_query.json (whitespace-only changes)

tests/wf/adv_prov/data/prepared_biolip_win_p_testing.csv (whitespace-only changes)

tests/wf/adv_prov/data/prepared_biolip_win_p_training.csv (whitespace-only changes)

tests/wf/adv_prov/data/sabdab_summary_all_20220527.tsv (whitespace-only changes)

tests/wf/adv_prov/model_example_params.json (whitespace-only changes)

tests/wf/adv_prov/niaa_wf.cwl (+186, new file)
#!/usr/bin/env cwl-runner

cwlVersion: v1.2
class: Workflow

intent: [ edam:operation_2423 ] # Prediction ope
doc: "This mock workflow calculates input features and labels which are used to train a deep learning model for epitope prediction."

requirements:
  ScatterFeatureRequirement: {}
  StepInputExpressionRequirement: {}
  SubworkflowFeatureRequirement: {}

inputs:
  sabdab_summary:
    type: File
    format: iana:text/tab-separated-values
    doc: "SAbDAb Summary metadata about all structures in the database."
  biodl_train_dataset:
    type: File
    format: iana:text/csv
    doc: "BioDL training dataset containing PPI interactions"
  biodl_test_dataset:
    type: File
    doc: "BioDL test dataset with PPI interactions."
  hhblits_db:
    type: Directory
    doc: "Reference database for HHblits"
  hhblits_db_name:
    type: string
    doc: "Name of hhblits reference database"
  pdb_search_api_query:
    type: File
    format: iana:application/json
    doc: "Structured query for PDB API."

outputs:
  model_output:
    type: File
    outputSource: train_epitope_prediction_model/train_log
    doc: "Output of the prediction model."

steps:
  run_pdb_query:
    in:
      pdb_search_query: pdb_search_api_query
    out:
      [ processed_response ]
    run: ./tools/pdb_query.cwl
    doc: |
      Use PDB search API to run a query on the Protein Data Bank. Returns .txt file with comma-separated PDB IDs which satisfy the query requirements.
      See https://search.rcsb.org/index.html#search-api for a tutorial.

  download_pdb_files:
    in:
      input_file: run_pdb_query/processed_response
      mmcif_format: { default: True }
      pdb_format: { default: True }
    out:
      [ pdb_files ]
    run: ./tools/pdb_batch_download.cwl

  decompress_pdb_files:
    in:
      pdb_archives: download_pdb_files/pdb_files
    out: [ cifs, pdbs ]
    run: ./tools/decompress.cwl
    doc: "Decompress files using gzip"

  generate_dssp_labels:
    in:
      pdb_files: decompress_pdb_files/pdbs # change this later
      rsa_cutoff: { default : 0.06 }
    out: [ dssp_output_files ]
    run: ./tools/dssp.cwl
    doc: "Use DSSP to extract secondary structure and solvent accessibility from PDB files."

  generate_ppi_labels:
    in:
      mmcif_files: decompress_pdb_files/cifs
      train_dataset: biodl_train_dataset
      test_dataset: biodl_test_dataset
    out: [ ppi_fasta_files ]
    run: ./tools/ppi_annotations.cwl
    doc: "Extract ppi annotations from BioDL. This step is partly emulated."

  preprocess_sabdab_data:
    doc: "Extract antigen chains from SAbDab summary file."
    in:
      sabdab_summary: sabdab_summary
    out: [ processed_summary ]
    run: ./tools/process_sabdab.cwl

  generate_epitope_labels:
    in:
      mmcif_files: decompress_pdb_files/cifs
      sabdab_processed: preprocess_sabdab_data/processed_summary
    out: [ epitope_fasta_dir ]
    run: ./tools/epitope_annotations.cwl
    doc: "Extract epitope annotations from PDB files."

  combine_labels:
    doc: "Combine labels into 1 file per protein sequence."
    run: ./tools/combine_labels.cwl
    in:
      epitope_directory: generate_epitope_labels/epitope_fasta_dir
      ppi_directory: generate_ppi_labels/ppi_fasta_files
      dssp_directory: generate_dssp_labels/dssp_output_files
    out: [ labels_combined ]

  generate_pc7:
    doc: Calculate PC7 features for each residue in each protein sequence.
    run: ./tools/pc7_inputs.cwl # to do: adapt tool so it takes directory of fasta files as input
    in:
      fasta: generate_ppi_labels/ppi_fasta_files
    out: [ pc7_features ]

  generate_psp19:
    label: Calculate PSP19 features for each residue in each protein sequence.
    run: ./tools/psp19_inputs.cwl
    in:
      fasta: generate_ppi_labels/ppi_fasta_files
    out: [ psp19_features ]

  generate_hhm:
    in:
      query_sequences:
        source: generate_ppi_labels/ppi_fasta_files # type Directory
        valueFrom: $(self.listing) # here type Directory is converted to File array
      hhblits_db: hhblits_db
      hhblits_db_name: hhblits_db_name
      hhblits_n_iterations: { default: 1 }
    out: [ hhm_file_array ]
    run:
      class: Workflow # this is a subworkflow as a workaround because generate_ppi_labels/ppi_fasta_files is Directory while run_hhblits takes File
      inputs:
        query_sequences: File[]
        hhblits_db: Directory
        hhblits_db_name: string
        hhblits_n_iterations: int
      outputs:
        hhm_file_array:
          type: File[]
          outputSource: run_hhblits/hhm
      steps:
        run_hhblits:
          in:
            protein_query_sequence: query_sequences
            database: hhblits_db
            database_name: hhblits_db_name
            n_iterations: hhblits_n_iterations
          out: [ hhm ]
          scatter: protein_query_sequence
          run: ./tools/hhm_inputs_scatter.cwl
  combine_features:
    in:
      input_sequences: generate_ppi_labels/ppi_fasta_files
      pc7_features: generate_pc7/pc7_features
      psp19_features: generate_psp19/psp19_features
      hhm_features: generate_hhm/hhm_file_array # file array, combine_features.cwl converts it to directory
    out: [ combined_features ]
    run: ./tools/combine_features.cwl

  train_epitope_prediction_model: # This step incorporates both training and prediction, not sure if this is the case in the real workflow.
    in: # in the real workflow, the configuration file would be generated as part of the workflow as well
      input_features: combine_features/combined_features
      input_labels: combine_labels/labels_combined
    out: [ train_log ]
    run: ./tools/train_epitope_model.cwl
    doc: "Predict epitope residues using a multi-task learning approach. This step is not real yet."

$namespaces:
  iana: "https://www.iana.org/assignments/media-types/"
  s: "https://schema.org/"
  edam: "http://edamontology.org/"
  cwlprov: "https://w3id.org/cwl/prov#"

$schemas:
  - https://schema.org/version/latest/schemaorg-current-https.rdf
  - https://edamontology.org/EDAM_1.25.owl

s:author:
  - s:name: "Renske de Wit"
    s:identifier: https://orcid.org/0000-0003-0902-0086
  - s:name: "Katharina Waury"
s:license: https://spdx.org/licenses/Apache-2.0
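
The new test drives this workflow through cwltool's Python entry point with provenance capture enabled. An equivalent standalone invocation would look roughly like the sketch below; the prov_out directory name is an assumption, and the run requires the referenced tools, data files, and HHblits database to be available:

    from cwltool.main import main

    # Run the workflow and write the CWLProv research object to ./prov_out
    status = main(
        [
            "--provenance",
            "prov_out",
            "tests/wf/adv_prov/niaa_wf.cwl",
            "tests/wf/adv_prov/niaa_wf_job.yml",
        ]
    )
    assert status == 0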

tests/wf/adv_prov/niaa_wf_job.yml (+78, new file)
cwlprov:prov:
  sabdab_search:
    s:additionalType: s:SearchAction
    s:query: "All structures"
    s:endTime: 2022-05-27
    s:object:
      s:name: "Structural Antibody Database"
      s:citation:
        s:identifier: https://doi.org/10.1093/nar/gkab1050
    s:result: sabdab_summary
    s:description: "Search Action for metadata on antibody-antigen complexes in SAbDab"


pdb_search_api_query:
  class: File
  location: ./data/pdb_query.json
  format: iana:application/json
  s:description: "Input query for PDB search API."
  s:additionalType:
    - edam:data_3786 # Query script

sabdab_summary:
  class: File
  path: ./data/sabdab_summary_all_20220527.tsv
  format: iana:text/tab-separated-values
  s:description: "Summary file downloaded from SAbDAb database, containing metadata for all structures."
  s:additionalType:
    - edam:data_2080 # database search results
    - s:Dataset


biodl_train_dataset:
  class: File
  path: data/prepared_biolip_win_p_training.csv
  #location: https://www.ibi.vu.nl/downloads/PIPENN/PIPENN/BioDL-Datasets/prepared_biolip_win_p_training.csv
  format: iana:text/csv
  s:description: "BioDL training set containing PPI annotations for protein sequences (UniProt IDs)"
  s:name: "BioDL training dataset"
  s:citation:
    s:identifier: https://doi.org/10.1093/bioinformatics/btac071
  s:additionalType:
    - s:Dataset
    - edam:data_1277 # protein features

biodl_test_dataset:
  class: File
  path: data/prepared_biolip_win_p_testing.csv
  #location: https://www.ibi.vu.nl/downloads/PIPENN/PIPENN/BioDL-Datasets/prepared_biolip_win_p_testing.csv
  s:description: "BioDL test set containing PPI annotations for protein sequences (UniProt IDs)."
  s:name: "BioDL test dataset"
  s:citation:
    s:identifier: https://doi.org/10.1093/bioinformatics/btac071
  s:additionalType:
    - s:Dataset
    - edam:data_1277 # protein features

hhblits_db:
  class: Directory
  location: ../hhblits/databases
  s:citation:
    s:identifier: https://doi.org/10.1038/nmeth.1818
  s:name: "pdb70"
  s:description: "Directory containing HHBlits reference database."
  s:additionalType:
    - s:Dataset
    - edam:data_0955 # data index

hhblits_db_name: pdb70
hhblits_n_iterations: 1

s:description: "Demonstration run of epitope prediction workflow. Some steps are emulated, so the results of the workflow are not yet biologically meaningful."

$namespaces:
  iana: "https://www.iana.org/assignments/media-types/"
  s: "https://schema.org/"
  edam: "http://edamontology.org/"
  cwlprov: "https://w3id.org/cwl/prov#"
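
To see which of these job-level annotations survive into the research object, one option is to dump every triple whose predicate is in the schema.org namespace; a sketch under the same prov_out assumption as above:

    from rdflib import Graph

    g = Graph()
    g.parse("prov_out/metadata/provenance/primary.cwlprov.nt", format="nt")

    # List all schema.org statements (description, name, citation, encodingFormat, ...).
    query = """
        SELECT ?s ?p ?o WHERE {
            ?s ?p ?o .
            FILTER(STRSTARTS(STR(?p), "https://schema.org/"))
        }
    """
    for s, p, o in g.query(query):
        print(s, p, o)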
