#!/usr/bin/env cwl-runner

cwlVersion: v1.2
class: Workflow

intent: [ edam:operation_2423 ]  # EDAM operation_2423: "Prediction and recognition"
doc: "This mock workflow calculates input features and labels which are used to train a deep learning model for epitope prediction."

requirements:
  ScatterFeatureRequirement: {}        # needed for the scatter in the generate_hhm subworkflow
  StepInputExpressionRequirement: {}   # needed for valueFrom on step inputs (generate_hhm)
  SubworkflowFeatureRequirement: {}    # needed for the inline subworkflow in generate_hhm

inputs:
  sabdab_summary:
    type: File
    format: iana:text/tab-separated-values
    doc: "SAbDAb Summary metadata about all structures in the database."
  biodl_train_dataset:
    type: File
    format: iana:text/csv
    doc: "BioDL training dataset containing PPI interactions"
  biodl_test_dataset:
    type: File
    # NOTE(review): no format declared — consider iana:text/csv for consistency
    # with biodl_train_dataset; confirm the actual file type first.
    doc: "BioDL test dataset with PPI interactions."
  hhblits_db:
    type: Directory
    doc: "Reference database for HHblits"
  hhblits_db_name:
    type: string
    doc: "Name of hhblits reference database"
  pdb_search_api_query:
    type: File
    format: iana:application/json
    doc: "Structured query for PDB API."

outputs:
  model_output:
    type: File
    outputSource: train_epitope_prediction_model/train_log
    doc: "Output of the prediction model."

steps:
  run_pdb_query:
    in:
      pdb_search_query: pdb_search_api_query
    out:
      [ processed_response ]
    run: ./tools/pdb_query.cwl
    doc: |
      Use PDB search API to run a query on the Protein Data Bank. Returns .txt file with comma-separated PDB IDs which satisfy the query requirements.
      See https://search.rcsb.org/index.html#search-api for a tutorial.

  download_pdb_files:
    in:
      input_file: run_pdb_query/processed_response
      # canonical YAML 1.2 booleans; "True" is parsed as a string by
      # JSON-schema YAML loaders, which would break these flags
      mmcif_format: { default: true }
      pdb_format: { default: true }
    out:
      [ pdb_files ]
    run: ./tools/pdb_batch_download.cwl
    doc: "Download the mmCIF and PDB archives for the PDB IDs returned by run_pdb_query."

  decompress_pdb_files:
    in:
      pdb_archives: download_pdb_files/pdb_files
    out: [ cifs, pdbs ]
    run: ./tools/decompress.cwl
    doc: "Decompress files using gzip"

  generate_dssp_labels:
    in:
      pdb_files: decompress_pdb_files/pdbs  # TODO(review): original note said "change this later" — confirm intended source
      rsa_cutoff: { default: 0.06 }         # relative solvent accessibility threshold passed to the DSSP tool
    out: [ dssp_output_files ]
    run: ./tools/dssp.cwl
    doc: "Use DSSP to extract secondary structure and solvent accessibility from PDB files."

  generate_ppi_labels:
    in:
      mmcif_files: decompress_pdb_files/cifs
      train_dataset: biodl_train_dataset
      test_dataset: biodl_test_dataset
    out: [ ppi_fasta_files ]
    run: ./tools/ppi_annotations.cwl
    doc: "Extract ppi annotations from BioDL. This step is partly emulated."

  preprocess_sabdab_data:
    doc: "Extract antigen chains from SAbDab summary file."
    in:
      sabdab_summary: sabdab_summary
    out: [ processed_summary ]
    run: ./tools/process_sabdab.cwl

  generate_epitope_labels:
    in:
      mmcif_files: decompress_pdb_files/cifs
      sabdab_processed: preprocess_sabdab_data/processed_summary
    out: [ epitope_fasta_dir ]
    run: ./tools/epitope_annotations.cwl
    doc: "Extract epitope annotations from PDB files."

  combine_labels:
    doc: "Combine labels into 1 file per protein sequence."
    run: ./tools/combine_labels.cwl
    in:
      epitope_directory: generate_epitope_labels/epitope_fasta_dir
      ppi_directory: generate_ppi_labels/ppi_fasta_files
      dssp_directory: generate_dssp_labels/dssp_output_files
    out: [ labels_combined ]

  generate_pc7:
    doc: "Calculate PC7 features for each residue in each protein sequence."
    run: ./tools/pc7_inputs.cwl  # TODO: adapt tool so it takes directory of fasta files as input
    in:
      fasta: generate_ppi_labels/ppi_fasta_files
    out: [ pc7_features ]

  generate_psp19:
    # was `label:`; changed to `doc:` for consistency with the sibling steps
    doc: "Calculate PSP19 features for each residue in each protein sequence."
    run: ./tools/psp19_inputs.cwl
    in:
      fasta: generate_ppi_labels/ppi_fasta_files
    out: [ psp19_features ]

  generate_hhm:
    in:
      query_sequences:
        source: generate_ppi_labels/ppi_fasta_files  # type Directory
        valueFrom: $(self.listing)                   # Directory is converted to a File array here
      hhblits_db: hhblits_db
      hhblits_db_name: hhblits_db_name
      hhblits_n_iterations: { default: 1 }
    out: [ hhm_file_array ]
    run:
      # Inline subworkflow as a workaround: generate_ppi_labels/ppi_fasta_files
      # is a Directory while run_hhblits takes a File, so we scatter over the
      # directory listing inside this nested workflow.
      class: Workflow
      inputs:
        query_sequences: File[]
        hhblits_db: Directory
        hhblits_db_name: string
        hhblits_n_iterations: int
      outputs:
        hhm_file_array:
          type: File[]
          outputSource: run_hhblits/hhm
      steps:
        run_hhblits:
          in:
            protein_query_sequence: query_sequences
            database: hhblits_db
            database_name: hhblits_db_name
            n_iterations: hhblits_n_iterations
          out: [ hhm ]
          scatter: protein_query_sequence
          run: ./tools/hhm_inputs_scatter.cwl

  combine_features:
    in:
      input_sequences: generate_ppi_labels/ppi_fasta_files
      pc7_features: generate_pc7/pc7_features
      psp19_features: generate_psp19/psp19_features
      hhm_features: generate_hhm/hhm_file_array  # File array; combine_features.cwl converts it to a Directory
    out: [ combined_features ]
    run: ./tools/combine_features.cwl

  train_epitope_prediction_model:
    # This step incorporates both training and prediction; in the real workflow
    # the configuration file would be generated as part of the workflow as well.
    in:
      input_features: combine_features/combined_features
      input_labels: combine_labels/labels_combined
    out: [ train_log ]
    run: ./tools/train_epitope_model.cwl
    doc: "Predict epitope residues using a multi-task learning approach. This step is not real yet."

$namespaces:
  iana: "https://www.iana.org/assignments/media-types/"
  s: "https://schema.org/"
  edam: "http://edamontology.org/"
  cwlprov: "https://w3id.org/cwl/prov#"

$schemas:
- https://schema.org/version/latest/schemaorg-current-https.rdf
- https://edamontology.org/EDAM_1.25.owl

s:author:
- s:name: "Renske de Wit"
  s:identifier: https://orcid.org/0000-0003-0902-0086
- s:name: "Katharina Waury"
s:license: https://spdx.org/licenses/Apache-2.0