Skip to content

Commit

Permalink
added Public Domain notice; NOJIRA
Browse files Browse the repository at this point in the history
  • Loading branch information
azat-badretdin committed May 8, 2018
1 parent 06883bf commit f4cb632
Show file tree
Hide file tree
Showing 4 changed files with 216 additions and 0 deletions.
22 changes: 22 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -8,3 +8,25 @@ Genome annotation is a multi-level process that includes prediction of protein-c
NCBI has developed an automatic prokaryotic genome annotation pipeline that combines ab initio gene prediction algorithms with homology-based methods. The first version of NCBI Prokaryotic Genome Automatic Annotation Pipeline (PGAAP; see Pubmed Article) developed in 2005 has been replaced with an upgraded version that is capable of processing a larger data volume. NCBI's annotation pipeline depends on several internal databases and is not currently available for download or use outside of the NCBI environment.

This implementation is intended to be accessible outside of the NCBI network and available to all users. It is based on CWL and Docker containers. This is a work in progress and not currently functional.

## Public Domain notice

National Center for Biotechnology Information.

This software is a "United States Government Work" under the terms of the
United States Copyright Act. It was written as part of the authors'
official duties as United States Government employees and thus cannot
be copyrighted. This software is freely available to the public for
use. The National Library of Medicine and the U.S. Government have not
placed any restriction on its use or reproduction.

Although all reasonable efforts have been taken to ensure the accuracy
and reliability of the software and data, the NLM and the U.S.
Government do not and cannot warrant the performance or results that
may be obtained by using this software or data. The NLM and the U.S.
Government disclaim all warranties, express or implied, including
warranties of performance, merchantability or fitness for any
particular purpose.

Please cite NCBI in any work or product based on this material.

6 changes: 6 additions & 0 deletions bacterial_kmer/.gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
# Files kept out of version control in the bacterial_kmer directory.

# Everything directly inside the output/ and input/ directories.
output/*
input/*

# Specifically named files.
output.txt
pgap.*

# Any file with one of these extensions.
*.xml
*.asn
17 changes: 17 additions & 0 deletions bacterial_kmer/input_wf_bacterial_kmer.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
# Job (input object) file for wf_bacterial_kmer.cwl.
# Each top-level key names one workflow input; File and Directory
# values are given as objects with a `class` and a relative `location`.

# Take this from Prepare Unannotated Sequences/out/sequences.asn
Extract_Kmers_From_Input___entry:
  class: File
  location: input/sequences.asn

# Take this from Create Assembly From Sequences
gencoll_asn:
  class: File
  location: input/gencoll.asn

asn_cache:
  class: Directory
  location: input/sequence_cache

# Mycoplasma genitalium G37, corresponding to the input gencoll_asn file.
ref_assembly_taxid: 243273

ANI_cutoff:
  class: File
  location: input/ANI_cutoff.xml

kmer_cache_path:
  class: Directory
  location: input/kmer-cache-minhash
171 changes: 171 additions & 0 deletions bacterial_kmer/wf_bacterial_kmer.cwl
Original file line number Diff line number Diff line change
@@ -0,0 +1,171 @@
#!/usr/bin/env cwl-runner
# Workflow "bacterial_kmer".
#
# Data flow, as wired below (what each task type actually does is
# defined under ../task_types/ and is not visible from this file):
#   * Get_Reference_Assemblies produces a gc_id_list that is looked up
#     in the k-mer cache (Query_Kmer_Cache); the IDs it returns in
#     new_gc_id_list go through Extract_Kmer_List and then
#     Store_in_Kmer_Cache.
#   * Extract_Kmers_From_Input processes the input entry; Compare_Kmer
#     compares its k-mer file list with the reference k-mer file lists
#     (cache hits merged with newly stored ones).
#   * Identify_Top_N feeds Compare_Kmer__Pairwise_ / Build_Kmer_Tree
#     and Extract_Top_Assemblies (which also yields the tax_report
#     workflow output).
#   * Assembly_Assembly_BLASTn aligns the input assembly IDs against
#     the top assemblies; Identify_Top_N_ANI turns the alignments into
#     the `top` and `annot` workflow outputs.
label: bacterial_kmer
cwlVersion: v1.0
class: Workflow

requirements:
  # The steps run other CWL documents from ../task_types/.
  - class: SubworkflowFeatureRequirement
  # Needed for step inputs that list several sources with linkMerge.
  - class: MultipleInputFeatureRequirement

hints:
  DockerRequirement:
    dockerPull: ncbi/gpdev:latest

inputs:
  # kmer_minhash_tarball: File # tarball of all reference minhashes. For Mycoplasma genitalium, there are 8K files, in total 1G
  Extract_Kmers_From_Input___entry: File
  gencoll_asn: File
  asn_cache: Directory
  kmer_cache_path: Directory
  ref_assembly_taxid: int
  ANI_cutoff: File

outputs:
  Identify_Top_N_ANI_annot:
    type: File
    outputSource: Identify_Top_N_ANI/annot
  Identify_Top_N_ANI_top:
    type: File
    outputSource: Identify_Top_N_ANI/top
  Extract_Top_Assemblies___tax_report:
    type: File
    outputSource: Extract_Top_Assemblies/tax_report

steps:
  # order manually set to match the order of display on the GPC graph for the plane in the buildrun
  Get_Reference_Assemblies:
    run: ../task_types/tt_bact_get_kmer_reference.cwl
    in: []
    out: [gc_id_list]

  Query_Kmer_Cache:
    run: ../task_types/tt_kmer_cache_retrieve.cwl
    in:
      gc_id_list: Get_Reference_Assemblies/gc_id_list
      kmer_cache_path: kmer_cache_path
    out: [new_gc_id_list, out_kmer_file_list, out_kmer_cache_path]

  Extract_Kmer_List:
    run: ../task_types/tt_kmer_gc_extract_wnode.cwl
    in:
      new_gc_id_list: Query_Kmer_Cache/new_gc_id_list
      asn_cache: asn_cache
    out: [out_kmer_file_list]

  Store_in_Kmer_Cache:
    run: ../task_types/tt_kmer_cache_store.cwl
    in:
      kmer_cache_path: Query_Kmer_Cache/out_kmer_cache_path
      kmer_file_list: Extract_Kmer_List/out_kmer_file_list
    out: [out_kmer_file_list]

  Extract_Kmers_From_Input:
    run: ../task_types/tt_kmer_seq_entry_extract_wnode.cwl
    in:
      entry: Extract_Kmers_From_Input___entry
      # Cache hits and newly stored k-mer files, flattened into one list.
      kmer_file_list:
        source: [Query_Kmer_Cache/out_kmer_file_list, Store_in_Kmer_Cache/out_kmer_file_list]
        linkMerge: merge_flattened
      asn_cache: asn_cache
    out: [out_kmer_file_list]

  Compare_Kmer:
    run: ../task_types/tt_kmer_ref_compare_wnode.cwl
    in:
      kmer_file_list: Extract_Kmers_From_Input/out_kmer_file_list
      # Same merged reference list that Extract_Kmers_From_Input receives.
      ref_kmer_file_list:
        source: [Query_Kmer_Cache/out_kmer_file_list, Store_in_Kmer_Cache/out_kmer_file_list]
        linkMerge: merge_flattened
      dist_method:
        default: minhash
      minhash_signature:
        default: minhash
      score_method:
        default: boolean
    out: [distances]

  Identify_Top_N:
    run: ../task_types/tt_kmer_top_n.cwl
    in:
      distances: Compare_Kmer/distances
    out: [matches, top_distances]

  Compare_Kmer__Pairwise_:
    run: ../task_types/tt_kmer_compare_wnode.cwl
    in:
      # Input k-mer files plus the matches from Identify_Top_N, flattened.
      kmer_file_list:
        source: [Extract_Kmers_From_Input/out_kmer_file_list, Identify_Top_N/matches]
        linkMerge: merge_flattened
      dist_method:
        default: minhash
      minhash_signature:
        default: minhash
      score_method:
        default: boolean
    out: [distances]

  Extract_Top_Assemblies:
    run: ../task_types/tt_kmer_top_n_extract.cwl
    in:
      top_distances: Identify_Top_N/top_distances
      ref_assembly_taxid: ref_assembly_taxid
    out: [tax_report, gc_id_list]

  # NOTE(review): the `tree` output is not consumed by any later step or
  # workflow output in this file.
  Build_Kmer_Tree:
    run: ../task_types/tt_kmer_build_tree.cwl
    in:
      distances: Compare_Kmer__Pairwise_/distances
      sort:
        default: leaf-count-ascending
      no_merge:
        default: true
      skip_markup:
        default: true
    out: [tree]

  Get_Top_Assemblies_GenColl_ASN:
    run: ../task_types/tt_gcaccess_from_list.cwl
    in:
      gc_id_list: Extract_Top_Assemblies/gc_id_list
    out: [gencoll_asn]

  Extract_Input_GenColl_IDs:
    run: ../task_types/tt_extract_gencoll_ids.cwl
    in:
      assemblies: gencoll_asn
    out: [gc_id_list]

  Assembly_Assembly_BLASTn:
    run: ../task_types/tt_assm_assm_blastn_wnode.cwl
    in:
      queries_gc_id_list: Extract_Input_GenColl_IDs/gc_id_list
      subjects_gc_id_list: Extract_Top_Assemblies/gc_id_list
      # this will break here
      # NOTE(review): the original author flagged the next input as a
      # failure point without saying why - confirm against the task type.
      Get_Top_Assemblies_GenColl_ASN_assemblies: Get_Top_Assemblies_GenColl_ASN/gencoll_asn
      gencoll_asn: gencoll_asn
      affinity:
        default: 'subject'
      # settings
      # Several flags below are passed as the strings "true", not YAML
      # booleans; they are kept exactly as written.
      asn_cache: asn_cache
      compart:
        default: "true"
      evalue:
        default: 0.0001
      gapextend:
        default: 1
      gapopen:
        default: 2
      max_bases_per_call:
        default: 500000000
      max_target_seqs:
        default: 250
      merge_align_filter:
        default: "((reciprocity = 3 AND align_length_ungap >= 5) OR align_length > 1000) AND pct_identity_gap > 25"
      merge_engine:
        default: "tree-merger"
      soft_masking:
        default: "true"
      task:
        default: megablast
      use_common_components:
        default: "true"
      window_size:
        default: 150
      word_size:
        default: 28
      workers_per_cpu:
        default: 0.4
    out: [blast_align]

  Identify_Top_N_ANI:
    run: ../task_types/tt_ani_top_n.cwl
    in:
      asn_cache: asn_cache
      ANI_cutoff: ANI_cutoff
      gencoll_asn: gencoll_asn
      blast_align: Assembly_Assembly_BLASTn/blast_align
      ref_assembly_taxid: ref_assembly_taxid
    out: [top, annot]

0 comments on commit f4cb632

Please sign in to comment.