EBI-Metagenomics
diff --git a/‎README.md
Lines changed: 33 additions & 31 deletions b/‎README.md
Lines changed: 33 additions & 31 deletions
diff --git a/‎bin/add_extensions_to_checkm.py
Lines changed: 77 additions & 0 deletions b/‎bin/add_extensions_to_checkm.py
Lines changed: 77 additions & 0 deletions
diff --git a/‎bin/annotate_gff.py
Lines changed: 34 additions & 13 deletions b/‎bin/annotate_gff.py
Lines changed: 34 additions & 13 deletions
diff --git a/‎bin/change_extensions.py
Lines changed: 53 additions & 0 deletions b/‎bin/change_extensions.py
Lines changed: 53 additions & 0 deletions
diff --git a/‎bin/checkm2csv.py
Lines changed: 30 additions & 16 deletions b/‎bin/checkm2csv.py
Lines changed: 30 additions & 16 deletions
@@ -7,36 +7,36 @@ Gurbich TA, Almeida A, Beracochea M, Burdett T, Burgin J, Cochrane G, Raj S, Ric
 Detailed information about existing MGnify catalogues: https://docs.mgnify.org/src/docs/genome-viewer.html
 
 ### Tools used in the pipeline
-| Tool/Database                    | Version          | Purpose |
-|----------------------------------|------------------|----------- |
-| CheckM                           | 1.1.3            | Determining genome quality       |
-| dRep                             | 3.2.2            | Genome clustering       |
-| Mash                             | 2.3              | Sketch for the catalogue; placement of genomes into clusters (update only); strain tree      |
-| GUNC                             | 1.0.3            | Quality control       |
-| GUNC DB                          | 2.0.4            | Database for GUNC       |
-| GTDB-Tk                          | 2.3.0            | Assigning taxonomy; generating alignments       |
-| GTDB                             | r214             | Database for GTDB-Tk       |
-| Prokka                           | 1.14.6           | Protein annotation       |
-| IQ-TREE 2                        | 2.2.0.3          | Generating a phylogenetic tree       |
-| Kraken 2                         | 2.1.2            | Generating a kraken database       |
-| Bracken                          | 2.6.2            | Generating a bracken database       |
-| MMseqs2                          | 13.45111         | Generating a protein catalogue       |
-| eggNOG-mapper                    | 2.1.11           | Protein annotation (eggNOG, KEGG, COG,  CAZy)       |
-| eggNOG DB                        | 5.0              | Database for eggNOG-mapper       |
-| Diamond                          | 2.0.11           | Protein annotation (eggNOG)       |
-| InterProScan                     | 5.62-94.0        | Protein annotation (InterPro, Pfam)       |
-| CRISPRCasFinder                  | 4.3.2            | Annotation of CRISPR arrays       |
-| AMRFinderPlus                    | 3.11.4           |   Antimicrobial resistance gene annotation; virulence factors, biocide, heat, acid, and metal resistance gene annotation     |
-| AMRFinderPlus DB                 | 3.11 2023-02-23.1 | Database for AMRFinderPlus      |
-| SanntiS                          | 0.9.3.2          | Biosynthetic gene cluster annotation       |
-| Infernal                         | 1.1.4            | RNA predictions       |
-| tRNAscan-SE                      | 2.0.9            | tRNA predictions       |
-| Rfam                             | 14.9             | Identification of SSU/LSU rRNA and other ncRNAs       |
-| Panaroo                          | 1.3.2            | Pan-genome computation       |
-| Seqtk                            | 1.3              | Generating a gene catalogue       |
-| VIRify                           | 2.0.0            | Viral sequence annotation       |
-| [Mobilome annotation pipeline](https://github.com/EBI-Metagenomics/mobilome-annotation-pipeline) | 2.0.0-rc.1       | Mobilome annotation       |
-| samtools                         | 1.15             | FASTA indexing       |
+| Tool/Database                                           | Version           | Purpose |
+|---------------------------------------------------------|-------------------|----------- |
+| CheckM2                                                 | 1.0.1             | Determining genome quality       |
+| dRep                                                    | 3.2.2             | Genome clustering       |
+| Mash                                                    | 2.3               | Sketch for the catalogue; placement of genomes into clusters (update only); strain tree      |
+| GUNC                                                    | 1.0.3             | Quality control       |
+| GUNC DB                                                 | 2.0.4             | Database for GUNC       |
+| GTDB-Tk                                                 | 2.3.0             | Assigning taxonomy; generating alignments       |
+| GTDB                                                    | r214              | Database for GTDB-Tk       |
+| Prokka                                                  | 1.14.6            | Protein annotation       |
+| IQ-TREE 2                                               | 2.2.0.3           | Generating a phylogenetic tree       |
+| Kraken 2                                                | 2.1.2             | Generating a kraken database       |
+| Bracken                                                 | 2.6.2             | Generating a bracken database       |
+| MMseqs2                                                 | 13.45111          | Generating a protein catalogue       |
+| eggNOG-mapper                                           | 2.1.11            | Protein annotation (eggNOG, KEGG, COG,  CAZy)       |
+| eggNOG DB                                               | 5.0.2             | Database for eggNOG-mapper       |
+| Diamond                                                 | 2.0.11            | Protein annotation (eggNOG)       |
+| InterProScan                                            | 5.62-94.0         | Protein annotation (InterPro, Pfam)       |
+| CRISPRCasFinder                                         | 4.3.2             | Annotation of CRISPR arrays       |
+| AMRFinderPlus                                           | 3.11.4            |   Antimicrobial resistance gene annotation; virulence factors, biocide, heat, acid, and metal resistance gene annotation     |
+| AMRFinderPlus DB                                        | 3.11 2023-02-23.1 | Database for AMRFinderPlus      |
+| SanntiS                                                 | 0.9.3.2           | Biosynthetic gene cluster annotation       |
+| Infernal                                                | 1.1.4             | RNA predictions       |
+| tRNAscan-SE                                             | 2.0.9             | tRNA predictions       |
+| Rfam                                                    | 14.9              | Identification of SSU/LSU rRNA and other ncRNAs       |
+| Panaroo                                                 | 1.3.2             | Pan-genome computation       |
+| Seqtk                                                   | 1.3               | Generating a gene catalogue       |
+| VIRify                                                  | 2.0.1             | Viral sequence annotation       |
+| [Mobilome annotation pipeline](https://github.com/EBI-Metagenomics/mobilome-annotation-pipeline) | 2.0.1             | Mobilome annotation       |
+| samtools                                                | 1.15              | FASTA indexing       |
 
 ## Setup
 
@@ -52,12 +52,13 @@ Requirements:
 The pipeline needs the following reference databases and configuration files (roughly 150G):
 
 - ftp://ftp.ebi.ac.uk/pub/databases/metagenomics/genomes-pipeline/gunc_db_2.0.4.dmnd.gz
-- ftp://ftp.ebi.ac.uk/pub/databases/metagenomics/genomes-pipeline/eggnog_db.tgz
+- ftp://ftp.ebi.ac.uk/pub/databases/metagenomics/genomes-pipeline/eggnog_db_5.0.2.tgz
 - ftp://ftp.ebi.ac.uk/pub/databases/metagenomics/genomes-pipeline/rfam_14.9/
 - ftp://ftp.ebi.ac.uk/pub/databases/metagenomics/genomes-pipeline/kegg_classes.tsv
 - ftp://ftp.ebi.ac.uk/pub/databases/metagenomics/genomes-pipeline/continent_countries.csv
 - https://data.ace.uq.edu.au/public/gtdb/data/releases/release214/214.0/auxillary_files/gtdbtk_r214_data.tar.gz
 - ftp://ftp.ncbi.nlm.nih.gov/pathogen/Antimicrobial_resistance/AMRFinderPlus/database/3.11/2023-02-23.1
+- https://zenodo.org/records/4626519/files/uniref100.KO.v1.dmnd.gz
 
 ### Containers
 
@@ -100,6 +101,7 @@ nextflow run EBI-Metagenomics/genomes-pipeline -c <custom.config> -profile <prof
 --ena_genomes_checkm=<path to genomes quality data> \
 --mgyg_start=0 \
 --mgyg_end=10 \
+--preassigned_accessions=<path to file with preassigned accessions if using> \
 --catalogue_name=zebrafish-faecal \
 --catalogue_version="1.0" \
 --ftp_name="zebrafish-faecal" \
 
@@ -0,0 +1,77 @@
+#!/usr/bin/env python3
+
+# This file is part of MGnify genome analysis pipeline.
+#
+# MGnify genome analysis pipeline is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+
+# MGnify genome analysis pipeline is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+
+# You should have received a copy of the GNU General Public License
+# along with MGnify genome analysis pipeline. If not, see <https://www.gnu.org/licenses/>.
+
+
+import argparse
+import os
+
+
+def main(fasta_file_directory, checkm_directory):
+    """Put the FASTA file extensions back into a CheckM2 quality report.
+
+    Reads ``quality_report.tsv`` from ``checkm_directory`` and rewrites it in
+    place: in every data row the genome name in the first tab-separated column
+    is replaced by the matching file name found in ``fasta_file_directory``.
+    The header row (any line containing "Completeness") and blank lines are
+    copied through unchanged.
+
+    Raises:
+        FileNotFoundError: if ``quality_report.tsv`` is not in ``checkm_directory``.
+        KeyError: if a genome listed in the report has no matching file.
+    """
+    fasta_dict = load_file_list(fasta_file_directory)
+    checkm_path = os.path.join(checkm_directory, "quality_report.tsv")
+    # Raise explicitly: an assert would be stripped when Python runs with -O.
+    if not os.path.isfile(checkm_path):
+        raise FileNotFoundError("CheckM2 input doesn't exist")
+    updated_lines = []
+    with open(checkm_path, "r") as file_in:
+        for line in file_in:
+            if "Completeness" in line or not line.strip():
+                updated_lines.append(line)
+                continue
+            # Only the first column is rewritten; a plain str.replace on the
+            # whole row would also alter any other field containing the name.
+            genome_name, separator, remainder = line.partition("\t")
+            if genome_name not in fasta_dict:
+                raise KeyError(
+                    "No FASTA file found for genome {!r} listed in {}".format(
+                        genome_name, checkm_path
+                    )
+                )
+            updated_lines.append(fasta_dict[genome_name] + separator + remainder)
+    with open(checkm_path, "w") as file_out:
+        file_out.writelines(updated_lines)
+        
+    
+def load_file_list(fasta_file_directory):
+    """Map each directory entry, minus its last extension, to its full name.
+
+    Only the text after the final dot is dropped, so "a.b.fa" is keyed as
+    "a.b" and an entry without a dot is keyed by its own name. When several
+    entries share a key, the one listed last by os.listdir is kept.
+    """
+    return {
+        entry.rsplit(".", 1)[0]: entry
+        for entry in os.listdir(fasta_file_directory)
+    }
+        
+    
+def parse_args():
+    """Parse the command line of the script.
+
+    Returns:
+        argparse.Namespace with ``fasta_file_directory`` (-d) and
+        ``checkm_directory`` (-i). Both options are mandatory.
+    """
+    parser = argparse.ArgumentParser(
+        description=(
+            "The script processes CheckM2 output to put the genome file extensions back in."
+        )
+    )
+    parser.add_argument(
+        "-d",
+        dest="fasta_file_directory",
+        required=True,
+        help="Input directory containing FASTA files",
+    )
+    parser.add_argument(
+        "-i",
+        dest="checkm_directory",
+        # Without this the value defaults to None and the script later fails
+        # with a TypeError when building the report path.
+        required=True,
+        help=(
+            "Folder containing output of checkm2"
+        ),
+    )
+    return parser.parse_args()
+
+
+if __name__ == "__main__":
+    # Script entry point: read the two directories from the command line and
+    # rewrite the CheckM2 report found in the second one.
+    cli_args = parse_args()
+    main(cli_args.fasta_file_directory, cli_args.checkm_directory)
@@ -25,14 +25,21 @@ def get_iprs(ipr_annot):
         for line in f:
             cols = line.strip().split("\t")
             protein = cols[0]
+            try:
+                evalue = float(cols[8])
+            except ValueError:
+                continue
+            if evalue > 1e-10:
+                continue
             if protein not in iprs:
                 iprs[protein] = [set(), set()]
             if cols[3] == "Pfam":
                 pfam = cols[4]
                 iprs[protein][0].add(pfam)
             if len(cols) > 12:
                 ipr = cols[11]
-                iprs[protein][1].add(ipr)
+                if not ipr == "-":
+                    iprs[protein][1].add(ipr)
     return iprs
 
 
@@ -45,26 +52,37 @@ def get_eggnog(eggnog_annot):
             if line.startswith("#"):
                 eggnog_fields = get_eggnog_fields(line)
             else:
+                try:
+                    evalue = float(cols[2])
+                except ValueError:
+                    continue
+                if evalue > 1e-10:
+                    continue
                 protein = cols[0]
                 eggnog = [cols[1]]
                 try:
                     cog = cols[eggnog_fields["cog_func"]]
-                    cog = cog.split()
+                    cog = list(cog)
                     if len(cog) > 1:
                         cog = ["R"]
                 except Exception:
                     cog = ["NA"]
                 kegg = cols[eggnog_fields["KEGG_ko"]].split(",")
-                eggnogs[protein] = [eggnog, cog, kegg]
+                go = cols[eggnog_fields["GOs"]]
+                eggnogs[protein] = [eggnog, cog, kegg, go]
     return eggnogs
 
 
 def get_eggnog_fields(line):
+    """Work out which columns hold the eggNOG fields used by this script.
+
+    ``line`` is split on tabs and treated as a header row. The returned dict
+    gives the column indices for "KEGG_ko", "cog_func" and "GOs". Two header
+    layouts are recognised: "KEGG_ko" in column 8 with "CAZy" in column 15, or
+    "KEGG_ko" in column 11 with "CAZy" in column 18. Any other layout, or a
+    header without a "GOs" column, ends the program through sys.exit.
+    """
     cols = line.strip().split("\t")
+    try:
+        index_of_go = cols.index("GOs")
+    except ValueError:
+        sys.exit("Cannot find the GO terms column.")
     if cols[8] == "KEGG_ko" and cols[15] == "CAZy":
-        eggnog_fields = {"KEGG_ko": 8, "cog_func": 20}
+        eggnog_fields = {"KEGG_ko": 8, "cog_func": 20, "GOs": index_of_go}
     elif cols[11] == "KEGG_ko" and cols[18] == "CAZy":
-        eggnog_fields = {"KEGG_ko": 11, "cog_func": 6}
+        eggnog_fields = {"KEGG_ko": 11, "cog_func": 6, "GOs": index_of_go}
     else:
         sys.exit("Cannot parse eggNOG - unexpected field order or naming")
     return eggnog_fields
@@ -224,6 +242,8 @@ def add_gff(in_gff, eggnog_file, ipr_file, sanntis_file, amr_file):
                                     added_annot[protein]["COG"] = a
                                 elif pos == 3:
                                     added_annot[protein]["KEGG"] = a
+                                elif pos == 4:
+                                    added_annot[protein]["Ontology_term"] = a
                     except Exception:
                         pass
                     try:
@@ -257,7 +277,8 @@ def add_gff(in_gff, eggnog_file, ipr_file, sanntis_file, amr_file):
                         if a == "AMR":
                             cols[8] = "{};{}".format(cols[8], value)
                         else:
-                            cols[8] = "{};{}={}".format(cols[8], a, value)
+                            if value != "-":
+                                cols[8] = "{};{}={}".format(cols[8], a, value)
                     line = "\t".join(cols)
             out_gff.append(line)
     return out_gff
@@ -376,15 +397,15 @@ def add_ncrnas_and_crispr_to_gff(gff_outfile, ncrnas, crispr_annotations, res):
         help="GFF input file",
     )
     parser.add_argument(
-        "-e",
-        dest="eggnong",
-        help="eggnog annontations for the clutser repo",
+        "-i",
+        dest="ips",
+        help="InterproScan annotations results for the cluster rep",
         required=True,
     )
     parser.add_argument(
-        "-i",
-        dest="ips",
-        help="InterproScan annontations results for the cluster rep",
+        "-e",
+        dest="eggnog",
+        help="eggnog annotations for the cluster repo",
         required=True,
     )
     parser.add_argument(
@@ -414,7 +435,7 @@ def add_ncrnas_and_crispr_to_gff(gff_outfile, ncrnas, crispr_annotations, res):
 
     extended_gff = add_gff(
         in_gff=gff,
-        eggnog_file=args.eggnong,
+        eggnog_file=args.eggnog,
         ipr_file=args.ips,
         sanntis_file=args.sanntis,
         amr_file=args.amr,
 
@@ -0,0 +1,53 @@
+#!/usr/bin/env python3
+
+# This file is part of MGnify genome analysis pipeline.
+#
+# MGnify genome analysis pipeline is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+
+# MGnify genome analysis pipeline is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+
+# You should have received a copy of the GNU General Public License
+# along with MGnify genome analysis pipeline. If not, see <https://www.gnu.org/licenses/>.
+
+
+import os
+import argparse
+
+
+def change_file_extensions(directory_path):
+    """Rename every regular file in ``directory_path`` so that it ends in ".fa".
+
+    The last extension of each file name is replaced by ".fa"; a file without
+    an extension simply gets ".fa" appended. Files that already end in ".fa"
+    and sub-directories are left untouched.
+
+    Raises:
+        FileExistsError: if the ".fa" name a file would be renamed to is
+            already taken (for example "a.fna" next to "a.fa").
+    """
+    for filename in os.listdir(directory_path):
+        file_path = os.path.join(directory_path, filename)
+
+        # Only regular files are renamed.
+        if not os.path.isfile(file_path):
+            continue
+
+        file_name, file_extension = os.path.splitext(filename)
+        if file_extension == '.fa':
+            continue
+
+        new_file_path = os.path.join(directory_path, file_name + '.fa')
+        # os.rename silently replaces an existing file on POSIX, which would
+        # discard a genome whenever two inputs share the same stem.
+        if os.path.exists(new_file_path):
+            raise FileExistsError(
+                f"Cannot rename '{file_path}': '{new_file_path}' already exists."
+            )
+        os.rename(file_path, new_file_path)
+
+
+def main():
+    """Parse the command line and rename the genomes in the given folder to .fa.
+
+    Raises:
+        NotADirectoryError: if the folder passed with -i is not an existing
+            directory.
+    """
+    parser = argparse.ArgumentParser(description='The script changes extensions of genomes in the '
+                                                 'NCBI folder to .fa.')
+    parser.add_argument('-i', dest='input_folder', required=True, help='Input folder name where genomes are located.')
+
+    args = parser.parse_args()
+    input_folder = args.input_folder
+
+    # Explicit check rather than assert, which is removed under "python -O".
+    if not os.path.isdir(input_folder):
+        raise NotADirectoryError(f"Error: The input folder '{input_folder}' does not exist.")
+
+    change_file_extensions(input_folder)
+
+
+# Run the renaming only when the file is executed as a script, not on import.
+if __name__ == '__main__':
+    main()
@@ -16,7 +16,6 @@
 # along with MGnify genome analysis pipeline. If not, see <https://www.gnu.org/licenses/>.
 
 
-import sys
 import argparse
 
 if __name__ == "__main__":
@@ -27,24 +26,39 @@
         "-i",
         "--input",
         dest="input",
-        help="checkm_results.tab: checkm output log",
+        help="checkm_results.tab (for CheckM) or quality_report.tsv (for CheckM2)",
         required=True,
     )
+    parser.add_argument(
+        "--checkm2",
+        action='store_true',
+        help="Use flag if input is produced by CheckM2; default: False",
+        default=False,
+    )
 
     args = parser.parse_args()
-
-    print("genome,completeness,contamination,strain_heterogeneity")
+    
+    if args.checkm2:
+        print("genome,completeness,contamination")
+    else:
+        print("genome,completeness,contamination,strain_heterogeneity")
 
     with open(args.input, "r") as f:
-        next(f)
-        for line in f:
-            if "INFO:" in line:
-                continue
-            if "Completeness" in line and "Contamination" in line:
-                continue
-            cols = line.strip("\n").split("\t")
-            genome = cols[0]
-            complet = cols[-3]
-            cont = cols[-2]
-            strain = cols[-1]
-            print("%s.fa,%s,%s,%s" % (genome, complet, cont, strain))
+        if args.checkm2:
+            next(f)
+            for line in f:
+                genome, complet, cont = line.split("\t")[:3]
+                print("{},{},{}".format(genome, complet, cont))
+        else:
+            next(f)
+            for line in f:
+                if "INFO:" in line:
+                    continue
+                if "Completeness" in line and "Contamination" in line:
+                    continue
+                cols = line.strip("\n").split("\t")
+                genome = cols[0]
+                complet = cols[-3]
+                cont = cols[-2]
+                strain = cols[-1]
+                print("{},{},{},{}".format(genome, complet, cont, strain))