Merge pull request #55 from MicrobialDarkMatter/arguments-update

Arguments update, minor postprocessing changes, bugfixes
MicrobialDarkMatter · Aug 2, 2024 · 52cb8f5
2 parents 75a9e0d + 04dd384
commit 52cb8f5
Show file tree

Hide file tree

Showing 8 changed files with 90 additions and 43 deletions.
diff --git a/nanomotif/_version.py b/nanomotif/_version.py
@@ -1 +1 @@
-__version__ = "0.4.10"
+__version__ = "0.4.11"
diff --git a/nanomotif/bin_consensus.py b/nanomotif/bin_consensus.py
@@ -61,20 +61,22 @@ def merge_bin_motifs(bin_motifs, bins, pileup, assembly):
     assert list(bin_motifs.schema.keys()) == ['bin', 'motif', 'mod_position', 'mod_type', 'n_mod_bin', 'n_nomod_bin', 'contig_count', 'motif_type', 'mean_methylation']
 
     for (bin, mod_type), df in bin_motifs.groupby("bin", "mod_type"):
+        log.debug(f"Starting motif merge for bin {bin} and mod_type {mod_type}")
         contig_count = df.get_column("contig_count").max()
 
         # Get list of motifs
         motif_seq = df["motif"].to_list()
         motif_pos = df["mod_position"].to_list()
         motifs = [nm.candidate.Motif(nm.candidate.iupac_to_regex(seq), pos) for seq, pos in zip(motif_seq, motif_pos)]
-
+        new_bin_motifs = bin_motifs
         merged_motifs = nm.candidate.merge_motifs(motifs)
         for cluster, motifs in merged_motifs.items():
+            log.debug(f"Starting merge for cluster {cluster}")
             merged_motif = motifs[0]
             premerge_motifs = motifs[1]
 
             premerge_motifs_iupac = [motif.iupac() for motif in premerge_motifs]
-            previous_motif_mean_max = bin_motifs.filter(pl.col("motif").is_in(premerge_motifs_iupac)).get_column("mean_methylation").max()
+            previous_motif_mean_max = df.filter(pl.col("motif").is_in(premerge_motifs_iupac)).get_column("mean_methylation").max()
 
             merge_motif_n_mod = 0
             merge_motif_n_nomod = 0
@@ -90,9 +92,13 @@ def merge_bin_motifs(bin_motifs, bins, pileup, assembly):
             merge_motif_mean = merge_motif_n_mod / (merge_motif_n_mod + merge_motif_n_nomod)
 
             if merge_motif_mean - previous_motif_mean_max > -0.1:
-                bin_motifs = bin_motifs.filter(pl.col("motif").is_in(premerge_motifs_iupac).not_())
-                bin_motifs = pl.concat(
-                    [bin_motifs,
+                log.debug(f"Removing motifs {premerge_motifs_iupac} from bin {bin}")
+                new_bin_motifs = new_bin_motifs.filter(
+                    pl.col("motif").is_in(premerge_motifs_iupac).not_() | (pl.col("bin") != bin)
+                )
+                log.debug(f"Adding motif {merged_motif.iupac()} to bin {bin}")
+                new_bin_motifs = pl.concat(
+                    [new_bin_motifs,
                     pl.DataFrame(
                         {
                             "bin": bin,
@@ -105,7 +111,9 @@ def merge_bin_motifs(bin_motifs, bins, pileup, assembly):
                             "motif_type": nm.utils.motif_type(merged_motif.iupac()),
                             "mean_methylation": merge_motif_mean
                         },
-                        schema=bin_motifs.schema
+                        schema=new_bin_motifs.schema
                     )]
                 )
-    return bin_motifs
+            else:
+                log.debug(f"Skipping motif {merged_motif.iupac()} for bin {bin} and mod_type {mod_type} as it has a lower mean methylation than the previous motifs")
+    return new_bin_motifs
diff --git a/nanomotif/candidate.py b/nanomotif/candidate.py
@@ -75,6 +75,8 @@ def sub_string_of(self, other_motif):
 
         if self_stripped.length() < other_stripped.length():
             return False
+        if self_stripped.string == other_stripped.string:
+            return False
         size_difference = self_stripped.length() - other_stripped.length()
 
         # Split into list of bases
@@ -138,6 +140,8 @@ def have_isolated_bases(self, isolation_size=2):
             # If all surrounding positions are ".", it is isolated
             if set(motif_split[index_start:pos] + motif_split[pos+1:index_end]) == set(["."]):
                 isolated = True
+            if set(motif_split[index_start:pos] + motif_split[pos+1:index_end]) == set(["N"]):
+                isolated = True
         return isolated
 
 

diff --git a/nanomotif/constants.py b/nanomotif/constants.py
@@ -32,4 +32,10 @@
     "m":"C",
     "a":"A",
     "21839":"C"
-}
+}
+MOD_CODE_TO_PRETTY = {
+    "m":"5mC",
+    "a":"6mA",
+    "21839":"4mC"
+}
+MOD_PRETTY_TO_CODE = {v: k for k, v in MOD_CODE_TO_PRETTY.items()}
diff --git a/nanomotif/main.py b/nanomotif/main.py
@@ -89,7 +89,7 @@ def find_motifs(args, pileup = None, assembly = None):
             seed = args.seed
         )
     motifs = pl.DataFrame(motifs)
-    if motifs is None:
+    if motifs is None or len(motifs) == 0:
         log.info("No motifs found")
         return
 
@@ -297,6 +297,17 @@ def bin_consensus(args, pileup = None, assembly = None, motifs = None, motifs_sc
     output.write_csv(args.out + "/bin-motifs.tsv", separator="\t")
 
 def motif_discovery(args):
+    # Check if all required files exist
+    if not os.path.exists(args.pileup):
+        log.error(f"File {args.pileup} does not exist")
+        return
+    if not os.path.exists(args.assembly):
+        log.error(f"File {args.assembly} does not exist")
+        return
+    if not os.path.exists(args.bins):
+        log.error(f"File {args.bins} does not exist")
+        return
+
     # Check if output directory exists
     log.info("Loading required files")
     pileup = nm.load_pileup(args.pileup, threads = args.threads, min_fraction = args.threshold_methylation_general)

diff --git a/nanomotif/postprocess.py b/nanomotif/postprocess.py
@@ -8,19 +8,21 @@ def remove_noisy_motifs(motif_df):
     """
     Remove motifs that have isolated bases
     """
-    motif_df_clean = []
-    for contig, df in motif_df.groupby("contig"):
-        motif_strings = df.get_column("motif").to_list()
-        positions = df.get_column("mod_position").to_list()
-        motifs = [nm.candidate.Motif(motif_string, pos) for motif_string, pos in zip(motif_strings, positions)]
-        clean_motifs = []
-        for motif in motifs:
-            if not motif.have_isolated_bases(isolation_size = 3):
-                clean_motifs.append(motif)
-        df_clean = df.filter(col("motif").is_in(clean_motifs))
-        motif_df_clean.append(df_clean)
-    motif_df_clean = pl.concat(motif_df_clean)
-    return motif_df_clean
+    assert "motif" in motif_df.columns
+    assert "mod_position" in motif_df.columns
+    assert len(motif_df) > 0
+    motif_strings = motif_df.get_column("motif").to_list()
+    positions = motif_df.get_column("mod_position").to_list()
+    motifs = [nm.candidate.Motif(motif_string, pos) for motif_string, pos in zip(motif_strings, positions)]
+    clean_motifs = []
+    for motif in motifs:
+        if not motif.have_isolated_bases(isolation_size = 3):
+            clean_motifs.append(motif.string)
+    if len(clean_motifs) == 0:
+        raise ValueError("All identified motifs were noisy, stopping.")
+    else: 
+        motif_df_clean = motif_df.filter(pl.col("motif").is_in(clean_motifs))
+        return motif_df_clean
 
 def remove_child_motifs(motifs):
     parent_motifs = []

diff --git a/nanomotif/utils.py b/nanomotif/utils.py
@@ -8,7 +8,7 @@
 import nanomotif as nm
 np.random.seed(1)
 
-def has_n_character_stretches(sequence, n, character):
+def has_n_character_stretches_of_length_m(sequence, n, m, character):
     """
     Check if the given sequence has a segment of three or more consecutive dots.
 
@@ -18,24 +18,12 @@ def has_n_character_stretches(sequence, n, character):
     Returns:
     bool: True if there is a segment of three or more consecutive dots, False otherwise.
     """
-    count = 0
-    previous_char = ""
-    for char in sequence:
-        if char == character:
-            if previous_char ==character:
-                previous_char = char
-                continue
-            else:
-                previous_char = char
-                count += 1
-        else:
-            previous_char = char
-    if count >= n:
-        return True
-    else:
-        return False
+    regex_str = rf"({character}){{{m},}}"
+    matches = re.findall(regex_str, sequence)
+    return len(matches) >= n
+
 def motif_type(motif_str):
-    if has_n_character_stretches(motif_str, 2, "N"):
+    if has_n_character_stretches_of_length_m(motif_str, 2, 2, "N"):
         return "ambiguous"
     elif re.search(r"(N){3,}", motif_str):
         return "bipartite"
@@ -96,3 +84,5 @@ def all_lengths_equal(iterator):
     except StopIteration:
         return True
     return all(first == len(x) for x in iterator)
+
+
diff --git a/tests/test_candidate.py b/tests/test_candidate.py
@@ -1,5 +1,6 @@
 from nanomotif.candidate import *
 from nanomotif.constants import *
+from nanomotif.utils import *
 import pytest
 from hypothesis import given, strategies as st
 import itertools
@@ -95,7 +96,7 @@ def test_sub_string_of(self):
         # Equal motifs strings
         motif1 = Motif("ATCG", 2)
         motif2 = Motif("ATCG", 0)
-        assert motif1.sub_string_of(motif2) == True
+        assert motif1.sub_string_of(motif2) == False
 
         # Single base difference
         motif1 = Motif("AGCG", 2)
@@ -214,7 +215,32 @@ def test_iupac(self):
         assert motif.iupac() == "CNNCGNNNS"
 
 
-
 
+def test_has_n_character_stretches_of_length_m():
+    assert has_n_character_stretches_of_length_m("NNA", 1, 2, "N") == True
+    assert has_n_character_stretches_of_length_m("NNA", 2, 1, "N") == False
+    assert has_n_character_stretches_of_length_m("NNA", 2, 2, "N") == False
+    assert has_n_character_stretches_of_length_m("NNANN", 2, 2, "N") == True
+    assert has_n_character_stretches_of_length_m("NNANN", 1, 2, "N") == True
+    assert has_n_character_stretches_of_length_m("NNANN", 2, 1, "N") == True
+    assert has_n_character_stretches_of_length_m("NNANN", 1, 1, "N") == True
+    assert has_n_character_stretches_of_length_m("NNANN", 3, 1, "N") == False
+    assert has_n_character_stretches_of_length_m("NNANN", 4, 1, "N") == False
+    assert has_n_character_stretches_of_length_m("NNANN", 3, 2, "N") == False
+    assert has_n_character_stretches_of_length_m("NNANN", 4, 2, "N") == False
+    assert has_n_character_stretches_of_length_m("NNANN", 3, 3, "N") == False
+    assert has_n_character_stretches_of_length_m("NNANN", 4, 3, "N") == False
+    assert has_n_character_stretches_of_length_m("NaNNaNNNaNNNN", 4, 1, "N") == True
+    assert has_n_character_stretches_of_length_m("NaNNaNNNaNNNN", 4, 2, "N") == False
+    assert has_n_character_stretches_of_length_m("NaNNaNNNaNNNN", 4, 3, "N") == False
+    assert has_n_character_stretches_of_length_m("NaNNaNNNaNNNN", 4, 4, "N") == False
+    assert has_n_character_stretches_of_length_m("NaNNaNNNaNNNN", 3, 1, "N") == True
+    assert has_n_character_stretches_of_length_m("NaNNaNNNaNNNN", 3, 2, "N") == True
+    assert has_n_character_stretches_of_length_m("NaNNaNNNaNNNN", 3, 3, "N") == False
+    assert has_n_character_stretches_of_length_m("NaNNaNNNaNNNN", 3, 4, "N") == False
+    assert has_n_character_stretches_of_length_m("NaNNaNNNaNNNN", 2, 1, "N") == True
+    assert has_n_character_stretches_of_length_m("NaNNaNNNaNNNN", 2, 2, "N") == True
+    assert has_n_character_stretches_of_length_m("NaNNaNNNaNNNN", 2, 3, "N") == True
+    assert has_n_character_stretches_of_length_m("NaNNaNNNaNNNN", 2, 4, "N") == False