Merge pull request #72 from MicrobialDarkMatter/read_level_methylation

Read level methylation
MicrobialDarkMatter · Sep 19, 2024 · f20dcfe · f20dcfe
2 parents 273c2e8 + 78026ce
commit f20dcfe
Show file tree

Hide file tree

Showing 9 changed files with 647 additions and 346 deletions.
diff --git a/nanomotif/_version.py b/nanomotif/_version.py
@@ -1 +1 @@
-__version__ = "0.4.12"
+__version__ = "0.4.13"
diff --git a/nanomotif/argparser.py b/nanomotif/argparser.py
@@ -30,6 +30,8 @@ def  create_parser():
     parser_shared_find_motifs.add_argument("--threshold_valid_coverage", type=int, default=5, help="minimum valid base coverage for a position to be considered. Default: %(default)s")
     parser_shared_find_motifs.add_argument("--minimum_kl_divergence", type=float, default=0.05, help="minimum KL-divergence for a position to considered for expansion in  motif search. Higher value means less exhaustive, but faster search. Default: %(default)s")
     parser_shared_find_motifs.add_argument("--min_motifs_contig", type=int, default=20, help="minimum number of times a motif has to have been oberserved in a contig. Default: %(default)s")
+    parser_shared_find_motifs.add_argument("--read_level_methylation", action="store_true", help="If specified, methylation is calculated on read level instead of contig level. This is slower but produces more stable motifs.")
+    parser_shared_find_motifs.add_argument("--min_motif_score", type=float, default=0.2, help="minimum score for a motif to be kept after identification considered valid. Default: %(default)s")
     parser_find_motifs = subparsers.add_parser(
         'find_motifs', 
         parents=[parser_positional, parser_optional, parser_shared_find_motifs], 

diff --git a/nanomotif/bin_consensus.py b/nanomotif/bin_consensus.py
@@ -43,7 +43,7 @@ def convert_motifs_to_regex(motifs):
     #contig_motifs = nm.postprocess.remove_sub_motifs(motifs_scored_filt)
 
     contig_motifs = motifs_scored_filt.with_columns(
-            pl.col("motif").apply(lambda x: nm.seq.regex_to_iupac(x)).alias("motif"),
+            pl.col("motif").map_elements(lambda x: nm.seq.regex_to_iupac(x), return_dtype = pl.Utf8).alias("motif"),
             (pl.col("n_mod")  / (pl.col("n_mod") + pl.col("n_nomod"))).alias("mean")
         )
     bin_motifs = contig_motifs.groupby("bin", "motif", "mod_position", "mod_type") \

diff --git a/nanomotif/dataload.py b/nanomotif/dataload.py
@@ -39,6 +39,22 @@ def load_pileup(path: str, threads: int = 1, min_fraction: float = 0):
     pileup = pileup.rename({"column_1":"contig", "column_2": "position", "column_4": "mod_type", "column_6": "strand", "column_11": "fraction_mod", "column_10":"Nvalid_cov"})
     return Pileup(pileup)
 
+def load_low_coverage_positions(path_pileup: str, threads: int = 1, min_coverage: float = 5):
+    """
+    Load only the low-coverage positions from a pileup.bed output of modkit pileup.
+
+    The file is scanned lazily as a tab-separated table without a header row,
+    so columns are addressed by their positional names (column_1, column_2, ...).
+    A row is kept when both conditions hold:
+
+    * column_10 (renamed to "Nvalid_cov") is less than or equal to `min_coverage`
+    * column_10 / (column_10 + column_17) is greater than 0.3
+
+    Parameters
+    ----------
+    path_pileup : path to the pileup.bed file
+    threads : accepted but not used anywhere in this function body
+    min_coverage : inclusive upper bound on column_10 for a row to be kept
+
+    Returns
+    -------
+    The collected frame with the columns contig, position, mod_type, strand,
+    fraction_mod and Nvalid_cov. Unlike load_pileup, the result is returned
+    as-is and is not wrapped in a Pileup object.
+    """
+    pileup = (
+        pl.scan_csv(path_pileup, separator = "\t", has_header = False)
+        # Inclusive bound: rows with coverage equal to min_coverage are kept
+        .filter(pl.col("column_10") <= min_coverage)
+        # NOTE(review): column_17 is presumably modkit's Ndiff count, making this
+        # the share of valid calls among valid + differing reads - confirm
+        .filter(pl.col("column_10") / (pl.col("column_10") + pl.col("column_17")) > 0.3)
+        .select(["column_1", "column_2","column_4", "column_6", "column_11", "column_10"])
+        # Rescale column_11 by 1/100 (presumably percent -> fraction; it is
+        # renamed to "fraction_mod" below)
+        .with_columns(pl.col("column_11") / 100)
+        .collect()
+    )
+    # Give the positional columns descriptive names
+    pileup = pileup.rename({"column_1":"contig", "column_2": "position", "column_4": "mod_type", "column_6": "strand", "column_11": "fraction_mod", "column_10":"Nvalid_cov"})
+    return pileup
+
+
 def load_assembly(path: str):
     """
     Load assembly from path to fasta file