Skip to content

Commit

Permalink
Merge pull request #55 from MicrobialDarkMatter/arguments-update
Browse files Browse the repository at this point in the history
Arguments update, minor postprocessing changes, bugfixes
  • Loading branch information
SorenHeidelbach authored Aug 2, 2024
2 parents 75a9e0d + 04dd384 commit 52cb8f5
Show file tree
Hide file tree
Showing 8 changed files with 90 additions and 43 deletions.
2 changes: 1 addition & 1 deletion nanomotif/_version.py
Original file line number Diff line number Diff line change
@@ -1 +1 @@
__version__ = "0.4.10"
__version__ = "0.4.11"
22 changes: 15 additions & 7 deletions nanomotif/bin_consensus.py
Original file line number Diff line number Diff line change
Expand Up @@ -61,20 +61,22 @@ def merge_bin_motifs(bin_motifs, bins, pileup, assembly):
assert list(bin_motifs.schema.keys()) == ['bin', 'motif', 'mod_position', 'mod_type', 'n_mod_bin', 'n_nomod_bin', 'contig_count', 'motif_type', 'mean_methylation']

for (bin, mod_type), df in bin_motifs.groupby("bin", "mod_type"):
log.debug(f"Starting motif merge for bin {bin} and mod_type {mod_type}")
contig_count = df.get_column("contig_count").max()

# Get list of motifs
motif_seq = df["motif"].to_list()
motif_pos = df["mod_position"].to_list()
motifs = [nm.candidate.Motif(nm.candidate.iupac_to_regex(seq), pos) for seq, pos in zip(motif_seq, motif_pos)]

new_bin_motifs = bin_motifs
merged_motifs = nm.candidate.merge_motifs(motifs)
for cluster, motifs in merged_motifs.items():
log.debug(f"Starting merge for cluster {cluster}")
merged_motif = motifs[0]
premerge_motifs = motifs[1]

premerge_motifs_iupac = [motif.iupac() for motif in premerge_motifs]
previous_motif_mean_max = bin_motifs.filter(pl.col("motif").is_in(premerge_motifs_iupac)).get_column("mean_methylation").max()
previous_motif_mean_max = df.filter(pl.col("motif").is_in(premerge_motifs_iupac)).get_column("mean_methylation").max()

merge_motif_n_mod = 0
merge_motif_n_nomod = 0
Expand All @@ -90,9 +92,13 @@ def merge_bin_motifs(bin_motifs, bins, pileup, assembly):
merge_motif_mean = merge_motif_n_mod / (merge_motif_n_mod + merge_motif_n_nomod)

if merge_motif_mean - previous_motif_mean_max > -0.1:
bin_motifs = bin_motifs.filter(pl.col("motif").is_in(premerge_motifs_iupac).not_())
bin_motifs = pl.concat(
[bin_motifs,
log.debug(f"Removing motifs {premerge_motifs_iupac} from bin {bin}")
new_bin_motifs = new_bin_motifs.filter(
pl.col("motif").is_in(premerge_motifs_iupac).not_() | (pl.col("bin") != bin)
)
log.debug(f"Adding motif {merged_motif.iupac()} to bin {bin}")
new_bin_motifs = pl.concat(
[new_bin_motifs,
pl.DataFrame(
{
"bin": bin,
Expand All @@ -105,7 +111,9 @@ def merge_bin_motifs(bin_motifs, bins, pileup, assembly):
"motif_type": nm.utils.motif_type(merged_motif.iupac()),
"mean_methylation": merge_motif_mean
},
schema=bin_motifs.schema
schema=new_bin_motifs.schema
)]
)
return bin_motifs
else:
log.debug(f"Skipping motif {merged_motif.iupac()} for bin {bin} and mod_type {mod_type} as it has a lower mean methylation than the previous motifs")
return new_bin_motifs
4 changes: 4 additions & 0 deletions nanomotif/candidate.py
Original file line number Diff line number Diff line change
Expand Up @@ -75,6 +75,8 @@ def sub_string_of(self, other_motif):

if self_stripped.length() < other_stripped.length():
return False
if self_stripped.string == other_stripped.string:
return False
size_difference = self_stripped.length() - other_stripped.length()

# Split into list of bases
Expand Down Expand Up @@ -138,6 +140,8 @@ def have_isolated_bases(self, isolation_size=2):
# If all surrounding positions are ".", it is isolated
if set(motif_split[index_start:pos] + motif_split[pos+1:index_end]) == set(["."]):
isolated = True
if set(motif_split[index_start:pos] + motif_split[pos+1:index_end]) == set(["N"]):
isolated = True
return isolated


Expand Down
8 changes: 7 additions & 1 deletion nanomotif/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,4 +32,10 @@
"m":"C",
"a":"A",
"21839":"C"
}
}
# Human-readable label for each modification code.
MOD_CODE_TO_PRETTY = {
    "m": "5mC",
    "a": "6mA",
    "21839": "4mC",
}
# Reverse lookup: human-readable label back to its modification code.
MOD_PRETTY_TO_CODE = {
    pretty: code for code, pretty in MOD_CODE_TO_PRETTY.items()
}
13 changes: 12 additions & 1 deletion nanomotif/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -89,7 +89,7 @@ def find_motifs(args, pileup = None, assembly = None):
seed = args.seed
)
motifs = pl.DataFrame(motifs)
if motifs is None:
if motifs is None or len(motifs) == 0:
log.info("No motifs found")
return

Expand Down Expand Up @@ -297,6 +297,17 @@ def bin_consensus(args, pileup = None, assembly = None, motifs = None, motifs_sc
output.write_csv(args.out + "/bin-motifs.tsv", separator="\t")

def motif_discovery(args):
# Check if all required files exist
if not os.path.exists(args.pileup):
log.error(f"File {args.pileup} does not exist")
return
if not os.path.exists(args.assembly):
log.error(f"File {args.assembly} does not exist")
return
if not os.path.exists(args.bins):
log.error(f"File {args.bins} does not exist")
return

# Check if output directory exists
log.info("Loading required files")
pileup = nm.load_pileup(args.pileup, threads = args.threads, min_fraction = args.threshold_methylation_general)
Expand Down
28 changes: 15 additions & 13 deletions nanomotif/postprocess.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,19 +8,21 @@ def remove_noisy_motifs(motif_df):
"""
Remove motifs that have isolated bases
"""
motif_df_clean = []
for contig, df in motif_df.groupby("contig"):
motif_strings = df.get_column("motif").to_list()
positions = df.get_column("mod_position").to_list()
motifs = [nm.candidate.Motif(motif_string, pos) for motif_string, pos in zip(motif_strings, positions)]
clean_motifs = []
for motif in motifs:
if not motif.have_isolated_bases(isolation_size = 3):
clean_motifs.append(motif)
df_clean = df.filter(col("motif").is_in(clean_motifs))
motif_df_clean.append(df_clean)
motif_df_clean = pl.concat(motif_df_clean)
return motif_df_clean
assert "motif" in motif_df.columns
assert "mod_position" in motif_df.columns
assert len(motif_df) > 0
motif_strings = motif_df.get_column("motif").to_list()
positions = motif_df.get_column("mod_position").to_list()
motifs = [nm.candidate.Motif(motif_string, pos) for motif_string, pos in zip(motif_strings, positions)]
clean_motifs = []
for motif in motifs:
if not motif.have_isolated_bases(isolation_size = 3):
clean_motifs.append(motif.string)
if len(clean_motifs) == 0:
raise ValueError("All identified motifs were noisy, stopping.")
else:
motif_df_clean = motif_df.filter(pl.col("motif").is_in(clean_motifs))
return motif_df_clean

def remove_child_motifs(motifs):
parent_motifs = []
Expand Down
26 changes: 8 additions & 18 deletions nanomotif/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@
import nanomotif as nm
np.random.seed(1)

def has_n_character_stretches_of_length_m(sequence, n, m, character):
    """
    Check whether a sequence contains at least n separate stretches of a
    character, where each stretch is at least m characters long.

    A stretch is a block of consecutive repeats of `character`. Matching is
    greedy and non-overlapping, so one long stretch counts once rather than
    once per m-sized window (e.g. "NNNN" is a single stretch for m=2).

    Args:
        sequence (str): The string to scan.
        n (int): Minimum number of qualifying stretches required.
        m (int): Minimum length a stretch must have to be counted.
            Intended for m >= 1.
        character (str): The character whose stretches are counted. It is
            matched literally, so regex metacharacters such as "." are safe.

    Returns:
        bool: True if at least n stretches of length >= m are found,
        False otherwise.
    """
    # Escape the character: without this, "." would match any base instead of
    # literal dots, and every sequence would look like one long stretch.
    stretch_pattern = rf"(?:{re.escape(character)}){{{m},}}"
    stretch_count = sum(1 for _ in re.finditer(stretch_pattern, sequence))
    return stretch_count >= n

def motif_type(motif_str):
if has_n_character_stretches(motif_str, 2, "N"):
if has_n_character_stretches_of_length_m(motif_str, 2, 2, "N"):
return "ambiguous"
elif re.search(r"(N){3,}", motif_str):
return "bipartite"
Expand Down Expand Up @@ -96,3 +84,5 @@ def all_lengths_equal(iterator):
except StopIteration:
return True
return all(first == len(x) for x in iterator)


30 changes: 28 additions & 2 deletions tests/test_candidate.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
from nanomotif.candidate import *
from nanomotif.constants import *
from nanomotif.utils import *
import pytest
from hypothesis import given, strategies as st
import itertools
Expand Down Expand Up @@ -95,7 +96,7 @@ def test_sub_string_of(self):
# Equal motifs strings
motif1 = Motif("ATCG", 2)
motif2 = Motif("ATCG", 0)
assert motif1.sub_string_of(motif2) == True
assert motif1.sub_string_of(motif2) == False

# Single base difference
motif1 = Motif("AGCG", 2)
Expand Down Expand Up @@ -214,7 +215,32 @@ def test_iupac(self):
assert motif.iupac() == "CNNCGNNNS"




def test_has_n_character_stretches_of_length_m():
    """Exercise the stretch check over a table of (n, m) thresholds for "N"."""
    # Each row: (sequence, n, m, expected result).
    cases = [
        ("NNA", 1, 2, True),
        ("NNA", 2, 1, False),
        ("NNA", 2, 2, False),
        ("NNANN", 2, 2, True),
        ("NNANN", 1, 2, True),
        ("NNANN", 2, 1, True),
        ("NNANN", 1, 1, True),
        ("NNANN", 3, 1, False),
        ("NNANN", 4, 1, False),
        ("NNANN", 3, 2, False),
        ("NNANN", 4, 2, False),
        ("NNANN", 3, 3, False),
        ("NNANN", 4, 3, False),
        ("NaNNaNNNaNNNN", 4, 1, True),
        ("NaNNaNNNaNNNN", 4, 2, False),
        ("NaNNaNNNaNNNN", 4, 3, False),
        ("NaNNaNNNaNNNN", 4, 4, False),
        ("NaNNaNNNaNNNN", 3, 1, True),
        ("NaNNaNNNaNNNN", 3, 2, True),
        ("NaNNaNNNaNNNN", 3, 3, False),
        ("NaNNaNNNaNNNN", 3, 4, False),
        ("NaNNaNNNaNNNN", 2, 1, True),
        ("NaNNaNNNaNNNN", 2, 2, True),
        ("NaNNaNNNaNNNN", 2, 3, True),
        ("NaNNaNNNaNNNN", 2, 4, False),
    ]
    for sequence, n, m, expected in cases:
        assert has_n_character_stretches_of_length_m(sequence, n, m, "N") == expected


0 comments on commit 52cb8f5

Please sign in to comment.