Skip to content

Commit

Permalink
Merge branch 'main' into read_level_methylation
Browse files Browse the repository at this point in the history
  • Loading branch information
SorenHeidelbach authored Sep 19, 2024
2 parents cb7346f + 273c2e8 commit 78026ce
Show file tree
Hide file tree
Showing 7 changed files with 119 additions and 70 deletions.
17 changes: 17 additions & 0 deletions docs/source/usage.md
Original file line number Diff line number Diff line change
Expand Up @@ -127,6 +127,23 @@ The output is two files: `include_contigs.tsv` and `new_contig_bin.tsv`. The `in

If decontamination should not be performed, the `include_contigs` can be run without the `--run_detect_contamination` flag or without the `--contamination_file` flag.

#### save_scores
Save scores will create a csv file for each contig, which contains all the information for scoring the contig. Columns are:
- bin: The bin the contig is compared to
- motif_mod: The motif being compared `motif_string + "_" + mod_type + "-" + mod_position`
- n_mod: The number of modified motifs in bin
- n_nomod: The number of unmodified motifs in bin
- n_motifs_bin: The total number of motifs in bin
- mean_methylation: n_mod / n_motifs_bin
- mean_methylation_bin: If the motif is methylated, this number represent the mean methylation degree only for contigs where the mean mean_methylation was above 0.25
- std_methylation_bin: The std deviation for the above calculation
- n_contigs: The number of contigs used in above calculation
- methylation_binary: Indicates if the bin is methylated or not
- contig_bin: The bin where the contig originally was assigned
- bin_compare: `bin + "_" + contig` for where the contig was originally assigned
- mean: The mean methylation degree for the contig
- methylation_mean_theshold: The threshold for assigning the contig as methylated or not. This depends on the variation of the bin methylation for the given motif
- methylation_binary_compare: The binary methylation for the contig

## MTase-linker

Expand Down
130 changes: 66 additions & 64 deletions nanomotif/binnary/scoring.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
from nanomotif.binnary import utils as ut
from nanomotif.parallel import update_progress_bar
import os
import sys
import polars as pl
import multiprocessing
from multiprocessing import get_context
Expand All @@ -17,9 +18,9 @@ def define_mean_methylation_thresholds(motif_binary_compare):
motif_binary_compare = motif_binary_compare.with_columns([
pl.when(pl.col("methylation_binary") == 1)
.then(
pl.when(pl.col("mean_methylation") - 4 * pl.col("std_methylation_bin") > 0.1)
pl.when(pl.col("mean_methylation") - 4 * pl.col("std_methylation_bin") > 0.25)
.then(pl.col("mean_methylation") - 4 * pl.col("std_methylation_bin"))
.otherwise(0.1)
.otherwise(0.25)
)
.otherwise(pl.lit(None))
.alias("methylation_mean_threshold")
Expand All @@ -31,7 +32,7 @@ def define_mean_methylation_thresholds(motif_binary_compare):
((pl.col("mean") >= pl.col("methylation_mean_threshold")) |
(pl.col("mean") > 0.4)))
.then(1)
.when(pl.col("methylation_binary") == 1)
.when((pl.col("methylation_binary") == 1) & pl.col("mean").is_not_null())
.then(0)
.otherwise(pl.lit(None))
.alias("methylation_binary_compare")
Expand Down Expand Up @@ -68,7 +69,7 @@ def compare_methylation_pattern(motif_binary_compare):
.when((motif_binary_compare['methylation_binary'] == 0) & (motif_binary_compare['methylation_binary_compare'] == 0))
.then(0)
.when((motif_binary_compare['methylation_binary'] == 1) & (motif_binary_compare['methylation_binary_compare'].is_null()))
.then(0)
.then(1)
.when((motif_binary_compare['methylation_binary'] == 0) & (motif_binary_compare['methylation_binary_compare'].is_null()))
.then(0)
.otherwise(pl.lit(None))
Expand Down Expand Up @@ -120,63 +121,61 @@ def process_bin_contig(

bin, bin_contig = task

try:
if mode == "contamination":
motif_binary_compare = bin_motifs_from_motifs_scored_in_bins \
.filter(pl.col("bin") == bin) \
.join(
ut.add_compare_df(motifs_scored_in_contigs, bin_contig),# motifs_scored_in_contigs.filter(pl.col("bin_compare") == bin_contig),
on="motif_mod"
)
if mode == "include":
motif_binary_compare = bin_motifs_from_motifs_scored_in_bins \
.filter(pl.col("bin") != bin) \
.join(
ut.add_compare_df(motifs_scored_in_contigs, bin_contig),# motifs_scored_in_contigs.filter(pl.col("bin_compare") == bin_contig),
on="motif_mod"
)

# Define methylation thresholds
motif_binary_compare = define_mean_methylation_thresholds(motif_binary_compare)

if args.save_scores:
path = os.path.join(args.out, "scores", args.command, "binary_compare" + bin_contig + ".csv")
motif_binary_compare.write_csv(path)
if mode == "contamination":
motif_binary_compare = bin_motifs_from_motifs_scored_in_bins \
.filter(pl.col("bin") == bin) \
.join(
ut.add_compare_df(motifs_scored_in_contigs, bin_contig),
on="motif_mod"
)
if mode == "include":
motif_binary_compare = bin_motifs_from_motifs_scored_in_bins \
.filter(pl.col("bin") != bin) \
.join(
ut.add_compare_df(motifs_scored_in_contigs, bin_contig),
on="motif_mod",
how="left"
)\
.with_columns(
pl.col("contig_bin").fill_null(bin),
pl.col("bin_compare").fill_null(bin_contig)
)

# Define methylation thresholds
motif_binary_compare = define_mean_methylation_thresholds(motif_binary_compare)

# Calculate the comparison score regardless of methylation presence
contig_bin_comparison_score = compare_methylation_pattern(motif_binary_compare)

# Check if the contig has no methylation and note it, but do not exclude it from further processing
contigHasNMethylation = motif_binary_compare.filter(pl.col("methylation_binary_compare") == 1).height


if mode == "include":
contig_bin_comparison_score = contig_bin_comparison_score \
.filter(
pl.col("binary_methylation_missmatch_score") == 0
)
if mode == "contamination":
contig_bin_comparison_score = contig_bin_comparison_score \
.filter(
pl.col("binary_methylation_missmatch_score") > 0
)

with lock:
counter.value += 1

if contigHasNMethylation == 0:
return contig_bin_comparison_score, bin_contig

return contig_bin_comparison_score, None
except Exception as e:
with lock:
counter.value += 1
if args.save_scores:
path = os.path.join(args.out, "scores", mode, "binary_compare" + bin_contig + ".csv")
motif_binary_compare.write_csv(path)

# Calculate the comparison score regardless of methylation presence
contig_bin_comparison_score = compare_methylation_pattern(motif_binary_compare)

# Check if the contig has no methylation and note it, but do not exclude it from further processing
contigHasNMethylation = motif_binary_compare.filter(pl.col("methylation_binary_compare") == 1).height


if mode == "include":
contig_bin_comparison_score = contig_bin_comparison_score \
.filter(
pl.col("binary_methylation_missmatch_score") == 0
)
if mode == "contamination":
contig_bin_comparison_score = contig_bin_comparison_score \
.filter(
pl.col("binary_methylation_missmatch_score") > 0
)

error_log_path = os.path.join(args.out, "logs", f"{args.command}_{bin_contig}_error.err")
with lock:
counter.value += 1

with open(error_log_path, 'w') as error_file:
error_file.write(f"Error processing {bin_contig}: {str(e)}\n")
return None, None
if contigHasNMethylation == 0:
return contig_bin_comparison_score, bin_contig
return contig_bin_comparison_score, None

def create_dir_if_not_exists(path):
if not os.path.exists(path):
os.makedirs(path)

def compare_methylation_pattern_multiprocessed(motifs_scored_in_bins, bin_consensus, mode, args, num_processes=1):
logger = logging.getLogger(__name__)
Expand Down Expand Up @@ -205,12 +204,10 @@ def compare_methylation_pattern_multiprocessed(motifs_scored_in_bins, bin_consen
progress_bar_process = multiprocessing.Process(target=update_progress_bar, args=(counter, len(tasks), True))
progress_bar_process.start()



if args.save_scores:
dir = os.path.join(args.out, "scores", args.command)
if not os.path.exists(dir):
os.makedirs(dir)
create_dir_if_not_exists(os.path.join(args.out, "scores"))
create_dir_if_not_exists(os.path.join(args.out, "scores", "pre-scoring"))
create_dir_if_not_exists(os.path.join(args.out, "scores", mode))

# Put them workers to work
results = pool.starmap(process_bin_contig, [(
Expand Down Expand Up @@ -239,6 +236,11 @@ def compare_methylation_pattern_multiprocessed(motifs_scored_in_bins, bin_consen
comparison_score = pl.concat([comparison_score, result])
if no_methylation is not None:
contigs_w_no_methylation.append(no_methylation)


if comparison_score.shape[0] == 0:
logger.warning("No contigs were found with the specified criteria.")
# exit
sys.exit(1)

return comparison_score, contigs_w_no_methylation

9 changes: 8 additions & 1 deletion nanomotif/scoremotifs.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,9 +8,10 @@
from nanomotif import candidate, evaluate
from nanomotif.parallel import update_progress_bar
from nanomotif.logger import configure_logger
from nanomotif.utils import concatenate_motif_position_files, clean_up_motif_positions_files
import numpy as np

def score_contig_with_motifs(contig, mod_type, pileup, sequence, motifs, na_positions=None, save_motif_positions=False, positions_outdir=None):

log.info(f"Scoring motifs in contig {contig}, modtype {mod_type}")
# Ensuring motifs are in regex for searching
motifs = motifs\
Expand Down Expand Up @@ -187,6 +188,12 @@ def score_sample_parallel(
if len(results) == 0:
return None
motifs_all_scored = pl.concat(results)

if save_motif_positions:
files = [os.path.join(positions_outdir, f) for f in os.listdir(positions_outdir) if f.endswith("_motifs_positions.npz")]
concatenate_motif_position_files(files, positions_outdir)
clean_up_motif_positions_files(files)

return motifs_all_scored


Expand Down
22 changes: 22 additions & 0 deletions nanomotif/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
import re
import random
import nanomotif as nm
import os
np.random.seed(1)

def has_n_character_stretches_of_length_m(sequence, n, m, character):
Expand Down Expand Up @@ -86,3 +87,24 @@ def all_lengths_equal(iterator):
return all(first == len(x) for x in iterator)


def concatenate_motif_position_files(files, path):
combined_data = {}
for f in files:
contig_name = os.path.basename(f).split("_")[0:-3]
contig_name = "_".join(contig_name)
mod_type = os.path.basename(f).split("_")[-3]

with np.load(f, allow_pickle=True) as data:
motif_data = {key: data[key].item() for key in data.files}

if contig_name not in combined_data:
combined_data[contig_name] = {}

combined_data[contig_name][mod_type] = motif_data

np.savez(os.path.join(path, "motif_positions_combined.npz"), **combined_data)

def clean_up_motif_positions_files(files):
for f in files:
os.remove(f)

2 changes: 1 addition & 1 deletion tests/binnary/test_cli_commands.py
Original file line number Diff line number Diff line change
Expand Up @@ -299,7 +299,7 @@ def test_contamination_with_output_scores():
# Check that the output directory was created
assert os.path.isdir(outdir), "Output directory was not created"

bins_in_dir = glob.glob(f"{outdir}/scores/detect_contamination/*.csv")
bins_in_dir = glob.glob(f"{outdir}/scores/contamination/*.csv")
assert len(bins_in_dir) == 13, "No scores were written"

csvs = glob.glob(f"{outdir}/scores/*/*.csv")
Expand Down
5 changes: 3 additions & 2 deletions tests/binnary/test_include_contigs.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,8 +27,9 @@ def test_include_contigs(loaded_data, motifs_scored_in_bins_and_bin_motifs):
include_contigs_df = include_contigs_df.to_pandas()

assert include_contigs_df is not None
assert include_contigs_df.shape[0] == 3
assert include_contigs_df[include_contigs_df["bin"] == "b3"]["contig"].iloc[0] == "contig_15"
print(include_contigs_df)
assert include_contigs_df.shape[0] == 2
assert include_contigs_df[include_contigs_df["bin"] == "b2"]["contig"].iloc[0] == "contig_6"



Expand Down
4 changes: 2 additions & 2 deletions tests/binnary/test_scoring.py
Original file line number Diff line number Diff line change
Expand Up @@ -80,6 +80,6 @@ def test_compare_methylation_pattern_include(loaded_data, motifs_scored_in_bins_
contig_bin_comparison_score = contig_bin_comparison_score.to_pandas()

assert contig_bin_comparison_score is not None
assert set(contig_bin_comparison_score["bin"].unique()) =={'b2', 'b3'}
assert len(contig_bin_comparison_score["contig"].unique()) == 2
assert set(contig_bin_comparison_score["bin"].unique()) =={'b2'}
assert len(contig_bin_comparison_score["contig"].unique()) == 1

0 comments on commit 78026ce

Please sign in to comment.