Skip to content

Commit 1f5235e

Browse files
committed
rabbit fixes, better validation
1 parent d07cf66 commit 1f5235e

File tree

5 files changed

+31
-15
lines changed

5 files changed

+31
-15
lines changed

harpy/_conda.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -34,7 +34,7 @@ def create_conda_recipes(outdir: str, envs: list=None) -> None:
3434
],
3535
"demultiplex": [
3636
"bioconda::pheniqs",
37-
"bioconda::pysam",
37+
"bioconda::pysam=0.22",
3838
"conda-forge::python-levenshtein"
3939
],
4040
"metassembly": [

harpy/_printing.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -101,7 +101,7 @@ def print_onerror(logfile):
101101
console.print(f"The workflow stopped because of an error. Full workflow log:\n[bold]{logfile}[/bold]")
102102
console.rule("[bold]Where Error Occurred", style = "red")
103103

104-
def workflow_info(*arg):
104+
def workflow_info(*arg: tuple[str, str | int | float]) -> Table:
105105
"""
106106
Accepts an unlimited number of length-2 lists or tuples and returns a rich.Table with the value of the first indices as the row names and the second indices as the values
107107
Use None instead of a list to ignore that entry (useful for conditionals). The second value will always be converted to a string.

harpy/_validations.py

+26-11
Original file line numberDiff line numberDiff line change
@@ -257,17 +257,32 @@ def validate_popsamples(infiles, popfile, quiet):
257257

258258
def validate_demuxschema(infile):
    """Validate the file format of the demultiplex schema.

    The schema is read as whitespace-delimited rows of ``sample segment``
    (e.g. ``sample_01<tab>C75``). Blank rows and rows starting with ``#`` are
    ignored, as are rows that do not have exactly two columns.

    The function terminates the program with exit code 1 when:
      * a segment ID is listed more than once,
      * the file has no valid rows, or
      * the segment IDs start with more than one distinct letter.

    Returns None when the schema passes all checks.
    """
    code_letters = set()  # codes can be Axx, Bxx, Cxx, Dxx
    segment_ids = set()
    with open(infile, "r", encoding="utf-8") as schema:
        for line in schema:
            row = line.strip()
            # blank rows and commented-out rows are not schema entries
            if not row or row.startswith("#"):
                continue
            try:
                # only the unpacking can raise ValueError, so keep the try narrow
                _sample, segment_id = row.split()
            except ValueError:
                # skip rows without two columns
                continue
            if segment_id in segment_ids:
                print_error("duplicate segment", "An ID segment must only be associated with a single sample.")
                print_solution_with_culprits(
                    "A barcode segment can only be associated with a single sample. For example, [green]C05[/green] cannot be the segment that identifies both [blue]sample_01[/blue] and [blue]sample_2[/blue].",
                    "The segment triggering this error is:"
                )
                click.echo(segment_id, file=sys.stderr)
                sys.exit(1)
            segment_ids.add(segment_id)
            # str.split() never yields empty tokens, so indexing [0] is safe
            code_letters.add(segment_id[0])
    if not code_letters:
        print_error("incorrect schema format", f"Schema file {os.path.basename(infile)} has no valid rows. Rows should be sample<tab>segment, e.g. sample_01<tab>C75")
        sys.exit(1)
    if len(code_letters) > 1:
        print_error("invalid schema", f"Schema file {os.path.basename(infile)} has sample IDs expected to be identified across multiple barcode segments. All sample IDs for this technology should be in a single segment, such as [bold green]C[/bold green] or [bold green]D[/bold green].")
        sys.exit(1)
271286

272287
def validate_regions(regioninput, genome):
273288
"""validates the --regions input of harpy snp to infer whether it's an integer, region, or file"""

harpy/align.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -139,7 +139,7 @@ def bwa(inputs, output_dir, genome, depth_window, threads, keep_unmapped, extra_
139139
if setup_only:
140140
sys.exit(0)
141141

142-
start_text = (
142+
start_text = workflow_info(
143143
("Samples:",sample_count),
144144
("Genome:", genome),
145145
("Output Folder:", output_dir + "/"),

harpy/scripts/demultiplex_gen1.py

+2-1
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,8 @@ def read_barcodes(file_path, segment):
1414
try:
1515
code, seq = line.rstrip().split()
1616
if code[0].upper() != segment:
17-
parser.error(f"Segments in {file_path} are expected to begin with {segment}, but begin with {code[0].upper()}")
17+
sys.stderr.write(f"Segments in {file_path} are expected to begin with {segment}, but begin with {code[0].upper()}\n")
18+
sys.exit(1)
1819
data_dict[seq] = code
1920
except ValueError:
2021
# skip rows without two columns

0 commit comments

Comments
 (0)