Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

fix: Remove large dataset error on the rSPR entry #177

Merged
merged 11 commits into from
Nov 22, 2023
9 changes: 5 additions & 4 deletions bin/rspr_approx.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@ def parse_args(args=None):
parser = argparse.ArgumentParser(description=Description, epilog=Epilog)
parser.add_argument("-c", "--core", dest="CORE_TREE", help="Core tree")
parser.add_argument(
"-a", "--acc", dest="GENE_TREES", nargs="+", help="Gene tree list"
"-a", "--acc", dest="GENE_TREES", help="Gene tree samplesheet path"
)
parser.add_argument(
"-ann",
Expand Down Expand Up @@ -98,15 +98,15 @@ def root_tree(input_path, output_path):
### FUNCTION ROOT_TREES
### Root all the unrooted input trees in directory
### core_tree: path of the core tree
### gene_trees: input gene tree directory path
### gene_trees: path of the csv file containing all the gene tree paths
### output_dir: output directory path
### results: dataframe of the results to store tree size
### merge_pair: boolean to check whether to merge the core tree and gene tree in a single file
### RETURN path of the rooted gene trees directory
#####################################################################


def root_trees(core_tree, gene_trees, output_dir, results, merge_pair=False):
def root_trees(core_tree, gene_trees_path, output_dir, results, merge_pair=False):
print("Rooting trees")
#'''
reference_tree = core_tree
Expand All @@ -119,8 +119,9 @@ def root_trees(core_tree, gene_trees, output_dir, results, merge_pair=False):
)
refer_content, refer_tree_size = root_tree(reference_tree, rooted_reference_tree)

df_gene_trees = pd.read_csv(gene_trees_path)
rooted_gene_trees_path = os.path.join(output_dir, "rooted_gene_trees")
for filename in gene_trees:
for filename in df_gene_trees["path"]:
basename = Path(filename).name
rooted_gene_tree_path = os.path.join(rooted_gene_trees_path, basename)
gene_content, gene_tree_size = root_tree(filename, rooted_gene_tree_path)
Expand Down
2 changes: 1 addition & 1 deletion modules/local/rspr/approx.nf
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@ process RSPR_APPROX {
"""
rspr_approx.py \\
--core $core_tree \\
--acc \$(cat $gene_tree_list) \\
--acc $gene_tree_list \\
--annotation $annotation \\
-o approx \\
--min_rspr_distance $min_rspr_distance \\
Expand Down
8 changes: 1 addition & 7 deletions subworkflows/local/rspr.nf
Original file line number Diff line number Diff line change
Expand Up @@ -6,17 +6,11 @@ workflow RSPR {

take:
core_tree
gene_trees
gene_tree_sheet
annotation

main:

gene_trees
.flatten()
.map{it -> it.toString() }
.collectFile(name: 'gene_tree_paths.txt', newLine: true)
.set{ gene_tree_sheet }

RSPR_APPROX (
core_tree,
gene_tree_sheet,
Expand Down
2 changes: 0 additions & 2 deletions subworkflows/local/rspr_input_check.nf
Original file line number Diff line number Diff line change
Expand Up @@ -4,8 +4,6 @@ workflow RSPR_INPUT_CHECK {

main:
samplesheet
.splitCsv(header: true)
.map { it -> get_sample_info_rspr(it.path) }
.set { trees }

emit:
Expand Down
15 changes: 14 additions & 1 deletion workflows/arete.nf
Original file line number Diff line number Diff line change
Expand Up @@ -231,9 +231,22 @@ workflow ARETE {
}

if (params.run_rspr) {
PHYLOGENOMICS.out.gene_trees
.flatten()
.map{it -> it.toString() }
.collectFile(newLine: true) { item ->
["${item}.txt",
"sample,path\n" + item + ',' + item ]
}
.set { individual_sheets }

individual_sheets
.collectFile(name: 'gene_tree_paths.txt', skip:1 , keepHeader: true)
.set{ gene_tree_sheet }

RSPR (
PHYLOGENOMICS.out.core_tree,
PHYLOGENOMICS.out.gene_trees,
gene_tree_sheet,
ANNOTATE_ASSEMBLIES.out.annotation
)
}
Expand Down