bug fixes

fdarthen · Oct 11, 2021 · ed4d543 · ed4d543
2 parents 51d95d8 + d01bd5c
commit ed4d543
Show file tree

Hide file tree

Showing 4 changed files with 32 additions and 30 deletions.
diff --git a/MILTS.sh b/MILTS.sh
@@ -10,6 +10,7 @@ config_path=$1
 # read variables from config file
 source <(grep : $config_path | sed 's/ *: */=/g')
 
+
 # extract first pbc_path from list in $pbc_paths
 pbc_path=$(echo $pbc_paths | cut -d ',' -f1 | awk -F '[' '{print $2}' | awk -F ']' '{print $1}')
 
@@ -173,8 +174,8 @@ echo "plot taxonomic assignment end (time elapsed:" $(($time6_2-$time6_1)) "s)"
 
 # 5.c) create static plots from json files
 # ${output_path}tmp/*.json not in "" so that filenames are preserved
-[[ "${output_pdf}" = "TRUE" ]] && xvfb-run -a orca graph ${output_path}tmp/*.json -f "pdf" -d "${output_path}taxonomic_assignment/"
-[[ "${output_png}" = "TRUE" ]] && xvfb-run -a orca graph ${output_path}tmp/*.json -f "png" -d "${output_path}taxonomic_assignment/"
+[[ "${output_pdf}" = "TRUE" ]] && orca graph ${output_path}tmp/*.json -f "pdf" -d "${output_path}taxonomic_assignment/"
+[[ "${output_png}" = "TRUE" ]] && orca graph ${output_path}tmp/*.json -f "png" -d "${output_path}taxonomic_assignment/"
 
 
 # 6) remove the temporarily created files

diff --git a/README.md b/README.md
@@ -1,3 +1,7 @@
-# MILTS
+## Introduction
+MILTS enables the interactive exploration of taxonomic footprints in gene sets. The specific goal is to detect and differentiate contamination and horizontal gene transfer.
 
-find the manual at http://core.izn-ffm.intern/wiki/doku.php?id=freya:master
+Besides the taxonomic assignment of genes, MILTS uses a total of 16 further indicators to facilitate this. Among these indicators are read coverage, sequence composition, gene length and position of genes within their scaffold. To identify genes which deviate from the mean set of genes, a principal component analysis (PCA) is used as it condenses data to fewer dimensions. Genes with similar values for certain variables are thereby clustered together, so that deviations are made visible. The results can be interactively examined in a 3D scatterplot, where the dot position represents a combination of coverage, sequence composition and spatial information provided by the PCA, and the color represents the taxonomic assignment.
+
+## Documentation
+Please see the [GitHub Wiki](https://github.com/fdhubert/MILTS/wiki) for further information. 
diff --git a/config.yml b/config.yml
@@ -1,32 +1,26 @@
 
 ## Input and output options ##
-fasta_path: "./example/test.fasta"
-gff_path: "./example/test.gff"
-gff_source: "default" # which source to use in GFF
-output_path: "./MILTS_report/" # directory to save results to
-taxon_id: "107806" # NCBI Taxon ID of you query species
+fasta_path: "path/to/assembly.fasta"
+gff_path: "path/to/assembly.gff"
+output_path: "path/to/output_directory/" # directory to save results to
+taxon_id: "<NCBI taxon ID>" # NCBI Taxon ID of query species
 
 ## Coverage options ##
-include_coverage: "TRUE"
+include_coverage: "FALSE"
 compute_coverage: "FALSE"
-pbc_paths: ["./example/test.pbc.txt","./example/test.pbc_2.txt"]
-bam: "path/to/mapping.bam"
-reads: "path/to/reads"
-insert_size: ""
-
-## Gene info options ##
-include_pseudogenes: "FALSE"
+pbc_paths: ["paths/to/pbc.txt"]
+bam: "path/to/bam"
+reads: ["paths/to/read_files"]
+insert_size: "200" # insert size for paired end reads
 
 ## Taxonomic assignment options ##
-compute_tax_assignment: "FALSE"
-extract_proteins: "FALSE"       # set proteins_path as output path
-proteins_path: "./example/test_proteins.fasta"
-tax_assignment_path: "./example/top10_hits.txt"
-database_path: "path/to/nrTaxonomy.dmnd"
+compute_tax_assignment: "TRUE"
+extract_proteins: "TRUE"       # set proteins_path as output path
+proteins_path: "path/to/proteins.fasta"
+tax_assignment_path: "path/to/taxonomic_hits.txt"
+database_path: "path/to/database.dmnd"
 taxon_exclude: "TRUE"
-assignment_mode: "quick"
-quick_mode_search_rank: "phylum"
-quick_mode_match_rank: "class"
+assignment_mode: "exhaustive"
 
 ## Plot output options ##
 update_plots: "FALSE"
@@ -35,20 +29,23 @@ merging_labels: ["strain"]
 output_pdf: "TRUE"
 output_png: "FALSE"
 
+## Gene info options ##
+include_pseudogenes: "FALSE"
+
 ## PCA options ##
 input_variables: "c_name,c_num_of_genes,c_len,c_genelenm,c_genelensd,g_len,g_lendev_c,g_abspos,g_terminal,c_cov,c_covsd,g_cov,g_covsd,g_covdev_c,c_pearson_r,g_pearson_r_o,g_pearson_r_c"
 perform_parallel_analysis: "FALSE"
 num_pcs: "3"
 coverage_cutoff_mode: "default"
 
 ## Clustering options ##
-perform_kmeans: "TRUE"
+perform_kmeans: "FALSE"
 kmeans_k: "default"
-perform_hclust: "TRUE"
+perform_hclust: "FALSE"
 hclust_k: "default"
-perform_mclust: "TRUE"
+perform_mclust: "FALSE"
 mclust_k: "default"
-perform_dbscan: "TRUE"
+perform_dbscan: "FALSE"
 dbscan_groups: "default"
 custom_eps: "0.3"
 custom_minPts: "10"
diff --git a/produce_gene_info.py b/produce_gene_info.py
@@ -1462,7 +1462,7 @@ def main():
     # read parameters from config file
     config_obj=yaml.safe_load(open(config_path,'r'))
     gff_path=config_obj['gff_path'] # GFF file path
-    pbc_paths=list(config_obj['pbc_paths']) if 'pbc_paths' in config_obj.keys() else [] # per base coverage (PBC) file path(s)# per base coverage (PBC) file path(s)
+    pbc_paths=list(config_obj['pbc_paths']) if 'pbc_paths' in config_obj.keys() else [] # per base coverage (PBC) file path(s)
     output_path=config_obj['output_path'] # complete output path (ENDING ON A SLASH!)
     fasta_path= config_obj['fasta_path'] # path to FASTA file
     include_pseudogenes = config_obj['include_pseudogenes'] # boolean signifying whether pseudogenes should be included in the analysis