From 370339bb119e5368fc21d9811009dfc71522d3b5 Mon Sep 17 00:00:00 2001
From: Vyacheslav Brover
Date: Wed, 19 Feb 2025 12:50:59 -0500
Subject: [PATCH] #3: min. complexity

---
 genetics/marker2qual.sh | 12 +++++++-----
 1 file changed, 7 insertions(+), 5 deletions(-)

diff --git a/genetics/marker2qual.sh b/genetics/marker2qual.sh
index b1028f76..8a284d45 100755
--- a/genetics/marker2qual.sh
+++ b/genetics/marker2qual.sh
@@ -1,16 +1,18 @@
 #!/bin/bash --noprofile
 THIS=$( dirname $0 )
 source $THIS/../bash_common.sh
-if [ $# -ne 3 ]; then
+if [ $# -ne 4 ]; then
   echo "Print a good quality subset of eukaryotic marker proteins created by tblastn2marker_euk.sh"
   echo "#1: marker proteins (FASTA)"
   echo "#2: min. score to length ratio"
-  echo "#3: output uniKernel file | ''"
+  echo "#3: min. complexity"
+  echo "#4: output uniKernel file | ''"
   exit 1
 fi
 M=$1
 T=$2
-UNI=$3
+COMPL=$3
+UNI=$4
 
 
 TMP=$( mktemp )
@@ -18,13 +20,13 @@ TMP=$( mktemp )
 
 grep '^>' $M | cut -f 1,7 -d ' '| sed 's/^>//1' | sed 's/ score=/\t/1' > $TMP.score
 $THIS/fasta2len $M > $TMP.len
-paste $TMP.len $TMP.score | awk -F '\t' '{OFS="\t"; print $1, $4/$2};' > $TMP.stat
+paste $TMP.len $TMP.score | awk -F '\t' '{OFS="\t"; print $1, $4 / $2};' > $TMP.stat
 if [ "$UNI" ]; then
   $THIS/../dm/conversion/cols2dm.sh $TMP.stat 0 5 1 > $TMP.dm
   $THIS/../dm/uniKernel $TMP "V2" -qc > $UNI
 fi
 awk '$2 > '$T $TMP.stat | cut -f 1 > $TMP.list
-$THIS/filterFasta $M -aa -target $TMP.list -len_min 20 -complexity_min 3
+$THIS/filterFasta $M -aa -target $TMP.list -len_min 20 -complexity_min $COMPL
 
 
 rm $TMP*