fix: report errors properly in Counts app and combine step

stjude · Apr 13, 2021 · 81f2267 · 81f2267
1 parent dbb4faa
commit 81f2267
Show file tree

Hide file tree

Showing 11 changed files with 60 additions and 85 deletions.
diff --git a/stjude_warden_bam/dxapp.json b/stjude_warden_bam/dxapp.json
@@ -45,6 +45,7 @@
     {
       "name": "Genome",
       "class": "string",
+      "optional": false,
       "choices": [
         "Human_hg38_v31 : hg38 Gencode v31",
         "Human_hg19_v19 : hg19 Gencode v19",

diff --git a/...s/app_data/internal_source/warden_combine_counts/resources/usr/bin/process_count_files.py b/...s/app_data/internal_source/warden_combine_counts/resources/usr/bin/process_count_files.py
@@ -11,7 +11,6 @@
 OUTCOUNTS = open(out_counts_file, 'w')
 OUTSAMPLES = open(out_samples_file, 'w')
 ERRORS = open("errors.txt", 'w')
-warnings = []
 
 SAMPLEFILES = open(sample_files_list)
 COUNTFILES = open(count_files_lists)
@@ -22,7 +21,7 @@
 sample_lines = {}
 feature_list = {}
 
-print "Looping SAMPLEFILES"
+print("Looping SAMPLEFILES")
 for sample_file in SAMPLEFILES:
 
     sample_file = sample_file.rstrip("\n")
@@ -54,10 +53,10 @@
 
 counter_samples = {}
 got_feature_list = False
-print "done looping sample files\n"
-print "Samples:"
+print("done looping sample files\n")
+print("Samples:")
 print(samples)
-print ""
+print("")
 count_results = {}
 
 for count_file in COUNTFILES:
@@ -67,23 +66,18 @@
     count_header = count_header.rstrip("\n")
     count_header_list = count_header.split("\t")
     print(count_header)
-    feature_num = 0
-    for count_line in COUNTFILE:
+    for feature_num, count_line in enumerate(COUNTFILE):
         count_line = count_line.rstrip("\n")
         count_data = count_line.split("\t")
         feature = count_data[0]
         if feature[0:2] == "__":
             continue
         # if we already got a feature list .. check if same order
         if got_feature_list:
-            if feature_num not in feature_list:
-                print "FEATURE LISTS FROM COUNT FILES ARE OF DIFFERENT LENGTHS"
-                ERRORS.write("FEATURE LISTS FROM COUNT FILES ARE OF DIFFERENT LENGTHS")
-                sys.exit()
             old_feature = feature_list[feature_num]
             # features should be the same
             if feature != old_feature:
-                print "FEATURE LISTS BETWEEN COUNT FILES NOT THE SAME"
+                print("FEATURE LISTS BETWEEN COUNT FILES NOT THE SAME")
                 ERRORS.write("FEATURE LISTS BETWEEN COUNT FILES NOT THE SAME")
                 sys.exit()
         else:
@@ -94,20 +88,20 @@
         for sc in range(1, len(count_data)):
             sample = count_header_list[sc]
             if sample not in samples:
-                print "SAMPLE " + sample + " not in a sample sheet"
-                warnings.append("SAMPLE " + sample + " not in a sample sheet")
+                print("SAMPLE " + sample + " not in a sample sheet\nDoes the file have a header?")
+                ERRORS.write("SAMPLE " + sample + " not in a sample sheet\nDoes the file have a header?")
+                sys.exit()
 
             count = count_data[sc]
             if sample in count_results[feature]:
                 if count != count_results[feature][sample]:
-                    print "TWO SAMPLES WITH SAME NAME HAVE DIFFERENT COUNT VALUES"
+                    print("TWO SAMPLES WITH SAME NAME HAVE DIFFERENT COUNT VALUES")
                     ERRORS.write("TWO SAMPLES WITH SAME NAME HAVE DIFFERENT COUNT VALUES")
-                    sys.exit("")
+                    sys.exit()
             else:
                 count_results[feature][sample] = count
-        feature_num += 1
     got_feature_list = True
-print "done getting features"
+print("done getting features")
 samples_ordered = []
 for key, value in sorted(samples.iteritems(), key=lambda (k, v): (v, k)):
     samples_ordered.append(key)
@@ -132,9 +126,3 @@
 for s in samples_ordered:
     sample_line = sample_lines[s]
     OUTSAMPLES.write(sample_line + "\n")
-
-
-if len(warnings) > 0:
-    WARN = open("warnings.txt", 'w')
-    warn_lines = "\n".join(warnings)
-    WARN.write(warn_lines + "\n")
diff --git a/stjude_warden_bam/src/warden.sh b/stjude_warden_bam/src/warden.sh
@@ -76,7 +76,7 @@ main() {
     elif [ "$sample_list_extension" == "xlsx" ]; then
         python /usr/bin/parse_excel_sample_list.py sample_list.xlsx > sample_list.txt
     else
-        dx-jobutil-report-error "Improper Sample List Extension. This should be a .txt or .xlsx file" appError
+        dx-jobutil-report-error "Improper Sample List Extension. This should be a .txt or .xlsx file" AppError
     fi
 
     printf '%s\n' "${BAM_FILES_path[@]}" > bam_list.txt
@@ -91,7 +91,7 @@ main() {
     IS_PROCESSFILE_ERR=${#PROCESSFILE_ERR} #get size
     if [ "$IS_PROCESSFILE_ERR" -gt 0 ]; then
         echo "Error: $PROCESSFILE_ERR"
-        dx-jobutil-report-error "$PROCESSFILE_ERR" appError
+        dx-jobutil-report-error "$PROCESSFILE_ERR" AppError
     fi
 
     final_sample_list_id=$(dx upload --brief cleaned_sample_list.txt)
@@ -120,7 +120,7 @@ main() {
     echo ""
     if [ "$num_samples" -gt 64 ]; then
         echo "Error: Number of samples greater than 64.  The app limits samples to 64"
-        dx-jobutil-report-error "Number of samples greater than 64.  The app limits samples to 64" appError
+        dx-jobutil-report-error "Number of samples greater than 64.  The app limits samples to 64" AppError
     fi
     ###############
     {

diff --git a/stjude_warden_counts/dxapp.json b/stjude_warden_counts/dxapp.json
@@ -38,9 +38,11 @@
         "Mouse_mm9_levels_1_2 : Gencode vM1 (levels 1+2)",
         "Mouse_mm10_vM11_levels_1_2 : Gencode vM11 (levels 1+2)",
         "D_Mel_BDGPr5 : Drosophila_melanogaster.BDGP5.75.gtf",
-        "D_Mel_BDGP6 : Drosophila_melanogaster.BDGP6.86.gtf"
+        "D_Mel_BDGP6 : Drosophila_melanogaster.BDGP6.86.gtf",
+        "None"
       ],
-      "optional": true,
+      "default": "None",
+      "optional": false,
       "help": "Selection determines preloaded viewer. If left blank, no viewer shortcut will be created. First element (ie Human_hg38_v31) is a reference ID for app."
     },
     {

diff --git a/...s/app_data/internal_source/warden_combine_counts/resources/usr/bin/process_count_files.py b/...s/app_data/internal_source/warden_combine_counts/resources/usr/bin/process_count_files.py
@@ -11,7 +11,6 @@
 OUTCOUNTS = open(out_counts_file, 'w')
 OUTSAMPLES = open(out_samples_file, 'w')
 ERRORS = open("errors.txt", 'w')
-warnings = []
 
 SAMPLEFILES = open(sample_files_list)
 COUNTFILES = open(count_files_lists)
@@ -22,7 +21,7 @@
 sample_lines = {}
 feature_list = {}
 
-print "Looping SAMPLEFILES"
+print("Looping SAMPLEFILES")
 for sample_file in SAMPLEFILES:
 
     sample_file = sample_file.rstrip("\n")
@@ -54,10 +53,10 @@
 
 counter_samples = {}
 got_feature_list = False
-print "done looping sample files\n"
-print "Samples:"
+print("done looping sample files\n")
+print("Samples:")
 print(samples)
-print ""
+print("")
 count_results = {}
 
 for count_file in COUNTFILES:
@@ -67,23 +66,18 @@
     count_header = count_header.rstrip("\n")
     count_header_list = count_header.split("\t")
     print(count_header)
-    feature_num = 0
-    for count_line in COUNTFILE:
+    for feature_num, count_line in enumerate(COUNTFILE):
         count_line = count_line.rstrip("\n")
         count_data = count_line.split("\t")
         feature = count_data[0]
         if feature[0:2] == "__":
             continue
         # if we already got a feature list .. check if same order
         if got_feature_list:
-            if feature_num not in feature_list:
-                print "FEATURE LISTS FROM COUNT FILES ARE OF DIFFERENT LENGTHS"
-                ERRORS.write("FEATURE LISTS FROM COUNT FILES ARE OF DIFFERENT LENGTHS")
-                sys.exit()
             old_feature = feature_list[feature_num]
             # features should be the same
             if feature != old_feature:
-                print "FEATURE LISTS BETWEEN COUNT FILES NOT THE SAME"
+                print("FEATURE LISTS BETWEEN COUNT FILES NOT THE SAME")
                 ERRORS.write("FEATURE LISTS BETWEEN COUNT FILES NOT THE SAME")
                 sys.exit()
         else:
@@ -94,20 +88,20 @@
         for sc in range(1, len(count_data)):
             sample = count_header_list[sc]
             if sample not in samples:
-                print "SAMPLE " + sample + " not in a sample sheet"
-                warnings.append("SAMPLE " + sample + " not in a sample sheet")
+                print("SAMPLE " + sample + " not in a sample sheet\nDoes the file have a header?")
+                ERRORS.write("SAMPLE " + sample + " not in a sample sheet\nDoes the file have a header?")
+                sys.exit()
 
             count = count_data[sc]
             if sample in count_results[feature]:
                 if count != count_results[feature][sample]:
-                    print "TWO SAMPLES WITH SAME NAME HAVE DIFFERENT COUNT VALUES"
+                    print("TWO SAMPLES WITH SAME NAME HAVE DIFFERENT COUNT VALUES")
                     ERRORS.write("TWO SAMPLES WITH SAME NAME HAVE DIFFERENT COUNT VALUES")
-                    sys.exit("")
+                    sys.exit()
             else:
                 count_results[feature][sample] = count
-        feature_num += 1
     got_feature_list = True
-print "done getting features"
+print("done getting features")
 samples_ordered = []
 for key, value in sorted(samples.iteritems(), key=lambda (k, v): (v, k)):
     samples_ordered.append(key)
@@ -132,9 +126,3 @@
 for s in samples_ordered:
     sample_line = sample_lines[s]
     OUTSAMPLES.write(sample_line + "\n")
-
-
-if len(warnings) > 0:
-    WARN = open("warnings.txt", 'w')
-    warn_lines = "\n".join(warnings)
-    WARN.write(warn_lines + "\n")
diff --git a/stjude_warden_counts/resources/usr/bin/create_workflow.py b/stjude_warden_counts/resources/usr/bin/create_workflow.py
@@ -112,7 +112,7 @@ def build_workflow():
                 }
             },
         ]
-    simple_DE_input = {"input_count_file": dxpy.dxlink({"stage": combine_counts_stage_id, "outputField": "count_file"}), "sample_list_file": dxpy.dxlink(final_sample_list_id), "contrasts_file": dxpy.dxlink(comparisons_all_id), "difex_viewer": limma_viewer_link}
+    simple_DE_input = {"input_count_file": dxpy.dxlink({"stage": combine_counts_stage_id, "outputField": "count_file"}), "sample_list_file": dxpy.dxlink(final_sample_list_id), "contrasts_file": dxpy.dxlink(comparisons_all_id)}
     if parameters["limma_DE_viewer"] != "None":
         simple_DE_input["difex_viewer"] = limma_viewer_link
     simple_DE_stage_id = wf.add_stage(simple_DE_applet, stage_input=simple_DE_input, instance_type="azure:mem1_ssd1_x4", name="SIMPLE DIFFERENTIAL_EXPRESSION")

diff --git a/stjude_warden_counts/resources/usr/bin/process_files.py b/stjude_warden_counts/resources/usr/bin/process_files.py
@@ -93,6 +93,11 @@ def process_comparisons(comparison_line, comparison_list):
         sys.exit()
 
     sample_name = safe_name(line_data[0])
+    if sample_name != line_data[0]:
+        PROCESSERRORS.write("Malformed sample name: " + line_data[0] + "\n")
+        PROCESSERRORS.close()
+        sys.exit()
+
     read_file1 = line_data[2]
 
     if read_file1 in fastqs:

diff --git a/stjude_warden_counts/src/warden.sh b/stjude_warden_counts/src/warden.sh
@@ -26,12 +26,14 @@ for app in /app_data/internal_source/*; do
 done
 echo "Applets built"
 echo ""
-if [ -n "$Genome" ]; then
+if [ "$Genome" != "None" ]; then
     Genome=$(echo "$Genome" | awk '{print $1}')
     genome_json=/app_data/genome_data.json
     limma_DE_viewer=$(./jq-1.6 --raw-output ".$Genome.viewers.LIMMA_DifEx_Viewer" $genome_json)
     echo "DIFEXVIEWER: $limma_DE_viewer"
     echo ""
+else
+    limma_DE_viewer="None"
 fi
 
 ############################INPUT FILES#############################################################
@@ -45,7 +47,7 @@ main() {
     elif [ "$sample_list_extension" == "xlsx" ]; then
         python /usr/bin/parse_excel_sample_list.py sample_list.xlsx > sample_list.txt
     else
-        dx-jobutil-report-error "Improper Sample List Extension. This should be a .txt or .xlsx file" appError
+        dx-jobutil-report-error "Improper Sample List Extension. This should be a .txt or .xlsx file" AppError
     fi
 
     printf '%s\n' "${COUNT_FILES_path[@]}" > count_list.txt
@@ -60,7 +62,7 @@ main() {
     IS_PROCESSFILE_ERR=${#PROCESSFILE_ERR} #get size
     if [ "$IS_PROCESSFILE_ERR" -gt 0 ]; then
         echo "Error: $PROCESSFILE_ERR"
-        dx-jobutil-report-error "$PROCESSFILE_ERR" appError
+        dx-jobutil-report-error "$PROCESSFILE_ERR" AppError
     fi
 
     final_sample_list_id=$(dx upload --brief cleaned_sample_list.txt)
@@ -89,7 +91,7 @@ main() {
     echo ""
     if [ "$num_samples" -gt 64 ]; then
         echo "Error: Number of samples greater than 64.  The app limits samples to 64"
-        dx-jobutil-report-error "Number of samples greater than 64.  The app limits samples to 64" appError
+        dx-jobutil-report-error "Number of samples greater than 64.  The app limits samples to 64" AppError
     fi
     ###############
     {

diff --git a/stjude_warden_fastq/dxapp.json b/stjude_warden_fastq/dxapp.json
@@ -41,6 +41,7 @@
     {
       "name": "Genome",
       "class": "string",
+      "optional": false,
       "choices": [
         "Human_hg38_v31 : hg38 Gencode v31",
         "Human_hg19_v19 : hg19 Gencode v19",