add feature in tpr estimation

davidebolo1993 · davidebolo1993 · commit 82aa524f88aa · 2024-11-14T11:09:42.000+01:00
diff --git a/cosigt_smk/workflow/scripts/plottpr.r b/cosigt_smk/workflow/scripts/plottpr.r
@@ -17,22 +17,41 @@ get_region <- function(x) {
 }
 
 #Helper function #2
-process_file <- function(file_path, clusters) {
+process_file <- function(file_path, clusters, distances) {
   df <- fread(file_path, header = TRUE)
   sample <- basename(dirname(dirname(file_path)))
   sample<-unlist(strsplit(sample, ".", fixed=TRUE))[1]
   haps <- names(clusters)[grepl(sample, names(clusters))]
-  true_clusters <- sort(c(clusters[[haps[1]]], clusters[[haps[2]]]))
-  predicted_clusters <- sort(c(df$cluster.1[1], df$cluster.2[1]))
-  predicted_haplos <- c(df[[1]][1], df[[2]][1])
-  
+  if (length(haps) != 2) {
+    true_clusters<-c("ambiguous", "ambiguous")
+  } else {
+    true.cl.1<-ifelse(clusters[[haps[1]]] == "*", "unclustered", clusters[[haps[1]]])
+    true.cl.2<-ifelse(clusters[[haps[2]]] == "*", "unclustered", clusters[[haps[2]]])
+    true_clusters<-sort(c(true.cl.1, true.cl.2))
+  }
+  pred.cl.1<-ifelse(df$cluster.1[1] == "*", "unclustered", df$cluster.1[1])
+  pred.cl.2<-ifelse(df$cluster.2[1] == "*", "unclustered", df$cluster.2[1])
+  predicted_clusters<-sort(c(pred.cl.1, pred.cl.2))
+  predicted_haplos <- sort(c(df[[1]][1], df[[2]][1]))
+
   list(
+    #for table/plotting
     lenient = identical(true_clusters, predicted_clusters),
     strict = length(grep(sample, predicted_haplos)) == 2,
     w_lenient = ifelse(identical(true_clusters, predicted_clusters), "", sample),
     w_strict = ifelse(length(grep(sample, predicted_haplos)) == 2, "", sample),
     g_lenient = ifelse(identical(true_clusters, predicted_clusters), sample, ""),
-    g_strict = ifelse(length(grep(sample, predicted_haplos)) == 2, sample, "")
+    g_strict = ifelse(length(grep(sample, predicted_haplos)) == 2, sample, ""),
+    #for table
+    sample.id = sample,
+    hap.1.pred = predicted_haplos[1],
+    hap.2.pred = predicted_haplos[2],
+    cl.1.pred = predicted_clusters[1],
+    cl.2.pred = predicted_clusters[2],
+    cl.1.true = true_clusters[1],
+    cl.2.true = true_clusters[2],
+    cl.1.pred.true.dist = ifelse(true_clusters[1] == "ambiguous" || true_clusters[1] == "unclustered" || predicted_clusters[1] == "unclustered", "-", distances[which(distances$h.group == true_clusters[1]),][[predicted_clusters[1]]]),
+    cl.2.pred.true.dist = ifelse(true_clusters[2] == "ambiguous" || true_clusters[2] == "unclustered" || predicted_clusters[2] == "unclustered", "-", distances[which(distances$h.group == true_clusters[2]),][[predicted_clusters[2]]])
   )
 }
 
@@ -43,16 +62,31 @@ regions <- unique(sapply(files, get_region))
 tpr_list <- lapply(regions, function(r) {
   json_file <- file.path(args[2], paste0(r, ".clusters.json"))
   clusters <- fromJSON(file = json_file)
+  dist_file <-file.path(args[2], paste0(r, ".clusters.hapdist.tsv"))
+  distances<-fread(dist_file)
   region_files <- files[grepl(r, files)]
+  results <- lapply(region_files, process_file, clusters = clusters, distances=distances)
   
-  results <- lapply(region_files, process_file, clusters = clusters)
   lenient_scores <- sapply(results, `[[`, "lenient")
   strict_scores <- sapply(results, `[[`, "strict")
   lenient_w_names<-sapply(results, `[[`, "w_lenient")
   strict_w_names<-sapply(results, `[[`, "w_strict")
   lenient_g_names<-sapply(results, `[[`, "g_lenient")
   strict_g_names<-sapply(results, `[[`, "g_strict")
 
+  sample<-sapply(results, `[[`, "sample.id")
+  hap.1.pred<-sapply(results, `[[`, "hap.1.pred")
+  hap.2.pred<-sapply(results, `[[`, "hap.2.pred")
+  cl.1.pred<-sapply(results, `[[`, "cl.1.pred")
+  cl.2.pred<-sapply(results, `[[`, "cl.2.pred")
+  cl.1.true<-sapply(results, `[[`, "cl.1.true")
+  cl.2.true<-sapply(results, `[[`, "cl.2.true")
+  cl.1.pred.true.dist<-sapply(results, `[[`, "cl.1.pred.true.dist")
+  cl.2.pred.true.dist<-sapply(results, `[[`, "cl.2.pred.true.dist")
+
+  btab<-data.frame(sample=sample, hap.1.pred = hap.1.pred, hap.2.pred = hap.2.pred, cl.1.pred = cl.1.pred, cl.2.pred=cl.2.pred, cl.1.true=cl.1.true, cl.2.true=cl.2.true, cl.1.pred.true.dist = cl.1.pred.true.dist, cl.2.pred.true.dist=cl.2.pred.true.dist)
+  fwrite(btab, gsub("pdf", paste0(r,".wdist.tsv"), args[3]),col.names=T, row.names=F, sep="\t")
+
   data.frame(
     category = rep(c("lenient", "strict"), each = 2),
     measure = rep(c("tp", "fn"), 2),
@@ -61,6 +95,7 @@ tpr_list <- lapply(regions, function(r) {
               sum(strict_scores), length(region_files) - sum(strict_scores)),
     ids=c(paste(lenient_g_names[lenient_g_names!=""],collapse=";"),paste(lenient_w_names[lenient_w_names!=""],collapse=";"),paste(strict_g_names[strict_g_names!=""],collapse=";"),paste(strict_w_names[strict_w_names!=""],collapse=";"))
   )
+
 })
 
 tpr_df <- rbindlist(tpr_list)