Skip to content

Commit 82aa524

Browse files
add feature in tpr estimation
1 parent f393e04 commit 82aa524

File tree

1 file changed

+42
-7
lines changed

1 file changed

+42
-7
lines changed

cosigt_smk/workflow/scripts/plottpr.r

+42-7
Original file line numberDiff line numberDiff line change
@@ -17,22 +17,41 @@ get_region <- function(x) {
1717
}
1818

1919
#Helper function #2
20-
process_file <- function(file_path, clusters) {
20+
process_file <- function(file_path, clusters, distances) {
2121
df <- fread(file_path, header = TRUE)
2222
sample <- basename(dirname(dirname(file_path)))
2323
sample<-unlist(strsplit(sample, ".", fixed=TRUE))[1]
2424
haps <- names(clusters)[grepl(sample, names(clusters))]
25-
true_clusters <- sort(c(clusters[[haps[1]]], clusters[[haps[2]]]))
26-
predicted_clusters <- sort(c(df$cluster.1[1], df$cluster.2[1]))
27-
predicted_haplos <- c(df[[1]][1], df[[2]][1])
28-
25+
if (length(haps) != 2) {
26+
true_clusters<-c("ambiguous", "ambiguous")
27+
} else {
28+
true.cl.1<-ifelse(clusters[[haps[1]]] == "*", "unclustered", clusters[[haps[1]]])
29+
true.cl.2<-ifelse(clusters[[haps[2]]] == "*", "unclustered", clusters[[haps[2]]])
30+
true_clusters<-sort(c(true.cl.1, true.cl.2))
31+
}
32+
pred.cl.1<-ifelse(df$cluster.1[1] == "*", "unclustered", df$cluster.1[1])
33+
pred.cl.2<-ifelse(df$cluster.2[1] == "*", "unclustered", df$cluster.2[1])
34+
predicted_clusters<-sort(c(pred.cl.1, pred.cl.2))
35+
predicted_haplos <- sort(c(df[[1]][1], df[[2]][1]))
36+
2937
list(
38+
#for table/plotting
3039
lenient = identical(true_clusters, predicted_clusters),
3140
strict = length(grep(sample, predicted_haplos)) == 2,
3241
w_lenient = ifelse(identical(true_clusters, predicted_clusters), "", sample),
3342
w_strict = ifelse(length(grep(sample, predicted_haplos)) == 2, "", sample),
3443
g_lenient = ifelse(identical(true_clusters, predicted_clusters), sample, ""),
35-
g_strict = ifelse(length(grep(sample, predicted_haplos)) == 2, sample, "")
44+
g_strict = ifelse(length(grep(sample, predicted_haplos)) == 2, sample, ""),
45+
#for table
46+
sample.id = sample,
47+
hap.1.pred = predicted_haplos[1],
48+
hap.2.pred = predicted_haplos[2],
49+
cl.1.pred = predicted_clusters[1],
50+
cl.2.pred = predicted_clusters[2],
51+
cl.1.true = true_clusters[1],
52+
cl.2.true = true_clusters[2],
53+
cl.1.pred.true.dist = ifelse(true_clusters[1] == "ambiguous" || true_clusters[1] == "unclustered" || predicted_clusters[1] == "unclustered", "-", distances[which(distances$h.group == true_clusters[1]),][[predicted_clusters[1]]]),
54+
cl.2.pred.true.dist = ifelse(true_clusters[2] == "ambiguous" || true_clusters[2] == "unclustered" || predicted_clusters[2] == "unclustered", "-", distances[which(distances$h.group == true_clusters[2]),][[predicted_clusters[2]]])
3655
)
3756
}
3857

@@ -43,16 +62,31 @@ regions <- unique(sapply(files, get_region))
4362
tpr_list <- lapply(regions, function(r) {
4463
json_file <- file.path(args[2], paste0(r, ".clusters.json"))
4564
clusters <- fromJSON(file = json_file)
65+
dist_file <-file.path(args[2], paste0(r, ".clusters.hapdist.tsv"))
66+
distances<-fread(dist_file)
4667
region_files <- files[grepl(r, files)]
68+
results <- lapply(region_files, process_file, clusters = clusters, distances=distances)
4769

48-
results <- lapply(region_files, process_file, clusters = clusters)
4970
lenient_scores <- sapply(results, `[[`, "lenient")
5071
strict_scores <- sapply(results, `[[`, "strict")
5172
lenient_w_names<-sapply(results, `[[`, "w_lenient")
5273
strict_w_names<-sapply(results, `[[`, "w_strict")
5374
lenient_g_names<-sapply(results, `[[`, "g_lenient")
5475
strict_g_names<-sapply(results, `[[`, "g_strict")
5576

77+
sample<-sapply(results, `[[`, "sample.id")
78+
hap.1.pred<-sapply(results, `[[`, "hap.1.pred")
79+
hap.2.pred<-sapply(results, `[[`, "hap.2.pred")
80+
cl.1.pred<-sapply(results, `[[`, "cl.1.pred")
81+
cl.2.pred<-sapply(results, `[[`, "cl.2.pred")
82+
cl.1.true<-sapply(results, `[[`, "cl.1.true")
83+
cl.2.true<-sapply(results, `[[`, "cl.2.true")
84+
cl.1.pred.true.dist<-sapply(results, `[[`, "cl.1.pred.true.dist")
85+
cl.2.pred.true.dist<-sapply(results, `[[`, "cl.2.pred.true.dist")
86+
87+
btab<-data.frame(sample=sample, hap.1.pred = hap.1.pred, hap.2.pred = hap.2.pred, cl.1.pred = cl.1.pred, cl.2.pred=cl.2.pred, cl.1.true=cl.1.true, cl.2.true=cl.2.true, cl.1.pred.true.dist = cl.1.pred.true.dist, cl.2.pred.true.dist=cl.2.pred.true.dist)
88+
fwrite(btab, gsub("pdf", paste0(r,".wdist.tsv"), args[3]),col.names=T, row.names=F, sep="\t")
89+
5690
data.frame(
5791
category = rep(c("lenient", "strict"), each = 2),
5892
measure = rep(c("tp", "fn"), 2),
@@ -61,6 +95,7 @@ tpr_list <- lapply(regions, function(r) {
6195
sum(strict_scores), length(region_files) - sum(strict_scores)),
6296
ids=c(paste(lenient_g_names[lenient_g_names!=""],collapse=";"),paste(lenient_w_names[lenient_w_names!=""],collapse=";"),paste(strict_g_names[strict_g_names!=""],collapse=";"),paste(strict_w_names[strict_w_names!=""],collapse=";"))
6397
)
98+
6499
})
65100

66101
tpr_df <- rbindlist(tpr_list)

0 commit comments

Comments
 (0)