@@ -17,22 +17,41 @@ get_region <- function(x) {
17
17
}
18
18
19
19
# Helper function #2
20
- process_file <- function (file_path , clusters ) {
20
+ process_file <- function (file_path , clusters , distances ) {
21
21
df <- fread(file_path , header = TRUE )
22
22
sample <- basename(dirname(dirname(file_path )))
23
23
sample <- unlist(strsplit(sample , " ." , fixed = TRUE ))[1 ]
24
24
haps <- names(clusters )[grepl(sample , names(clusters ))]
25
- true_clusters <- sort(c(clusters [[haps [1 ]]], clusters [[haps [2 ]]]))
26
- predicted_clusters <- sort(c(df $ cluster.1 [1 ], df $ cluster.2 [1 ]))
27
- predicted_haplos <- c(df [[1 ]][1 ], df [[2 ]][1 ])
28
-
25
+ if (length(haps ) != 2 ) {
26
+ true_clusters <- c(" ambiguous" , " ambiguous" )
27
+ } else {
28
+ true.cl.1 <- ifelse(clusters [[haps [1 ]]] == " *" , " unclustered" , clusters [[haps [1 ]]])
29
+ true.cl.2 <- ifelse(clusters [[haps [2 ]]] == " *" , " unclustered" , clusters [[haps [2 ]]])
30
+ true_clusters <- sort(c(true.cl.1 , true.cl.2 ))
31
+ }
32
+ pred.cl.1 <- ifelse(df $ cluster.1 [1 ] == " *" , " unclustered" , df $ cluster.1 [1 ])
33
+ pred.cl.2 <- ifelse(df $ cluster.2 [1 ] == " *" , " unclustered" , df $ cluster.2 [1 ])
34
+ predicted_clusters <- sort(c(pred.cl.1 , pred.cl.2 ))
35
+ predicted_haplos <- sort(c(df [[1 ]][1 ], df [[2 ]][1 ]))
36
+
29
37
list (
38
+ # for table/plotting
30
39
lenient = identical(true_clusters , predicted_clusters ),
31
40
strict = length(grep(sample , predicted_haplos )) == 2 ,
32
41
w_lenient = ifelse(identical(true_clusters , predicted_clusters ), " " , sample ),
33
42
w_strict = ifelse(length(grep(sample , predicted_haplos )) == 2 , " " , sample ),
34
43
g_lenient = ifelse(identical(true_clusters , predicted_clusters ), sample , " " ),
35
- g_strict = ifelse(length(grep(sample , predicted_haplos )) == 2 , sample , " " )
44
+ g_strict = ifelse(length(grep(sample , predicted_haplos )) == 2 , sample , " " ),
45
+ # for table
46
+ sample.id = sample ,
47
+ hap.1.pred = predicted_haplos [1 ],
48
+ hap.2.pred = predicted_haplos [2 ],
49
+ cl.1.pred = predicted_clusters [1 ],
50
+ cl.2.pred = predicted_clusters [2 ],
51
+ cl.1.true = true_clusters [1 ],
52
+ cl.2.true = true_clusters [2 ],
53
+ cl.1.pred.true.dist = ifelse(true_clusters [1 ] == " ambiguous" || true_clusters [1 ] == " unclustered" || predicted_clusters [1 ] == " unclustered" , " -" , distances [which(distances $ h.group == true_clusters [1 ]),][[predicted_clusters [1 ]]]),
54
+ cl.2.pred.true.dist = ifelse(true_clusters [2 ] == " ambiguous" || true_clusters [2 ] == " unclustered" || predicted_clusters [2 ] == " unclustered" , " -" , distances [which(distances $ h.group == true_clusters [2 ]),][[predicted_clusters [2 ]]])
36
55
)
37
56
}
38
57
@@ -43,16 +62,31 @@ regions <- unique(sapply(files, get_region))
43
62
tpr_list <- lapply(regions , function (r ) {
44
63
json_file <- file.path(args [2 ], paste0(r , " .clusters.json" ))
45
64
clusters <- fromJSON(file = json_file )
65
+ dist_file <- file.path(args [2 ], paste0(r , " .clusters.hapdist.tsv" ))
66
+ distances <- fread(dist_file )
46
67
region_files <- files [grepl(r , files )]
68
+ results <- lapply(region_files , process_file , clusters = clusters , distances = distances )
47
69
48
- results <- lapply(region_files , process_file , clusters = clusters )
49
70
lenient_scores <- sapply(results , `[[` , " lenient" )
50
71
strict_scores <- sapply(results , `[[` , " strict" )
51
72
lenient_w_names <- sapply(results , `[[` , " w_lenient" )
52
73
strict_w_names <- sapply(results , `[[` , " w_strict" )
53
74
lenient_g_names <- sapply(results , `[[` , " g_lenient" )
54
75
strict_g_names <- sapply(results , `[[` , " g_strict" )
55
76
77
+ sample <- sapply(results , `[[` , " sample.id" )
78
+ hap.1.pred <- sapply(results , `[[` , " hap.1.pred" )
79
+ hap.2.pred <- sapply(results , `[[` , " hap.2.pred" )
80
+ cl.1.pred <- sapply(results , `[[` , " cl.1.pred" )
81
+ cl.2.pred <- sapply(results , `[[` , " cl.2.pred" )
82
+ cl.1.true <- sapply(results , `[[` , " cl.1.true" )
83
+ cl.2.true <- sapply(results , `[[` , " cl.2.true" )
84
+ cl.1.pred.true.dist <- sapply(results , `[[` , " cl.1.pred.true.dist" )
85
+ cl.2.pred.true.dist <- sapply(results , `[[` , " cl.2.pred.true.dist" )
86
+
87
+ btab <- data.frame (sample = sample , hap.1.pred = hap.1.pred , hap.2.pred = hap.2.pred , cl.1.pred = cl.1.pred , cl.2.pred = cl.2.pred , cl.1.true = cl.1.true , cl.2.true = cl.2.true , cl.1.pred.true.dist = cl.1.pred.true.dist , cl.2.pred.true.dist = cl.2.pred.true.dist )
88
+ fwrite(btab , gsub(" pdf" , paste0(r ," .wdist.tsv" ), args [3 ]),col.names = T , row.names = F , sep = " \t " )
89
+
56
90
data.frame (
57
91
category = rep(c(" lenient" , " strict" ), each = 2 ),
58
92
measure = rep(c(" tp" , " fn" ), 2 ),
@@ -61,6 +95,7 @@ tpr_list <- lapply(regions, function(r) {
61
95
sum(strict_scores ), length(region_files ) - sum(strict_scores )),
62
96
ids = c(paste(lenient_g_names [lenient_g_names != " " ],collapse = " ;" ),paste(lenient_w_names [lenient_w_names != " " ],collapse = " ;" ),paste(strict_g_names [strict_g_names != " " ],collapse = " ;" ),paste(strict_w_names [strict_w_names != " " ],collapse = " ;" ))
63
97
)
98
+
64
99
})
65
100
66
101
tpr_df <- rbindlist(tpr_list )
0 commit comments