forked from EPINetz/EPINetz-Policy-Parser
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathget_rwr_terms.R
650 lines (508 loc) · 28.9 KB
/
get_rwr_terms.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
get_rwr_terms <- function(walk_network, # an object made by make_multiplex_objects, contatining the multiplex network and the Normalized Adjacency Matrix
network_name = NULL, # name of the network, used for matching the match_var. Usually the index when implented in imap functions. NULL to skip. Requires match_var
seeds, # a list of dataframes, where each entry represents a specific group of seeds, e.g. a single policy field
seed_var, # the variable in the seed dataframes containing the seed terms
match_var = NULL, # the variable in the seed dataframes to match on the network_name variable, e.g. specific time periods. NULL to skip. Requires network_name
flatten_results = TRUE, # should the results be flattened into a single dataframe (with policy_field indicator) or a list of one dataframe per group?
group_name = "policy_field", # name of the grouping variable if flatten_results = TRUE
positive_scores_only = FALSE, # should negative Walk Scores (i.e. very unlikely connection due to negative weights) and 0 scores be dropped? Applied before normalization
normalize_score = c(NULL, "seeds", "group"), # Should scores be normalized? "seeds" to normalize the scores for each seed walk. "group" to normalize within grouping vars. Set to NULL for no normalization.
walk_score = NULL, # minimal walk score for results to keep. Can be applied to normalized or raw scores, see normalize_score. Will always apply to normalized score if available
walk_score_quantile = FALSE, # Should the quantile be calculated as a dynamic minimum walk_score for each group? If TRUE, walk_score specifies the quantile, rather than a fixed value
report_quantiles = FALSE, # should quantile values for each group be reported as printed output? Returns the quantiles of the selected walk_score_measure before any walk_score filtering
walk_score_measure = c("default", "raw", "seeds", "seeds_mean", "group", "group_mean"), # if more than one normalization method is selected, based which one should the walk_score be filtered? defaults to the chosen normalization
calculate_means = TRUE, # should score means be calculated?
normalize_means = TRUE,
reduce_to_means = FALSE,
keep_seed_terms = TRUE, # should seed terms always be kept, regardless of score? Only available if flatten_results = TRUE
seedterm_value = NULL, # overwrite the score of seed terms with a fixed value. Applies to all scores calculated. NULL to skip
progress = FALSE) # should the progress be shown for the map function?
{
require(dplyr)
require(furrr)
require(purrr)
require(RandomWalkRestartMH)
require(data.table)
require(scales)
require(rlang)
if (!is.null(normalize_score)){ # arg_match does not handle NULL values, thus we skip it if the score normalization is skipped via NULL
normalize_score <- rlang::arg_match(normalize_score, multiple = TRUE) # check for correct argument specification here
}
walk_score_measure <- rlang::arg_match(walk_score_measure, multiple = FALSE) # check for correct argument specification here
# format checks
if (is.null(walk_network$multiplex) | is.null(walk_network$AdjMatrixNorm)) {
stop("walk_network is missing the multiplex object or the adjacency matrix")
}
if (!RandomWalkRestartMH::isMultiplex(walk_network$multiplex)) {
stop("walk_network$multiplex is not a valid multiplex object")
}
if (class(walk_network$AdjMatrixNorm) != "dgCMatrix") {
stop("walk_network$AdjMatrixNorm is not a valid dgCMatrix")
}
# setting of variables
if (reduce_to_means & !calculate_means) { # set reduce_to_means to FALSE if no means are calculated
message("No means calculated. Not reducing output to means.")
reduce_to_means <- FALSE
}
# if (!is.null(walk_score)) { # set the variable that the walk_score filter will be applied on
if (length(normalize_score) > 1 &
walk_score_measure == "default") {
stop("More than one normalization method selected, but no valid walk_score_measure selected. Select one of the normalization methods to filter the walk_score on, or set walk_score to NULL to skip")
}
if (walk_score_measure == "default") { # set default measure for filtering...
if (is.null(normalize_score)) {
filter_var <- "Score"
message("Applying walk_score to raw RWR scores. You can change this by setting the walk_score_measure")
} else {
if (calculate_means) {
if (normalize_score == "seeds") {
filter_var <- "ScoreNormMean"
message("Applying walk_score to mean seed scores. You can change this by setting the walk_score_measure")
}
if (normalize_score == "group" ) {
filter_var <- "ScoreNormGroupMean"
message("Applying walk_score to mean group scores. You can change this by setting the walk_score_measure")
}
} else {
if (normalize_score == "seeds") {
filter_var <- "ScoreNorm"
message("Applying walk_score to seed-normalized scores. You can change this by setting the walk_score_measure")
}
if (normalize_score == "group" ) {
filter_var <- "ScoreNormGroup"
message("Applying walk_score to group-normalized scores. You can change this by setting the walk_score_measure")
}
}
}
} else { # ... or accommodate explicit settings
if (walk_score_measure == "raw") {
filter_var <- "Score"
}
if (walk_score_measure == "seeds") {
filter_var <- "ScoreNorm"
}
if (walk_score_measure == "seeds_mean") {
filter_var <- "ScoreNormMean"
}
if (walk_score_measure == "group" ) {
filter_var <- "ScoreNormGroup"
}
if (walk_score_measure == "group_mean" ) {
filter_var <- "ScoreNormGroupMean"
}
}
# }
if (!is.null(seedterm_value) & !is.numeric(seedterm_value)) {
stop("seedterm_value must either be a numerical value to set the seed term mean to or NULL to skip")
}
# Run Random Walks
rwr_results <-
seeds %>% purrr::imap(\(seed_group, name)
{
if (!is.null(match_var) & !is.null(network_name)) { # match network names and match var
seed_group <- seed_group %>%
dplyr::mutate({{match_var}} := as.character(!!as.name(match_var))) %>%
dplyr::filter(!!as.name(match_var) == network_name)}
seed_group <- seed_group %>% dplyr::filter(feature %in% walk_network$multiplex$Pool_of_Nodes) # drop seeds not in the network
if (nrow(seed_group) > 0) { # make sure there are seeds available (esp. important utilizing a match var!)
group_results <- seed_group %>%
dplyr::distinct(!!as.name(seed_var)) %>% dplyr::pull() %>%
furrr::future_map(\(seed) # running parallelization over the seeds (instead of whole groups) might add stability
{# calling an external functions is more robust in terms of not copying unnecessary object to the workers
seed_walk_res <-
seed_walk(
seed = seed,
walk_network = walk_network,
normalize_score = normalize_score,
positive_scores_only = positive_scores_only
)
if (is.null(seed_walk_res)) { # NULL returns from seed_walk indicate errors
message(paste("No results returned for seed", seed, "in group", name, "\n"))
}
return(seed_walk_res)
}) %>%
purrr::compact() %>% # remove NULLs introduced by tryCatch for erroneous walks
data.table::rbindlist(use.names = TRUE)
group_results <- group_results %>%
dplyr::mutate(seed_term = dplyr::case_when( # mark seed terms of the group/policy field
NodeNames %in% seed_node ~ TRUE, .default = FALSE))
if(calculate_means) { # calculate means for non-normalized score
group_results <- group_results %>%
dplyr::mutate(ScoreMean = mean(Score), .by = c(seed_term, NodeNames))
}
if (!is.null(normalize_score)) { # calculate means for normalized score
if (("seeds" %in% normalize_score) & calculate_means) { # calculate means of normalized seed scores in group and normalize again
group_results <- group_results %>%
dplyr::mutate(ScoreNormMean = mean(ScoreNorm), .by = c(seed_term, NodeNames))
if (normalize_means) {
group_results <- group_results %>%
dplyr::mutate(ScoreNormMean = scales::rescale(ScoreNormMean, to = c(0, 1)))
}
}
if ("group" %in% normalize_score) { # normalization on Group Level (within each group, e.g. policy field)
group_results <- group_results %>%
dplyr::mutate(ScoreNormGroup = scales::rescale(Score, to = c(0, 1)))
if (calculate_means) { # calculation of means within group
group_results <- group_results %>%
dplyr::mutate(ScoreNormGroupMean = mean(ScoreNormGroup), .by = NodeNames)
if (normalize_means) {
group_results <- group_results %>%
dplyr::mutate(ScoreNormGroupMean = scales::rescale(ScoreNormGroupMean, to = c(0, 1)))
}
}
}
}
if (report_quantiles) {
cat(paste0(group_name, ": ", name, "\n"))
cat(paste(filter_var, "Quantiles:\n"))
group_results %>% dplyr::pull(!!as.name(filter_var)) %>%
stats::quantile() %>% print()
cat("\n")
}
if (reduce_to_means) {
group_results <- group_results %>%
dplyr::distinct(dplyr::across(c(NodeNames, seed_term,
dplyr::ends_with("Mean"))))
}
# overwrite seed term values with a fixed value if desired
if (!is.null(seedterm_value)) {
group_results <- group_results %>%
dplyr::mutate(dplyr::across(c(dplyr::starts_with("Score")),
dplyr::case_when(
seed_term == T ~ seedterm_value,
.default = .
)))
}
# Score filtering
if (!is.null(walk_score) & !walk_score_quantile) {
if (keep_seed_terms) {
group_results <- group_results %>%
dplyr::filter(!!as.name(filter_var) >= walk_score |
seed_term == TRUE)
} else {
group_results <- group_results %>%
dplyr::filter(!!as.name(filter_var) >= walk_score)
}
}
if (!is.null(walk_score) & walk_score_quantile) {
quantile_value <- group_results %>%
dplyr::pull(!!as.name(filter_var)) %>%
stats::quantile(walk_score) %>%
.[[1]]
cat(paste0("Quantile-based threshold (",
walk_score, " quantile)",
" for ",
group_name, " ",
name, " in ",
filter_var, ": ",
quantile_value, "\n"))
if (keep_seed_terms) {
group_results <- group_results %>%
dplyr::filter(!!as.name(filter_var) >= quantile_value |
seed_term == TRUE)
} else {
group_results <- group_results %>%
dplyr::filter(!!as.name(filter_var) >= quantile_value)
}
}
return(group_results)
} else return(NULL) # return NULL if no seeds are available
},
.progress = progress)
if (flatten_results) { # flatten results into a single dataframe if desired
rwr_results %>%
purrr::compact() %>% # remove NULLs introduced by missing seeds
data.table::rbindlist(idcol = group_name, use.names = TRUE) %>%
dplyr::relocate(dplyr::any_of(c("Score", "ScoreMean", # set a more convenient col order
"ScoreNorm", "ScoreNormMean",
"ScoreNormGroup", "ScoreNormGroupMean")),
.after = dplyr::last_col()) %>%
return()
} else {
return(rwr_results)
}
}
seed_walk <- function(seed, walk_network, normalize_score, positive_scores_only){ # convenience function for the seed walks to be used in get_rwr_terms()
res <- tryCatch(# capture non-standard errors thrown by Random.Walk.Restart.Multiplex
Random.Walk.Restart.Multiplex.failsafe(
x = walk_network$AdjMatrixNorm,
MultiplexObject = walk_network$multiplex,
Seeds = seed
),
error = function(e) NULL # return NULL for errors
)
if (!is.null(res)) { # process further if the walk was successfull
res <- res %>%
.[["RWRM_Results"]] %>% # pull relevant data
dplyr::as_tibble() %>%
dplyr::mutate(seed_node = seed) # and set the seed node indicator
if (positive_scores_only) {
res <- res %>%
dplyr::filter(Score > 0)
}
if (!is.null(normalize_score)) {
if ("seeds" %in% normalize_score){ # normalization and score filtering on Seed Level (for each Random Walk)
res <- res %>%
dplyr::mutate(ScoreNorm = scales::rescale(Score, to = c(0, 1)))
}
}
}
return(res)
}
# this function makes the multiplex objects to be passed on to the random walk function
make_multiplex_objects <- function(dat, # data to be passed to calculate_network()
vertex_a, # vertex A column to be passed to calculate_network()
vertex_b, # vertex B column to be passed to calculate_network()
directed = FALSE, # is the network directed?
pmi_weight = TRUE,# should PMI weights be calculated? If TRUE, make sure vertex A and B are specified correctly
keep_negative_weights = TRUE, # should edges with negative (pmi) weight be kept or discarded?
network = NULL, # can be used to pass already-calculated networks. If NULL, a network will be calculated from the vertices
keep_igraph_network = FALSE, # should the igraph network be kept as a seperate object? Note
keep_multiplex_network = TRUE, # should the multiplex network be kept as a separate object?
keep_adjacency_matrix = FALSE, # should the non-normalized adjacency matrix be kept?
keep_normalized_adjacency_matrix = TRUE # should the normalized adjacency matrix be calculated and kept?
)
{
require(RandomWalkRestartMH)
if (is.null(network)) {
network <- calculate_network(
dat,
vertex_a = vertex_a,
vertex_b = vertex_b,
directed = directed,
pmi_weight = pmi_weight,
as_data_frame = F # return as igraph object
)
}
multiplex <- RandomWalkRestartMH::create.multiplex(list(network = network)) # make multiplex object for random walks
if(keep_adjacency_matrix | # calculate adjacency matrix if needed
keep_normalized_adjacency_matrix) {
AdjMatrix <- compute.adjacency.matrix.mono(multiplex)
}
if(keep_normalized_adjacency_matrix){ # calculate normalized adjacency matrix if needed
AdjMatrixNorm <- RandomWalkRestartMH::normalize.multiplex.adjacency(AdjMatrix)
}
return(c(
if(keep_igraph_network) {
list(network = network)
},
if(keep_multiplex_network) {
list(multiplex = multiplex)
},
if(keep_adjacency_matrix){
list(AdjMatrix = AdjMatrix)
},
if(keep_normalized_adjacency_matrix){
list(AdjMatrixNorm = AdjMatrixNorm)
}
))
}
compute.adjacency.matrix.mono <- function(x) # delta is no longer needed for monoplex networks
### An adjusted version of the compute.adjacency.matrix() function from
### the RandomWalkRestartMH package, making monoplex network preparation
### more efficient by dropping unnecessary overhead
### Specifically, it vastly reduces the use of RAM (which would require
### manual flushing with gc() every time) by dropping everything connected
### to the line "offdiag <- (delta/(L-1))*Idem_Matrix" which is not needed
### for monoplex networks
{
require(igraph)
require(Matrix)
require(RandomWalkRestartMH)
if (!isMultiplex(x) & !isMultiplexHet(x)) {
stop("Not a Multiplex or Multiplex Heterogeneous object")
}
N <- x$Number_of_Nodes_Multiplex
L <- x$Number_of_Layers
Layers_Names <- names(x)[seq(L)]
## IDEM_MATRIX.
Idem_Matrix <- Matrix::Diagonal(N, x = 1)
counter <- 0
Layers_List <- lapply(x[Layers_Names],function(x){
counter <<- counter + 1;
if (is_weighted(x)){
Adjacency_Layer <- igraph::as_adjacency_matrix(x,sparse = TRUE,
attr = "weight")
} else {
Adjacency_Layer <- igraph::as_adjacency_matrix(x,sparse = TRUE)
}
Adjacency_Layer <- Adjacency_Layer[order(rownames(Adjacency_Layer)),
order(colnames(Adjacency_Layer))]
colnames(Adjacency_Layer) <-
paste0(colnames(Adjacency_Layer),"_",counter)
rownames(Adjacency_Layer) <-
paste0(rownames(Adjacency_Layer),"_",counter)
Adjacency_Layer
})
MyColNames <- unlist(lapply(Layers_List, function (x) unlist(colnames(x))))
MyRowNames <- unlist(lapply(Layers_List, function (x) unlist(rownames(x))))
names(MyColNames) <- c()
names(MyRowNames) <- c()
SupraAdjacencyMatrix <- Matrix::bdiag(unlist(Layers_List))
colnames(SupraAdjacencyMatrix) <-MyColNames
rownames(SupraAdjacencyMatrix) <-MyRowNames
SupraAdjacencyMatrix <- as(SupraAdjacencyMatrix, "dgCMatrix")
return(SupraAdjacencyMatrix)
}
Random.Walk.Restart.Multiplex.failsafe <- function(x, MultiplexObject, Seeds,
r=0.7,tau,MeanType="Geometric", DispResults="TopScores",...){
## an adjusted version of https://github.com/alberto-valdeolivas/RandomWalkRestartMH/blob/master/R/RWRandMatrices.R
## including a failsafe for values in the proximity vectors reaching (-)Inf
### We control the different values.
if (!is(x,"dgCMatrix")){
stop("Not a dgCMatrix object of Matrix package")
}
if (!isMultiplex(MultiplexObject)) {
stop("Not a Multiplex object")
}
L <- MultiplexObject$Number_of_Layers
N <- MultiplexObject$Number_of_Nodes
Seeds <- as.character(Seeds)
if (length(Seeds) < 1 | length(Seeds) >= N){
stop("The length of the vector containing the seed nodes is not
correct")
} else {
if (!all(Seeds %in% MultiplexObject$Pool_of_Nodes)){
stop("Some of the seeds are not nodes of the network")
}
}
if (r >= 1 || r <= 0) {
stop("Restart parameter should be between 0 and 1")
}
if(missing(tau)){
tau <- rep(1,L)/L
} else {
tau <- as.numeric(tau)
if (sum(tau)/L != 1) {
stop("The sum of the components of tau divided by the number of
layers should be 1")
}
}
if(!(MeanType %in% c("Geometric","Arithmetic","Sum"))){
stop("The type mean should be Geometric, Arithmetic or Sum")
}
if(!(DispResults %in% c("TopScores","Alphabetic"))){
stop("The way to display RWRM results should be TopScores or
Alphabetic")
}
## We define the threshold and the number maximum of iterations for
## the random walker.
Threeshold <- 1e-10
NetworkSize <- ncol(x)
## We initialize the variables to control the flux in the RW algo.
residue <- 1
iter <- 1
## We compute the scores for the different seeds.
Seeds_Score <- get.seed.scoresMultiplex(Seeds,L,tau)
## We define the prox_vector(The vector we will move after the first RWR
## iteration. We start from The seed. We have to take in account
## that the walker with restart in some of the Seed nodes, depending on
## the score we gave in that file).
prox_vector <- matrix(0,nrow = NetworkSize,ncol=1)
prox_vector[which(colnames(x) %in% Seeds_Score[,1])] <- (Seeds_Score[,2])
prox_vector <- prox_vector/sum(prox_vector)
restart_vector <- prox_vector
vector_range <- 0
while(!any(c(-Inf, Inf) %in% range(vector_range)) && # a failsafe to stop the loop from breaking when values reach -Inf or Inf
residue >= Threeshold){
old_prox_vector <- prox_vector
prox_vector <- (1-r)*(x %*% prox_vector) + r*restart_vector
vector_range <- range(prox_vector@x)
if (any(c(-Inf, Inf) %in% range(vector_range))) { # print the last Residue before infinity
warning(paste("Vector Range reached infinite values.",
"Stopping Random Walk at", iter, "Iterations with Residue",
residue, "instead of the intended Threshold of", Threeshold,
"for Seed", Seeds))
}
residue <- sqrt(sum((prox_vector-old_prox_vector)^2))
iter <- iter + 1;
}
NodeNames <- character(length = N)
Score = numeric(length = N)
rank_global <- data.frame(NodeNames = NodeNames, Score = Score)
rank_global$NodeNames <- gsub("_1", "", row.names(prox_vector)[seq_len(N)])
if (MeanType=="Geometric"){
rank_global$Score <- geometric.mean(as.vector(prox_vector[,1]),L,N)
} else {
if (MeanType=="Arithmetic") {
rank_global$Score <- regular.mean(as.vector(prox_vector[,1]),L,N)
} else {
rank_global$Score <- sumValues(as.vector(prox_vector[,1]),L,N)
}
}
if (DispResults=="TopScores"){
## We sort the nodes according to their score.
Global_results <-
rank_global[with(rank_global, order(-Score, NodeNames)), ]
### We remove the seed nodes from the Ranking and we write the results.
Global_results <-
Global_results[which(!Global_results$NodeNames %in% Seeds),]
} else {
Global_results <- rank_global
}
rownames(Global_results) <- c()
RWRM_ranking <- list(RWRM_Results = Global_results,Seed_Nodes = Seeds)
class(RWRM_ranking) <- "RWRM_Results"
return(RWRM_ranking)
}
get.seed.scoresMultiplex <- function(Seeds,Number_Layers,tau) {
## internal helper function from https://github.com/alberto-valdeolivas/RandomWalkRestartMH/blob/master/R/InternalFunctions.R
## required for random walk function
Nr_Seeds <- length(Seeds)
Seeds_Seeds_Scores <- rep(tau/Nr_Seeds,Nr_Seeds)
Seed_Seeds_Layer_Labeled <-
paste0(rep(Seeds,Number_Layers),sep="_",rep(seq(Number_Layers),
length.out = Nr_Seeds*Number_Layers,each=Nr_Seeds))
Seeds_Score <- data.frame(Seeds_ID = Seed_Seeds_Layer_Labeled,
Score = Seeds_Seeds_Scores, stringsAsFactors = FALSE)
return(Seeds_Score)
}
geometric.mean <- function(Scores, L, N) {
## internal helper function from https://github.com/alberto-valdeolivas/RandomWalkRestartMH/blob/master/R/InternalFunctions.R
## required for random walk function
FinalScore <- numeric(length = N)
for (i in seq_len(N)){
FinalScore[i] <- prod(Scores[seq(from = i, to = N*L, by=N)])^(1/L)
}
return(FinalScore)
}
# this function calculates a (pmi weighted) network. Based on https://github.com/TimBMK/Tools-Scripts/blob/master/Tools%20%26%20Scripts/network_snapshots.R
calculate_network <- # function to be mapped over timeframes
function(data,
vertex_a,
vertex_b,
directed = FALSE, # is the network directed? Always undirected if pmi_weighht = T
pmi_weight = TRUE, # should PMI weights be calculated? If TRUE, make sure vertex A and B are specified correctly
as_data_frame = FALSE, # should the output be returned as a data frame? removes duplicated edges (a to b | b to a)
keep_negative_weights = TRUE){ # should edges with negative weights be kept? Only applies if pmi_weight = T
require(dplyr)
require(tidyr)
require(igraph)
require(widyr)
if (pmi_weight == TRUE) {
suppressWarnings({ # suppress warnings about deprecated matrix function in pmi calculation
slice <-
data %>%
dplyr::select({{ vertex_a }}, {{ vertex_b }}) %>%
widyr::pairwise_pmi_(feature = {{vertex_a}}, item = {{vertex_b}}, sort = F) %>% dplyr::rename(weight = pmi) %>% # calculate PMI as weight (use pairwise_pmi_() avoid problems with column specification)
igraph::graph_from_data_frame(directed = F) %>% # make igraph object for slice
igraph::as_data_frame(what = "edges") %>% # temporarily convert to dataframe to identify identical a-b b-a edges
dplyr::distinct(from, to, .keep_all = TRUE) # remove duplicated edges introduced by PMI (a to b, b to a)
if (!keep_negative_weights){
slice <- slice %>% filter(weight > 0)
}
if (!as_data_frame){
slice <- slice %>% igraph::graph_from_data_frame(directed = F) # convert to igraph object. Always undirected if PMI weighted
}
})
} else {
# unweighted if not pmi-weighted
slice <-
data %>% dplyr::as_tibble() %>%
dplyr::select({{ vertex_a }}, {{ vertex_b }})
if (!as_data_frame) {
slice <- slice %>% igraph::graph_from_data_frame(directed = directed) # convert to igraph object
}
}
return(slice)
}