|
| 1 | +--- |
| 2 | +title: "scrapeGithub" |
| 3 | +output: html_document |
| 4 | +--- |
| 5 | +### An individual scripts a tool to link resources |
| 6 | + |
| 7 | +In this case a user has written a script to link a data resource, and the R package written for that resource, to resources in GitHub. The intention here is to survey ways in which the data resource is being used in analytic workflows. |
| 8 | + |
```{r ropensci_links, echo=TRUE, warning=FALSE, message=FALSE, results='hide'}
# Link rOpenSci packages to the GitHub repositories that load them.
#
# For each package in the rOpenSci registry this chunk searches GitHub code
# for `library(<package>)`, builds one annotation record per package, and
# passes it to `link_record()`. Search results are checkpointed to
# 'data/all_github.rds' after every package so an interrupted run can resume
# without repeating API calls.
#
# Relies on names defined outside this chunk: `con`, `object()`, `creator()`
# and `link_record()`.
library(magrittr)
library(tidyverse)

ropensci_registry <- jsonlite::fromJSON("https://raw.githubusercontent.com/ropensci/roregistry/master/registry.json")

packages <- ropensci_registry$packages

dd <- filter(packages, ropensci_category == "data-access")
all.ddcat <- unique(unlist(dd$category))

to.use <- c("biology", "archeology", "maps", "ecology", "chemistry",
            "conservation", "geospatial", "environmental", "weather",
            "climate", "biodiversity", "animals", "plants", "water",
            "hydrology", "noaa", "NOAA", "ecosystem", "remote-sensing")

# NOTE(review): `dd.touse` is computed but never used below; the loop runs over
# every package in the registry. Confirm whether the subset was intended.
dd.touse <- filter(dd, category %in% to.use)
gh_token <- scan('gh.token', what = 'character')

# Resume from the checkpoint written by a previous run, if there is one.
cache_file <- 'data/all_github.rds'
if (file.exists(cache_file)) {
  test_pks <- readRDS(cache_file)
} else {
  test_pks <- list()
}

# saveRDS() below fails if the directory is missing, so create it up front
# rather than after the first (rate-limited) API call has been spent.
if (!dir.exists('data')) {
  dir.create('data')
}

script_home <- 'https://github.com/throughput-ec/throughputdb/blob/master/populate/case_study.Rmd'

# `packages` is a data frame, so iterate over its package names rather than
# over `1:length(packages)`, which would walk the columns.
# NOTE(review): assumes the registry table exposes the package name in a
# `name` column -- confirm against the registry schema.
pkg_names <- packages[["name"]]
if (is.null(pkg_names)) {
  stop("The registry `packages` table has no `name` column.", call. = FALSE)
}

# The checkpoint is indexed by position in `pkg_names`; a cache written
# against a different package ordering will not line up with this loop.
for (i in seq_along(pkg_names)) {

  # I had to serialize this to avoid getting caught by GitHub's abuse detection.

  x <- pkg_names[i]

  # Only query GitHub for packages that are not in the checkpoint yet.
  if (length(test_pks) < i) {
    Sys.sleep(5)  # This is probably longer than it needs to be. . .

    repos <- gh::gh(paste0('/search/code?q=library(', x,
                           ')+in:file+language:R+extension:R+extension:Rmd'),
                    .token = gh_token)

    # NOTE(review): the text mentions `require()` but the query above only
    # searches for `library(`.
    annotation_text <- paste0("The GitHub repository uses the package ",
                              x, " in a `library()` or `require()` call.")

    # vapply() keeps the result a character vector even when there are no hits.
    repo_list <- unique(vapply(repos$items,
                               function(item) item$repository$html_url,
                               character(1)))

    # One URL target per repository; an empty `repo_list` gives an empty list.
    target_list <- lapply(repo_list,
                          function(y) object(value = y, type = 'URL'))

    if (length(target_list) > 0) {

      test_pks[[i]] <- list(
        target = target_list,
        body = list(object(value = paste0('http://github.com/ropensci/', x),
                           type = 'URL'),
                    object(type = "annotationText",
                           value = annotation_text),
                    object(type = 'URL',
                           value = script_home)),
        generator = creator(identifier = '0000-0002-2700-4605',
                            PropertyID = 'orcid',
                            lastName = 'Goring',
                            firstName = 'Simon'),
        body_rel = object(type = 'URL',
                          value = 'https://ropensci.org/'),
        source = object(type = 'URL',
                        value = 'http://github.com'))
    } else {
      # Store an empty placeholder so this package is not searched again.
      test_pks[[i]] <- list()
    }
  }

  # Runs for both cached and freshly fetched records.
  if (length(test_pks[[i]]) > 0) {

    link_record(con,
                target = test_pks[[i]]$target,
                body = test_pks[[i]]$body,
                generator = test_pks[[i]]$generator,
                body_rel = test_pks[[i]]$body_rel,
                source = test_pks[[i]]$source)
  }

  # Checkpoint after every package, then print progress.
  saveRDS(object = test_pks, file = cache_file)
  cat(i, '\n')

}

```
| 102 | + |
| 103 | +<div class='box curveshdw' align='center'> |
| 104 | + |
```{r ropensci_graph, echo=FALSE, fig.align = 'center', fig.width = 4, fig.cap = 'rOpenSci package use within the GitHub environment.'}
# Draw a force-directed graph of the linked records.
#
# Uses `con`, `cypher()`, `forceNetwork()` and `JS()`, all of which must be
# available from outside this chunk.

# Up to 300 distinct nodes that take part in at least one relationship.
nodes <- cypher(con, "MATCH (n1)-[r1]-(n2)
                      RETURN DISTINCT ID(n1) AS id, LABELS(n1) AS type, '12' AS size LIMIT 300")

# Up to 300 relationships, each with its endpoint ids and relationship type.
edges <- cypher(con, "MATCH (n1)-[r1]-(n2)
                      RETURN ID(n1) AS source, ID(n2) AS target, type(r1) AS type LIMIT 300")

# Translate database ids into zero-based row positions in `nodes`. An endpoint
# that is not among the returned nodes becomes NA.
fd_data <- data.frame(source = match(edges$source, nodes$id) - 1L,
                      target = match(edges$target, nodes$id) - 1L,
                      rel = edges$type)

forceNetwork(Links = fd_data,
             Nodes = nodes,
             Source = 'source',
             Target = 'target',
             Value = 'rel',
             NodeID = 'type',
             Group = 'type',
             Nodesize = 'size',
             height = 600,
             width = 600,
             opacity = 1,
             colourScale = JS("d3.scaleOrdinal(d3.schemeCategory10);"))

```
0 commit comments