Skip to content

Commit a0c8897

Browse files
committed
initial commit
1 parent 5f9ba66 commit a0c8897

File tree

2 files changed

+143
-0
lines changed

2 files changed

+143
-0
lines changed

crossResourceWorklowScraper.Rproj

+13
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,13 @@
1+
Version: 1.0
2+
3+
RestoreWorkspace: Default
4+
SaveWorkspace: Default
5+
AlwaysSaveHistory: Default
6+
7+
EnableCodeIndexing: Yes
8+
UseSpacesForTab: Yes
9+
NumSpacesForTab: 2
10+
Encoding: UTF-8
11+
12+
RnwWeave: Sweave
13+
LaTeX: pdfLaTeX

scrapeGithub.Rmd

+130
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,130 @@
1+
---
2+
title: "scrapeGithub"
3+
output: html_document
4+
---
5+
### An individual scripts a tool to link resources
6+
7+
In this case a user has written a script to link a data resource, and the R package written for that resource, to resources in GitHub. The intention here is to survey ways in which the data resource is being used in analytic workflows.
8+
9+
```{r ropensci_links, echo=TRUE, warning=FALSE, message=FALSE, results='hide'}
10+
library(magrittr)
11+
library(tidyverse)
12+
13+
# Download the rOpenSci package registry.  The `packages` element is treated
# as a data frame below (it is handed straight to `filter()`).
ropensci_registry <- jsonlite::fromJSON(
  "https://raw.githubusercontent.com/ropensci/roregistry/master/registry.json"
)

packages <- ropensci_registry$packages

# Restrict to packages registered under the "data-access" category and
# collect every distinct tag that appears in their `category` field.
dd <- filter(packages, ropensci_category == "data-access")
all.ddcat <- unique(unlist(dd$category))

# Category tags considered relevant for this survey.
# ("weather" was previously misspelled "waether", so it could never match.)
to.use <- c("biology", "archeology", "maps", "ecology", "chemistry",
            "conservation", "geospatial", "environmental", "weather",
            "climate", "biodiversity", "animals", "plants", "water",
            "hydrology", "noaa", "NOAA", "ecosystem", "remote-sensing")

# NOTE(review): `unlist(dd$category)` above suggests `category` may hold
# several tags per package; if so, `%in%` compares whole entries rather than
# individual tags -- confirm this filter selects what is intended.
dd.touse <- filter(dd, category %in% to.use)

# GitHub API token, read from a local file named `gh.token`.
gh_token <- scan('gh.token', what = 'character')

# Resume from previously saved search results when they exist; otherwise
# start with an empty result list.
if (file.exists('data/all_github.rds')) {
  test_pks <- readRDS('data/all_github.rds')
} else {
  test_pks <- list()
}

# URL recorded in each annotation body as the script that generated the link.
script_home <- 'https://github.com/throughput-ec/throughputdb/blob/master/populate/case_study.Rmd'
35+
36+
# One GitHub code search per package name.  `packages` comes from the registry
# as a data frame, so indexing it with `packages[i]` selects a *column*; the
# loop needs a single package name per iteration instead.
# NOTE(review): assumes the registry stores the package name in a `name`
# column -- confirm against registry.json.
pkg_names <- if (is.data.frame(packages)) packages[["name"]] else unlist(packages)
stopifnot(!is.null(pkg_names))

# Results are written to `data/` after every iteration; make sure it exists.
dir.create('data', showWarnings = FALSE)

for (i in seq_along(pkg_names)) {

  # Searches are issued one at a time (with a pause) to avoid getting caught
  # by GitHub's abuse detection.
  x <- pkg_names[[i]]

  # Only hit the API for packages that are not already in the cached results.
  if (length(test_pks) < i) {
    Sys.sleep(5) # This is probably longer than it needs to be. . .

    repos <- gh::gh(paste0('/search/code?q=library(', x,
                           ')+in:file+language:R+extension:R+extension:Rmd'),
                    .token = gh_token)

    annotation_text <- paste0("The GitHub repository uses the package ",
                              x, " in a `library()` or `require()` call.")

    # Distinct repository URLs among the search hits (empty when no hits).
    repo_list <- unique(vapply(repos$items,
                               function(item) item$repository$html_url,
                               character(1)))

    # One URL target object per repository.
    target_list <- lapply(repo_list,
                          function(y) object(value = y, type = 'URL'))

    if (length(target_list) > 0) {

      # Everything `link_record()` needs for this package, stored so the run
      # can be resumed from the saved file.
      test_pks[[i]] <- list(
        target = target_list,
        body = list(object(value = paste0('http://github.com/ropensci/', x),
                           type = 'URL'),
                    object(type = "annotationText",
                           value = annotation_text),
                    object(type = 'URL',
                           value = script_home)),
        generator = creator(identifier = '0000-0002-2700-4605',
                            PropertyID = 'orcid',
                            lastName = 'Goring',
                            firstName = 'Simon'),
        body_rel = object(type = 'URL',
                          value = 'https://ropensci.org/'),
        source = object(type = 'URL',
                        value = 'http://github.com'))
    } else {
      # No repositories found: keep an empty placeholder so indices line up.
      test_pks[[i]] <- list()
    }
  }

  # Push the record through the connection `con` when there is one to send.
  if (length(test_pks[[i]]) > 0) {

    link_record(con,
                target = test_pks[[i]]$target,
                body = test_pks[[i]]$body,
                generator = test_pks[[i]]$generator,
                body_rel = test_pks[[i]]$body_rel,
                source = test_pks[[i]]$source)
  }

  # Persist progress after every package and print the index as a heartbeat.
  saveRDS(object = test_pks, file = 'data/all_github.rds')
  cat(i, '\n')

}
100+
101+
```
102+
103+
<div class='box curveshdw' align='center'>
104+
105+
```{r ropensci_graph, echo=FALSE, fig.align = 'center', fig.width = 4, fig.cap = 'rOpenSci package use within the GitHub environment.'}
106+
107+
# Up to 300 distinct nodes that take part in at least one relationship; each
# gets its labels as `type` and a constant `size`.
nodes <- cypher(con, "MATCH (n1)-[r1]-(n2)
RETURN DISTINCT ID(n1) AS id, LABELS(n1) AS type, '12' AS size LIMIT 300")

# Up to 300 relationships, as (source id, target id, relationship type).
edges <- cypher(con, "MATCH (n1)-[r1]-(n2)
RETURN ID(n1) AS source, ID(n2) AS target, type(r1) AS type LIMIT 300")

# Translate database ids into zero-based row positions within `nodes`.
fd_data <- data.frame(source = match(edges$source, nodes$id) - 1L,
                      target = match(edges$target, nodes$id) - 1L,
                      rel = edges$type)

# The two queries are limited independently, so an edge can point at a node
# that is absent from `nodes`; `match()` returns NA for those.  Drop such
# links so every remaining link refers to a node that is actually drawn.
fd_data <- fd_data[!is.na(fd_data$source) & !is.na(fd_data$target), ,
                   drop = FALSE]

forceNetwork(Links = fd_data,
             Nodes = nodes,
             Source = 'source',
             Target = 'target',
             Value = 'rel',
             NodeID = 'type',
             Group = 'type',
             Nodesize = 'size',
             height = 600,
             width = 600,
             opacity = 1,
             colourScale = JS("d3.scaleOrdinal(d3.schemeCategory10);"))
129+
130+
```

</div>

0 commit comments

Comments
 (0)