Skip to content

Commit 97d2d46

Browse files
committed
[>] Complete page factory
1 parent b8fc42c commit 97d2d46

File tree

6 files changed

+261
-47
lines changed

6 files changed

+261
-47
lines changed

README.md

+42-33
Original file line numberDiff line numberDiff line change
@@ -4,42 +4,51 @@ A repository for my live-coding talk [Modern Java in Action](https://nipafx.dev/
44

55
## Next
66

7-
Operations:
8-
* implement methods in `Pretty`:
9-
```java
10-
public static String pageList(Page rootPage) {
11-
if (!(rootPage instanceof GitHubPage ghPage))
12-
return pageName(rootPage);
13-
14-
return ghPage
15-
.subtree()
16-
.map(Pretty::pageName)
17-
.collect(joining("\n"));
18-
}
7+
Records:
8+
* create `record PageWithLinks(Page page, Set<URI> links)`
9+
* additional constructor without `links`
1910

20-
public static String pageName(Page page) {
21-
return switch (page) {
22-
case ErrorPage(URI url, _) -> "💥 ERROR: " + url.getHost();
23-
case ExternalPage(URI url, _) -> "💤 EXTERNAL: " + url.getHost();
24-
case GitHubIssuePage(_, _, _, int nr) -> "🐈 ISSUE #" + nr;
25-
case GitHubPrPage(_, _, _, int nr) -> "🐙 PR #" + nr;
26-
};
27-
}
11+
Modules:
12+
* fix errors in `PageFactory`: `requires org.jsoup;`
13+
* fix errors in `PageTreeFactory`: `requires java.net.http;`
14+
15+
HTTP client:
16+
* instantiate `HttpClient` in `GitHubCrawl`:
17+
```java
18+
var client = HttpClient.newHttpClient();
2819
```
29-
* implement `Statistician::evaluatePage`:
20+
* `PageTreeFactory::fetchPageAsString`:
3021
```java
31-
private void evaluatePage(Page page) {
32-
if (evaluatedPages.contains(page))
33-
return;
34-
evaluatedPages.add(page);
35-
36-
switch (page) {
37-
case ErrorPage _ -> numberOfErrors++;
38-
case ExternalPage _ -> numberOfExternalLinks++;
39-
case GitHubIssuePage _ -> numberOfIssues++;
40-
case GitHubPrPage _ -> numberOfPrs++;
41-
}
22+
var request = HttpRequest
23+
.newBuilder(url)
24+
.GET()
25+
.build();
26+
return client
27+
.send(request, BodyHandlers.ofString())
28+
.body();
29+
```
30+
31+
Structured Concurrency:
32+
* `PageTreeFactory::resolveLinks`:
33+
```java
34+
try (var scope = new StructuredTaskScope.ShutdownOnFailure()) {
35+
var futurePages = new ArrayList<Subtask<Page>>();
36+
for (URI link : links)
37+
futurePages.add(scope.fork(() -> createPage(link, depth)));
38+
39+
scope.join();
40+
scope.throwIfFailed();
41+
42+
return futurePages.stream()
43+
.map(Subtask::get)
44+
.collect(toSet());
45+
} catch (ExecutionException ex) {
46+
// this should not happen as `ErrorPage` instances should have been created for all errors
47+
throw new IllegalStateException("Error cases should have been handled during page creation!", ex);
4248
}
4349
```
4450

45-
Run `GitHubCrawl`.
51+
Run:
52+
* add breakpoint for issue #740
53+
* run with arguments `https://github.com/junit-pioneer/junit-pioneer/issues/624 10`
54+
* create and show thread dump

src/main/java/dev/nipafx/demo/modern/GitHubCrawl.java

+9-9
Original file line numberDiff line numberDiff line change
@@ -1,14 +1,12 @@
11
package dev.nipafx.demo.modern;
22

3+
import dev.nipafx.demo.modern.crawler.PageTreeFactory;
34
import dev.nipafx.demo.modern.operations.Pretty;
45
import dev.nipafx.demo.modern.operations.Statistician;
5-
import dev.nipafx.demo.modern.page.ExternalPage;
6-
import dev.nipafx.demo.modern.page.GitHubIssuePage;
7-
import dev.nipafx.demo.modern.page.GitHubPrPage;
86

97
import java.net.URI;
108
import java.net.URISyntaxException;
11-
import java.util.Set;
9+
import java.net.http.HttpClient;
1210

1311
public class GitHubCrawl {
1412

@@ -19,11 +17,13 @@ public class GitHubCrawl {
1917
public static void main(String[] args) throws Exception {
2018
var config = Configuration.parse(args);
2119

22-
var rootPage = new GitHubIssuePage(URI.create("https://github.com/junit-pioneer/junit-pioneer/issues/624"), "",
23-
Set.of(
24-
new GitHubPrPage(URI.create("https://github.com/junit-pioneer/junit-pioneer/pull/629"), "", Set.of(), 629),
25-
new ExternalPage(URI.create("https://fasterxml.github.io/jackson-databind/javadoc/2.7/com/fasterxml/jackson/databind/ObjectMapper.html#findAndRegisterModules()"), "")
26-
), 624);
20+
System.out.printf("%nTo see virtual threads in action, run this while the app is resolving a bunch of links:%n");
21+
System.out.printf("jcmd %s Thread.dump_to_file -format=json -overwrite threads.json%n%n", ProcessHandle.current().pid());
22+
23+
// TODO
24+
var client = (HttpClient) null;
25+
var factory = new PageTreeFactory(client);
26+
var rootPage = factory.createPage(config.seedUrl(), config.depth());
2727

2828
System.out.println(Statistician.evaluate(rootPage));
2929
System.out.println(Pretty.pageList(rootPage));
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,108 @@
1+
package dev.nipafx.demo.modern.crawler;
2+
3+
import dev.nipafx.demo.modern.page.ExternalPage;
4+
import dev.nipafx.demo.modern.page.GitHubIssuePage;
5+
import dev.nipafx.demo.modern.page.GitHubPrPage;
6+
import org.jsoup.Jsoup;
7+
import org.jsoup.nodes.Document;
8+
9+
import java.net.URI;
10+
import java.net.URISyntaxException;
11+
import java.util.Set;
12+
import java.util.regex.Pattern;
13+
import java.util.stream.Stream;
14+
15+
import static java.util.stream.Collectors.toSet;
16+
17+
class PageFactory {
18+
19+
private static final Set<String> GITHUB_HOSTS = Set.of("github.com", "user-images.githubusercontent.com");
20+
private static final Pattern GITHUB_TRACKED_PAGE = Pattern.compile("/issues/\\d+/?$|/pull/\\d+/?$");
21+
private static final Pattern GITHUB_ISSUE_NUMBER = Pattern.compile(".*/issues/(\\d+)/?.*");
22+
private static final Pattern GITHUB_PR_NUMBER = Pattern.compile(".*/pull/(\\d+)/?.*");
23+
24+
private static final String GITHUB_ISSUE_CONTENT_SELECTOR = "#show_issue";
25+
private static final String GITHUB_PR_CONTENT_SELECTOR = ".clearfix.js-issues-results";
26+
27+
private PageFactory() {
28+
// private constructor to prevent instantiation of factory class
29+
}
30+
31+
public static PageWithLinks parsePage(URI url, String html) {
32+
// turn this into an `if`, I dare you!
33+
return switch (url) {
34+
case URI u when u.getHost().equals("github.com") && u.getPath().contains("/issues/") -> parseIssuePage(url, html);
35+
case URI u when u.getHost().equals("github.com") && u.getPath().contains("/pull/") -> parsePrPage(url, html);
36+
default -> parseExternalPage(url, html);
37+
};
38+
}
39+
40+
static PageWithLinks parseIssuePage(URI url, String html) {
41+
var document = Jsoup.parse(html);
42+
var content = extractContent(document, GITHUB_ISSUE_CONTENT_SELECTOR);
43+
var links = extractLinks(url, document, GITHUB_ISSUE_CONTENT_SELECTOR);
44+
var issueNr = getFirstMatchAsNumber(GITHUB_ISSUE_NUMBER, url);
45+
return new PageWithLinks(new GitHubIssuePage(url, content, issueNr), links);
46+
}
47+
48+
static PageWithLinks parsePrPage(URI url, String html) {
49+
var document = Jsoup.parse(html);
50+
var content = extractContent(document, GITHUB_PR_CONTENT_SELECTOR);
51+
var links = extractLinks(url, document, GITHUB_PR_CONTENT_SELECTOR);
52+
var issueNr = getFirstMatchAsNumber(GITHUB_PR_NUMBER, url);
53+
return new PageWithLinks(new GitHubPrPage(url, content, issueNr), links);
54+
}
55+
56+
private static PageWithLinks parseExternalPage(URI url, String html) {
57+
return new PageWithLinks(new ExternalPage(url, html), Set.of());
58+
}
59+
60+
private static String extractContent(Document document, String cssContentSelector) {
61+
var selectedElements = document.select(cssContentSelector);
62+
if (selectedElements.size() != 1)
63+
throw new IllegalArgumentException("The CSS selector '%s' yielded %d elements".formatted(cssContentSelector, selectedElements.size()));
64+
return selectedElements.getFirst().toString();
65+
}
66+
67+
private static Set<URI> extractLinks(URI url, Document document, String cssContentSelector) {
68+
return document
69+
.select(cssContentSelector + " a[href]").stream()
70+
.map(element -> element.attribute("href").getValue())
71+
.flatMap(href -> normalizePotentialLink(url, href))
72+
.filter(PageFactory::shouldRegisterLink)
73+
.collect(toSet());
74+
}
75+
76+
private static Stream<URI> normalizePotentialLink(URI pageUrl, String href) {
77+
if (href == null || href.isBlank())
78+
return Stream.empty();
79+
80+
try {
81+
var url = pageUrl.resolve(new URI(href));
82+
var isCyclicLink = url.equals(pageUrl);
83+
if (isCyclicLink)
84+
return Stream.empty();
85+
return Stream.of(url);
86+
} catch (URISyntaxException ex) {
87+
// nothing to be done
88+
return Stream.empty();
89+
}
90+
}
91+
92+
private static boolean shouldRegisterLink(URI url) {
93+
if (url.getHost() == null)
94+
return false;
95+
96+
var isExternalUrl = !GITHUB_HOSTS.contains(url.getHost());
97+
return isExternalUrl || GITHUB_TRACKED_PAGE.matcher(url.toString()).find();
98+
}
99+
100+
private static int getFirstMatchAsNumber(Pattern pattern, URI url) {
101+
var issueNumberMatcher = pattern.matcher(url.toString());
102+
var found = issueNumberMatcher.find();
103+
if (!found)
104+
throw new IllegalStateException("Alleged issue/PR URL %s does not seem to contain a number.".formatted(url));
105+
return Integer.parseInt(issueNumberMatcher.group(1));
106+
}
107+
108+
}
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,79 @@
1+
package dev.nipafx.demo.modern.crawler;
2+
3+
import dev.nipafx.demo.modern.page.ErrorPage;
4+
import dev.nipafx.demo.modern.page.ExternalPage;
5+
import dev.nipafx.demo.modern.page.GitHubIssuePage;
6+
import dev.nipafx.demo.modern.page.GitHubPrPage;
7+
import dev.nipafx.demo.modern.page.Page;
8+
9+
import java.io.IOException;
10+
import java.net.URI;
11+
import java.net.http.HttpClient;
12+
import java.net.http.HttpRequest;
13+
import java.net.http.HttpResponse.BodyHandlers;
14+
import java.util.ArrayList;
15+
import java.util.Collections;
16+
import java.util.Set;
17+
import java.util.concurrent.ConcurrentHashMap;
18+
import java.util.concurrent.ConcurrentMap;
19+
import java.util.concurrent.ExecutionException;
20+
import java.util.concurrent.StructuredTaskScope;
21+
import java.util.concurrent.StructuredTaskScope.Subtask;
22+
23+
import static java.util.Objects.requireNonNull;
24+
import static java.util.stream.Collectors.toSet;
25+
26+
public class PageTreeFactory {
27+
28+
private final HttpClient client;
29+
private final ConcurrentMap<URI, Page> resolvedPages;
30+
31+
public PageTreeFactory(HttpClient client) {
32+
this.client = requireNonNull(client);
33+
resolvedPages = new ConcurrentHashMap<>();
34+
}
35+
36+
public Page createPage(URI url, int depth) throws InterruptedException {
37+
if (resolvedPages.containsKey(url)) {
38+
System.out.printf("Found cached '%s'%n", url);
39+
return resolvedPages.get(url);
40+
}
41+
42+
System.out.printf("Resolving '%s'...%n", url);
43+
var pageWithLinks = fetchPageWithLinks(url);
44+
var page = pageWithLinks.page();
45+
resolvedPages.computeIfAbsent(page.url(), __ -> page);
46+
System.out.printf("Resolved '%s' with children: %s%n", url, pageWithLinks.links());
47+
48+
return switch (page) {
49+
case GitHubIssuePage(var isUrl, var content, _, int nr) ->
50+
new GitHubIssuePage(isUrl, content, resolveLinks(pageWithLinks.links(), depth - 1), nr);
51+
case GitHubPrPage(var prUrl, var content, _, int nr) ->
52+
new GitHubIssuePage(prUrl, content, resolveLinks(pageWithLinks.links(), depth - 1), nr);
53+
case ExternalPage _, ErrorPage _ -> page;
54+
};
55+
}
56+
57+
private PageWithLinks fetchPageWithLinks(URI url) throws InterruptedException {
58+
try {
59+
var pageBody = fetchPageAsString(url);
60+
return PageFactory.parsePage(url, pageBody);
61+
} catch (InterruptedException iex) {
62+
throw iex;
63+
} catch (Exception ex) {
64+
return new PageWithLinks(new ErrorPage(url, ex));
65+
}
66+
}
67+
68+
private String fetchPageAsString(URI url) throws IOException, InterruptedException {
69+
// TODO: create and send HTTP request
70+
}
71+
72+
private Set<Page> resolveLinks(Set<URI> links, int depth) throws InterruptedException {
73+
if (depth < 0)
74+
return Collections.emptySet();
75+
76+
// TODO: resolve links in StructuredTaskScope
77+
}
78+
79+
}

src/main/java/dev/nipafx/demo/modern/operations/Pretty.java

+13-4
Original file line numberDiff line numberDiff line change
@@ -18,13 +18,22 @@ private Pretty() {
1818
}
1919

2020
public static String pageList(Page rootPage) {
21-
// TODO
22-
return "";
21+
if (!(rootPage instanceof GitHubPage ghPage))
22+
return pageName(rootPage);
23+
24+
return ghPage
25+
.subtree()
26+
.map(Pretty::pageName)
27+
.collect(joining("\n"));
2328
}
2429

2530
public static String pageName(Page page) {
26-
// TODO
27-
return "";
31+
return switch (page) {
32+
case ErrorPage(URI url, _) -> "💥 ERROR: " + url.getHost();
33+
case ExternalPage(URI url, _) -> "💤 EXTERNAL: " + url.getHost();
34+
case GitHubIssuePage(_, _, _, int nr) -> "🐈 ISSUE #" + nr;
35+
case GitHubPrPage(_, _, _, int nr) -> "🐙 PR #" + nr;
36+
};
2837
}
2938

3039
}

src/main/java/dev/nipafx/demo/modern/operations/Statistician.java

+10-1
Original file line numberDiff line numberDiff line change
@@ -37,7 +37,16 @@ private void evaluateTree(Page page) {
3737
}
3838

3939
private void evaluatePage(Page page) {
40-
// TODO
40+
if (evaluatedPages.contains(page))
41+
return;
42+
evaluatedPages.add(page);
43+
44+
switch (page) {
45+
case ErrorPage _ -> numberOfErrors++;
46+
case ExternalPage _ -> numberOfExternalLinks++;
47+
case GitHubIssuePage _ -> numberOfIssues++;
48+
case GitHubPrPage _ -> numberOfPrs++;
49+
}
4150
}
4251

4352
private Stats result() {

0 commit comments

Comments
 (0)