|
| 1 | +package dev.nipafx.demo.modern.crawler; |
| 2 | + |
| 3 | +import dev.nipafx.demo.modern.page.ExternalPage; |
| 4 | +import dev.nipafx.demo.modern.page.GitHubIssuePage; |
| 5 | +import dev.nipafx.demo.modern.page.GitHubPrPage; |
| 6 | +import org.jsoup.Jsoup; |
| 7 | +import org.jsoup.nodes.Document; |
| 8 | + |
| 9 | +import java.net.URI; |
| 10 | +import java.net.URISyntaxException; |
| 11 | +import java.util.Set; |
| 12 | +import java.util.regex.Pattern; |
| 13 | +import java.util.stream.Stream; |
| 14 | + |
| 15 | +import static java.util.stream.Collectors.toSet; |
| 16 | + |
| 17 | +class PageFactory { |
| 18 | + |
| 19 | + private static final Set<String> GITHUB_HOSTS = Set.of("github.com", "user-images.githubusercontent.com"); |
| 20 | + private static final Pattern GITHUB_TRACKED_PAGE = Pattern.compile("/issues/\\d+/?$|/pull/\\d+/?$"); |
| 21 | + private static final Pattern GITHUB_ISSUE_NUMBER = Pattern.compile(".*/issues/(\\d+)/?.*"); |
| 22 | + private static final Pattern GITHUB_PR_NUMBER = Pattern.compile(".*/pull/(\\d+)/?.*"); |
| 23 | + |
| 24 | + private static final String GITHUB_ISSUE_CONTENT_SELECTOR = "#show_issue"; |
| 25 | + private static final String GITHUB_PR_CONTENT_SELECTOR = ".clearfix.js-issues-results"; |
| 26 | + |
| 27 | + private PageFactory() { |
| 28 | + // private constructor to prevent instantiation of factory class |
| 29 | + } |
| 30 | + |
| 31 | + public static PageWithLinks parsePage(URI url, String html) { |
| 32 | + // turn this into an `if`, I dare you! |
| 33 | + return switch (url) { |
| 34 | + case URI u when u.getHost().equals("github.com") && u.getPath().contains("/issues/") -> parseIssuePage(url, html); |
| 35 | + case URI u when u.getHost().equals("github.com") && u.getPath().contains("/pull/") -> parsePrPage(url, html); |
| 36 | + default -> parseExternalPage(url, html); |
| 37 | + }; |
| 38 | + } |
| 39 | + |
| 40 | + static PageWithLinks parseIssuePage(URI url, String html) { |
| 41 | + var document = Jsoup.parse(html); |
| 42 | + var content = extractContent(document, GITHUB_ISSUE_CONTENT_SELECTOR); |
| 43 | + var links = extractLinks(url, document, GITHUB_ISSUE_CONTENT_SELECTOR); |
| 44 | + var issueNr = getFirstMatchAsNumber(GITHUB_ISSUE_NUMBER, url); |
| 45 | + return new PageWithLinks(new GitHubIssuePage(url, content, issueNr), links); |
| 46 | + } |
| 47 | + |
| 48 | + static PageWithLinks parsePrPage(URI url, String html) { |
| 49 | + var document = Jsoup.parse(html); |
| 50 | + var content = extractContent(document, GITHUB_PR_CONTENT_SELECTOR); |
| 51 | + var links = extractLinks(url, document, GITHUB_PR_CONTENT_SELECTOR); |
| 52 | + var issueNr = getFirstMatchAsNumber(GITHUB_PR_NUMBER, url); |
| 53 | + return new PageWithLinks(new GitHubPrPage(url, content, issueNr), links); |
| 54 | + } |
| 55 | + |
| 56 | + private static PageWithLinks parseExternalPage(URI url, String html) { |
| 57 | + return new PageWithLinks(new ExternalPage(url, html), Set.of()); |
| 58 | + } |
| 59 | + |
| 60 | + private static String extractContent(Document document, String cssContentSelector) { |
| 61 | + var selectedElements = document.select(cssContentSelector); |
| 62 | + if (selectedElements.size() != 1) |
| 63 | + throw new IllegalArgumentException("The CSS selector '%s' yielded %d elements".formatted(cssContentSelector, selectedElements.size())); |
| 64 | + return selectedElements.getFirst().toString(); |
| 65 | + } |
| 66 | + |
| 67 | + private static Set<URI> extractLinks(URI url, Document document, String cssContentSelector) { |
| 68 | + return document |
| 69 | + .select(cssContentSelector + " a[href]").stream() |
| 70 | + .map(element -> element.attribute("href").getValue()) |
| 71 | + .flatMap(href -> normalizePotentialLink(url, href)) |
| 72 | + .filter(PageFactory::shouldRegisterLink) |
| 73 | + .collect(toSet()); |
| 74 | + } |
| 75 | + |
| 76 | + private static Stream<URI> normalizePotentialLink(URI pageUrl, String href) { |
| 77 | + if (href == null || href.isBlank()) |
| 78 | + return Stream.empty(); |
| 79 | + |
| 80 | + try { |
| 81 | + var url = pageUrl.resolve(new URI(href)); |
| 82 | + var isCyclicLink = url.equals(pageUrl); |
| 83 | + if (isCyclicLink) |
| 84 | + return Stream.empty(); |
| 85 | + return Stream.of(url); |
| 86 | + } catch (URISyntaxException ex) { |
| 87 | + // nothing to be done |
| 88 | + return Stream.empty(); |
| 89 | + } |
| 90 | + } |
| 91 | + |
| 92 | + private static boolean shouldRegisterLink(URI url) { |
| 93 | + if (url.getHost() == null) |
| 94 | + return false; |
| 95 | + |
| 96 | + var isExternalUrl = !GITHUB_HOSTS.contains(url.getHost()); |
| 97 | + return isExternalUrl || GITHUB_TRACKED_PAGE.matcher(url.toString()).find(); |
| 98 | + } |
| 99 | + |
| 100 | + private static int getFirstMatchAsNumber(Pattern pattern, URI url) { |
| 101 | + var issueNumberMatcher = pattern.matcher(url.toString()); |
| 102 | + var found = issueNumberMatcher.find(); |
| 103 | + if (!found) |
| 104 | + throw new IllegalStateException("Alleged issue/PR URL %s does not seem to contain a number.".formatted(url)); |
| 105 | + return Integer.parseInt(issueNumberMatcher.group(1)); |
| 106 | + } |
| 107 | + |
| 108 | +} |
0 commit comments