cds

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.net.URI;
import java.net.URL;
import java.net.URLConnection;
import java.nio.charset.StandardCharsets;
import java.util.Set;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.CountDownLatch;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.RejectedExecutionException;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * Advanced High-Performance Web Crawler using Java Virtual Threads.
 * Save this file as WebCrawler.java
 */
public class WebCrawler {

    // Thread-safe set to track visited URLs and prevent infinite loops
    private static final Set<String> visitedUrls = ConcurrentHashMap.newKeySet();
    
    // Regex pattern to extract HTTP/HTTPS links from HTML content
    private static final Pattern URL_PATTERN = Pattern.compile("https?://[\\w\\d./?=&-]+");
    
    // Limits crawler depth to prevent excessive resource consumption
    private static final int MAX_DEPTH = 3; 

    public static void main(String[] args) {
        String seedUrl = "https://wikipedia.org";
        System.out.println("🚀 Starting Advanced Web Crawler with Virtual Threads...");
        System.out.println("Seed URL: " + seedUrl + "\n");

        // Using modern Virtual Threads for massive scalability without heavy OS thread overhead
        try (ExecutorService executor = Executors.newVirtualThreadPerTaskExecutor()) {
            crawl(seedUrl, 0, executor);
            
            // Keeps main thread alive briefly to await virtual thread execution completion
            Thread.sleep(10000); 
        } catch (InterruptedException e) {
            System.err.println("Crawler interrupted: " + e.getMessage());
            Thread.currentThread().interrupt();
        }
        
        System.out.println("\n🏁 Crawling session finished. Total unique URLs cached: " + visitedUrls.size());
    }

    /**
     * Recursively crawls webpages asynchronously.
     */
    private static void crawl(String url, int depth, ExecutorService executor) {
        if (depth > MAX_DEPTH || visitedUrls.contains(url)) {
            return;
        }

        // Atomic check-and-act to ensure unique page hits
        if (visitedUrls.add(url)) {
            System.out.printf("[Depth %d] 🕸️ Submitting Task for: %s%n", depth, url);
            
            // Asynchronously dispatch the HTTP fetch and parse operation
            executor.submit(() -> {
                String htmlContent = fetchPageSource(url);
                if (!htmlContent.isEmpty()) {
                    parseAndCrawlLinks(htmlContent, depth, executor);
                }
            });
        }
    }

    /**
     * Fetches raw HTML source text using standard low-overhead network I/O.
     */
    private static String fetchPageSource(String rawUrl) {
        StringBuilder rawHtml = new StringBuilder();
        try {
            URI uri = URI.create(rawUrl);
            URL url = uri.toURL();
            
            try (BufferedReader reader = new BufferedReader(new InputStreamReader(url.openStream()))) {
                String line;
                while ((line = reader.readLine()) != null) {
                    rawHtml.append(line).append("\n");
                }
            }
        } catch (Exception e) {
            // Suppress network errors for dead or timed-out links
            return "";
        }
        return rawHtml.toString();
    }

    /**
     * Parses HTML content using regex patterns and spawns child crawl tasks.
     */
    private static void parseAndCrawlLinks(String html, int currentDepth, ExecutorService executor) {
        Matcher matcher = URL_PATTERN.matcher(html);
        while (matcher.find()) {
            String extractedUrl = matcher.group();
            // Self-schedule discovered links onto the virtual thread pool
            crawl(extractedUrl, currentDepth + 1, executor);
        }
    }
}


Provide feedback

Saved searches

Use saved searches to filter your results more quickly

cds #306

Metadata

Assignees

Labels

Type

Fields

Projects

Milestone

Relationships

Development

cds #306

Description

Metadata

Metadata

Assignees

Labels

Type

Fields

Projects

Milestone

Relationships

Development

Issue actions