// Driver fragment: runs the crawl on a virtual-thread-per-task executor, so each submitted
// task gets its own virtual thread instead of a pooled OS thread.
// NOTE(review): these statements sit outside any class or method, which Java does not allow.
// They reference seedUrl, crawl(...) and visitedUrls, so presumably this is the body of
// WebCrawler.main and belongs there -- confirm and relocate.
try (ExecutorService executor = Executors.newVirtualThreadPerTaskExecutor()) {
crawl(seedUrl, 0, executor);
// Fixed 10-second pause on the calling thread before the try block exits. Leaving the
// block closes the executor; per the ExecutorService.close() contract that stops new
// submissions and waits for already-submitted tasks. The sleep is therefore presumably the
// window in which running tasks may still schedule follow-up work -- TODO confirm intent.
Thread.sleep(10000);
} catch (InterruptedException e) {
System.err.println("Crawler interrupted: " + e.getMessage());
// Restore the interrupt flag so code further up the stack can still observe it.
Thread.currentThread().interrupt();
}
// Report how many entries visitedUrls holds once the executor has been closed.
System.out.println("\n🏁 Crawling session finished. Total unique URLs cached: " + visitedUrls.size());
import java.io.BufferedReader;
import java.io.InputStreamReader;
import java.net.URI;
import java.net.URL;
import java.net.URLConnection;
import java.nio.charset.StandardCharsets;
import java.util.Set;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.RejectedExecutionException;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
/**
 * Concurrent web crawler built on Java virtual threads (requires Java 21+).
 *
 * <p>Starting from a seed URL, each page is fetched in its own virtual thread. Links found on a
 * page are scheduled the same way until {@code MAX_DEPTH} is exceeded or the crawl window
 * closes. Save this file as WebCrawler.java.
 */
public class WebCrawler {

    /** URLs already claimed by a crawl task; a concurrent set because tasks race on {@code add}. */
    private static final Set<String> visitedUrls = ConcurrentHashMap.newKeySet();

    /**
     * Matches absolute http/https links made of word characters and {@code . / ? = & -}.
     * The backslashes are doubled because {@code \w} and {@code \d} are regex escapes, not
     * Java string escapes.
     */
    private static final Pattern URL_PATTERN = Pattern.compile("https?://[\\w\\d./?=&-]+");

    /** Deepest link level that is still crawled; the seed is depth 0. */
    private static final int MAX_DEPTH = 3;

    /** How long {@code main} lets tasks schedule follow-up work before closing the executor. */
    private static final long CRAWL_WINDOW_MILLIS = 10_000;

    /** Connect and read timeout applied to every page fetch. */
    private static final int NETWORK_TIMEOUT_MILLIS = 5_000;

    /**
     * Crawls from a fixed seed URL and prints the number of unique URLs claimed.
     *
     * @param args ignored
     */
    public static void main(String[] args) {
        String seedUrl = "https://wikipedia.org";
        System.out.println("🚀 Starting Advanced Web Crawler with Virtual Threads...");
        System.out.println("Seed URL: " + seedUrl + "\n");

        try (ExecutorService executor = Executors.newVirtualThreadPerTaskExecutor()) {
            crawl(seedUrl, 0, executor);
            // Time budget: while the main thread sleeps, running tasks can keep submitting the
            // links they discover. Leaving the try block closes the executor, which rejects
            // further submissions and waits for tasks that are already running.
            Thread.sleep(CRAWL_WINDOW_MILLIS);
        } catch (InterruptedException e) {
            System.err.println("Crawler interrupted: " + e.getMessage());
            // Restore the interrupt flag for anything further up the stack.
            Thread.currentThread().interrupt();
        }
        System.out.println("\n🏁 Crawling session finished. Total unique URLs cached: " + visitedUrls.size());
    }

    /**
     * Schedules an asynchronous fetch-and-parse of {@code url} unless it is too deep or has
     * already been claimed by another task.
     *
     * @param url      absolute URL to crawl
     * @param depth    link distance from the seed; nothing is done when it exceeds MAX_DEPTH
     * @param executor executor the page task is submitted to
     */
    private static void crawl(String url, int depth, ExecutorService executor) {
        if (depth > MAX_DEPTH) {
            return;
        }
        // Set.add is the atomic claim: exactly one caller sees true for a given URL, so no
        // separate contains() check is needed.
        if (!visitedUrls.add(url)) {
            return;
        }
        System.out.printf("[Depth %d] 🕸️ Submitting Task for: %s%n", depth, url);
        try {
            executor.submit(() -> {
                String html = fetchPageSource(url);
                if (!html.isEmpty()) {
                    parseAndCrawlLinks(html, depth, executor);
                }
            });
        } catch (RejectedExecutionException rejected) {
            // The executor is already shutting down (crawl window over). Release the claim so
            // the final count only reflects URLs that were actually scheduled.
            visitedUrls.remove(url);
        }
    }

    /**
     * Downloads the body of {@code rawUrl} as text.
     *
     * <p>The body is decoded as UTF-8 regardless of the charset the server declares, which is
     * sufficient for extracting ASCII links.
     *
     * @param rawUrl URL to fetch
     * @return the page text with lines joined by {@code '\n'}, or an empty string when the URL
     *         is malformed or the request fails or times out
     */
    private static String fetchPageSource(String rawUrl) {
        StringBuilder rawHtml = new StringBuilder();
        try {
            URL url = URI.create(rawUrl).toURL();
            URLConnection connection = url.openConnection();
            // Without explicit timeouts a dead host could block its task indefinitely.
            connection.setConnectTimeout(NETWORK_TIMEOUT_MILLIS);
            connection.setReadTimeout(NETWORK_TIMEOUT_MILLIS);
            try (BufferedReader reader = new BufferedReader(
                    new InputStreamReader(connection.getInputStream(), StandardCharsets.UTF_8))) {
                String line;
                while ((line = reader.readLine()) != null) {
                    rawHtml.append(line).append('\n');
                }
            }
        } catch (Exception ignored) {
            // Best effort by design: a dead, malformed or timed-out link yields no content
            // rather than aborting the crawl.
            return "";
        }
        return rawHtml.toString();
    }

    /**
     * Extracts every link matching URL_PATTERN from {@code html} and schedules each one a level
     * deeper than the page it was found on.
     *
     * @param html         page text to scan
     * @param currentDepth depth of the page {@code html} came from
     * @param executor     executor passed through to {@code crawl}
     */
    private static void parseAndCrawlLinks(String html, int currentDepth, ExecutorService executor) {
        Matcher matcher = URL_PATTERN.matcher(html);
        while (matcher.find()) {
            String extractedUrl = matcher.group();
            // crawl() de-duplicates and enforces the depth limit, so every match is forwarded.
            crawl(extractedUrl, currentDepth + 1, executor);
        }
    }
}