Skip to content

cds #306

@siddharthrgade21-a11y

Description

@siddharthrgade21-a11y

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.net.URI;
import java.net.URL;
import java.net.URLConnection;
import java.nio.charset.StandardCharsets;
import java.util.List;
import java.util.Set;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.CountDownLatch;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.RejectedExecutionException;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * Concurrent web crawler that runs every page fetch on its own virtual thread.
 *
 * <p>Starting from a seed URL, each fetched page is scanned for absolute
 * {@code http}/{@code https} links, and every link not seen before is scheduled
 * as a new task until {@link #MAX_DEPTH} is reached. The crawl ends as soon as
 * no task is pending, or when the time budget expires, whichever comes first.
 *
 * <p>Save this file as {@code WebCrawler.java}.
 */
public class WebCrawler {

    /** Thread-safe set of every URL already scheduled; prevents revisits and link cycles. */
    private static final Set<String> visitedUrls = ConcurrentHashMap.newKeySet();

    /**
     * Matches absolute HTTP/HTTPS links in raw HTML. The backslash must be doubled in a
     * Java string literal so that the regex engine receives {@code \w}.
     */
    private static final Pattern URL_PATTERN = Pattern.compile("https?://[\\w./?=&-]+");

    /** Deepest link level (the seed is level 0) that is still fetched. */
    private static final int MAX_DEPTH = 3;

    /** Upper bound on the wall-clock duration of one crawl session. */
    private static final long CRAWL_BUDGET_SECONDS = 10;

    /** Connect and read timeout per request, so a stalled server cannot block a task forever. */
    private static final int NETWORK_TIMEOUT_MILLIS = 5_000;

    /** Number of tasks that were scheduled and have not finished yet. */
    private static final AtomicInteger pendingTasks = new AtomicInteger();

    /** Released when {@link #pendingTasks} drops back to zero. */
    private static final CountDownLatch crawlFinished = new CountDownLatch(1);

    /**
     * Crawls from a fixed seed URL and prints the number of unique URLs that were scheduled.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        String seedUrl = "https://wikipedia.org";
        System.out.println("🚀 Starting Advanced Web Crawler with Virtual Threads...");
        System.out.println("Seed URL: " + seedUrl + "\n");

        // Closing the executor waits for tasks that are still running.
        try (ExecutorService executor = Executors.newVirtualThreadPerTaskExecutor()) {
            crawl(seedUrl, 0, executor);
            try {
                // Return early when the crawl drains instead of always sleeping the full budget.
                if (!crawlFinished.await(CRAWL_BUDGET_SECONDS, TimeUnit.SECONDS)) {
                    System.out.println("Time budget of " + CRAWL_BUDGET_SECONDS
                            + "s reached; stopping remaining tasks.");
                }
            } finally {
                // Interrupt in-flight fetches so the implicit close() does not wait on them.
                executor.shutdownNow();
            }
        } catch (InterruptedException e) {
            System.err.println("Crawler interrupted: " + e.getMessage());
            Thread.currentThread().interrupt();
        }

        System.out.println("\n🏁 Crawling session finished. Total unique URLs cached: " + visitedUrls.size());
    }

    /**
     * Schedules an asynchronous fetch-and-parse task for {@code url} unless it is too deep
     * or was already scheduled.
     *
     * @param url      absolute URL to crawl
     * @param depth    link distance from the seed URL
     * @param executor executor that runs the fetch task
     */
    private static void crawl(String url, int depth, ExecutorService executor) {
        // Set.add is the atomic check-and-act; a separate contains() call adds nothing.
        if (depth > MAX_DEPTH || !visitedUrls.add(url)) {
            return;
        }

        // Count the task before submitting so the counter cannot reach zero while the
        // scheduling parent task is still running.
        pendingTasks.incrementAndGet();
        try {
            executor.submit(() -> {
                try {
                    String htmlContent = fetchPageSource(url);
                    if (!htmlContent.isEmpty()) {
                        parseAndCrawlLinks(htmlContent, depth, executor);
                    }
                } finally {
                    taskFinished();
                }
            });
            System.out.printf("[Depth %d] 🕸️ Submitting Task for: %s%n", depth, url);
        } catch (RejectedExecutionException ignored) {
            // The executor is shutting down: this URL was never fetched, so roll back the
            // bookkeeping instead of reporting it as visited.
            visitedUrls.remove(url);
            taskFinished();
        }
    }

    /** Marks one task as done and releases the main thread once no task is pending. */
    private static void taskFinished() {
        if (pendingTasks.decrementAndGet() == 0) {
            crawlFinished.countDown();
        }
    }

    /**
     * Downloads the body of {@code rawUrl} as text, decoded as UTF-8.
     *
     * @param rawUrl URL to download
     * @return the page text with lines joined by {@code "\n"}, or an empty string when the
     *         URL is malformed or the download fails or times out
     */
    private static String fetchPageSource(String rawUrl) {
        try {
            URL url = URI.create(rawUrl).toURL();
            URLConnection connection = url.openConnection();
            connection.setConnectTimeout(NETWORK_TIMEOUT_MILLIS);
            connection.setReadTimeout(NETWORK_TIMEOUT_MILLIS);

            StringBuilder rawHtml = new StringBuilder();
            try (BufferedReader reader = new BufferedReader(
                    new InputStreamReader(connection.getInputStream(), StandardCharsets.UTF_8))) {
                String line;
                while ((line = reader.readLine()) != null) {
                    rawHtml.append(line).append("\n");
                }
            }
            return rawHtml.toString();
        } catch (IOException | IllegalArgumentException ignored) {
            // Best effort: dead, malformed or timed-out links are skipped on purpose.
            return "";
        }
    }

    /**
     * Returns every HTTP/HTTPS link found in {@code html}, in order of appearance.
     * Duplicates are kept; de-duplication happens when a link is scheduled.
     *
     * @param html raw page text; may be {@code null}
     * @return the matched links, or an empty list when there are none
     */
    static List<String> extractLinks(String html) {
        if (html == null || html.isEmpty()) {
            return List.of();
        }
        Matcher matcher = URL_PATTERN.matcher(html);
        return matcher.results().map(match -> match.group()).toList();
    }

    /**
     * Extracts the links of a fetched page and schedules each one a level deeper.
     *
     * @param html         raw page text
     * @param currentDepth depth of the page the text came from
     * @param executor     executor that runs the child tasks
     */
    private static void parseAndCrawlLinks(String html, int currentDepth, ExecutorService executor) {
        // Every child of a page at MAX_DEPTH would be rejected by crawl(); skip the scan.
        if (currentDepth >= MAX_DEPTH) {
            return;
        }
        for (String extractedUrl : extractLinks(html)) {
            if (executor.isShutdown()) {
                // The session is over; scheduling more links would only be rejected.
                return;
            }
            crawl(extractedUrl, currentDepth + 1, executor);
        }
    }
}

Metadata

Metadata

Assignees

No one assigned

    Labels

    No labels
    No labels

    Type

    No type
    No fields configured for issues without a type.

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions