"""Generate the arXiv email crawler notebooks.

Builds three variants of the same crawler notebook (local Jupyter, Google Colab,
and Kaggle) from a shared set of cells and writes them to the notebooks/ directory.
"""

import os

import nbformat as nbf
def create_base_notebook():
    """Create the base notebook with common cells."""
    nb = nbf.v4.new_notebook()

    # Markdown cell for project description
    nb['cells'].append(nbf.v4.new_markdown_cell('''
# arXiv Email Crawler for AgentDomain Initiative

This notebook implements a system to harvest email addresses from AI and agent-related
research papers on arXiv. Its purpose is to invite researchers to join the
AgentDomain.xyz initiative and promote the .agent TLD.

**Workflow**:

1. Query the arXiv API for papers matching search terms
2. Store paper metadata in a SQLite database
3. Download and process PDFs for unprocessed papers
4. Extract email addresses from the PDFs
5. Update the database with the extracted emails
'''))

    return nb
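
# A quick way to inspect the base notebook before the platform cells are added
# (a sketch, assuming an interactive session): nbformat can serialize the
# NotebookNode back to its JSON form.
#
#   nb = create_base_notebook()
#   print(nbf.writes(nb))
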
def add_platform_setup(nb, platform='local'):
    """Add platform-specific setup cells."""
    if platform == 'colab':
        # Google Drive mount for Colab
        nb['cells'].append(nbf.v4.new_code_cell('''
from google.colab import drive
drive.mount('/content/drive')

# Change to the project directory
import os
os.chdir('/content/drive/MyDrive/arxiv_parser')
print("Current working directory:", os.getcwd())
'''))
    elif platform == 'kaggle':
        # Kaggle-specific setup
        nb['cells'].append(nbf.v4.new_code_cell('''
# Kaggle setup - notebooks run in /kaggle/working
import os
os.makedirs('data', exist_ok=True)
'''))

    # Common dependencies installation (psutil is needed by the memory-monitoring cell)
    nb['cells'].append(nbf.v4.new_code_cell('''
!pip install arxiv pdfplumber feedparser psutil
'''))
def add_common_code(nb):
    """Add the common code cells that are platform-independent."""
    # Imports and logging setup
    nb['cells'].append(nbf.v4.new_code_cell('''
import os
import sys
import time
import logging
import csv
import gc  # Garbage collection, used to keep memory in check
from typing import List, Dict, Any
from concurrent.futures import ThreadPoolExecutor, as_completed

# Configure logging
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
)
logger = logging.getLogger('arxiv_crawler')
'''))

    # Add utility functions
    nb['cells'].append(nbf.v4.new_code_cell('''
# ArXiv API utilities
import feedparser
import requests
import time
from typing import List, Dict, Any


def search_papers(query: str, start: int = 0, max_results: int = 10) -> List[Dict[str, Any]]:
    """
    Search arXiv for papers matching the query.

    Args:
        query: The search query
        start: Starting index
        max_results: Maximum number of results to return

    Returns:
        List of paper metadata dictionaries
    """
    base_url = 'http://export.arxiv.org/api/query?'
    search_query = f'{base_url}search_query={query}&start={start}&max_results={max_results}&sortBy=submittedDate&sortOrder=descending'
    response = feedparser.parse(search_query)

    papers = []
    for entry in response.entries:
        paper = {
            'arxiv_id': entry.id.split('/abs/')[-1],
            'title': entry.title,
            'authors': [author.name for author in entry.authors],
            'published_date': entry.published,
            'pdf_link': entry.link.replace('/abs/', '/pdf/'),
            'doi': entry.get('arxiv_doi'),
            'abstract': entry.summary,
            'processed': 0,
            'emails': []
        }
        papers.append(paper)
    return papers
'''))
    nb['cells'].append(nbf.v4.new_code_cell('''
# Database management
import os
import sqlite3
from typing import List, Dict, Any, Optional


def init_db():
    """Initialize the SQLite database with the papers table."""
    os.makedirs('data', exist_ok=True)  # SQLite cannot create the parent directory itself
    conn = sqlite3.connect('data/papers.db')
    cursor = conn.cursor()
    cursor.execute("""
        CREATE TABLE IF NOT EXISTS papers (
            arxiv_id TEXT PRIMARY KEY,
            title TEXT,
            authors TEXT,
            published_date TEXT,
            pdf_link TEXT,
            doi TEXT,
            abstract TEXT,
            emails TEXT,
            processed INTEGER DEFAULT 0
        )
    """)
    conn.commit()
    conn.close()


def paper_exists(arxiv_id: str) -> bool:
    """Check if a paper exists in the database."""
    conn = sqlite3.connect('data/papers.db')
    cursor = conn.cursor()
    cursor.execute('SELECT 1 FROM papers WHERE arxiv_id = ?', (arxiv_id,))
    exists = cursor.fetchone() is not None
    conn.close()
    return exists


def insert_paper(paper: Dict[str, Any]):
    """Insert a paper into the database."""
    conn = sqlite3.connect('data/papers.db')
    cursor = conn.cursor()
    cursor.execute("""
        INSERT INTO papers (arxiv_id, title, authors, published_date, pdf_link, doi, abstract, emails, processed)
        VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)
    """, (
        paper['arxiv_id'],
        paper['title'],
        ', '.join(paper['authors']),
        paper['published_date'],
        paper['pdf_link'],
        paper['doi'],
        paper.get('abstract', ''),
        ', '.join(paper.get('emails', [])),
        paper.get('processed', 0)
    ))
    conn.commit()
    conn.close()


def get_unprocessed_papers() -> List[Dict[str, Any]]:
    """Get all unprocessed papers from the database."""
    conn = sqlite3.connect('data/papers.db')
    cursor = conn.cursor()
    cursor.execute('SELECT * FROM papers WHERE processed = 0')
    papers = []
    for row in cursor.fetchall():
        papers.append({
            'arxiv_id': row[0],
            'title': row[1],
            'authors': row[2].split(', '),
            'published_date': row[3],
            'pdf_link': row[4],
            'doi': row[5],
            'abstract': row[6],
            'emails': row[7].split(', ') if row[7] else [],
            'processed': row[8]
        })
    conn.close()
    return papers


def get_all_papers() -> List[Dict[str, Any]]:
    """Get all papers from the database."""
    conn = sqlite3.connect('data/papers.db')
    cursor = conn.cursor()
    cursor.execute('SELECT * FROM papers')
    papers = []
    for row in cursor.fetchall():
        papers.append({
            'arxiv_id': row[0],
            'title': row[1],
            'authors': row[2].split(', '),
            'published_date': row[3],
            'pdf_link': row[4],
            'doi': row[5],
            'abstract': row[6],
            'emails': row[7].split(', ') if row[7] else [],
            'processed': row[8]
        })
    conn.close()
    return papers


def update_paper_emails(arxiv_id: str, emails: List[str]):
    """Update the emails for a paper and mark it as processed."""
    conn = sqlite3.connect('data/papers.db')
    cursor = conn.cursor()
    cursor.execute("""
        UPDATE papers
        SET emails = ?, processed = 1
        WHERE arxiv_id = ?
    """, (', '.join(emails), arxiv_id))
    conn.commit()
    conn.close()
'''))
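
    # The cell above gives the notebooks their SQLite layer; once a run has populated
    # data/papers.db, the harvested addresses can be inspected directly (a sketch,
    # assuming the sqlite3 CLI is available):
    #
    #   $ sqlite3 data/papers.db "SELECT arxiv_id, emails FROM papers WHERE processed = 1 LIMIT 5;"
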
    nb['cells'].append(nbf.v4.new_code_cell('''
# PDF handling and email extraction
import pdfplumber
import re
import requests
import tempfile
import os
import gc
from typing import List, Tuple, Optional


def download_and_extract_text(pdf_url: str) -> Tuple[bool, Optional[str]]:
    """
    Download a PDF and extract its text content.

    Args:
        pdf_url: URL of the PDF to download

    Returns:
        Tuple of (success, text_content)
    """
    try:
        # Download the PDF
        response = requests.get(pdf_url)
        if response.status_code != 200:
            return False, None

        # Write it to a temporary file
        with tempfile.NamedTemporaryFile(suffix='.pdf', delete=False) as temp_pdf:
            temp_pdf.write(response.content)
            temp_pdf_path = temp_pdf.name

        try:
            # Extract text from the PDF
            text = ""
            with pdfplumber.open(temp_pdf_path) as pdf:
                for page in pdf.pages:
                    text += page.extract_text() or ""
                    # Force garbage collection after each page
                    gc.collect()
            return True, text
        finally:
            # Clean up the temporary file
            os.unlink(temp_pdf_path)
    except Exception as e:
        print(f"Error processing PDF: {str(e)}")
        return False, None


def extract_and_clean_emails(text: str) -> List[str]:
    """
    Extract and clean email addresses from text.

    Args:
        text: Text to extract emails from

    Returns:
        List of cleaned email addresses
    """
    # Basic email regex pattern
    email_pattern = r'[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\\.[a-zA-Z]{2,}'

    # Find all email addresses
    emails = re.findall(email_pattern, text)

    # Clean and normalize emails
    cleaned_emails = []
    for email in emails:
        # Convert to lowercase
        email = email.lower()
        # Remove any trailing punctuation
        email = re.sub(r'[.,;]$', '', email)
        # Add to list if not already present
        if email not in cleaned_emails:
            cleaned_emails.append(email)
    return cleaned_emails
'''))
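
    # Worked example (not part of the generated cell above, shown for reference):
    # the pattern tolerates surrounding prose and trailing punctuation, e.g.
    #   extract_and_clean_emails("Contact: Jane.Doe@Example.edu.")  ->  ['jane.doe@example.edu']
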
    # Main functions
    nb['cells'].append(nbf.v4.new_code_cell('''
def process_paper(paper: Dict[str, Any]) -> bool:
    """
    Process a paper by downloading its PDF, extracting text, and finding email addresses.
    Memory-optimized version with garbage collection.
    """
    arxiv_id = paper['arxiv_id']
    pdf_link = paper['pdf_link']
    logger.info(f"Processing paper: {paper['title']} (ID: {arxiv_id})")
    try:
        # Force garbage collection before processing
        gc.collect()

        # Download and extract text from the PDF
        success, text = download_and_extract_text(pdf_link)
        if not success or not text:
            logger.warning(f"Failed to download or extract text from PDF for paper {arxiv_id}")
            return False

        # Extract email addresses from the text
        emails = extract_and_clean_emails(text)

        # Clear the text variable to free memory
        text = None
        gc.collect()

        logger.info(f"Found {len(emails)} email addresses in paper {arxiv_id}")

        # Update the database with the extracted emails
        update_paper_emails(arxiv_id, emails)

        # Clear the emails list to free memory
        emails = None
        gc.collect()

        logger.info(f"Successfully processed paper {arxiv_id}")
        return True
    except Exception as e:
        logger.error(f"Error processing paper {arxiv_id}: {str(e)}")
        return False
    finally:
        # Final garbage collection
        gc.collect()


def query_and_store_papers(search_query: str, start: int = 0, max_results: int = 10) -> List[Dict[str, Any]]:
    """
    Query arXiv for papers and store them in the database.
    """
    logger.info(f"Querying arXiv with search query: {search_query}")

    # Search for papers
    papers = search_papers(search_query, start, max_results)
    logger.info(f"Found {len(papers)} papers matching the query.")

    # Store papers in the database
    new_papers = []
    for paper in papers:
        if not paper_exists(paper['arxiv_id']):
            insert_paper(paper)
            new_papers.append(paper)
            logger.info(f"Added paper to database: {paper['title']} (ID: {paper['arxiv_id']})")
        else:
            logger.info(f"Paper already exists in database: {paper['arxiv_id']}")

    logger.info(f"Added {len(new_papers)} new papers to the database.")
    return new_papers


def export_results():
    """Export the results to CSV and text files."""
    # Get all papers from the database
    all_papers = get_all_papers()
    processed_papers = [paper for paper in all_papers if paper['processed'] == 1]

    # Count the total number of unique emails
    all_emails = []
    for paper in processed_papers:
        if paper['emails']:
            all_emails.extend(paper['emails'])
    unique_emails = list(set(all_emails))

    # Get papers with emails
    papers_with_emails = [paper for paper in processed_papers if paper['emails'] and len(paper['emails']) > 0]

    # Export papers with emails to a CSV file
    csv_path = os.path.join('data', 'papers_with_emails.csv')
    with open(csv_path, 'w', newline='', encoding='utf-8') as f:
        writer = csv.writer(f)
        writer.writerow(['arxiv_id', 'title', 'authors', 'published_date', 'pdf_link', 'doi', 'emails'])
        for paper in papers_with_emails:
            writer.writerow([
                paper['arxiv_id'],
                paper['title'],
                ', '.join(paper['authors']),
                paper['published_date'],
                paper['pdf_link'],
                paper['doi'] or '',
                ', '.join(paper['emails'])
            ])
    logger.info(f"Exported {len(papers_with_emails)} papers with emails to {csv_path}")

    # Export unique emails to a text file
    emails_path = os.path.join('data', 'unique_emails.txt')
    with open(emails_path, 'w', encoding='utf-8') as f:
        for email in unique_emails:
            f.write(email + '\\n')
    logger.info(f"Exported {len(unique_emails)} unique emails to {emails_path}")
'''))
    # Example usage cell with memory optimization
    nb['cells'].append(nbf.v4.new_code_cell('''
import psutil
import json
import os


def get_memory_usage():
    """Get current memory usage in GB"""
    process = psutil.Process(os.getpid())
    memory_gb = process.memory_info().rss / 1024 / 1024 / 1024
    return memory_gb


def save_checkpoint(query_index, paper_index, search_queries):
    """Save progress to a checkpoint file"""
    checkpoint = {
        'query_index': query_index,
        'paper_index': paper_index,
        'last_query': search_queries[query_index] if query_index < len(search_queries) else None,
        'timestamp': time.strftime("%Y-%m-%d %H:%M:%S")
    }
    with open('data/checkpoint.json', 'w') as f:
        json.dump(checkpoint, f)
    logger.info(f"Saved checkpoint: Query {query_index}, Paper {paper_index}")


def load_checkpoint():
    """Load progress from the checkpoint file"""
    try:
        with open('data/checkpoint.json', 'r') as f:
            return json.load(f)
    except FileNotFoundError:
        return None


def process_papers_in_batches(papers, batch_size=5, memory_threshold_gb=10):
    """Process papers in batches with memory monitoring"""
    total_papers = len(papers)

    # Load checkpoint if it exists
    checkpoint = load_checkpoint()
    start_index = checkpoint['paper_index'] if checkpoint else 0

    for i in range(start_index, total_papers, batch_size):
        batch = papers[i:i + batch_size]
        logger.info(f"Processing batch {i//batch_size + 1}/{(total_papers + batch_size - 1)//batch_size}")

        for j, paper in enumerate(batch):
            current_index = i + j
            logger.info(f"Processing paper {current_index + 1}/{total_papers}: {paper['title']}")

            # Check memory usage before processing
            memory_usage = get_memory_usage()
            if memory_usage > memory_threshold_gb:
                logger.warning(f"Memory usage high ({memory_usage:.2f}GB). Forcing garbage collection.")
                gc.collect()
                time.sleep(5)  # Give the system time to free memory

            # Process the paper
            process_paper(paper)

            # Save checkpoint after each paper
            save_checkpoint(0, current_index, [])  # We'll update this for query tracking

            # Sleep between papers to respect arXiv's rate limits
            if current_index < total_papers - 1:
                time.sleep(20)

        # Force garbage collection after each batch
        gc.collect()
        logger.info(f"Batch complete. Memory usage: {get_memory_usage():.2f}GB")
        time.sleep(5)  # Brief pause between batches


def query_arxiv_with_resume(search_queries, start_index=0, max_results=1000, batch_size=50):
    """Query arXiv with resume capability and batch processing"""
    all_new_papers = []

    # Load checkpoint if it exists
    checkpoint = load_checkpoint()
    query_start = checkpoint['query_index'] if checkpoint else 0

    for i, query in enumerate(search_queries[query_start:], start=query_start):
        logger.info(f"Processing query {i+1}/{len(search_queries)}: {query}")

        # Process the query in batches
        for start in range(start_index, max_results, batch_size):
            try:
                # Check memory before querying
                if get_memory_usage() > 10:
                    gc.collect()
                    time.sleep(5)

                new_papers = query_and_store_papers(query, start=start, max_results=batch_size)
                all_new_papers.extend(new_papers)

                # Save checkpoint after each batch
                save_checkpoint(i, start + batch_size, search_queries)

                # If no new papers were found, move to the next query
                if not new_papers:
                    logger.info(f"No more papers found for query: {query}")
                    break

                # Sleep to respect arXiv's rate limits
                time.sleep(3)
                gc.collect()
            except Exception as e:
                logger.error(f"Error processing batch for query {query} at start={start}: {str(e)}")
                # Save checkpoint so we can resume from here
                save_checkpoint(i, start, search_queries)
                raise
    return all_new_papers


# Initialize the database
init_db()
logger.info("Database initialized.")

# Comprehensive search queries for AI and agent-related papers
search_queries = [
    # Core AI and agent queries
    "cat:cs.AI AND all:agent",
    "cat:cs.AI AND all:LLM",
    "cat:cs.AI AND all:GPT",
    "cat:cs.AI AND all:transformer",
    "cat:cs.AI AND all:reinforcement learning",
    # Multi-agent systems
    "cat:cs.MA",  # Multi-agent systems category
    "cat:cs.MA AND all:cooperation",
    "cat:cs.MA AND all:coordination",
    "cat:cs.MA AND all:negotiation",
    # Machine learning and deep learning
    "cat:cs.LG AND all:agent",
    "cat:cs.LG AND all:multi-agent",
    "cat:cs.LG AND all:reinforcement",
    "cat:cs.LG AND all:transformer",
    # Natural language processing
    "cat:cs.CL AND all:LLM",
    "cat:cs.CL AND all:GPT",
    "cat:cs.CL AND all:transformer",
    # Robotics and automation
    "cat:cs.RO AND all:agent",
    "cat:cs.RO AND all:autonomous",
    "cat:cs.RO AND all:multi-robot",
    # Specific research areas
    'all:"multi agent" AND all:learning',
    'all:"autonomous agent" AND all:decision',
    'all:"agent based" AND all:simulation',
    'all:"distributed AI" AND all:coordination',
    # Emerging topics
    'all:"foundation model" AND all:agent',
    'all:"large language model" AND all:agent',
    'all:"autonomous AI" AND all:system'
]


def main():
    """Main execution function"""
    try:
        # Query arXiv with resume capability
        all_new_papers = query_arxiv_with_resume(
            search_queries,
            start_index=0,
            max_results=1000,
            batch_size=50
        )
        logger.info(f"Total new papers added: {len(all_new_papers)}")

        # Get and process unprocessed papers
        unprocessed_papers = get_unprocessed_papers()
        logger.info(f"Found {len(unprocessed_papers)} unprocessed papers in the database.")

        # Process papers in batches with memory monitoring
        process_papers_in_batches(
            unprocessed_papers,
            batch_size=5,
            memory_threshold_gb=10
        )

        # Export results
        export_results()

        # Clear the checkpoint after successful completion
        if os.path.exists('data/checkpoint.json'):
            os.remove('data/checkpoint.json')
            logger.info("Cleared checkpoint file after successful completion")
    except Exception as e:
        logger.error(f"Process interrupted: {str(e)}")
        logger.info("You can resume from the last checkpoint by running the script again")


# Execute the main function automatically when running in Colab
if 'google.colab' in str(get_ipython()):
    print("Running in Google Colab - Starting paper processing...")
    main()
else:
    print("Not running in Colab - Paper processing ready to start")
'''))
def generate_notebooks():
    """Generate all three versions of the notebook."""
    # Create the notebooks directory if it doesn't exist
    os.makedirs('notebooks', exist_ok=True)
    os.makedirs('data', exist_ok=True)  # Also ensure the data directory exists

    # Generate the local Jupyter notebook
    nb_local = create_base_notebook()
    add_platform_setup(nb_local, 'local')
    add_common_code(nb_local)
    with open('notebooks/arxiv_email_crawler.ipynb', 'w') as f:
        nbf.write(nb_local, f)

    # Generate the Google Colab notebook
    nb_colab = create_base_notebook()
    add_platform_setup(nb_colab, 'colab')
    add_common_code(nb_colab)
    with open('notebooks/arxiv_email_crawler_colab.ipynb', 'w') as f:
        nbf.write(nb_colab, f)

    # Generate the Kaggle notebook
    nb_kaggle = create_base_notebook()
    add_platform_setup(nb_kaggle, 'kaggle')
    add_common_code(nb_kaggle)
    with open('notebooks/arxiv_email_crawler_kaggle.ipynb', 'w') as f:
        nbf.write(nb_kaggle, f)


if __name__ == "__main__":
    generate_notebooks()
    print("Generated all notebook versions in the notebooks/ directory:")
    print("  - arxiv_email_crawler.ipynb (local Jupyter)")
    print("  - arxiv_email_crawler_colab.ipynb (Google Colab)")
    print("  - arxiv_email_crawler_kaggle.ipynb (Kaggle)")