Enhance capture_page function with improved error handling and page load management

npv2k1 · npv2k1 · commit 1caefe492719 · 2025-02-23T16:50:27.000+07:00
diff --git a/src/modules/apps/__init__.py b/src/modules/apps/__init__.py
@@ -1,13 +1,16 @@
 import gradio as gr
 import tempfile
 import os
+import time
+import traceback
 from selenium import webdriver
 from selenium.webdriver.chrome.options import Options
 from selenium.webdriver.chrome.service import Service
 from selenium.webdriver.support.ui import WebDriverWait
 from selenium.webdriver.support import expected_conditions as EC
+from selenium.common.exceptions import TimeoutException
 from subprocess import PIPE, STDOUT
-import traceback
+import psutil
 
 print("Gradio app loaded.")
 
@@ -20,67 +23,65 @@ def capture_page(url: str, output_file: str = "screenshot.png"):
     """
     options = Options()
     
-    # Basic options
-    options.add_argument('--headless=new')  # New headless mode
-    options.add_argument('--no-sandbox')  # Required in Docker
-    options.add_argument('--disable-dev-shm-usage')  # Required in Docker
-    
-    # Performance and stability options
-    options.add_argument('--disable-gpu')  # Required in Docker
+    # Use new headless mode and basic options for Docker
+    options.add_argument('--headless=new')
+    options.add_argument('--no-sandbox')
+    options.add_argument('--disable-dev-shm-usage')
+    options.add_argument('--disable-gpu')
     options.add_argument('--disable-software-rasterizer')
     options.add_argument('--disable-extensions')
     options.add_argument('--disable-infobars')
-    
-    # Resource configuration
     options.add_argument('--window-size=1920,1080')
     options.add_argument('--disable-features=NetworkService,NetworkServiceInProcess')
     options.add_argument('--disable-features=site-per-process')
-    
-    # Memory and process settings
-    options.add_argument('--single-process')  # Run in single process mode
+    options.add_argument('--single-process')
     options.add_argument('--memory-pressure-off')
     options.add_argument('--disable-crash-reporter')
-    options.add_argument('--disable-breakpad')  # Disable crash reporting
-    
-    # Additional stability options
+    options.add_argument('--disable-breakpad')
     options.add_argument('--ignore-certificate-errors')
     options.add_argument('--disable-setuid-sandbox')
     options.add_argument('--disable-web-security')
-    
-    # Set specific shared memory /dev/shm size (if needed)
-    options.add_argument('--disable-dev-shm-usage')
     options.add_argument('--shm-size=2g')
     
-    # Set up Chrome service with explicit path to chromedriver and logging
+    # Set page load strategy to 'none' to avoid waiting indefinitely
+    options.page_load_strategy = "none"
+    
+    # Set up Chrome service (ensure chromedriver is in your PATH)
     service = Service(
-        # executable_path='/usr/local/bin/chromedriver',
-        log_output=PIPE,  # Redirect logs to pipe
-        service_args=['--verbose']  # Enable verbose logging
+        log_output=PIPE,
+        service_args=['--verbose']
     )
     
+    driver = None
     try:
         print("Initializing Chrome...")
-        driver = webdriver.Chrome(
-            service=service,
-            options=options
-        )
-        
+        driver = webdriver.Chrome(service=service, options=options)
+        if not driver:
+            raise Exception("Failed to initialize Chrome driver")
+            
         print("Chrome initialized successfully")
+        driver.implicitly_wait(5)
         
         try:
-            print(f"Navigating to URL: {url}")
-            driver.get(url)
+            # Set a 30-second timeout for page load
+            driver.set_page_load_timeout(30)
+            try:
+                print(f"Navigating to URL: {url}")
+                driver.get(url)
+            except TimeoutException:
+                print("Page load timed out. Proceeding with screenshot capture...")
             
-            # Wait for page load
-            print("Waiting for page to load...")
-            driver.implicitly_wait(10)  # Increased wait time
+            # Wait for the document ready state to be 'interactive' or 'complete'
+            try:
+                print("Waiting for document ready state...")
+                WebDriverWait(driver, 30).until(
+                    lambda d: d.execute_script('return document.readyState') in ["interactive", "complete"]
+                )
+            except TimeoutException:
+                print("Document did not reach ready state within timeout, proceeding anyway.")
             
-            # Additional wait for dynamic content
-            from selenium.webdriver.support.ui import WebDriverWait
-            from selenium.webdriver.support import expected_conditions as EC
-            # WebDriverWait(driver, 10).until(
-            #     lambda d: d.execute_script('return document.readyState') == 'complete'
-            # )
+            # Additional short delay to allow dynamic content to settle
+            time.sleep(2)
             
             print("Taking screenshot...")
             driver.save_screenshot(output_file)
@@ -92,18 +93,22 @@ def capture_page(url: str, output_file: str = "screenshot.png"):
             raise
         finally:
             print("Closing Chrome...")
-            try:
-                driver.close()  # Close current window
-                driver.quit()   # Quit browser completely
-                import psutil   # For process cleanup
-                current_pid = os.getpid()
-                current_process = psutil.Process(current_pid)
-                children = current_process.children(recursive=True)
-                for child in children:
-                    if 'chrome' in child.name().lower():
-                        child.terminate()
-            except Exception as cleanup_error:
-                print(f"Error during cleanup: {cleanup_error}")
+            # Wrap cleanup in a try/except to prevent errors if the session is already closed
+            if driver:
+                try:
+                    driver.quit()
+                except Exception as cleanup_error:
+                    print(f"Error during driver.quit(): {cleanup_error}")
+                
+                # Optionally clean up any lingering Chrome processes
+                try:
+                    current_pid = os.getpid()
+                    current_process = psutil.Process(current_pid)
+                    for child in current_process.children(recursive=True):
+                        if 'chrome' in child.name().lower():
+                            child.terminate()
+                except Exception as psutil_error:
+                    print(f"Error during process cleanup: {psutil_error}")
             
     except Exception as e:
         print(f"Error initializing Chrome: {str(e)}")
@@ -112,33 +117,24 @@ def capture_page(url: str, output_file: str = "screenshot.png"):
 def capture_and_show(url: str):
     """Capture webpage and return the image"""
     try:
-        # Get the temporary directory path (defaulting to /tmp if TMPDIR is not set)
         temp_dir = os.getenv('TMPDIR', '/tmp')
-        
         try:
-            # Ensure temp directory exists and has correct permissions
             os.makedirs(temp_dir, mode=0o777, exist_ok=True)
             print(f"Using temp directory: {temp_dir}")
-            
-            # Verify directory is writable
             if not os.access(temp_dir, os.W_OK):
                 print(f"Warning: Temp directory {temp_dir} is not writable")
-                # Try to create a user-specific temp directory instead
                 temp_dir = os.path.join('/tmp', f'chrome_screenshots_{os.getuid()}')
                 os.makedirs(temp_dir, mode=0o777, exist_ok=True)
                 print(f"Created user-specific temp directory: {temp_dir}")
                 
-            # Create temporary file in the specified directory
             temp_path = os.path.join(temp_dir, f"screenshot_{os.urandom(8).hex()}.png")
             print(f"Temp file path: {temp_path}")
             
-            # Capture the webpage
             success = capture_page(url, temp_path)
             if not success:
                 print("Screenshot capture returned False")
                 return None
-            
-            # Verify file was created
+                
             if not os.path.exists(temp_path):
                 print("Screenshot file was not created")
                 return None
@@ -183,7 +179,6 @@ def create_gradio_app():
             
         def capture_with_error(url):
             try:
-                # Basic URL validation
                 if not url:
                     return None, gr.update(visible=True, value="Please enter a URL")
                 if not url.startswith(('http://', 'https://')):
@@ -196,7 +191,6 @@ def capture_with_error(url):
             except Exception as e:
                 return None, gr.update(visible=True, value=f"Error: {str(e)}")
             
-        # Connect the components
         capture_btn.click(
             fn=capture_with_error,
             inputs=[url_input],
@@ -207,21 +201,19 @@ def capture_with_error(url):
 
 app = create_gradio_app()
 
-# Configure server settings for Docker deployment
-server_port = 7860  # Standard Gradio port
-server_name = "0.0.0.0"  # Allow external connections
+server_port = 7860
+server_name = "0.0.0.0"
 
 def main():
-    """Launch the Gradio application"""
     print("Starting Gradio server...")
     app.launch(
         server_name=server_name,
         server_port=server_port,
-        share=False,  # Disable sharing as we're running in Docker
-        auth=None,    # Can be configured if authentication is needed
-        ssl_verify=False,  # Disable SSL verification for internal Docker network
+        share=False,
+        auth=None,
+        ssl_verify=False,
         show_error=True,
         favicon_path=None
     )
 
-main()
+main()