chore: add static analysis for Python templates (#316)

Add static analysis (Ruff, Mypy) for Python templates. Edit: Mypy will require further updates due to our usage of `src/`: ``` $ make type-check poetry run mypy templates/python-crawlee-beautifulsoup/src/__init__.py: error: Duplicate module named "src" (also at "templates/python-beautifulsoup/src/__init__.py") templates/python-crawlee-beautifulsoup/src/__init__.py: note: See https://mypy.readthedocs.io/en/stable/running_mypy.html#mapping-file-paths-to-modules for more info templates/python-crawlee-beautifulsoup/src/__init__.py: note: Common resolutions include: a) using `--exclude` to avoid checking one of them, b) adding `__init__.py` somewhere, c) using `--explicit-package-bases` or adjusting MYPYPATH Found 1 error in 1 file (errors prevented further checking) make: *** [Makefile:16: type-check] Error 2 ``` This will be addressed later in [#317](#317 (comment)).
apify · Feb 18, 2025 · d09f9f9 · d09f9f9
1 parent 4b869fc
commit d09f9f9
Show file tree

Hide file tree

Showing 34 changed files with 3,454 additions and 23 deletions.
diff --git a/.github/workflows/lint_and_test.yaml b/.github/workflows/lint_and_test.yaml
@@ -8,6 +8,16 @@ on:
     - cron: '0 0 * * *'
 
 jobs:
+  python_lint_check:
+    name: Python lint check
+    uses: apify/workflows/.github/workflows/python_lint_check.yaml@main
+
+  # TODO: make Mypy work
+  # https://github.com/apify/actor-templates/issues/317
+  # python_type_check:
+  #   name: Python type check
+  #   uses: apify/workflows/.github/workflows/python_type_check.yaml@main
+
   lint_and_test:
     name: Lint and test (without templates)
     runs-on: ubuntu-latest

diff --git a/.gitignore b/.gitignore
@@ -17,6 +17,6 @@ __pycache__
 .mypy_cache
 .pytest_cache
 .ruff_cache
-pyproject.toml
 .venv
 venv
+poetry.toml
diff --git a/Makefile b/Makefile
@@ -0,0 +1,22 @@
+# This is used by the Github Actions to run the static analysis.
+
+.PHONY: clean install-dev lint type-check format check-code
+
+clean:
+	rm -rf .mypy_cache .pytest_cache .ruff_cache build dist htmlcov .coverage
+
+install-dev:
+	poetry install --all-extras
+
+lint:
+	poetry run ruff format --check
+	poetry run ruff check
+
+type-check:
+	poetry run mypy
+
+format:
+	poetry run ruff check --fix
+	poetry run ruff format
+
+check-code: lint type-check
diff --git a/poetry.lock b/poetry.lock
diff --git a/pyproject.toml b/pyproject.toml
@@ -0,0 +1,95 @@
+[build-system]
+requires = ["poetry-core"]
+build-backend = "poetry.core.masonry.api"
+
+[tool.poetry]
+package-mode = false
+
+[tool.poetry.dependencies]
+python = "^3.9"
+apify = { extras = ["scrapy"], version = "<3.0.0" }
+beautifulsoup4 = "<5.0.0"
+crawlee = { extras = ["beautifulsoup", "playwright"], version = "<0.6.0" }
+nest-asyncio = "<2.0.0"
+playwright = "<2.0.0"
+scrapy = "<3.0.0"
+selenium = "<5.0.0"
+
+[tool.poetry.group.dev.dependencies]
+mypy = "~1.15.0"
+ruff = "~0.9.0"
+types-beautifulsoup4 = "~4.12.0.20250204"
+
+[tool.ruff]
+line-length = 120
+include = ["templates/**/*.py"]
+
+[tool.ruff.lint]
+select = ["ALL"]
+ignore = [
+    "ANN401",   # Dynamically typed expressions (typing.Any) are disallowed in {filename}
+    "BLE001",   # Do not catch blind exception
+    "C901",     # `{name}` is too complex
+    "COM812",   # This rule may cause conflicts when used with the formatter
+    "D100",     # Missing docstring in public module
+    "D104",     # Missing docstring in public package
+    "D107",     # Missing docstring in `__init__`
+    "EM",       # flake8-errmsg
+    "G004",     # Logging statement uses f-string
+    "ISC001",   # This rule may cause conflicts when used with the formatter
+    "FIX",      # flake8-fixme
+    "TRY003",   # Avoid specifying long messages outside the exception class
+]
+
+[tool.ruff.format]
+quote-style = "single"
+indent-style = "space"
+
+[tool.ruff.lint.per-file-ignores]
+"**/__init__.py" = [
+    "F401", # Unused imports
+]
+
+[tool.ruff.lint.flake8-quotes]
+docstring-quotes = "double"
+inline-quotes = "single"
+
+[tool.ruff.lint.flake8-type-checking]
+runtime-evaluated-base-classes = [
+    "pydantic.BaseModel",
+    "pydantic_settings.BaseSettings",
+]
+
+[tool.ruff.lint.flake8-builtins]
+builtins-ignorelist = ["id"]
+
+[tool.ruff.lint.pydocstyle]
+convention = "google"
+
+[tool.ruff.lint.pylint]
+max-branches = 18
+
+[tool.pytest.ini_options]
+addopts = "-ra"
+asyncio_default_fixture_loop_scope = "function"
+asyncio_mode = "auto"
+timeout = 1200
+
+[tool.mypy]
+python_version = "3.9"
+plugins = ["pydantic.mypy"]
+files = ["templates"]
+check_untyped_defs = true
+disallow_incomplete_defs = true
+disallow_untyped_calls = true
+disallow_untyped_decorators = true
+disallow_untyped_defs = true
+no_implicit_optional = true
+warn_redundant_casts = true
+warn_return_any = true
+warn_unreachable = true
+warn_unused_ignores = true
+
+[[tool.mypy.overrides]]
+module = ['nest_asyncio']
+ignore_missing_imports = true
diff --git a/templates/python-beautifulsoup/src/__init__.py b/templates/python-beautifulsoup/src/__init__.py
diff --git a/templates/python-beautifulsoup/src/main.py b/templates/python-beautifulsoup/src/main.py
@@ -8,11 +8,10 @@
 
 from urllib.parse import urljoin
 
+from apify import Actor, Request
 from bs4 import BeautifulSoup
 from httpx import AsyncClient
 
-from apify import Actor, Request
-
 
 async def main() -> None:
     """Main entry point for the Apify Actor.

diff --git a/templates/python-beautifulsoup/src/py.typed b/templates/python-beautifulsoup/src/py.typed
diff --git a/templates/python-crawlee-beautifulsoup/src/__init__.py b/templates/python-crawlee-beautifulsoup/src/__init__.py
diff --git a/templates/python-crawlee-beautifulsoup/src/main.py b/templates/python-crawlee-beautifulsoup/src/main.py
@@ -6,7 +6,7 @@
 https://docs.apify.com/sdk/python
 """
 
-from apify import Actor, Request
+from apify import Actor
 from crawlee.crawlers import BeautifulSoupCrawler, BeautifulSoupCrawlingContext
 
 

diff --git a/templates/python-crawlee-beautifulsoup/src/py.typed b/templates/python-crawlee-beautifulsoup/src/py.typed
diff --git a/templates/python-crawlee-playwright/src/__init__.py b/templates/python-crawlee-playwright/src/__init__.py
diff --git a/templates/python-crawlee-playwright/src/main.py b/templates/python-crawlee-playwright/src/main.py
@@ -6,7 +6,7 @@
 https://docs.apify.com/sdk/python
 """
 
-from apify import Actor, Request
+from apify import Actor
 from crawlee.crawlers import PlaywrightCrawler, PlaywrightCrawlingContext
 
 
@@ -41,7 +41,7 @@ async def main() -> None:
             headless=True,
             browser_launch_options={
                 'args': ['--disable-gpu'],
-            }
+            },
         )
 
         # Define a request handler, which will be called for every request.

diff --git a/templates/python-crawlee-playwright/src/py.typed b/templates/python-crawlee-playwright/src/py.typed
diff --git a/templates/python-empty/src/__init__.py b/templates/python-empty/src/__init__.py
diff --git a/templates/python-empty/src/py.typed b/templates/python-empty/src/py.typed
diff --git a/templates/python-playwright/src/__init__.py b/templates/python-playwright/src/__init__.py
diff --git a/templates/python-playwright/src/main.py b/templates/python-playwright/src/main.py
@@ -8,9 +8,8 @@
 
 from urllib.parse import urljoin
 
-from playwright.async_api import async_playwright
-
 from apify import Actor, Request
+from playwright.async_api import async_playwright
 
 # Note: To run this Actor locally, ensure that Playwright browsers are installed.
 # Run `playwright install --with-deps` in the Actor's virtual environment to install them.

diff --git a/templates/python-playwright/src/py.typed b/templates/python-playwright/src/py.typed
diff --git a/templates/python-scrapy/src/__init__.py b/templates/python-scrapy/src/__init__.py
diff --git a/templates/python-scrapy/src/__main__.py b/templates/python-scrapy/src/__main__.py
@@ -1,3 +1,4 @@
+# ruff: noqa: E402
 """Apify Actor integration for Scrapy projects.
 
 This module transforms a Scrapy project into an Apify Actor, handling the configuration of logging, patching Scrapy's
@@ -17,11 +18,10 @@
 from logging import StreamHandler, getLogger
 from typing import Any
 
+from apify.log import ActorLogFormatter
 from scrapy.utils import log as scrapy_logging
 from scrapy.utils.project import get_project_settings
 
-from apify.log import ActorLogFormatter
-
 # Define names of the loggers.
 MAIN_LOGGER_NAMES = ['apify', 'apify_client', 'scrapy']
 OTHER_LOGGER_NAMES = ['filelock', 'hpack', 'httpcore', 'httpx', 'protego', 'twisted']

diff --git a/templates/python-scrapy/src/main.py b/templates/python-scrapy/src/main.py
@@ -21,10 +21,9 @@
 
 from __future__ import annotations
 
-from scrapy.crawler import CrawlerProcess
-
 from apify import Actor
 from apify.scrapy.utils import apply_apify_settings
+from scrapy.crawler import CrawlerProcess
 
 # Import your Scrapy spider here.
 from .spiders.title import TitleSpider as Spider

diff --git a/templates/python-scrapy/src/middlewares.py b/templates/python-scrapy/src/middlewares.py
@@ -1,3 +1,4 @@
+# ruff: noqa: D101, D102, ARG002, UP028
 """Scrapy middlewares module.
 
 This module defines Scrapy middlewares. Middlewares are processing components that handle requests and
@@ -20,7 +21,7 @@
     from collections.abc import Generator, Iterable
 
     from scrapy.crawler import Crawler
-    from scrapy.http import Response
+    from scrapy.http.response import Response
 
 
 class TitleSpiderMiddleware:
@@ -68,10 +69,12 @@ def process_spider_exception(
         pass
 
     def process_start_requests(
-        self, start_requests: Iterable[Request], spider: Spider
+        self,
+        start_requests: Iterable[Request],
+        spider: Spider,
     ) -> Iterable[Request]:  # Called with the start requests of the spider, and works
         # similarly to the process_spider_output() method, except
-        # that it doesn’t have a response associated.
+        # that it doesn't have a response associated.
 
         # Must return only requests (not items).
         for r in start_requests:

diff --git a/templates/python-scrapy/src/pipelines.py b/templates/python-scrapy/src/pipelines.py
@@ -1,3 +1,4 @@
+# ruff: noqa: ARG002, D102
 """Scrapy item pipelines module.
 
 This module defines Scrapy item pipelines for scraped data. Item pipelines are processing components

diff --git a/templates/python-scrapy/src/spiders/title.py b/templates/python-scrapy/src/spiders/title.py
@@ -1,3 +1,4 @@
+# ruff: noqa: RUF012, TID252
 from __future__ import annotations
 
 from typing import TYPE_CHECKING
@@ -10,7 +11,7 @@
 if TYPE_CHECKING:
     from collections.abc import Generator
 
-    from scrapy.responsetypes import Response
+    from scrapy.http.response import Response
 
 
 class TitleSpider(Spider):

diff --git a/templates/python-selenium/src/__init__.py b/templates/python-selenium/src/__init__.py
diff --git a/templates/python-selenium/src/main.py b/templates/python-selenium/src/main.py
@@ -9,18 +9,18 @@
 import asyncio
 from urllib.parse import urljoin
 
+from apify import Actor, Request
 from selenium import webdriver
 from selenium.webdriver.chrome.options import Options as ChromeOptions
 from selenium.webdriver.common.by import By
 
-from apify import Actor, Request
-
 # To run this Actor locally, you need to have the Selenium Chromedriver installed.
 # Follow the installation guide at:
 # https://www.selenium.dev/documentation/webdriver/getting_started/install_drivers/
 # When running on the Apify platform, the Chromedriver is already included
 # in the Actor's Docker image.
 
+
 async def main() -> None:
     """Main entry point for the Apify Actor.
 

diff --git a/templates/python-selenium/src/py.typed b/templates/python-selenium/src/py.typed
diff --git a/templates/python-standby/src/__init__.py b/templates/python-standby/src/__init__.py
diff --git a/templates/python-standby/src/main.py b/templates/python-standby/src/main.py
@@ -14,7 +14,8 @@
 class GetHandler(SimpleHTTPRequestHandler):
     """A simple GET HTTP handler that will respond with a message."""
 
-    def do_GET(self) -> None:
+    def do_get(self) -> None:
+        """Handle GET request and respond with a message."""
         self.send_response(200)
         self.end_headers()
         self.wfile.write(b'Hello from Actor Standby!')

diff --git a/templates/python-standby/src/py.typed b/templates/python-standby/src/py.typed
diff --git a/templates/python-start/src/__init__.py b/templates/python-start/src/__init__.py
diff --git a/templates/python-start/src/main.py b/templates/python-start/src/main.py
@@ -8,16 +8,15 @@
 
 # Beautiful Soup - A library for pulling data out of HTML and XML files. Read more at:
 # https://www.crummy.com/software/BeautifulSoup/bs4/doc
+# Apify SDK - A toolkit for building Apify Actors. Read more at:
+# https://docs.apify.com/sdk/python
+from apify import Actor
 from bs4 import BeautifulSoup
 
 # HTTPX - A library for making asynchronous HTTP requests in Python. Read more at:
 # https://www.python-httpx.org/
 from httpx import AsyncClient
 
-# Apify SDK - A toolkit for building Apify Actors. Read more at:
-# https://docs.apify.com/sdk/python
-from apify import Actor
-
 
 async def main() -> None:
     """Main entry point for the Apify Actor.
@@ -30,6 +29,8 @@ async def main() -> None:
         # Retrieve the input object for the Actor. The structure of input is defined in input_schema.json.
         actor_input = await Actor.get_input() or {'url': 'https://apify.com/'}
         url = actor_input.get('url')
+        if not url:
+            raise ValueError('Missing "url" attribute in input!')
 
         # Create an asynchronous HTTPX client for making HTTP requests.
         async with AsyncClient() as client:

diff --git a/templates/python-start/src/py.typed b/templates/python-start/src/py.typed