From f0f9641091c6d7a67e800e1f36492c6323664cc3 Mon Sep 17 00:00:00 2001
From: AndrewKorzh <92707967+AndrewKorzh@users.noreply.github.com>
Date: Fri, 9 Aug 2024 17:42:18 +0300
Subject: [PATCH] Local mode (#34)

* #

* add_local

* requirements

* Update README.md

* code refactor

* Update __init__.py

* Update middleware.py

* Update middleware.py

* Fix unnecessary changes

Signed-off-by: yatskov <9622929+Yatskov@users.noreply.github.com>

* Fix unnecessary changes

Signed-off-by: yatskov <9622929+Yatskov@users.noreply.github.com>

---------

Signed-off-by: yatskov <9622929+Yatskov@users.noreply.github.com>
Co-authored-by: yatskov <9622929+yatskov@users.noreply.github.com>
---
 README.md                                          |  18 ++
 requirements.txt                                   |   3 +
 scrapypuppeteer/browser_managers/__init__.py       |  16 ++
 .../browser_managers/local_browser_manager.py      | 257 ++++++++++++++++++
 .../service_browser_manager.py                     | 221 +++++++++++++++
 scrapypuppeteer/middleware.py                      | 197 ++------------
 setup.py                                           |  20 +-
 7 files changed, 552 insertions(+), 180 deletions(-)
 create mode 100644 scrapypuppeteer/browser_managers/__init__.py
 create mode 100644 scrapypuppeteer/browser_managers/local_browser_manager.py
 create mode 100644 scrapypuppeteer/browser_managers/service_browser_manager.py

diff --git a/README.md b/README.md
index ecae545..cf7e42e 100644
--- a/README.md
+++ b/README.md
@@ -23,6 +23,22 @@ DOWNLOADER_MIDDLEWARES = {
     'scrapypuppeteer.middleware.PuppeteerServiceDownloaderMiddleware': 1042
 }
 
+PUPPETEER_SERVICE_URL = 'http://localhost:3000'
+
+# To run locally (without scrapy-puppeteer-service started), enable this setting:
+PUPPETEER_LOCAL = True
+```
+For local execution it is also necessary to install Chromium for Pyppeteer.
+
+## Configuration
+
+You should have [scrapy-puppeteer-service](https://github.com/ispras/scrapy-puppeteer-service) started.
+Then add its URL to `settings.py` and enable puppeteer downloader middleware:
+```python
+DOWNLOADER_MIDDLEWARES = {
+    'scrapypuppeteer.middleware.PuppeteerServiceDownloaderMiddleware': 1042
+}
+
 PUPPETEER_SERVICE_URL = 'http://localhost:3000'
 ```
 
@@ -50,6 +66,7 @@ There is a parent `PuppeteerResponse` class from which other response classes ar
 Here is a list of them all:
 - `PuppeteerHtmlResponse` - has `html` and `cookies` properties
 - `PuppeteerScreenshotResponse` - has `screenshot` property
+- `PuppeteerHarResponse` - has `har` property
 - `PuppeteerJsonResponse` - has `data` property and `to_html()` method which tries to transform itself to `PuppeteerHtmlResponse`
 - `PuppeteerRecaptchaSolverResponse(PuppeteerJsonResponse, PuppeteerHtmlResponse)` - has `recaptcha_data` property
 
@@ -66,6 +83,7 @@ Here is the list of available actions:
 - `Click(selector, click_options, wait_options)` - click on element on page
 - `Scroll(selector, wait_options)` - scroll page
 - `Screenshot(options)` - take screenshot
+- `Har()` - get the HAR file; to enable recording, pass the `har_recording=True` argument to the initial `PuppeteerRequest`
 - `RecaptchaSolver(solve_recaptcha)` - find or solve recaptcha on page
 - `CustomJsAction(js_function)` - evaluate JS function on page
 
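The README change above boils down to one extra setting. A minimal spider sketch (spider name, URL and field names are placeholders, not part of this patch) that drives the new local mode through the documented `PuppeteerRequest`/`GoTo` API:

```python
import scrapy

from scrapypuppeteer.actions import GoTo
from scrapypuppeteer.request import PuppeteerRequest


class LocalModeSpider(scrapy.Spider):
    """Illustrative spider running the middleware in local (Pyppeteer) mode."""

    name = "local_mode_example"
    custom_settings = {
        "DOWNLOADER_MIDDLEWARES": {
            "scrapypuppeteer.middleware.PuppeteerServiceDownloaderMiddleware": 1042,
        },
        # With PUPPETEER_LOCAL enabled, PUPPETEER_SERVICE_URL is not required.
        "PUPPETEER_LOCAL": True,
    }

    def start_requests(self):
        yield PuppeteerRequest(GoTo("https://example.com"), callback=self.parse)

    def parse(self, response):
        # The middleware returns a PuppeteerHtmlResponse, so selectors work as usual.
        yield {"title": response.css("title::text").get()}
```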
diff --git a/requirements.txt b/requirements.txt
index 34188d2..df33418 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1 +1,4 @@
 scrapy>=2.6
+pyppeteer
+syncer
+bs4
\ No newline at end of file
diff --git a/scrapypuppeteer/browser_managers/__init__.py b/scrapypuppeteer/browser_managers/__init__.py
new file mode 100644
index 0000000..cc5db82
--- /dev/null
+++ b/scrapypuppeteer/browser_managers/__init__.py
@@ -0,0 +1,16 @@
+__all__ = ["BrowserManager"]
+
+from abc import ABC, abstractmethod
+
+class BrowserManager(ABC):
+    @abstractmethod
+    def process_request(self, request, spider):
+        pass
+
+    @abstractmethod
+    def close_used_contexts(self):
+        pass
+
+    @abstractmethod
+    def process_response(self, middleware, request, response, spider):
+        pass
\ No newline at end of file
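`BrowserManager` is the seam the middleware now programs against. A do-nothing implementation (illustrative only, not part of the patch) shows the contract; note that the two concrete managers below implement `process_request(self, request)` without the `spider` argument declared here.

```python
from scrapypuppeteer.browser_managers import BrowserManager


class NoOpBrowserManager(BrowserManager):
    """Sketch of the interface the middleware delegates to."""

    def process_request(self, request, spider):
        # Returning None lets Scrapy keep processing the request normally.
        return None

    def close_used_contexts(self):
        # Called when the spider goes idle; release browser resources here.
        pass

    def process_response(self, middleware, request, response, spider):
        # May translate a raw downloader response into a PuppeteerResponse.
        return response
```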
diff --git a/scrapypuppeteer/browser_managers/local_browser_manager.py b/scrapypuppeteer/browser_managers/local_browser_manager.py
new file mode 100644
index 0000000..55d727f
--- /dev/null
+++ b/scrapypuppeteer/browser_managers/local_browser_manager.py
@@ -0,0 +1,257 @@
+from scrapypuppeteer.response import (
+    PuppeteerHtmlResponse,
+    PuppeteerScreenshotResponse,
+)
+from scrapypuppeteer.request import ActionRequest, PuppeteerRequest, CloseContextRequest
+
+import asyncio
+from pyppeteer import launch
+import syncer
+import uuid
+import base64
+from scrapypuppeteer.browser_managers import BrowserManager
+
+
+class ContextManager:
+    def __init__(self):
+        self.browser = syncer.sync(launch())
+        self.contexts = {}
+        self.pages = {}
+        self.context_page_map = {}
+
+    async def check_context_and_page(self, context_id, page_id):
+        if not context_id or not page_id:
+            context_id, page_id = await self.open_new_page()
+        return context_id, page_id
+
+    async def open_new_page(self):
+        context_id = uuid.uuid4().hex.upper()
+        page_id = uuid.uuid4().hex.upper()
+
+        self.contexts[context_id] = await self.browser.createIncognitoBrowserContext()
+        self.pages[page_id] = await self.contexts[context_id].newPage()
+        self.context_page_map[context_id] = page_id
+
+        return context_id, page_id
+
+    def get_page_by_id(self, context_id, page_id):
+        return self.pages[page_id]
+
+    def close_browser(self):
+        if self.browser:
+            syncer.sync(self.browser.close())
+
+    def close_contexts(self, request: CloseContextRequest):
+        for context_id in request.contexts:
+            if context_id in self.contexts:
+                syncer.sync(self.contexts[context_id].close())
+                page_id = self.context_page_map.get(context_id)
+                if page_id in self.pages:
+                    del self.pages[page_id]
+
+                del self.contexts[context_id]
+                del self.context_page_map[context_id]
+
+    def __del__(self):
+        self.close_browser()
+
+
+class LocalBrowserManager(BrowserManager):
+    def __init__(self):
+        self.context_manager = ContextManager()
+        self.action_map = {
+            "goto": self.goto,
+            "click": self.click,
+            "back": self.go_back,
+            "forward": self.go_forward,
+            "scroll": self.scroll,
+            "screenshot": self.screenshot,
+            "action": self.action,
+            "recaptcha_solver": self.recaptcha_solver,
+            "har": self.har,
+        }
+
+    def process_request(self, request):
+        if isinstance(request, PuppeteerRequest):
+            endpoint = request.action.endpoint
+            action_function = self.action_map.get(endpoint)
+            if action_function:
+                return action_function(request)
+
+        if isinstance(request, CloseContextRequest):
+            return self.close_contexts(request)
+
+    def close_contexts(self, request: CloseContextRequest):
+        self.context_manager.close_contexts(request)
+
+    def close_used_contexts(self):
+        self.context_manager.close_browser()
+
+    def process_response(self, middleware, request, response, spider):
+        return response
+
+    async def wait_with_options(self, page, wait_options):
+        timeout = wait_options.get("selectorOrTimeout", 1000)
+        visible = wait_options.get("visible", False)
+        hidden = wait_options.get("hidden", False)
+
+        if isinstance(timeout, (int, float)):
+            await asyncio.sleep(timeout / 1000)
+        else:
+            await page.waitFor(selector=timeout, options={
+                'visible': visible,
+                'hidden': hidden,
+                'timeout': 30000,
+            })
+
+    def goto(self, request: PuppeteerRequest):
+        context_id, page_id = syncer.sync(
+            self.context_manager.check_context_and_page(request.context_id, request.page_id)
+        )
+        page = self.context_manager.get_page_by_id(context_id, page_id)
+
+        async def async_goto():
+            url = request.action.payload()["url"]
+            cookies = request.cookies
+            navigation_options = request.action.navigation_options
+            await page.goto(url, navigation_options)
+            wait_options = request.action.payload().get("waitOptions", {}) or {}
+            await self.wait_with_options(page, wait_options)
+            response_html = await page.content()
+            return PuppeteerHtmlResponse(url,
+                                         request,
+                                         context_id=context_id,
+                                         page_id=page_id,
+                                         html=response_html,
+                                         cookies=cookies)
+
+        return syncer.sync(async_goto())
+
+    def click(self, request: PuppeteerRequest):
+        context_id, page_id = syncer.sync(
+            self.context_manager.check_context_and_page(request.context_id, request.page_id)
+        )
+        page = self.context_manager.get_page_by_id(context_id, page_id)
+
+        async def async_click():
+            selector = request.action.payload().get("selector")
+            cookies = request.cookies
+            click_options = request.action.click_options or {}
+            navigation_options = request.action.navigation_options or {}
+            options = {**click_options, **navigation_options}
+            await page.click(selector, options)
+            wait_options = request.action.payload().get("waitOptions", {}) or {}
+            await self.wait_with_options(page, wait_options)
+            response_html = await page.content()
+            return PuppeteerHtmlResponse(request.url,
+                                         request,
+                                         context_id=context_id,
+                                         page_id=page_id,
+                                         html=response_html,
+                                         cookies=cookies)
+
+        return syncer.sync(async_click())
+
+    def go_back(self, request: PuppeteerRequest):
+        context_id, page_id = syncer.sync(
+            self.context_manager.check_context_and_page(request.context_id, request.page_id)
+        )
+        page = self.context_manager.get_page_by_id(context_id, page_id)
+
+        async def async_go_back():
+            cookies = request.cookies
+            navigation_options = request.action.navigation_options
+            await page.goBack(navigation_options)
+            wait_options = request.action.payload().get("waitOptions", {}) or {}
+            await self.wait_with_options(page, wait_options)
+            response_html = await page.content()
+            return PuppeteerHtmlResponse(request.url,
+                                         request,
+                                         context_id=context_id,
+                                         page_id=page_id,
+                                         html=response_html,
+                                         cookies=cookies)
+
+        return syncer.sync(async_go_back())
+
+    def go_forward(self, request: PuppeteerRequest):
+        context_id, page_id = syncer.sync(
+            self.context_manager.check_context_and_page(request.context_id, request.page_id)
+        )
+        page = self.context_manager.get_page_by_id(context_id, page_id)
+
+        async def async_go_forward():
+            cookies = request.cookies
+            navigation_options = request.action.navigation_options
+            await page.goForward(navigation_options)
+            wait_options = request.action.payload().get("waitOptions", {}) or {}
+            await self.wait_with_options(page, wait_options)
+            response_html = await page.content()
+            return PuppeteerHtmlResponse(request.url,
+                                         request,
+                                         context_id=context_id,
+                                         page_id=page_id,
+                                         html=response_html,
+                                         cookies=cookies)
+
+        return syncer.sync(async_go_forward())
+
+    def screenshot(self, request: PuppeteerRequest):
+        context_id, page_id = syncer.sync(
+            self.context_manager.check_context_and_page(request.context_id, request.page_id)
+        )
+        page = self.context_manager.get_page_by_id(context_id, page_id)
+
+        async def async_screenshot():
+            request_options = request.action.options or {}
+            screenshot_options = {'encoding': 'binary'}
+            screenshot_options.update(request_options)
+            screenshot_bytes = await page.screenshot(screenshot_options)
+            screenshot_base64 = base64.b64encode(screenshot_bytes).decode('utf-8')
+            return PuppeteerScreenshotResponse(request.url,
+                                               request,
+                                               context_id=context_id,
+                                               page_id=page_id,
+                                               screenshot=screenshot_base64)
+
+        return syncer.sync(async_screenshot())
+
+    def scroll(self, request: PuppeteerRequest):
+        context_id, page_id = syncer.sync(
+            self.context_manager.check_context_and_page(request.context_id, request.page_id)
+        )
+        page = self.context_manager.get_page_by_id(context_id, page_id)
+
+        async def async_scroll():
+            cookies = request.cookies
+            selector = request.action.payload().get("selector", None)
+
+            if selector:
+                script = f"""
+                    document.querySelector('{selector}').scrollIntoView();
+                """
+            else:
+                script = """
+                    window.scrollBy(0, document.body.scrollHeight);
+                """
+            await page.evaluate(script)
+            wait_options = request.action.payload().get("waitOptions", {}) or {}
+            await self.wait_with_options(page, wait_options)
+            response_html = await page.content()
+            return PuppeteerHtmlResponse(request.url,
+                                         request,
+                                         context_id=context_id,
+                                         page_id=page_id,
+                                         html=response_html,
+                                         cookies=cookies)
+
+        return syncer.sync(async_scroll())
+
+    def action(self, request: PuppeteerRequest):
+        raise ValueError("CustomJsAction is not available in local mode")
+
+    def recaptcha_solver(self, request: PuppeteerRequest):
+        raise ValueError("RecaptchaSolver is not available in local mode")
+
+    def har(self, request: PuppeteerRequest):
+        raise ValueError("Har is not available in local mode")
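`LocalBrowserManager` drives a Pyppeteer-launched browser, so a Chromium build must be available before the first `launch()` call, as the README change notes. A rough pre-download sketch, assuming Pyppeteer's bundled `chromium_downloader` helpers:

```python
# Assumption: pyppeteer's chromium_downloader module is available; pyppeteer can
# also fetch Chromium automatically on the first launch() if it is missing.
from pyppeteer import chromium_downloader

if not chromium_downloader.check_chromium():
    chromium_downloader.download_chromium()
print("Chromium executable:", chromium_downloader.chromium_executable())
```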
diff --git a/scrapypuppeteer/browser_managers/service_browser_manager.py b/scrapypuppeteer/browser_managers/service_browser_manager.py
new file mode 100644
index 0000000..2e7e488
--- /dev/null
+++ b/scrapypuppeteer/browser_managers/service_browser_manager.py
@@ -0,0 +1,221 @@
+import json
+import logging
+from collections import defaultdict
+from typing import List, Union
+from urllib.parse import urlencode, urljoin
+from abc import ABC, abstractmethod
+
+from scrapy import signals
+from scrapy.crawler import Crawler
+from scrapy.exceptions import IgnoreRequest, NotConfigured, DontCloseSpider
+from scrapy.http import Headers, TextResponse, Response
+from scrapy.utils.log import failure_to_exc_info
+from twisted.python.failure import Failure
+import time
+
+from scrapypuppeteer.actions import (
+    Click,
+    GoBack,
+    GoForward,
+    GoTo,
+    RecaptchaSolver,
+    Screenshot,
+    Scroll,
+    CustomJsAction,
+    Har,
+)
+from scrapypuppeteer.response import (
+    PuppeteerResponse,
+    PuppeteerHtmlResponse,
+    PuppeteerScreenshotResponse,
+    PuppeteerHarResponse,
+    PuppeteerRecaptchaSolverResponse,
+    PuppeteerJsonResponse,
+)
+from scrapypuppeteer.request import ActionRequest, PuppeteerRequest, CloseContextRequest
+
+from scrapypuppeteer.browser_managers import BrowserManager
+
+
+class ServiceBrowserManager(BrowserManager):
+    def __init__(self, service_base_url, include_meta, include_headers, crawler):
+        self.service_base_url = service_base_url
+        self.include_meta = include_meta
+        self.include_headers = include_headers
+        self.used_contexts = defaultdict(set)
+        self.service_logger = logging.getLogger(__name__)
+        self.crawler = crawler
+
+        if self.service_base_url is None:
+            raise ValueError("Puppeteer service URL must be provided")
+
+    def process_request(self, request):
+        if isinstance(request, CloseContextRequest):
+            return self.process_close_context_request(request)
+
+        if isinstance(request, PuppeteerRequest):
+            return self.process_puppeteer_request(request)
+
+    def process_close_context_request(self, request: CloseContextRequest):
+        if not request.is_valid_url:
+            return request.replace(
+                url=urljoin(self.service_base_url, "/close_context"),
+            )
+
+    def process_puppeteer_request(self, request: PuppeteerRequest):
+        action = request.action
+        service_url = urljoin(self.service_base_url, action.endpoint)
+        service_params = self._encode_service_params(request)
+        if service_params:
+            service_url += "?" + service_params
+
+        meta = {
+            "puppeteer_request": request,
+            "dont_obey_robotstxt": True,
+            "proxy": None,
+        }
+        if self.include_meta:
+            meta = {**request.meta, **meta}
+        action_request = ActionRequest(
+            url=service_url,
+            action=action,
+            method="POST",
+            headers=Headers({"Content-Type": action.content_type}),
+            body=self._serialize_body(action, request),
+            dont_filter=True,
+            cookies=request.cookies,
+            priority=request.priority,
+            callback=request.callback,
+            cb_kwargs=request.cb_kwargs,
+            errback=request.errback,
+            meta=meta,
+        )
+        return action_request
+
+    @staticmethod
+    def _encode_service_params(request):
+        service_params = {}
+        if request.context_id is not None:
+            service_params["contextId"] = request.context_id
+        if request.page_id is not None:
+            service_params["pageId"] = request.page_id
+        if request.close_page:
+            service_params["closePage"] = 1
+        return urlencode(service_params)
+
+    def _serialize_body(self, action, request):
+        payload = action.payload()
+        if action.content_type == "application/json":
+            if isinstance(payload, dict):
+                # disallow null values in top-level request parameters
+                payload = {k: v for k, v in payload.items() if v is not None}
+            proxy = request.meta.get("proxy")
+            if proxy:
+                payload["proxy"] = proxy
+            include_headers = (
+                self.include_headers
+                if request.include_headers is None
+                else request.include_headers
+            )
+            if include_headers:
+                headers = request.headers.to_unicode_dict()
+                if isinstance(include_headers, list):
+                    headers = {
+                        h.lower(): headers[h] for h in include_headers if h in headers
+                    }
+                payload["headers"] = headers
+            return json.dumps(payload)
+        return str(payload)
+
+    def close_used_contexts(self, spider):
+        contexts = list(self.used_contexts.pop(id(spider), set()))
+        if contexts:
+            request = CloseContextRequest(
+                contexts,
+                meta={"proxy": None},
+            )
+
+            def handle_close_contexts_result(result):
+                if isinstance(result, Response):
+                    if result.status == 200:
+                        self.service_logger.debug(
+                            f"Successfully closed {len(request.contexts)} "
+                            f"contexts with request {result.request}"
+                        )
+                    else:
+                        self.service_logger.warning(
+                            f"Could not close contexts: {result.text}"
+                        )
+                elif isinstance(result, Failure):
+                    self.service_logger.warning(
+                        f"Could not close contexts: {result.value}",
+                        exc_info=failure_to_exc_info(result),
+                    )
+
+            dfd = self.crawler.engine.download(request)
+            dfd.addBoth(handle_close_contexts_result)
+
+            raise DontCloseSpider()
+    def process_response(self, middleware, request, response, spider):
+        if not isinstance(response, TextResponse):
+            return response
+
+        puppeteer_request = request.meta.get("puppeteer_request")
+        if puppeteer_request is None:
+            return response
+
+        if b"application/json" not in response.headers.get(b"Content-Type", b""):
+            return response.replace(request=request)
+
+        response_data = json.loads(response.text)
+        if response.status != 200:
+            reason = response_data.pop("error", f"undefined, status {response.status}")
+            middleware.service_logger.warning(
+                f"Request {request} is not succeeded. Reason: {reason}"
+            )
+            context_id = response_data.get("contextId")
+            if context_id:
+                middleware.used_contexts[id(spider)].add(context_id)
+            return response
+
+        response_cls = self._get_response_class(puppeteer_request.action)
+
+        return self._form_response(
+            response_cls,
+            response_data,
+            puppeteer_request.url,
+            request,
+            puppeteer_request,
+            spider,
+        )
+
+    def _form_response(
+        self, response_cls, response_data, url, request, puppeteer_request, spider
+    ):
+        context_id = response_data.pop("contextId", puppeteer_request.context_id)
+        page_id = response_data.pop("pageId", puppeteer_request.page_id)
+        self.used_contexts[id(spider)].add(context_id)
+
+        return response_cls(
+            url=url,
+            puppeteer_request=puppeteer_request,
+            context_id=context_id,
+            page_id=page_id,
+            request=request,
+            **response_data,
+        )
+
+    @staticmethod
+    def _get_response_class(request_action):
+        if isinstance(request_action, (GoTo, GoForward, GoBack, Click, Scroll)):
+            return PuppeteerHtmlResponse
+        if isinstance(request_action, Screenshot):
+            return PuppeteerScreenshotResponse
+        if isinstance(request_action, Har):
+            return PuppeteerHarResponse
+        if isinstance(request_action, RecaptchaSolver):
+            return PuppeteerRecaptchaSolverResponse
+        return PuppeteerJsonResponse
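For reference, `_serialize_body()` produces a JSON document roughly like the following for a `goto` action when a proxy is set and header forwarding is limited to `Cookie`. This is a sketch with illustrative values; the exact keys depend on the action's `payload()` and `None` values are stripped before sending.

```python
# Illustrative POST body for a goto action; it is sent to
# <PUPPETEER_SERVICE_URL>/goto?contextId=...&pageId=... with
# Content-Type: application/json.
example_body = {
    "url": "https://example.com",
    "waitOptions": {"selectorOrTimeout": 1000},
    "proxy": "http://127.0.0.1:8118",
    "headers": {"cookie": "session=abc"},
}
```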
diff --git a/scrapypuppeteer/middleware.py b/scrapypuppeteer/middleware.py
index f7b79a5..88587ff 100644
--- a/scrapypuppeteer/middleware.py
+++ b/scrapypuppeteer/middleware.py
@@ -3,6 +3,7 @@
 from collections import defaultdict
 from typing import List, Union
 from urllib.parse import urlencode, urljoin
+from abc import ABC, abstractmethod
 
 from scrapy import signals
 from scrapy.crawler import Crawler
@@ -26,12 +27,15 @@
     PuppeteerResponse,
     PuppeteerHtmlResponse,
     PuppeteerScreenshotResponse,
-    PuppeteerHarResponse,
     PuppeteerRecaptchaSolverResponse,
     PuppeteerJsonResponse,
 )
 from scrapypuppeteer.request import ActionRequest, PuppeteerRequest, CloseContextRequest
 
+from scrapypuppeteer.browser_managers.local_browser_manager import LocalBrowserManager
+from scrapypuppeteer.browser_managers.service_browser_manager import ServiceBrowserManager
+
+from scrapypuppeteer.browser_managers import BrowserManager
 
 class PuppeteerServiceDownloaderMiddleware:
     """
@@ -66,6 +70,8 @@ class PuppeteerServiceDownloaderMiddleware:
     SERVICE_META_SETTING = "PUPPETEER_INCLUDE_META"
     DEFAULT_INCLUDE_HEADERS = ["Cookie"]  # TODO send them separately
 
+    PUPPETEER_LOCAL_SETTING = "PUPPETEER_LOCAL"
+
     service_logger = logging.getLogger(__name__)
 
     def __init__(
@@ -74,18 +80,19 @@ def __init__(
         service_url: str,
         include_headers: Union[bool, List[str]],
         include_meta: bool,
+        browser_manager: BrowserManager
     ):
         self.service_base_url = service_url
         self.include_headers = include_headers
         self.include_meta = include_meta
         self.crawler = crawler
         self.used_contexts = defaultdict(set)
+        self.browser_manager = browser_manager
 
     @classmethod
     def from_crawler(cls, crawler):
         service_url = crawler.settings.get(cls.SERVICE_URL_SETTING)
-        if service_url is None:
-            raise ValueError("Puppeteer service URL must be provided")
+        local_mode = crawler.settings.getbool(cls.PUPPETEER_LOCAL_SETTING, False)
         if cls.INCLUDE_HEADERS_SETTING in crawler.settings:
             try:
                 include_headers = crawler.settings.getbool(cls.INCLUDE_HEADERS_SETTING)
@@ -94,181 +101,25 @@ def from_crawler(cls, crawler):
         else:
             include_headers = cls.DEFAULT_INCLUDE_HEADERS
         include_meta = crawler.settings.getbool(cls.SERVICE_META_SETTING, False)
-        middleware = cls(crawler, service_url, include_headers, include_meta)
-        crawler.signals.connect(
-            middleware.close_used_contexts, signal=signals.spider_idle
-        )
-        return middleware
 
-    def process_request(self, request, **_):
-        if isinstance(request, CloseContextRequest):
-            return self.process_close_context_request(request)
-
-        if isinstance(request, PuppeteerRequest):
-            return self.process_puppeteer_request(request)
-
-    def process_close_context_request(self, request: CloseContextRequest):
-        if not request.is_valid_url:
-            return request.replace(
-                url=urljoin(self.service_base_url, "/close_context"),
-            )
+        if local_mode:
+            browser_manager = LocalBrowserManager()
+        else:
+            browser_manager = ServiceBrowserManager(service_url, include_meta, include_headers, crawler)
 
-    def process_puppeteer_request(self, request: PuppeteerRequest):
-        action = request.action
-        service_url = urljoin(self.service_base_url, action.endpoint)
-        service_params = self._encode_service_params(request)
-        if service_params:
-            service_url += "?" + service_params
-
-        meta = {
-            "puppeteer_request": request,
-            "dont_obey_robotstxt": True,
-            "proxy": None,
-        }
-        if self.include_meta:
-            meta = {**request.meta, **meta}
-
-        return ActionRequest(
-            url=service_url,
-            action=action,
-            method="POST",
-            headers=Headers({"Content-Type": action.content_type}),
-            body=self._serialize_body(action, request),
-            dont_filter=True,
-            cookies=request.cookies,
-            priority=request.priority,
-            callback=request.callback,
-            cb_kwargs=request.cb_kwargs,
-            errback=request.errback,
-            meta=meta,
+        middleware = cls(crawler, service_url, include_headers, include_meta, browser_manager)
+        crawler.signals.connect(
+            middleware.browser_manager.close_used_contexts, signal=signals.spider_idle
         )
-
-    @staticmethod
-    def _encode_service_params(request):
-        service_params = {}
-        if request.context_id is not None:
-            service_params["contextId"] = request.context_id
-        if request.page_id is not None:
-            service_params["pageId"] = request.page_id
-        if request.close_page:
-            service_params["closePage"] = 1
-        return urlencode(service_params)
-
-    def _serialize_body(self, action, request):
-        payload = action.payload()
-        if action.content_type == "application/json":
-            if isinstance(payload, dict):
-                # disallow null values in top-level request parameters
-                payload = {k: v for k, v in payload.items() if v is not None}
-            proxy = request.meta.get("proxy")
-            if proxy:
-                payload["proxy"] = proxy
-            include_headers = (
-                self.include_headers
-                if request.include_headers is None
-                else request.include_headers
-            )
-            if include_headers:
-                headers = request.headers.to_unicode_dict()
-                if isinstance(include_headers, list):
-                    headers = {
-                        h.lower(): headers[h] for h in include_headers if h in headers
-                    }
-                payload["headers"] = headers
-            return json.dumps(payload)
-        return str(payload)
-
+        return middleware
+
+    def process_request(self, request, spider):
+        return self.browser_manager.process_request(request)
+
     def process_response(self, request, response, spider):
-        if not isinstance(response, TextResponse):
-            return response
-
-        puppeteer_request = request.meta.get("puppeteer_request")
-        if puppeteer_request is None:
-            return response
-
-        if b"application/json" not in response.headers.get(b"Content-Type", b""):
-            return response.replace(request=request)
-
-        response_data = json.loads(response.text)
-        if response.status != 200:
-            reason = response_data.pop("error", f"undefined, status {response.status}")
-            self.service_logger.warning(
-                f"Request {request} is not succeeded. Reason: {reason}"
-            )
-            context_id = response_data.get("contextId")
-            if context_id:
-                self.used_contexts[id(spider)].add(context_id)
-            return response
-
-        response_cls = self._get_response_class(puppeteer_request.action)
-
-        return self._form_response(
-            response_cls,
-            response_data,
-            puppeteer_request.url,
-            request,
-            puppeteer_request,
-            spider,
-        )
-
-    def _form_response(
-        self, response_cls, response_data, url, request, puppeteer_request, spider
-    ):
-        context_id = response_data.pop("contextId", puppeteer_request.context_id)
-        page_id = response_data.pop("pageId", puppeteer_request.page_id)
-
-        self.used_contexts[id(spider)].add(context_id)
-
-        return response_cls(
-            url=url,
-            puppeteer_request=puppeteer_request,
-            context_id=context_id,
-            page_id=page_id,
-            request=request,
-            **response_data,
-        )
-
-    @staticmethod
-    def _get_response_class(request_action):
-        if isinstance(request_action, (GoTo, GoForward, GoBack, Click, Scroll)):
-            return PuppeteerHtmlResponse
-        if isinstance(request_action, Screenshot):
-            return PuppeteerScreenshotResponse
-        if isinstance(request_action, Har):
-            return PuppeteerHarResponse
-        if isinstance(request_action, RecaptchaSolver):
-            return PuppeteerRecaptchaSolverResponse
-        return PuppeteerJsonResponse
-
-    def close_used_contexts(self, spider):
-        contexts = list(self.used_contexts.pop(id(spider), set()))
-        if contexts:
-            request = CloseContextRequest(
-                contexts,
-                meta={"proxy": None},
-            )
-
-            def handle_close_contexts_result(result):
-                if isinstance(result, Response):
-                    if result.status == 200:
-                        self.service_logger.debug(
-                            f"Successfully closed {len(request.contexts)} "
-                            f"contexts with request {result.request}"
-                        )
-                    else:
-                        self.service_logger.warning(
-                            f"Could not close contexts: {result.text}"
-                        )
-                elif isinstance(result, Failure):
-                    self.service_logger.warning(
-                        f"Could not close contexts: {result.value}",
-                        exc_info=failure_to_exc_info(result),
-                    )
+        return self.browser_manager.process_response(self, request, response, spider)
-            dfd = self.crawler.engine.download(request)
-            dfd.addBoth(handle_close_contexts_result)
-
-            raise DontCloseSpider()
 
 
 class PuppeteerRecaptchaDownloaderMiddleware:
@@ -462,4 +313,4 @@ def __is_closing(self, response, remove_request: bool = True) -> bool:
         close_page = main_request in self._page_closing
         if close_page and remove_request:
             self._page_closing.remove(main_request)
-        return close_page
+        return close_page
\ No newline at end of file
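After this change `from_crawler` reads one extra setting and picks the browser manager accordingly; the other settings keep their previous meaning. A sketch of the settings the middleware consults, with example values (the `PUPPETEER_INCLUDE_HEADERS` name is inferred from the existing setting constants, not shown in this hunk):

```python
# settings.py (example values)
DOWNLOADER_MIDDLEWARES = {
    "scrapypuppeteer.middleware.PuppeteerServiceDownloaderMiddleware": 1042,
}

PUPPETEER_SERVICE_URL = "http://localhost:3000"  # used by ServiceBrowserManager
PUPPETEER_LOCAL = False                 # True selects LocalBrowserManager (in-process Pyppeteer)
PUPPETEER_INCLUDE_HEADERS = ["Cookie"]  # or True/False; headers forwarded to the service
PUPPETEER_INCLUDE_META = False          # merge request.meta into the service request meta
```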
diff --git a/setup.py b/setup.py
index 9435c26..f0e383d 100644
--- a/setup.py
+++ b/setup.py
@@ -1,22 +1,28 @@
 #!/usr/bin/env python
-from setuptools import setup
+from setuptools import setup, find_packages
 
-with open("README.md", "r") as readme:
-    long_description = readme.read()
+def read_long_description(file_path):
+    with open(file_path, "r") as file:
+        return file.read()
 
 setup(
     name="scrapy-puppeteer-client",
-    version="0.3.3",
+    version="0.3.4",
     description="A library to use Puppeteer-managed browser in Scrapy spiders",
-    long_description=long_description,
+    long_description=read_long_description("README.md"),
     long_description_content_type="text/markdown",
     url="https://github.com/ispras/scrapy-puppeteer",
     author="MODIS @ ISP RAS",
     maintainer="Maksim Varlamov",
     maintainer_email="varlamov@ispras.ru",
-    packages=["scrapypuppeteer"],
-    install_requires=["scrapy>=2.6"],
+    packages=find_packages(),
+    install_requires=[
+        "scrapy>=2.6",
+        "pyppeteer",
+        "syncer",
+        "bs4"
+    ],
     python_requires=">=3.6",
     license="BSD",
     classifiers=[