From 3d5ed098968a91f648addcab9b538d63014b6ca7 Mon Sep 17 00:00:00 2001 From: ricky50575 Date: Mon, 6 Feb 2023 15:35:31 +0800 Subject: [PATCH] =?UTF-8?q?=E6=94=B9=E7=94=A8Scrapy=E6=A1=86=E6=9E=B6?= =?UTF-8?q?=E5=AE=9E=E7=8E=B0?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .gitignore | 3 + README.md | 11 +++ lightnovel/__init__.py | 35 ++++++++ lightnovel/items.py | 51 ++++++++++++ lightnovel/middlewares.py | 143 +++++++++++++++++++++++++++++++++ lightnovel/pipelines.py | 137 +++++++++++++++++++++++++++++++ lightnovel/settings.py | 101 +++++++++++++++++++++++ lightnovel/spiders/__init__.py | 4 + lightnovel/spiders/libi.py | 112 ++++++++++++++++++++++++++ lightnovel/spiders/wenku8.py | 96 ++++++++++++++++++++++ scrapy.cfg | 11 +++ 11 files changed, 704 insertions(+) create mode 100644 .gitignore create mode 100644 README.md create mode 100644 lightnovel/__init__.py create mode 100644 lightnovel/items.py create mode 100644 lightnovel/middlewares.py create mode 100644 lightnovel/pipelines.py create mode 100644 lightnovel/settings.py create mode 100644 lightnovel/spiders/__init__.py create mode 100644 lightnovel/spiders/libi.py create mode 100644 lightnovel/spiders/wenku8.py create mode 100644 scrapy.cfg diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..a1df691 --- /dev/null +++ b/.gitignore @@ -0,0 +1,3 @@ +img +proxy +*.db \ No newline at end of file diff --git a/README.md b/README.md new file mode 100644 index 0000000..71866ee --- /dev/null +++ b/README.md @@ -0,0 +1,11 @@ +## 注意事项 + +1. 在重新开始一次采集前,需要手动删除`job`目录下的对应工作进度文件,再启动采集。 +2. 尽管自定义下载器中间件`LightnovelDownloaderMiddleware`能够处理网络中断引发的异常,但依然建议在发生网络中断时立即暂停采集任务。 +3. 请勿在采集器工作时强行中断下载进程,否则可能导致请求丢失或响应无法被处理。 + +## 已知问题 + +### libi.py + +由于[PSYCOME炼爱学狱 第五卷 与杀人机共度体育灾 第五项 炼狱的爱吼、狂乱的死鸣 Behemoth feat.Leviathan(4)_哔哩轻小说 (linovelib.com)](https://www.linovelib.com/novel/94/13214_4.html)源代码的105行存在一个异常的`