Commit 3d5ed09
Switch the implementation to the Scrapy framework
CodingMoeButa committed Feb 6, 2023
0 parents · commit 3d5ed09
Showing 11 changed files with 704 additions and 0 deletions.
3 changes: 3 additions & 0 deletions .gitignore
@@ -0,0 +1,3 @@
img
proxy
*.db
11 changes: 11 additions & 0 deletions README.md
@@ -0,0 +1,11 @@
## Notes

1. Before restarting a crawl from scratch, manually delete the corresponding progress files under the `job` directory, then launch the crawl again (a resume sketch follows this list).
2. Although the custom downloader middleware `LightnovelDownloaderMiddleware` can handle exceptions caused by network interruptions, it is still advisable to pause the crawl as soon as the network goes down.
3. Do not forcibly kill the download process while the crawler is running; doing so may lose requests or leave responses unprocessed.
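A minimal sketch of launching a resumable crawl with progress kept under `job/`; the spider name `wenku8` is an assumption based on this repository's module names, not something this commit confirms:

```python
# Equivalent to the CLI form: scrapy crawl wenku8 -s JOBDIR=job/wenku8
# 'wenku8' is an assumed spider name; substitute the real one.
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

settings = get_project_settings()
settings.set('JOBDIR', 'job/wenku8')  # scheduler state persists here for pause/resume
process = CrawlerProcess(settings)
process.crawl('wenku8')
process.start()
```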

## Known issues

### libi.py

Line 105 of the page source at [PSYCOME炼爱学狱 第五卷 与杀人机共度体育灾 第五项 炼狱的爱吼、狂乱的死鸣 Behemoth feat.Leviathan(4)_哔哩轻小说 (linovelib.com)](https://www.linovelib.com/novel/94/13214_4.html) contains a stray `<Style>` string, so the page cannot be parsed correctly. In a browser the page content is cut off at that point; in the crawler, line 86 of the `libi.py` module raises an index-out-of-range exception and aborts collection for that work, meaning everything in PSYCOME炼爱学狱 from the fifth item (炼狱的爱吼、狂乱的死鸣 Behemoth feat.Leviathan) of the fifth volume (与杀人机共度体育灾) onward cannot be collected. The same passage at [第五卷 与杀人机共度体育灾 第五项 炼狱的爱吼、狂乱的死鸣 Behemoth feat.Leviathan-PSYCOME炼爱学狱(恋狱剧场)-Fami通文库-轻小说文库 (wenku8.net)](https://www.wenku8.net/novel/1/1488/69360.htm) appears on line 966 of the source as the escaped entity `&lt;Style&gt;`, and that page renders normally. The suspicion is that linovelib itself scraped the novel text from wenku8 with a crawler and failed to escape the characters. Since the exception is triggered by an error in the page's own markup, the root cause lies outside this crawler, and the issue will not be fixed. The escaping difference is illustrated below.
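For reference, the difference is plain HTML entity escaping; a minimal illustration using only the Python standard library:

```python
import html

# wenku8 stores the passage with the tag escaped, so parsers treat it as text:
print(html.escape('<Style>'))          # -> '&lt;Style&gt;'

# linovelib serves the raw string, which parsers read as a real (unclosed)
# tag and truncate everything that follows:
print(html.unescape('&lt;Style&gt;'))  # -> '<Style>'
```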
35 changes: 35 additions & 0 deletions lightnovel/__init__.py
@@ -0,0 +1,35 @@
from typing import Optional

import pymysql
import requests
from dbutils.persistent_db import PersistentDB

class MysqlPool:
_pool = None

@classmethod
def connect(cls):
        if cls._pool is None:
cls._pool = PersistentDB(
creator=pymysql,
host='127.0.0.1',
port=3306,
user='root',
password='',
database='lightnovel_spider'
)
return cls._pool.connection()

def isOnline(proxy: Optional[str] = None) -> bool:
    # Probe two well-known sites to check basic connectivity, optionally
    # routing the requests through the given proxy.
    if proxy is None:
        PROXIES = None
    else:
        PROXIES = {
            'http': proxy,
            'https': proxy
        }
    def test(url: str) -> bool:
        try:
            requests.get(url, timeout=1, proxies=PROXIES)
            return True
        except requests.RequestException:
            return False
    return test('http://www.baidu.com') or test('http://www.bing.com')
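A brief usage sketch for the two helpers above; the proxy address is a placeholder, not a value taken from this commit:

```python
# Borrow a pooled connection; with PersistentDB, close() does not tear the
# connection down but keeps it cached for reuse by the current thread.
conn = MysqlPool.connect()
with conn.cursor() as cur:
    cur.execute('SELECT 1')
conn.close()

# Probe connectivity, optionally through a proxy (placeholder address).
if not isOnline('http://127.0.0.1:7890'):
    print('network unreachable')
```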
51 changes: 51 additions & 0 deletions lightnovel/items.py
@@ -0,0 +1,51 @@
# Define here the models for your scraped items
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/items.html

import scrapy


class LightnovelItem(scrapy.Item):
# define the fields for your item here like:
# name = scrapy.Field()
pass

class Wenku8BookItem(scrapy.Item):
aid = scrapy.Field()
title = scrapy.Field()
author = scrapy.Field()
category = scrapy.Field()
tags = scrapy.Field()
description = scrapy.Field()

class Wenku8ChapterItem(scrapy.Item):
aid = scrapy.Field()
cid = scrapy.Field()
volume = scrapy.Field()
chapter = scrapy.Field()
content = scrapy.Field()

class Wenku8ImageItem(scrapy.Item):
cid = scrapy.Field()
filename = scrapy.Field()
content = scrapy.Field()

class LibiBookItem(scrapy.Item):
bid = scrapy.Field()
title = scrapy.Field()
author = scrapy.Field()
category = scrapy.Field()
description = scrapy.Field()

class LibiChapterItem(scrapy.Item):
bid = scrapy.Field()
cid = scrapy.Field()
volume = scrapy.Field()
chapter = scrapy.Field()
content = scrapy.Field()

class LibiImageItem(scrapy.Item):
cid = scrapy.Field()
filename = scrapy.Field()
content = scrapy.Field()
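A short sketch of populating one of these items; all values are illustrative placeholders:

```python
from itemadapter import ItemAdapter
from lightnovel.items import Wenku8BookItem

# Items support dict-style assignment for their declared fields.
book = Wenku8BookItem()
book['aid'] = 1
book['title'] = 'Example title'
book['author'] = 'Example author'
book['category'] = 'Example category'
book['tags'] = 'tag1 tag2'
book['description'] = 'Example description.'

# ItemAdapter gives the pipelines below a uniform view over any item type.
print(ItemAdapter(book)['title'])
```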
143 changes: 143 additions & 0 deletions lightnovel/middlewares.py
@@ -0,0 +1,143 @@
# Define here the models for your spider middleware
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/spider-middleware.html

from scrapy import signals
from scrapy import Request, Spider
from twisted.internet import defer
from twisted.internet.error import (
ConnectError,
ConnectionDone,
ConnectionLost,
ConnectionRefusedError,
DNSLookupError,
TCPTimedOutError,
TimeoutError,
)
from twisted.web.client import ResponseFailed
from scrapy.core.downloader.handlers.http11 import TunnelError
# useful for handling different item types with a single interface
from itemadapter import is_item, ItemAdapter
from lightnovel import isOnline

class LightnovelSpiderMiddleware:
# Not all methods need to be defined. If a method is not defined,
# scrapy acts as if the spider middleware does not modify the
# passed objects.

@classmethod
def from_crawler(cls, crawler):
# This method is used by Scrapy to create your spiders.
s = cls()
crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
return s

def process_spider_input(self, response, spider):
# Called for each response that goes through the spider
# middleware and into the spider.

# Should return None or raise an exception.
return None

def process_spider_output(self, response, result, spider):
# Called with the results returned from the Spider, after
# it has processed the response.

# Must return an iterable of Request, or item objects.
for i in result:
yield i

def process_spider_exception(self, response, exception, spider):
# Called when a spider or process_spider_input() method
# (from other spider middleware) raises an exception.

# Should return either None or an iterable of Request or item objects.
pass

def process_start_requests(self, start_requests, spider):
# Called with the start requests of the spider, and works
# similarly to the process_spider_output() method, except
# that it doesn’t have a response associated.

# Must return only requests (not items).
for r in start_requests:
yield r

def spider_opened(self, spider):
spider.logger.info('Spider opened: %s' % spider.name)


class LightnovelDownloaderMiddleware:
# Not all methods need to be defined. If a method is not defined,
# scrapy acts as if the downloader middleware does not modify the
# passed objects.

@classmethod
def from_crawler(cls, crawler):
# This method is used by Scrapy to create your spiders.
s = cls(crawler.settings)
crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
return s

# IOError is raised by the HttpCompression middleware when trying to
# decompress an empty response
NETWORK_EXCEPTIONS = (defer.TimeoutError, TimeoutError, DNSLookupError,
ConnectionRefusedError, ConnectionDone, ConnectError,
ConnectionLost, TCPTimedOutError, ResponseFailed,
IOError, TunnelError)

def __init__(self, settings) -> None:
self.proxy_addr = settings.get('PROXY_ADDR', None)
self.priority_adjust = settings.getint('RETRY_PRIORITY_ADJUST')

def process_request(self, request:Request, spider):
# Called for each request that goes through the downloader
# middleware.

# Must either:
# - return None: continue processing this request
# - or return a Response object
# - or return a Request object
# - or raise IgnoreRequest: process_exception() methods of
# installed downloader middleware will be called

        # Route the request through the configured proxy, if one is set.
        if self.proxy_addr is not None:
request.meta['proxy'] = self.proxy_addr

return None

def process_response(self, request, response, spider):
# Called with the response returned from the downloader.

        # Must either:
# - return a Response object
# - return a Request object
# - or raise IgnoreRequest
return response

def process_exception(self, request:Request, exception, spider:Spider):
# Called when a download handler or a process_request()
# (from other downloader middleware) raises an exception.

# Must either:
# - return None: continue processing this exception
# - return a Response object: stops process_exception() chain
# - return a Request object: stops process_exception() chain
        if isinstance(exception, self.NETWORK_EXCEPTIONS):
            if isOnline(self.proxy_addr):
                # The network is reachable, so treat this as a transient
                # failure and grant at most one extra retry of our own.
                if request.meta.get('network_retry_times', 0) < 1:
                    request.meta['network_retry_times'] = request.meta.get('network_retry_times', 0) + 1
                else:
                    return None
            else:
                # The network itself is down: reset the counters and log;
                # the rescheduled request keeps cycling through this handler
                # until connectivity returns.
                request.meta['network_retry_times'] = 0
                spider.logger.warning('Network connection lost; waiting for it to recover...')
                if request.meta.get('retry_times', 0) > 0:
                    request.meta['retry_times'] = 0
            # Subtracting RETRY_PRIORITY_ADJUST (negative by default) raises
            # the priority so the interrupted request is rescheduled promptly.
            request.priority = request.priority - self.priority_adjust
            return request

def spider_opened(self, spider):
spider.logger.info('Spider opened: %s' % spider.name)
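The downloader middleware reads two settings. A sketch of plausible `settings.py` entries; the proxy address is a placeholder, and `RETRY_PRIORITY_ADJUST` is shown at Scrapy's default value:

```python
# Optional HTTP proxy for all requests; leave unset (None) to connect directly.
PROXY_ADDR = 'http://127.0.0.1:7890'  # placeholder address
# Scrapy's default. The middleware subtracts this value, so a negative setting
# raises the priority of requests rescheduled after a network outage.
RETRY_PRIORITY_ADJUST = -1
```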
137 changes: 137 additions & 0 deletions lightnovel/pipelines.py
@@ -0,0 +1,137 @@
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html

import os
import pymysql
from lightnovel import MysqlPool
# useful for handling different item types with a single interface
from itemadapter import ItemAdapter
from scrapy import Spider, Item
from lightnovel.items import *

class LightnovelPipeline:
def process_item(self, item, spider):
return item

class Wenku8Pipeline:
def process_item(self, item:Item, spider):
if isinstance(item, Wenku8BookItem):
adapter = ItemAdapter(item)
aid = adapter['aid']
title = adapter['title']
author = adapter['author']
category = adapter['category']
tags = adapter['tags']
description = adapter['description']
with MysqlPool.connect() as conn:
with conn.cursor() as cur:
try:
cur.execute('INSERT IGNORE INTO wenku8_books(aid,title,author,category,tags,description) VALUES (%s,%s,%s,%s,%s,%s)', (aid,title,author,category,tags,description))
conn.commit()
except pymysql.err.Error as e:
conn.rollback()
raise e
elif isinstance(item, Wenku8ChapterItem):
adapter = ItemAdapter(item)
aid = adapter['aid']
cid = adapter['cid']
volume = adapter['volume']
chapter = adapter['chapter']
content = adapter['content']
with MysqlPool.connect() as conn:
with conn.cursor() as cur:
try:
cur.execute('INSERT IGNORE INTO wenku8_chapters(aid,cid,volume,chapter,content) VALUES (%s,%s,%s,%s,%s)', (aid,cid,volume,chapter,content))
conn.commit()
except pymysql.err.Error as e:
conn.rollback()
raise e
elif isinstance(item, Wenku8ImageItem):
adapter = ItemAdapter(item)
cid = adapter['cid']
filename = adapter['filename']
content = adapter['content']
            # Save the image as img/wenku8/<cid // 10000>/<cid>/<filename>.
            directory = os.path.join('./img/wenku8', str(cid // 10000), str(cid))
            os.makedirs(directory, exist_ok=True)
            with open(os.path.join(directory, filename), 'wb') as f:
                f.write(content)
return item

class LibiPipeline:
def open_spider(self, spider:Spider):
pass

def process_item(self, item:Item, spider):
if isinstance(item, LibiBookItem):
adapter = ItemAdapter(item)
bid = adapter['bid']
title = adapter['title']
author = adapter['author']
category = adapter['category']
description = adapter['description']
with MysqlPool.connect() as conn:
with conn.cursor() as cur:
try:
cur.execute('INSERT IGNORE INTO libi_books(bid,title,author,category,description) VALUES (%s,%s,%s,%s,%s)', (bid,title,author,category,description))
conn.commit()
except pymysql.err.Error as e:
conn.rollback()
raise e
elif isinstance(item, LibiChapterItem):
adapter = ItemAdapter(item)
bid = adapter['bid']
cid = adapter['cid']
volume = adapter['volume']
chapter = adapter['chapter']
content = adapter['content']
with MysqlPool.connect() as conn:
with conn.cursor() as cur:
try:
cur.execute('INSERT IGNORE INTO libi_chapters(bid,cid,volume,chapter,content) VALUES (%s,%s,%s,%s,%s)', (bid,cid,volume,chapter,content))
conn.commit()
except pymysql.err.Error as e:
conn.rollback()
raise e
elif isinstance(item, LibiImageItem):
adapter = ItemAdapter(item)
cid = adapter['cid']
filename = adapter['filename']
content = adapter['content']
            # Save the image as img/libi/<cid // 10000>/<cid>/<filename>.
            directory = os.path.join('./img/libi', str(cid // 10000), str(cid))
            os.makedirs(directory, exist_ok=True)
            with open(os.path.join(directory, filename), 'wb') as f:
                f.write(content)
return item

def close_spider(self, spider):
pass
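The pipelines assume the `lightnovel_spider` database already contains the target tables. A plausible schema sketch inferred from the INSERT statements above; column types and the primary keys that let `INSERT IGNORE` deduplicate are assumptions, not the committed schema:

```python
# Hypothetical DDL for reference; pymysql executes one statement per call,
# so run these individually. libi_books/libi_chapters mirror the wenku8
# tables with bid in place of aid and no tags column.
WENKU8_BOOKS = '''
CREATE TABLE IF NOT EXISTS wenku8_books (
    aid         INT UNSIGNED PRIMARY KEY,
    title       VARCHAR(255) NOT NULL,
    author      VARCHAR(255),
    category    VARCHAR(64),
    tags        VARCHAR(255),
    description TEXT
)'''
WENKU8_CHAPTERS = '''
CREATE TABLE IF NOT EXISTS wenku8_chapters (
    aid     INT UNSIGNED NOT NULL,
    cid     INT UNSIGNED PRIMARY KEY,
    volume  VARCHAR(255),
    chapter VARCHAR(255),
    content MEDIUMTEXT
)'''
```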