-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathxiaochun.py
81 lines (68 loc) · 2.96 KB
/
xiaochun.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
# coding=utf-8
from selenium import webdriver
import requests
from bs4 import BeautifulSoup
from urllib.request import urlopen
from urllib.request import Request
import re
import os
downloadDir = './sns/'  # download destination for the saved text files
# Create the destination folder up front. exist_ok=True replaces the
# check-then-create sequence, which could race with another process.
os.makedirs(downloadDir, exist_ok=True)
class xiaochun():
    """Scraper that saves threads of an incnjp.com forum board as text files.

    The class is used purely as a namespace: every method is static and is
    called as ``xiaochun.<method>(...)``.
    """

    # Search
    @staticmethod
    def search():
        """Open the forum index page in Firefox and collect the thread URLs.

        No login is performed; the index page is loaded directly.

        Returns:
            list: ``href`` values of the matched anchors that contain ``http``.
        """
        base_url = "https://www.incnjp.com/forum-92-1.html"
        # XPath of the <a> tags that carry the thread URLs on the index page.
        link_xpath = "/html/body/div[6]/div[4]/div/div/div[4]/div[2]/form/table/tbody[*]/tr/th/a[2]"
        driver = webdriver.Firefox()
        try:
            driver.get(base_url)
            # find_elements("xpath", ...) is available on Selenium 3 and 4;
            # find_elements_by_xpath was removed in Selenium 4.3.
            anchors = driver.find_elements("xpath", link_xpath)
            # Read the attributes while the browser session is still alive.
            hrefs = [anchor.get_attribute("href") for anchor in anchors]
        finally:
            # Always shut the browser down, even if the page failed to load.
            driver.quit()
        # Anchors without an href yield None; drop them along with non-http links.
        urls = [href for href in hrefs if href and 'http' in href]
        print("文書の数:", len(urls))
        return urls

    # Fetch each thread by URL
    @staticmethod
    def get_name_content(urls):
        """Download every thread in *urls* and save it through ``save_book``.

        For each page the text of the first ``<h1>`` is used as the title, and
        the children of the first element whose ``id`` matches
        ``postmessage.*`` (presumably the opening post body) become the
        content. Children whose markup contains ``<br/>`` are left out.
        Pages lacking either element are skipped with a message.

        Args:
            urls: iterable of thread page URLs.
        """
        post_id = re.compile("postmessage.*")  # compiled once, reused per page
        for url in urls:
            req = Request(url, headers={'User-Agent': 'Mozilla/5.0'})
            with urlopen(req) as response:  # close the connection promptly
                html = response.read()
            soup = BeautifulSoup(html, "lxml")
            headings = soup.findAll('h1')
            posts = soup.findAll(id=post_id)
            if not headings or not posts:
                # Unexpected page layout: skip it rather than abort the
                # whole run with an IndexError.
                print("文書をスキップ:" + url)
                continue
            title = headings[0].text
            print("文書title:" + title)
            # The title is written first, followed by the content pieces.
            chapters = [title]
            chapters.extend(
                piece for piece in map(str, posts[0]) if '<br/>' not in piece
            )
            xiaochun.save_book(title, chapters)

    # Write to a txt file
    @staticmethod
    def save_book(bookName, chapters, out_dir=None):
        """Write *chapters* to ``<out_dir>/<bookName>.txt`` encoded as UTF-8.

        Pieces are written back to back with no separator. Pieces that start
        with ``<div`` are dropped.

        Args:
            bookName: title used for the file name. Line breaks are removed
                and characters that are invalid in file names (or that would
                act as path separators) are replaced by ``_``.
            chapters: iterable of strings to write.
            out_dir: destination folder; defaults to the module-level
                ``downloadDir``.
        """
        if out_dir is None:
            out_dir = downloadDir
        flat_name = bookName.replace("\r", "").replace("\n", "")
        # The title comes from a web page, so it must not be able to break
        # ``open`` or point outside the destination folder.
        safe_name = re.sub(r'[\\/:*?"<>|]', "_", flat_name)
        path = os.path.join(out_dir, safe_name + '.txt')
        with open(path, 'w', encoding='utf-8') as out:
            # Filter whole pieces: iterating a piece character by character
            # would never see the '<div' prefix.
            out.writelines(
                piece for piece in chapters if not piece.startswith('<div')
            )
if __name__ == "__main__":
    # Script entry point: gather the thread URLs from the forum index, then
    # download each thread and save it as a text file.
    xiaochun.get_name_content(xiaochun.search())