1
+ # coding:utf-8
2
+ import re
3
+ import json
4
+ import os
5
+ import threading
6
+ import time
7
+ import requests
8
+ from requests .exceptions import RequestException
9
+ from bs4 import BeautifulSoup
10
+
11
+ import random
12
+
13
def get_proble_set(url, timeout=10):
    """Fetch ``url`` with HTTP GET and return the response body as text.

    Args:
        url: Address to request.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The response text when the status code is 200; ``None`` for any
        other status code or when ``requests`` raises a
        ``RequestException`` (which includes timeouts).
    """
    try:
        # Without an explicit timeout a stalled server blocks this call forever.
        response = requests.get(url, timeout=timeout)
    except RequestException:
        return None
    if response.status_code == 200:
        return response.text
    return None
21
+
22
def parse_proble_set(problemSet, start=930, delay=1):
    """Start one download thread for every problem not yet saved on disk.

    A problem counts as already saved when ``<slug>.json`` or
    ``[no content]<slug>.json`` exists in the current working directory.

    Args:
        problemSet: Sequence of entries whose
            ``["stat"]["question__title_slug"]`` holds the problem slug.
        start: Index of the first entry to process. Defaults to 930, the
            resume point that used to be hard-coded in the loop.
        delay: Seconds to sleep before starting each download thread.
    """
    for i in range(start, len(problemSet)):
        title = problemSet[i]["stat"]["question__title_slug"]
        already_saved = (
            os.path.exists("[no content]{}.json".format(title))
            or os.path.exists("{}.json".format(title))
        )
        if already_saved:
            print(i, "has been parsed.")
            continue

        # Space the requests out; each download then runs in its own thread.
        time.sleep(delay)
        worker = threading.Thread(target=construct_url, args=(title,))
        worker.start()

        # The thread has only been started at this point; it may still be running.
        print(i, "is done.")
40
+
41
def construct_url(problemTitle):
    """Build the description-page URL for a problem slug and download it.

    Args:
        problemTitle: Slug inserted between the problems base URL and the
            ``/description/`` suffix; also passed on as the problem title.
    """
    pieces = ("https://leetcode.com/problems/", problemTitle, "/description/")
    get_proble_content("".join(pieces), problemTitle)
45
+
46
def save_problem(title, content):
    """Write ``content`` to ``<title>.html`` as UTF-8 text.

    Args:
        title: File name (or path) without the ``.html`` extension.
        content: Text to store; any existing file is overwritten.
    """
    html_path = "".join((title, ".html"))
    with open(html_path, 'w+', encoding="utf-8") as out:
        out.write(content)
51
+
52
# GraphQL document sent with the "questionData" operation (an older
# "getQuestionDetail" operation was previously used in its place).
_QUESTION_DATA_QUERY = "query questionData($titleSlug: String!) {\n question(titleSlug: $titleSlug) {\n questionId\n questionFrontendId\n boundTopicId\n title\n titleSlug\n content\n translatedTitle\n translatedContent\n isPaidOnly\n difficulty\n likes\n dislikes\n isLiked\n similarQuestions\n exampleTestcases\n categoryTitle\n contributors {\n username\n profileUrl\n avatarUrl\n __typename\n }\n topicTags {\n name\n slug\n translatedName\n __typename\n }\n companyTagStats\n codeSnippets {\n lang\n langSlug\n code\n __typename\n }\n stats\n hints\n solution {\n id\n canSeeDetail\n paidOnly\n hasVideoSolution\n paidOnlyVideo\n __typename\n }\n status\n sampleTestCase\n metaData\n judgerAvailable\n judgeType\n mysqlSchemas\n enableRunCode\n enableTestMode\n enableDebugger\n envInfo\n libraryUrl\n adminUrl\n challengeQuestion {\n id\n date\n incompleteChallengeCount\n streakCount\n type\n __typename\n }\n __typename\n }\n }\n "

# Extracts the csrftoken value from a Set-Cookie header string.
_CSRF_TOKEN_PATTERN = re.compile("csrftoken=(.*?);.*?", re.S)


def get_proble_content(problemUrl, title, timeout=10):
    """Download one problem through the GraphQL endpoint and save it.

    The problem page is requested first to obtain a ``csrftoken`` from its
    ``Set-Cookie`` header; that token is then sent with a ``questionData``
    query to ``https://leetcode.com/graphql``. When the returned question
    has content, the whole response is saved as ``<title>.json`` and the
    content as ``<title>.html``; otherwise the response is saved as
    ``[no content]<title>.json``.

    Any failure is printed with an ``[error]`` prefix instead of being
    raised, so a worker thread running this function never dies with an
    unhandled exception.

    Args:
        problemUrl: URL of the problem description page; also sent as the
            ``referer`` header.
        title: Problem slug, used as the GraphQL ``titleSlug`` variable and
            as the base name of the output files.
        timeout: Seconds to wait for each HTTP request.
    """
    try:
        # The first request is inside the try so that network errors and a
        # missing header are reported like every other failure.
        response = requests.get(problemUrl, timeout=timeout)
        setCookie = response.headers.get("Set-Cookie", "")
        match = _CSRF_TOKEN_PATTERN.search(setCookie)
        if match is None:
            print("[error] ", "no csrftoken found in Set-Cookie header", problemUrl)
            return
        token = match.group(1)

        url = "https://leetcode.com/graphql"
        data = {
            "operationName": "questionData",
            "variables": {"titleSlug": title},
            "query": _QUESTION_DATA_QUERY,
        }
        headers = {
            'x-csrftoken': token,
            'referer': problemUrl,
            'content-type': 'application/json',
            'origin': 'https://leetcode.com',
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.139 Safari/537.36'
        }
        # NOTE(review): the fixed cookie values below were presumably copied
        # from a browser session -- confirm they are still needed.
        cookies = {
            '__cfduid': 'd9ce37537c705e759f6bea15fffc9c58b1525271602',
            '_ga': 'GA1.2.5783653.1525271604',
            '_gid': 'GA1.2.344320119.1533189808',
            'csrftoken': token,
            # The key used to be ' _gat' with a stray leading space, which is
            # not a valid cookie name.
            '_gat': '1'
        }

        # The payload is sent as a JSON document.
        dumpJsonData = json.dumps(data)
        response = requests.post(
            url,
            data=dumpJsonData,
            headers=headers,
            cookies=cookies,
            timeout=timeout,
        )
        dictInfo = json.loads(response.text)
        content = dictInfo["data"]["question"].get("content")
        if content is not None:
            saveJSON(dictInfo, title + ".json")
            save_problem(title, content)
        else:
            saveJSON(dictInfo, "[no content]" + title + ".json")
    except Exception as e:
        # Deliberately broad: report the failure for this problem and move on.
        print("[error] ", e, problemUrl)
101
+
102
def saveJSON(data, filename):
    """Serialize ``data`` as indented JSON into ``filename``.

    The file is written as UTF-8 with non-ASCII characters kept as-is and
    an indentation of four spaces; an existing file is overwritten.

    Args:
        data: JSON-serializable object to store.
        filename: Path of the file to write.
    """
    dump_options = {"ensure_ascii": False, "indent": 4}
    with open(filename, mode='w', encoding='utf-8') as sink:
        json.dump(data, sink, **dump_options)
105
+
106
def main():
    """Load the cached English problem list and download every problem.

    Reads ``[en]json2-problemset.json`` from the current working directory
    and hands the decoded list to ``parse_proble_set``.
    """
    # The cache file is not produced by this function. Commented-out code
    # that used to live here built it by fetching
    # "https://leetcode.com/api/problems/all/" with get_proble_set(), taking
    # the "stat_status_pairs" entry of the decoded JSON and writing it with
    # saveJSON() (the full response went to "[en]json1-origin-data.json").
    # The same steps against "https://leetcode-cn.com/api/problems/all/"
    # produced the matching "[cn]..." files.

    # Use a context manager so the file handle is closed after loading.
    with open("[en]json2-problemset.json", 'r', encoding='utf-8') as f:
        problemset = json.load(f)
    parse_proble_set(problemset)
122
+
123
+
124
if __name__ == '__main__':
    # Work inside the output directory, creating it on the first run, so
    # that every file is read from and written to that directory.
    if not os.path.exists("算法题"):
        os.mkdir("算法题")
    os.chdir("算法题")
    main()
0 commit comments