Skip to content

Commit 852051f

Browse files
authored
Merge pull request #586 from decipher07/main
feat: adding codeforces scrapper
2 parents 5267081 + b2f919c commit 852051f

File tree

3 files changed

+163
-0
lines changed

3 files changed

+163
-0
lines changed

codeforces_scraper/README.md

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,19 @@
1+
# Codeforces Scraper
2+
This Python script lets the user scrape problem statements from Codeforces.
3+
4+
5+
## Prerequisite Steps:
6+
Install the required packages by running the following command in your terminal. (Make sure you're in the same project directory.)
7+
8+
```
9+
pip3 install -r requirements.txt
10+
11+
```
12+
13+
## Running the script:
14+
After installing all the requirements, run this command in your terminal.
15+
16+
```
17+
python3 script.py
18+
19+
```

codeforces_scraper/requirements.txt

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
beautifulsoup4
2+
requests

codeforces_scraper/script.py

Lines changed: 142 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,142 @@
1+
import requests
2+
import json
3+
import bs4
4+
import re
5+
6+
# Pre-compiled pattern matching a single HTML tag (non-greedy, so text
# between tags is preserved).
CLEANR = re.compile("<.*?>")


def cleanhtml(raw_html):
    """Return *raw_html* with every HTML tag removed."""
    return CLEANR.sub("", raw_html)
12+
13+
14+
def split_value_and_unit(soup):
    """Split a limit string such as "256 megabytes" into its parts.

    Returns a dict {"value": <int>, "unit": <str>} built from the first
    two whitespace-separated tokens.
    """
    tokens = soup.split()
    value = int(tokens[0])
    return {"value": value, "unit": tokens[1]}
17+
18+
19+
def test_group(lst):
20+
return [{"input": _in, "output": _out} for _in, _out in pairwise(lst)]
21+
22+
23+
def test_sample(souped_html):
    """Extract the sample tests from a parsed problem page.

    Every <pre> block's contents are collected, then paired up as
    input/output samples.
    """
    pre_blocks = get_tags_contents(souped_html, "pre")
    return test_group(pre_blocks)
25+
26+
27+
def get_tags_contents(souped_html, tag_name, class_name=None):
    """Return the concatenated inner HTML of each matching element.

    Finds every *tag_name* element (optionally filtered by CSS class
    *class_name*) and joins its child nodes into one string per element.
    """
    results = []
    for element in souped_html.find_all(tag_name, class_name):
        results.append(concat_contents(element.contents))
    return results
32+
33+
34+
def pairwise(iterable):
    """Yield non-overlapping pairs: s -> (s0, s1), (s2, s3), ...

    Both zip arguments share one iterator, so elements are consumed two
    at a time; a trailing odd element is silently dropped.
    """
    shared = [iter(iterable)] * 2
    return zip(*shared)
37+
38+
39+
def get_statement(soup):
    """Return the problem-statement HTML (the element right after the header div)."""
    statement_node = soup.find("div", "header").next_sibling
    return concat_contents(statement_node.contents)
41+
42+
43+
def get_content(soup, _class=""):
    """Return the inner HTML of the first <div> with class *_class*.

    The div's first child (its section title) is dropped before joining.
    Returns None when no matching div is found.
    """
    section = soup.find("div", _class)
    if not section:
        return None
    children = section.contents
    children.pop(0)  # discard the section-title node
    return concat_contents(children)
50+
51+
52+
def concat_contents(ls):
    """Join the string form of every item in *ls* into a single string."""
    return "".join(map(str, ls))
54+
55+
56+
def scrap_wraper(problem_link):
    """Fetch a Codeforces problem page and parse it into a plain dict.

    NOTE(review): the name "scrap_wraper" is misspelled but kept as-is for
    backward compatibility with existing callers.
    """
    page_html = requests.get(problem_link).text
    soup = bs4.BeautifulSoup(page_html, "html.parser")

    # The limit divs hold a label node first; the value text is child [1].
    time_limit = split_value_and_unit(
        soup.find("div", "time-limit").contents[1].string
    )
    memory_limit = split_value_and_unit(
        soup.find("div", "memory-limit").contents[1].string
    )

    return {
        "title": soup.find("div", "title").string,
        "timeLimit": time_limit,
        "memoryLimit": memory_limit,
        "statement": get_statement(soup),
        "inputSpecification": get_content(soup, "input-specification"),
        "outputSpecification": get_content(soup, "output-specification"),
        "samples": test_sample(soup),
        "note": get_content(soup, "note"),
    }
74+
75+
76+
def get_all_problems():
    """Fetch the full Codeforces problemset and print it as pretty JSON.

    Prints an error message (rather than raising) on a non-200 response.
    """
    url = "https://codeforces.com/api/problemset.problems"
    print(url)

    response = requests.get(url)

    # Guard clause: bail out early on any non-OK status.
    if response.status_code != 200:
        print("SORRY! SERVER ERROR EXISTS")
        return

    payload = response.json()
    print(json.dumps(payload["result"]["problems"], sort_keys=True, indent=4))
87+
88+
89+
def get_all_problems_by_tag(tag):
    """Print every problemset entry whose tag list contains *tag*.

    Fetches the full problemset from the Codeforces API and filters it
    locally; prints an error message instead of raising on a non-200
    response.
    """
    url = "https://codeforces.com/api/problemset.problems"

    r = requests.get(url)

    if r.status_code == 200:
        data = r.json()
        # Each entry is a problem dict (the old loop variable was
        # misleadingly named "index").
        for problem in data["result"]["problems"]:
            # Idiomatic membership test instead of list.count() — same
            # truthiness, O(n) either way, but clearer intent.
            if tag in problem["tags"]:
                print(problem)

    else:
        print("SORRY! SERVER ERROR EXISTS")
104+
105+
106+
def get_problem_statement_by_id_and_index(id, index):
    """Print the statement and I/O specifications of one problem, tags stripped.

    *id* is the contest id and *index* the problem letter (e.g. "A").
    NOTE: the parameter names shadow builtins but are kept for backward
    compatibility with keyword-argument callers.
    """
    url = "https://codeforces.com/problemset/problem/" + id + "/" + index
    data = scrap_wraper(url)
    # BUG FIX: get_content() returns None when a section is absent, and
    # cleanhtml(None) raises TypeError inside re.sub — guard each section.
    for section in ("statement", "inputSpecification", "outputSpecification"):
        html = data[section]
        if html is not None:
            print(cleanhtml(html))
112+
113+
114+
def main():
    """Run the interactive menu loop.

    Repeats until the user answers anything other than "YES" at the
    continue prompt (comparison is case-sensitive, as originally written).
    """
    ch = "YES"
    while ch == "YES":
        print("PLEASE SELECT ANY ONE OF THE BELOW :")
        print("\n1. GET ALL PROBLEMS")
        print("\n2. GET ALL PROBLEMS BY TAGS \n3. GET PROBLEM STATEMENT ")

        # BUG FIX: a non-numeric choice used to crash with ValueError;
        # re-show the menu instead.
        try:
            answer = int(input())
        except ValueError:
            print("PLEASE ENTER A NUMBER (1-3)")
            continue

        if answer == 1:
            get_all_problems()

        elif answer == 2:
            print("\nPlease Enter Your Tag : ")
            tag = input()
            get_all_problems_by_tag(tag)

        elif answer == 3:
            print("\nPlease Enter Id and Index as Follows : \nId : ")
            id = input()
            print("\nIndex : ")
            index = input()
            get_problem_statement_by_id_and_index(id, index)

        ch = input("WOULD YOU LIKE TO CONTINUE : ")
139+
140+
141+
# Run the interactive menu only when executed as a script (not on import).
if __name__ == "__main__":
    main()

0 commit comments

Comments
 (0)