-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathcount-rfcs.py
207 lines (194 loc) · 6.53 KB
/
count-rfcs.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
import http.client
import re
import json
import sys
import pprint
from bs4 import BeautifulSoup
# User-tunable settings. The README explains each one in more detail.

# Person to look for. Compared against author, shepherd, responsible AD and
# balloter names, and searched for in the RFC text.
name = 'Martin Duke'
# Whether RFCs whose Status is INFORMATIONAL / EXPERIMENTAL are examined.
include_informational, include_experimental = False, True
# Whether the RFC text itself is searched for 'name'.
include_acknowledgments = True
# Scanning stops at the first RFC published before this year...
first_year = 2013
# ...or numbered below 'first_rfc'; RFCs numbered above 'last_rfc' are skipped.
first_rfc, last_rfc = 4614, 20000
# RFCs from this year onward bypass the stream/area/WG filters and are also
# checked for ballots.
first_ad_year = 2020

# Stream/area/WG filters. They only apply to RFCs published before
# 'first_ad_year'.
# Matching any of these is sufficient.
# all values: streams = [ 'IAB', 'IETF', 'INDEPENDENT', 'IRTF' ]
# (the IETF stream is accepted even when it is not listed)
streams = list()
# all values: areas = [ 'art', 'gen', 'int', 'ops', 'rtg', 'sec', 'tsv' ]
areas = ['tsv']
# all values: too many to list here; include 'NON' for non-WG; 'IESG' is
# also accepted
wgs = list()

# Result maps, each filled as RFC number -> RFC title.
author, responsible_ad, shepherd = dict(), dict(), dict()
contributor, balloted = dict(), dict()
retrieve_error = dict()
# not yet supported
art_reviewer = dict()
# Download the RFC index page from the RFC Editor. Nothing else in the script
# can run without it, so every failure here ends the run with a non-zero
# exit status.
rfced = http.client.HTTPSConnection('www.rfc-editor.org')
try:
    # request() opens the socket and read() pulls the body, so both can fail
    # just like getresponse(); all three are covered by the handler.
    rfced.request('GET', '/rfc-index2.html')
    rfced_resp = rfced.getresponse()
    rfc_list = rfced_resp.read()
except (OSError, http.client.HTTPException) as e:
    print(type(e).__qualname__)
    sys.exit(1)
if rfced_resp.status != 200:
    # An error page would not contain the table that is parsed below.
    print("Could not retrieve RFC index: HTTP " + str(rfced_resp.status))
    sys.exit(1)
if rfced_resp.closed:
    # NOTE(review): presumably meant to start over with a fresh connection
    # once the server has closed this one -- confirm. The old object is closed
    # first so that it is not leaked.
    rfced.close()
    rfced = http.client.HTTPSConnection('www.rfc-editor.org')

# Set up datatracker connection
dt = http.client.HTTPSConnection('datatracker.ietf.org')

rfc_soup = BeautifulSoup(rfc_list, 'html.parser')
# Find third table in document: start at the first <table> and step to its
# next <table> sibling twice.
table = rfc_soup.table
for _ in range(2):
    if table is not None:
        table = table.find_next_sibling("table")
if table is None:
    # The page layout is not what this script expects.
    print("Could not find the RFC table in the index page")
    sys.exit(1)
# Errors that mean a fetch failed at the network or HTTP-protocol level.
_FETCH_ERRORS = (OSError, http.client.HTTPException)
# Upper bound on tries per document, so an unreachable server cannot keep the
# script spinning forever.
_MAX_FETCH_ATTEMPTS = 5
# Bracketed publication date and "Key: Value" metadata pairs, as they appear
# in the text of an index row.
_DATE_RE = re.compile(r"\[[ a-zA-Z0-9]+\]")
_FIELD_RE = re.compile(r"[a-zA-Z0-9]+\: [a-zA-Z0-9]+")


def _http_get(conn, path, attempts=_MAX_FETCH_ATTEMPTS):
    """Return the body (bytes) of a GET request for *path* sent on *conn*.

    *conn* is an http.client connection object. The request is tried up to
    *attempts* times (expected to be >= 1). After each failure the connection
    is closed, which returns it to an idle state; http.client connects again
    by itself on the next request(). The HTTP status is not inspected.

    Raises the last OSError / http.client.HTTPException if every attempt
    fails.
    """
    for attempt in range(1, attempts + 1):
        try:
            conn.request('GET', path)
            return conn.getresponse().read()
        except _FETCH_ERRORS:
            conn.close()
            if attempt == attempts:
                raise


# Walk the rows of the index table and file each RFC that involves 'name'
# into exactly one result map (author, shepherd, responsible_ad, contributor
# or balloted). Each row produces one line of progress output.
for row in table.contents:
    # Skip everything that is not a table row carrying an RFC number.
    if row.name != 'tr':
        continue
    if row.td is None or row.td.noscript is None:
        continue
    rfcnum = row.td.noscript.get_text()
    print(rfcnum, end=" ")
    # Break after documents I couldn't possibly have affected
    if int(rfcnum) < first_rfc:
        print("RFC number too early")
        break
    if int(rfcnum) > last_rfc:
        # Terminate the progress line started above before skipping.
        print("RFC number too late")
        continue

    # Check other metadata before querying datatracker
    info_cell = row.td.find_next_sibling("td")
    longline = info_cell.get_text()
    if 'Not Issued' in longline:
        print("Not issued")
        continue
    title = info_cell.b.get_text()
    print('"' + title, end='": ')

    # Publication year, taken from the first bracketed run of letters, digits
    # and spaces in the row text.
    year = None
    date_match = _DATE_RE.search(longline)
    if date_match is not None:
        dmy = date_match[0].strip("[]").split(" ")
        try:
            # Second-to-last piece; presumably the bracket content ends with
            # a space, which makes the last piece empty -- verify against the
            # index page.
            year = int(dmy[len(dmy) - 2])
        except ValueError:
            pass
    if year is None:
        print("Could not parse publication date")
        continue
    if year < first_year:
        print("RFC year too early")
        break

    # "Key: Value" pairs found in the row text. A missing key is read as ''
    # so that an incomplete row is filtered out instead of raising KeyError.
    doc = dict(m.group().split(': ') for m in _FIELD_RE.finditer(longline))
    stream = doc.get("Stream", "")
    area = doc.get("Area", "")
    wg = doc.get("WG", "")
    status = doc.get("Status", "")

    # Do not apply stream, area, wg filters for ADs
    if year < first_ad_year:
        if stream != 'IETF' and stream not in streams:
            print(stream + " stream not tracked")
            continue
        if wg == 'NON' and 'NON' not in wgs:
            print("Discarding No working group")
            continue
        if wg == 'IESG' and 'IESG' not in wgs:
            print("Discarding IESG working group")
            continue
        # NOTE(review): a document accepted by the stream test above must
        # still match on area or WG here -- confirm that this is intended.
        if not (area in areas or wg in wgs):
            print(area + " and " + wg + " don't match")
            continue
    # NOTE(review): the status filters are applied to every year here; the
    # original nesting could not be recovered with certainty -- confirm.
    if not include_informational and status == 'INFORMATIONAL':
        print("Discarding INFORMATIONAL")
        continue
    if not include_experimental and status == 'EXPERIMENTAL':
        print("Discarding EXPERIMENTAL")
        continue

    # Get the datatracker metadata
    try:
        dt_json = _http_get(dt, '/doc/rfc' + rfcnum + '/doc.json')
    except _FETCH_ERRORS as e:
        retrieve_error[rfcnum] = title
        print("Could not retrieve .json: " + type(e).__qualname__)
        continue
    try:
        dt_data = json.loads(dt_json)
    except ValueError:
        # Covers invalid JSON as well as bytes that cannot be decoded.
        dt_data = None
    if not isinstance(dt_data, dict):
        retrieve_error[rfcnum] = title
        print("Could not parse .json")
        continue

    # Check for author, shepherd, AD
    if any(entry.get('name') == name
           for entry in dt_data.get('authors') or []):
        print("Author")
        author[rfcnum] = title
        continue
    # Shepherd and AD values are reduced to the part before " <".
    shep = dt_data.get('shepherd')
    if shep is not None and shep.split(" <")[0] == name:
        print("Shepherd")
        shepherd[rfcnum] = title
        continue
    resp_ad = dt_data.get('ad')
    if resp_ad is not None and resp_ad.split(" <")[0] == name:
        print("Responsible AD")
        responsible_ad[rfcnum] = title
        continue

    # Check the text of the RFC for acknowledgments
    if include_acknowledgments:
        try:
            rfc_bytes = _http_get(rfced, '/rfc/rfc' + rfcnum + '.txt')
        except _FETCH_ERRORS as e:
            retrieve_error[rfcnum] = title
            print("Could not retrieve .txt: " + type(e).__qualname__)
            continue
        # A stray undecodable byte must not abort the whole run.
        rfc_txt = rfc_bytes.decode('utf-8', errors='replace')
        if name in rfc_txt:
            print("Contributor")
            contributor[rfcnum] = title
            continue

    # Ballots are only looked up for RFCs from 'first_ad_year' onward.
    if year < first_ad_year:
        print("Name did not appear")
        continue

    # Check if balloted
    try:
        ballot_html = _http_get(dt, '/doc/rfc' + rfcnum + '/ballot/')
    except _FETCH_ERRORS as e:
        retrieve_error[rfcnum] = title
        print("Could not retrieve ballot: " + type(e).__qualname__)
        continue
    ballot_soup = BeautifulSoup(ballot_html, 'html.parser')
    for balloter in ballot_soup.select('div[class="balloter-name"]'):
        if balloter.a is None:  # did not review
            continue
        ad_name = balloter.get_text().strip()
        # The name is matched both bare and wrapped in parentheses.
        if ad_name in (name, '(' + name + ')'):
            print("Balloted")
            balloted[rfcnum] = title
            break
    else:
        # No balloter entry matched.
        print("Name did not appear")
# Both servers are no longer needed once the scan is over.
for connection in (rfced, dt):
    connection.close()

# Final report: for each category, a heading with the number of matching RFCs
# followed by the pretty-printed RFC number -> title map.
summary = (
    ("Authored: ", author),
    ("Shepherded:", shepherd),
    ("Responsible AD: ", responsible_ad),
    ("Balloted: ", balloted),
    ("Acknowledged: ", contributor),
    ("Document retrieval errors: ", retrieve_error),
)
for heading, matches in summary:
    print(heading + str(len(matches)))
    pprint.pprint(matches)