From d511b3eac886b1e37afdff14e8d4b89657ce1909 Mon Sep 17 00:00:00 2001 From: Struan Donald Date: Wed, 15 Mar 2017 17:37:58 +0000 Subject: [PATCH] better parsing for Lords Amendments rather than just parsing it all into a single line of text, parse all the paragraphs and indents so that we try and retain a bit more structure. --- pyscraper/new_hansard.py | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) diff --git a/pyscraper/new_hansard.py b/pyscraper/new_hansard.py index 05a3f9a9c..2f7265bdc 100755 --- a/pyscraper/new_hansard.py +++ b/pyscraper/new_hansard.py @@ -1663,12 +1663,17 @@ def parse_tabledby(self, tabledby): ) def parse_amendment(self, amendment): - self.parse_para_with_member( - amendment, - None, - css_class='italic', - pwmotiontext='unrecognized' - ) + # Amendments are often things like: + # + # 54: + # Clause 67, page 30, line 9, leave out “high” and insert + # “higher” + # + # so we need to parse the tags to make sure we get the + # indenting etc + for tag in amendment.getchildren(): + tag_name = self.get_tag_name_no_ns(tag) + self.handle_tag(tag_name, tag) def parse_clause_heading(self, heading): tag = etree.Element('p')