diff --git a/pyscraper/regmem/filter.py b/pyscraper/regmem/filter.py
index e5347e0e3..8ac8fb443 100755
--- a/pyscraper/regmem/filter.py
+++ b/pyscraper/regmem/filter.py
@@ -4,7 +4,7 @@
 import re
 import os
 
-from BeautifulSoup import BeautifulStoneSoup
+from bs4 import BeautifulSoup
 from contextexception import ContextException
 from resolvemembernames import memberList
 from xmlfilewrite import WriteXMLHeader
@@ -25,7 +25,7 @@ def RunRegmemFilters2010(fout, text, sdate, sdatever):
 
     memberset = set()
     text = re.sub('([^<]*?)', r'\1', text)
-    t = BeautifulStoneSoup(text)
+    t = BeautifulSoup(text, 'xml')
     for page in t('page'):
         title = page.h2.renderContents()
         if title in ('HAGUE, Rt Hon William (Richmond (Yorks)', 'PEARCE, Teresa (Erith and Thamesmead'):
diff --git a/pyscraper/regmem/pullgluepages.py b/pyscraper/regmem/pullgluepages.py
index b348e3d96..15777e0e3 100755
--- a/pyscraper/regmem/pullgluepages.py
+++ b/pyscraper/regmem/pullgluepages.py
@@ -9,7 +9,7 @@
 import os.path
 import time
 import tempfile
-import BeautifulSoup
+from bs4 import BeautifulSoup
 import miscfuncs
 
 toppath = miscfuncs.toppath
@@ -37,8 +37,8 @@ def GlueByContents(fout, url_contents, regmemdate, remote):
     sr = ur.read()
     ur.close()
 
-    soup = BeautifulSoup.BeautifulSoup(sr)
-    mps = soup.find('a', attrs={'name':'A'}).parent.findNextSiblings('p')
+    soup = BeautifulSoup(sr, 'lxml')
+    mps = soup.find('a', attrs={'name':'A'}).parent.find_next_siblings('p')
     for p in mps:
         url = urlparse.urljoin(url_contents, p.a['href'])
         url = url.encode('utf-8')
@@ -61,9 +61,9 @@ def GlueByContents(fout, url_contents, regmemdate, remote):
             (url, time.strftime('%Y-%m-%d', lt), time.strftime('%X', lt)))
 
     sr = re.sub(']*)/>', r'', sr)
-    soup_mp = BeautifulSoup.BeautifulSoup(sr)
+    soup_mp = BeautifulSoup(sr, 'lxml')
     try:
-        page = soup_mp.find('h1').findNextSiblings(lambda t: t.name != 'div')
+        page = soup_mp.find('h1').find_next_siblings(lambda t: t.name != 'div')
     except:
         print 'Problem with ' + url.decode('utf-8')
     page = '\n'.join([ str(p) for p in page ]) + '\n'
@@ -195,9 +195,9 @@ def FindRegmemPages(remote):
         sys.exit("Cloudflare please wait page, cannot proceed")
     ur.close()
 
-    soup = BeautifulSoup.BeautifulSoup(content)
+    soup = BeautifulSoup(content, 'lxml')
     soup = soup.find(attrs='main-body').find('ul')
-    ixurls = [urlparse.urljoin(idxurl, ix['href']) for ix in soup.findAll('a', href=True)]
+    ixurls = [urlparse.urljoin(idxurl, ix['href']) for ix in soup.find_all('a', href=True)]
 
     for ixurl in ixurls:
        ur = opener.open(ixurl)
@@ -213,7 +213,7 @@ def FindRegmemPages(remote):
             '\g<0>', content)
 
-        soup = BeautifulSoup.BeautifulSoup(content)
+        soup = BeautifulSoup(content, 'lxml')
 
         if soup.find(text=re.compile('^Contents$(?i)')):
             # An immediate register page.
@@ -229,7 +229,7 @@ def FindRegmemPages(remote):
             if (date, ixurl) not in urls:
                 urls.append((date, ixurl))
         elif re.search('Session 201[79]|Session 20[2-9]', content):
-            allurl_soups = soup.findAll('a', href=re.compile("(memi02|part1contents|/contents\.htm)"))
+            allurl_soups = soup.find_all('a', href=re.compile("(memi02|part1contents|/contents\.htm)"))
             for url_soup in allurl_soups:
                 url = url_soup['href']
                 url = urlparse.urljoin(ixurl, url)
@@ -240,7 +240,7 @@ def FindRegmemPages(remote):
             if (date, url) not in urls:
                 urls.append((date, url))
         else:
-            allurl_soups = soup.findAll('a', href=re.compile("(memi02|part1contents|/contents\.htm)"))
+            allurl_soups = soup.find_all('a', href=re.compile("(memi02|part1contents|/contents\.htm)"))
             for url_soup in allurl_soups:
                 row_content = url_soup.findParent('tr').renderContents()
                 url = url_soup['href']
diff --git a/pyscraper/wa/parse.py b/pyscraper/wa/parse.py
index abc8ac345..e1b144eca 100755
--- a/pyscraper/wa/parse.py
+++ b/pyscraper/wa/parse.py
@@ -10,7 +10,7 @@
 sys.path.append('../')
 
 from contextexception import ContextException
-from BeautifulSoup import BeautifulStoneSoup, Tag
+from bs4 import BeautifulSoup
 import codecs
 
 streamWriter = codecs.lookup('utf-8')[-1]
@@ -144,7 +144,7 @@ def display_vote(self, item, speech):
         print 'Vote: %s' % speech['text']
 
     def parse_xml_day(self, date):
-        soup = BeautifulStoneSoup(self.text)
+        soup = BeautifulSoup(self.text, 'xml')
        items = soup.find('XML_Plenary_Bilingual')
        #soup = soup('XML_Plenary_Bilingual')
diff --git a/requirements.txt b/requirements.txt
index 612be64d3..f46484c99 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,4 +1,3 @@
-BeautifulSoup==3.2.1
 beautifulsoup4==4.3.2
 everypolitician==0.0.13
 lxml==3.4.0
diff --git a/scripts/fetch-pw-json b/scripts/fetch-pw-json
index 046593724..0d9224c25 100755
--- a/scripts/fetch-pw-json
+++ b/scripts/fetch-pw-json
@@ -6,7 +6,7 @@
 import os
 import re
 import requests
-from BeautifulSoup import MinimalSoup
+from bs4 import BeautifulSoup
 
 base_url = 'https://www.publicwhip.org.uk/data/popolo/'
 headers = {
@@ -17,8 +17,8 @@
 OUT_DIR = os.path.join(os.path.dirname(os.path.abspath(__file__)), '../../parlda
 
 all_json = requests.get(base_url, headers=headers).content
 
-soup = MinimalSoup(all_json)
-json_files = soup.findAll( href=re.compile("json") )
+soup = BeautifulSoup(all_json, 'lxml')
+json_files = soup.find_all( href=re.compile("json") )
 
 for json in json_files:
     url = "%s%s" % ( base_url, json['href'] )