diff --git a/pyscraper/regmem/filter.py b/pyscraper/regmem/filter.py
index e5347e0e3..8ac8fb443 100755
--- a/pyscraper/regmem/filter.py
+++ b/pyscraper/regmem/filter.py
@@ -4,7 +4,7 @@
import re
import os
-from BeautifulSoup import BeautifulStoneSoup
+from bs4 import BeautifulSoup
from contextexception import ContextException
from resolvemembernames import memberList
from xmlfilewrite import WriteXMLHeader
@@ -25,7 +25,7 @@ def RunRegmemFilters2010(fout, text, sdate, sdatever):
memberset = set()
text = re.sub('([^<]*?)', r'\1', text)
- t = BeautifulStoneSoup(text)
+ t = BeautifulSoup(text, 'xml')
for page in t('page'):
title = page.h2.renderContents()
if title in ('HAGUE, Rt Hon William (Richmond (Yorks)', 'PEARCE, Teresa (Erith and Thamesmead'):
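BS3 shipped a separate BeautifulStoneSoup class for XML; bs4 folds it into the single BeautifulSoup class, selected by the 'xml' feature (backed by lxml). A minimal sketch of the new call shape, using stand-in markup rather than the real register pages, and keeping the codebase's Python 2 print:

    from bs4 import BeautifulSoup

    xml = '<register><page><h2>SMITH, John (Anytown)</h2></page></register>'
    t = BeautifulSoup(xml, 'xml')       # replaces BeautifulStoneSoup(xml)
    for page in t('page'):              # calling the soup is shorthand for find_all
        print page.h2.renderContents()  # renderContents() survives in bs4 as a deprecated alias
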
diff --git a/pyscraper/regmem/pullgluepages.py b/pyscraper/regmem/pullgluepages.py
index b348e3d96..15777e0e3 100755
--- a/pyscraper/regmem/pullgluepages.py
+++ b/pyscraper/regmem/pullgluepages.py
@@ -9,7 +9,7 @@
import os.path
import time
import tempfile
-import BeautifulSoup
+from bs4 import BeautifulSoup
import miscfuncs
toppath = miscfuncs.toppath
@@ -37,8 +37,8 @@ def GlueByContents(fout, url_contents, regmemdate, remote):
sr = ur.read()
ur.close()
- soup = BeautifulSoup.BeautifulSoup(sr)
- mps = soup.find('a', attrs={'name':'A'}).parent.findNextSiblings('p')
+ soup = BeautifulSoup(sr, 'lxml')
+ mps = soup.find('a', attrs={'name':'A'}).parent.find_next_siblings('p')
for p in mps:
url = urlparse.urljoin(url_contents, p.a['href'])
url = url.encode('utf-8')
@@ -61,9 +61,9 @@ def GlueByContents(fout, url_contents, regmemdate, remote):
(url, time.strftime('%Y-%m-%d', lt), time.strftime('%X', lt)))
sr = re.sub('<br([^>]*)/>', r'<br\1>', sr)
- soup_mp = BeautifulSoup.BeautifulSoup(sr)
+ soup_mp = BeautifulSoup(sr, 'lxml')
try:
- page = soup_mp.find('h1').findNextSiblings(lambda t: t.name != 'div')
+ page = soup_mp.find('h1').find_next_siblings(lambda t: t.name != 'div')
except:
print 'Problem with ' + url.decode('utf-8')
page = '\n'.join([ str(p) for p in page ]) + '\n'
@@ -195,9 +195,9 @@ def FindRegmemPages(remote):
sys.exit("Cloudflare please wait page, cannot proceed")
ur.close()
- soup = BeautifulSoup.BeautifulSoup(content)
+ soup = BeautifulSoup(content, 'lxml')
soup = soup.find(attrs='main-body').find('ul')
- ixurls = [urlparse.urljoin(idxurl, ix['href']) for ix in soup.findAll('a', href=True)]
+ ixurls = [urlparse.urljoin(idxurl, ix['href']) for ix in soup.find_all('a', href=True)]
for ixurl in ixurls:
ur = opener.open(ixurl)
@@ -213,7 +213,7 @@ def FindRegmemPages(remote):
'\g<0>',
content)
- soup = BeautifulSoup.BeautifulSoup(content)
+ soup = BeautifulSoup(content, 'lxml')
if soup.find(text=re.compile('^Contents$(?i)')):
# An immediate register page.
@@ -229,7 +229,7 @@ def FindRegmemPages(remote):
if (date, ixurl) not in urls:
urls.append((date, ixurl))
elif re.search('Session 201[79]|Session 20[2-9]', content):
- allurl_soups = soup.findAll('a', href=re.compile("(memi02|part1contents|/contents\.htm)"))
+ allurl_soups = soup.find_all('a', href=re.compile("(memi02|part1contents|/contents\.htm)"))
for url_soup in allurl_soups:
url = url_soup['href']
url = urlparse.urljoin(ixurl, url)
@@ -240,7 +240,7 @@ def FindRegmemPages(remote):
if (date, url) not in urls:
urls.append((date, url))
else:
- allurl_soups = soup.findAll('a', href=re.compile("(memi02|part1contents|/contents\.htm)"))
+ allurl_soups = soup.find_all('a', href=re.compile("(memi02|part1contents|/contents\.htm)"))
for url_soup in allurl_soups:
row_content = url_soup.findParent('tr').renderContents()
url = url_soup['href']
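Beyond the import, the mechanical part of this migration is the method renames: findAll becomes find_all and findNextSiblings becomes find_next_siblings. bs4 keeps the camelCase names as deprecated aliases, which is why the untouched findParent and renderContents calls in the last hunk still run. A sketch of the sibling walk on stand-in markup:

    from bs4 import BeautifulSoup

    html = ('<p><a name="A"></a></p>'
            '<p><a href="/mp1">One</a></p>'
            '<p><a href="/mp2">Two</a></p>')
    soup = BeautifulSoup(html, 'lxml')
    anchor = soup.find('a', attrs={'name': 'A'})
    for p in anchor.parent.find_next_siblings('p'):  # bs4 spelling of findNextSiblings
        print p.a['href']
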
diff --git a/pyscraper/wa/parse.py b/pyscraper/wa/parse.py
index abc8ac345..e1b144eca 100755
--- a/pyscraper/wa/parse.py
+++ b/pyscraper/wa/parse.py
@@ -10,7 +10,7 @@
sys.path.append('../')
from contextexception import ContextException
-from BeautifulSoup import BeautifulStoneSoup, Tag
+from bs4 import BeautifulSoup
import codecs
streamWriter = codecs.lookup('utf-8')[-1]
@@ -144,7 +144,7 @@ def display_vote(self, item, speech):
print 'Vote: %s' % speech['text']
def parse_xml_day(self, date):
- soup = BeautifulStoneSoup(self.text)
+ soup = BeautifulSoup(self.text, 'xml')
items = soup.find('XML_Plenary_Bilingual')
#soup = soup('XML_Plenary_Bilingual')
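The Tag import disappears because bs4 builds new elements through the soup instead of by direct construction: BS3's Tag(soup, 'name') becomes soup.new_tag('name'). A minimal sketch, with an illustrative element name:

    from bs4 import BeautifulSoup

    soup = BeautifulSoup('<XML_Plenary_Bilingual/>', 'xml')
    item = soup.new_tag('contribution')      # replaces Tag(soup, 'contribution') from BS3
    soup.XML_Plenary_Bilingual.append(item)
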
diff --git a/requirements.txt b/requirements.txt
index 612be64d3..f46484c99 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,4 +1,3 @@
-BeautifulSoup==3.2.1
beautifulsoup4==4.3.2
everypolitician==0.0.13
lxml==3.4.0
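Dropping the BS3 pin leaves beautifulsoup4 plus the existing lxml pin, which is what makes the 'lxml' and 'xml' features used above available. bs4 raises FeatureNotFound when a requested parser is missing, so a two-line smoke test doubles as an install check:

    from bs4 import BeautifulSoup

    # both succeed only when lxml is installed alongside beautifulsoup4
    BeautifulSoup('<a/>', 'lxml')
    BeautifulSoup('<a/>', 'xml')
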
diff --git a/scripts/fetch-pw-json b/scripts/fetch-pw-json
index 046593724..0d9224c25 100755
--- a/scripts/fetch-pw-json
+++ b/scripts/fetch-pw-json
@@ -6,7 +6,7 @@ import os
import re
import requests
-from BeautifulSoup import MinimalSoup
+from bs4 import BeautifulSoup
base_url = 'https://www.publicwhip.org.uk/data/popolo/'
headers = {
@@ -17,8 +17,8 @@ OUT_DIR = os.path.join(os.path.dirname(os.path.abspath(__file__)), '../../parlda
all_json = requests.get(base_url, headers=headers).content
-soup = MinimalSoup(all_json)
-json_files = soup.findAll( href=re.compile("json") )
+soup = BeautifulSoup(all_json, 'lxml')
+json_files = soup.find_all( href=re.compile("json") )
for json in json_files:
url = "%s%s" % ( base_url, json['href'] )