Upgrade any remaining BeautifulSoup to v4.
dracos committed Mar 11, 2023
1 parent 5476b8d commit 403ee7b
Showing 5 changed files with 17 additions and 18 deletions.
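For context, a minimal sketch of the API shifts this commit makes, using invented markup; it assumes beautifulsoup4 and lxml are installed, as pinned in requirements.txt below:

    # Illustrative only: the old/new pairs mirror the substitutions in this commit.
    from bs4 import BeautifulSoup   # was: import BeautifulSoup / from BeautifulSoup import BeautifulStoneSoup

    html = '<ul><li><a href="a.json">a</a></li><li><a href="b.json">b</a></li></ul>'  # hypothetical sample markup

    # BS3:  soup = BeautifulSoup.BeautifulSoup(html)   (parser chosen implicitly)
    # BS4:  the parser is named explicitly: 'lxml' for HTML, 'xml' where BeautifulStoneSoup was used
    soup = BeautifulSoup(html, 'lxml')

    # camelCase methods become snake_case
    links = soup.find_all('a', href=True)             # was soup.findAll(...)
    rest = soup.find('li').find_next_siblings('li')   # was findNextSiblings(...)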
4 changes: 2 additions & 2 deletions pyscraper/regmem/filter.py
@@ -4,7 +4,7 @@
 import re
 import os

-from BeautifulSoup import BeautifulStoneSoup
+from bs4 import BeautifulSoup
 from contextexception import ContextException
 from resolvemembernames import memberList
 from xmlfilewrite import WriteXMLHeader
@@ -25,7 +25,7 @@ def RunRegmemFilters2010(fout, text, sdate, sdatever):

     memberset = set()
     text = re.sub('<span class="highlight">([^<]*?)</span>', r'\1', text)
-    t = BeautifulStoneSoup(text)
+    t = BeautifulSoup(text, 'xml')
     for page in t('page'):
         title = page.h2.renderContents()
         if title in ('HAGUE, Rt Hon William (Richmond (Yorks)', 'PEARCE, Teresa (Erith and Thamesmead'):
20 changes: 10 additions & 10 deletions pyscraper/regmem/pullgluepages.py
@@ -9,7 +9,7 @@
 import os.path
 import time
 import tempfile
-import BeautifulSoup
+from bs4 import BeautifulSoup

 import miscfuncs
 toppath = miscfuncs.toppath
@@ -37,8 +37,8 @@ def GlueByContents(fout, url_contents, regmemdate, remote):
     sr = ur.read()
     ur.close()

-    soup = BeautifulSoup.BeautifulSoup(sr)
-    mps = soup.find('a', attrs={'name':'A'}).parent.findNextSiblings('p')
+    soup = BeautifulSoup(sr, 'lxml')
+    mps = soup.find('a', attrs={'name':'A'}).parent.find_next_siblings('p')
     for p in mps:
         url = urlparse.urljoin(url_contents, p.a['href'])
         url = url.encode('utf-8')
@@ -61,9 +61,9 @@ def GlueByContents(fout, url_contents, regmemdate, remote):
         (url, time.strftime('%Y-%m-%d', lt), time.strftime('%X', lt)))

     sr = re.sub('<p([^>]*)/>', r'<p\1></p>', sr)
-    soup_mp = BeautifulSoup.BeautifulSoup(sr)
+    soup_mp = BeautifulSoup(sr, 'lxml')
     try:
-        page = soup_mp.find('h1').findNextSiblings(lambda t: t.name != 'div')
+        page = soup_mp.find('h1').find_next_siblings(lambda t: t.name != 'div')
     except:
         print 'Problem with ' + url.decode('utf-8')
     page = '\n'.join([ str(p) for p in page ]) + '\n'
@@ -195,9 +195,9 @@ def FindRegmemPages(remote):
         sys.exit("Cloudflare please wait page, cannot proceed")
     ur.close()

-    soup = BeautifulSoup.BeautifulSoup(content)
+    soup = BeautifulSoup(content, 'lxml')
     soup = soup.find(attrs='main-body').find('ul')
-    ixurls = [urlparse.urljoin(idxurl, ix['href']) for ix in soup.findAll('a', href=True)]
+    ixurls = [urlparse.urljoin(idxurl, ix['href']) for ix in soup.find_all('a', href=True)]

     for ixurl in ixurls:
         ur = opener.open(ixurl)
@@ -213,7 +213,7 @@ def FindRegmemPages(remote):
             '<tr>\g<0>',
             content)

-        soup = BeautifulSoup.BeautifulSoup(content)
+        soup = BeautifulSoup(content, 'lxml')

         if soup.find(text=re.compile('^Contents$(?i)')):
             # An immediate register page.
@@ -229,7 +229,7 @@ def FindRegmemPages(remote):
             if (date, ixurl) not in urls:
                 urls.append((date, ixurl))
         elif re.search('Session 201[79]|Session 20[2-9]', content):
-            allurl_soups = soup.findAll('a', href=re.compile("(memi02|part1contents|/contents\.htm)"))
+            allurl_soups = soup.find_all('a', href=re.compile("(memi02|part1contents|/contents\.htm)"))
             for url_soup in allurl_soups:
                 url = url_soup['href']
                 url = urlparse.urljoin(ixurl, url)
@@ -240,7 +240,7 @@ def FindRegmemPages(remote):
             if (date, url) not in urls:
                 urls.append((date, url))
         else:
-            allurl_soups = soup.findAll('a', href=re.compile("(memi02|part1contents|/contents\.htm)"))
+            allurl_soups = soup.find_all('a', href=re.compile("(memi02|part1contents|/contents\.htm)"))
             for url_soup in allurl_soups:
                 row_content = url_soup.findParent('tr').renderContents()
                 url = url_soup['href']
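The callable filter in the hunk above carries over unchanged: bs4, like BS3, accepts a function as a filter, only the method is renamed from findNextSiblings to find_next_siblings. A rough sketch under the same assumptions, with invented markup:

    from bs4 import BeautifulSoup

    # Invented markup, loosely modelled on a register page: a heading followed by entries.
    html = '<h1>Register</h1><p>Entry one</p><div>skip me</div><p>Entry two</p>'
    soup = BeautifulSoup(html, 'lxml')

    # The callable is applied to each following sibling tag; those returning True are kept,
    # so this gathers everything after the <h1> except <div> elements.
    page = soup.find('h1').find_next_siblings(lambda t: t.name != 'div')
    # -> [<p>Entry one</p>, <p>Entry two</p>]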
4 changes: 2 additions & 2 deletions pyscraper/wa/parse.py
@@ -10,7 +10,7 @@

 sys.path.append('../')
 from contextexception import ContextException
-from BeautifulSoup import BeautifulStoneSoup, Tag
+from bs4 import BeautifulSoup

 import codecs
 streamWriter = codecs.lookup('utf-8')[-1]
@@ -144,7 +144,7 @@ def display_vote(self, item, speech):
         print 'Vote: %s' % speech['text']

     def parse_xml_day(self, date):
-        soup = BeautifulStoneSoup(self.text)
+        soup = BeautifulSoup(self.text, 'xml')

         items = soup.find('XML_Plenary_Bilingual')
         #soup = soup('XML_Plenary_Bilingual')
1 change: 0 additions & 1 deletion requirements.txt
@@ -1,4 +1,3 @@
-BeautifulSoup==3.2.1
 beautifulsoup4==4.3.2
 everypolitician==0.0.13
 lxml==3.4.0
6 changes: 3 additions & 3 deletions scripts/fetch-pw-json
@@ -6,7 +6,7 @@ import os
 import re
 import requests

-from BeautifulSoup import MinimalSoup
+from bs4 import BeautifulSoup

 base_url = 'https://www.publicwhip.org.uk/data/popolo/'
 headers = {
@@ -17,8 +17,8 @@ OUT_DIR = os.path.join(os.path.dirname(os.path.abspath(__file__)), '../../parlda


 all_json = requests.get(base_url, headers=headers).content
-soup = MinimalSoup(all_json)
-json_files = soup.findAll( href=re.compile("json") )
+soup = BeautifulSoup(all_json, 'lxml')
+json_files = soup.find_all( href=re.compile("json") )

 for json in json_files:
     url = "%s%s" % ( base_url, json['href'] )
