Skip to content

Commit

Permalink
Remove remaining mx.DateTime uses.
Browse files: browse the repository at this point in the history
  • Loading branch information
dracos committed Mar 11, 2023
1 parent 231c840 commit 5476b8d
Show file tree
Hide file tree
Showing 4 changed files with 17 additions and 39 deletions.
14 changes: 6 additions & 8 deletions filtersentence_xml.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,9 @@
#! /usr/bin/python

from datetime import datetime
import re
import string

import mx.DateTime

from lxml import etree

from contextexception import ContextException
Expand Down Expand Up @@ -40,7 +39,7 @@
reqnum = re.compile("\s*\[(\d+)\]\s*$")
refqnum = re.compile("\s*\[(\d+)\]\s*")

redatephraseval = re.compile('(?:(?:%s),?)?(\d+(?: | )*(?:%s)(\d+)?)' % (parlPhrases.daysofweek, parlPhrases.monthsofyear))
redatephraseval = re.compile('(?:(?:%s),? )?(\d+(?: | )*(?:%s)( \d+)?)' % (parlPhrases.daysofweek, parlPhrases.monthsofyear))


def TokenDate(ldate, phrtok):
Expand All @@ -49,9 +48,8 @@ def TokenDate(ldate, phrtok):
if not ldate.group(2):
tdate += " %s" % sdate_year
try:
lldate = mx.DateTime.DateTimeFrom(tdate)
ldate = lldate.date
phrtok.lastdate = ldate
lldate = datetime.strptime(tdate, '%A, %d %B %Y')
phrtok.lastdate = lldate.date
except:
phrtok.lastdate = ''
return ('phrase', ' class="date" code="%s"' % phrtok.lastdate)
Expand Down Expand Up @@ -166,10 +164,10 @@ def TokenOffRepWDate(qoffrep, phrtok):
m = re.match('(\d+)/(\d+)/(\d+)', date)
if m:
lordsdate = True
date = mx.DateTime.DateTimeFrom('%s/%s/%s' % (m.group(2), m.group(1), m.group(3))).date
date = datetime.strptime(date, '%d/%m/%Y').date
else:
lordsdate = False
date = mx.DateTime.DateTimeFrom(date).date
date = datetime.strptime(date, '%d %B %Y').date
if qcolprefix:
qcolprefix = qcolprefix.upper()
if qcolsuffix:
Expand Down
14 changes: 6 additions & 8 deletions pyscraper/new_hansard.py
Original file line number Diff line number Diff line change
@@ -1,14 +1,14 @@
#! /usr/bin/env python
# -*- coding: utf-8 -*-

import datetime
import re
import os
import sys
import io
import tempfile
from lxml import etree
import xml.sax
import mx.DateTime
import miscfuncs

xmlvalidate = xml.sax.make_parser()
Expand Down Expand Up @@ -199,10 +199,8 @@ def reset(self):

def is_pre_new_parser(self):
    """Return True if this file's date falls before 1 April 2016.

    ``self.date`` is compared against the fixed cut-off
    ``datetime.date(2016, 4, 1)``; dates strictly earlier than the cut-off
    count as "pre new parser".

    ``self.date`` may be a ``datetime.date``, a ``datetime.datetime`` or a
    string.  The earlier mx.DateTime implementation parsed ``self.date``
    with ``DateTimeFrom``, so string values were accepted; that is kept
    here rather than silently mis-comparing a string with a date.

    Raises:
        ValueError: if ``self.date`` is a string (or other value) that is
            not in ``YYYY-MM-DD`` form.
    """
    parser_start = datetime.date(2016, 4, 1)

    file_date = self.date
    if isinstance(file_date, datetime.datetime):
        # datetime is a subclass of date but cannot be ordered against a
        # plain date, so reduce it to its calendar date first.
        file_date = file_date.date()
    elif not isinstance(file_date, datetime.date):
        # NOTE(review): assumes string dates are ISO 'YYYY-MM-DD' -- confirm
        # against whatever sets self.date.
        file_date = datetime.datetime.strptime(str(file_date), '%Y-%m-%d').date()

    return file_date < parser_start
Expand Down Expand Up @@ -312,7 +310,7 @@ def new_speech(self, member, url):

def parse_system_header(self, header):
sitting = header.xpath('./ns:Sitting', namespaces=self.ns_map)[0]
date = mx.DateTime.DateTimeFrom(sitting.get('short-date')).date
date = datetime.datetime.strptime(sitting.get('short-date'), '%d %B %Y').date
if date:
self.date = date

Expand Down Expand Up @@ -889,8 +887,8 @@ def parse_time(self, tag):
minutes = int(matches.group(2) or 0)
if matches.group(3) == 'pm' and hours < 12:
hours += 12
time = mx.DateTime.DateTimeFrom(hour=hours, minute=minutes)
self.current_time = time.strftime('%H:%M:%S')
time = datetime.time(hours, minutes)
self.current_time = time.isoformat()
elif time_txt in ('Noon', 'noon') or re.match('12\s*?noon', time_txt):
self.current_time = "12:00:00"
elif re.match('12\s*?midnight', time_txt):
Expand Down
27 changes: 5 additions & 22 deletions pyscraper/regmem/pullgluepages.py
Original file line number Diff line number Diff line change
@@ -1,13 +1,13 @@
#! /usr/bin/python

from datetime import datetime
import glob
import sys
import urllib
import urlparse
import re
import os.path
import time
import mx.DateTime
import tempfile
import BeautifulSoup

Expand Down Expand Up @@ -90,7 +90,7 @@ def GlueByNext(fout, url, regmemdate):
if not dateinpage:
raise Exception, 'Not found date marker'
dateinpage = dateinpage.group(1).replace("&nbsp;", " ")
dateinpage = mx.DateTime.DateTimeFrom(dateinpage).date
dateinpage = datetime.strptime(dateinpage, '%d %B %Y').date
if dateinpage != regmemdate:
raise Exception, 'Date in page is %s, expected %s - update the URL list in regmempullgluepages.py' % (dateinpage, regmemdate)
matcheddate = True
Expand Down Expand Up @@ -225,7 +225,7 @@ def FindRegmemPages(remote):
print alldates
raise Exception, 'Date match failed, expected one got %d\n%s' % (len(alldates), url)

date = mx.DateTime.DateTimeFrom(alldates[0]).date
date = datetime.strptime(alldates[0], '%d %B %Y').date
if (date, ixurl) not in urls:
urls.append((date, ixurl))
elif re.search('Session 201[79]|Session 20[2-9]', content):
Expand Down Expand Up @@ -261,28 +261,14 @@ def FindRegmemPages(remote):
date = corrections[url_path]
else:
alldates[0] = re.sub('\s+', ' ', alldates[0])
date = mx.DateTime.DateTimeFrom(alldates[0]).date
alldates[0] = re.sub('(?<=\d)(st|nd|rd|th)', '', alldates[0])
date = datetime.strptime(alldates[0], '%d %B %Y').date

if (date, url) not in urls:
urls.append((date, url))

return urls

def FindLordRegmemPages():
    """Return a list of (date, url) pairs for Lords register pages.

    The list starts with one hard-coded entry for 2004-10-01.  The Lords
    register index page is then fetched, and every link whose href contains
    ``reg01`` and whose link text ends "position on <date>)" contributes
    one more pair: the date parsed by mx.DateTime and the href resolved
    against the index URL.
    """
    urls = [('2004-10-01', 'http://www.publications.parliament.uk/pa/ld200304/ldreg/reg01.htm')]
    ixurl = 'http://www.publications.parliament.uk/pa/ld/ldreg.htm'

    ur = opener.open(ixurl)
    try:
        content = ur.read()
    finally:
        # Close the response even when the read fails part-way.
        ur.close()

    # Case-insensitivity is given as a flag; a global "(?i)" placed at the
    # end of a pattern is deprecated and rejected by newer `re` versions.
    link_re = re.compile(r'<a href="([^>]*reg01[^>]*)">.*?position on (.*?)\)</a>', re.I)
    for href, datetext in link_re.findall(content):
        url = urlparse.urljoin(ixurl, href)
        date = mx.DateTime.DateTimeFrom(datetext).date
        urls.append((date, url))

    return urls

###############
# main function
###############
Expand All @@ -307,8 +293,5 @@ def RegmemPullGluePages(options):
# third parameter is a regexp, fourth is the filename (%s becomes the date).
GlueAllType(pwcmregmem, urls, 'regmem%s.html', options.forcescrape, options.remote)

# urls = FindLordRegmemPages()
# GlueAllType(pwldregmem, urls, 'regmem%s.html', forcescrape)

if __name__ == '__main__':
RegmemPullGluePages(False)
1 change: 0 additions & 1 deletion requirements.txt
Original file line number Diff line number Diff line change
@@ -1,6 +1,5 @@
BeautifulSoup==3.2.1
beautifulsoup4==4.3.2
egenix-mx-base==3.2.9
everypolitician==0.0.13
lxml==3.4.0
python-dateutil==2.2
Expand Down

0 comments on commit 5476b8d

Please sign in to comment.