Skip to content

Commit 0eead10

Browse files
authored
Fix CivPlus bugs #176 #189 (#190)
* Fix CivPlus bugs #176 #189 * linter fixes * Fix GH build process
1 parent a7775f1 commit 0eead10

File tree

12 files changed

+2370
-15
lines changed

12 files changed

+2370
-15
lines changed

.github/workflows/continuous-deployment.yml

+3-1
Original file line numberDiff line numberDiff line change
@@ -117,7 +117,9 @@ jobs:
117117
118118
- id: build
119119
name: Build release
120-
run: make build-release
120+
run: |
121+
pipenv run pip install setuptools-scm>=8.1.0 --force-reinstall --upgrade
122+
make build-release
121123
122124
- id: check
123125
name: Check release

civic_scraper/base/asset.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -45,7 +45,7 @@ def __init__(
4545
meeting_id: str = None,
4646
scraped_by: str = None,
4747
content_type: str = None,
48-
content_length: str = None
48+
content_length: str = None,
4949
) -> None:
5050
self.url = url
5151
self.asset_name = asset_name

civic_scraper/platforms/civic_clerk/site.py

+3-3
Original file line numberDiff line numberDiff line change
@@ -25,9 +25,9 @@ def __init__(self, url, place=None, state_or_province=None, cache=Cache()):
2525
self.cache = cache
2626

2727
self.session = Session()
28-
self.session.headers[
29-
"User-Agent"
30-
] = "Mozilla/5.0 (X11; CrOS x86_64 12871.102.0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.141 Safari/537.36"
28+
self.session.headers["User-Agent"] = (
29+
"Mozilla/5.0 (X11; CrOS x86_64 12871.102.0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.141 Safari/537.36"
30+
)
3131

3232
# Raise an error if a request gets a failing status code
3333
self.session.hooks = {

civic_scraper/platforms/civic_plus/parser.py

+14-3
Original file line numberDiff line numberDiff line change
@@ -38,6 +38,9 @@ def file_links_with_no_title(tag):
3838
)
3939

4040
metadata = []
41+
# Links often appear twice (once under meeting title, once in download menu)
42+
# so we track which we've already seen to avoid duplicate entries
43+
bookkeeping = set()
4144
for div in divs:
4245
cmte_name = self._committee_name(div)
4346
# Line-item data for each meeting is inside table rows.
@@ -52,6 +55,9 @@ def file_links_with_no_title(tag):
5255
# Skip links to page listing previous agenda versions
5356
if self._previous_version_link(link):
5457
continue
58+
# Skip previously harvested links
59+
if link["href"] in bookkeeping:
60+
continue
5561
metadata.append(
5662
{
5763
"committee_name": cmte_name,
@@ -63,13 +69,18 @@ def file_links_with_no_title(tag):
6369
"asset_type": self._asset_type(link["href"]),
6470
}
6571
)
72+
bookkeeping.add(link["href"])
6673
return metadata
6774

6875
def _committee_name(self, div):
69-
# Remove span that contains
76+
# If present, remove span that contains
7077
# arrow ▼ for toggling meeting list
71-
div.h2.span.extract()
72-
return div.h2.text.strip()
78+
try:
79+
div.h2.span.extract()
80+
except AttributeError:
81+
pass
82+
header_node = div.h2 or div.h3
83+
return header_node.text.strip()
7384

7485
def _mtg_title(self, row):
7586
return row.p.text.strip()

civic_scraper/platforms/civic_plus/site.py

+3-1
Original file line numberDiff line numberDiff line change
@@ -29,7 +29,9 @@ def __init__(self, base_url, cache=Cache(), parser_kls=Parser, place_name=None):
2929

3030
@property
3131
def place(self):
32-
return self.place_name or self._get_asset_metadata(r"(?<=-)\w+(?=\.)", self.base_url)
32+
return self.place_name or self._get_asset_metadata(
33+
r"(?<=-)\w+(?=\.)", self.base_url
34+
)
3335

3436
def scrape(
3537
self,

civic_scraper/platforms/legistar/site.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -105,7 +105,7 @@ def _create_asset(self, event, meeting_meta, asset_type):
105105
name_bits.append(asset_type)
106106
kwargs = {
107107
"url": event[asset_type]["url"],
108-
"asset_type": asset_type.lower().replace(' ', '_'),
108+
"asset_type": asset_type.lower().replace(" ", "_"),
109109
"asset_name": " - ".join(name_bits),
110110
"content_type": None,
111111
"content_length": None,

civic_scraper/platforms/primegov/site.py

+3-3
Original file line numberDiff line numberDiff line change
@@ -27,9 +27,9 @@ def __init__(self, url, place=None, state_or_province=None, cache=Cache()):
2727
self.cache = cache
2828

2929
self.session = Session()
30-
self.session.headers[
31-
"User-Agent"
32-
] = "Mozilla/5.0 (X11; CrOS x86_64 12871.102.0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.141 Safari/537.36"
30+
self.session.headers["User-Agent"] = (
31+
"Mozilla/5.0 (X11; CrOS x86_64 12871.102.0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.141 Safari/537.36"
32+
)
3333

3434
# Raise an error if a request gets a failing status code
3535
self.session.hooks = {

scripts/generate_civicplus_sites.py

+1
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,7 @@
2020
"/Users/amydipierro/GitHub/test.csv"
2121
2222
"""
23+
2324
import csv
2425
import re
2526

scripts/run_scraper.py

+1
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,7 @@
2323
path/to/target.csv \
2424
--scraper_args '{"start_date": "2015-09-09", "end_date": "2015-10-14"}'
2525
"""
26+
2627
from civic_scraper.scrapers import SUPPORTED_SITES
2728

2829

0 commit comments

Comments
 (0)