Skip to content

Commit d4645ef

Browse files
author
Cristi Constantin
committed
Strip whitespace from meta property and content values so that whitespace-only tags are skipped as empty
Updated the tests
1 parent 8e69411 commit d4645ef

File tree

5 files changed

+14
-6
lines changed

5 files changed

+14
-6
lines changed

extruct/opengraph.py

+2-2
Original file line numberDiff line numberDiff line change
@@ -30,8 +30,8 @@ def extract_items(self, document, base_url=None):
3030
namespaces.update(self.get_namespaces(head))
3131
props = []
3232
for el in head.xpath('meta[@property and @content]'):
33-
prop = el.attrib['property']
34-
val = el.attrib['content']
33+
prop = el.attrib['property'].strip()
34+
val = el.attrib['content'].strip()
3535
if prop == '' or val == '':
3636
continue
3737
ns = prop.partition(':')[0]

requirements.txt

+1-1
Original file line numberDiff line numberDiff line change
@@ -7,5 +7,5 @@ requests
77
rdflib
88
rdflib-jsonld
99
mf2py>=1.1.0
10-
six
10+
six>=1.11
1111
w3lib

tests/samples/songkick/elysianfields.html

+1
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,7 @@
2727
<meta property="og:site_name" content="Songkick">
2828
<meta property="og:type" content="songkick-concerts:artist">
2929
<meta property="og:title" content="Elysian Fields">
30+
<meta property="og:title" content=" ">
3031
<meta property="og:description" content="Buy tickets for an upcoming Elysian Fields concert near you. List of all Elysian Fields tickets and tour dates for 2017.">
3132
<meta property="og:description" content="" />
3233
<meta property="og:url" content="http://www.songkick.com/artists/236156-elysian-fields">

tests/samples/songkick/elysianfields.json

+3
Original file line numberDiff line numberDiff line change
@@ -253,6 +253,9 @@
253253
"http://ogp.me/ns#title": [
254254
{
255255
"@value": "Elysian Fields"
256+
},
257+
{
258+
"@value": " "
256259
}
257260
],
258261
"http://ogp.me/ns#type": [

tests/test_extruct.py

+7-3
Original file line numberDiff line numberDiff line change
@@ -16,9 +16,13 @@ def test_all(self):
1616
body = get_testdata('songkick', 'elysianfields.html')
1717
expected = json.loads(get_testdata('songkick', 'elysianfields.json').decode('UTF-8'))
1818
data = extruct.extract(body, base_url='http://www.songkick.com/artists/236156-elysian-fields')
19-
# See test_rdfa_not_preserving_order()
20-
del data['rdfa'][0]['http://ogp.me/ns#image']
21-
del expected['rdfa'][0]['http://ogp.me/ns#image']
19+
# Sort the values here because RDFa does not preserve the order of duplicated properties.
20+
# See https://github.com/scrapinghub/extruct/issues/116
21+
# Also see test_rdfa_not_preserving_order()
22+
for rdf in data['rdfa']:
23+
for key, pairs in rdf.items():
24+
if ':' in key and isinstance(pairs, list):
25+
rdf[key] = sorted(pairs, key=lambda e: e["@value"], reverse=True)
2226
self.assertEqual(jsonize_dict(data), expected)
2327

2428
@pytest.mark.xfail

0 commit comments

Comments
 (0)