Skip to content

Commit cc9ac04

Browse files
authored
Merge pull request #1436 from RDFLib/jsonld_conneg
Allow URLInputSource to get content-negotiation links from the Link headers
2 parents a10198e + e00529f commit cc9ac04

File tree

5 files changed

+153
-36
lines changed

5 files changed

+153
-36
lines changed

rdflib/parser.py

+55-7
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@
1010
1111
"""
1212

13+
from typing import TYPE_CHECKING
1314
import codecs
1415
import os
1516
import pathlib
@@ -20,6 +21,7 @@
2021
IO,
2122
Any,
2223
BinaryIO,
24+
List,
2325
Optional,
2426
TextIO,
2527
Tuple,
@@ -30,6 +32,7 @@
3032
from urllib.request import Request
3133
from urllib.request import url2pathname
3234
from urllib.request import urlopen
35+
from urllib.parse import urljoin
3336
from urllib.error import HTTPError
3437

3538
from xml.sax import xmlreader
@@ -39,6 +42,7 @@
3942
from rdflib.namespace import Namespace
4043

4144
if TYPE_CHECKING:
45+
from http.client import HTTPMessage, HTTPResponse
4246
from rdflib import Graph
4347

4448
__all__ = [
@@ -190,9 +194,43 @@ def __init__(
190194

191195
class URLInputSource(InputSource):
192196
"""
193-
TODO:
197+
Constructs an RDFLib Parser InputSource from a URL to read it from the Web.
194198
"""
195199

200+
links: List[str]
201+
202+
@classmethod
def getallmatchingheaders(cls, message: 'HTTPMessage', name):
    """
    Collect every value of the header called ``name`` from ``message``.

    Header names are compared case-insensitively. This is reimplemented
    here because the ``getallmatchingheaders`` method of ``HTTPMessage``
    has been broken since Python 3.0.

    :param message: object exposing ``items()`` that yields
        ``(header_name, header_value)`` pairs.
    :param name: header name to look for, in any letter case.
    :return: list of matching header values, in the order ``items()``
        yields them; empty when nothing matches.
    """
    wanted = name.lower()
    values = []
    for header, value in message.items():
        if header.lower() == wanted:
            values.append(value)
    return values
208+
209+
@classmethod
def get_links(cls, response: 'HTTPResponse') -> List[str]:
    """
    Extract the individual link-values from all ``Link`` headers of
    ``response``.

    A single ``Link`` header may carry several comma-separated
    link-values, and several ``Link`` headers may be present. A comma is
    only treated as a separator when it sits outside the ``<...>`` target
    and outside a double-quoted parameter value, so targets or titles that
    contain commas are kept intact.

    :param response: object whose ``headers`` attribute is accepted by
        ``getallmatchingheaders``.
    :return: list of link-values with surrounding whitespace removed;
        empty entries (e.g. produced by a trailing comma) are dropped.
    """
    links: List[str] = []
    for header_value in cls.getallmatchingheaders(response.headers, "Link"):
        current: List[str] = []
        in_target = False  # between "<" and ">"
        in_quotes = False  # between a pair of double quotes
        for char in header_value:
            if char == '"' and not in_target:
                in_quotes = not in_quotes
            elif char == "<" and not in_quotes:
                in_target = True
            elif char == ">" and not in_quotes:
                in_target = False
            elif char == "," and not in_quotes and not in_target:
                # Top-level comma: the current link-value is complete.
                links.append("".join(current))
                current = []
                continue
            current.append(char)
        links.append("".join(current))
    # Strip once, and drop the blanks left by stray separators.
    stripped = (link.strip() for link in links)
    return [link for link in stripped if link]
218+
219+
def get_alternates(self, type_: Optional[str] = None) -> List[str]:
    """
    Return the targets of the links in ``self.links`` whose relation is
    ``alternate``, optionally restricted to one media type.

    Matching is deliberately lenient about how the link-value is written:
    parameter names and the media type are compared case-insensitively,
    parameter values may be quoted or unquoted, and ``rel`` may hold
    several space-separated relation types. Semicolons inside the
    ``<...>`` target or inside a double-quoted value do not split
    parameters. Backslash-escaped quotes inside quoted values are not
    interpreted.

    :param type_: media type the link's ``type`` parameter must have;
        ``None`` or an empty string disables the type filter.
    :return: link targets (without the enclosing angle brackets), in the
        order they appear in ``self.links``.
    """

    def _iter_params(raw: str):
        # Yield (lower-cased name, unquoted value) for each ';'-separated
        # parameter, ignoring ';' characters that sit inside double quotes.
        pieces = []
        current = []
        in_quotes = False
        for char in raw:
            if char == '"':
                in_quotes = not in_quotes
            if char == ";" and not in_quotes:
                pieces.append("".join(current))
                current = []
            else:
                current.append(char)
        pieces.append("".join(current))
        for piece in pieces:
            param_name, _, param_value = piece.partition("=")
            param_name = param_name.strip().lower()
            if param_name:
                yield param_name, param_value.strip().strip('"')

    wanted_type = type_.strip().lower() if type_ else None
    alts: List[str] = []
    for link in self.links:
        link = link.strip()
        if link.startswith("<") and ">" in link:
            # Well-formed link-value: the target is everything up to ">".
            target, _, raw_params = link[1:].partition(">")
        else:
            # Lenient fallback for targets written without angle brackets.
            target, _, raw_params = link.partition(";")
            target = target.strip().strip("<>")

        relations = set()
        media_types = set()
        for param_name, param_value in _iter_params(raw_params):
            if param_name == "rel":
                relations.update(param_value.lower().split())
            elif param_name == "type":
                media_types.add(param_value.strip().lower())

        if "alternate" not in relations:
            continue
        if wanted_type is not None and wanted_type not in media_types:
            continue
        alts.append(target)
    return alts
233+
196234
def __init__(self, system_id: Optional[str] = None, format: Optional[str] = None):
197235
super(URLInputSource, self).__init__(system_id)
198236
self.url = system_id
@@ -244,16 +282,26 @@ def _urlopen(req: Request) -> Any:
244282
else:
245283
raise
246284

247-
file = _urlopen(req)
248-
# Fix for issue 130 https://github.com/RDFLib/rdflib/issues/130
249-
self.url = file.geturl() # in case redirections took place
285+
response: HTTPResponse = _urlopen(req)
286+
self.url = response.geturl() # in case redirections took place
287+
self.links = self.get_links(response)
288+
if format in ("json-ld", "application/ld+json"):
289+
alts = self.get_alternates(type_="application/ld+json")
290+
for link in alts:
291+
full_link = urljoin(self.url, link)
292+
if full_link != self.url and full_link != system_id:
293+
response = _urlopen(Request(full_link))
294+
self.url = response.geturl() # in case redirections took place
295+
break
296+
250297
self.setPublicId(self.url)
251-
self.content_type = file.info().get("content-type")
298+
content_types = self.getallmatchingheaders(response.headers, "content-type")
299+
self.content_type = content_types[0] if content_types else None
252300
if self.content_type is not None:
253301
self.content_type = self.content_type.split(";", 1)[0]
254-
self.setByteStream(file)
302+
self.setByteStream(response)
255303
# TODO: self.setEncoding(encoding)
256-
self.response_info = file.info() # a mimetools.Message instance
304+
self.response_info = response.info() # a mimetools.Message instance
257305

258306
def __repr__(self):
259307
return self.url

rdflib/plugins/parsers/jsonld.py

+4-2
Original file line numberDiff line numberDiff line change
@@ -92,7 +92,7 @@ def parse(self, source, sink, **kwargs):
9292
)
9393

9494
context_data = kwargs.get("context")
95-
if not context_data and isinstance(source, URLInputSource):
95+
if not context_data and hasattr(source, "url") and hasattr(source, "links"):
9696
context_data = context_from_urlinputsource(source)
9797

9898
try:
@@ -186,7 +186,9 @@ def _add_to_graph(self, dataset, graph, context, node, topcontext=False):
186186
id_val = context.get_id(node)
187187

188188
if id_val is None:
189-
id_val = self._get_nested_id(context, node)
189+
nested_id = self._get_nested_id(context, node)
190+
if nested_id is not None and len(nested_id) > 0:
191+
id_val = nested_id
190192

191193
if isinstance(id_val, str):
192194
subj = self._to_rdf_id(context, id_val)

rdflib/plugins/shared/jsonld/util.py

+18-7
Original file line numberDiff line numberDiff line change
@@ -16,14 +16,17 @@
1616
from posixpath import normpath
1717

1818
from urllib.parse import urljoin, urlsplit, urlunsplit
19-
20-
from rdflib.parser import create_input_source, PythonInputSource, StringInputSource
19+
from rdflib.parser import (
20+
create_input_source,
21+
BytesIOWrapper,
22+
PythonInputSource,
23+
StringInputSource,
24+
)
2125

2226
from io import TextIOBase, TextIOWrapper
2327

2428

2529
def source_to_json(source):
26-
2730
if isinstance(source, PythonInputSource):
2831
return source.data
2932

@@ -32,9 +35,10 @@ def source_to_json(source):
3235

3336
# TODO: conneg for JSON (fix support in rdflib's URLInputSource!)
3437
source = create_input_source(source, format="json-ld")
35-
3638
stream = source.getByteStream()
3739
try:
40+
if isinstance(stream, BytesIOWrapper):
41+
stream = stream.wrapped
3842
# Use character stream as-is, or interpret byte stream as UTF-8
3943
if isinstance(stream, TextIOBase):
4044
use_stream = stream
@@ -86,10 +90,17 @@ def norm_url(base, url):
8690

8791

8892
def context_from_urlinputsource(source):
89-
if source.content_type == "application/json":
90-
# response_info was added to InputSource in rdflib 4.2
93+
"""
94+
Please note that JSON-LD documents served with the application/ld+json media type
95+
MUST have all context information, including references to external contexts,
96+
within the body of the document. Contexts linked via a
97+
http://www.w3.org/ns/json-ld#context HTTP Link Header MUST be
98+
ignored for such documents.
99+
"""
100+
if source.content_type != "application/ld+json":
91101
try:
92-
links = source.response_info.getallmatchingheaders("Link")
102+
# source.links is the new way of getting Link headers from URLInputSource
103+
links = source.links
93104
except AttributeError:
94105
return
95106
for link in links:

test/jsonld/runner.py

+68-18
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,10 @@
11
# -*- coding: UTF-8 -*-
22
import json
3+
from functools import partial
34
from rdflib import ConjunctiveGraph
45
from rdflib.compare import isomorphic
5-
from rdflib.plugins.parsers.jsonld import to_rdf
6+
from rdflib.parser import InputSource
7+
from rdflib.plugins.parsers.jsonld import to_rdf, JsonLDParser
68
from rdflib.plugins.serializers.jsonld import from_rdf
79
from rdflib.plugins.shared.jsonld.keys import CONTEXT, GRAPH
810

@@ -16,26 +18,69 @@ def _preserving_nodeid(self, bnode_context=None):
1618
return bNode(self.eat(r_nodeid).group(1))
1719

1820

21+
22+
1923
DEFAULT_PARSER_VERSION = 1.0
2024

2125

26+
def make_fake_urlinputsource(input_uri, format=None, suite_base=None, options=None):
    """
    Build an ``InputSource`` that imitates a ``URLInputSource`` for a
    remote test document, reading the bytes from the local test-suite
    checkout instead of the network.

    URIs under ``https://w3c.github.io/json-ld-api/tests/`` are mapped to
    paths relative to the current directory (``./``). When the local file
    does not exist the byte stream is set to ``None``.

    :param input_uri: URI of the test input document.
    :param format: value stored on the source's ``format`` attribute.
    :param suite_base: base URI that ``options["redirectTo"]`` is appended
        to; when ``None`` the test-suite prefix above is used, so callers
        that omit it no longer hit ``None + str``.
    :param options: optional mapping of test options. ``httpLink`` is
        appended to ``source.links``, ``contentType`` overrides the
        content type guessed from the file extension, and ``redirectTo``
        swaps the stream, url, public id and system id to the redirect
        target.
    :return: the populated ``InputSource``.
    """
    remote_prefix = "https://w3c.github.io/json-ld-api/tests/"

    def _open_local(uri):
        # Open the local copy of a test-suite URI; a missing file gives None.
        try:
            return open(uri.replace(remote_prefix, "./"), "rb")
        except FileNotFoundError:
            return None

    options = options or {}
    local_url = input_uri.replace(remote_prefix, "./")
    f = _open_local(input_uri)

    source = InputSource(input_uri)
    source.setPublicId(input_uri)
    source.setByteStream(f)
    source.url = input_uri
    source.links = []
    if local_url.endswith((".jsonld", ".jldt")):
        source.content_type = "application/ld+json"
    else:
        source.content_type = "application/json"
    source.format = format

    if "httpLink" in options:
        source.links.append(options["httpLink"])
    if "contentType" in options:
        source.content_type = options["contentType"]
    if "redirectTo" in options:
        base = suite_base if suite_base is not None else remote_prefix
        redir = base + options["redirectTo"]
        # Release the handle of the original document before following
        # the simulated redirect.
        if f is not None:
            f.close()
        f = _open_local(redir)
        source.setByteStream(f)
        source.url = redir
        source.setPublicId(redir)
        source.setSystemId(redir)
    return source
61+
2262
def do_test_json(suite_base, cat, num, inputpath, expectedpath, context, options):
2363
input_uri = suite_base + inputpath
24-
input_obj = _load_json(inputpath)
2564
input_graph = ConjunctiveGraph()
26-
to_rdf(
27-
input_obj,
28-
input_graph,
29-
base=input_uri,
30-
context_data=context,
31-
generalized_rdf=True,
32-
)
65+
if cat == "remote-doc":
66+
input_src = make_fake_urlinputsource(input_uri, format="json-ld", suite_base=suite_base, options=options)
67+
p = JsonLDParser()
68+
p.parse(input_src, input_graph, base=input_src.getPublicId(), context_data=context, generalized_rdf=True)
69+
else:
70+
input_obj = _load_json(inputpath)
71+
to_rdf(
72+
input_obj,
73+
input_graph,
74+
base=input_uri,
75+
context_data=context,
76+
generalized_rdf=True,
77+
)
3378
expected_json = _load_json(expectedpath)
3479
use_native_types = True # CONTEXT in input_obj
3580
result_json = from_rdf(
3681
input_graph,
3782
context,
38-
base=input_uri,
83+
base="./", # deliberately set base different to the input base
3984
use_native_types=options.get("useNativeTypes", use_native_types),
4085
use_rdf_type=options.get("useRdfType", False),
4186
)
@@ -72,14 +117,19 @@ def do_test_parser(suite_base, cat, num, inputpath, expectedpath, context, optio
72117
version = 1.1
73118
elif requested_version == "json-ld-1.0":
74119
version = 1.0
75-
to_rdf(
76-
input_obj,
77-
result_graph,
78-
context_data=context,
79-
base=options.get("base", input_uri),
80-
version=version,
81-
generalized_rdf=options.get("produceGeneralizedRdf", False),
82-
)
120+
if cat == "remote-doc":
121+
input_src = make_fake_urlinputsource(input_uri, format="json-ld", options=options)
122+
p = JsonLDParser()
123+
p.parse(input_src, result_graph, base=input_uri, context_data=context, generalized_rdf=True)
124+
else:
125+
to_rdf(
126+
input_obj,
127+
result_graph,
128+
context_data=context,
129+
base=options.get("base", input_uri),
130+
version=version,
131+
generalized_rdf=options.get("produceGeneralizedRdf", False),
132+
)
83133
assert isomorphic(result_graph, expected_graph), "Expected:\n%s\nGot:\n%s" % (
84134
expected_graph.serialize(),
85135
result_graph.serialize(),

test/jsonld/test_onedotone.py

+8-2
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,6 @@
1515

1616
testsuite_dir = p.join(p.abspath(p.dirname(__file__)), "1.1")
1717

18-
1918
unsupported_tests: Tuple[str, ...] = ("frame", "normalize")
2019
unsupported_tests += (
2120
"error",
@@ -137,6 +136,13 @@
137136
"toRdf/tn02-in",
138137
# TODO: Rdflib should silently reject bad predicate URIs
139138
"toRdf/wf02-in",
139+
# TODO: we don't extract context or json-ld that's embedded in HTML
140+
"remote-doc/0013-in",
141+
"remote-doc/la01-in",
142+
"remote-doc/la02-in",
143+
"remote-doc/la03-in",
144+
"remote-doc/la04-in",
145+
"remote-doc/la05-in",
140146
)
141147

142148
if os.name == "nt":
@@ -208,7 +214,7 @@ def get_test_suite_cases():
208214
):
209215
# Skip the JSON v1.0 tests
210216
continue
211-
if inputpath.endswith(".jsonld"): # toRdf
217+
if inputpath.endswith((".jldt", ".json", ".jsonld")): # toRdf
212218
if expectedpath.endswith(".jsonld"): # compact/expand/flatten
213219
func = runner.do_test_json
214220
else: # toRdf

0 commit comments

Comments
 (0)