Skip to content

Allow URLInputSource to get content-negotiation links from the Link headers #1436

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 22 commits into from
Mar 17, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
22 commits
Select commit Hold shift + click to select a range
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
62 changes: 55 additions & 7 deletions rdflib/parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@

"""

from typing import TYPE_CHECKING
import codecs
import os
import pathlib
Expand All @@ -20,6 +21,7 @@
IO,
Any,
BinaryIO,
List,
Optional,
TextIO,
Tuple,
Expand All @@ -30,6 +32,7 @@
from urllib.request import Request
from urllib.request import url2pathname
from urllib.request import urlopen
from urllib.parse import urljoin
from urllib.error import HTTPError

from xml.sax import xmlreader
Expand All @@ -39,6 +42,7 @@
from rdflib.namespace import Namespace

if TYPE_CHECKING:
from http.client import HTTPMessage, HTTPResponse
from rdflib import Graph

__all__ = [
Expand Down Expand Up @@ -190,9 +194,43 @@ def __init__(

class URLInputSource(InputSource):
"""
TODO:
Constructs an RDFLib Parser InputSource from a URL to read it from the Web.
"""

links: List[str]

@classmethod
def getallmatchingheaders(cls, message: 'HTTPMessage', name):
    """Collect the values of every header in *message* called *name*.

    Header names are compared case-insensitively. The values are returned
    in the order ``message.items()`` yields them; the list is empty when no
    header matches.
    """
    # Reimplemented here because HTTPMessage.getallmatchingheaders
    # has been broken since Python 3.0.
    wanted = name.lower()
    matches = []
    for header, value in message.items():
        if header.lower() == wanted:
            matches.append(value)
    return matches

@classmethod
def get_links(cls, response: 'HTTPResponse'):
    """Return the individual link-values from the response's ``Link`` headers.

    Every ``Link`` header found in ``response.headers`` is split into its
    comma-separated link-values, which are returned whitespace-stripped in
    the order they appear.

    A comma only separates two link-values when it is outside the
    ``<...>`` target and outside a double-quoted parameter value, so
    targets such as ``<http://example.org/a,b>`` and parameters such as
    ``title="x, y"`` stay intact. Empty entries (for example from a
    trailing comma) are dropped.
    """
    collected = []
    for header_value in cls.getallmatchingheaders(response.headers, "Link"):
        current = []
        in_target = False  # between "<" and ">"
        in_quotes = False  # inside a double-quoted parameter value
        escaped = False  # previous character was a backslash inside quotes
        for ch in header_value:
            if escaped:
                escaped = False
            elif in_quotes:
                if ch == "\\":
                    escaped = True
                elif ch == '"':
                    in_quotes = False
            elif in_target:
                if ch == ">":
                    in_target = False
            elif ch == "<":
                in_target = True
            elif ch == '"':
                in_quotes = True
            elif ch == ",":
                # A top-level comma ends the current link-value.
                entry = "".join(current).strip()
                if entry:
                    collected.append(entry)
                current = []
                continue
            current.append(ch)
        entry = "".join(current).strip()
        if entry:
            collected.append(entry)
    return collected

def get_alternates(self, type_: Optional[str] = None) -> List[str]:
    """Return the targets of the ``rel="alternate"`` links in ``self.links``.

    :param type_: optional media type. When given, only alternates whose
        ``type`` parameter equals it (compared case-insensitively) are
        returned; when omitted, every alternate link is returned.
    :return: the link targets with the surrounding ``<`` ``>`` removed, in
        the order the links appear in ``self.links``.

    Parameters are parsed rather than compared as raw strings, so the
    quoted and unquoted forms (``rel="alternate"``, ``rel=alternate``) and
    multi-valued relations (``rel="alternate describedby"``) are all
    recognised. A ``;`` inside the target or inside a quoted value is not
    treated as a parameter separator.
    """

    def split_params(text):
        # Split on ";" characters that are outside double-quoted strings.
        pieces = []
        current = []
        in_quotes = False
        escaped = False
        for ch in text:
            if escaped:
                escaped = False
            elif in_quotes:
                if ch == "\\":
                    escaped = True
                elif ch == '"':
                    in_quotes = False
            elif ch == '"':
                in_quotes = True
            elif ch == ";":
                pieces.append("".join(current))
                current = []
                continue
            current.append(ch)
        pieces.append("".join(current))
        return pieces

    wanted_type = type_.strip().lower() if type_ else None
    alternates = []
    for link in self.links:
        link = link.strip()
        close = link.find(">")
        if link.startswith("<") and close != -1:
            # Take the target verbatim so a ";" inside the URI is preserved.
            target = link[1:close]
            param_text = link[close + 1:]
        else:
            # No bracketed target: treat the first ";"-separated piece as it.
            target, _, param_text = link.partition(";")
            target = target.strip().strip("<>")

        is_alternate = False
        type_matches = wanted_type is None
        for param in split_params(param_text):
            key, _, value = param.partition("=")
            key = key.strip().lower()
            value = value.strip()
            if len(value) >= 2 and value[0] == '"' and value[-1] == '"':
                value = value[1:-1]
            if key == "rel":
                # rel may carry several space-separated relation types.
                if "alternate" in value.lower().split():
                    is_alternate = True
            elif key == "type" and wanted_type is not None:
                if value.strip().lower() == wanted_type:
                    type_matches = True

        if is_alternate and type_matches:
            alternates.append(target)
    return alternates

def __init__(self, system_id: Optional[str] = None, format: Optional[str] = None):
super(URLInputSource, self).__init__(system_id)
self.url = system_id
Expand Down Expand Up @@ -244,16 +282,26 @@ def _urlopen(req: Request) -> Any:
else:
raise

file = _urlopen(req)
# Fix for issue 130 https://github.com/RDFLib/rdflib/issues/130
self.url = file.geturl() # in case redirections took place
response: HTTPResponse = _urlopen(req)
self.url = response.geturl() # in case redirections took place
self.links = self.get_links(response)
if format in ("json-ld", "application/ld+json"):
alts = self.get_alternates(type_="application/ld+json")
for link in alts:
full_link = urljoin(self.url, link)
if full_link != self.url and full_link != system_id:
response = _urlopen(Request(full_link))
self.url = response.geturl() # in case redirections took place
break

self.setPublicId(self.url)
self.content_type = file.info().get("content-type")
content_types = self.getallmatchingheaders(response.headers, "content-type")
self.content_type = content_types[0] if content_types else None
if self.content_type is not None:
self.content_type = self.content_type.split(";", 1)[0]
self.setByteStream(file)
self.setByteStream(response)
# TODO: self.setEncoding(encoding)
self.response_info = file.info() # a mimetools.Message instance
self.response_info = response.info() # a mimetools.Message instance

def __repr__(self):
# The representation of this input source is simply the URL stored on it.
# NOTE(review): self.url is initialised from system_id, which is annotated
# Optional[str]; __repr__ must return a str -- confirm url is never None here.
return self.url
Expand Down
6 changes: 4 additions & 2 deletions rdflib/plugins/parsers/jsonld.py
Original file line number Diff line number Diff line change
Expand Up @@ -92,7 +92,7 @@ def parse(self, source, sink, **kwargs):
)

context_data = kwargs.get("context")
if not context_data and isinstance(source, URLInputSource):
if not context_data and hasattr(source, "url") and hasattr(source, "links"):
context_data = context_from_urlinputsource(source)

try:
Expand Down Expand Up @@ -186,7 +186,9 @@ def _add_to_graph(self, dataset, graph, context, node, topcontext=False):
id_val = context.get_id(node)

if id_val is None:
id_val = self._get_nested_id(context, node)
nested_id = self._get_nested_id(context, node)
if nested_id is not None and len(nested_id) > 0:
id_val = nested_id

if isinstance(id_val, str):
subj = self._to_rdf_id(context, id_val)
Expand Down
25 changes: 18 additions & 7 deletions rdflib/plugins/shared/jsonld/util.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,14 +16,17 @@
from posixpath import normpath

from urllib.parse import urljoin, urlsplit, urlunsplit

from rdflib.parser import create_input_source, PythonInputSource, StringInputSource
from rdflib.parser import (
create_input_source,
BytesIOWrapper,
PythonInputSource,
StringInputSource,
)

from io import TextIOBase, TextIOWrapper


def source_to_json(source):

if isinstance(source, PythonInputSource):
return source.data

Expand All @@ -32,9 +35,10 @@ def source_to_json(source):

# TODO: conneg for JSON (fix support in rdflib's URLInputSource!)
source = create_input_source(source, format="json-ld")

stream = source.getByteStream()
try:
if isinstance(stream, BytesIOWrapper):
stream = stream.wrapped
# Use character stream as-is, or interpret byte stream as UTF-8
if isinstance(stream, TextIOBase):
use_stream = stream
Expand Down Expand Up @@ -86,10 +90,17 @@ def norm_url(base, url):


def context_from_urlinputsource(source):
if source.content_type == "application/json":
# response_info was added to InputSource in rdflib 4.2
"""
Please note that JSON-LD documents served with the application/ld+json media type
MUST have all context information, including references to external contexts,
within the body of the document. Contexts linked via a
http://www.w3.org/ns/json-ld#context HTTP Link Header MUST be
ignored for such documents.
"""
if source.content_type != "application/ld+json":
try:
links = source.response_info.getallmatchingheaders("Link")
# source.links is the new way of getting Link headers from URLInputSource
links = source.links
except AttributeError:
return
for link in links:
Expand Down
86 changes: 68 additions & 18 deletions test/jsonld/runner.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,10 @@
# -*- coding: UTF-8 -*-
import json
from functools import partial
from rdflib import ConjunctiveGraph
from rdflib.compare import isomorphic
from rdflib.plugins.parsers.jsonld import to_rdf
from rdflib.parser import InputSource
from rdflib.plugins.parsers.jsonld import to_rdf, JsonLDParser
from rdflib.plugins.serializers.jsonld import from_rdf
from rdflib.plugins.shared.jsonld.keys import CONTEXT, GRAPH

Expand All @@ -16,26 +18,69 @@ def _preserving_nodeid(self, bnode_context=None):
return bNode(self.eat(r_nodeid).group(1))




DEFAULT_PARSER_VERSION = 1.0


def make_fake_urlinputsource(input_uri, format=None, suite_base=None, options=None):
    """Build an ``InputSource`` that imitates a URL-backed source for a test.

    The test-suite URL is mapped onto a local path by replacing the
    ``https://w3c.github.io/json-ld-api/tests/`` prefix with ``./``; no
    network access takes place. The returned source gets the attributes
    ``url``, ``links``, ``content_type`` and ``format`` set on it.

    :param input_uri: URL of the test input document.
    :param format: value stored on the source as ``format``.
    :param suite_base: base URL prepended to ``options["redirectTo"]``; it
        must be a string whenever ``redirectTo`` is present.
    :param options: optional mapping of test options. Recognised keys are
        ``httpLink`` (appended to ``links``), ``contentType`` (overrides the
        content type guessed from the file extension) and ``redirectTo``
        (the source is re-pointed at that document instead).
    :return: the configured ``InputSource``. Its byte stream is ``None``
        when the local file does not exist.
    """
    remote_prefix = "https://w3c.github.io/json-ld-api/tests/"

    def open_local(uri):
        # Map a suite URL onto the local checkout; a missing file yields None.
        local_path = uri.replace(remote_prefix, "./")
        try:
            return local_path, open(local_path, "rb")
        except FileNotFoundError:
            return local_path, None

    # Avoid a shared mutable default; None and {} both mean "no options".
    options = options or {}

    local_url, f = open_local(input_uri)
    source = InputSource(input_uri)
    source.setPublicId(input_uri)
    source.setByteStream(f)
    source.url = input_uri
    source.links = []
    if local_url.endswith((".jsonld", ".jldt")):
        source.content_type = "application/ld+json"
    else:
        source.content_type = "application/json"
    source.format = format

    if "httpLink" in options:
        source.links.append(options["httpLink"])
    if "contentType" in options:
        source.content_type = options["contentType"]
    if "redirectTo" in options:
        redir = suite_base + options["redirectTo"]
        # Swap the stream for the redirect target and re-identify the source.
        if f:
            f.close()
        _, f = open_local(redir)
        source.setByteStream(f)
        source.url = redir
        source.setPublicId(redir)
        source.setSystemId(redir)
    return source

def do_test_json(suite_base, cat, num, inputpath, expectedpath, context, options):
input_uri = suite_base + inputpath
input_obj = _load_json(inputpath)
input_graph = ConjunctiveGraph()
to_rdf(
input_obj,
input_graph,
base=input_uri,
context_data=context,
generalized_rdf=True,
)
if cat == "remote-doc":
input_src = make_fake_urlinputsource(input_uri, format="json-ld", suite_base=suite_base, options=options)
p = JsonLDParser()
p.parse(input_src, input_graph, base=input_src.getPublicId(), context_data=context, generalized_rdf=True)
else:
input_obj = _load_json(inputpath)
to_rdf(
input_obj,
input_graph,
base=input_uri,
context_data=context,
generalized_rdf=True,
)
expected_json = _load_json(expectedpath)
use_native_types = True # CONTEXT in input_obj
result_json = from_rdf(
input_graph,
context,
base=input_uri,
base="./", # deliberately set base different to the input base
use_native_types=options.get("useNativeTypes", use_native_types),
use_rdf_type=options.get("useRdfType", False),
)
Expand Down Expand Up @@ -72,14 +117,19 @@ def do_test_parser(suite_base, cat, num, inputpath, expectedpath, context, optio
version = 1.1
elif requested_version == "json-ld-1.0":
version = 1.0
to_rdf(
input_obj,
result_graph,
context_data=context,
base=options.get("base", input_uri),
version=version,
generalized_rdf=options.get("produceGeneralizedRdf", False),
)
if cat == "remote-doc":
input_src = make_fake_urlinputsource(input_uri, format="json-ld", options=options)
p = JsonLDParser()
p.parse(input_src, result_graph, base=input_uri, context_data=context, generalized_rdf=True)
else:
to_rdf(
input_obj,
result_graph,
context_data=context,
base=options.get("base", input_uri),
version=version,
generalized_rdf=options.get("produceGeneralizedRdf", False),
)
assert isomorphic(result_graph, expected_graph), "Expected:\n%s\nGot:\n%s" % (
expected_graph.serialize(),
result_graph.serialize(),
Expand Down
10 changes: 8 additions & 2 deletions test/jsonld/test_onedotone.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,6 @@

testsuite_dir = p.join(p.abspath(p.dirname(__file__)), "1.1")


unsupported_tests: Tuple[str, ...] = ("frame", "normalize")
unsupported_tests += (
"error",
Expand Down Expand Up @@ -137,6 +136,13 @@
"toRdf/tn02-in",
# TODO: Rdflib should silently reject bad predicate URIs
"toRdf/wf02-in",
# TODO: we don't extract context or json-ld that's embedded in HTML
"remote-doc/0013-in",
"remote-doc/la01-in",
"remote-doc/la02-in",
"remote-doc/la03-in",
"remote-doc/la04-in",
"remote-doc/la05-in",
)

if os.name == "nt":
Expand Down Expand Up @@ -208,7 +214,7 @@ def get_test_suite_cases():
):
# Skip the JSON v1.0 tests
continue
if inputpath.endswith(".jsonld"): # toRdf
if inputpath.endswith((".jldt", ".json", ".jsonld")): # toRdf
if expectedpath.endswith(".jsonld"): # compact/expand/flatten
func = runner.do_test_json
else: # toRdf
Expand Down