Merge pull request #1436 from RDFLib/jsonld_conneg

white-gecko · web-flow · commit cc9ac040d795 · 2022-03-17T09:17:37.000Z
Allow URLInputSource to get content-negotiation links from the Link headers
diff --git a/rdflib/parser.py b/rdflib/parser.py
@@ -10,6 +10,7 @@
 
 """
 
+from typing import TYPE_CHECKING
 import codecs
 import os
 import pathlib
@@ -20,6 +21,7 @@
     IO,
     Any,
     BinaryIO,
+    List,
     Optional,
     TextIO,
     Tuple,
@@ -30,6 +32,7 @@
 from urllib.request import Request
 from urllib.request import url2pathname
 from urllib.request import urlopen
+from urllib.parse import urljoin
 from urllib.error import HTTPError
 
 from xml.sax import xmlreader
@@ -39,6 +42,7 @@
 from rdflib.namespace import Namespace
 
 if TYPE_CHECKING:
+    from http.client import HTTPMessage, HTTPResponse
     from rdflib import Graph
 
 __all__ = [
@@ -190,9 +194,43 @@ def __init__(
 
 class URLInputSource(InputSource):
     """
-    TODO:
+    Constructs an RDFLib Parser InputSource from a URL to read it from the Web.
     """
 
+    # Link-values taken from the response's ``Link`` headers; assigned in
+    # ``__init__`` from the result of ``get_links``.
+    links: List[str]
+
+    @classmethod
+    def getallmatchingheaders(cls, message: 'HTTPMessage', name):
+        """
+        Return the values of every header in ``message`` called ``name``.
+
+        Header names are compared case-insensitively and the values are
+        returned in the order in which ``message.items()`` yields them.
+        """
+        # Reimplemented locally because HTTPMessage.getallmatchingheaders
+        # has been broken since Python 3.0.
+        wanted = name.lower()
+        matches = []
+        for header, value in message.items():
+            if header.lower() == wanted:
+                matches.append(value)
+        return matches
+
+    @classmethod
+    def get_links(cls, response: 'HTTPResponse'):
+        """
+        Collect the individual link-values from all ``Link`` headers.
+
+        One ``Link`` header may carry several comma-separated link-values and
+        a response may carry several ``Link`` headers; the result is a single
+        flat list of stripped link-value strings such as
+        ``'<ctx.jsonld>; rel="alternate"; type="application/ld+json"'``.
+
+        :param response: response object whose ``headers`` are searched.
+        :return: list of non-empty link-value strings, in header order.
+        """
+        import re
+
+        # A link-value always starts with "<URI-Reference>", so a comma that
+        # is not followed by "<" cannot begin a new link-value. Splitting on
+        # every comma would cut a value apart whenever the target URI or a
+        # quoted parameter (e.g. a title) contains a comma.
+        separator = re.compile(r",(?=\s*<)")
+        return [
+            link.strip()
+            for linksline in cls.getallmatchingheaders(response.headers, "Link")
+            for link in separator.split(linksline)
+            if link.strip()
+        ]
+
+    def get_alternates(self, type_: Optional[str] = None) -> List[str]:
+        """
+        Return the targets of all ``rel="alternate"`` links in ``self.links``.
+
+        :param type_: when given (and non-empty), only links whose ``type``
+            parameter equals this media type are returned.
+        :return: link targets with the surrounding ``<`` and ``>`` removed,
+            in the order they appear in ``self.links``. Targets are returned
+            as written in the header and may be relative references.
+        """
+        wanted_type = type_.lower() if type_ else None
+        alts = []
+        for link in self.links:
+            parts = [p.strip() for p in link.split(";")]
+            rels = set()
+            media_types = set()
+            for part in parts:
+                # Parameter names are case-insensitive, whitespace around
+                # "=" is tolerated and the value may or may not be quoted.
+                param, _, value = part.partition("=")
+                param = param.strip().lower()
+                value = value.strip().strip('"').lower()
+                if param == "rel":
+                    # "rel" may hold several space-separated relation types.
+                    rels.update(value.split())
+                elif param == "type":
+                    media_types.add(value)
+            if "alternate" not in rels:
+                continue
+            if wanted_type is None or wanted_type in media_types:
+                alts.append(parts[0].strip("<>"))
+        return alts
+
     def __init__(self, system_id: Optional[str] = None, format: Optional[str] = None):
         super(URLInputSource, self).__init__(system_id)
         self.url = system_id
@@ -244,16 +282,26 @@ def _urlopen(req: Request) -> Any:
                 else:
                     raise
 
-        file = _urlopen(req)
-        # Fix for issue 130 https://github.com/RDFLib/rdflib/issues/130
-        self.url = file.geturl()  # in case redirections took place
+        response: HTTPResponse = _urlopen(req)
+        self.url = response.geturl()  # in case redirections took place
+        self.links = self.get_links(response)
+        if format in ("json-ld", "application/ld+json"):
+            alts = self.get_alternates(type_="application/ld+json")
+            for link in alts:
+                full_link = urljoin(self.url, link)
+                if full_link != self.url and full_link != system_id:
+                    response = _urlopen(Request(full_link))
+                    self.url = response.geturl()  # in case redirections took place
+                    break
+
         self.setPublicId(self.url)
-        self.content_type = file.info().get("content-type")
+        content_types = self.getallmatchingheaders(response.headers, "content-type")
+        self.content_type = content_types[0] if content_types else None
         if self.content_type is not None:
             self.content_type = self.content_type.split(";", 1)[0]
-        self.setByteStream(file)
+        self.setByteStream(response)
         # TODO: self.setEncoding(encoding)
-        self.response_info = file.info()  # a mimetools.Message instance
+        self.response_info = response.info()  # a mimetools.Message instance
 
     def __repr__(self):
         return self.url
diff --git a/rdflib/plugins/parsers/jsonld.py b/rdflib/plugins/parsers/jsonld.py
@@ -92,7 +92,7 @@ def parse(self, source, sink, **kwargs):
         )
 
         context_data = kwargs.get("context")
-        if not context_data and isinstance(source, URLInputSource):
+        if not context_data and hasattr(source, "url") and hasattr(source, "links"):
             context_data = context_from_urlinputsource(source)
 
         try:
@@ -186,7 +186,9 @@ def _add_to_graph(self, dataset, graph, context, node, topcontext=False):
         id_val = context.get_id(node)
 
         if id_val is None:
-            id_val = self._get_nested_id(context, node)
+            nested_id = self._get_nested_id(context, node)
+            if nested_id is not None and len(nested_id) > 0:
+                id_val = nested_id
 
         if isinstance(id_val, str):
             subj = self._to_rdf_id(context, id_val)
diff --git a/rdflib/plugins/shared/jsonld/util.py b/rdflib/plugins/shared/jsonld/util.py
@@ -16,14 +16,17 @@
 from posixpath import normpath
 
 from urllib.parse import urljoin, urlsplit, urlunsplit
-
-from rdflib.parser import create_input_source, PythonInputSource, StringInputSource
+from rdflib.parser import (
+    create_input_source,
+    BytesIOWrapper,
+    PythonInputSource,
+    StringInputSource,
+)
 
 from io import TextIOBase, TextIOWrapper
 
 
 def source_to_json(source):
-
     if isinstance(source, PythonInputSource):
         return source.data
 
@@ -32,9 +35,10 @@ def source_to_json(source):
 
     # TODO: conneg for JSON (fix support in rdflib's URLInputSource!)
     source = create_input_source(source, format="json-ld")
-
     stream = source.getByteStream()
     try:
+        if isinstance(stream, BytesIOWrapper):
+            stream = stream.wrapped
         # Use character stream as-is, or interpret byte stream as UTF-8
         if isinstance(stream, TextIOBase):
             use_stream = stream
@@ -86,10 +90,17 @@ def norm_url(base, url):
 
 
 def context_from_urlinputsource(source):
-    if source.content_type == "application/json":
-        # response_info was added to InputSource in rdflib 4.2
+    """
+    Please note that JSON-LD documents served with the application/ld+json media type
+    MUST have all context information, including references to external contexts,
+    within the body of the document. Contexts linked via a
+    http://www.w3.org/ns/json-ld#context HTTP Link Header MUST be
+    ignored for such documents.
+    """
+    if source.content_type != "application/ld+json":
         try:
-            links = source.response_info.getallmatchingheaders("Link")
+            # source.links is the new way of getting Link headers from URLInputSource
+            links = source.links
         except AttributeError:
             return
         for link in links:
diff --git a/test/jsonld/runner.py b/test/jsonld/runner.py
@@ -1,8 +1,10 @@
 # -*- coding: UTF-8 -*-
 import json
+from functools import partial
 from rdflib import ConjunctiveGraph
 from rdflib.compare import isomorphic
-from rdflib.plugins.parsers.jsonld import to_rdf
+from rdflib.parser import InputSource
+from rdflib.plugins.parsers.jsonld import to_rdf, JsonLDParser
 from rdflib.plugins.serializers.jsonld import from_rdf
 from rdflib.plugins.shared.jsonld.keys import CONTEXT, GRAPH
 
@@ -16,26 +18,69 @@ def _preserving_nodeid(self, bnode_context=None):
     return bNode(self.eat(r_nodeid).group(1))
 
 
+
+
 DEFAULT_PARSER_VERSION = 1.0
 
 
+def make_fake_urlinputsource(input_uri, format=None, suite_base=None, options=None):
+    """
+    Build an ``InputSource`` that imitates a ``URLInputSource`` for a test document.
+
+    The document is read from the local copy of the test suite instead of
+    the network: the suite URL prefix in ``input_uri`` is replaced by ``./``.
+    If the local file does not exist the byte stream is set to ``None``.
+
+    :param input_uri: URI of the test document; used as system id, public id
+        and ``url`` of the returned source.
+    :param format: stored unchanged on the source as ``format``.
+    :param suite_base: prefix prepended to ``options["redirectTo"]``; when
+        omitted, the suite URL prefix is used.
+    :param options: optional test options. Recognised keys are ``httpLink``
+        (appended to ``source.links``), ``contentType`` (overrides the
+        content type guessed from the file extension) and ``redirectTo``
+        (the source is re-pointed at that document).
+    :return: the prepared ``InputSource``.
+    """
+    remote_prefix = "https://w3c.github.io/json-ld-api/tests/"
+    # None instead of a shared mutable default; an empty mapping means
+    # "no options", exactly like the previous ``if options:`` guard.
+    options = options or {}
+
+    def open_local(uri):
+        # A missing local file yields None so that the source can still be
+        # constructed, just without a byte stream.
+        try:
+            return open(uri.replace(remote_prefix, "./"), "rb")
+        except FileNotFoundError:
+            return None
+
+    local_url = input_uri.replace(remote_prefix, "./")
+    f = open_local(input_uri)
+    source = InputSource(input_uri)
+    source.setPublicId(input_uri)
+    source.setByteStream(f)
+    source.url = input_uri
+    source.links = []
+    if local_url.endswith((".jsonld", ".jldt")):
+        source.content_type = "application/ld+json"
+    else:
+        source.content_type = "application/json"
+    source.format = format
+    if "httpLink" in options:
+        source.links.append(options["httpLink"])
+    if "contentType" in options:
+        source.content_type = options["contentType"]
+    if "redirectTo" in options:
+        # Not every caller passes suite_base; concatenating None with the
+        # redirect path would raise TypeError, so fall back to the suite URL.
+        base = remote_prefix if suite_base is None else suite_base
+        redir = base + options["redirectTo"]
+        # The originally opened document is no longer served; release it
+        # before switching the stream to the redirect target.
+        if f:
+            f.close()
+        f = open_local(redir)
+        source.setByteStream(f)
+        source.url = redir
+        source.setPublicId(redir)
+        source.setSystemId(redir)
+    return source
+
 def do_test_json(suite_base, cat, num, inputpath, expectedpath, context, options):
     input_uri = suite_base + inputpath
-    input_obj = _load_json(inputpath)
     input_graph = ConjunctiveGraph()
-    to_rdf(
-        input_obj,
-        input_graph,
-        base=input_uri,
-        context_data=context,
-        generalized_rdf=True,
-    )
+    if cat == "remote-doc":
+        input_src = make_fake_urlinputsource(input_uri, format="json-ld", suite_base=suite_base, options=options)
+        p = JsonLDParser()
+        p.parse(input_src, input_graph, base=input_src.getPublicId(), context_data=context, generalized_rdf=True)
+    else:
+        input_obj = _load_json(inputpath)
+        to_rdf(
+            input_obj,
+            input_graph,
+            base=input_uri,
+            context_data=context,
+            generalized_rdf=True,
+        )
     expected_json = _load_json(expectedpath)
     use_native_types = True  # CONTEXT in input_obj
     result_json = from_rdf(
         input_graph,
         context,
-        base=input_uri,
+        base="./",  # deliberately set base different to the input base
         use_native_types=options.get("useNativeTypes", use_native_types),
         use_rdf_type=options.get("useRdfType", False),
     )
@@ -72,14 +117,19 @@ def do_test_parser(suite_base, cat, num, inputpath, expectedpath, context, optio
             version = 1.1
         elif requested_version == "json-ld-1.0":
             version = 1.0
-    to_rdf(
-        input_obj,
-        result_graph,
-        context_data=context,
-        base=options.get("base", input_uri),
-        version=version,
-        generalized_rdf=options.get("produceGeneralizedRdf", False),
-    )
+    if cat == "remote-doc":
+        input_src = make_fake_urlinputsource(input_uri, format="json-ld", options=options)
+        p = JsonLDParser()
+        p.parse(input_src, result_graph, base=input_uri, context_data=context, generalized_rdf=True)
+    else:
+        to_rdf(
+            input_obj,
+            result_graph,
+            context_data=context,
+            base=options.get("base", input_uri),
+            version=version,
+            generalized_rdf=options.get("produceGeneralizedRdf", False),
+        )
     assert isomorphic(result_graph, expected_graph), "Expected:\n%s\nGot:\n%s" % (
         expected_graph.serialize(),
         result_graph.serialize(),
diff --git a/test/jsonld/test_onedotone.py b/test/jsonld/test_onedotone.py
@@ -15,7 +15,6 @@
 
 testsuite_dir = p.join(p.abspath(p.dirname(__file__)), "1.1")
 
-
 unsupported_tests: Tuple[str, ...] = ("frame", "normalize")
 unsupported_tests += (
     "error",
@@ -137,6 +136,13 @@
     "toRdf/tn02-in",
     # TODO: Rdflib should silently reject bad predicate URIs
     "toRdf/wf02-in",
+    # TODO: we don't extract context or json-ld that's embedded in HTML
+    "remote-doc/0013-in",
+    "remote-doc/la01-in",
+    "remote-doc/la02-in",
+    "remote-doc/la03-in",
+    "remote-doc/la04-in",
+    "remote-doc/la05-in",
 )
 
 if os.name == "nt":
@@ -208,7 +214,7 @@ def get_test_suite_cases():
             ):
                 # Skip the JSON v1.0 tests
                 continue
-        if inputpath.endswith(".jsonld"):  # toRdf
+        if inputpath.endswith((".jldt", ".json", ".jsonld")):  # toRdf
             if expectedpath.endswith(".jsonld"):  # compact/expand/flatten
                 func = runner.do_test_json
             else:  # toRdf