Rework how identifiers are rewritten in packing to handle $imports. (#277)

tetron · web-flow · commit 5495aa50167a · 2017-01-19T13:26:07.000-05:00
* Rework how identifiers are rewritten in packing to handle $imports.
* Don't prepend the filename to ids in the primary file.
* Only rewrite ids that use the file:// scheme.
* Packing also collects schemas.
diff --git a/cwltool/load_tool.py b/cwltool/load_tool.py
@@ -123,7 +123,7 @@ def validate_document(document_loader,   # type: Loader
         }
 
     if not isinstance(workflowobj, dict):
-        raise ValueError("workflowjobj must be a dict")
+        raise ValueError("workflowjobj must be a dict, got '%s': %s" % (type(workflowobj), workflowobj))
 
     jobobj = None
     if "cwl:tool" in workflowobj:
diff --git a/cwltool/pack.py b/cwltool/pack.py
@@ -1,5 +1,5 @@
 import copy
-import json
+import urlparse
 
 from schema_salad.ref_resolver import Loader
 
@@ -19,68 +19,114 @@ def flatten_deps(d, files):  # type: (Any, Set[Text]) -> None
         if "listing" in d:
             flatten_deps(d["listing"], files)
 
-def find_run(d, runs):  # type: (Any, Set[Text]) -> None
+def find_run(d, loadref, runs):  # type: (Any, Callable[[Text, Text], Union[Dict, List, Text]], Set[Text]) -> None
     if isinstance(d, list):
         for s in d:
-            find_run(s, runs)
+            find_run(s, loadref, runs)
     elif isinstance(d, dict):
         if "run" in d and isinstance(d["run"], (str, unicode)):
-            runs.add(d["run"])
+            if d["run"] not in runs:
+                runs.add(d["run"])
+                find_run(loadref(None, d["run"]), loadref, runs)
         for s in d.values():
-            find_run(s, runs)
+            find_run(s, loadref, runs)
+
+def find_ids(d, ids):  # type: (Any, Set[Text]) -> None
+    if isinstance(d, list):
+        for s in d:
+            find_ids(s, ids)
+    elif isinstance(d, dict):
+        for i in ("id", "name"):
+            if i in d and isinstance(d[i], (str, unicode)):
+                ids.add(d[i])
+        for s in d.values():
+            find_ids(s, ids)
 
 def replace_refs(d, rewrite, stem, newstem):
     # type: (Any, Dict[Text, Text], Text, Text) -> None
     if isinstance(d, list):
         for s,v in enumerate(d):
-            if isinstance(v, (str, unicode)) and v.startswith(stem):
-                d[s] = newstem + v[len(stem):]
+            if isinstance(v, (str, unicode)):
+                if v in rewrite:
+                    d[s] = rewrite[v]
+                elif v.startswith(stem):
+                    d[s] = newstem + v[len(stem):]
             else:
                 replace_refs(v, rewrite, stem, newstem)
     elif isinstance(d, dict):
-        if "run" in d and isinstance(d["run"], (str, unicode)):
-            d["run"] = rewrite[d["run"]]
         for s,v in d.items():
-            if isinstance(v, (str, unicode)) and v.startswith(stem):
-                d[s] = newstem + v[len(stem):]
+            if isinstance(v, (str, unicode)):
+                if v in rewrite:
+                    d[s] = rewrite[v]
+                elif v.startswith(stem):
+                    d[s] = newstem + v[len(stem):]
             replace_refs(v, rewrite, stem, newstem)
 
 def pack(document_loader, processobj, uri, metadata):
     # type: (Loader, Union[Dict[Text, Any], List[Dict[Text, Any]]], Text, Dict[Text, Text]) -> Dict[Text, Any]
     def loadref(b, u):
         # type: (Text, Text) -> Union[Dict, List, Text]
         return document_loader.resolve_ref(u, base_url=b)[0]
-    deps = scandeps(uri, processobj, set(("run",)), set(), loadref)
 
-    fdeps = set((uri,))
-    flatten_deps(deps, fdeps)
+    runs = set((uri,))
+    find_run(processobj, loadref, runs)
 
-    runs = set()  # type: Set[Text]
-    for f in fdeps:
-        find_run(document_loader.idx[f], runs)
+    ids = set()  # type: Set[Text]
+    for f in runs:
+        find_ids(document_loader.resolve_ref(f)[0], ids)
 
     names = set()  # type: Set[Text]
-    rewrite = {}
-    if isinstance(processobj, list):
-        for p in processobj:
-            rewrite[p["id"]] = "#" + uniquename(shortname(p["id"]), names)
-    else:
-        rewrite[uri] = "#main"
+    rewrite = {}   # type: Dict[Text, Text]
 
-    for r in sorted(runs):
-        rewrite[r] = "#" + uniquename(shortname(r), names)
+    mainpath, _ = urlparse.urldefrag(uri)
+
+    def rewrite_id(r, mainuri):
+        # type: (Text, Text) -> None
+        if r == mainuri:
+            rewrite[r] = "#main"
+        elif r.startswith(mainuri) and r[len(mainuri)] in ("#", "/"):
+            pass
+        else:
+            path, frag = urlparse.urldefrag(r)
+            if path == mainpath:
+                rewrite[r] = "#" + uniquename(frag, names)
+            else:
+                if path not in rewrite:
+                    rewrite[path] = "#" + uniquename(shortname(path), names)
+
+    sortedids = sorted(ids)
+
+    for r in sortedids:
+        if r.startswith("file://"):
+            rewrite_id(r, uri)
 
     packed = {"$graph": [], "cwlVersion": metadata["cwlVersion"]
             }  # type: Dict[Text, Any]
 
-    for r in sorted(rewrite.keys()):
+    schemas = set()  # type: Set[Text]
+    for r in sorted(runs):
+        dcr, metadata = document_loader.resolve_ref(r)
+        if not isinstance(dcr, dict):
+            continue
+        for doc in (dcr, metadata):
+            if "$schemas" in doc:
+                for s in doc["$schemas"]:
+                    schemas.add(s)
+        if dcr.get("class") not in ("Workflow", "CommandLineTool", "ExpressionTool"):
+            continue
+        dc = cast(Dict[Text, Any], copy.deepcopy(dcr))
         v = rewrite[r]
-        dc = cast(Dict[Text, Any], copy.deepcopy(document_loader.idx[r]))
         dc["id"] = v
-        for n in ("name", "cwlVersion"):
+        for n in ("name", "cwlVersion", "$namespaces", "$schemas"):
             if n in dc:
                 del dc[n]
-        replace_refs(dc, rewrite, r+"/" if "#" in r else r+"#", v+"/")
         packed["$graph"].append(dc)
 
+    if schemas:
+        packed["$schemas"] = list(schemas)
+
+    for r in rewrite:
+        v = rewrite[r]
+        replace_refs(packed, rewrite, r+"/" if "#" in r else r+"#", v+"/")
+
     return packed
diff --git a/tests/test_pack.py b/tests/test_pack.py
@@ -22,5 +22,8 @@ def test_pack(self):
             expect_packed = json.load(f)
         adjustFileObjs(packed, partial(makeRelative, os.path.abspath("tests/wf")))
         adjustDirObjs(packed, partial(makeRelative, os.path.abspath("tests/wf")))
+        self.assertIn("$schemas", packed)
+        del packed["$schemas"]
+        del expect_packed["$schemas"]
 
         self.assertEqual(expect_packed, packed)
diff --git a/tests/wf/empty.ttl b/tests/wf/empty.ttl
diff --git a/tests/wf/expect_packed.cwl b/tests/wf/expect_packed.cwl
@@ -1,5 +1,6 @@
 {
     "cwlVersion": "v1.0",
+    "$schemas": ["file:///home/peter/work/cwltool/tests/wf/empty.ttl"],
     "$graph": [
         {
             "inputs": [
diff --git a/tests/wf/revtool.cwl b/tests/wf/revtool.cwl
@@ -4,6 +4,8 @@
 class: CommandLineTool
 cwlVersion: v1.0
 doc: "Reverse each line using the `rev` command"
+$schemas:
+  - empty.ttl
 
 # The "inputs" array defines the structure of the input object that describes
 # the inputs to the underlying program.  Here, there is one input field

Original file line number	Diff line number	Diff line change
`@@ -123,7 +123,7 @@ def validate_document(document_loader, # type: Loader`
`123`	`123`	`}`
`124`	`124`
`125`	`125`	`if not isinstance(workflowobj, dict):`
`126`		`- raise ValueError("workflowjobj must be a dict")`
	`126`	`+ raise ValueError("workflowjobj must be a dict, got '%s': %s" % (type(workflowobj), workflowobj))`
`127`	`127`
`128`	`128`	`jobobj = None`
`129`	`129`	`if "cwl:tool" in workflowobj:`
Original file line number	Diff line number	Diff line change
`@@ -1,5 +1,6 @@`
`1`	`1`	`{`
`2`	`2`	`"cwlVersion": "v1.0",`
	`3`	`+ "$schemas": ["file:///home/peter/work/cwltool/tests/wf/empty.ttl"],`
`3`	`4`	`"$graph": [`
`4`	`5`	`{`
`5`	`6`	`"inputs": [`