
Commit 348e1e3

1 parent 429ed9b

8 files changed: +208 -78 lines changed


metacat/common/dbbase.py

+28 -34

@@ -1,12 +1,12 @@
-from metacat.util import fetch_generator
-import json
+from metacat.util import fetch_generator, chunked
+import json, io, csv
 
 def transactioned(method):
     def decorated(first, *params, transaction=None, **args):
 
         if transaction is not None:
             return method(first, *params, transaction=transaction, **args)
-
+
         if isinstance(first, HasDB):
             transaction = first.DB.transaction()
         elif isinstance(first, type):
@@ -20,37 +20,31 @@ def decorated(first, *params, transaction=None, **args):
 
     return decorated
 
-def insert_many(transaction, table, column_names, tuples, copy_threshold = 100):
-
-    # if the tuples list or iterable is short enough, do it as multiple inserts
-    tuples_lst, tuples = make_list_if_short(tuples, copy_threshold)
-    if tuples_lst is not None and len(tuples_lst) <= copy_threshold:
-        columns = ",". join(column_names)
-        placeholders = ",".join(["%s"]*len(column_names))
-        try:
-            transaction.executemany(f"""
-                insert into parent_child({columns}) values({placeholders})
-            """, tuples_lst)
-            if do_commit: cursor.execute("commit")
-        except Exception as e:
-            cursor.execute("rollback")
-            raise
-    else:
-
-        csv_file = io.StringIO()
-        writer = csv.writer(csv_file, delimiter='\t', quoting=csv.QUOTE_MINIMAL)
-
-        for tup in tuples:
-            assert len(tup) == len(column_names)
-            tup = ["\\N" if x is None else x for x in tup]
-            writer.writerow(tup)
-        csv_file.seek(0,0)
-        try:
-            cursor.copy_from(csv_file, table, columns = column_names)
-            if do_commit: cursor.execute("commit")
-        except Exception as e:
-            cursor.execute("rollback")
-            raise
+@transactioned
+def insert_many(db, table, items, column_names=None, copy_threshold=0, chunk_size=1000, make_tuple=None, transaction=None):
+    for chunk in chunked(items, chunk_size):
+        if chunk:
+            if make_tuple is not None:
+                chunk = [make_tuple(item) for item in chunk]
+            if len(chunk) <= copy_threshold:
+                cols = "" if not column_names else "(" + ",".join(column_names) + ")"
+                ncols = len(column_names) if column_names else len(chunk[0])
+                vals = ",".join(["%s"] * ncols)
+                print("cols:", cols)
+                print("vals:", vals)
+                print("chunk:", chunk)
+                sql = f"insert into {table} {cols} values({vals})"
+                print("sql:", sql)
+                transaction.executemany(sql, chunk)
+            else:
+                csv_file = io.StringIO()
+                writer = csv.writer(csv_file, delimiter='\t', quoting=csv.QUOTE_MINIMAL)
+                for tup in chunk:
+                    assert len(tup) == len(column_names)
+                    tup = ["\\N" if x is None else x for x in tup]    # null in Postgres
+                    writer.writerow(tup)
+                csv_file.seek(0,0)
+                transaction.copy_from(csv_file, table, columns = column_names)
 
 
 class HasDB(object):
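
A minimal usage sketch of the reworked insert_many (illustrative only: the table and column names are hypothetical, "db" is assumed to be an open metacat database connection, and the import path follows this file's location):

    from metacat.common.dbbase import insert_many

    rows = [
        ("fid1", "ns1", "a.dat"),
        ("fid2", "ns1", "b.dat"),
    ]
    # chunked() batches the input; with the default copy_threshold=0 every
    # non-empty chunk is streamed via Postgres COPY rather than executemany()
    insert_many(db, "files", rows,
                column_names=["id", "namespace", "name"],
                chunk_size=1000)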

metacat/db/__init__.py

+1 -1

@@ -4,7 +4,7 @@
 
 from .common import (
     AlreadyExistsError, NotFoundError, IntegrityError, MetaValidationError, DatasetCircularDependencyDetected,
-    parse_name, alias, make_list_if_short, insert_bulk
+    parse_name, alias, make_list_if_short
 )
 
 from .param_category import DBParamCategory

metacat/db/common.py

+0 -33

@@ -71,36 +71,3 @@ def make_list_if_short(iterable, limit):
         return head, None
     else:
         return None, iterable
-
-def insert_bulk(cursor, table, column_names, tuples, do_commit=True, copy_threshold = 100):
-
-    # if the tuples list or iterable is short enough, do it as multiple inserts
-    tuples_lst, tuples = make_list_if_short(tuples, copy_threshold)
-    if tuples_lst is not None and len(tuples_lst) <= copy_threshold:
-        columns = ",". join(column_names)
-        placeholders = ",".join(["%s"]*len(column_names))
-        try:
-            cursor.executemany(f"""
-                insert into parent_child({columns}) values({placeholders})
-            """, tuples_lst)
-            if do_commit: cursor.execute("commit")
-        except Exception as e:
-            cursor.execute("rollback")
-            raise
-    else:
-
-        csv_file = io.StringIO()
-        writer = csv.writer(csv_file, delimiter='\t', quoting=csv.QUOTE_MINIMAL)
-
-        for tup in tuples:
-            assert len(tup) == len(column_names)
-            tup = ["\\N" if x is None else x for x in tup]
-            writer.writerow(tup)
-        csv_file.seek(0,0)
-        try:
-            cursor.copy_from(csv_file, table, columns = column_names)
-            if do_commit: cursor.execute("commit")
-        except Exception as e:
-            cursor.execute("rollback")
-            raise
-

metacat/db/dbobjects2.py

+36 -5

@@ -16,7 +16,7 @@ def debug(*parts):
 
 from .common import (
     AlreadyExistsError, DatasetCircularDependencyDetected, NotFoundError, MetaValidationError,
-    parse_name, alias, insert_bulk
+    parse_name, alias
 )
 
 class DBFileSet(DBObject):
@@ -586,10 +586,11 @@ def create(self, creator=None, transaction=None):
             (self.FID, self.Namespace, self.Name, meta, self.Size, checksums, creator))
         self.CreatedTimestamp = c.fetchone()[0]
         if self.Parents:
-            insert_many(transaction,
+            insert_many(self.DB,
                 "parent_child",
-                ["parent_id", "child_id"],
                 ((p.FID if isinstance(p, DBFile) else p, self.FID) for p in self.Parents),
+                column_names=["parent_id", "child_id"],
+                transaction=transaction
             )
         return self
 
@@ -761,8 +762,8 @@ def get_files(db, files, transaction=None):
                 name text);
             truncate table {temp_table};
         """)
-        cvs = strio.getvalue()
-        transaction.copy_from(io.StringIO(cvs), temp_table)
+        csv = strio.getvalue()
+        transaction.copy_from(io.StringIO(csv), temp_table)
         #print("DBFile.get_files: strio:", strio.getvalue())
 
         columns = DBFile.all_columns("f")
@@ -775,6 +776,36 @@ def get_files(db, files, transaction=None):
 
         return DBFileSet(db, sql=sql)
 
+    @staticmethod
+    @transactioned
+    def move_to_namespace(db, namespace, files, transaction=None):
+        """
+        WARNING: DOES NOT check namespace permissions for the source namespace
+
+        files expected to be a list of DBFile objects with correct file ids
+        """
+
+        suffix = int(time.time()*1000) % 10000
+        temp_table = f"temp_fids_{suffix}"
+        transaction.execute(f"create temp table {temp_table} ( id text );")
+
+        # insert_many() chunks the id stream internally (chunk_size=1000 by default)
+        insert_many(db, temp_table, ((f.FID,) for f in files), column_names=["id"], transaction=transaction)
+
+        transaction.execute(f"""
+            update files set namespace = %(ns)s
+                from {temp_table} tt
+                where files.id = tt.id
+                    and files.namespace != %(ns)s
+            """, {"ns": namespace}
+        )
+        return transaction.rowcount
+
     @staticmethod
     @transactioned
     def get(db, fid = None, namespace = None, name = None, with_metadata = False, transaction=None):
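
A sketch of the new bulk-move path (hypothetical usage: "db" is an open database connection and "files" is a list of DBFile objects that already carry valid file ids, as the docstring requires; the namespace is a placeholder):

    # stages the file ids into a temp table via insert_many(), then issues
    # a single UPDATE; returns the number of rows actually changed
    nmoved = DBFile.move_to_namespace(db, "new_namespace", files)
    print(nmoved, "files moved")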

metacat/ui/metacat_file.py

+38 -1

@@ -658,7 +658,7 @@ class AddCommand(CLICommand):
    -s|--sample - print JSON file list sample
    """
 
-    Usage = 'Use "metacat dataset add..." instead'
+    Usage = 'DEPRECATED. Use "metacat dataset add..." instead'
 
     AddSample = json.dumps(
         [
@@ -707,6 +707,43 @@ def __call__(self, command, client, opts, args):
         file_list = load_file_list(opts["-f"])
         dataset = args[-1]
         out = client.add_files(dataset, file_list)
+
+class MoveCommand(CLICommand):
+
+    Usage = """[options] - move files to a new namespace
+
+        -n|--namespace - new namespace
+
+        Specify files explicitly:
+            -f|--files <file namespace>:<file name>[,...]
+            -f|--files <file id>[,...]
+            -f|--files <file>       - read the list from text file
+            -f|--files <JSON file>  - read the list from JSON file
+            -f|--files -            - read the list from stdin
+
+        Use results of a query:
+            -q|--query "<MQL query>"
+            -q|--query <file>       - read query from the file
+            -q|--query -            - read query from stdin
+    """
+    Opts = ("f:n:q:", ["namespace=", "files=", "query="])
+
+    def __call__(self, command, client, opts, args):
+        namespace = opts.get("-n") or opts.get("--namespace")
+        if not namespace:
+            raise InvalidArguments("Namespace must be specified")
+        query = opts.get("-q") or opts.get("--query")
+        if query:
+            query = load_text(query)
+        file_list = opts.get("-f") or opts.get("--files")
+        if file_list:
+            file_list = load_file_list(file_list)
+        if (file_list is None) == (query is None):
+            raise InvalidArguments("Either query or file list must be specified, but not both")
+        client.Timeout = None   # this may take a long time, so turn the timeout off
+        nmoved = client.move_files(namespace, file_list=file_list, query=query)
+        print(nmoved, "files moved")
 
 FileCLI = CLI(
     "declare", DeclareSingleCommand(),

metacat/webapi/webapi.py

+49 -1

@@ -102,7 +102,10 @@ class HTTPClient(object):
     def __init__(self, server_url, token, timeout=None):
         self.ServerURL = server_url
         self.Token = token
-        self.Timeout = timeout or self.DefaultTimeout
+        if timeout is not None and timeout <= 0:
+            self.Timeout = None     # no timeout
+        else:
+            self.Timeout = timeout or self.DefaultTimeout
         self.LastResponse = self.LastURL = self.LastStatusCode = None
 
     def retry_request(self, method, url, timeout=None, **args):
@@ -725,6 +728,51 @@ def declare_files(self, dataset, files, namespace=None, dry_run=False):
         out = self.post_json(url, lst)
         return out
 
+    def move_files(self, namespace, file_list=None, query=None):
+        """
+        Arguments
+        ---------
+        namespace : str
+            namespace to move files to
+        query : str
+            MQL query to run; files matching the query will be moved
+        file_list : list
+            List of dictionaries, one dictionary per file. Each dictionary must contain either a file id:
+
+            .. code-block:: python
+
+                { "fid": "abcd12345" }
+
+            or namespace/name:
+
+            .. code-block:: python
+
+                { "name": "filename.data", "namespace": "my_namespace" }
+
+            or DID:
+
+            .. code-block:: python
+
+                { "did": "my_namespace:filename.data" }
+        """
+        params = {
+            "namespace": namespace,
+        }
+        if file_list is not None:
+            lst = []
+            for f in file_list:
+                spec = ObjectSpec.from_dict(f, default_namespace)
+                spec.validate()
+                lst.append(spec.as_dict())
+            params["files"] = lst
+        elif query:
+            params["query"] = query
+        else:
+            raise ValueError("Either file_list or query must be specified, but not both")
+
+        out = self.post_json(url, params)
+        return out["files_moved"]
+
     def update_file(self, did=None, namespace=None, name=None, fid=None, replace=False,
             size=None, checksums=None, parents=None, children=None, metadata=None
         ):
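
A minimal client-side sketch of the new call (the server URL and file names are placeholders; assumes a MetaCatClient configured as usual):

    from metacat.webapi import MetaCatClient

    client = MetaCatClient("https://metacat.example.com:9443/api")
    client.Timeout = None   # bulk moves can run long; None disables the timeout
    nmoved = client.move_files("new_namespace",
                               file_list=[{"did": "old_namespace:file1.dat"}])
    print(nmoved, "files moved")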

upload_pypi.sh

+1 -0

@@ -4,4 +4,5 @@
 rm -rf build dist *.egg-info
 python setup.py sdist bdist_wheel
 twine upload dist/*
+rm -rf build dist *.egg-info
