
Commit 5b235db

Merge branch 'dev' into resource-allocation-page

2 parents 3220b7f + 913a31f

9 files changed: +129 additions, -47 deletions

.github/workflows/qiita-ci.yml

Lines changed: 1 addition & 3 deletions
@@ -104,9 +104,7 @@ jobs:
       - name: Install plugins
         shell: bash -l {0}
         run: |
-          wget https://data.qiime2.org/distro/core/qiime2-2022.11-py38-linux-conda.yml
-          conda env create --quiet -n qtp-biom --file qiime2-2022.11-py38-linux-conda.yml
-          rm qiime2-2022.11-py38-linux-conda.yml
+          conda env create -n qtp-biom --file https://data.qiime2.org/distro/amplicon/qiime2-amplicon-2024.5-py39-linux-conda.yml
           export QIITA_ROOTCA_CERT=`pwd`/qiita_core/support_files/ci_rootca.crt
           export QIITA_CONFIG_FP=`pwd`/qiita_core/support_files/config_test.cfg
           export REDBIOM_HOST="http://localhost:7379"
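
The updated step relies on conda env create accepting a remote --file URL, which removes the wget/rm bookkeeping. A minimal sketch of the same call driven from Python, assuming conda is on PATH (the subprocess invocation is an illustration, not part of the commit):

import subprocess

ENV_YML = ('https://data.qiime2.org/distro/amplicon/'
           'qiime2-amplicon-2024.5-py39-linux-conda.yml')

# one call replaces wget + conda env create + rm; check=True makes the
# CI step fail if environment creation fails
subprocess.run(['conda', 'env', 'create', '-n', 'qtp-biom',
                '--file', ENV_YML], check=True)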

qiita_db/artifact.py

Lines changed: 28 additions & 17 deletions
@@ -1342,23 +1342,6 @@ def _helper(sql_edges, edges, nodes):
                     # If the job is in success we don't need to do anything
                     # else since it would've been added by the code above
                     if jstatus != 'success':
-                        # Connect the job with his input artifacts, the
-                        # input artifacts may or may not exist yet, so we
-                        # need to check both the input_artifacts and the
-                        # pending properties
-                        for in_art in n_obj.input_artifacts:
-                            iid = in_art.id
-                            if iid not in nodes and iid in extra_nodes:
-                                nodes[iid] = extra_nodes[iid]
-                            _add_edge(edges, nodes[iid], nodes[n_obj.id])
-
-                        pending = n_obj.pending
-                        for pred_id in pending:
-                            for pname in pending[pred_id]:
-                                in_node_id = '%s:%s' % (
-                                    pred_id, pending[pred_id][pname])
-                                _add_edge(edges, nodes[in_node_id],
-                                          nodes[n_obj.id])
 
                         if jstatus != 'error':
                             # If the job is not errored, we can add the
@@ -1380,6 +1363,34 @@ def _helper(sql_edges, edges, nodes):
                                     queue.append(cjob.id)
                                     if cjob.id not in nodes:
                                         nodes[cjob.id] = ('job', cjob)
+
+                                        # including the outputs
+                                        for o_name, o_type in cjob.command.outputs:
+                                            node_id = '%s:%s' % (cjob.id, o_name)
+                                            node = TypeNode(
+                                                id=node_id, job_id=cjob.id,
+                                                name=o_name, type=o_type)
+                                            if node_id not in nodes:
+                                                nodes[node_id] = ('type', node)
+
+                        # Connect the job with his input artifacts, the
+                        # input artifacts may or may not exist yet, so we
+                        # need to check both the input_artifacts and the
+                        # pending properties
+                        for in_art in n_obj.input_artifacts:
+                            iid = in_art.id
+                            if iid not in nodes and iid in extra_nodes:
+                                nodes[iid] = extra_nodes[iid]
+                            _add_edge(edges, nodes[iid], nodes[n_obj.id])
+
+                        pending = n_obj.pending
+                        for pred_id in pending:
+                            for pname in pending[pred_id]:
+                                in_node_id = '%s:%s' % (
+                                    pred_id, pending[pred_id][pname])
+                                _add_edge(edges, nodes[in_node_id],
+                                          nodes[n_obj.id])
+
                 elif n_type == 'type':
                     # Connect this 'future artifact' with the job that will
                     # generate it
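
For orientation, a minimal sketch of the input-wiring block that the diff moves below the child-job loop, using hypothetical simplified containers (`nodes` maps ids to graph nodes, `edges` is a plain list, `extra_nodes` holds artifacts discovered elsewhere in the traversal); the real logic lives in _helper above:

def connect_inputs(job, nodes, edges, extra_nodes):
    # inputs that already exist as artifacts; promote them from
    # extra_nodes if the traversal has not added them yet
    for in_art in job.input_artifacts:
        iid = in_art.id
        if iid not in nodes and iid in extra_nodes:
            nodes[iid] = extra_nodes[iid]
        edges.append((nodes[iid], nodes[job.id]))
    # inputs that are still pending: future artifacts addressed as
    # 'predecessor_job_id:output_name', matching the 'type' nodes the
    # new block above creates for every child job's declared outputs
    for pred_id, outs in job.pending.items():
        for pname in outs:
            in_node_id = '%s:%s' % (pred_id, outs[pname])
            edges.append((nodes[in_node_id], nodes[job.id]))

Moving this block after the output registration is what lets a pending input resolve to a 'type' node that now exists by the time the edge is drawn.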

qiita_db/metadata_template/prep_template.py

Lines changed: 5 additions & 0 deletions
@@ -815,6 +815,9 @@ def _get_predecessors(workflow, node):
                     pred.append(data)
             return pred
 
+        # this is only helpful for when there are no _get_predecessors
+        return pred
+
         # Note: we are going to use the final BIOMs to figure out which
         # processing is missing from the back/end to the front, as this
         # will prevent generating unnecessary steps (AKA already provided
@@ -937,6 +940,8 @@ def _get_predecessors(workflow, node):
                     if set(merging_schemes[info]) >= set(cxns):
                         init_artifacts = merging_schemes[info]
                         break
+            if not predecessors:
+                pnode = node
             if init_artifacts is None:
                 pdp = pnode.default_parameter
                 pdp_cmd = pdp.command
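
A heavily simplified sketch of the case these two fixes guard against (hypothetical helper and node names; the real _get_predecessors returns richer data tuples): a node with no predecessors must still yield an empty list, and the caller must then fall back to the node itself.

import networkx as nx

def get_predecessors(graph, node):
    pred = []
    for pnode in graph.predecessors(node):
        pred.extend(get_predecessors(graph, pnode))
        pred.append(pnode)
    # reached only when the node has no predecessors at all
    return pred

g = nx.DiGraph()
g.add_node('lonely-command')      # a workflow with a single step
predecessors = get_predecessors(g, 'lonely-command')
# with nothing preceding it, processing starts at the node itself
pnode = predecessors[0] if predecessors else 'lonely-command'
assert pnode == 'lonely-command'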

qiita_db/software.py

Lines changed: 11 additions & 0 deletions
@@ -1995,9 +1995,20 @@ def graph(self):
             qdb.sql_connection.TRN.add(sql, [self.id])
             db_edges = qdb.sql_connection.TRN.execute_fetchindex()
 
+            # let's track what nodes are actually being used so if they do not
+            # have an edge we still return them as part of the graph
+            used_nodes = nodes.copy()
             for edge_id, p_id, c_id in db_edges:
                 e = DefaultWorkflowEdge(edge_id)
                 g.add_edge(nodes[p_id], nodes[c_id], connections=e)
+                if p_id in used_nodes:
+                    del used_nodes[p_id]
+                if c_id in used_nodes:
+                    del used_nodes[c_id]
+            # adding the missing nodes
+            for ms in used_nodes:
+                g.add_node(nodes[ms])
+
         return g
 
     @property
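
A minimal sketch of the same pattern with networkx (already a qiita dependency); the node objects here are placeholder strings rather than DefaultWorkflowNode instances:

import networkx as nx

def build_graph(nodes, db_edges):
    # nodes: {node_id: node_object}; db_edges: (edge_id, parent, child)
    g = nx.DiGraph()
    used_nodes = nodes.copy()
    for edge_id, p_id, c_id in db_edges:
        g.add_edge(nodes[p_id], nodes[c_id])
        used_nodes.pop(p_id, None)
        used_nodes.pop(c_id, None)
    # anything left never appeared in an edge: add it explicitly so
    # commands without connections are not silently dropped
    for node in used_nodes.values():
        g.add_node(node)
    return g

g = build_graph({1: 'cmd-a', 2: 'cmd-b', 3: 'cmd-alone'}, [(10, 1, 2)])
assert 'cmd-alone' in g.nodes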

qiita_db/test/test_artifact.py

Lines changed: 41 additions & 21 deletions
@@ -404,9 +404,8 @@ def test_descendants_with_jobs(self):
                     '"phred_offset": "auto"}')
         params = qdb.software.Parameters.load(qdb.software.Command(1),
                                               json_str=json_str)
-        user = qdb.user.User('[email protected]')
         wf = qdb.processing_job.ProcessingWorkflow.from_scratch(
-            user, params, name='Test WF')
+            qdb.user.User('[email protected]'), params, name='Test WF')
         parent = list(wf.graph.nodes())[0]
         wf.add(qdb.software.DefaultParameters(10),
                connections={parent: {'demultiplexed': 'input_data'}})
@@ -699,6 +698,8 @@ def setUp(self):
 
         self._clean_up_files.extend([self.fwd, self.rev])
 
+        self.user = qdb.user.User('[email protected]')
+
     def tearDown(self):
         for f in self._clean_up_files:
             if exists(f):
@@ -1039,7 +1040,7 @@ def test_delete_in_construction_job(self):
             '"min_per_read_length_fraction": 0.75, "sequence_max_n": 0, '
             '"phred_offset": ""}' % test.id)
         qdb.processing_job.ProcessingJob.create(
-            qdb.user.User('[email protected]'),
+            self.user,
             qdb.software.Parameters.load(qdb.software.Command(1),
                                          json_str=json_str))
         uploads_fp = join(qdb.util.get_mountpoint("uploads")[0][1],
@@ -1064,7 +1065,7 @@ def test_delete_error_running_job(self):
             '"min_per_read_length_fraction": 0.75, "sequence_max_n": 0, '
             '"phred_offset": ""}' % test.id)
         job = qdb.processing_job.ProcessingJob.create(
-            qdb.user.User('[email protected]'),
+            self.user,
             qdb.software.Parameters.load(qdb.software.Command(1),
                                          json_str=json_str))
         job._set_status('running')
@@ -1147,7 +1148,7 @@ def test_delete_with_jobs(self):
             '"min_per_read_length_fraction": 0.75, "sequence_max_n": 0, '
             '"phred_offset": ""}' % test.id)
         job = qdb.processing_job.ProcessingJob.create(
-            qdb.user.User('[email protected]'),
+            self.user,
             qdb.software.Parameters.load(qdb.software.Command(1),
                                          json_str=json_str))
         job._set_status('success')
@@ -1177,8 +1178,7 @@ def test_being_deleted_by(self):
         cmd = qiita_plugin.get_command('delete_artifact')
         params = qdb.software.Parameters.load(
             cmd, values_dict={'artifact': test.id})
-        job = qdb.processing_job.ProcessingJob.create(
-            qdb.user.User('[email protected]'), params, True)
+        job = qdb.processing_job.ProcessingJob.create(self.user, params, True)
         job._set_status('running')
 
         # verifying that there is a job and is the same than above
@@ -1189,8 +1189,7 @@ def test_being_deleted_by(self):
         self.assertIsNone(test.being_deleted_by)
 
         # now, let's actually remove
-        job = qdb.processing_job.ProcessingJob.create(
-            qdb.user.User('[email protected]'), params, True)
+        job = qdb.processing_job.ProcessingJob.create(self.user, params, True)
         job.submit()
         # let's wait for job
         wait_for_processing_job(job.id)
@@ -1207,7 +1206,7 @@ def test_delete_as_output_job(self):
         data = {'OTU table': {'filepaths': [(fp, 'biom')],
                               'artifact_type': 'BIOM'}}
         job = qdb.processing_job.ProcessingJob.create(
-            qdb.user.User('[email protected]'),
+            self.user,
             qdb.software.Parameters.load(
                 qdb.software.Command.get_validator('BIOM'),
                 values_dict={'files': dumps({'biom': [fp]}),
@@ -1448,29 +1447,50 @@ def test_descendants_with_jobs(self):
                                      data_type="16S")
         self.assertEqual(len(a.analysis.artifacts), 3)
         # 3. add jobs conencting the new artifact to the other root
+        # - currently:
         # a -> job -> b
         # c
-        # job1 connects b & c
-        # job2 connects a & c
+        # - expected:
+        # a --> job -> b
+        #   |-> job2 -> out
+        #   ^
+        #   |-----|---> job1 -> out
+        # c ------------|
         cmd = qdb.software.Command.create(
             qdb.software.Software(1),
             "CommandWithMultipleInputs", "", {
-                'input_b': ['artifact:["BIOM"]', None],
-                'input_c': ['artifact:["BIOM"]', None]}, {'out': 'BIOM'})
-        params = qdb.software.Parameters.load(
-            cmd, values_dict={'input_b': a.children[0].id, 'input_c': c.id})
-        job1 = qdb.processing_job.ProcessingJob.create(
-            qdb.user.User('[email protected]'), params)
+                'input_x': ['artifact:["BIOM"]', None],
+                'input_y': ['artifact:["BIOM"]', None]}, {'out': 'BIOM'})
         params = qdb.software.Parameters.load(
-            cmd, values_dict={'input_b': a.id, 'input_c': c.id})
-        job2 = qdb.processing_job.ProcessingJob.create(
-            qdb.user.User('[email protected]'), params)
+            cmd, values_dict={'input_x': a.children[0].id, 'input_y': c.id})
+        wf = qdb.processing_job.ProcessingWorkflow.from_scratch(
+            self.user, params, name='Test WF')
+        job1 = list(wf.graph.nodes())[0]
 
+        cmd_dp = qdb.software.DefaultParameters.create("", cmd)
+        wf.add(cmd_dp, req_params={'input_x': a.id, 'input_y': c.id})
+        job2 = list(wf.graph.nodes())[1]
         jobs = [j[1] for e in a.descendants_with_jobs.edges
                 for j in e if j[0] == 'job']
         self.assertIn(job1, jobs)
         self.assertIn(job2, jobs)
 
+        # 4. add job3 connecting job2 output with c as inputs
+        # - expected:
+        # a --> job -> b
+        #   |-> job2 -> out -> job3 -> out
+        #   ^                    ^
+        #   |                    |
+        #   |                    |
+        #   |-----|---> job1 -> out
+        # c ------------|
+        wf.add(cmd_dp, connections={
+            job1: {'out': 'input_x'}, job2: {'out': 'input_y'}})
+        job3 = list(wf.graph.nodes())[2]
+        jobs = [j[1] for e in a.descendants_with_jobs.edges
                for j in e if j[0] == 'job']
+        self.assertIn(job3, jobs)
+
 
 @qiita_test_checker()
 class ArtifactArchiveTests(TestCase):

qiita_db/util.py

Lines changed: 2 additions & 1 deletion
@@ -2803,7 +2803,7 @@ def update_resource_allocation_table(weeks=1, test=None):
     sacct = [
         'sacct', '-p',
         '--format=JobID,ElapsedRaw,MaxRSS,Submit,Start,End,CPUTimeRAW,'
-        'ReqMem,AllocCPUs,AveVMSize', '--starttime',
+        'ReqMem,AllocCPUs,AveVMSize,MaxVMSizeNode', '--starttime',
         dates[0].strftime('%Y-%m-%d'), '--endtime',
         dates[1].strftime('%Y-%m-%d'), '--user', 'qiita', '--state', 'CD']

@@ -2922,6 +2922,7 @@ def merge_rows(rows):
     df['MaxRSSRaw'] = df.MaxRSS.apply(lambda x: MaxRSS_helper(str(x)))
     df['ElapsedRawTime'] = df.ElapsedRaw.apply(
         lambda x: timedelta(seconds=float(x)))
+    df.replace({np.nan: None}, inplace=True)
 
     for index, row in df.iterrows():
         with qdb.sql_connection.TRN:
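
A small sketch of why the NaN normalization matters, assuming (as the code above does) a DataFrame parsed from sacct output; psycopg2 renders None as SQL NULL, while a numpy NaN would reach the database as an invalid value for non-float columns:

from datetime import timedelta

import numpy as np
import pandas as pd

df = pd.DataFrame({'ElapsedRaw': ['120', '3600'],
                   'MaxRSS': ['1024K', np.nan]})
df['ElapsedRawTime'] = df.ElapsedRaw.apply(
    lambda x: timedelta(seconds=float(x)))
# normalize NaN to None before the row-by-row SQL insertion
df.replace({np.nan: None}, inplace=True)
assert df.MaxRSS[1] is None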

qiita_pet/handlers/software.py

Lines changed: 40 additions & 4 deletions
@@ -61,6 +61,7 @@ def _default_parameters_parsing(node):
         # getting the main default parameters
         nodes = []
         edges = []
+        at = w.artifact_type
 
         # first get edges as this will give us the main connected commands
         # and their order
@@ -72,18 +73,22 @@ def _default_parameters_parsing(node):
         #        output_type: output_node_name}, ...}
         # for easy look up and merge of output_names
         main_nodes = dict()
+        not_used_nodes = {n.id: n for n in graph.nodes}
         for i, (x, y) in enumerate(graph.edges):
+            if x.id in not_used_nodes:
+                del not_used_nodes[x.id]
+            if y.id in not_used_nodes:
+                del not_used_nodes[y.id]
+            vals_x, input_x, output_x = _default_parameters_parsing(x)
+            vals_y, input_y, output_y = _default_parameters_parsing(y)
+
             connections = []
             for a, _, c in graph[x][y]['connections'].connections:
                 connections.append("%s | %s" % (a, c))
 
-            vals_x, input_x, output_x = _default_parameters_parsing(x)
-            vals_y, input_y, output_y = _default_parameters_parsing(y)
-
             if i == 0:
                 # we are in the first element so we can specifically select
                 # the type we are looking for
-                at = w.artifact_type
                 if at in input_x[0][1]:
                     input_x[0][1] = at
                 else:
@@ -144,6 +149,37 @@ def _default_parameters_parsing(node):
 
         wparams = w.parameters
 
+        # adding nodes without edges
+        # as a first step if not_used_nodes is not empty we'll confirm that
+        # nodes/edges are empty; in theory we should never hit this
+        if not_used_nodes and (nodes or edges):
+            raise ValueError(
+                'Error, please check your workflow configuration')
+
+        # note that this block is similar but not identical to adding connected
+        # nodes
+        for i, (_, x) in enumerate(not_used_nodes.items()):
+            vals_x, input_x, output_x = _default_parameters_parsing(x)
+            if at in input_x[0][1]:
+                input_x[0][1] = at
+            else:
+                input_x[0][1] = '** WARNING, NOT DEFINED **'
+
+            name_x = vals_x[0]
+            if vals_x not in (nodes):
+                nodes.append(vals_x)
+            for a, b in input_x:
+                if b in inputs:
+                    name = inputs[b]
+                else:
+                    name = 'input_%s_%s' % (name_x, b)
+                nodes.append([name, a, b])
+                edges.append([name, vals_x[0]])
+            for a, b in output_x:
+                name = 'output_%s_%s' % (name_x, b)
+                nodes.append([name, a, b])
+                edges.append([name_x, name])
+
         workflows.append(
             {'name': w.name, 'id': w.id, 'data_types': w.data_type,
              'description': w.description, 'active': w.active,
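
To make the new branch concrete, a sketch with hypothetical values of what it emits for a workflow whose graph holds a single command and therefore no edges (the shapes of vals_x/input_x/output_x are assumed from the code above, not documented API): the command node plus synthetic input and output nodes wired to it, so the front end can still draw the workflow.

nodes, edges, inputs = [], [], {}
at = 'FASTQ'                                   # w.artifact_type
# what _default_parameters_parsing might return for the lone command
vals_x = ['params_1', 'Split libraries FASTQ']
input_x = [['artifact', at]]
output_x = [['artifact', 'demultiplexed']]

name_x = vals_x[0]
nodes.append(vals_x)
for a, b in input_x:
    name = inputs.get(b, 'input_%s_%s' % (name_x, b))
    nodes.append([name, a, b])
    edges.append([name, name_x])
for a, b in output_x:
    name = 'output_%s_%s' % (name_x, b)
    nodes.append([name, a, b])
    edges.append([name_x, name])
# edges now link input -> command -> output even without graph edges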

setup.py

Lines changed: 1 addition & 1 deletion
@@ -105,7 +105,7 @@
     install_requires=['psycopg2', 'click', 'bcrypt', 'pandas<2.0',
                       'biom-format', 'tornado<6.0', 'toredis', 'redis',
                       'scp', 'pyparsing', 'h5py', 'natsort', 'nose', 'pep8',
-                      'networkx', 'humanize', 'wtforms<3.0.0', 'nltk',
+                      'networkx', 'humanize', 'wtforms<3.0.0', 'nltk<=3.8.1',
                       'openpyxl', 'sphinx-bootstrap-theme', 'Sphinx<3.0',
                       'gitpython', 'redbiom', 'pyzmq', 'sphinx_rtd_theme',
                       'paramiko', 'seaborn', 'matplotlib', 'scipy<=1.10.1',
