Set up Gitlab testing for Kubernetes runners (#2903)
* Make the Gitlab config explain how to start and stop Docker, in case entrypoints aren't respected

* Actually import boto.exception.S3ResponseError

Looks like we never tested this code path because we never had an error here in the tests.

* Consider cgroup v1 limits when determining available CPUs

* Fail the Docker build if Quay login fails

* Use new CPU counting function in Parasol test support

* Use Gitlab secrets instead of the AWS secrets manager

This is so we don't rely on CI runners having any built-in AWS access.

* Instrument Quay login

* Don't open the name of an environment variable as a file

* Make sure to pre-pull base Docker images to avoid disconnects failing the actual build

* Make sure Docker is actually up before each pull

* Just do more pull attempts since it seems like pull is a bit flaky today

* Set timeouts on the deferred function tests and verbosify them.

* Make all the pytest tests verbose so we can see the names of what runs if it gets stuck

* Dump flaky test output

* Report on core counts in case they weirdly all became 1

* Send enough format arguments

* Adopt a non-kubernetes-dependent cgroup sizing method

* Stop instrumenting the flaky test so carefully
adamnovak authored Jan 15, 2020
1 parent ddb2beb commit debfc51
Showing 15 changed files with 139 additions and 49 deletions.
24 changes: 17 additions & 7 deletions .gitlab-ci.yml
@@ -1,4 +1,8 @@
image: quay.io/vgteam/vg_ci_prebake:latest
# Note that we must run in a privileged container for our internal Docker daemon to come up.

before_script:
- startdocker || true
- docker info
- cat /etc/hosts
- export PYTHONIOENCODING=utf-8
@@ -9,6 +13,7 @@ after_script:
# that next job.
- pwd
- sudo rm -rf tmp
- stopdocker || true


stages:
@@ -24,11 +29,11 @@ py2_batch_systems:
- pwd
- apt update && DEBIAN_FRONTEND=noninteractive apt install -y tzdata && apt install -y jq
- virtualenv -p python2.7 venv && . venv/bin/activate && make prepare && make develop extras=[all] && pip install htcondor awscli==1.16.272
# Get Kubernetes credentials before switching over to another AWS account
# Get Kubernetes credentials
- mkdir -p ~/.kube
- aws secretsmanager get-secret-value --secret-id allspark/runner/kubeconfig --region us-west-2 | jq -r .SecretString > ~/.kube/config
- cp "$GITLAB_SECRET_FILE_KUBE_CONFIG" ~/.kube/config
- mkdir -p ~/.aws
- echo -e $(aws secretsmanager get-secret-value --secret-id allspark/runner/credentials --region us-west-2 | jq -r .SecretString) > ~/.aws/credentials
- cp "$GITLAB_SECRET_FILE_AWS_CREDENTIALS" ~/.aws/credentials
- python -m pytest -r s src/toil/test/batchSystems/batchSystemTest.py
- python -m pytest -r s src/toil/test/mesos/MesosDataStructuresTest.py

@@ -70,6 +75,7 @@ py2_appliance_build:
- pwd
- apt update && DEBIAN_FRONTEND=noninteractive apt install -y tzdata && apt install -y jq
- virtualenv -p python2.7 venv && . venv/bin/activate && make prepare && make develop extras=[all] && pip install htcondor awscli==1.16.272
# This reads GITLAB_SECRET_FILE_QUAY_CREDENTIALS
- python setup_gitlab_docker.py
- export TOIL_APPLIANCE_SELF=quay.io/ucsc_cgl/toil:$(python version_template.py dockerTag)
- echo $TOIL_APPLIANCE_SELF
@@ -84,9 +90,10 @@ py2_integration_jobstore:
- export TOIL_TEST_INTEGRATIVE=True
- export TOIL_AWS_KEYNAME=id_rsa
- export TOIL_AWS_ZONE=us-west-2a
# This reads GITLAB_SECRET_FILE_SSH_KEYS
- python setup_gitlab_ssh.py
- mkdir -p ~/.aws
- echo -e $(aws secretsmanager get-secret-value --secret-id allspark/runner/credentials --region us-west-2 | jq -r .SecretString) > ~/.aws/credentials
- cp "$GITLAB_SECRET_FILE_AWS_CREDENTIALS" ~/.aws/credentials
- python -m pytest src/toil/test/jobStores/jobStoreTest.py

py2_integration_sort:
@@ -98,9 +105,10 @@ py2_integration_sort:
- export TOIL_TEST_INTEGRATIVE=True
- export TOIL_AWS_KEYNAME=id_rsa
- export TOIL_AWS_ZONE=us-west-2a
# This reads GITLAB_SECRET_FILE_SSH_KEYS
- python setup_gitlab_ssh.py
- mkdir -p ~/.aws
- echo -e $(aws secretsmanager get-secret-value --secret-id allspark/runner/credentials --region us-west-2 | jq -r .SecretString) > ~/.aws/credentials
- cp "$GITLAB_SECRET_FILE_AWS_CREDENTIALS" ~/.aws/credentials
- python -m pytest src/toil/test/sort/sortTest.py
- python -m pytest src/toil/test/provisioners/clusterScalerTest.py

@@ -113,9 +121,10 @@ py2_integration_sort:
# - export TOIL_TEST_INTEGRATIVE=True
# - export TOIL_AWS_KEYNAME=id_rsa
# - export TOIL_AWS_ZONE=us-west-2a
# # This reads GITLAB_SECRET_FILE_SSH_KEYS
# - python setup_gitlab_ssh.py
# - mkdir -p ~/.aws
# - echo -e $(aws secretsmanager get-secret-value --secret-id allspark/runner/credentials --region us-west-2 | jq -r .SecretString) > ~/.aws/credentials
# - cp "$GITLAB_SECRET_FILE_AWS_CREDENTIALS" ~/.aws/credentials
# - python -m pytest src/toil/test/provisioners/aws/awsProvisionerTest.py


@@ -171,7 +180,8 @@ py3_main:
# - export TOIL_TEST_INTEGRATIVE=True
# - export TOIL_AWS_KEYNAME=id_rsa
# - export TOIL_AWS_ZONE=us-west-2a
# # This reads GITLAB_SECRET_FILE_SSH_KEYS
# - python setup_gitlab_ssh.py
# - mkdir -p ~/.aws
# - echo -e $(aws secretsmanager get-secret-value --secret-id allspark/runner/credentials --region us-west-2 | jq -r .SecretString) > ~/.aws/credentials
# - cp "$GITLAB_SECRET_FILE_AWS_CREDENTIALS" ~/.aws/credentials
# - python -m pytest src/toil/test/jobStores/jobStoreTest.py
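These jobs now assume that each GITLAB_SECRET_FILE_* variable holds the path to a file the runner provisions (for example, a GitLab "File" type CI/CD variable), rather than the secret text itself. A hypothetical pre-flight check, not part of this commit, could verify that assumption before the tests run:

    import os
    import sys

    # Every secret file variable used by the jobs above must name a readable file.
    for var in ('GITLAB_SECRET_FILE_KUBE_CONFIG',
                'GITLAB_SECRET_FILE_AWS_CREDENTIALS',
                'GITLAB_SECRET_FILE_SSH_KEYS',
                'GITLAB_SECRET_FILE_QUAY_CREDENTIALS'):
        path = os.environ.get(var)
        if not path or not os.path.exists(path):
            print('Missing or unreadable secret file for ' + var)
            sys.exit(1)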
21 changes: 14 additions & 7 deletions Makefile
@@ -181,18 +181,24 @@ define tag_docker
endef

docker: docker/Dockerfile
# Pre-pull the base images, retrying each pull (up to 10 attempts, 60 seconds apart) so a transient registry disconnect doesn't fail the actual build
for i in $$(seq 1 11); do if [[ $$i == "11" ]] ; then exit 1 ; fi ; docker pull ubuntu:16.04 && break || sleep 60; done
for i in $$(seq 1 11); do if [[ $$i == "11" ]] ; then exit 1 ; fi ; docker pull prom/prometheus:v2.0.0 && break || sleep 60; done
for i in $$(seq 1 11); do if [[ $$i == "11" ]] ; then exit 1 ; fi ; docker pull grafana/grafana && break || sleep 60; done
for i in $$(seq 1 11); do if [[ $$i == "11" ]] ; then exit 1 ; fi ; docker pull sscaling/mtail && break || sleep 60; done

@set -ex \
; cd docker \
; docker build --tag=$(docker_image):$(docker_tag) -f Dockerfile .

@set -ex \
; cd dashboard/prometheus \
; docker build --tag=$(prometheus_image):$(docker_tag) -f Dockerfile .

@set -ex \
; cd dashboard/grafana \
; docker build --tag=$(grafana_image):$(docker_tag) -f Dockerfile .

@set -ex \
; cd dashboard/mtail \
; docker build --tag=$(mtail_image):$(docker_tag) -f Dockerfile .
@@ -208,10 +214,11 @@ clean_docker:
-docker rmi $(docker_image):$(docker_tag)

push_docker: docker
for i in $$(seq 1 5); do docker push $(docker_image):$(docker_tag) && break || sleep 60; done
for i in $$(seq 1 5); do docker push $(grafana_image):$(docker_tag) && break || sleep 60; done
for i in $$(seq 1 5); do docker push $(prometheus_image):$(docker_tag) && break || sleep 60; done
for i in $$(seq 1 5); do docker push $(mtail_image):$(docker_tag) && break || sleep 60; done
# Weird if logic is so we fail if all the pushes fail
for i in $$(seq 1 6); do if [[ $$i == "6" ]] ; then exit 1 ; fi ; docker push $(docker_image):$(docker_tag) && break || sleep 60; done
for i in $$(seq 1 6); do if [[ $$i == "6" ]] ; then exit 1 ; fi ; docker push $(grafana_image):$(docker_tag) && break || sleep 60; done
for i in $$(seq 1 6); do if [[ $$i == "6" ]] ; then exit 1 ; fi ; docker push $(prometheus_image):$(docker_tag) && break || sleep 60; done
for i in $$(seq 1 6); do if [[ $$i == "6" ]] ; then exit 1 ; fi ; docker push $(mtail_image):$(docker_tag) && break || sleep 60; done

else

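The seq-based loops above read a little oddly: each loop runs one iteration past the number of allowed attempts, and that extra iteration exists only to exit 1 when every attempt has failed. The same control flow sketched in Python, purely for illustration (not part of the Makefile):

    import subprocess
    import sys
    import time

    def retry_or_die(command, attempts, delay=60):
        # Mirrors `for i in $(seq 1 N+1); do if [[ $i == "N+1" ]]; then exit 1; fi; CMD && break || sleep $delay; done`
        for _ in range(attempts):
            if subprocess.call(command, shell=True) == 0:
                return
            time.sleep(delay)
        sys.exit(1)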
32 changes: 26 additions & 6 deletions setup_gitlab_docker.py
@@ -1,19 +1,39 @@
import subprocess
import json
import os
import subprocess
import sys

stderr = 'Login was not attempted'

p = subprocess.Popen('aws secretsmanager --region us-west-2 get-secret-value --secret-id /toil/gitlab/quay',
stdout=subprocess.PIPE, stderr=subprocess.PIPE, shell=True)
stdout, stderr = p.communicate()
env_var = 'GITLAB_SECRET_FILE_QUAY_CREDENTIALS'

try:
keys = json.loads(json.loads(stdout)['SecretString'])
if env_var not in os.environ:
print('Error: could not find environment variable ' + env_var)
sys.exit(1)

filename = os.environ[env_var]

if not os.path.exists(filename):
print('Error: could not find file referenced by ' + env_var)
sys.exit(1)

print('Opening key file...')
with open(filename, 'r') as cred_json_file:
print('Reading keys...')
keys = json.loads(cred_json_file.read())
print('Read and decoded keys')

print('Starting login process...')
process = subprocess.Popen('docker login quay.io -u "{user}" --password-stdin'.format(user=keys['user']),
stdout=subprocess.PIPE, stderr=subprocess.PIPE, stdin=subprocess.PIPE,
shell=True)
print('Logging in...')
stdout, stderr = process.communicate(input=keys['password'])
if 'Login Succeeded' in stdout:
print('Login Succeeded')
else:
raise RuntimeError
except:
print('While attempting to log into quay.io:\n' + str(stderr))
print('Error while attempting to log into quay.io:\n' + str(stderr))
sys.exit(1)
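The login script above expects the file named by GITLAB_SECRET_FILE_QUAY_CREDENTIALS to be JSON with user and password fields. A minimal sketch of producing a compatible stand-in for local testing (the path and values are placeholders, not real credentials, and this is not part of the commit):

    import json
    import os

    # Hypothetical local stand-in for the secret file the CI runner provides.
    fake_path = '/tmp/quay_credentials.json'
    with open(fake_path, 'w') as f:
        json.dump({'user': 'example-robot', 'password': 'example-token'}, f)
    os.environ['GITLAB_SECRET_FILE_QUAY_CREDENTIALS'] = fake_path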
13 changes: 6 additions & 7 deletions setup_gitlab_ssh.py
@@ -1,19 +1,18 @@
import subprocess
import json
import os

p = subprocess.Popen('aws secretsmanager --region us-west-2 get-secret-value --secret-id /toil/gitlab/ssh_key',
stdout=subprocess.PIPE, stderr=subprocess.PIPE, shell=True)
stdout, stderr = p.communicate()
import subprocess
import sys

good_spot = os.path.expanduser('~/.ssh')
os.mkdir(good_spot)

try:
keys = json.loads(json.loads(stdout)['SecretString'])
with open(os.environ['GITLAB_SECRET_FILE_SSH_KEYS'], 'r') as keys_json_file:
keys = json.loads(keys_json_file.read())
with open(os.path.join(good_spot, 'id_rsa.pub'), 'w') as f:
f.write(keys['public'])
with open(os.path.join(good_spot, 'id_rsa'), 'w') as f:
f.write(keys['private'])
except:
print('While attempting to set up the ssh key:\n' + str(stderr))
print('While attempting to set up the ssh keys.')
sys.exit(1)
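Likewise, setup_gitlab_ssh.py reads a JSON file (via GITLAB_SECRET_FILE_SSH_KEYS) with public and private fields and writes them to ~/.ssh/id_rsa.pub and ~/.ssh/id_rsa. A placeholder file for local testing might look like this (the key material shown is fake):

    import json

    # Hypothetical stand-in secret file; the CI runner provides the real one.
    with open('/tmp/ssh_keys.json', 'w') as f:
        json.dump({'public': 'ssh-rsa AAAA...example',
                   'private': '-----BEGIN RSA PRIVATE KEY-----\n...example...\n-----END RSA PRIVATE KEY-----'}, f)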
3 changes: 2 additions & 1 deletion src/toil/batchSystems/mesos/executor.py
@@ -40,6 +40,7 @@

from toil import subprocess, pickle
from toil.lib.expando import Expando
from toil.lib.threading import cpu_count
from toil.batchSystems.abstractBatchSystem import BatchSystemSupport
from toil.resource import Resource

@@ -132,7 +133,7 @@ def _sendFrameworkMessage(self, driver):
else:
message.nodeInfo = dict(coresUsed=float(psutil.cpu_percent()) * .01,
memoryUsed=float(psutil.virtual_memory().percent) * .01,
coresTotal=psutil.cpu_count(),
coresTotal=cpu_count(),
memoryTotal=psutil.virtual_memory().total,
workers=len(self.runningTasks))
log.debug("Send framework message: %s", message)
5 changes: 2 additions & 3 deletions src/toil/batchSystems/mesos/test/__init__.py
@@ -9,15 +9,14 @@
import shutil
import threading
from toil import subprocess
import multiprocessing
from past.builtins import basestring
from six.moves.urllib.request import urlopen
from contextlib import closing
import time

from toil.lib.retry import retry
from toil import which # replace with shutil.which() directly; python3 only
from toil.lib.threading import ExceptionalThread
from toil.lib.threading import ExceptionalThread, cpu_count
from future.utils import with_metaclass

log = logging.getLogger(__name__)
@@ -30,7 +29,7 @@ class MesosTestSupport(object):

def _startMesos(self, numCores=None):
if numCores is None:
numCores = multiprocessing.cpu_count()
numCores = cpu_count()
shutil.rmtree('/tmp/mesos', ignore_errors=True)
self.master = self.MesosMasterThread(numCores)
self.master.start()
7 changes: 4 additions & 3 deletions src/toil/batchSystems/options.py
@@ -13,11 +13,12 @@
# See the License for the specific language governing permissions and
#

from toil.lib.threading import cpu_count

from .registry import batchSystemFactoryFor, defaultBatchSystem, uniqueNames

import socket
from contextlib import closing
import multiprocessing

def getPublicIP():
"""Get the IP that this machine uses to contact the internet.
@@ -114,7 +115,7 @@ def addOptions(addOptionFn, config):
help=("Should auto-deployment of the user script be deactivated? If True, the user "
"script/package should be present at the same location on all workers. "
"default=false"))
localCores = multiprocessing.cpu_count()
localCores = cpu_count()
addOptionFn("--maxLocalJobs", default=localCores,
help="For batch systems that support a local queue for "
"housekeeping jobs (Mesos, GridEngine, htcondor, lsf, slurm, "
@@ -146,7 +147,7 @@ def setDefaultOptions(config):
config.disableAutoDeployment = False
config.environment = {}
config.statePollingWait = None # if not set, will default to seconds in getWaitDuration()
config.maxLocalJobs = multiprocessing.cpu_count()
config.maxLocalJobs = cpu_count()
config.manualMemArgs = False

# single machine
4 changes: 2 additions & 2 deletions src/toil/batchSystems/parasolTestSupport.py
@@ -19,11 +19,11 @@
import threading
import time
from toil import subprocess
import multiprocessing
import signal
import os
import errno
from toil.lib.objects import InnerClass
from toil.lib.threading import cpu_count

from toil import physicalMemory

@@ -46,7 +46,7 @@ class ParasolTestSupport(object):

def _startParasol(self, numCores=None, memory=None):
if numCores is None:
numCores = multiprocessing.cpu_count()
numCores = cpu_count()
if memory is None:
memory = physicalMemory()
self.numCores = numCores
4 changes: 2 additions & 2 deletions src/toil/batchSystems/singleMachine.py
@@ -22,7 +22,6 @@
from past.utils import old_div
from contextlib import contextmanager
import logging
import multiprocessing
import os
import time
import math
@@ -33,6 +32,7 @@
import toil
from toil import subprocess
from toil.batchSystems.abstractBatchSystem import BatchSystemSupport
from toil.lib.threading import cpu_count
from toil import worker as toil_worker
from toil.common import Toil

@@ -53,7 +53,7 @@ def supportsAutoDeployment(cls):
def supportsWorkerCleanup(cls):
return True

numCores = multiprocessing.cpu_count()
numCores = cpu_count()

minCores = 0.1
"""
45 changes: 45 additions & 0 deletions src/toil/lib/threading.py
@@ -17,13 +17,15 @@
from __future__ import absolute_import
from future.utils import raise_
from builtins import range
import math
import sys
import threading
if sys.version_info >= (3, 0):
from threading import BoundedSemaphore
else:
from threading import _BoundedSemaphore as BoundedSemaphore

import psutil

class BoundedEmptySemaphore( BoundedSemaphore ):
"""
@@ -102,3 +104,46 @@ class defaultlocal(threading.local):
    def __init__( self, **kwargs ):
        super( defaultlocal, self ).__init__( )
        self.__dict__.update( kwargs )


def cpu_count():
    """
    Get the rounded-up integer number of whole CPUs available.
    Counts hyperthreads as CPUs.
    Uses the system's actual CPU count, or the current v1 cgroup's quota per
    period, if the quota is set.
    Ignores the cgroup's cpu shares value, because it's extremely difficult to
    interpret. See https://github.com/kubernetes/kubernetes/issues/81021.
    :return: Integer count of available CPUs, minimum 1.
    :rtype: int
    """

    # Get the fallback answer of all the CPUs on the machine
    total_machine_size = psutil.cpu_count(logical=True)

    try:
        with open('/sys/fs/cgroup/cpu/cpu.cfs_quota_us', 'r') as stream:
            # Read the quota
            quota = int(stream.read())

        if quota == -1:
            # Assume we can use the whole machine
            return total_machine_size

        with open('/sys/fs/cgroup/cpu/cpu.cfs_period_us', 'r') as stream:
            # Read the period in which we are allowed to burn the quota
            period = int(stream.read())

        # The thread count is how many multiples of a wall clock period we can burn in that period.
        cgroup_size = int(math.ceil(float(quota) / float(period)))
    except:
        # We can't actually read these cgroup fields. Maybe we are a mac or something.
        cgroup_size = float('inf')

    # Return the smaller of the actual thread count and the cgroup's limit, minimum 1.
    return max(1, min(cgroup_size, total_machine_size))
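For a quick sanity check of the new helper: the intent is that cpu_count() honors the cgroup quota while psutil keeps reporting the whole machine. A minimal hedged example (the core count, quota, and period values are hypothetical):

    import psutil
    from toil.lib.threading import cpu_count

    # On an 8-core node whose cgroup sets cpu.cfs_quota_us=150000 and
    # cpu.cfs_period_us=100000, the quota allows ceil(150000 / 100000) = 2 CPUs.
    print(psutil.cpu_count(logical=True))  # e.g. 8
    print(cpu_count())                     # e.g. 2, capped by the cgroup quota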
