Set up Gitlab testing for Kubernetes runners (#2903)
* Make the Gitlab config explain how to start and stop Docker, in case entrypoints aren't respected

* Actually import boto.exception.S3ResponseError

Looks like we never tested this code path because we never had an error here in the tests.

* Consider cgroup v1 limits when determining available CPUs

* Fail the Docker build if Quay login fails

* Use new CPU counting function in Parasol test support

* Use Gitlab secrets instead of the AWS secrets manager

This is so we don't rely on CI runners having any built-in AWS access.

* Instrument Quay login

* Don't open the name of an environment variable as a file

* Make sure to pre-pull base Docker images to avoid disconnects failing the actual build

* Make sure Docker is actually up before each pull

* Just do more pull attempts since it seems like pull is a bit flaky today

* Set timeouts on the deferred function tests and verbosify them.

* Make all the pytest tests verbose so we can see the names of what runs if it gets stuck

* Dump flaky test output

* Report on core counts in case they weirdly all became 1

* Send enough format arguments

* Adopt a non-kubernetes-dependent cgroup sizing method

* Stop instrumenting the flaky test so carefully
adamnovak authored Jan 15, 2020
1 parent ddb2beb commit debfc51
Showing 15 changed files with 139 additions and 49 deletions.
24 changes: 17 additions & 7 deletions .gitlab-ci.yml
@@ -1,4 +1,8 @@
image: quay.io/vgteam/vg_ci_prebake:latest
# Note that we must run in a privileged container for our internal Docker daemon to come up.

before_script:
- startdocker || true
- docker info
- cat /etc/hosts
- export PYTHONIOENCODING=utf-8
@@ -9,6 +13,7 @@ after_script:
# that next job.
- pwd
- sudo rm -rf tmp
- stopdocker || true


stages:
@@ -24,11 +29,11 @@ py2_batch_systems:
- pwd
- apt update && DEBIAN_FRONTEND=noninteractive apt install -y tzdata && apt install -y jq
- virtualenv -p python2.7 venv && . venv/bin/activate && make prepare && make develop extras=[all] && pip install htcondor awscli==1.16.272
# Get Kubernetes credentials before switching over to another AWS account
# Get Kubernetes credentials
- mkdir -p ~/.kube
- aws secretsmanager get-secret-value --secret-id allspark/runner/kubeconfig --region us-west-2 | jq -r .SecretString > ~/.kube/config
- cp "$GITLAB_SECRET_FILE_KUBE_CONFIG" ~/.kube/config
- mkdir -p ~/.aws
- echo -e $(aws secretsmanager get-secret-value --secret-id allspark/runner/credentials --region us-west-2 | jq -r .SecretString) > ~/.aws/credentials
- cp "$GITLAB_SECRET_FILE_AWS_CREDENTIALS" ~/.aws/credentials
- python -m pytest -r s src/toil/test/batchSystems/batchSystemTest.py
- python -m pytest -r s src/toil/test/mesos/MesosDataStructuresTest.py

@@ -70,6 +75,7 @@ py2_appliance_build:
- pwd
- apt update && DEBIAN_FRONTEND=noninteractive apt install -y tzdata && apt install -y jq
- virtualenv -p python2.7 venv && . venv/bin/activate && make prepare && make develop extras=[all] && pip install htcondor awscli==1.16.272
# This reads GITLAB_SECRET_FILE_QUAY_CREDENTIALS
- python setup_gitlab_docker.py
- export TOIL_APPLIANCE_SELF=quay.io/ucsc_cgl/toil:$(python version_template.py dockerTag)
- echo $TOIL_APPLIANCE_SELF
@@ -84,9 +90,10 @@ py2_integration_jobstore:
- export TOIL_TEST_INTEGRATIVE=True
- export TOIL_AWS_KEYNAME=id_rsa
- export TOIL_AWS_ZONE=us-west-2a
# This reads GITLAB_SECRET_FILE_SSH_KEYS
- python setup_gitlab_ssh.py
- mkdir -p ~/.aws
- echo -e $(aws secretsmanager get-secret-value --secret-id allspark/runner/credentials --region us-west-2 | jq -r .SecretString) > ~/.aws/credentials
- cp "$GITLAB_SECRET_FILE_AWS_CREDENTIALS" ~/.aws/credentials
- python -m pytest src/toil/test/jobStores/jobStoreTest.py

py2_integration_sort:
@@ -98,9 +105,10 @@ py2_integration_sort:
- export TOIL_TEST_INTEGRATIVE=True
- export TOIL_AWS_KEYNAME=id_rsa
- export TOIL_AWS_ZONE=us-west-2a
# This reads GITLAB_SECRET_FILE_SSH_KEYS
- python setup_gitlab_ssh.py
- mkdir -p ~/.aws
- echo -e $(aws secretsmanager get-secret-value --secret-id allspark/runner/credentials --region us-west-2 | jq -r .SecretString) > ~/.aws/credentials
- cp "$GITLAB_SECRET_FILE_AWS_CREDENTIALS" ~/.aws/credentials
- python -m pytest src/toil/test/sort/sortTest.py
- python -m pytest src/toil/test/provisioners/clusterScalerTest.py

@@ -113,9 +121,10 @@ py2_integration_sort:
# - export TOIL_TEST_INTEGRATIVE=True
# - export TOIL_AWS_KEYNAME=id_rsa
# - export TOIL_AWS_ZONE=us-west-2a
# # This reads GITLAB_SECRET_FILE_SSH_KEYS
# - python setup_gitlab_ssh.py
# - mkdir -p ~/.aws
# - echo -e $(aws secretsmanager get-secret-value --secret-id allspark/runner/credentials --region us-west-2 | jq -r .SecretString) > ~/.aws/credentials
# - cp "$GITLAB_SECRET_FILE_AWS_CREDENTIALS" ~/.aws/credentials
# - python -m pytest src/toil/test/provisioners/aws/awsProvisionerTest.py


@@ -171,7 +180,8 @@ py3_main:
# - export TOIL_TEST_INTEGRATIVE=True
# - export TOIL_AWS_KEYNAME=id_rsa
# - export TOIL_AWS_ZONE=us-west-2a
# # This reads GITLAB_SECRET_FILE_SSH_KEYS
# - python setup_gitlab_ssh.py
# - mkdir -p ~/.aws
# - echo -e $(aws secretsmanager get-secret-value --secret-id allspark/runner/credentials --region us-west-2 | jq -r .SecretString) > ~/.aws/credentials
# - cp "$GITLAB_SECRET_FILE_AWS_CREDENTIALS" ~/.aws/credentials
# - python -m pytest src/toil/test/jobStores/jobStoreTest.py
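These jobs now assume that each GITLAB_SECRET_FILE_* variable holds the path to a file the runner provisions (for example, a GitLab "File" type CI/CD variable), rather than the secret text itself. A hypothetical pre-flight check, not part of this commit, could verify that assumption before the tests run:

    import os
    import sys

    # Every secret file variable used by the jobs above must name a readable file.
    for var in ('GITLAB_SECRET_FILE_KUBE_CONFIG',
                'GITLAB_SECRET_FILE_AWS_CREDENTIALS',
                'GITLAB_SECRET_FILE_SSH_KEYS',
                'GITLAB_SECRET_FILE_QUAY_CREDENTIALS'):
        path = os.environ.get(var)
        if not path or not os.path.exists(path):
            print('Missing or unreadable secret file for ' + var)
            sys.exit(1)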
21 changes: 14 additions & 7 deletions Makefile
@@ -181,18 +181,24 @@ define tag_docker
endef

docker: docker/Dockerfile
# Pre-pull the base images, retrying each pull (up to 10 attempts, 60 seconds apart) so a transient registry disconnect doesn't fail the actual build
for i in $$(seq 1 11); do if [[ $$i == "11" ]] ; then exit 1 ; fi ; docker pull ubuntu:16.04 && break || sleep 60; done
for i in $$(seq 1 11); do if [[ $$i == "11" ]] ; then exit 1 ; fi ; docker pull prom/prometheus:v2.0.0 && break || sleep 60; done
for i in $$(seq 1 11); do if [[ $$i == "11" ]] ; then exit 1 ; fi ; docker pull grafana/grafana && break || sleep 60; done
for i in $$(seq 1 11); do if [[ $$i == "11" ]] ; then exit 1 ; fi ; docker pull sscaling/mtail && break || sleep 60; done

@set -ex \
; cd docker \
; docker build --tag=$(docker_image):$(docker_tag) -f Dockerfile .

@set -ex \
; cd dashboard/prometheus \
; docker build --tag=$(prometheus_image):$(docker_tag) -f Dockerfile .

@set -ex \
; cd dashboard/grafana \
; docker build --tag=$(grafana_image):$(docker_tag) -f Dockerfile .

@set -ex \
; cd dashboard/mtail \
; docker build --tag=$(mtail_image):$(docker_tag) -f Dockerfile .
@@ -208,10 +214,11 @@ clean_docker:
-docker rmi $(docker_image):$(docker_tag)

push_docker: docker
for i in $$(seq 1 5); do docker push $(docker_image):$(docker_tag) && break || sleep 60; done
for i in $$(seq 1 5); do docker push $(grafana_image):$(docker_tag) && break || sleep 60; done
for i in $$(seq 1 5); do docker push $(prometheus_image):$(docker_tag) && break || sleep 60; done
for i in $$(seq 1 5); do docker push $(mtail_image):$(docker_tag) && break || sleep 60; done
# Weird if logic is so we fail if all the pushes fail
for i in $$(seq 1 6); do if [[ $$i == "6" ]] ; then exit 1 ; fi ; docker push $(docker_image):$(docker_tag) && break || sleep 60; done
for i in $$(seq 1 6); do if [[ $$i == "6" ]] ; then exit 1 ; fi ; docker push $(grafana_image):$(docker_tag) && break || sleep 60; done
for i in $$(seq 1 6); do if [[ $$i == "6" ]] ; then exit 1 ; fi ; docker push $(prometheus_image):$(docker_tag) && break || sleep 60; done
for i in $$(seq 1 6); do if [[ $$i == "6" ]] ; then exit 1 ; fi ; docker push $(mtail_image):$(docker_tag) && break || sleep 60; done

else

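The seq-based loops above read a little oddly: each loop runs one iteration past the number of allowed attempts, and that extra iteration exists only to exit 1 when every attempt has failed. The same control flow sketched in Python, purely for illustration (not part of the Makefile):

    import subprocess
    import sys
    import time

    def retry_or_die(command, attempts, delay=60):
        # Mirrors `for i in $(seq 1 N+1); do if [[ $i == "N+1" ]]; then exit 1; fi; CMD && break || sleep $delay; done`
        for _ in range(attempts):
            if subprocess.call(command, shell=True) == 0:
                return
            time.sleep(delay)
        sys.exit(1)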
32 changes: 26 additions & 6 deletions setup_gitlab_docker.py
@@ -1,19 +1,39 @@
import subprocess
import json
import os
import subprocess
import sys

stderr = 'Login was not attempted'

p = subprocess.Popen('aws secretsmanager --region us-west-2 get-secret-value --secret-id /toil/gitlab/quay',
stdout=subprocess.PIPE, stderr=subprocess.PIPE, shell=True)
stdout, stderr = p.communicate()
env_var = 'GITLAB_SECRET_FILE_QUAY_CREDENTIALS'

try:
keys = json.loads(json.loads(stdout)['SecretString'])
if env_var not in os.environ:
print('Error: could not find environment variable ' + env_var)
sys.exit(1)

filename = os.environ[env_var]

if not os.path.exists(filename):
print('Error: could not find file referenced by ' + env_var)
sys.exit(1)

print('Opening key file...')
with open(filename, 'r') as cred_json_file:
print('Reading keys...')
keys = json.loads(cred_json_file.read())
print('Read and decoded keys')

print('Starting login process...')
process = subprocess.Popen('docker login quay.io -u "{user}" --password-stdin'.format(user=keys['user']),
stdout=subprocess.PIPE, stderr=subprocess.PIPE, stdin=subprocess.PIPE,
shell=True)
print('Logging in...')
stdout, stderr = process.communicate(input=keys['password'])
if 'Login Succeeded' in stdout:
print('Login Succeeded')
else:
raise RuntimeError
except:
print('While attempting to log into quay.io:\n' + str(stderr))
print('Error while attempting to log into quay.io:\n' + str(stderr))
sys.exit(1)
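The login script above expects the file named by GITLAB_SECRET_FILE_QUAY_CREDENTIALS to be JSON with user and password fields. A minimal sketch of producing a compatible stand-in for local testing (the path and values are placeholders, not real credentials, and this is not part of the commit):

    import json
    import os

    # Hypothetical local stand-in for the secret file the CI runner provides.
    fake_path = '/tmp/quay_credentials.json'
    with open(fake_path, 'w') as f:
        json.dump({'user': 'example-robot', 'password': 'example-token'}, f)
    os.environ['GITLAB_SECRET_FILE_QUAY_CREDENTIALS'] = fake_path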
13 changes: 6 additions & 7 deletions setup_gitlab_ssh.py
@@ -1,19 +1,18 @@
import subprocess
import json
import os

p = subprocess.Popen('aws secretsmanager --region us-west-2 get-secret-value --secret-id /toil/gitlab/ssh_key',
stdout=subprocess.PIPE, stderr=subprocess.PIPE, shell=True)
stdout, stderr = p.communicate()
import subprocess
import sys

good_spot = os.path.expanduser('~/.ssh')
os.mkdir(good_spot)

try:
keys = json.loads(json.loads(stdout)['SecretString'])
with open(os.environ['GITLAB_SECRET_FILE_SSH_KEYS'], 'r') as keys_json_file:
keys = json.loads(keys_json_file.read())
with open(os.path.join(good_spot, 'id_rsa.pub'), 'w') as f:
f.write(keys['public'])
with open(os.path.join(good_spot, 'id_rsa'), 'w') as f:
f.write(keys['private'])
except:
print('While attempting to set up the ssh key:\n' + str(stderr))
print('While attempting to set up the ssh keys.')
sys.exit(1)
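Likewise, setup_gitlab_ssh.py reads a JSON file (via GITLAB_SECRET_FILE_SSH_KEYS) with public and private fields and writes them to ~/.ssh/id_rsa.pub and ~/.ssh/id_rsa. A placeholder file for local testing might look like this (the key material shown is fake):

    import json

    # Hypothetical stand-in secret file; the CI runner provides the real one.
    with open('/tmp/ssh_keys.json', 'w') as f:
        json.dump({'public': 'ssh-rsa AAAA...example',
                   'private': '-----BEGIN RSA PRIVATE KEY-----\n...example...\n-----END RSA PRIVATE KEY-----'}, f)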
3 changes: 2 additions & 1 deletion src/toil/batchSystems/mesos/executor.py
@@ -40,6 +40,7 @@

from toil import subprocess, pickle
from toil.lib.expando import Expando
from toil.lib.threading import cpu_count
from toil.batchSystems.abstractBatchSystem import BatchSystemSupport
from toil.resource import Resource

@@ -132,7 +133,7 @@ def _sendFrameworkMessage(self, driver):
else:
message.nodeInfo = dict(coresUsed=float(psutil.cpu_percent()) * .01,
memoryUsed=float(psutil.virtual_memory().percent) * .01,
coresTotal=psutil.cpu_count(),
coresTotal=cpu_count(),
memoryTotal=psutil.virtual_memory().total,
workers=len(self.runningTasks))
log.debug("Send framework message: %s", message)
5 changes: 2 additions & 3 deletions src/toil/batchSystems/mesos/test/__init__.py
@@ -9,15 +9,14 @@
import shutil
import threading
from toil import subprocess
import multiprocessing
from past.builtins import basestring
from six.moves.urllib.request import urlopen
from contextlib import closing
import time

from toil.lib.retry import retry
from toil import which # replace with shutil.which() directly; python3 only
from toil.lib.threading import ExceptionalThread
from toil.lib.threading import ExceptionalThread, cpu_count
from future.utils import with_metaclass

log = logging.getLogger(__name__)
@@ -30,7 +29,7 @@ class MesosTestSupport(object):

def _startMesos(self, numCores=None):
if numCores is None:
numCores = multiprocessing.cpu_count()
numCores = cpu_count()
shutil.rmtree('/tmp/mesos', ignore_errors=True)
self.master = self.MesosMasterThread(numCores)
self.master.start()
7 changes: 4 additions & 3 deletions src/toil/batchSystems/options.py
@@ -13,11 +13,12 @@
# See the License for the specific language governing permissions and
#

from toil.lib.threading import cpu_count

from .registry import batchSystemFactoryFor, defaultBatchSystem, uniqueNames

import socket
from contextlib import closing
import multiprocessing

def getPublicIP():
"""Get the IP that this machine uses to contact the internet.
@@ -114,7 +115,7 @@ def addOptions(addOptionFn, config):
help=("Should auto-deployment of the user script be deactivated? If True, the user "
"script/package should be present at the same location on all workers. "
"default=false"))
localCores = multiprocessing.cpu_count()
localCores = cpu_count()
addOptionFn("--maxLocalJobs", default=localCores,
help="For batch systems that support a local queue for "
"housekeeping jobs (Mesos, GridEngine, htcondor, lsf, slurm, "
@@ -146,7 +147,7 @@ def setDefaultOptions(config):
config.disableAutoDeployment = False
config.environment = {}
config.statePollingWait = None # if not set, will default to seconds in getWaitDuration()
config.maxLocalJobs = multiprocessing.cpu_count()
config.maxLocalJobs = cpu_count()
config.manualMemArgs = False

# single machine
4 changes: 2 additions & 2 deletions src/toil/batchSystems/parasolTestSupport.py
@@ -19,11 +19,11 @@
import threading
import time
from toil import subprocess
import multiprocessing
import signal
import os
import errno
from toil.lib.objects import InnerClass
from toil.lib.threading import cpu_count

from toil import physicalMemory

@@ -46,7 +46,7 @@ class ParasolTestSupport(object):

def _startParasol(self, numCores=None, memory=None):
if numCores is None:
numCores = multiprocessing.cpu_count()
numCores = cpu_count()
if memory is None:
memory = physicalMemory()
self.numCores = numCores
4 changes: 2 additions & 2 deletions src/toil/batchSystems/singleMachine.py
@@ -22,7 +22,6 @@
from past.utils import old_div
from contextlib import contextmanager
import logging
import multiprocessing
import os
import time
import math
@@ -33,6 +32,7 @@
import toil
from toil import subprocess
from toil.batchSystems.abstractBatchSystem import BatchSystemSupport
from toil.lib.threading import cpu_count
from toil import worker as toil_worker
from toil.common import Toil

@@ -53,7 +53,7 @@ def supportsAutoDeployment(cls):
def supportsWorkerCleanup(cls):
return True

numCores = multiprocessing.cpu_count()
numCores = cpu_count()

minCores = 0.1
"""
45 changes: 45 additions & 0 deletions src/toil/lib/threading.py
@@ -17,13 +17,15 @@
from __future__ import absolute_import
from future.utils import raise_
from builtins import range
import math
import sys
import threading
if sys.version_info >= (3, 0):
from threading import BoundedSemaphore
else:
from threading import _BoundedSemaphore as BoundedSemaphore

import psutil

class BoundedEmptySemaphore( BoundedSemaphore ):
"""
@@ -102,3 +104,46 @@ class defaultlocal(threading.local):
    def __init__( self, **kwargs ):
        super( defaultlocal, self ).__init__( )
        self.__dict__.update( kwargs )


def cpu_count():
    """
    Get the rounded-up integer number of whole CPUs available.
    Counts hyperthreads as CPUs.
    Uses the system's actual CPU count, or the current v1 cgroup's quota per
    period, if the quota is set.
    Ignores the cgroup's cpu shares value, because it's extremely difficult to
    interpret. See https://github.com/kubernetes/kubernetes/issues/81021.
    :return: Integer count of available CPUs, minimum 1.
    :rtype: int
    """

    # Get the fallback answer of all the CPUs on the machine
    total_machine_size = psutil.cpu_count(logical=True)

    try:
        with open('/sys/fs/cgroup/cpu/cpu.cfs_quota_us', 'r') as stream:
            # Read the quota
            quota = int(stream.read())

        if quota == -1:
            # Assume we can use the whole machine
            return total_machine_size

        with open('/sys/fs/cgroup/cpu/cpu.cfs_period_us', 'r') as stream:
            # Read the period in which we are allowed to burn the quota
            period = int(stream.read())

        # The thread count is how many multiples of a wall clock period we can burn in that period.
        cgroup_size = int(math.ceil(float(quota) / float(period)))
    except:
        # We can't actually read these cgroup fields. Maybe we are a mac or something.
        cgroup_size = float('inf')

    # Return the smaller of the actual thread count and the cgroup's limit, minimum 1.
    return max(1, min(cgroup_size, total_machine_size))
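For a quick sanity check of the new helper: the intent is that cpu_count() honors the cgroup quota while psutil keeps reporting the whole machine. A minimal hedged example (the core count, quota, and period values are hypothetical):

    import psutil
    from toil.lib.threading import cpu_count

    # On an 8-core node whose cgroup sets cpu.cfs_quota_us=150000 and
    # cpu.cfs_period_us=100000, the quota allows ceil(150000 / 100000) = 2 CPUs.
    print(psutil.cpu_count(logical=True))  # e.g. 8
    print(cpu_count())                     # e.g. 2, capped by the cgroup quota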
