Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Tolerate not finding Flatcar releases in CI #5201

Merged
merged 3 commits into from
Jan 30, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
84 changes: 38 additions & 46 deletions src/toil/lib/aws/ami.py
Original file line number Diff line number Diff line change
@@ -1,19 +1,24 @@
import json
import logging
import os
import time
import urllib.request
from collections.abc import Iterator
from typing import Optional, cast
from urllib.error import HTTPError, URLError

from botocore.client import BaseClient
from botocore.exceptions import ClientError
from botocore.exceptions import ClientError, EndpointConnectionError

from toil.lib.retry import retry

logger = logging.getLogger(__name__)

class ReleaseFeedUnavailableError(RuntimeError):
"""Raised when a Flatcar releases can't be located."""
pass

@retry(errors=[ReleaseFeedUnavailableError])
def get_flatcar_ami(ec2_client: BaseClient, architecture: str = "amd64") -> str:
"""
Retrieve the flatcar AMI image to use as the base for all Toil autoscaling instances.
Expand All @@ -25,15 +30,13 @@ def get_flatcar_ami(ec2_client: BaseClient, architecture: str = "amd64") -> str:
2. Official AMI from stable.release.flatcar-linux.net
3. Search the AWS Marketplace

If all of these sources fail, we raise an error to complain.
:raises ReleaseFeedUnavailableError: if all of these sources fail.

:param ec2_client: Boto3 EC2 Client
:param architecture: The architecture type for the new AWS machine. Can be either amd64 or arm64
"""

# Take a user override
ami = os.environ.get("TOIL_AWS_AMI")
try_number = 0
if not ami:
logger.debug(
"No AMI found in TOIL_AWS_AMI; checking stable Flatcar release feed"
Expand All @@ -48,11 +51,6 @@ def get_flatcar_ami(ec2_client: BaseClient, architecture: str = "amd64") -> str:
ami = aws_marketplace_flatcar_ami_search(
ec2_client=ec2_client, architecture=architecture
)
if not ami:
logger.debug("No AMI found in marketplace; checking Toil Flatcar release feed")
ami = feed_flatcar_ami_release(
ec2_client=ec2_client, architecture=architecture, source="toil"
)
if not ami:
logger.debug(
"No AMI found in Toil project feed; checking beta Flatcar release feed"
Expand All @@ -69,7 +67,7 @@ def get_flatcar_ami(ec2_client: BaseClient, architecture: str = "amd64") -> str:
)
if not ami:
logger.critical("No available Flatcar AMI in any source!")
raise RuntimeError(
raise ReleaseFeedUnavailableError(
f"Unable to fetch the latest flatcar image. Upload "
f"https://stable.release.flatcar-linux.net/{architecture}-usr/current/flatcar_production_ami_image.bin.bz2 "
f"to AWS as am AMI and set TOIL_AWS_AMI in the environment to its AMI ID."
Expand All @@ -78,7 +76,6 @@ def get_flatcar_ami(ec2_client: BaseClient, architecture: str = "amd64") -> str:
return ami


@retry(errors=[HTTPError])
def _fetch_flatcar_feed(architecture: str = "amd64", source: str = "stable") -> bytes:
"""
Get the binary data of the Flatcar release feed for the given architecture.
Expand All @@ -94,9 +91,8 @@ def _fetch_flatcar_feed(architecture: str = "amd64", source: str = "stable") ->
JSON_FEED_URL = {
"stable": f"https://stable.release.flatcar-linux.net/{architecture}-usr/current/flatcar_production_ami_all.json",
"beta": f"https://beta.release.flatcar-linux.net/{architecture}-usr/current/flatcar_production_ami_all.json",
"alpha": f"https://alpha.release.flatcar-linux.net/{architecture}-usr/current/flatcar_production_ami_all.json",
"archive": f"https://web.archive.org/web/20220625112618if_/https://stable.release.flatcar-linux.net/{architecture}-usr/current/flatcar_production_ami_all.json",
"toil": f"https://raw.githubusercontent.com/DataBiosphere/toil/master/contrib/flatcar/{architecture}-usr/current/flatcar_production_ami_all.json",
# "alpha": f"https://alpha.release.flatcar-linux.net/{architecture}-usr/current/flatcar_production_ami_all.json",
"archive": f"https://web.archive.org/web/20220625112618if_/https://stable.release.flatcar-linux.net/{architecture}-usr/current/flatcar_production_ami_all.json"
}[source]
return cast(bytes, urllib.request.urlopen(JSON_FEED_URL).read())

Expand All @@ -121,13 +117,15 @@ def flatcar_release_feed_amis(

try_number = 0
while try_number < MAX_TRIES:
if try_number != 0:
time.sleep(1)
try:
feed = json.loads(_fetch_flatcar_feed(architecture, source))
break
except HTTPError:
# Flatcar servers did not return the feed
logger.exception(f"Could not retrieve {source} Flatcar release feed JSON")
# Don't retry
# This is probably a permanent error, or at least unlikely to go away immediately.
return
except json.JSONDecodeError:
# Feed is not JSON
Expand All @@ -149,19 +147,12 @@ def flatcar_release_feed_amis(

for ami_record in feed.get("amis", []):
# Scan the list of regions
if ami_record.get("name", None) == region:
# When we find ours, return the AMI ID
if "hvm" in ami_record:
yield ami_record["hvm"]
# And stop, there should be one per region.
return
if ami_record.get("name") == region:
return ami_record.get("hvm")
# We didn't find our region
logger.warning(
f"Flatcar {source} release feed does not have an image for region {region}"
)
logger.warning(f"Flatcar {source} release feed does not have an image for region {region}")


@retry() # TODO: What errors do we get for timeout, JSON parse failure, etc?
def feed_flatcar_ami_release(
ec2_client: BaseClient, architecture: str = "amd64", source: str = "stable"
) -> Optional[str]:
Expand All @@ -170,6 +161,8 @@ def feed_flatcar_ami_release(

Verify it's on AWS.

Does not raise exceptions.

:param ec2_client: Boto3 EC2 Client
:param architecture: The architecture type for the new AWS machine. Can be either amd64 or arm64
:param source: can be set to a Flatcar release channel ('stable', 'beta',
Expand All @@ -186,43 +179,42 @@ def feed_flatcar_ami_release(
# verify it exists on AWS
try:
response = ec2_client.describe_images(Filters=[{"Name": "image-id", "Values": [ami]}]) # type: ignore
if (
len(response["Images"]) == 1
and response["Images"][0]["State"] == "available"
):
if (len(response["Images"]) == 1 and response["Images"][0]["State"] == "available"):
return ami
else:
logger.warning(
f"Flatcar release feed suggests image {ami} which does not exist on AWS in {region}"
)
except ClientError:
logger.warning(f"Flatcar release feed suggests image {ami} which does not exist on AWS in {region}")
except (ClientError, EndpointConnectionError):
# Sometimes we get back nonsense like:
# botocore.exceptions.ClientError: An error occurred (AuthFailure) when calling the DescribeImages operation: AWS was not able to validate the provided access credentials
# Don't hold that against the AMI.
logger.exception(
f"Unable to check if AMI {ami} exists on AWS in {region}; assuming it does"
)
logger.exception(f"Unable to check if AMI {ami} exists on AWS in {region}; assuming it does")
return ami
# We didn't find it
logger.warning(
f"Flatcar release feed does not have an image for region {region} that exists on AWS"
)
return None
logger.warning(f"Flatcar release feed does not have an image for region {region} that exists on AWS")


@retry() # TODO: What errors do we get for timeout, JSON parse failure, etc?
def aws_marketplace_flatcar_ami_search(
ec2_client: BaseClient, architecture: str = "amd64"
) -> Optional[str]:
"""Query AWS for all AMI names matching ``Flatcar-stable-*`` and return the most recent one."""
"""
Query AWS for all AMI names matching ``Flatcar-stable-*`` and return the most recent one.

Does not raise exceptions.

:returns: An AMI name, or None if no matching AMI was found or we could not talk to AWS.
"""

# https://boto3.amazonaws.com/v1/documentation/api/latest/reference/services/ec2.html#EC2.Client.describe_images
# Possible arch choices on AWS: 'i386'|'x86_64'|'arm64'|'x86_64_mac'
architecture_mapping = {"amd64": "x86_64", "arm64": "arm64"}
response = ec2_client.describe_images( # type: ignore[attr-defined]
Owners=["aws-marketplace"],
Filters=[{"Name": "name", "Values": ["Flatcar-stable-*"]}],
)
try:
response = ec2_client.describe_images( # type: ignore[attr-defined]
Owners=["aws-marketplace"],
Filters=[{"Name": "name", "Values": ["Flatcar-stable-*"]}],
)
except (ClientError, EndpointConnectionError):
logger.exception("Unable to search AWS marketplace")
return None
latest: dict[str, str] = {"CreationDate": "0lder than atoms."}
for image in response["Images"]:
if (
Expand Down
36 changes: 19 additions & 17 deletions src/toil/test/lib/test_ec2.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,13 +14,14 @@
import logging
import os

import pytest
from unittest import mock

from toil.lib.aws.ami import (
aws_marketplace_flatcar_ami_search,
feed_flatcar_ami_release,
flatcar_release_feed_amis,
get_flatcar_ami,
ReleaseFeedUnavailableError
)
from toil.test import ToilTest, needs_aws_ec2, needs_online

Expand Down Expand Up @@ -69,15 +70,18 @@ def setUpClass(cls):

def test_fetch_flatcar(self):
with self.subTest("Test flatcar AMI from user is prioritized."):
os.environ["TOIL_AWS_AMI"] = "overridden"
ami = get_flatcar_ami(self.ec2_client)
self.assertEqual(ami, "overridden")
del os.environ["TOIL_AWS_AMI"]
with mock.patch.dict(os.environ, {"TOIL_AWS_AMI": "overridden"}):
ami = get_flatcar_ami(self.ec2_client)
self.assertEqual(ami, "overridden")

with self.subTest("Test flatcar AMI returns an AMI-looking AMI."):
ami = get_flatcar_ami(self.ec2_client)
self.assertEqual(len(ami), len("ami-02b46c73fed689d1c"))
self.assertTrue(ami.startswith("ami-"))
try:
ami = get_flatcar_ami(self.ec2_client)
self.assertEqual(len(ami), len("ami-02b46c73fed689d1c"))
self.assertTrue(ami.startswith("ami-"))
except ReleaseFeedUnavailableError:
# Ignore any remote systems being down.
pass

with self.subTest(
"Test feed_flatcar_ami_release() returns an AMI-looking AMI."
Expand All @@ -90,16 +94,14 @@ def test_fetch_flatcar(self):
"Test aws_marketplace_flatcar_ami_search() returns an AMI-looking AMI."
):
ami = aws_marketplace_flatcar_ami_search(self.ec2_client)
self.assertEqual(len(ami), len("ami-02b46c73fed689d1c"))
self.assertTrue(ami.startswith("ami-"))
self.assertTrue(ami is None or len(ami), len("ami-02b46c73fed689d1c"))
self.assertTrue(ami is None or ami.startswith("ami-"))

# TODO: This will fail until https://github.com/flatcar/Flatcar/issues/962 is fixed
@pytest.mark.xfail
def test_fetch_arm_flatcar(self):
"""Test flatcar AMI finder architecture parameter."""
amis = set()
for arch in ["amd64", "arm64"]:
ami = get_flatcar_ami(self.ec2_client, architecture=arch)
try:
ami = get_flatcar_ami(self.ec2_client, architecture="arm64")
self.assertTrue(ami.startswith("ami-"))
amis.add(ami)
self.assertTrue(len(amis) == 2)
except ReleaseFeedUnavailableError:
# Ignore any remote systems being down.
pass
Loading