Commit 79e3e17

hlky and Narsil authored
latent-to-image (#469)
* latent-to-image
* needs_upcasting, unscale/denormalize
* inputs
* Update .gitignore
* use base64
* make

---------

Co-authored-by: Nicolas Patry <[email protected]>
1 parent 329d9bb commit 79e3e17

File tree

18 files changed: +915 −1 lines changed


api_inference_community/validation.py

Lines changed: 32 additions & 1 deletion
@@ -196,8 +196,12 @@ def check_inputs(inputs, tag):
 IMAGE_OUTPUTS = {
     "image-to-image",
     "text-to-image",
+    "latent-to-image",
 }
 
+TENSOR_INPUTS = {
+    "latent-to-image",
+}
 
 TEXT_INPUTS = {
     "conversational",
@@ -218,7 +222,7 @@ def check_inputs(inputs, tag):
     "zero-shot-classification",
 }
 
-KNOWN_TASKS = AUDIO_INPUTS.union(IMAGE_INPUTS).union(TEXT_INPUTS)
+KNOWN_TASKS = AUDIO_INPUTS.union(IMAGE_INPUTS).union(TEXT_INPUTS).union(TENSOR_INPUTS)
 
 AUDIO = [
     "flac",
@@ -266,6 +270,8 @@ def normalize_payload(
         return normalize_payload_image(bpayload)
     elif task in TEXT_INPUTS:
         return normalize_payload_nlp(bpayload, task)
+    elif task in TENSOR_INPUTS:
+        return normalize_payload_tensor(bpayload)
     else:
         raise EnvironmentError(
             f"The task `{task}` is not recognized by api-inference-community"
@@ -407,3 +413,28 @@ def normalize_payload_nlp(bpayload: bytes, task: str) -> Tuple[Any, Dict]:
     check_params(parameters, task)
     check_inputs(inputs, task)
     return inputs, parameters
+
+
+def normalize_payload_tensor(bpayload: bytes) -> Tuple[Any, Dict]:
+    import torch
+
+    data = json.loads(bpayload)
+    tensor = data["inputs"]
+    tensor = b64decode(tensor.encode("utf-8"))
+    parameters = data.get("parameters", {})
+    if "shape" not in parameters:
+        raise ValueError("Expected `shape` in parameters.")
+    if "dtype" not in parameters:
+        raise ValueError("Expected `dtype` in parameters.")
+
+    DTYPE_MAP = {
+        "float16": torch.float16,
+        "float32": torch.float32,
+        "bfloat16": torch.bfloat16,
+    }
+
+    shape = parameters.pop("shape")
+    dtype = DTYPE_MAP.get(parameters.pop("dtype"))
+    tensor = torch.frombuffer(bytearray(tensor), dtype=dtype).reshape(shape)
+
+    return tensor, parameters
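To make the wire format concrete, here is a minimal client-side sketch that builds a payload `normalize_payload_tensor` can decode. The 1x4x64x64 float16 latent shape and the local round-trip check are illustrative assumptions, not part of the commit:

    import base64
    import json

    import torch

    # Payload layout expected by the server: base64-encoded raw tensor bytes
    # under "inputs", plus "shape" and "dtype" under "parameters".
    latents = torch.randn(1, 4, 64, 64, dtype=torch.float16)
    payload = {
        "inputs": base64.b64encode(latents.numpy().tobytes()).decode("utf-8"),
        "parameters": {"shape": list(latents.shape), "dtype": "float16"},
    }
    bpayload = json.dumps(payload).encode("utf-8")

    # Round-trip through the same decoding steps as the diff above.
    data = json.loads(bpayload)
    raw = base64.b64decode(data["inputs"].encode("utf-8"))
    decoded = torch.frombuffer(bytearray(raw), dtype=torch.float16).reshape(
        data["parameters"]["shape"]
    )
    assert torch.equal(decoded, latents)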
docker_images/latent-to-image/Dockerfile

Lines changed: 61 additions & 0 deletions
@@ -0,0 +1,61 @@
+FROM nvidia/cuda:11.8.0-cudnn8-runtime-ubuntu20.04
+LABEL maintainer="Nicolas Patry <[email protected]>"
+
+# Add any system dependency here
+# RUN apt-get update -y && apt-get install libXXX -y
+
+ENV DEBIAN_FRONTEND=noninteractive
+
+# Install prerequisites
+RUN apt-get update && \
+    apt-get install -y build-essential libssl-dev zlib1g-dev libbz2-dev \
+    libreadline-dev libsqlite3-dev wget curl llvm libncurses5-dev libncursesw5-dev \
+    xz-utils tk-dev libffi-dev liblzma-dev python3-openssl git
+
+# Install pyenv
+RUN curl https://pyenv.run | bash
+
+# Set environment variables for pyenv
+ENV PYENV_ROOT=/root/.pyenv
+ENV PATH=$PYENV_ROOT/shims:$PYENV_ROOT/bin:$PATH
+
+# Install your desired Python version
+ARG PYTHON_VERSION=3.9.1
+RUN pyenv install $PYTHON_VERSION && \
+    pyenv global $PYTHON_VERSION && \
+    pyenv rehash
+
+RUN pip install --no-cache-dir --upgrade pip setuptools wheel && \
+    pip install --no-cache-dir torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118
+
+WORKDIR /app
+COPY ./requirements.txt /app
+RUN pip install --no-cache-dir -r requirements.txt
+
+# Most DL models are quite large in terms of memory, using workers is a HUGE
+# slowdown because of the fork and GIL with python.
+# Using multiple pods seems like a better default strategy.
+# Feel free to override if it does not make sense for your library.
+ARG max_workers=1
+ENV MAX_WORKERS=$max_workers
+ENV HUGGINGFACE_HUB_CACHE=/data
+ENV DIFFUSERS_CACHE=/data
+
+# Necessary in GPU docker environments.
+# The TIMEOUT env variable is used by nvcr.io/nvidia/pytorch:xx for another purpose,
+# making the TIMEOUT variable expected by uvicorn impossible to use correctly.
+# We're renaming it to UVICORN_TIMEOUT.
+# UVICORN_TIMEOUT is a useful variable for very large models that take more
+# than 30s (the default) to load in memory.
+# If UVICORN_TIMEOUT is too low, uvicorn will simply never load, as it will
+# kill workers all the time before they finish.
+COPY --from=tiangolo/uvicorn-gunicorn:python3.8 /app/ /app
+COPY --from=tiangolo/uvicorn-gunicorn:python3.8 /start.sh /
+COPY --from=tiangolo/uvicorn-gunicorn:python3.8 /gunicorn_conf.py /
+COPY app/ /app/app
+
+COPY ./prestart.sh /app/
+
+RUN sed -i 's/TIMEOUT/UVICORN_TIMEOUT/g' /gunicorn_conf.py
+
+CMD ["/start.sh"]
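For context on the sed rewrite: tiangolo's gunicorn_conf.py resolves its worker timeout from an environment variable, and after the rename above that lookup is keyed on UVICORN_TIMEOUT instead of TIMEOUT, sidestepping the clash with NVIDIA base images. A minimal sketch of the resulting behavior, assuming a 120s upstream default (not pinned by this Dockerfile):

    import os

    # After `sed -i 's/TIMEOUT/UVICORN_TIMEOUT/g'` the config reads
    # UVICORN_TIMEOUT, so NVIDIA's own TIMEOUT variable no longer interferes.
    # The "120" fallback is an assumption for illustration.
    timeout = int(os.getenv("UVICORN_TIMEOUT", "120"))

Deployment-side, exporting e.g. UVICORN_TIMEOUT=300 gives a large model more than the default window to load before gunicorn kills the worker.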

docker_images/latent-to-image/app/__init__.py

Whitespace-only changes.
docker_images/latent-to-image/app/healthchecks.py

Lines changed: 239 additions & 0 deletions
@@ -0,0 +1,239 @@
+"""
+This file allows users to spawn some side service helping with giving a better view on the main ASGI app status.
+The issue with the status route of the main application is that it gets unresponsive as soon as all workers get busy.
+Thus, you cannot really use the said route as a healthcheck to decide whether your app is healthy or not.
+Instead this module allows you to distinguish between a dead service (not able to even tcp connect to app port)
+and a busy one (able to connect but not to process a trivial http request in time) as both states should result in
+different actions (restarting the service vs scaling it). It also exposes some data to be
+consumed as custom metrics, for example to be used in autoscaling decisions.
+"""
+
+import asyncio
+import functools
+import logging
+import os
+from collections import namedtuple
+from typing import Optional
+
+import aiohttp
+import psutil
+from starlette.applications import Starlette
+from starlette.requests import Request
+from starlette.responses import Response
+from starlette.routing import Route
+
+
+logger = logging.getLogger(__name__)
+
+
+METRICS = ""
+STATUS_OK = 0
+STATUS_BUSY = 1
+STATUS_ERROR = 2
+
+
+def metrics():
+    logging.debug("Requesting metrics")
+    return METRICS
+
+
+async def metrics_route(_request: Request) -> Response:
+    return Response(content=metrics())
+
+
+routes = [
+    Route("/{whatever:path}", metrics_route),
+]
+
+app = Starlette(routes=routes)
+
+
+def reset_logging():
+    if os.environ.get("METRICS_DEBUG", "false").lower() in ["1", "true"]:
+        level = logging.DEBUG
+    else:
+        level = logging.INFO
+    logging.basicConfig(
+        level=level,
+        format="healthchecks - %(asctime)s - %(levelname)s - %(message)s",
+        force=True,
+    )
+
+
+@app.on_event("startup")
+async def startup_event():
+    reset_logging()
+    # Link between `api-inference-community` and framework code.
+    asyncio.create_task(compute_metrics_loop(), name="compute_metrics")
+
+
+@functools.lru_cache()
+def get_listening_port():
+    logger.debug("Get listening port")
+    main_app_port = os.environ.get("MAIN_APP_PORT", "80")
+    try:
+        main_app_port = int(main_app_port)
+    except ValueError:
+        logger.warning(
+            "Main app port cannot be converted to an int, skipping and defaulting to 80"
+        )
+        main_app_port = 80
+    return main_app_port
+
+
+async def find_app_process(
+    listening_port: int,
+) -> Optional[namedtuple("addr", ["ip", "port"])]:  # noqa
+    connections = psutil.net_connections()
+    app_laddr = None
+    for c in connections:
+        if c.laddr.port != listening_port:
+            logger.debug("Skipping listening connection bound to excluded port %s", c)
+            continue
+        if c.status == psutil.CONN_LISTEN:
+            logger.debug("Found LISTEN conn %s", c)
+            candidate = c.pid
+            try:
+                p = psutil.Process(candidate)
+            except psutil.NoSuchProcess:
+                continue
+            if p.name() == "gunicorn":
+                logger.debug("Found gunicorn process %s", p)
+                app_laddr = c.laddr
+                break
+
+    return app_laddr
+
+
+def count_current_conns(app_port: int) -> str:
+    estab = []
+    conns = psutil.net_connections()
+
+    # logger.debug("Connections %s", conns)
+
+    for c in conns:
+        if c.status != psutil.CONN_ESTABLISHED:
+            continue
+        if c.laddr.port == app_port:
+            estab.append(c)
+    current_conns = len(estab)
+    logger.info("Current count of established connections to app: %d", current_conns)
+
+    curr_conns_str = """# HELP inference_app_established_conns Established connection count for a given app.
+# TYPE inference_app_established_conns gauge
+inference_app_established_conns{{port="{:d}"}} {:d}
+""".format(
+        app_port, current_conns
+    )
+    return curr_conns_str
+
+
+async def status_with_timeout(
+    listening_port: int, app_laddr: Optional[namedtuple("addr", ["ip", "port"])]  # noqa
+) -> str:
+    logger.debug("Checking application status")
+
+    status = STATUS_OK
+
+    if not app_laddr:
+        status = STATUS_ERROR
+    else:
+        try:
+            async with aiohttp.ClientSession(
+                timeout=aiohttp.ClientTimeout(total=0.5)
+            ) as session:
+                url = "http://{}:{:d}/".format(app_laddr.ip, app_laddr.port)
+                async with session.get(url) as resp:
+                    status_code = resp.status
+                    status_text = await resp.text()
+                    logger.debug("Status code %s and text %s", status_code, status_text)
+                    if status_code != 200 or status_text != '{"ok":"ok"}':
+                        status = STATUS_ERROR
+        except asyncio.TimeoutError:
+            logger.debug("Asgi app seems busy, unable to reach it before timeout")
+            status = STATUS_BUSY
+        except Exception as e:
+            logger.exception(e)
+            status = STATUS_ERROR
+
+    status_str = """# HELP inference_app_status Application health status (0: ok, 1: busy, 2: error).
+# TYPE inference_app_status gauge
+inference_app_status{{port="{:d}"}} {:d}
+""".format(
+        listening_port, status
+    )
+
+    return status_str
+
+
+async def single_metrics_compute():
+    global METRICS
+    listening_port = get_listening_port()
+    app_laddr = await find_app_process(listening_port)
+    current_conns = count_current_conns(listening_port)
+    status = await status_with_timeout(listening_port, app_laddr)
+
+    # Assignment is atomic, we should be safe without locking
+    METRICS = current_conns + status
+
+    # Persist metrics to the local ephemeral as well
+    metrics_file = os.environ.get("METRICS_FILE")
+    if metrics_file:
+        with open(metrics_file, "w") as f:
+            f.write(METRICS)
+
+
+@functools.lru_cache()
+def get_polling_sleep():
+    logger.debug("Get polling sleep interval")
+    sleep_value = os.environ.get("METRICS_POLLING_INTERVAL", 10)
+    try:
+        sleep_value = float(sleep_value)
+    except ValueError:
+        logger.warning(
+            "Unable to cast METRICS_POLLING_INTERVAL env value %s to float. Defaulting to 10.",
+            sleep_value,
+        )
+        sleep_value = 10.0
+    return sleep_value
+
+
+@functools.lru_cache()
+def get_initial_delay():
+    logger.debug("Get polling initial delay")
+    sleep_value = os.environ.get("METRICS_INITIAL_DELAY", 30)
+    try:
+        sleep_value = float(sleep_value)
+    except ValueError:
+        logger.warning(
+            "Unable to cast METRICS_INITIAL_DELAY env value %s to float. "
+            "Defaulting to 30.",
+            sleep_value,
+        )
+        sleep_value = 30.0
+    return sleep_value
+
+
+async def compute_metrics_loop():
+    initial_delay = get_initial_delay()
+
+    await asyncio.sleep(initial_delay)
+
+    polling_sleep = get_polling_sleep()
+    while True:
+        await asyncio.sleep(polling_sleep)
+        try:
+            await single_metrics_compute()
+        except Exception as e:
+            logger.error("Something wrong occurred while computing metrics")
+            logger.exception(e)
+
+
+if __name__ == "__main__":
+    reset_logging()
+    try:
+        asyncio.run(single_metrics_compute())
+        logger.info("Metrics %s", metrics())
+    except Exception as exc:
+        logging.exception(exc)
+        raise
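As a usage sketch, the side app above answers any path with the latest snapshot in Prometheus text format. The port 9000 below is a hypothetical choice for where this Starlette app is served; the `port` label in the output refers to the MAIN_APP_PORT of the watched app:

    import asyncio

    import aiohttp


    async def scrape(url: str = "http://localhost:9000/metrics") -> str:
        # Any path works: the app routes "/{whatever:path}" to metrics_route.
        async with aiohttp.ClientSession() as session:
            async with session.get(url) as resp:
                return await resp.text()


    print(asyncio.run(scrape()))
    # Expected shape of the output (illustrative values):
    #   inference_app_established_conns{port="80"} 3
    #   inference_app_status{port="80"} 0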
