Skip to content

Commit 3c8457a

Browse files
authored
feat(BA-922): Add internal address config (#3989)
1 parent 4f66a84 commit 3c8457a

File tree

20 files changed

+114
-27
lines changed

20 files changed

+114
-27
lines changed

changes/3989.feature.md

+1
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
Add a separate internal API address/port config for serving internal endpoints such as `/metrics`

configs/account-manager/halfstack.toml

+1
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@ pool-pre-ping = false
1818
[account-manager]
1919
num-proc = 1
2020
service-addr = { host = "0.0.0.0", port = 8088 }
21+
internal-addr = { host = "0.0.0.0", port = 8087 }
2122
#user = "nobody"
2223
#group = "nobody"
2324
ssl-enabled = false

configs/account-manager/sample.toml

+2
Original file line numberDiff line numberDiff line change
@@ -46,6 +46,8 @@ num-proc = 1
4646

4747
# Set the service hostname/port to accept API requests.
4848
service-addr = { host = "0.0.0.0", port = 8088 }
49+
# Set the internal hostname/port to accept internal API requests.
50+
internal-addr = { host = "0.0.0.0", port = 8087 }
4951

5052
# Specify the user/group used for the account-manager daemon,
5153
# to which the account-manager changes after reading the daemon configuration and SSL certificates.

configs/manager/halfstack.toml

+1
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@ pool-pre-ping = false
1818
[manager]
1919
num-proc = 4
2020
service-addr = { host = "0.0.0.0", port = 8081 }
21+
internal-addr = { host = "0.0.0.0", port = 8092 }
2122
#user = "nobody"
2223
#group = "nobody"
2324
ssl-enabled = false

configs/manager/sample.toml

+3
Original file line numberDiff line numberDiff line change
@@ -73,6 +73,9 @@ group = "nobody"
7373

7474
# Set the service hostname/port to accept API requests.
7575
service-addr = { host = "0.0.0.0", port = 8080 }
76+
77+
# Set the internal hostname/port to accept internal API requests.
78+
internal-addr = { host = "0.0.0.0", port = 8082 }
7679
# env: BACKEND_SERVICE_IP, BACKEND_SERVICE_PORT
7780

7881
# Set the SSL certificate chain and the private keys used for serving the API requests.

configs/prometheus/prometheus.yaml

+4-7
Original file line numberDiff line numberDiff line change
@@ -21,19 +21,16 @@ scrape_configs:
2121
- targets: ['backendai-half-prometheus:9090']
2222
- job_name: backendai-half-manager
2323
static_configs:
24-
- targets: ['host.docker.internal:8091']
24+
- targets: ['host.docker.internal:8092']
2525
- job_name: backendai-half-account-manager
2626
static_configs:
27-
- targets: ['host.docker.internal:8088']
27+
- targets: ['host.docker.internal:8087']
2828
- job_name: backendai-half-storage-proxy
29-
scheme: https
30-
tls_config:
31-
insecure_skip_verify: true
3229
static_configs:
33-
- targets: ['host.docker.internal:6022']
30+
- targets: ['host.docker.internal:6023']
3431
- job_name: backendai-half-wsproxy
3532
static_configs:
36-
- targets: ['host.docker.internal:5050']
33+
- targets: ['host.docker.internal:5051']
3734
- job_name: backendai-half-agent
3835
static_configs:
3936
- targets: ['host.docker.internal:6003']

configs/storage-proxy/halfstack.toml

+1
Original file line numberDiff line numberDiff line change
@@ -56,6 +56,7 @@ ssl-enabled = false
5656
[api.manager]
5757
# Manager-facing API
5858
service-addr = { host = "0.0.0.0", port = 6022 }
59+
internal-addr = { host = "0.0.0.0", port = 6023 }
5960
ssl-enabled = false
6061
# ssl-cert = "configs/storage-proxy/ssl/manager-api-selfsigned.cert.pem"
6162
# ssl-privkey = "configs/storage-proxy/ssl/manager-api-selfsigned.key.pem"

configs/storage-proxy/sample.toml

+2
Original file line numberDiff line numberDiff line change
@@ -68,6 +68,8 @@ ssl-enabled = false
6868
# Manager-facing API
6969
# Recommended to have SSL and bind on a private IP only accessible by managers
7070
service-addr = { host = "0.0.0.0", port = 6022 }
71+
# Set the internal hostname/port to accept internal API requests.
72+
internal-addr = { host = "0.0.0.0", port = 6023 }
7173
ssl-enabled = true
7274
ssl-cert = "configs/storage-proxy/ssl/manager-api-selfsigned.cert.pem"
7375
ssl-privkey = "configs/storage-proxy/ssl/manager-api-selfsigned.key.pem"

configs/wsproxy/halfstack.toml

+2
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,8 @@ bind_host = "0.0.0.0"
33
advertised_host = "127.0.0.1"
44

55
bind_api_port = 5050
6+
internal_api_port = 5051
7+
68
advertised_api_port = 5050
79

810
# replace these values with your passphrase

configs/wsproxy/sample.toml

+2
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,8 @@ group = 501
88
bind_host = "0.0.0.0"
99
advertised_host = "example.com"
1010
bind_api_port = 5050
11+
# Set the internal port to accept internal API requests.
12+
internal_api_port = 5051
1113
advertised_api_port = 15050
1214
bind_proxy_port_range = [
1315
10200,

src/ai/backend/account_manager/config.py

+7
Original file line numberDiff line numberDiff line change
@@ -294,6 +294,13 @@ class AccountManagerConfig(BaseSchema):
294294
examples=[HostPortPair(host="127.0.0.1", port=8099)],
295295
),
296296
]
297+
internal_addr: Annotated[
298+
HostPortPair,
299+
Field(
300+
description="Address of account-manager internal service for internal infra communication.",
301+
examples=[HostPortPair(host="127.0.0.1", port=8098)],
302+
),
303+
]
297304
ipc_base_path: Annotated[
298305
Path,
299306
Field(

src/ai/backend/account_manager/server.py

+18-3
Original file line numberDiff line numberDiff line change
@@ -265,9 +265,13 @@ async def _call_cleanup_context_shutdown_handlers(app: web.Application) -> None:
265265
# should be done in create_app() in other modules.
266266
cors.add(app.router.add_route("GET", r"", hello))
267267
cors.add(app.router.add_route("GET", r"/", hello))
268-
cors.add(
269-
app.router.add_route("GET", r"/metrics", build_prometheus_metrics_handler(metric_registry))
270-
)
268+
return app
269+
270+
271+
def build_internal_app() -> web.Application:
272+
app = web.Application()
273+
metric_registry = CommonMetricRegistry.instance()
274+
app.router.add_route("GET", r"/metrics", build_prometheus_metrics_handler(metric_registry))
271275
return app
272276

273277

@@ -278,6 +282,7 @@ async def server_main(
278282
_args: list[Any],
279283
) -> AsyncIterator[None]:
280284
root_app = build_root_app(pidx, _args[0], subapp_pkgs=global_subapp_pkgs)
285+
internal_app = build_internal_app()
281286
root_ctx: RootContext = root_app["_root.context"]
282287

283288
local_cfg = cast(ServerConfig, root_ctx.local_config)
@@ -327,7 +332,9 @@ async def server_main(
327332
)
328333

329334
runner = web.AppRunner(root_app, keepalive_timeout=30.0)
335+
internal_runner = web.AppRunner(internal_app, keepalive_timeout=30.0)
330336
await runner.setup()
337+
await internal_runner.setup()
331338
service_addr = am_cfg.service_addr
332339
site = web.TCPSite(
333340
runner,
@@ -337,7 +344,15 @@ async def server_main(
337344
reuse_port=True,
338345
ssl_context=ssl_ctx,
339346
)
347+
internal_site = web.TCPSite(
348+
internal_runner,
349+
str(am_cfg.internal_addr.host),
350+
am_cfg.internal_addr.port,
351+
backlog=1024,
352+
reuse_port=True,
353+
)
340354
await site.start()
355+
await internal_site.start()
341356

342357
if os.geteuid() == 0:
343358
uid = am_cfg.user

src/ai/backend/manager/config.py

+1
Original file line numberDiff line numberDiff line change
@@ -265,6 +265,7 @@
265265
t.Key("user", default=None): tx.UserID(default_uid=_file_perm.st_uid),
266266
t.Key("group", default=None): tx.GroupID(default_gid=_file_perm.st_gid),
267267
t.Key("service-addr", default=("0.0.0.0", 8080)): tx.HostPortPair,
268+
t.Key("internal-addr", default=("0.0.0.0", 8081)): tx.HostPortPair,
268269
t.Key(
269270
"rpc-auth-manager-keypair", default="fixtures/manager/manager.key_secret"
270271
): tx.Path(type="file"),

src/ai/backend/manager/server.py

+20-4
Original file line numberDiff line numberDiff line change
@@ -855,7 +855,7 @@ def build_root_app(
855855
sample_rate=local_config["pyroscope"]["sample-rate"],
856856
)
857857
)
858-
root_ctx = RootContext(metrics=CommonMetricRegistry())
858+
root_ctx = RootContext(metrics=CommonMetricRegistry.instance())
859859
app = web.Application(
860860
middlewares=[
861861
exception_middleware,
@@ -942,9 +942,6 @@ async def _call_cleanup_context_shutdown_handlers(app: web.Application) -> None:
942942
# should be done in create_app() in other modules.
943943
cors.add(app.router.add_route("GET", r"", hello))
944944
cors.add(app.router.add_route("GET", r"/", hello))
945-
cors.add(
946-
app.router.add_route("GET", r"/metrics", build_prometheus_metrics_handler(root_ctx.metrics))
947-
)
948945
if subapp_pkgs is None:
949946
subapp_pkgs = []
950947
for pkg_name in subapp_pkgs:
@@ -959,6 +956,13 @@ async def _call_cleanup_context_shutdown_handlers(app: web.Application) -> None:
959956
return app
960957

961958

959+
def build_internal_app() -> web.Application:
960+
app = web.Application()
961+
metric_registry = CommonMetricRegistry.instance()
962+
app.router.add_route("GET", r"/metrics", build_prometheus_metrics_handler(metric_registry))
963+
return app
964+
965+
962966
def build_public_app(
963967
root_ctx: RootContext,
964968
subapp_pkgs: Iterable[str] | None = None,
@@ -982,6 +986,7 @@ async def server_main(
982986
_args: List[Any],
983987
) -> AsyncIterator[None]:
984988
root_app = build_root_app(pidx, _args[0], subapp_pkgs=global_subapp_pkgs)
989+
internal_app = build_internal_app()
985990
root_ctx: RootContext = root_app["_root.context"]
986991

987992
# Start aiomonitor.
@@ -1021,8 +1026,11 @@ async def server_main(
10211026
)
10221027

10231028
runner = web.AppRunner(root_app, keepalive_timeout=30.0)
1029+
internal_runner = web.AppRunner(internal_app, keepalive_timeout=30.0)
10241030
await runner.setup()
1031+
await internal_runner.setup()
10251032
service_addr = cast(HostPortPair, root_ctx.local_config["manager"]["service-addr"])
1033+
internal_addr = cast(HostPortPair, root_ctx.local_config["manager"]["internal-addr"])
10261034
site = web.TCPSite(
10271035
runner,
10281036
str(service_addr.host),
@@ -1031,7 +1039,15 @@ async def server_main(
10311039
reuse_port=True,
10321040
ssl_context=ssl_ctx,
10331041
)
1042+
internal_site = web.TCPSite(
1043+
internal_runner,
1044+
str(internal_addr.host),
1045+
internal_addr.port,
1046+
backlog=1024,
1047+
reuse_port=True,
1048+
)
10341049
await site.start()
1050+
await internal_site.start()
10351051
public_metrics_port = cast(
10361052
Optional[int], root_ctx.local_config["manager"]["public-metrics-port"]
10371053
)

src/ai/backend/storage/api/manager.py

+12-9
Original file line numberDiff line numberDiff line change
@@ -42,7 +42,11 @@
4242
VolumeMounted,
4343
VolumeUnmounted,
4444
)
45-
from ai.backend.common.metrics.http import build_api_metric_middleware
45+
from ai.backend.common.metrics.http import (
46+
build_api_metric_middleware,
47+
build_prometheus_metrics_handler,
48+
)
49+
from ai.backend.common.metrics.metric import CommonMetricRegistry
4650
from ai.backend.common.types import AgentId, BinarySize, ItemResult, QuotaScopeID, ResultSet
4751
from ai.backend.logging import BraceStyleAdapter
4852

@@ -105,13 +109,6 @@ async def check_status(request: web.Request) -> web.Response:
105109
)
106110

107111

108-
@skip_token_auth
109-
async def prometheus_metrics_handler(request: web.Request) -> web.Response:
110-
root_ctx: RootContext = request.app["ctx"]
111-
metrics = root_ctx.metric_registry.to_prometheus()
112-
return web.Response(text=metrics, content_type="text/plain")
113-
114-
115112
@ctxmgr
116113
def handle_fs_errors(
117114
volume: AbstractVolume,
@@ -1214,7 +1211,6 @@ async def init_manager_app(ctx: RootContext) -> web.Application:
12141211
app["app_ctx"] = app_ctx
12151212
app.on_shutdown.append(_shutdown)
12161213
app.router.add_route("GET", "/", check_status)
1217-
app.router.add_route("GET", "/metrics", prometheus_metrics_handler)
12181214
app.router.add_route("GET", "/status", check_status)
12191215
app.router.add_route("GET", "/volumes", get_volumes)
12201216
app.router.add_route("GET", "/volume/hwinfo", get_hwinfo)
@@ -1251,6 +1247,13 @@ async def init_manager_app(ctx: RootContext) -> web.Application:
12511247
return app
12521248

12531249

1250+
def init_internal_app() -> web.Application:
1251+
app = web.Application()
1252+
metric_registry = CommonMetricRegistry.instance()
1253+
app.router.add_route("GET", "/metrics", build_prometheus_metrics_handler(metric_registry))
1254+
return app
1255+
1256+
12541257
async def handle_volume_mount(
12551258
context: RootContext,
12561259
source: AgentId,

src/ai/backend/storage/config.py

+3
Original file line numberDiff line numberDiff line change
@@ -100,6 +100,9 @@
100100
t.Key("service-addr"): tx.HostPortPair(
101101
allow_blank_host=True,
102102
),
103+
t.Key("internal-addr", default=("127.0.0.1", 6023)): tx.HostPortPair(
104+
allow_blank_host=True,
105+
),
103106
t.Key("ssl-enabled"): t.ToBool,
104107
t.Key("ssl-cert", default=None): t.Null | tx.Path(type="file"),
105108
t.Key("ssl-privkey", default=None): t.Null | tx.Path(type="file"),

src/ai/backend/storage/context.py

+2-1
Original file line numberDiff line numberDiff line change
@@ -26,7 +26,7 @@
2626
from ai.backend.logging import BraceStyleAdapter
2727

2828
from .api.client import init_client_app
29-
from .api.manager import init_manager_app
29+
from .api.manager import init_internal_app, init_manager_app
3030
from .exception import InvalidVolumeError
3131
from .plugin import (
3232
BasePluginContext,
@@ -173,6 +173,7 @@ async def __aenter__(self) -> None:
173173
# TODO: Setup the apps outside of the context.
174174
self.client_api_app = await init_client_app(self)
175175
self.manager_api_app = await init_manager_app(self)
176+
self.internal_api_app = init_internal_app()
176177
self.backends = {
177178
**DEFAULT_BACKENDS,
178179
}

src/ai/backend/storage/server.py

+11
Original file line numberDiff line numberDiff line change
@@ -206,10 +206,13 @@ async def server_main(
206206
)
207207
client_api_runner = web.AppRunner(ctx.client_api_app)
208208
manager_api_runner = web.AppRunner(ctx.manager_api_app)
209+
internal_api_runner = web.AppRunner(ctx.internal_api_app)
209210
await client_api_runner.setup()
210211
await manager_api_runner.setup()
212+
await internal_api_runner.setup()
211213
client_service_addr = local_config["api"]["client"]["service-addr"]
212214
manager_service_addr = local_config["api"]["manager"]["service-addr"]
215+
internal_addr = local_config["api"]["manager"]["internal-addr"]
213216
client_api_site = web.TCPSite(
214217
client_api_runner,
215218
str(client_service_addr.host),
@@ -226,8 +229,16 @@ async def server_main(
226229
reuse_port=True,
227230
ssl_context=manager_ssl_ctx,
228231
)
232+
internal_api_site = web.TCPSite(
233+
internal_api_runner,
234+
str(internal_addr.host),
235+
internal_addr.port,
236+
backlog=1024,
237+
reuse_port=True,
238+
)
229239
await client_api_site.start()
230240
await manager_api_site.start()
241+
await internal_api_site.start()
231242
if _is_root():
232243
uid = local_config["storage-proxy"]["user"]
233244
gid = local_config["storage-proxy"]["group"]

src/ai/backend/wsproxy/config.py

+3
Original file line numberDiff line numberDiff line change
@@ -390,6 +390,9 @@ class WSProxyConfig(BaseSchema):
390390
bind_api_port: Annotated[
391391
int, Field(default=5050, description="Port number to bind for API server")
392392
]
393+
internal_api_port: Annotated[
394+
int, Field(default=5051, description="Port number to bind for internal API server")
395+
]
393396
advertised_api_port: Annotated[
394397
int | None,
395398
Field(default=None, examples=[15050], description="API port number reachable from client"),

0 commit comments

Comments
 (0)