Skip to content

Commit

Permalink
Setup AspNetCore health checks and Prometheus metrics
Browse files Browse the repository at this point in the history
Closes #2001
  • Loading branch information
Cyberboss committed Feb 16, 2025
1 parent c25b9f1 commit d61cd57
Show file tree
Hide file tree
Showing 22 changed files with 258 additions and 6 deletions.
4 changes: 3 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -256,7 +256,9 @@ Create an `appsettings.Production.yml` file next to `appsettings.yml`. This will

- `General:ConfigVersion`: Suppresses warnings about out of date config versions. You should change this after updating TGS to one with a new config version. The current version can be found on the releases page for your server version.

- `General:MinimumPasswordLength`: Minimum password length requirement for database users
- `General:MinimumPasswordLength`: Minimum password length requirement for database users.

- `General:PrometheusPort`: Port on which Prometheus metrics are published, under `/metrics`. This can be set to the same value as `ApiPort`, but note that accessing the metrics endpoint does not require authentication.

- `General:ValidInstancePaths`: Array meant to limit the directories in which instances may be created.

Expand Down
5 changes: 4 additions & 1 deletion build/Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -58,11 +58,12 @@ RUN export TGS_TELEMETRY_KEY_FILE="../../${TGS_TELEMETRY_KEY_FILE}" \

FROM mcr.microsoft.com/dotnet/aspnet:8.0-bookworm-slim

#needed for byond
#needed for byond, curl for healthchecks
RUN apt-get update \
&& apt-get install -y \
gcc-multilib \
gdb \
curl \
&& rm -rf /var/lib/apt/lists/*

EXPOSE 5000
Expand All @@ -78,4 +79,6 @@ COPY --from=build /repo/build/tgs.docker.sh tgs.sh

VOLUME ["/config_data", "/tgs_logs", "/app/lib"]

# Probe the /health endpoint on the exposed port 5000 with curl.
# --fail makes curl return non-zero on HTTP error responses; any failure is
# normalized to exit code 1, the status Docker defines as "unhealthy".
HEALTHCHECK CMD curl --fail http://localhost:5000/health || exit 1

ENTRYPOINT ["./tgs.sh"]
2 changes: 1 addition & 1 deletion build/Version.props
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
<Import Project="WebpanelVersion.props" />
<PropertyGroup>
<TgsCoreVersion>6.13.0</TgsCoreVersion>
<TgsConfigVersion>5.4.0</TgsConfigVersion>
<TgsConfigVersion>5.5.0</TgsConfigVersion>
<TgsRestVersion>10.12.1</TgsRestVersion>
<TgsGraphQLVersion>0.5.0</TgsGraphQLVersion>
<TgsCommonLibraryVersion>7.0.0</TgsCommonLibraryVersion>
Expand Down
34 changes: 33 additions & 1 deletion src/Tgstation.Server.Host/Components/Deployment/DreamMaker.cs
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,8 @@
using Microsoft.EntityFrameworkCore;
using Microsoft.Extensions.Logging;

using Prometheus;

using Tgstation.Server.Api.Models;
using Tgstation.Server.Api.Models.Internal;
using Tgstation.Server.Common.Extensions;
Expand Down Expand Up @@ -105,6 +107,21 @@ sealed class DreamMaker : IDreamMaker
/// </summary>
readonly Api.Models.Instance metadata;

/// <summary>
/// The number of attempted deployments.
/// </summary>
readonly Counter attemptedDeployments;

/// <summary>
/// The number of successful deployments.
/// </summary>
readonly Counter successfulDeployments;

/// <summary>
/// The number of failed deployments.
/// </summary>
readonly Counter failedDeployments;

/// <summary>
/// <see langword="lock"/> <see cref="object"/> for <see cref="deploying"/>.
/// </summary>
Expand Down Expand Up @@ -149,6 +166,7 @@ static string FormatExceptionForUsers(Exception exception)
/// <param name="repositoryManager">The value of <see cref="repositoryManager"/>.</param>
/// <param name="remoteDeploymentManagerFactory">The value of <see cref="remoteDeploymentManagerFactory"/>.</param>
/// <param name="asyncDelayer">The value of <see cref="asyncDelayer"/>.</param>
/// <param name="metricFactory">The <see cref="IMetricFactory"/> to use.</param>
/// <param name="logger">The value of <see cref="logger"/>.</param>
/// <param name="sessionConfiguration">The value of <see cref="sessionConfiguration"/>.</param>
/// <param name="metadata">The value of <see cref="metadata"/>.</param>
Expand All @@ -164,6 +182,7 @@ public DreamMaker(
IRepositoryManager repositoryManager,
IRemoteDeploymentManagerFactory remoteDeploymentManagerFactory,
IAsyncDelayer asyncDelayer,
IMetricFactory metricFactory,
ILogger<DreamMaker> logger,
SessionConfiguration sessionConfiguration,
Api.Models.Instance metadata)
Expand All @@ -177,12 +196,17 @@ public DreamMaker(
this.processExecutor = processExecutor ?? throw new ArgumentNullException(nameof(processExecutor));
this.compileJobConsumer = compileJobConsumer ?? throw new ArgumentNullException(nameof(compileJobConsumer));
this.repositoryManager = repositoryManager ?? throw new ArgumentNullException(nameof(repositoryManager));
this.asyncDelayer = asyncDelayer ?? throw new ArgumentNullException(nameof(asyncDelayer));
this.remoteDeploymentManagerFactory = remoteDeploymentManagerFactory ?? throw new ArgumentNullException(nameof(remoteDeploymentManagerFactory));
this.asyncDelayer = asyncDelayer ?? throw new ArgumentNullException(nameof(asyncDelayer));
ArgumentNullException.ThrowIfNull(metricFactory);
this.logger = logger ?? throw new ArgumentNullException(nameof(logger));
this.sessionConfiguration = sessionConfiguration ?? throw new ArgumentNullException(nameof(sessionConfiguration));
this.metadata = metadata ?? throw new ArgumentNullException(nameof(metadata));

successfulDeployments = metricFactory.CreateCounter("tgs_successful_deployments", "The number of deployments that have completed successfully");
failedDeployments = metricFactory.CreateCounter("tgs_failed_deployments", "The number of deployments that have failed");
attemptedDeployments = metricFactory.CreateCounter("tgs_total_deployments", "The number of deployments that have been attempted");

deploymentLock = new object();
}

Expand All @@ -205,9 +229,12 @@ public async ValueTask DeploymentProcess(
deploying = true;
}

attemptedDeployments.Inc();

currentChatCallback = null;
currentDreamMakerOutput = null;
Models.CompileJob? compileJob = null;
bool success = false;
try
{
string? repoOwner = null;
Expand Down Expand Up @@ -351,6 +378,7 @@ await databaseContextFactory.UseContext(
{
var chatNotificationAction = currentChatCallback!(null, compileJob.Output!);
await compileJobConsumer.LoadCompileJob(compileJob, chatNotificationAction, cancellationToken);
success = true;
}
catch
{
Expand Down Expand Up @@ -406,6 +434,10 @@ await databaseContextFactory.UseContext(
finally
{
deploying = false;
if (success)
successfulDeployments.Inc();
else
failedDeployments.Inc();
}
}
#pragma warning restore CA1506
Expand Down
22 changes: 22 additions & 0 deletions src/Tgstation.Server.Host/Components/InstanceFactory.cs
Original file line number Diff line number Diff line change
@@ -1,10 +1,14 @@
using System;
using System.Collections.Generic;
using System.Globalization;
using System.Threading;
using System.Threading.Tasks;

using Microsoft.Extensions.Logging;
using Microsoft.Extensions.Options;

using Prometheus;

using Tgstation.Server.Host.Components.Chat;
using Tgstation.Server.Host.Components.Chat.Commands;
using Tgstation.Server.Host.Components.Deployment;
Expand Down Expand Up @@ -140,6 +144,11 @@ sealed class InstanceFactory : IInstanceFactory
/// </summary>
readonly IDotnetDumpService dotnetDumpService;

/// <summary>
/// The <see cref="IMetricFactory"/> for the <see cref="InstanceFactory"/>.
/// </summary>
readonly IMetricFactory metricFactory;

/// <summary>
/// The <see cref="GeneralConfiguration"/> for the <see cref="InstanceFactory"/>.
/// </summary>
Expand Down Expand Up @@ -183,6 +192,7 @@ sealed class InstanceFactory : IInstanceFactory
/// <param name="remoteDeploymentManagerFactory">The value of <see cref="remoteDeploymentManagerFactory"/>.</param>
/// <param name="asyncDelayer">The value of <see cref="asyncDelayer"/>.</param>
/// <param name="dotnetDumpService">The value of <see cref="dotnetDumpService"/>.</param>
/// <param name="metricFactory">The value of <see cref="metricFactory"/>.</param>
/// <param name="generalConfigurationOptions">The <see cref="IOptions{TOptions}"/> containing the value of <see cref="generalConfiguration"/>.</param>
/// <param name="sessionConfigurationOptions">The <see cref="IOptions{TOptions}"/> containing the value of <see cref="sessionConfiguration"/>.</param>
public InstanceFactory(
Expand All @@ -208,6 +218,7 @@ public InstanceFactory(
IRemoteDeploymentManagerFactory remoteDeploymentManagerFactory,
IAsyncDelayer asyncDelayer,
IDotnetDumpService dotnetDumpService,
IMetricFactory metricFactory,
IOptions<GeneralConfiguration> generalConfigurationOptions,
IOptions<SessionConfiguration> sessionConfigurationOptions)
{
Expand All @@ -233,6 +244,7 @@ public InstanceFactory(
this.remoteDeploymentManagerFactory = remoteDeploymentManagerFactory ?? throw new ArgumentNullException(nameof(remoteDeploymentManagerFactory));
this.asyncDelayer = asyncDelayer ?? throw new ArgumentNullException(nameof(asyncDelayer));
this.dotnetDumpService = dotnetDumpService ?? throw new ArgumentNullException(nameof(dotnetDumpService));
this.metricFactory = metricFactory ?? throw new ArgumentNullException(nameof(metricFactory));
generalConfiguration = generalConfigurationOptions?.Value ?? throw new ArgumentNullException(nameof(generalConfigurationOptions));
sessionConfiguration = sessionConfigurationOptions?.Value ?? throw new ArgumentNullException(nameof(sessionConfigurationOptions));
}
Expand Down Expand Up @@ -264,6 +276,13 @@ public async ValueTask<IInstance> CreateInstance(IBridgeRegistrar bridgeRegistra
var diagnosticsIOManager = new ResolvingIOManager(instanceIoManager, "Diagnostics");
var configurationIoManager = new ResolvingIOManager(instanceIoManager, "Configuration");

var metricFactory = this.metricFactory.WithLabels(
new Dictionary<string, string>
{
{ "instance_name", metadata.Name! },
{ "instance_id", metadata.Id!.Value.ToString(CultureInfo.InvariantCulture) },
});

var configuration = new StaticFiles.Configuration(
configurationIoManager,
synchronousIOManager,
Expand Down Expand Up @@ -323,6 +342,7 @@ public async ValueTask<IInstance> CreateInstance(IBridgeRegistrar bridgeRegistra
eventConsumer,
asyncDelayer,
dotnetDumpService,
metricFactory,
loggerFactory,
loggerFactory.CreateLogger<SessionControllerFactory>(),
sessionConfiguration,
Expand All @@ -337,6 +357,7 @@ public async ValueTask<IInstance> CreateInstance(IBridgeRegistrar bridgeRegistra
diagnosticsIOManager,
configuration, // watchdog doesn't need itself as an event consumer
remoteDeploymentManagerFactory,
metricFactory,
metadata,
metadata.DreamDaemonSettings!);
try
Expand All @@ -357,6 +378,7 @@ public async ValueTask<IInstance> CreateInstance(IBridgeRegistrar bridgeRegistra
repoManager,
remoteDeploymentManagerFactory,
asyncDelayer,
metricFactory,
loggerFactory.CreateLogger<DreamMaker>(),
sessionConfiguration,
metadata);
Expand Down
25 changes: 25 additions & 0 deletions src/Tgstation.Server.Host/Components/InstanceManager.cs
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,8 @@
using Microsoft.Extensions.Logging;
using Microsoft.Extensions.Options;

using Prometheus;

using Tgstation.Server.Api.Models;
using Tgstation.Server.Common;
using Tgstation.Server.Common.Extensions;
Expand Down Expand Up @@ -150,6 +152,11 @@ sealed class InstanceManager :
/// </summary>
readonly CancellationTokenSource shutdownCancellationTokenSource;

/// <summary>
/// The count of online instances.
/// </summary>
readonly Gauge onlineInstances;

/// <summary>
/// The original <see cref="IConsole.Title"/> of <see cref="console"/>.
/// </summary>
Expand Down Expand Up @@ -180,6 +187,8 @@ sealed class InstanceManager :
/// <param name="swarmServiceController">The value of <see cref="swarmServiceController"/>.</param>
/// <param name="console">The value of <see cref="console"/>.</param>
/// <param name="platformIdentifier">The value of <see cref="platformIdentifier"/>.</param>
/// <param name="metricFactory">The <see cref="IMetricFactory"/> used to create metrics.</param>
/// <param name="collectorRegistry">The <see cref="ICollectorRegistry"/> to use.</param>
/// <param name="generalConfigurationOptions">The <see cref="IOptions{TOptions}"/> containing the value of <see cref="generalConfiguration"/>.</param>
/// <param name="swarmConfigurationOptions">The <see cref="IOptions{TOptions}"/> containing the value of <see cref="swarmConfiguration"/>.</param>
/// <param name="internalConfigurationOptions">The <see cref="IOptions{TOptions}"/> containing the value of <see cref="internalConfiguration"/>.</param>
Expand All @@ -197,6 +206,8 @@ public InstanceManager(
ISwarmServiceController swarmServiceController,
IConsole console,
IPlatformIdentifier platformIdentifier,
IMetricFactory metricFactory,
ICollectorRegistry collectorRegistry,
IOptions<GeneralConfiguration> generalConfigurationOptions,
IOptions<SwarmConfiguration> swarmConfigurationOptions,
IOptions<InternalConfiguration> internalConfigurationOptions,
Expand All @@ -214,19 +225,30 @@ public InstanceManager(
this.swarmServiceController = swarmServiceController ?? throw new ArgumentNullException(nameof(swarmServiceController));
this.console = console ?? throw new ArgumentNullException(nameof(console));
this.platformIdentifier = platformIdentifier ?? throw new ArgumentNullException(nameof(platformIdentifier));
ArgumentNullException.ThrowIfNull(metricFactory);
ArgumentNullException.ThrowIfNull(collectorRegistry);
generalConfiguration = generalConfigurationOptions?.Value ?? throw new ArgumentNullException(nameof(generalConfigurationOptions));
swarmConfiguration = swarmConfigurationOptions?.Value ?? throw new ArgumentNullException(nameof(swarmConfigurationOptions));
internalConfiguration = internalConfigurationOptions?.Value ?? throw new ArgumentNullException(nameof(internalConfigurationOptions));
this.logger = logger ?? throw new ArgumentNullException(nameof(logger));

originalConsoleTitle = console.Title;

onlineInstances = metricFactory.CreateGauge("tgs_online_instances", "The total number of instances online");

instances = new Dictionary<long, ReferenceCountingContainer<IInstance, InstanceWrapper>>();
bridgeHandlers = new Dictionary<string, IBridgeHandler>();
readyTcs = new TaskCompletionSource();
instanceStateChangeSemaphore = new SemaphoreSlim(1);
startupCancellationTokenSource = new CancellationTokenSource();
shutdownCancellationTokenSource = new CancellationTokenSource();

collectorRegistry.AddBeforeCollectCallback(async cancellationToken =>
{
using (await SemaphoreSlimContext.Lock(instanceStateChangeSemaphore, cancellationToken))
foreach (var container in instances.Values)
container.Instance.Watchdog.RunMetricsScrape();
});
}

/// <inheritdoc />
Expand Down Expand Up @@ -396,6 +418,7 @@ await databaseContextFactory.UseContext(
finally
{
await container.Instance.DisposeAsync();
onlineInstances.Dec();
}
}
}
Expand Down Expand Up @@ -426,6 +449,8 @@ public async ValueTask OnlineInstance(Models.Instance metadata, CancellationToke
instances.Add(
instanceId,
new ReferenceCountingContainer<IInstance, InstanceWrapper>(instance));

onlineInstances.Inc();
}
catch (Exception ex)
{
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,8 @@

using Microsoft.Extensions.Logging;

using Prometheus;

using Tgstation.Server.Api.Models;
using Tgstation.Server.Api.Models.Internal;
using Tgstation.Server.Common.Extensions;
Expand Down Expand Up @@ -127,6 +129,16 @@ sealed class SessionControllerFactory : ISessionControllerFactory
/// </summary>
readonly SessionConfiguration sessionConfiguration;

/// <summary>
/// The number of sessions launched.
/// </summary>
readonly Counter sessionsLaunched;

/// <summary>
/// The time the current session was launched.
/// </summary>
readonly Gauge lastSessionLaunch;

/// <summary>
/// The <see cref="Api.Models.Instance"/> for the <see cref="SessionControllerFactory"/>.
/// </summary>
Expand Down Expand Up @@ -185,6 +197,7 @@ async ValueTask PortBindTest(ushort port, EngineType engineType, CancellationTok
/// <param name="eventConsumer">The value of <see cref="eventConsumer"/>.</param>
/// <param name="asyncDelayer">The value of <see cref="asyncDelayer"/>.</param>
/// <param name="dotnetDumpService">The value of <see cref="dotnetDumpService"/>.</param>
/// <param name="metricFactory">The <see cref="IMetricFactory"/> used to create metrics.</param>
/// <param name="loggerFactory">The value of <see cref="loggerFactory"/>.</param>
/// <param name="logger">The value of <see cref="logger"/>.</param>
/// <param name="sessionConfiguration">The value of <see cref="sessionConfiguration"/>.</param>
Expand All @@ -204,6 +217,7 @@ public SessionControllerFactory(
IEventConsumer eventConsumer,
IAsyncDelayer asyncDelayer,
IDotnetDumpService dotnetDumpService,
IMetricFactory metricFactory,
ILoggerFactory loggerFactory,
ILogger<SessionControllerFactory> logger,
SessionConfiguration sessionConfiguration,
Expand All @@ -224,10 +238,14 @@ public SessionControllerFactory(
this.eventConsumer = eventConsumer ?? throw new ArgumentNullException(nameof(eventConsumer));
this.asyncDelayer = asyncDelayer ?? throw new ArgumentNullException(nameof(asyncDelayer));
this.dotnetDumpService = dotnetDumpService ?? throw new ArgumentNullException(nameof(dotnetDumpService));
ArgumentNullException.ThrowIfNull(metricFactory);
this.loggerFactory = loggerFactory ?? throw new ArgumentNullException(nameof(loggerFactory));
this.logger = logger ?? throw new ArgumentNullException(nameof(logger));
this.sessionConfiguration = sessionConfiguration ?? throw new ArgumentNullException(nameof(sessionConfiguration));
this.instance = instance ?? throw new ArgumentNullException(nameof(instance));

sessionsLaunched = metricFactory.CreateCounter("tgs_sessions_launched", "The number of game server processes created");
lastSessionLaunch = metricFactory.CreateGauge("tgs_session_start_time", "The UTC unix timestamp the most recent session was started");
}

/// <inheritdoc />
Expand Down Expand Up @@ -368,6 +386,12 @@ public async ValueTask<ISessionController> LaunchNew(
false,
apiValidate);

if (!apiValidate)
{
sessionsLaunched.Inc();
lastSessionLaunch.SetToCurrentTimeUtc();
}

return sessionController;
}
catch
Expand Down
Loading

0 comments on commit d61cd57

Please sign in to comment.