Skip to content

Commit

Permalink
Refactor dynamic configuration, simplify
Browse files Browse the repository at this point in the history
- Fix deduped reporting
  • Loading branch information
mariuswilms committed Feb 25, 2025
1 parent 88b7250 commit a38d919
Show file tree
Hide file tree
Showing 15 changed files with 222 additions and 269 deletions.
7 changes: 5 additions & 2 deletions .env.example
Original file line number Diff line number Diff line change
Expand Up @@ -10,10 +10,13 @@
# Controls the user agent string, by default "Tobey/0".
# TOBEY_USER_AGENT="Tobey/0"

# Enable dynamic re-configuration during run time. By default disabled.
# TOBEY_DYNAMIC_CONFIG=yes

# DSN specifying where crawl results should be stored, by default no results are stored. Here we store results
# in the "results" directory, relative to the current working directory.
TOBEY_RESULTS_DSN=disk://results
# TOBEY_RESULTS_DSN=webhook://host/path
TOBEY_RESULT_REPORTER_DSN=disk://results
# TOBEY_RESULT_REPORTER_DSN=webhook://host/path

# DSN for progress reporting. By default, a console progress reporter is used. Uncomment to report progress to the
# Factorial service or disable progress reporting.
Expand Down
20 changes: 12 additions & 8 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -234,22 +234,26 @@ the results to a configured webhook endpoint. [Webhooks](https://mailchimp.com/e
TOBEY_RESULTS_DSN=webhook://example.org/webhook
```

For the webhook method, **dynamic re-configuration** is supported. This means that you
can configure the webhook endpoint on a per-request basis. Dynamic re-configuration is disabled
by default, for security reasons. It can be enabled by adding `enable_dynamic_config` to the DSN, if
you can trust the users that submit the crawl requests, i.e. if tobey is deployed as an internal service.
### Dynamic Re-configuration

Tobey supports dynamic re-configuration of the result reporter at runtime. This
means that the result reporter can be configured on a per-request basis while the
crawler is running, instead of only once at startup. This is useful if different
crawl requests should deliver their results to different destinations.

**Dynamic re-configuration is disabled by default, for security reasons. Enable only if you can trust the users that submit the crawl requests, i.e. if tobey is deployed as an internal service.**


```sh
TOBEY_RESULTS_DSN=webhook://example.org/webhook?enable_dynamic_config # with default endpoint
TOBEY_RESULTS_DSN=webhook://?enable_dynamic_config # without default endpoint
TOBEY_DYNAMIC_CONFIG=yes
```

You can then specify the webhook endpoint in the crawl request:
You can then use the `result_reporter_dsn` field in the crawl request to specify varying webhook endpoints:

```jsonc
{
"url": "https://example.org",
"results_dsn": "webhook://example.org/webhook"
"result_reporter_dsn": "webhook://example.org/webhook"
}
```

Expand Down
10 changes: 3 additions & 7 deletions api.go
Original file line number Diff line number Diff line change
Expand Up @@ -72,8 +72,6 @@ type APIRequest struct {

UserAgent string `json:"user_agent"`

WebhookResultStoreConfig *WebhookResultReporterConfig `json:"webhook"`

// If true, we'll bypass the robots.txt check, however we'll still
// download the file to look for sitemaps.
SkipRobots bool `json:"skip_robots"`
Expand All @@ -84,6 +82,9 @@ type APIRequest struct {

// A list of authentication configurations, that are used in the run.
AuthConfigs []*AuthConfig `json:"auth"`

// The DSN of the result reporter to use as dynamic configuration.
ResultReporterDSN string `json:"result_reporter_dsn"`
}

func (req *APIRequest) GetRun() (string, error) {
Expand Down Expand Up @@ -217,11 +218,6 @@ func (req *APIRequest) Validate() bool {
}
}
}
if req.WebhookResultStoreConfig != nil {
if req.WebhookResultStoreConfig.Endpoint == "" {
return false
}
}
return true
}

Expand Down
77 changes: 77 additions & 0 deletions configure.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,77 @@
package main

import (
"flag"
"log/slog"
"os"
"strconv"
"strings"
)

// configure initializes the package-level settings from command line flags
// and TOBEY_* environment variables. For the listen host and port, a command
// line flag takes precedence over the corresponding environment variable.
//
// It panics if TOBEY_PORT or TOBEY_WORKERS is set to a value that is not an
// integer.
func configure() {
	// The zero values ("" and 0) double as "flag not given".
	hostFlag := flag.String("host", "", "Host interface to bind the HTTP server to")
	portFlag := flag.Int("port", 0, "Port to bind the HTTP server to")
	flag.Parse()

	// mustInt converts an integer setting, panicking on malformed input.
	mustInt := func(raw string) int {
		n, err := strconv.Atoi(raw)
		if err != nil {
			panic(err)
		}
		return n
	}

	if os.Getenv("TOBEY_DEBUG") == "true" {
		Debug = true
	}
	if os.Getenv("TOBEY_SKIP_CACHE") == "true" {
		SkipCache = true
		slog.Info("Skipping cache.")
	}

	// Host: flag first, environment second, otherwise keep the default.
	host := *hostFlag
	if host == "" {
		host = os.Getenv("TOBEY_HOST")
	}
	if host != "" {
		ListenHost = host
	}

	// Port: flag first, environment second, otherwise keep the default.
	if *portFlag != 0 {
		ListenPort = *portFlag
	} else if raw := os.Getenv("TOBEY_PORT"); raw != "" {
		ListenPort = mustInt(raw)
	}

	if raw := os.Getenv("TOBEY_WORKERS"); raw != "" {
		NumVisitWorkers = mustInt(raw)
	}
	slog.Info("Number of visit workers configured.", "num", NumVisitWorkers)

	if agent := os.Getenv("TOBEY_USER_AGENT"); agent != "" {
		UserAgent = agent
		slog.Info("Using custom user agent.", "user_agent", UserAgent)
	}

	switch os.Getenv("TOBEY_DYNAMIC_CONFIG") {
	case "true", "yes", "y", "on":
		DynamicConfig = true
		slog.Info("Dynamic configuration enabled!")
	}

	// TOBEY_TELEMETRY is searched for keywords by substring, so several
	// features can be enabled with a single value.
	telemetry := os.Getenv("TOBEY_TELEMETRY")
	requested := func(keywords ...string) bool {
		for _, kw := range keywords {
			if strings.Contains(telemetry, kw) {
				return true
			}
		}
		return false
	}
	if requested("traces", "tracing") {
		UseTracing = true
		slog.Info("Tracing enabled.")
	}
	if requested("metrics") {
		UseMetrics = true
		slog.Info("Metrics enabled.")
	}
	if requested("pulse") {
		UsePulse = true
		slog.Info("High Frequency Metrics (Pulse) enabled.")
	}
}
5 changes: 4 additions & 1 deletion examples/.env.example.factorial
Original file line number Diff line number Diff line change
Expand Up @@ -11,10 +11,13 @@
# specific user agent.
TOBEY_USER_AGENT="WebsiteStandardsBot/1.0"

# Enable dynamic re-configuration during run time. By default disabled.
TOBEY_DYNAMIC_CONFIG=yes

# DSN specifying where crawl results should be stored, by default no results are stored. Here we store results
# in the "results" directory, relative to the current working directory.
# TOBEY_RESULTS_DSN=disk://results
TOBEY_RESULTS_DSN=webhook://?enable_dynamic_config
TOBEY_RESULT_REPORTER_DSN=noop://

# DSN for progress reporting. By default no progress is reported, uncomment to report progress to the
# Factorial service.
Expand Down
100 changes: 20 additions & 80 deletions main.go
Original file line number Diff line number Diff line change
Expand Up @@ -8,15 +8,12 @@ package main
import (
"context"
"errors"
"flag"
"fmt"
"log"
"log/slog"
"net/http"
"os"
"os/signal"
"strconv"
"strings"
"time"
"tobey/internal/ctrlq"

Expand All @@ -25,7 +22,8 @@ import (
"go.opentelemetry.io/contrib/instrumentation/runtime"
)

// These variables can be controlled via environment variables.
// These variables can be controlled via environment variables and are set
// via configure().
var (
// Debug enables or disables debug mode, this can be controlled
// via the environment variable TOBEY_DEBUG.
Expand All @@ -35,14 +33,6 @@ var (
// TOBEY_SKIP_CACHE environment variable.
SkipCache = false

// These can be controlled via the TOBEY_TELEMETRY environment variable.
UseTracing = false
UseMetrics = false
UsePulse = false // High Frequency Metrics can be enabled by adding "pulse".

// NumVisitWorkers hard codes the number of workers we start at startup.
NumVisitWorkers int = 5

// ListenHost is the host where the main HTTP server listens and the API is served,
// this can be controlled via the TOBEY_HOST environment variable. An empty
// string means "listen on all interfaces".
Expand All @@ -52,9 +42,22 @@ var (
// be controlled via the TOBEY_PORT environment variable.
ListenPort int = 8080

// NumVisitWorkers is the number of visit workers we start at startup, this
// can be controlled via the TOBEY_WORKERS environment variable.
NumVisitWorkers int = 5

// UserAgent to be used with all HTTP requests unless overridden per run.
// Can be controlled via TOBEY_USER_AGENT env var.
UserAgent = "Tobey/0"

// DynamicConfig allows the user to reconfigure e.g. the result reporter
// at runtime via the API. This can be controlled via the TOBEY_DYNAMIC_CONFIG
// environment variable and should only be enabled if the users of the API
// can be fully trusted!
DynamicConfig = false

// These can be controlled via the TOBEY_TELEMETRY environment variable.
UseTracing = false
UseMetrics = false
UsePulse = false // High Frequency Metrics can be enabled by adding "pulse".
)

const (
Expand Down Expand Up @@ -82,66 +85,6 @@ const (
PulseEndpoint = "http://localhost:8090"
)

// configure initializes the package-level settings from command line flags
// and TOBEY_* environment variables. For the listen host and port, a command
// line flag takes precedence over the corresponding environment variable.
//
// It panics if TOBEY_WORKERS or TOBEY_PORT is set to a value that is not an
// integer.
func configure() {
	// The zero values ("" and 0) double as "flag not given".
	hostFlag := flag.String("host", "", "Host interface to bind the HTTP server to")
	portFlag := flag.Int("port", 0, "Port to bind the HTTP server to")
	flag.Parse()

	// mustInt converts an integer setting, panicking on malformed input.
	mustInt := func(raw string) int {
		n, err := strconv.Atoi(raw)
		if err != nil {
			panic(err)
		}
		return n
	}

	if os.Getenv("TOBEY_DEBUG") == "true" {
		Debug = true
	}
	if os.Getenv("TOBEY_SKIP_CACHE") == "true" {
		SkipCache = true
		slog.Info("Skipping cache.")
	}
	if raw := os.Getenv("TOBEY_WORKERS"); raw != "" {
		NumVisitWorkers = mustInt(raw)
	}

	// TOBEY_TELEMETRY is searched for keywords by substring, so several
	// features can be enabled with a single value.
	telemetry := os.Getenv("TOBEY_TELEMETRY")
	requested := func(keywords ...string) bool {
		for _, kw := range keywords {
			if strings.Contains(telemetry, kw) {
				return true
			}
		}
		return false
	}
	if requested("traces", "tracing") {
		UseTracing = true
		slog.Info("Tracing enabled.")
	}
	if requested("metrics") {
		UseMetrics = true
		slog.Info("Metrics enabled.")
	}
	if requested("pulse") {
		UsePulse = true
		slog.Info("High Frequency Metrics (Pulse) enabled.")
	}

	if agent := os.Getenv("TOBEY_USER_AGENT"); agent != "" {
		UserAgent = agent
		slog.Info("Using custom user agent.", "user_agent", UserAgent)
	}

	// Host: flag first, environment second, otherwise keep the default.
	host := *hostFlag
	if host == "" {
		host = os.Getenv("TOBEY_HOST")
	}
	if host != "" {
		ListenHost = host
	}

	// Port: flag first, environment second, otherwise keep the default.
	if *portFlag != 0 {
		ListenPort = *portFlag
	} else if raw := os.Getenv("TOBEY_PORT"); raw != "" {
		ListenPort = mustInt(raw)
	}
}

func main() {
slog.Info("Tobey starting...")
tear, down := tears.New()
Expand Down Expand Up @@ -197,12 +140,9 @@ func main() {
tear(queue.Close)
}

if _, ok := os.LookupEnv("TOBEY_RESULTS_DSN"); !ok {
if _, ok := os.LookupEnv("TOBEY_RESULT_DSN"); ok {
slog.Debug("You have a typo in your env var: TOBEY_RESULTS_DSN is not set, but TOBEY_RESULT_DSN is set. Please use TOBEY_RESULTS_DSN instead.")
}
}
rs, err := CreateResultReporter(os.Getenv("TOBEY_RESULTS_DSN"))
// Set up the default result reporter. If dynamic config is enabled, we
// will use the API to change the result reporter at runtime.
defaultrr, err := CreateResultReporter(ctx, os.Getenv("TOBEY_RESULT_REPORTER_DSN"), nil, nil)
if err != nil {
panic(err)
}
Expand All @@ -217,16 +157,16 @@ func main() {
NumVisitWorkers,
runs,
queue,
defaultrr,
progress,
rs,
)
tear(vpool.Close)

// Set up and start the main API server.
slog.Info("Starting HTTP API server...", "host", ListenHost, "port", ListenPort)
apiserver := &http.Server{
Addr: fmt.Sprintf("%s:%d", ListenHost, ListenPort),
Handler: setupRoutes(runs, queue, progress, rs),
Handler: setupRoutes(runs, queue, defaultrr, progress),
}
go func() {
if err := apiserver.ListenAndServe(); !errors.Is(err, http.ErrServerClosed) {
Expand Down
8 changes: 4 additions & 4 deletions main_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -22,9 +22,9 @@ func setupTestServer(ctx context.Context, t *testing.T) *httptest.Server {
t.Fatal(err)
}

rs, err := CreateResultReporter("noop://")
defaultrr, err := CreateResultReporter(ctx, "noop://", nil, nil)
if err != nil {
t.Fatal(err)
panic(err)
}

progress, err := CreateProgressReporter("noop://")
Expand All @@ -37,11 +37,11 @@ func setupTestServer(ctx context.Context, t *testing.T) *httptest.Server {
1,
runs,
queue,
defaultrr,
progress,
rs,
)

return httptest.NewServer(setupRoutes(runs, queue, progress, rs))
return httptest.NewServer(setupRoutes(runs, queue, defaultrr, progress))
}

// TestCrawlRequestSubmission tries to perform a full integration test. On one
Expand Down
Loading

0 comments on commit a38d919

Please sign in to comment.