From f8be93d38e1fec60ca79b02b1790acd58779eb76 Mon Sep 17 00:00:00 2001
From: Marius Wilms
Date: Fri, 30 Aug 2024 17:52:57 +0200
Subject: [PATCH 01/57] Tobey: Increase throughput

- Bring in the pulse tool to monitor high-frequency metrics.
- Move all queuing into Redis, lose the dependency on RabbitMQ.
- Add a retrying HTTP client.
- Move sitemap and robots handling out of the collector, remove flags.
- Add per-host adaptive rate limiting.
- Add support for Prometheus metrics.
---
 .env.example                            |   7 +-
 Makefile                                |   6 +-
 README.md                               |  41 +-
 api.go                                  |  24 +-
 cmd/pulse/main.go                       | 115 ++++++
 collector.go                            | 100 -----
 compose.yml                             |  35 ++
 connections.go                          |  26 --
 go.mod                                  |  36 +-
 go.sum                                  | 212 ++++------
 host.go                                 | 101 +++++
 host_test.go                            |  19 +
 httpclient.go                           | 132 +++++--
 httpclient_test.go                      |  27 +-
 internal/collector/collector.go         |  73 ++--
 internal/collector/errors.go            |   4 +-
 internal/collector/httpbackend.go       |   4 +
 internal/collector/request.go           |   4 +-
 internal/collector/response.go          |   4 +
 limiter.go                              |  67 ----
 main.go                                 | 111 +++---
 observe.go                              |  46 +++
 otel.go                                 |   5 +-
 progress.go                             |  46 ++-
 prometheus.yml                          |   6 +
 robots.go                               | 122 +++---
 robots_test.go                          |  41 ++
 run.go                                  | 147 ++++++--
 runmanager.go                           |  32 +-
 runstore.go                             | 140 -------
 scripts/errorserver.js                  |  37 ++
 sitemap.go                              |  80 +++-
 store.go                                |  49 +++
 store_memory.go                         | 122 ++++++
 store_redis.go                          |  52 +++
 runstore_test.go => store_redis_test.go |   8 +-
 visitworker.go                          | 195 +++++-----
 workqueue.go                            | 368 ++++--------------
 workqueue_memory.go                     | 472 ++++++++++++++++++++++++
 39 files changed, 2016 insertions(+), 1100 deletions(-)
 create mode 100644 cmd/pulse/main.go
 delete mode 100644 collector.go
 create mode 100644 compose.yml
 create mode 100644 host.go
 create mode 100644 host_test.go
 delete mode 100644 limiter.go
 create mode 100644 observe.go
 create mode 100644 prometheus.yml
 create mode 100644 robots_test.go
 delete mode 100644 runstore.go
 create mode 100755 scripts/errorserver.js
 create mode 100644 store.go
 create mode 100644 store_memory.go
 create mode 100644 store_redis.go
 rename runstore_test.go => store_redis_test.go (80%)
 create mode 100644 workqueue_memory.go

diff --git a/.env.example b/.env.example
index 95db1d8..7a8f019 100644
--- a/.env.example
+++ b/.env.example
@@ -2,13 +2,12 @@
 # TOBEY_SKIP_CACHE=false
 
 TOBEY_PROGRESS_DSN=http://progress:8080
-TOBEY_RABBITMQ_DSN=amqp://guest:guest@rabbitmq:5672/
 TOBEY_REDIS_DSN=redis:6379/0
 
-# TOBEY_REQS_PER_S=2
-
 # A space separated list of telemetry to send. Available telemetry: metrics,
-# traces. To disable telemetry provide an empty value. When enabling telemetry
+# traces, pulse. To disable telemetry provide an empty value. When enabling telemetry
 # appropriate OTLP endpoints must be provided as well.
 TOBEY_TELEMETRY="traces"
+
 OTEL_EXPORTER_OTLP_TRACES_ENDPOINT=http://jaeger:4318/v1/traces
+OTEL_EXPORTER_OTLP_METRICS_ENDPOINT=
diff --git a/Makefile b/Makefile
index becd50b..83cbde0 100644
--- a/Makefile
+++ b/Makefile
@@ -1,6 +1,10 @@
 .PHONY: dev
 dev:
-	TOBEY_DEBUG=true go run .
+	TOBEY_SKIP_CACHE=true TOBEY_DEBUG=true TOBEY_HOST=127.0.0.1 go run .
+
+.PHONY: pulse
+pulse:
+	go run cmd/pulse/main.go
 
 .PHONY: test
 test:
diff --git a/README.md b/README.md
index 5cf978c..2e2a166 100644
--- a/README.md
+++ b/README.md
@@ -10,8 +10,7 @@ crawled.
 
 The service vertical scaling can be controlled by the number of workers used
 for crawling. The service is horizontally scalable by adding more instances on
 nodes in a cluster. In horizontal scaling, any instances can receive crawl requests,
-for easy load balancing. The instances will coordinate with each other via a
-RabbitMQ.
+for easy load balancing. The instances will coordinate with each other via Redis.
 
 ## Features
 
@@ -24,9 +23,40 @@ RabbitMQ.
 - Per host rate limiting, even when multiple instances are used.
 - Full support for OpenTelemetry.
 
-## Quickstart
-
-To quickly try out the service, ensure you have Go installed. And run the following commands:
+## Constraints
+
+- Although Tobey can be configured - on a per-run basis - to crawl websites
+  behind HTTP basic auth, **it does not support fetching personalized
+  content**. It is expected that the website is generally publicly available,
+  and that the content is the same for all users. When a website uses HTTP
+  basic auth, it must do so only to prevent early access.
+
+## Architecture
+
+- The service optimizes for throughput per host. The rate limit and the number
+  of requests a host can handle in a timely fashion are what mainly limit
+  throughput. In order to maximize throughput we have to use the rate limit to
+  its fullest. We also have to find the maximum rate limit per host; for that
+  we must be able to adjust the rate limit per host dynamically (see the
+  sketch below).
+- Runs are transient: they get evicted both from local memory and the store
+  after a certain time, or whenever we hit a fixed limit.
+- The instance must provide enough local memory and storage so information
+  about the hosts we access during runs can be kept.
+- However, it can be assumed that the number of hosts is sufficiently large
+  that we couldn't keep a goroutine holding a consumer for each host alive
+  persistently. Goroutines are cheap, but hard to keep alive when they
+  interact with external resources.
+- For the same reason a worker process per host isn't suitable; also, one
+  worker per host wouldn't be enough. We are better set up with a pool of
+  workers that can handle many requests to a host at the same time.
+- We cannot pre-calculate the delay after which an incoming request may be
+  processed, as the rate limit per host is dynamic and can change at any time,
+  i.e. when the host returns headers that allow us to adjust the rate limit.
+  We want to adjust dynamically, as one of the main goals is throughput per
+  host.
+- Although the semantically correct way would be to have everything scoped to
+  a Run, i.e. Robots, Sitemap, etc., we will not do this. This approach would
+  (a) lead to a deep object graph (Run -> Host -> Robots, Sitemap, etc.) in
+  which Run becomes a kind of god object, and (b) make it hard to share safe
+  information between runs, preventing us from using a global cache.
+- Information about a host's rate limiting state is not directly stored in the
+  HostStore and passed to the work queue; instead the work queue will use the
+  HostStore. The work queue hides the dynamic adaptation to the rate limit.
+  Nobody else needs to know about it.
+- Retrieved sitemaps and robot control files are not stored in the HostStore
+  but in a global cache of the HTTP client. Independent of the expiry set for
+  a robot control file, it will be cached in-memory for a certain time, as we
+  have to check it for every request to a host. This adds another layer of
+  caching. When changing the robot control file, the cache can be invalidated
+  by sending the XXX signal to all instances of tobey.
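+
+To illustrate the adaptive behavior described above, the following is a
+minimal sketch of a per-host limiter. It is illustrative only - names and
+numbers are made up, and the actual implementation is hidden behind the work
+queue:
+
+```go
+package limiter
+
+import (
+	"net/http"
+	"strconv"
+	"sync"
+	"time"
+)
+
+// hostLimiter spaces out requests to a single host.
+type hostLimiter struct {
+	mu    sync.Mutex
+	delay time.Duration // current minimum delay between two requests
+	next  time.Time     // earliest time the next request may start
+}
+
+func newHostLimiter(initial time.Duration) *hostLimiter {
+	return &hostLimiter{delay: initial}
+}
+
+// reserve blocks until the next request to the host may be sent.
+func (l *hostLimiter) reserve() {
+	l.mu.Lock()
+	wait := time.Until(l.next)
+	if wait < 0 {
+		wait = 0
+	}
+	l.next = time.Now().Add(wait + l.delay)
+	l.mu.Unlock()
+
+	time.Sleep(wait)
+}
+
+// observe adapts the delay: it backs off on 429/503, honors an explicit
+// Retry-After header, and cautiously speeds up again on success.
+func (l *hostLimiter) observe(res *http.Response) {
+	l.mu.Lock()
+	defer l.mu.Unlock()
+
+	switch res.StatusCode {
+	case http.StatusTooManyRequests, http.StatusServiceUnavailable:
+		if secs, err := strconv.Atoi(res.Header.Get("Retry-After")); err == nil {
+			l.delay = time.Duration(secs) * time.Second
+		} else {
+			l.delay *= 2 // no usable hint, back off exponentially
+		}
+	default:
+		if l.delay > 100*time.Millisecond {
+			l.delay = l.delay * 9 / 10 // slowly probe for a higher rate
+		}
+	}
+}
+```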
 
 ```sh
 # In the first terminal start the service. 
@@ -47,7 +77,6 @@ variables are available:
 |----------------|----------------|------------------|----------------------------------|
 | `TOBEY_DEBUG` | `false` | `true`, `false` | Controls debug mode. |
 | `TOBEY_SKIP_CACHE` | `false` | `true`, `false` | Controls caching access. |
-| `TOBEY_RABBITMQ_DSN` | empty | i.e. `amqp://guest:guest@rabbitmq:5672/` | DSN to reach a RabbitMQ instance. Only needed when operating multiple instances. |
 | `TOBEY_REDIS_DSN` | empty | i.e. `redis://localhost:6379` | DSN to reach a Redis instance. Only needed when operating multiple instances. |
 | `TOBEY_PROGRESS_DSN` | empty | i.e. `http://localhost:9020` | DSN where to reach a progress service. When configured tobey will send progress updates there. |
 | `TOBEY_REQS_PER_S` | 2 | i.e. `4` | Maximum number of allowed requests per second per host. |
diff --git a/api.go b/api.go
index e8a05d6..acd9f5c 100644
--- a/api.go
+++ b/api.go
@@ -3,6 +3,7 @@
 package main
 
 import (
+	"crypto/sha256"
 	"encoding/base64"
 	"fmt"
 	"log/slog"
@@ -17,7 +18,7 @@ const (
 )
 
 type AuthConfig struct {
-	Host     string `json:"host"`
+	Host     *Host  `json:"host"`
 	Method   string `json:"method"`
 
 	// If method is "basic"
@@ -27,19 +28,30 @@ type AuthConfig struct {
 
 // GetHeader returns the value of the Authorization header for the given
 // authentication configuration.
-func (auth *AuthConfig) GetHeader() (string, bool) {
-	switch auth.Method {
+func (ac *AuthConfig) GetHeader() (string, bool) {
+	switch ac.Method {
 	case AuthMethodBasic:
-		token := fmt.Sprintf("%s:%s", auth.Username, auth.Password)
+		token := fmt.Sprintf("%s:%s", ac.Username, ac.Password)
 		token = base64.StdEncoding.EncodeToString([]byte(token))
 
 		return fmt.Sprintf("Basic %s", token), true
 	default:
-		slog.Warn("Unknown auth method.", "method", auth.Method)
+		slog.Warn("Unknown auth method.", "method", ac.Method)
 		return "", false
 	}
 }
 
+// Hash returns a hash of the authentication configuration. This is used to
+// uniquely identify the configuration.
+func (ac *AuthConfig) Hash() []byte {
+	sum := sha256.Sum256([]byte(fmt.Sprintf("%#v", ac)))
+	return sum[:]
+}
+
+// Matches checks if this AuthConfig should be used for the given Host.
+func (ac *AuthConfig) Matches(h *Host) bool {
+	return h.Name == ac.Host.Name && h.Port == ac.Host.Port
+}
+
 type APIRequest struct {
 	// We accept either a valid UUID as a string, or as an integer. If left
 	// empty, we'll generate one.
@@ -158,7 +170,7 @@ func (req *APIRequest) GetAuthConfigs() []*AuthConfig {
 
 		config := &AuthConfig{
 			Method:   "basic",
-			Host:     p.Hostname(),
+			Host:     NewHostFromURL(p),
 			Username: p.User.Username(),
 			Password: pass,
 		}
diff --git a/cmd/pulse/main.go b/cmd/pulse/main.go
new file mode 100644
index 0000000..2ac292f
--- /dev/null
+++ b/cmd/pulse/main.go
@@ -0,0 +1,115 @@
+package main
+
+import (
+	"context"
+	"errors"
+	"fmt"
+	"io"
+	"log/slog"
+	"net/http"
+	"os"
+	"os/signal"
+	"strconv"
+
+	"github.com/charmbracelet/bubbles/spinner"
+	tea "github.com/charmbracelet/bubbletea"
+)
+
+// Pulse provides a small tool that shows requests per second and average
+// response times; it exposes a small HTTP API to receive metrics.
+
+const (
+	ListenHost string = "127.0.0.1"
+	ListenPort int    = 8090
+)
+
+// A command that waits for activity on a channel.
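+// This follows the usual Bubble Tea pattern for external events: the returned
+// command blocks on the channel, and Update re-issues it after every received
+// value, so the program keeps listening for further updates.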
+func waitForActivity(sub chan int) tea.Cmd { + return func() tea.Msg { + return <-sub + } +} + +type model struct { + sub chan int // where we'll receive activity notifications + rps int + spinner spinner.Model + quitting bool +} + +func (m model) Init() tea.Cmd { + return tea.Batch( + m.spinner.Tick, + waitForActivity(m.sub), // wait for activity + ) +} + +func (m model) Update(msg tea.Msg) (tea.Model, tea.Cmd) { + switch msg.(type) { + case tea.KeyMsg: + m.quitting = true + return m, tea.Quit + case int: + m.rps = msg.(int) + return m, waitForActivity(m.sub) // wait for next event + case spinner.TickMsg: + var cmd tea.Cmd + m.spinner, cmd = m.spinner.Update(msg) + return m, cmd + default: + return m, nil + } +} + +func (m model) View() string { + s := fmt.Sprintf("\n %s Current Visit RPS: %d\n\n Press any key to exit\n", m.spinner.View(), m.rps) + if m.quitting { + s += "\n" + } + return s +} + +func main() { + slog.Info("Tobey Pulse is starting...") + + ctx, stop := signal.NotifyContext(context.Background(), os.Interrupt) + + apirouter := http.NewServeMux() + ch := make(chan int) + + apirouter.HandleFunc("POST /rps", func(w http.ResponseWriter, r *http.Request) { + w.Header().Set("Content-Type", "text/plain") + // This receives just an integer as plain text. Parse it. + + body, _ := io.ReadAll(r.Body) + v, _ := strconv.Atoi(string(body)) + + ch <- v + + r.Body.Close() + w.WriteHeader(http.StatusOK) + }) + + apiserver := &http.Server{ + Addr: fmt.Sprintf("%s:%d", ListenHost, ListenPort), + Handler: apirouter, + } + go func() { + if err := apiserver.ListenAndServe(); !errors.Is(err, http.ErrServerClosed) { + slog.Error("HTTP server error.", "error", err) + } + slog.Info("Stopped serving new API HTTP connections.") + }() + + p := tea.NewProgram(model{ + sub: ch, + spinner: spinner.New(), + }) + + if _, err := p.Run(); err != nil { + fmt.Println("could not start program:", err) + os.Exit(1) + } + <-ctx.Done() + stop() +} diff --git a/collector.go b/collector.go deleted file mode 100644 index dd4d329..0000000 --- a/collector.go +++ /dev/null @@ -1,100 +0,0 @@ -// Copyright 2024 Factorial GmbH. All rights reserved. - -package main - -import ( - "context" - "log/slog" - "tobey/internal/collector" - - "go.opentelemetry.io/otel/attribute" -) - -// CollectorConfig is the configuration for a collector.Collector. -type CollectorConfig struct { -} - -// getEnqueueFn returns the enqueue function, that will enqueue a single URL to -// be crawled. The enqueue function is called whenever a new URL is discovered -// by that Collector, i.e. by looking at all links in a crawled page HTML. -func getEnqueueFn(run *Run, q WorkQueue, progress Progress) collector.EnqueueFn { - - // The returned function takes the run context. - return func(ctx context.Context, c *collector.Collector, url string, flags uint8) error { - logger := slog.With("run", run.ID, "url", url) - tctx, span := tracer.Start(ctx, "enqueue_element") - defer span.End() - - span.SetAttributes(attribute.String("URL", url)) - // Ensure we never publish a URL twice for a single run. Not only does - // this help us not put unnecessary load on the queue, it also helps with - // ensuring there will only (mostly) be one result for a page. There is a slight - // chance that two processes enter this function with the same run and url, - // before one of them is finished. 
- if ok, err := c.IsVisitAllowed(url, flags); !ok { - if err == collector.ErrCheckInternal { - logger.Warn("Enqueue: Error checking if visit is allowed, not allowing visit.", "error", err) - } - logger.Debug("Enqueue: Not enqueuing visit, visit not allowed.", "error", err) - return nil - } - if run.HasSeenURL(tctx, url) { - // Do not need to enqueue an URL that has already been crawled, and its response - // can be served from cache. - // slog.Debug("Not enqueuing visit, URL already seen.", "run", c.Run, "url", url) - return nil - } - - logger.Debug("Publishing URL...") - err := q.PublishURL( - context.WithoutCancel(tctx), // The captured crawl run context. - // Passing the run ID to identify the crawl run, so when - // consumed the URL the run can be reconstructed by the RunManager. - run.ID, - url, - flags, - ) - - if flags&collector.FlagInternal == 0 { - progress.Update(ProgressUpdateMessagePackage{ - context.WithoutCancel(tctx), - ProgressUpdateMessage{ - ProgressStage, - ProgressStateQueuedForCrawling, - run.ID, - url, - }, - }) - } - - if err == nil { - run.SawURL(tctx, url) - logger.Debug("Enqueue: URL marked as seen.") - } else { - logger.Error("Enqueue: Error enqueuing visit.", "error", err) - } - return err - } -} - -// getCollectFn returns the collect function that is called once we have a -// result. Uses the information provided in the original crawl request, i.e. the -// WebhookConfig, that we have received via the queued message. -func getCollectFn(run *Run, hooks *WebhookDispatcher) collector.CollectFn { - - // The returned function takes the run context. - return func(ctx context.Context, c *collector.Collector, res *collector.Response, flags uint8) { - slog.Debug( - "Collect suceeded.", - "run", run.ID, - "url", res.Request.URL, - "response.body.length", len(res.Body), - "response.status", res.StatusCode, - ) - if flags&collector.FlagInternal == 0 { - if run.WebhookConfig != nil && run.WebhookConfig.Endpoint != "" { - hooks.Send(ctx, run.WebhookConfig, run.ID, res) - } - } - } -} diff --git a/compose.yml b/compose.yml new file mode 100644 index 0000000..cedbc9f --- /dev/null +++ b/compose.yml @@ -0,0 +1,35 @@ +# A docker compose file that sets up a Tobey instance that is instrumented and +# exposes metrics to Prometheus via /metrics endpoint. The metrics are then +# scraped by prometheus. Grafana is used to visualize the metrics scraped by +# Prometheus. + +services: + tobey: + ports: + - "8080:8080" + build: + context: . 
+ environment: + - TOBEY_TELEMETRY=metrics + volumes: + - tobey-cache:/cache + + prometheus: + image: prom/prometheus + ports: + - "9090:9090" + volumes: + - ./prometheus.yml:/etc/prometheus/prometheus.yml + + grafana: + image: grafana/grafana + ports: + - "3000:3000" + environment: + - GF_SECURITY_ADMIN_PASSWORD=admin + volumes: + - grafana-data:/var/lib/grafana + +volumes: + tobey-cache: + grafana-data: diff --git a/connections.go b/connections.go index 32d341d..970efec 100644 --- a/connections.go +++ b/connections.go @@ -12,7 +12,6 @@ import ( "github.com/cenkalti/backoff/v4" "github.com/kos-v/dsnparser" - amqp "github.com/rabbitmq/amqp091-go" "github.com/redis/go-redis/extra/redisotel/v9" "github.com/redis/go-redis/v9" ) @@ -60,28 +59,3 @@ func maybeRedis(ctx context.Context) (*redis.Client, error) { } return client, nil } - -func maybeRabbitMQ(ctx context.Context) (*amqp.Connection, error) { - dsn, ok := os.LookupEnv("TOBEY_RABBITMQ_DSN") - if !ok || dsn == "" { - return nil, nil - } - slog.Debug("Connecting to RabbitMQ...", "dsn", dsn) - - client, err := backoff.RetryNotifyWithData( - func() (*amqp.Connection, error) { - return amqp.Dial(dsn) - }, - backoff.WithContext(backoff.NewExponentialBackOff(), ctx), - func(err error, t time.Duration) { - slog.Info("Retrying RabbitMQ connection...", "error", err) - }, - ) - if err != nil { - return nil, fmt.Errorf("ultimately failed retrying RabitMQ connection: %w", err) - } - - slog.Debug("Connection to RabbitMQ established :)") - return client, nil - -} diff --git a/go.mod b/go.mod index 42b2cba..a06323a 100644 --- a/go.mod +++ b/go.mod @@ -8,15 +8,18 @@ require ( github.com/antchfx/htmlquery v1.3.2 github.com/antchfx/xmlquery v1.4.1 github.com/cenkalti/backoff/v4 v4.3.0 - github.com/go-redis/redis_rate/v10 v10.0.1 + github.com/charmbracelet/bubbles v0.19.0 + github.com/charmbracelet/bubbletea v0.27.1 github.com/google/uuid v1.6.0 github.com/gregjones/httpcache v0.0.0-20190611155906-901d90724c79 + github.com/hashicorp/go-retryablehttp v0.7.7 github.com/hashicorp/golang-lru/v2 v2.0.7 github.com/kennygrant/sanitize v1.2.4 github.com/kos-v/dsnparser v1.1.0 github.com/nlnwa/whatwg-url v0.4.1 + github.com/oxffaa/gopher-parse-sitemap v0.0.0-20191021113419-005d2eb1def4 github.com/peterbourgon/diskv v2.0.1+incompatible - github.com/rabbitmq/amqp091-go v1.10.0 + github.com/prometheus/client_golang v1.20.1 github.com/redis/go-redis/extra/redisotel/v9 v9.5.3 github.com/redis/go-redis/v9 v9.5.4 github.com/saintfish/chardet v0.0.0-20230101081208-5e3ef4b5456d @@ -38,20 +41,45 @@ require ( github.com/alicebob/gopher-json v0.0.0-20200520072559-a9ecdc9d1d3a // indirect github.com/andybalholm/cascadia v1.3.2 // indirect github.com/antchfx/xpath v1.3.1 // indirect + github.com/aymanbagabas/go-osc52/v2 v2.0.1 // indirect + github.com/beorn7/perks v1.0.1 // indirect github.com/bits-and-blooms/bitset v1.13.0 // indirect - github.com/cespare/xxhash/v2 v2.2.0 // indirect + github.com/cespare/xxhash/v2 v2.3.0 // indirect + github.com/charmbracelet/lipgloss v0.13.0 // indirect + github.com/charmbracelet/x/ansi v0.1.4 // indirect + github.com/charmbracelet/x/input v0.1.0 // indirect + github.com/charmbracelet/x/term v0.1.1 // indirect + github.com/charmbracelet/x/windows v0.1.0 // indirect github.com/dgryski/go-rendezvous v0.0.0-20200823014737-9f7001d12a5f // indirect + github.com/erikgeiser/coninput v0.0.0-20211004153227-1c3628e74d0f // indirect github.com/felixge/httpsnoop v1.0.4 // indirect github.com/go-logr/logr v1.4.2 // indirect github.com/go-logr/stdr v1.2.2 
// indirect github.com/golang/groupcache v0.0.0-20210331224755-41bb18bfe9da // indirect github.com/google/btree v1.1.2 // indirect github.com/grpc-ecosystem/grpc-gateway/v2 v2.20.0 // indirect + github.com/hashicorp/go-cleanhttp v0.5.2 // indirect + github.com/klauspost/compress v1.17.9 // indirect + github.com/lucasb-eyer/go-colorful v1.2.0 // indirect + github.com/mattn/go-isatty v0.0.20 // indirect + github.com/mattn/go-localereader v0.0.1 // indirect + github.com/mattn/go-runewidth v0.0.16 // indirect + github.com/muesli/ansi v0.0.0-20230316100256-276c6243b2f6 // indirect + github.com/muesli/cancelreader v0.2.2 // indirect + github.com/muesli/termenv v0.15.2 // indirect + github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 // indirect + github.com/prometheus/client_model v0.6.1 // indirect + github.com/prometheus/common v0.55.0 // indirect + github.com/prometheus/procfs v0.15.1 // indirect github.com/redis/go-redis/extra/rediscmd/v9 v9.5.3 // indirect + github.com/rivo/uniseg v0.4.7 // indirect + github.com/xo/terminfo v0.0.0-20220910002029-abceb7e1c41e // indirect github.com/yuin/gopher-lua v1.1.1 // indirect go.opentelemetry.io/otel/exporters/otlp/otlptrace v1.28.0 // indirect go.opentelemetry.io/proto/otlp v1.3.1 // indirect - golang.org/x/sys v0.22.0 // indirect + golang.org/x/exp v0.0.0-20240823005443-9b4947da3948 // indirect + golang.org/x/sync v0.8.0 // indirect + golang.org/x/sys v0.24.0 // indirect golang.org/x/text v0.16.0 // indirect google.golang.org/genproto/googleapis/api v0.0.0-20240701130421-f6361c86f094 // indirect google.golang.org/genproto/googleapis/rpc v0.0.0-20240701130421-f6361c86f094 // indirect diff --git a/go.sum b/go.sum index 92f18d0..6684d4b 100644 --- a/go.sum +++ b/go.sum @@ -1,67 +1,61 @@ -github.com/PuerkitoBio/goquery v1.9.1 h1:mTL6XjbJTZdpfL+Gwl5U2h1l9yEkJjhmlTeV9VPW7UI= -github.com/PuerkitoBio/goquery v1.9.1/go.mod h1:cW1n6TmIMDoORQU5IU/P1T3tGFunOeXEpGP2WHRwkbY= github.com/PuerkitoBio/goquery v1.9.2 h1:4/wZksC3KgkQw7SQgkKotmKljk0M6V8TUvA8Wb4yPeE= github.com/PuerkitoBio/goquery v1.9.2/go.mod h1:GHPCaP0ODyyxqcNoFGYlAprUFH81NuRPd0GX3Zu2Mvk= github.com/alicebob/gopher-json v0.0.0-20200520072559-a9ecdc9d1d3a h1:HbKu58rmZpUGpz5+4FfNmIU+FmZg2P3Xaj2v2bfNWmk= github.com/alicebob/gopher-json v0.0.0-20200520072559-a9ecdc9d1d3a/go.mod h1:SGnFV6hVsYE877CKEZ6tDNTjaSXYUk6QqoIK6PrAtcc= -github.com/alicebob/miniredis/v2 v2.32.1 h1:Bz7CciDnYSaa0mX5xODh6GUITRSx+cVhjNoOR4JssBo= -github.com/alicebob/miniredis/v2 v2.32.1/go.mod h1:AqkLNAfUm0K07J28hnAyyQKf/x0YkCY/g5DCtuL01Mw= github.com/alicebob/miniredis/v2 v2.33.0 h1:uvTF0EDeu9RLnUEG27Db5I68ESoIxTiXbNUiji6lZrA= github.com/alicebob/miniredis/v2 v2.33.0/go.mod h1:MhP4a3EU7aENRi9aO+tHfTBZicLqQevyi/DJpoj6mi0= github.com/andybalholm/cascadia v1.3.2 h1:3Xi6Dw5lHF15JtdcmAHD3i1+T8plmv7BQ/nsViSLyss= github.com/andybalholm/cascadia v1.3.2/go.mod h1:7gtRlve5FxPPgIgX36uWBX58OdBsSS6lUvCFb+h7KvU= -github.com/antchfx/htmlquery v1.3.0 h1:5I5yNFOVI+egyia5F2s/5Do2nFWxJz41Tr3DyfKD25E= -github.com/antchfx/htmlquery v1.3.0/go.mod h1:zKPDVTMhfOmcwxheXUsx4rKJy8KEY/PU6eXr/2SebQ8= -github.com/antchfx/htmlquery v1.3.1 h1:wm0LxjLMsZhRHfQKKZscDf2COyH4vDYA3wyH+qZ+Ylc= -github.com/antchfx/htmlquery v1.3.1/go.mod h1:PTj+f1V2zksPlwNt7uVvZPsxpKNa7mlVliCRxLX6Nx8= github.com/antchfx/htmlquery v1.3.2 h1:85YdttVkR1rAY+Oiv/nKI4FCimID+NXhDn82kz3mEvs= github.com/antchfx/htmlquery v1.3.2/go.mod h1:1mbkcEgEarAokJiWhTfr4hR06w/q2ZZjnYLrDt6CTUk= -github.com/antchfx/xmlquery v1.3.18 h1:FSQ3wMuphnPPGJOFhvc+cRQ2CT/rUj4cyQXkJcjOwz0= 
-github.com/antchfx/xmlquery v1.3.18/go.mod h1:Afkq4JIeXut75taLSuI31ISJ/zeq+3jG7TunF7noreA= -github.com/antchfx/xmlquery v1.4.0 h1:xg2HkfcRK2TeTbdb0m1jxCYnvsPaGY/oeZWTGqX/0hA= -github.com/antchfx/xmlquery v1.4.0/go.mod h1:Ax2aeaeDjfIw3CwXKDQ0GkwZ6QlxoChlIBP+mGnDFjI= github.com/antchfx/xmlquery v1.4.1 h1:YgpSwbeWvLp557YFTi8E3z6t6/hYjmFEtiEKbDfEbl0= github.com/antchfx/xmlquery v1.4.1/go.mod h1:lKezcT8ELGt8kW5L+ckFMTbgdR61/odpPgDv8Gvi1fI= -github.com/antchfx/xpath v1.2.3/go.mod h1:i54GszH55fYfBmoZXapTHN8T8tkcHfRgLyVwwqzXNcs= -github.com/antchfx/xpath v1.2.4 h1:dW1HB/JxKvGtJ9WyVGJ0sIoEcqftV3SqIstujI+B9XY= -github.com/antchfx/xpath v1.2.4/go.mod h1:i54GszH55fYfBmoZXapTHN8T8tkcHfRgLyVwwqzXNcs= -github.com/antchfx/xpath v1.3.0 h1:nTMlzGAK3IJ0bPpME2urTuFL76o4A96iYvoKFHRXJgc= -github.com/antchfx/xpath v1.3.0/go.mod h1:i54GszH55fYfBmoZXapTHN8T8tkcHfRgLyVwwqzXNcs= github.com/antchfx/xpath v1.3.1 h1:PNbFuUqHwWl0xRjvUPjJ95Agbmdj2uzzIwmQKgu4oCk= github.com/antchfx/xpath v1.3.1/go.mod h1:i54GszH55fYfBmoZXapTHN8T8tkcHfRgLyVwwqzXNcs= -github.com/bits-and-blooms/bitset v1.5.0 h1:NpE8frKRLGHIcEzkR+gZhiioW1+WbYV6fKwD6ZIpQT8= -github.com/bits-and-blooms/bitset v1.5.0/go.mod h1:gIdJ4wp64HaoK2YrL1Q5/N7Y16edYb8uY+O0FJTyyDA= +github.com/aymanbagabas/go-osc52/v2 v2.0.1 h1:HwpRHbFMcZLEVr42D4p7XBqjyuxQH5SMiErDT4WkJ2k= +github.com/aymanbagabas/go-osc52/v2 v2.0.1/go.mod h1:uYgXzlJ7ZpABp8OJ+exZzJJhRNQ2ASbcXHWsFqH8hp8= +github.com/beorn7/perks v1.0.1 h1:VlbKKnNfV8bJzeqoa4cOKqO6bYr3WgKZxO8Z16+hsOM= +github.com/beorn7/perks v1.0.1/go.mod h1:G2ZrVWU2WbWT9wwq4/hrbKbnv/1ERSJQ0ibhJ6rlkpw= github.com/bits-and-blooms/bitset v1.13.0 h1:bAQ9OPNFYbGHV6Nez0tmNI0RiEu7/hxlYJRUA0wFAVE= github.com/bits-and-blooms/bitset v1.13.0/go.mod h1:7hO7Gc7Pp1vODcmWvKMRA9BNmbv6a/7QIWpPxHddWR8= -github.com/bsm/ginkgo/v2 v2.7.0/go.mod h1:AiKlXPm7ItEHNc/2+OkrNG4E0ITzojb9/xWzvQ9XZ9w= github.com/bsm/ginkgo/v2 v2.12.0 h1:Ny8MWAHyOepLGlLKYmXG4IEkioBysk6GpaRTLC8zwWs= github.com/bsm/ginkgo/v2 v2.12.0/go.mod h1:SwYbGRRDovPVboqFv0tPTcG1sN61LM1Z4ARdbAV9g4c= -github.com/bsm/gomega v1.26.0/go.mod h1:JyEr/xRbxbtgWNi8tIEVPUYZ5Dzef52k01W3YH0H+O0= github.com/bsm/gomega v1.27.10 h1:yeMWxP2pV2fG3FgAODIY8EiRE3dy0aeFYt4l7wh6yKA= github.com/bsm/gomega v1.27.10/go.mod h1:JyEr/xRbxbtgWNi8tIEVPUYZ5Dzef52k01W3YH0H+O0= github.com/cenkalti/backoff/v4 v4.3.0 h1:MyRJ/UdXutAwSAT+s3wNd7MfTIcy71VQueUuFK343L8= github.com/cenkalti/backoff/v4 v4.3.0/go.mod h1:Y3VNntkOUPxTVeUxJ/G5vcM//AlwfmyYozVcomhLiZE= -github.com/cespare/xxhash/v2 v2.2.0 h1:DC2CZ1Ep5Y4k3ZQ899DldepgrayRUGE6BBZ/cd9Cj44= -github.com/cespare/xxhash/v2 v2.2.0/go.mod h1:VGX0DQ3Q6kWi7AoAeZDth3/j3BFtOZR5XLFGgcrjCOs= -github.com/chzyer/logex v1.1.10/go.mod h1:+Ywpsq7O8HXn0nuIou7OrIPyXbp3wmkHB+jjWRnGsAI= -github.com/chzyer/readline v0.0.0-20180603132655-2972be24d48e/go.mod h1:nSuG5e5PlCu98SY8svDHJxuZscDgtXS6KTTbou5AhLI= -github.com/chzyer/test v0.0.0-20180213035817-a1ea475d72b1/go.mod h1:Q3SI9o4m/ZMnBNeIyt5eFwwo7qiLfzFZmjNmxjkiQlU= +github.com/cespare/xxhash/v2 v2.3.0 h1:UL815xU9SqsFlibzuggzjXhog7bL6oX9BbNZnL2UFvs= +github.com/cespare/xxhash/v2 v2.3.0/go.mod h1:VGX0DQ3Q6kWi7AoAeZDth3/j3BFtOZR5XLFGgcrjCOs= +github.com/charmbracelet/bubbles v0.19.0 h1:gKZkKXPP6GlDk6EcfujDK19PCQqRjaJZQ7QRERx1UF0= +github.com/charmbracelet/bubbles v0.19.0/go.mod h1:WILteEqZ+krG5c3ntGEMeG99nCupcuIk7V0/zOP0tOA= +github.com/charmbracelet/bubbletea v0.27.1 h1:/yhaJKX52pxG4jZVKCNWj/oq0QouPdXycriDRA6m6r8= +github.com/charmbracelet/bubbletea v0.27.1/go.mod h1:xc4gm5yv+7tbniEvQ0naiG9P3fzYhk16cTgDZQQW6YE= +github.com/charmbracelet/lipgloss v0.13.0 
h1:4X3PPeoWEDCMvzDvGmTajSyYPcZM4+y8sCA/SsA3cjw= +github.com/charmbracelet/lipgloss v0.13.0/go.mod h1:nw4zy0SBX/F/eAO1cWdcvy6qnkDUxr8Lw7dvFrAIbbY= +github.com/charmbracelet/x/ansi v0.1.4 h1:IEU3D6+dWwPSgZ6HBH+v6oUuZ/nVawMiWj5831KfiLM= +github.com/charmbracelet/x/ansi v0.1.4/go.mod h1:dk73KoMTT5AX5BsX0KrqhsTqAnhZZoCBjs7dGWp4Ktw= +github.com/charmbracelet/x/input v0.1.0 h1:TEsGSfZYQyOtp+STIjyBq6tpRaorH0qpwZUj8DavAhQ= +github.com/charmbracelet/x/input v0.1.0/go.mod h1:ZZwaBxPF7IG8gWWzPUVqHEtWhc1+HXJPNuerJGRGZ28= +github.com/charmbracelet/x/term v0.1.1 h1:3cosVAiPOig+EV4X9U+3LDgtwwAoEzJjNdwbXDjF6yI= +github.com/charmbracelet/x/term v0.1.1/go.mod h1:wB1fHt5ECsu3mXYusyzcngVWWlu1KKUmmLhfgr/Flxw= +github.com/charmbracelet/x/windows v0.1.0 h1:gTaxdvzDM5oMa/I2ZNF7wN78X/atWemG9Wph7Ika2k4= +github.com/charmbracelet/x/windows v0.1.0/go.mod h1:GLEO/l+lizvFDBPLIOk+49gdX49L9YWMB5t+DZd0jkQ= github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c= github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= github.com/dgryski/go-rendezvous v0.0.0-20200823014737-9f7001d12a5f h1:lO4WD4F/rVNCu3HqELle0jiPLLBs70cWOduZpkS1E78= github.com/dgryski/go-rendezvous v0.0.0-20200823014737-9f7001d12a5f/go.mod h1:cuUVRXasLTGF7a8hSLbxyZXjz+1KgoB3wDUb6vlszIc= +github.com/erikgeiser/coninput v0.0.0-20211004153227-1c3628e74d0f h1:Y/CXytFA4m6baUTXGLOoWe4PQhGxaX0KpnayAqC48p4= +github.com/erikgeiser/coninput v0.0.0-20211004153227-1c3628e74d0f/go.mod h1:vw97MGsxSvLiUE2X8qFplwetxpGLQrlU1Q9AUEIzCaM= +github.com/fatih/color v1.16.0 h1:zmkK9Ngbjj+K0yRhTVONQh1p/HknKYSlNT+vZCzyokM= +github.com/fatih/color v1.16.0/go.mod h1:fL2Sau1YI5c0pdGEVCbKQbLXB6edEj1ZgiY4NijnWvE= github.com/felixge/httpsnoop v1.0.4 h1:NFTV2Zj1bL4mc9sqWACXbQFVBBg2W3GPvqp8/ESS2Wg= github.com/felixge/httpsnoop v1.0.4/go.mod h1:m8KPJKqk1gH5J9DgRY2ASl2lWCfGKXixSwevea8zH2U= github.com/go-logr/logr v1.2.2/go.mod h1:jdQByPbusPIv2/zmleS9BjJVeZ6kBagPoEUsqbVz/1A= -github.com/go-logr/logr v1.4.1 h1:pKouT5E8xu9zeFC39JXRDukb6JFQPXM5p5I91188VAQ= -github.com/go-logr/logr v1.4.1/go.mod h1:9T104GzyrTigFIr8wt5mBrctHMim0Nb2HLGrmQ40KvY= github.com/go-logr/logr v1.4.2 h1:6pFjapn8bFcIbiKo3XT4j/BhANplGihG6tvd+8rYgrY= github.com/go-logr/logr v1.4.2/go.mod h1:9T104GzyrTigFIr8wt5mBrctHMim0Nb2HLGrmQ40KvY= github.com/go-logr/stdr v1.2.2 h1:hSWxHoqTgW2S2qGc0LTAI563KZ5YKYRhT3MFKZMbjag= github.com/go-logr/stdr v1.2.2/go.mod h1:mMo/vtBO5dYbehREoey6XUKy/eSumjCCveDpRre4VKE= -github.com/go-redis/redis_rate/v10 v10.0.1 h1:calPxi7tVlxojKunJwQ72kwfozdy25RjA0bCj1h0MUo= -github.com/go-redis/redis_rate/v10 v10.0.1/go.mod h1:EMiuO9+cjRkR7UvdvwMO7vbgqJkltQHtwbdIQvaBKIU= github.com/golang/groupcache v0.0.0-20210331224755-41bb18bfe9da h1:oI5xCqsCo564l8iNU+DwB5epxmsaqB+rhGL0m5jtYqE= github.com/golang/groupcache v0.0.0-20210331224755-41bb18bfe9da/go.mod h1:cIg4eruTrX1D+g88fzRXU5OdNfaM+9IcxsU14FzY7Hc= github.com/google/btree v1.1.2 h1:xf4v41cLI2Z6FxbKm+8Bu+m8ifhj15JuZ9sa0jZCMUU= @@ -72,180 +66,142 @@ github.com/google/uuid v1.6.0 h1:NIvaJDMOsjHA8n1jAhLSgzrAzy1Hgr+hNrb57e+94F0= github.com/google/uuid v1.6.0/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo= github.com/gregjones/httpcache v0.0.0-20190611155906-901d90724c79 h1:+ngKgrYPPJrOjhax5N+uePQ0Fh1Z7PheYoUI/0nzkPA= github.com/gregjones/httpcache v0.0.0-20190611155906-901d90724c79/go.mod h1:FecbI9+v66THATjSRHfNgh1IVFe/9kFxbXtjV0ctIMA= -github.com/grpc-ecosystem/grpc-gateway/v2 v2.19.0 
h1:Wqo399gCIufwto+VfwCSvsnfGpF/w5E9CNxSwbpD6No= -github.com/grpc-ecosystem/grpc-gateway/v2 v2.19.0/go.mod h1:qmOFXW2epJhM0qSnUUYpldc7gVz2KMQwJ/QYCDIa7XU= -github.com/grpc-ecosystem/grpc-gateway/v2 v2.19.1 h1:/c3QmbOGMGTOumP2iT/rCwB7b0QDGLKzqOmktBjT+Is= -github.com/grpc-ecosystem/grpc-gateway/v2 v2.19.1/go.mod h1:5SN9VR2LTsRFsrEC6FHgRbTWrTHu6tqPeKxEQv15giM= github.com/grpc-ecosystem/grpc-gateway/v2 v2.20.0 h1:bkypFPDjIYGfCYD5mRBvpqxfYX1YCS1PXdKYWi8FsN0= github.com/grpc-ecosystem/grpc-gateway/v2 v2.20.0/go.mod h1:P+Lt/0by1T8bfcF3z737NnSbmxQAppXMRziHUxPOC8k= +github.com/hashicorp/go-cleanhttp v0.5.2 h1:035FKYIWjmULyFRBKPs8TBQoi0x6d9G4xc9neXJWAZQ= +github.com/hashicorp/go-cleanhttp v0.5.2/go.mod h1:kO/YDlP8L1346E6Sodw+PrpBSV4/SoxCXGY6BqNFT48= +github.com/hashicorp/go-hclog v1.6.3 h1:Qr2kF+eVWjTiYmU7Y31tYlP1h0q/X3Nl3tPGdaB11/k= +github.com/hashicorp/go-hclog v1.6.3/go.mod h1:W4Qnvbt70Wk/zYJryRzDRU/4r0kIg0PVHBcfoyhpF5M= +github.com/hashicorp/go-retryablehttp v0.7.7 h1:C8hUCYzor8PIfXHa4UrZkU4VvK8o9ISHxT2Q8+VepXU= +github.com/hashicorp/go-retryablehttp v0.7.7/go.mod h1:pkQpWZeYWskR+D1tR2O5OcBFOxfA7DoAO6xtkuQnHTk= github.com/hashicorp/golang-lru/v2 v2.0.7 h1:a+bsQ5rvGLjzHuww6tVxozPZFVghXaHOwFs4luLUK2k= github.com/hashicorp/golang-lru/v2 v2.0.7/go.mod h1:QeFd9opnmA6QUJc5vARoKUSoFhyfM2/ZepoAG6RGpeM= github.com/kennygrant/sanitize v1.2.4 h1:gN25/otpP5vAsO2djbMhF/LQX6R7+O1TB4yv8NzpJ3o= github.com/kennygrant/sanitize v1.2.4/go.mod h1:LGsjYYtgxbetdg5owWB2mpgUL6e2nfw2eObZ0u0qvak= +github.com/klauspost/compress v1.17.9 h1:6KIumPrER1LHsvBVuDa0r5xaG0Es51mhhB9BQB2qeMA= +github.com/klauspost/compress v1.17.9/go.mod h1:Di0epgTjJY877eYKx5yC51cX2A2Vl2ibi7bDH9ttBbw= github.com/kos-v/dsnparser v1.1.0 h1:3vctQ55GqpFfnXL8ME02U+lEpqz2XZhu9fyV4Aubjk0= github.com/kos-v/dsnparser v1.1.0/go.mod h1:hKgcsdjz6eSeu7lP7zfTtuYly0wQaEXlYwXaSr+QOig= -github.com/kr/pretty v0.1.0/go.mod h1:dAy3ld7l9f0ibDNOQOHHMYYIIbhfbHSm3C4ZsoJORNo= -github.com/kr/pty v1.1.1/go.mod h1:pFQYn66WHrOpPYNljwOMqo10TkYh1fy3cYio2l3bCsQ= -github.com/kr/text v0.1.0/go.mod h1:4Jbv+DJW3UT/LiOwJeYQe1efqtUx/iVham/4vfdArNI= -github.com/nlnwa/whatwg-url v0.4.0 h1:B3kFb5EL7KILeBkhrlQvFi41Ex0p4ropVA9brt5ungI= -github.com/nlnwa/whatwg-url v0.4.0/go.mod h1:pLzpJjFPtA+n7RCLvp0GBxvDHa/2ckNCBK9mfEeNOMQ= +github.com/kylelemons/godebug v1.1.0 h1:RPNrshWIDI6G2gRW9EHilWtl7Z6Sb1BR0xunSBf0SNc= +github.com/kylelemons/godebug v1.1.0/go.mod h1:9/0rRGxNHcop5bhtWyNeEfOS8JIWk580+fNqagV/RAw= +github.com/lucasb-eyer/go-colorful v1.2.0 h1:1nnpGOrhyZZuNyfu1QjKiUICQ74+3FNCN69Aj6K7nkY= +github.com/lucasb-eyer/go-colorful v1.2.0/go.mod h1:R4dSotOR9KMtayYi1e77YzuveK+i7ruzyGqttikkLy0= +github.com/mattn/go-colorable v0.1.13 h1:fFA4WZxdEF4tXPZVKMLwD8oUnCTTo08duU7wxecdEvA= +github.com/mattn/go-colorable v0.1.13/go.mod h1:7S9/ev0klgBDR4GtXTXX8a3vIGJpMovkB8vQcUbaXHg= +github.com/mattn/go-isatty v0.0.20 h1:xfD0iDuEKnDkl03q4limB+vH+GxLEtL/jb4xVJSWWEY= +github.com/mattn/go-isatty v0.0.20/go.mod h1:W+V8PltTTMOvKvAeJH7IuucS94S2C6jfK/D7dTCTo3Y= +github.com/mattn/go-localereader v0.0.1 h1:ygSAOl7ZXTx4RdPYinUpg6W99U8jWvWi9Ye2JC/oIi4= +github.com/mattn/go-localereader v0.0.1/go.mod h1:8fBrzywKY7BI3czFoHkuzRoWE9C+EiG4R1k4Cjx5p88= +github.com/mattn/go-runewidth v0.0.16 h1:E5ScNMtiwvlvB5paMFdw9p4kSQzbXFikJ5SQO6TULQc= +github.com/mattn/go-runewidth v0.0.16/go.mod h1:Jdepj2loyihRzMpdS35Xk/zdY8IAYHsh153qUoGf23w= +github.com/muesli/ansi v0.0.0-20230316100256-276c6243b2f6 h1:ZK8zHtRHOkbHy6Mmr5D264iyp3TiX5OmNcI5cIARiQI= +github.com/muesli/ansi v0.0.0-20230316100256-276c6243b2f6/go.mod 
h1:CJlz5H+gyd6CUWT45Oy4q24RdLyn7Md9Vj2/ldJBSIo= +github.com/muesli/cancelreader v0.2.2 h1:3I4Kt4BQjOR54NavqnDogx/MIoWBFa0StPA8ELUXHmA= +github.com/muesli/cancelreader v0.2.2/go.mod h1:3XuTXfFS2VjM+HTLZY9Ak0l6eUKfijIfMUZ4EgX0QYo= +github.com/muesli/termenv v0.15.2 h1:GohcuySI0QmI3wN8Ok9PtKGkgkFIk7y6Vpb5PvrY+Wo= +github.com/muesli/termenv v0.15.2/go.mod h1:Epx+iuz8sNs7mNKhxzH4fWXGNpZwUaJKRS1noLXviQ8= +github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 h1:C3w9PqII01/Oq1c1nUAm88MOHcQC9l5mIlSMApZMrHA= +github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822/go.mod h1:+n7T8mK8HuQTcFwEeznm/DIxMOiR9yIdICNftLE1DvQ= github.com/nlnwa/whatwg-url v0.4.1 h1:m0+XWylS9IuCPd5GMW2lzmSI9ssSwynT3nug0p3bUIo= github.com/nlnwa/whatwg-url v0.4.1/go.mod h1:X/ejnFFVbaOWdSul+cnlsSHviCzGZJdvPkgc9zD8IY8= +github.com/oxffaa/gopher-parse-sitemap v0.0.0-20191021113419-005d2eb1def4 h1:2vmb32OdDhjZf2ETGDlr9n8RYXx7c+jXPxMiPbwnA+8= +github.com/oxffaa/gopher-parse-sitemap v0.0.0-20191021113419-005d2eb1def4/go.mod h1:2JQx4jDHmWrbABvpOayg/+OTU6ehN0IyK2EHzceXpJo= github.com/peterbourgon/diskv v2.0.1+incompatible h1:UBdAOUP5p4RWqPBg048CAvpKN+vxiaj6gdUUzhl4XmI= github.com/peterbourgon/diskv v2.0.1+incompatible/go.mod h1:uqqh8zWWbv1HBMNONnaR/tNboyR3/BZd58JJSHlUSCU= github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM= github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= -github.com/rabbitmq/amqp091-go v1.9.0 h1:qrQtyzB4H8BQgEuJwhmVQqVHB9O4+MNDJCCAcpc3Aoo= -github.com/rabbitmq/amqp091-go v1.9.0/go.mod h1:+jPrT9iY2eLjRaMSRHUhc3z14E/l85kv/f+6luSD3pc= -github.com/rabbitmq/amqp091-go v1.10.0 h1:STpn5XsHlHGcecLmMFCtg7mqq0RnD+zFr4uzukfVhBw= -github.com/rabbitmq/amqp091-go v1.10.0/go.mod h1:Hy4jKW5kQART1u+JkDTF9YYOQUHXqMuhrgxOEeS7G4o= -github.com/redis/go-redis/extra/rediscmd/v9 v9.0.5 h1:EaDatTxkdHG+U3Bk4EUr+DZ7fOGwTfezUiUJMaIcaho= -github.com/redis/go-redis/extra/rediscmd/v9 v9.0.5/go.mod h1:fyalQWdtzDBECAQFBJuQe5bzQ02jGd5Qcbgb97Flm7U= +github.com/prometheus/client_golang v1.20.1 h1:IMJXHOD6eARkQpxo8KkhgEVFlBNm+nkrFUyGlIu7Na8= +github.com/prometheus/client_golang v1.20.1/go.mod h1:PIEt8X02hGcP8JWbeHyeZ53Y/jReSnHgO035n//V5WE= +github.com/prometheus/client_model v0.6.1 h1:ZKSh/rekM+n3CeS952MLRAdFwIKqeY8b62p8ais2e9E= +github.com/prometheus/client_model v0.6.1/go.mod h1:OrxVMOVHjw3lKMa8+x6HeMGkHMQyHDk9E3jmP2AmGiY= +github.com/prometheus/common v0.55.0 h1:KEi6DK7lXW/m7Ig5i47x0vRzuBsHuvJdi5ee6Y3G1dc= +github.com/prometheus/common v0.55.0/go.mod h1:2SECS4xJG1kd8XF9IcM1gMX6510RAEL65zxzNImwdc8= +github.com/prometheus/procfs v0.15.1 h1:YagwOFzUgYfKKHX6Dr+sHT7km/hxC76UB0learggepc= +github.com/prometheus/procfs v0.15.1/go.mod h1:fB45yRUv8NstnjriLhBQLuOUt+WW4BsoGhij/e3PBqk= github.com/redis/go-redis/extra/rediscmd/v9 v9.5.3 h1:1/BDligzCa40GTllkDnY3Y5DTHuKCONbB2JcRyIfl20= github.com/redis/go-redis/extra/rediscmd/v9 v9.5.3/go.mod h1:3dZmcLn3Qw6FLlWASn1g4y+YO9ycEFUOM+bhBmzLVKQ= -github.com/redis/go-redis/extra/redisotel/v9 v9.0.5 h1:EfpWLLCyXw8PSM2/XNJLjI3Pb27yVE+gIAfeqp8LUCc= -github.com/redis/go-redis/extra/redisotel/v9 v9.0.5/go.mod h1:WZjPDy7VNzn77AAfnAfVjZNvfJTYfPetfZk5yoSTLaQ= github.com/redis/go-redis/extra/redisotel/v9 v9.5.3 h1:kuvuJL/+MZIEdvtb/kTBRiRgYaOmx1l+lYJyVdrRUOs= github.com/redis/go-redis/extra/redisotel/v9 v9.5.3/go.mod h1:7f/FMrf5RRRVHXgfk7CzSVzXHiWeuOQUu2bsVqWoa+g= -github.com/redis/go-redis/v9 v9.0.5/go.mod h1:WqMKv5vnQbRuZstUwxQI195wHy+t4PuXDOjzMvcuQHk= -github.com/redis/go-redis/v9 v9.5.1 h1:H1X4D3yHPaYrkL5X06Wh6xNVM/pX0Ft4RV0vMGvLBh8= 
-github.com/redis/go-redis/v9 v9.5.1/go.mod h1:hdY0cQFCN4fnSYT6TkisLufl/4W5UIXyv0b/CLO2V2M= -github.com/redis/go-redis/v9 v9.5.3 h1:fOAp1/uJG+ZtcITgZOfYFmTKPE7n4Vclj1wZFgRciUU= -github.com/redis/go-redis/v9 v9.5.3/go.mod h1:hdY0cQFCN4fnSYT6TkisLufl/4W5UIXyv0b/CLO2V2M= github.com/redis/go-redis/v9 v9.5.4 h1:vOFYDKKVgrI5u++QvnMT7DksSMYg7Aw/Np4vLJLKLwY= github.com/redis/go-redis/v9 v9.5.4/go.mod h1:hdY0cQFCN4fnSYT6TkisLufl/4W5UIXyv0b/CLO2V2M= +github.com/rivo/uniseg v0.2.0/go.mod h1:J6wj4VEh+S6ZtnVlnTBMWIodfgj8LQOQFoIToxlJtxc= +github.com/rivo/uniseg v0.4.7 h1:WUdvkW8uEhrYfLC4ZzdpI2ztxP1I582+49Oc5Mq64VQ= +github.com/rivo/uniseg v0.4.7/go.mod h1:FN3SvrM+Zdj16jyLfmOkMNblXMcoc8DfTHruCPUcx88= github.com/saintfish/chardet v0.0.0-20230101081208-5e3ef4b5456d h1:hrujxIzL1woJ7AwssoOcM/tq5JjjG2yYOc8odClEiXA= github.com/saintfish/chardet v0.0.0-20230101081208-5e3ef4b5456d/go.mod h1:uugorj2VCxiV1x+LzaIdVa9b4S4qGAcH6cbhh4qVxOU= github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME= -github.com/stretchr/objx v0.4.0/go.mod h1:YvHI0jy2hoMjB+UWwv71VJQ9isScKT/TqJzVSSt89Yw= github.com/stretchr/testify v1.3.0/go.mod h1:M5WIy9Dh21IEIfnGCwXGc5bZfKNJtfHm1UVUgZn+9EI= -github.com/stretchr/testify v1.7.1/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg= -github.com/stretchr/testify v1.8.0/go.mod h1:yNjHg4UonilssWZ8iaSj1OCr/vHnekPRkoO+kdMU+MU= github.com/stretchr/testify v1.9.0 h1:HtqpIVDClZ4nwg75+f6Lvsy/wHu+3BoSGCbBAcpTsTg= github.com/stretchr/testify v1.9.0/go.mod h1:r2ic/lqez/lEtzL7wO/rwa5dbSLXVDPFyf8C91i36aY= github.com/temoto/robotstxt v1.1.2 h1:W2pOjSJ6SWvldyEuiFXNxz3xZ8aiWX5LbfDiOFd7Fxg= github.com/temoto/robotstxt v1.1.2/go.mod h1:+1AmkuG3IYkh1kv0d2qEB9Le88ehNO0zwOr3ujewlOo= +github.com/xo/terminfo v0.0.0-20220910002029-abceb7e1c41e h1:JVG44RsyaB9T2KIHavMF/ppJZNG9ZpyihvCd0w101no= +github.com/xo/terminfo v0.0.0-20220910002029-abceb7e1c41e/go.mod h1:RbqR21r5mrJuqunuUZ/Dhy/avygyECGrLceyNeo4LiM= github.com/yuin/goldmark v1.4.13/go.mod h1:6yULJ656Px+3vBD8DxQVa3kxgyrAnzto9xy5taEt/CY= github.com/yuin/gopher-lua v1.1.1 h1:kYKnWBjvbNP4XLT3+bPEwAXJx262OhaHDWDVOPjL46M= github.com/yuin/gopher-lua v1.1.1/go.mod h1:GBR0iDaNXjAgGg9zfCvksxSRnQx76gclCIb7kdAd1Pw= -go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp v0.49.0 h1:jq9TW8u3so/bN+JPT166wjOI6/vQPF6Xe7nMNIltagk= -go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp v0.49.0/go.mod h1:p8pYQP+m5XfbZm9fxtSKAbM6oIllS7s2AfxrChvc7iw= -go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp v0.50.0 h1:cEPbyTSEHlQR89XVlyo78gqluF8Y3oMeBkXGWzQsfXY= -go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp v0.50.0/go.mod h1:DKdbWcT4GH1D0Y3Sqt/PFXt2naRKDWtU+eE6oLdFNA8= -go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp v0.51.0 h1:Xs2Ncz0gNihqu9iosIZ5SkBbWo5T8JhhLJFMQL1qmLI= -go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp v0.51.0/go.mod h1:vy+2G/6NvVMpwGX/NyLqcC41fxepnuKHk16E6IZUcJc= go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp v0.53.0 h1:4K4tsIXefpVJtvA/8srF4V4y0akAoPHkIslgAkjixJA= go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp v0.53.0/go.mod h1:jjdQuTGVsXV4vSs+CJ2qYDeDPf9yIJV23qlIzBm73Vg= -go.opentelemetry.io/contrib/instrumentation/runtime v0.50.0 h1:6dck47miguAOny5MeqX1G8idd+HpzDFt86U33d7aW2I= -go.opentelemetry.io/contrib/instrumentation/runtime v0.50.0/go.mod h1:rdPhRwNd2sHiRmwJAGs8xcwitqmP/j8pvl9X5jloYjU= -go.opentelemetry.io/contrib/instrumentation/runtime v0.51.0 h1:1tBjncp/Rr5iuV0WfdKGGynrzIJ8bMm5z7Zl6jMjfIE= 
-go.opentelemetry.io/contrib/instrumentation/runtime v0.51.0/go.mod h1:6MqTuVXkhmzrIc7SFHYVTo7N6OFvVpDH5eq5xXKpAZQ= go.opentelemetry.io/contrib/instrumentation/runtime v0.53.0 h1:nOlJEAJyrcy8hexK65M+dsCHIx7CVVbybcFDNkcTcAc= go.opentelemetry.io/contrib/instrumentation/runtime v0.53.0/go.mod h1:u79lGGIlkg3Ryw425RbMjEkGYNxSnXRyR286O840+u4= -go.opentelemetry.io/otel v1.25.0 h1:gldB5FfhRl7OJQbUHt/8s0a7cE8fbsPAtdpRaApKy4k= -go.opentelemetry.io/otel v1.25.0/go.mod h1:Wa2ds5NOXEMkCmUou1WA7ZBfLTHWIsp034OVD7AO+Vg= -go.opentelemetry.io/otel v1.26.0 h1:LQwgL5s/1W7YiiRwxf03QGnWLb2HW4pLiAhaA5cZXBs= -go.opentelemetry.io/otel v1.26.0/go.mod h1:UmLkJHUAidDval2EICqBMbnAd0/m2vmpf/dAM+fvFs4= go.opentelemetry.io/otel v1.28.0 h1:/SqNcYk+idO0CxKEUOtKQClMK/MimZihKYMruSMViUo= go.opentelemetry.io/otel v1.28.0/go.mod h1:q68ijF8Fc8CnMHKyzqL6akLO46ePnjkgfIMIjUIX9z4= -go.opentelemetry.io/otel/exporters/otlp/otlpmetric/otlpmetrichttp v1.25.0 h1:Wc4hZuYXhVqq+TfRXLXlmNIL/awOanGx8ssq3ciDQxc= -go.opentelemetry.io/otel/exporters/otlp/otlpmetric/otlpmetrichttp v1.25.0/go.mod h1:BydOvapRqVEc0DVz27qWBX2jq45Ca5TI9mhZBDIdweY= -go.opentelemetry.io/otel/exporters/otlp/otlpmetric/otlpmetrichttp v1.26.0 h1:HGZWGmCVRCVyAs2GQaiHQPbDHo+ObFWeUEOd+zDnp64= -go.opentelemetry.io/otel/exporters/otlp/otlpmetric/otlpmetrichttp v1.26.0/go.mod h1:SaH+v38LSCHddyk7RGlU9uZyQoRrKao6IBnJw6Kbn+c= go.opentelemetry.io/otel/exporters/otlp/otlpmetric/otlpmetrichttp v1.28.0 h1:aLmmtjRke7LPDQ3lvpFz+kNEH43faFhzW7v8BFIEydg= go.opentelemetry.io/otel/exporters/otlp/otlpmetric/otlpmetrichttp v1.28.0/go.mod h1:TC1pyCt6G9Sjb4bQpShH+P5R53pO6ZuGnHuuln9xMeE= -go.opentelemetry.io/otel/exporters/otlp/otlptrace v1.25.0 h1:dT33yIHtmsqpixFsSQPwNeY5drM9wTcoL8h0FWF4oGM= -go.opentelemetry.io/otel/exporters/otlp/otlptrace v1.25.0/go.mod h1:h95q0LBGh7hlAC08X2DhSeyIG02YQ0UyioTCVAqRPmc= -go.opentelemetry.io/otel/exporters/otlp/otlptrace v1.26.0 h1:1u/AyyOqAWzy+SkPxDpahCNZParHV8Vid1RnI2clyDE= -go.opentelemetry.io/otel/exporters/otlp/otlptrace v1.26.0/go.mod h1:z46paqbJ9l7c9fIPCXTqTGwhQZ5XoTIsfeFYWboizjs= go.opentelemetry.io/otel/exporters/otlp/otlptrace v1.28.0 h1:3Q/xZUyC1BBkualc9ROb4G8qkH90LXEIICcs5zv1OYY= go.opentelemetry.io/otel/exporters/otlp/otlptrace v1.28.0/go.mod h1:s75jGIWA9OfCMzF0xr+ZgfrB5FEbbV7UuYo32ahUiFI= -go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracehttp v1.25.0 h1:Mbi5PKN7u322woPa85d7ebZ+SOvEoPvoiBu+ryHWgfA= -go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracehttp v1.25.0/go.mod h1:e7ciERRhZaOZXVjx5MiL8TK5+Xv7G5Gv5PA2ZDEJdL8= -go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracehttp v1.26.0 h1:1wp/gyxsuYtuE/JFxsQRtcCDtMrO2qMvlfXALU5wkzI= -go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracehttp v1.26.0/go.mod h1:gbTHmghkGgqxMomVQQMur1Nba4M0MQ8AYThXDUjsJ38= go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracehttp v1.28.0 h1:j9+03ymgYhPKmeXGk5Zu+cIZOlVzd9Zv7QIiyItjFBU= go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracehttp v1.28.0/go.mod h1:Y5+XiUG4Emn1hTfciPzGPJaSI+RpDts6BnCIir0SLqk= -go.opentelemetry.io/otel/metric v1.25.0 h1:LUKbS7ArpFL/I2jJHdJcqMGxkRdxpPHE0VU/D4NuEwA= -go.opentelemetry.io/otel/metric v1.25.0/go.mod h1:rkDLUSd2lC5lq2dFNrX9LGAbINP5B7WBkC78RXCpH5s= -go.opentelemetry.io/otel/metric v1.26.0 h1:7S39CLuY5Jgg9CrnA9HHiEjGMF/X2VHvoXGgSllRz30= -go.opentelemetry.io/otel/metric v1.26.0/go.mod h1:SY+rHOI4cEawI9a7N1A4nIg/nTQXe1ccCNWYOJUrpX4= go.opentelemetry.io/otel/metric v1.28.0 h1:f0HGvSl1KRAU1DLgLGFjrwVyismPlnuU6JD6bOeuA5Q= go.opentelemetry.io/otel/metric v1.28.0/go.mod 
h1:Fb1eVBFZmLVTMb6PPohq3TO9IIhUisDsbJoL/+uQW4s= -go.opentelemetry.io/otel/sdk v1.25.0 h1:PDryEJPC8YJZQSyLY5eqLeafHtG+X7FWnf3aXMtxbqo= -go.opentelemetry.io/otel/sdk v1.25.0/go.mod h1:oFgzCM2zdsxKzz6zwpTZYLLQsFwc+K0daArPdIhuxkw= -go.opentelemetry.io/otel/sdk v1.26.0 h1:Y7bumHf5tAiDlRYFmGqetNcLaVUZmh4iYfmGxtmz7F8= -go.opentelemetry.io/otel/sdk v1.26.0/go.mod h1:0p8MXpqLeJ0pzcszQQN4F0S5FVjBLgypeGSngLsmirs= go.opentelemetry.io/otel/sdk v1.28.0 h1:b9d7hIry8yZsgtbmM0DKyPWMMUMlK9NEKuIG4aBqWyE= go.opentelemetry.io/otel/sdk v1.28.0/go.mod h1:oYj7ClPUA7Iw3m+r7GeEjz0qckQRJK2B8zjcZEfu7Pg= -go.opentelemetry.io/otel/sdk/metric v1.25.0 h1:7CiHOy08LbrxMAp4vWpbiPcklunUshVpAvGBrdDRlGw= -go.opentelemetry.io/otel/sdk/metric v1.25.0/go.mod h1:LzwoKptdbBBdYfvtGCzGwk6GWMA3aUzBOwtQpR6Nz7o= -go.opentelemetry.io/otel/sdk/metric v1.26.0 h1:cWSks5tfriHPdWFnl+qpX3P681aAYqlZHcAyHw5aU9Y= -go.opentelemetry.io/otel/sdk/metric v1.26.0/go.mod h1:ClMFFknnThJCksebJwz7KIyEDHO+nTB6gK8obLy8RyE= go.opentelemetry.io/otel/sdk/metric v1.28.0 h1:OkuaKgKrgAbYrrY0t92c+cC+2F6hsFNnCQArXCKlg08= go.opentelemetry.io/otel/sdk/metric v1.28.0/go.mod h1:cWPjykihLAPvXKi4iZc1dpER3Jdq2Z0YLse3moQUCpg= -go.opentelemetry.io/otel/trace v1.25.0 h1:tqukZGLwQYRIFtSQM2u2+yfMVTgGVeqRLPUYx1Dq6RM= -go.opentelemetry.io/otel/trace v1.25.0/go.mod h1:hCCs70XM/ljO+BeQkyFnbK28SBIJ/Emuha+ccrCRT7I= -go.opentelemetry.io/otel/trace v1.26.0 h1:1ieeAUb4y0TE26jUFrCIXKpTuVK7uJGN9/Z/2LP5sQA= -go.opentelemetry.io/otel/trace v1.26.0/go.mod h1:4iDxvGDQuUkHve82hJJ8UqrwswHYsZuWCBllGV2U2y0= go.opentelemetry.io/otel/trace v1.28.0 h1:GhQ9cUuQGmNDd5BTCP2dAvv75RdMxEfTmYejp+lkx9g= go.opentelemetry.io/otel/trace v1.28.0/go.mod h1:jPyXzNPg6da9+38HEwElrQiHlVMTnVfM3/yv2OlIHaI= -go.opentelemetry.io/proto/otlp v1.1.0 h1:2Di21piLrCqJ3U3eXGCTPHE9R8Nh+0uglSnOyxikMeI= -go.opentelemetry.io/proto/otlp v1.1.0/go.mod h1:GpBHCBWiqvVLDqmHZsoMM3C5ySeKTC7ej/RNTae6MdY= -go.opentelemetry.io/proto/otlp v1.2.0 h1:pVeZGk7nXDC9O2hncA6nHldxEjm6LByfA2aN8IOkz94= -go.opentelemetry.io/proto/otlp v1.2.0/go.mod h1:gGpR8txAl5M03pDhMC79G6SdqNV26naRm/KDsgaHD8A= go.opentelemetry.io/proto/otlp v1.3.1 h1:TrMUixzpM0yuc/znrFTP9MMRh8trP93mkCiDVeXrui0= go.opentelemetry.io/proto/otlp v1.3.1/go.mod h1:0X1WI4de4ZsLrrJNLAQbFeLCm3T7yBkR0XqQ7niQU+8= -go.uber.org/goleak v1.2.1 h1:NBol2c7O1ZokfZ0LEU9K6Whx/KnwvepVetCUhtKja4A= -go.uber.org/goleak v1.2.1/go.mod h1:qlT2yGI9QafXHhZZLxlSuNsMw3FFLxBr+tBRlmO1xH4= golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w= golang.org/x/crypto v0.0.0-20210921155107-089bfa567519/go.mod h1:GvvjBRRGRdwPK5ydBHafDWAxML/pGHZbMvKqRZ5+Abc= golang.org/x/crypto v0.19.0/go.mod h1:Iy9bg/ha4yyC70EfRS8jz+B6ybOBKMaSxLj6P6oBDfU= golang.org/x/crypto v0.23.0/go.mod h1:CKFgDieR+mRhux2Lsu27y0fO304Db0wZe70UKqHu0v8= +golang.org/x/exp v0.0.0-20240823005443-9b4947da3948 h1:kx6Ds3MlpiUHKj7syVnbp57++8WpuKPcR5yjLBjvLEA= +golang.org/x/exp v0.0.0-20240823005443-9b4947da3948/go.mod h1:akd2r19cwCdwSwWeIdzYQGa/EZZyqcOdwWiwj5L5eKQ= golang.org/x/mod v0.6.0-dev.0.20220419223038-86c51ed26bb4/go.mod h1:jJ57K6gSWd91VN4djpZkiMVwK6gcyfeH4XE8wZrZaV4= golang.org/x/mod v0.8.0/go.mod h1:iBbtSCu2XBx23ZKBPSOrRkjjQPZFPuis4dIYUhu/chs= golang.org/x/net v0.0.0-20190620200207-3b0461eec859/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s= golang.org/x/net v0.0.0-20210226172049-e18ecbb05110/go.mod h1:m0MpNAwzfU5UDzcl9v0D8zg8gWTRqZa9RBIspLL5mdg= golang.org/x/net v0.0.0-20220722155237-a158d28d115b/go.mod h1:XRhObCWvk6IyKnWLug+ECip1KBveYUHfp+8e9klMJ9c= -golang.org/x/net v0.5.0/go.mod 
h1:DivGGAXEgPSlEBzxGzZI+ZLohi+xUj054jfeKui00ws= golang.org/x/net v0.6.0/go.mod h1:2Tu9+aMcznHK/AK1HMvgo6xiTLG5rD5rZLDS+rp2Bjs= golang.org/x/net v0.7.0/go.mod h1:2Tu9+aMcznHK/AK1HMvgo6xiTLG5rD5rZLDS+rp2Bjs= golang.org/x/net v0.9.0/go.mod h1:d48xBJpPfHeWQsugry2m+kC02ZBRGRgulfHnEXEuWns= golang.org/x/net v0.10.0/go.mod h1:0qNGK6F8kojg2nk9dLZ2mShWaEBan6FAoqfSigmmuDg= golang.org/x/net v0.21.0/go.mod h1:bIjVDfnllIU7BJ2DNgfnXvpSvtn8VRwhlsaeUTyUS44= -golang.org/x/net v0.24.0 h1:1PcaxkF854Fu3+lvBIx5SYn9wRlBzzcnHZSiaFFAb0w= -golang.org/x/net v0.24.0/go.mod h1:2Q7sJY5mzlzWjKtYUEXSlBWCdyaioyXzRB2RtU8KVE8= -golang.org/x/net v0.25.0 h1:d/OCCoBEUq33pjydKrGQhw7IlUPI2Oylr+8qLx49kac= golang.org/x/net v0.25.0/go.mod h1:JkAGAh7GEvH74S6FOH42FLoXpXbE/aqXSrIQjXgsiwM= -golang.org/x/net v0.26.0 h1:soB7SVo0PWrY4vPW/+ay0jKDNScG2X9wFeYlXIvJsOQ= -golang.org/x/net v0.26.0/go.mod h1:5YKkiSynbBIh3p6iOc/vibscux0x38BZDkn8sCUPxHE= golang.org/x/net v0.27.0 h1:5K3Njcw06/l2y9vpGCSdcxWOYHOUk3dVNGDXN+FvAys= golang.org/x/net v0.27.0/go.mod h1:dDi0PyhWNoiUOrAS8uXv/vnScO4wnHQO4mj9fn/RytE= golang.org/x/sync v0.0.0-20190423024810-112230192c58/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= golang.org/x/sync v0.0.0-20220722155255-886fb9371eb4/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= golang.org/x/sync v0.1.0/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= -golang.org/x/sys v0.0.0-20190204203706-41f3e6584952/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= +golang.org/x/sync v0.8.0 h1:3NFvSEYkUoMifnESzZl15y791HH1qU2xm6eCJU5ZPXQ= +golang.org/x/sync v0.8.0/go.mod h1:Czt+wKu1gCyEFDUtn0jG5QVvpJ6rzVqr5aXyt9drQfk= golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= golang.org/x/sys v0.0.0-20201119102817-f84b799fce68/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/sys v0.0.0-20210615035016-665e8c7367d1/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= +golang.org/x/sys v0.0.0-20210809222454-d867a43fc93e/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.0.0-20220520151302-bc2c85ada10a/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.0.0-20220722155257-8c9f86f7a55f/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= -golang.org/x/sys v0.4.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.5.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= +golang.org/x/sys v0.6.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.7.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.8.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.17.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA= -golang.org/x/sys v0.19.0 h1:q5f1RH2jigJ1MoAWp2KTp3gm5zAGFUTarQZ5U386+4o= -golang.org/x/sys v0.19.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA= -golang.org/x/sys v0.20.0 h1:Od9JTbYCk261bKm4M/mw7AklTlFYIa0bIp9BgSm1S8Y= golang.org/x/sys v0.20.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA= -golang.org/x/sys v0.21.0 h1:rF+pYz3DAGSQAxAu1CbC7catZg4ebC4UIeIhKxBZvws= -golang.org/x/sys v0.21.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA= -golang.org/x/sys v0.22.0 h1:RI27ohtqKCnwULzJLqkv897zojh5/DwS/ENaMzUOaWI= -golang.org/x/sys v0.22.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA= +golang.org/x/sys v0.24.0 h1:Twjiwq9dn6R1fQcyiK+wQyHWfaz/BJB+YIpzU/Cv3Xg= +golang.org/x/sys v0.24.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA= golang.org/x/term 
v0.0.0-20201126162022-7de9c90e9dd1/go.mod h1:bj7SfCRtBDWHUb9snDiAeCFNEtKQo2Wmx5Cou7ajbmo= golang.org/x/term v0.0.0-20210927222741-03fcf44c2211/go.mod h1:jbD1KX2456YbFQfuXm/mYQcufACuNUgVhRMnK/tPxf8= -golang.org/x/term v0.4.0/go.mod h1:9P2UbLfCdcvo3p/nzKvsmas4TnlujnuoV9hGgYzW1lQ= golang.org/x/term v0.5.0/go.mod h1:jMB1sMXY+tzblOD4FWmEbocvup2/aLOaQEp7JmGp78k= golang.org/x/term v0.7.0/go.mod h1:P32HKFT3hSsZrRxla30E9HqToFYAQPCMs/zFMBUFqPY= golang.org/x/term v0.8.0/go.mod h1:xPskH00ivmX89bAKVGSKKtLOWNx2+17Eiy94tnKShWo= @@ -254,12 +210,9 @@ golang.org/x/term v0.20.0/go.mod h1:8UkIAJTvZgivsXaD6/pH6U9ecQzZ45awqEOzuCvwpFY= golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ= golang.org/x/text v0.3.3/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ= golang.org/x/text v0.3.7/go.mod h1:u+2+/6zg+i71rQMx5EYifcz6MCKuco9NR6JIITiCfzQ= -golang.org/x/text v0.6.0/go.mod h1:mrYo+phRRbMaCq/xk9113O4dZlRixOauAjOtrjsXDZ8= golang.org/x/text v0.7.0/go.mod h1:mrYo+phRRbMaCq/xk9113O4dZlRixOauAjOtrjsXDZ8= golang.org/x/text v0.9.0/go.mod h1:e1OnstbJyHTd6l/uOt8jFFHp6TRDWZR/bV3emEE/zU8= -golang.org/x/text v0.14.0 h1:ScX5w1eTa3QqT8oi6+ziP7dTV1S2+ALU0bI+0zXKWiQ= golang.org/x/text v0.14.0/go.mod h1:18ZOQIKpY8NJVqYksKHtTdi31H5itFRjB5/qKTNYzSU= -golang.org/x/text v0.15.0 h1:h1V/4gjBv8v9cjcR6+AR5+/cIYK5N/WAgiv4xlsEtAk= golang.org/x/text v0.15.0/go.mod h1:18ZOQIKpY8NJVqYksKHtTdi31H5itFRjB5/qKTNYzSU= golang.org/x/text v0.16.0 h1:a94ExnEXNtEwYLGJSIUxnWoxoRz/ZcCsV63ROupILh4= golang.org/x/text v0.16.0/go.mod h1:GhwF1Be+LQoKShO3cGOHzqOgRrGaYc9AvblQOmPVHnI= @@ -270,28 +223,13 @@ golang.org/x/tools v0.0.0-20191119224855-298f0cb1881e/go.mod h1:b+2E5dAYhXwXZwtn golang.org/x/tools v0.1.12/go.mod h1:hNGJHUnrk76NpqgfD5Aqm5Crs+Hm0VOH/i9J2+nxYbc= golang.org/x/tools v0.6.0/go.mod h1:Xwgl3UAJ/d3gWutnCtw505GrjyAbvKui8lOU390QaIU= golang.org/x/xerrors v0.0.0-20190717185122-a985d3407aa7/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= -google.golang.org/genproto v0.0.0-20240227224415-6ceb2ff114de h1:F6qOa9AZTYJXOUEr4jDysRDLrm4PHePlge4v4TGAlxY= -google.golang.org/genproto v0.0.0-20240227224415-6ceb2ff114de/go.mod h1:VUhTRKeHn9wwcdrk73nvdC9gF178Tzhmt/qyaFcPLSo= -google.golang.org/genproto/googleapis/api v0.0.0-20240227224415-6ceb2ff114de h1:jFNzHPIeuzhdRwVhbZdiym9q0ory/xY3sA+v2wPg8I0= -google.golang.org/genproto/googleapis/api v0.0.0-20240227224415-6ceb2ff114de/go.mod h1:5iCWqnniDlqZHrd3neWVTOwvh/v6s3232omMecelax8= google.golang.org/genproto/googleapis/api v0.0.0-20240701130421-f6361c86f094 h1:0+ozOGcrp+Y8Aq8TLNN2Aliibms5LEzsq99ZZmAGYm0= google.golang.org/genproto/googleapis/api v0.0.0-20240701130421-f6361c86f094/go.mod h1:fJ/e3If/Q67Mj99hin0hMhiNyCRmt6BQ2aWIJshUSJw= -google.golang.org/genproto/googleapis/rpc v0.0.0-20240401170217-c3f982113cda h1:LI5DOvAxUPMv/50agcLLoo+AdWc1irS9Rzz4vPuD1V4= -google.golang.org/genproto/googleapis/rpc v0.0.0-20240401170217-c3f982113cda/go.mod h1:WtryC6hu0hhx87FDGxWCDptyssuo68sk10vYjF+T9fY= google.golang.org/genproto/googleapis/rpc v0.0.0-20240701130421-f6361c86f094 h1:BwIjyKYGsK9dMCBOorzRri8MQwmi7mT9rGHsCEinZkA= google.golang.org/genproto/googleapis/rpc v0.0.0-20240701130421-f6361c86f094/go.mod h1:Ue6ibwXGpU+dqIcODieyLOcgj7z8+IcskoNIgZxtrFY= -google.golang.org/grpc v1.63.0 h1:WjKe+dnvABXyPJMD7KDNLxtoGk5tgk+YFWN6cBWjZE8= -google.golang.org/grpc v1.63.0/go.mod h1:WAX/8DgncnokcFUldAxq7GeB5DXHDbMF+lLvDomNkRA= -google.golang.org/grpc v1.63.2 h1:MUeiw1B2maTVZthpU5xvASfTh3LDbxHd6IJ6QQVU+xM= -google.golang.org/grpc v1.63.2/go.mod h1:WAX/8DgncnokcFUldAxq7GeB5DXHDbMF+lLvDomNkRA= 
 google.golang.org/grpc v1.64.0 h1:KH3VH9y/MgNQg1dE7b3XfVK0GsPSIzJwdF617gUSbvY=
 google.golang.org/grpc v1.64.0/go.mod h1:oxjF8E3FBnjp+/gVFYdWacaLDx9na1aqy9oovLpxQYg=
-google.golang.org/protobuf v1.33.0 h1:uNO2rsAINq/JlFpSdYEKIZ0uKD/R9cpdv0T+yoGwGmI=
-google.golang.org/protobuf v1.33.0/go.mod h1:c6P6GXX6sHbq/GpV6MGZEdwhWPcYBgnhAHhKbcUYpos=
 google.golang.org/protobuf v1.34.2 h1:6xV6lTsCfpGD21XK49h7MhtcApnLqkfYgPcdHftf6hg=
 google.golang.org/protobuf v1.34.2/go.mod h1:qYOHts0dSfpeUzUFpOMr/WGzszTmLH+DiWniOlNbLDw=
-gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0=
-gopkg.in/check.v1 v1.0.0-20180628173108-788fd7840127/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0=
-gopkg.in/yaml.v3 v3.0.0-20200313102051-9f266ea9e77c/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM=
 gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA=
 gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM=
diff --git a/host.go b/host.go
new file mode 100644
index 0000000..e0f3d46
--- /dev/null
+++ b/host.go
@@ -0,0 +1,101 @@
+// Copyright 2024 Factorial GmbH. All rights reserved.
+
+package main
+
+import (
+	"crypto/sha256"
+	"net/url"
+	"strings"
+)
+
+// getHostnameFromURL returns the normalized host name from the URL. It will
+// return the naked version of the domain, without a "www." prefix. This can
+// be used for host-specific work queues.
+//
+// TODO: Reconsider usage of this function as it might impose a security risk
+// when used in security-sensitive contexts, i.e. when generating cache keys
+// for caches that might allow access to private hosts or similar.
+func getHostnameFromURL(u string) string {
+	p, err := url.Parse(u)
+	if err != nil {
+		return ""
+	}
+	return strings.TrimPrefix(p.Hostname(), "www.")
+}
+
+func isPrivateHost(h *Host, getAuth GetAuthFn) bool {
+	_, ok := getAuth(h)
+	return ok
+}
+
+func NewHostFromURL(u *url.URL) *Host {
+	return &Host{
+		SerializableHost: SerializableHost{
+			Name: u.Hostname(),
+			Port: u.Port(),
+
+			PreferredScheme: u.Scheme,
+		},
+	}
+}
+
+// Host holds information that is scoped to a single host and is shared among
+// runs, like per host rate limits, robots.txt data, sitemaps, etc.
+type Host struct {
+	SerializableHost
+}
+
+// SerializableHost is a serializable version of the Host struct. It must
+// not include information that must not be shared between runs, such as
+// authentication information.
+type SerializableHost struct {
+	Name string // The Hostname, a FQDN of the host.
+	Port string // The port number, if any.
+
+	PreferredScheme string // Either http or https
+}
+
+type LiveHost struct {
+}
+
+func (h *Host) String() string {
+	if h.Port != "" {
+		return h.Name + ":" + h.Port
+	}
+	return h.Name
+}
+
+// Hash is an alias to HashWithAuth.
+func (h *Host) Hash(getAuth GetAuthFn) []byte {
+	return h.HashWithAuth(getAuth)
+}
+
+// HashWithAuth returns a hash of the host including authentication
+// information, if available. It is used to uniquely identify a host and its
+// configuration. You should usually prefer this hash over HashWithoutAuth()
+// to identify a host, as it is more secure.
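+//
+// For example (illustrative, not part of this change), host-scoped state
+// could be keyed as:
+//
+//	key := fmt.Sprintf("host:%x", h.HashWithAuth(getAuth))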
+// key or ID, if it is okay to have the stored data shared between different
+// runs.
+func (h *Host) HashWithoutAuth() []byte {
+	hash := sha256.New()
+
+	hash.Write([]byte(h.Name))
+	// Scheme is not taken into account as it would still be the same host.
+	hash.Write([]byte(h.Port))
+
+	return hash.Sum(nil)
+}
diff --git a/host_test.go b/host_test.go
new file mode 100644
index 0000000..88b2aa1
--- /dev/null
+++ b/host_test.go
@@ -0,0 +1,19 @@
+package main
+
+import (
+	"net/url"
+	"testing"
+)
+
+// Verify that passing a URL with a port to NewHostFromURL works as expected and
+// that the port is correctly set.
+func TestNewHostFromURL(t *testing.T) {
+	u, err := url.Parse("http://example.com:8080")
+	if err != nil {
+		t.Fatal(err)
+	}
+	h := NewHostFromURL(u)
+	if h.Port != "8080" {
+		t.Errorf("Expected port to be '8080', got '%s'", h.Port)
+	}
+}
diff --git a/httpclient.go b/httpclient.go
index 69bacd8..2330d02 100644
--- a/httpclient.go
+++ b/httpclient.go
@@ -6,41 +6,127 @@ import (
 	"context"
 	"log/slog"
 	"net/http"
+	"net/url"
+	"os"
 	"path/filepath"
 	"time"
 
 	"github.com/gregjones/httpcache"
 	"github.com/gregjones/httpcache/diskcache"
+	"github.com/hashicorp/go-retryablehttp"
 	"github.com/peterbourgon/diskv"
 	"go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp"
 )
 
-type GetAuthHeaderFn func(ctx context.Context, host string) (string, bool)
+// https://www.mattcutts.com/blog/crawl-caching-proxy/
 
-// CreateCrawlerHTTPClient creates a new HTTP client configured and optimized for use
-// in crawling actions.
-func CreateCrawlerHTTPClient(getAuthHeader GetAuthHeaderFn) *http.Client {
-	var t http.RoundTripper
-
-	//
-	// -> OtelTransport -> AuthTransport -> UserAgentTransport -> CachingTransport -> DefaultTransport
-	//
+var (
+	// HTTPCacheDisk holds a diskcache.Cache - unless SkipCache is true -
+	// that is used by the HTTP client to cache responses. It is exposed as a
+	// variable to allow for invalidation of the cache.
+	HTTPCacheDisk *diskcache.Cache
+)
 
-	if SkipCache {
-		t = http.DefaultTransport
-	} else {
-		cachedir, _ := filepath.Abs(CachePath)
-		tempdir, _ := filepath.Abs(CacheTempPath)
+func init() {
+	if !SkipCache {
+		cachedir, _ := filepath.Abs(HTTPCachePath)
+		tempdir, _ := filepath.Abs(os.TempDir())
 
-		cachedisk := diskv.New(diskv.Options{
+		HTTPCacheDisk = diskcache.NewWithDiskv(diskv.New(diskv.Options{
 			BasePath:     cachedir,
 			TempDir:      tempdir,
 			CacheSizeMax: 1000 * 1024 * 1024, // 1GB
-		})
-		t = httpcache.NewTransport(diskcache.NewWithDiskv(cachedisk))
+		}))
+	}
+}
+
+// GetAuthFn returns the AuthConfig - if any - for the given Host. The second
+// return value indicates if the host was found in the configuration. If the
+// host was not found the caller should not add the Authorization header to the
+// request, as none is needed.
+type GetAuthFn func(*Host) (*AuthConfig, bool)
+
+// NoAuthFn is a GetAuthFn that always returns nil, false. It can be used
+// when no authentication is required, e.g. in testing.
+func NoAuthFn(*Host) (*AuthConfig, bool) {
+	return nil, false
+}
+
+// getAuthHeaderFn returns the Authorization header for the given URL.
+type getAuthHeaderFn func(context.Context, *url.URL) (string, bool)
+
+// CreateCrawlerHTTPClient creates a new HTTP client configured and optimized for use
+// in crawling actions. It adds caching, tracing, metrics, and authentication support.
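+//
+// A minimal usage sketch (the URL is illustrative, NoAuthFn is the no-op auth
+// lookup defined above):
+//
+//	client := CreateCrawlerHTTPClient(NoAuthFn)
+//	res, err := client.Get("https://example.org/")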
+func CreateCrawlerHTTPClient(getAuth GetAuthFn) *http.Client {
+	return &http.Client{
+		Timeout:   10 * time.Second,
+		Transport: withMiddlewares(http.DefaultTransport, getAuth),
+	}
+}
+
+func CreateRetryingHTTPClient(getAuth GetAuthFn) *http.Client {
+	rc := retryablehttp.NewClient()
+
+	// Fail a little quicker, as the caller might block until
+	// the request is done.
+	rc.RetryMax = 2
+
+	// Pass the last response we got back to the caller, otherwise the caller
+	// would get a nil response. This allows the surrounding code to
+	// react on the status code.
+	rc.ErrorHandler = retryablehttp.PassthroughErrorHandler
+
+	return &http.Client{
+		Timeout: 10 * time.Second,
+		Transport: withMiddlewares(&retryablehttp.RoundTripper{
+			Client: rc,
+		}, getAuth),
+	}
+}
+
+// withMiddlewares adds additional transports to the provided transport, usually this is http.DefaultTransport.
+//
+// The order in which the transports are layered on top of each other is important:
+//
+//	[request initiated by client]
+//	-> OtelTransport
+//	-> AuthTransport
+//	-> UserAgentTransport
+//	-> CachingTransport
+//	-> t (usually http.DefaultTransport)
+//	[endpoint]
+func withMiddlewares(t http.RoundTripper, getAuth GetAuthFn) http.RoundTripper {
+	if !SkipCache {
+		// Adds caching support to the client. Please note that the cache is a
+		// private cache and will store responses that required authentication
+		// as well.
+		//
+		// TODO: This is currently treated as a public cache, although it is a private one. Runs that don't provide
+		// authentication may still access cached responses that required authentication.
+		//
+		// We should either never cache responses that required authentication or include the Authorization
+		// headers' contents in the cache key. This would require a custom cache implementation.
+		//
+		// https://developer.mozilla.org/en-US/docs/Web/HTTP/Headers/Cache-Control
+		t = &httpcache.Transport{
+			Transport:           t,
+			Cache:               HTTPCacheDisk,
+			MarkCachedResponses: true,
+		}
+	}
+
+	t = &AuthTransport{
+		Transport: t,
+		getHeaderFn: func(ctx context.Context, u *url.URL) (string, bool) {
+			h := NewHostFromURL(u)
+
+			ac, ok := getAuth(h)
+			if !ok {
+				return "", false
+			}
+			return ac.GetHeader()
+		},
+	}
+
+	// Add User-Agent to the transport; these headers should be added
+	// before going through the caching transport.
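+	// By the time a request reaches the caching transport below, the final
+	// User-Agent header is therefore already set.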
@@ -51,20 +137,16 @@ func CreateCrawlerHTTPClient(getAuthHeader GetAuthHeaderFn) *http.Client { if UseMetrics || UseTracing { t = otelhttp.NewTransport(t) } - - return &http.Client{ - Timeout: 10 * time.Second, - Transport: t, - } + return t } type AuthTransport struct { Transport http.RoundTripper - getHeaderFn GetAuthHeaderFn + getHeaderFn getAuthHeaderFn } func (t *AuthTransport) RoundTrip(req *http.Request) (*http.Response, error) { - header, ok := t.getHeaderFn(req.Context(), req.Host) + header, ok := t.getHeaderFn(req.Context(), req.URL) if ok { slog.Debug("Client: Adding Authorization header to request.", "host", req.Host) req.Header.Add("Authorization", header) diff --git a/httpclient_test.go b/httpclient_test.go index 6580b61..b8ff49f 100644 --- a/httpclient_test.go +++ b/httpclient_test.go @@ -5,13 +5,15 @@ package main import ( "context" "net/http" + "net/http/httptest" + "net/url" "testing" ) func TestAuthTransportAddsAuthorizationHeader(t *testing.T) { transport := &AuthTransport{ Transport: http.DefaultTransport, - getHeaderFn: func(ctx context.Context, host string) (string, bool) { + getHeaderFn: func(ctx context.Context, u *url.URL) (string, bool) { return "
", true }, } @@ -23,3 +25,26 @@ func TestAuthTransportAddsAuthorizationHeader(t *testing.T) { t.Errorf("got: %q", h) } } + +func TestRetryingHTTPClientReturnsResponseOn503(t *testing.T) { + // Start a test HTTP server that only returns 503. + errserver := httptest.NewServer(http.HandlerFunc(func(rw http.ResponseWriter, req *http.Request) { + rw.WriteHeader(503) + // Write something to avoid empty body. + rw.Write([]byte("Service Unavailable")) + })) + defer errserver.Close() + + client := CreateRetryingHTTPClient(NoAuthFn) + + resp, err := client.Get(errserver.URL) + if err != nil { + t.Log(err) + } + if resp == nil { + t.Fatal("expected response") + } + if resp.StatusCode != 503 { + t.Errorf("got: %d", resp.StatusCode) + } +} diff --git a/internal/collector/collector.go b/internal/collector/collector.go index 39f4e48..12ec425 100644 --- a/internal/collector/collector.go +++ b/internal/collector/collector.go @@ -19,14 +19,9 @@ import ( whatwgUrl "github.com/nlnwa/whatwg-url/url" ) -const ( - FlagNone uint8 = 1 << iota - FlagInternal -) - -type EnqueueFn func(ctx context.Context, c *Collector, u string, flags uint8) error // Enqueues a scrape. -type CollectFn func(ctx context.Context, c *Collector, res *Response, flags uint8) // Collects the result of a scrape. -type RobotCheckFn func(agent string, u string) (bool, error) // Checks if a URL is allowed by robots.txt. +type EnqueueFn func(ctx context.Context, c *Collector, u string) error // Enqueues a scrape. +type CollectFn func(ctx context.Context, c *Collector, res *Response) // Collects the result of a scrape. +type RobotCheckFn func(agent string, u string) (ok bool, err error) // Checks if a URL is allowed by robots.txt. ok is true if allowed. var WellKnownFiles = []string{"robots.txt", "sitemap.xml", "sitemap_index.xml"} @@ -76,22 +71,11 @@ func NewCollector( if !strings.HasPrefix(u.Scheme, "http") { return } - // We currently assume none internal / well known files are discovered via links. - enqueue(ctx, c, u.String(), FlagNone) + enqueue(ctx, c, u.String()) }) c.OnScraped(func(ctx context.Context, res *Response) { - collect(ctx, c, res, FlagNone) - }) - - // Resolve linked sitemaps. 
- c.OnXML("//sitemap/loc", func(ctx context.Context, e *XMLElement) { - slog.Info("Sitemaps: Found linked sitemap.", "url", e.Text) - enqueue(ctx, c, e.Text, FlagInternal) - }) - c.OnXML("//urlset/url/loc", func(ctx context.Context, e *XMLElement) { - slog.Info("Sitemaps: Found URL in sitemap.", "url", e.Text) - enqueue(ctx, c, e.Text, FlagNone) + collect(ctx, c, res) }) c.OnError(func(res *Response, err error) { @@ -150,25 +134,21 @@ type Collector struct { } func (c *Collector) Enqueue(rctx context.Context, u string) error { - return c.enqueueFn(rctx, c, u, 0) + return c.enqueueFn(rctx, c, u) } -func (c *Collector) EnqueueWithFlags(rctx context.Context, u string, flags uint8) error { - return c.enqueueFn(rctx, c, u, flags) -} - -func (c *Collector) Visit(rctx context.Context, URL string) error { +func (c *Collector) Visit(rctx context.Context, URL string) (*Response, error) { return c.scrape(rctx, URL, "GET", 1, nil, nil) } -func (c *Collector) scrape(rctx context.Context, u, method string, depth int, requestData io.Reader, hdr http.Header) error { +func (c *Collector) scrape(rctx context.Context, u, method string, depth int, requestData io.Reader, hdr http.Header) (*Response, error) { parsedWhatwgURL, err := urlParser.Parse(u) if err != nil { - return err + return nil, err } parsedURL, err := url.Parse(parsedWhatwgURL.Href(false)) if err != nil { - return err + return nil, err } if hdr == nil { hdr = http.Header{} @@ -176,9 +156,9 @@ func (c *Collector) scrape(rctx context.Context, u, method string, depth int, re if _, ok := hdr["User-Agent"]; !ok { hdr.Set("User-Agent", c.UserAgent) } - req, err := http.NewRequest(method, parsedURL.String(), requestData) + req, err := http.NewRequestWithContext(rctx, method, parsedURL.String(), requestData) if err != nil { - return err + return nil, err } req.Header = hdr // The Go HTTP API ignores "Host" in the headers, preferring the client @@ -186,17 +166,13 @@ func (c *Collector) scrape(rctx context.Context, u, method string, depth int, re if hostHeader := hdr.Get("Host"); hostHeader != "" { req.Host = hostHeader } - // note: once 1.13 is minimum supported Go version, - // replace this with http.NewRequestWithContext - req = req.WithContext(rctx) - if err := c.requestCheck(parsedURL, method, req.GetBody, depth); err != nil { - return err + if err := c.requestCheck(req.URL, req.Method, req.GetBody, depth); err != nil { + return nil, err } - u = parsedURL.String() - return c.fetch(rctx, u, method, depth, requestData, hdr, req) + return c.fetch(rctx, req.URL.String(), req.Method, depth, requestData, hdr, req) } -func (c *Collector) fetch(rctx context.Context, u, method string, depth int, requestData io.Reader, hdr http.Header, req *http.Request) error { +func (c *Collector) fetch(rctx context.Context, u, method string, depth int, requestData io.Reader, hdr http.Header, req *http.Request) (*Response, error) { request := &Request{ URL: req.URL, Headers: &req.Header, @@ -214,7 +190,7 @@ func (c *Collector) fetch(rctx context.Context, u, method string, depth int, req c.handleOnRequest(request) if request.abort { - return nil + return nil, nil } if method == "POST" && req.Header.Get("Content-Type") == "" { @@ -236,14 +212,14 @@ func (c *Collector) fetch(rctx context.Context, u, method string, depth int, req request.ProxyURL = proxyURL } if err := c.handleOnError(response, err, request); err != nil { - return err + return response, err } response.Request = request err = response.fixCharset(c.DetectCharset, request.ResponseCharacterEncoding) if err != nil { 
- return err + return response, err } c.handleOnResponse(response) @@ -264,7 +240,7 @@ func (c *Collector) fetch(rctx context.Context, u, method string, depth int, req c.handleOnScraped(rctx, response) - return err + return response, err } func (c *Collector) requestCheck(parsedURL *url.URL, method string, getBody func() (io.ReadCloser, error), depth int) error { @@ -283,25 +259,20 @@ func (c *Collector) CheckRedirectFunc() func(req *http.Request, via []*http.Requ // Enqueue the new redirect target URL, ensure when the HTTP request is // cancelled, the queuing is not also cancelled. - c.EnqueueWithFlags(context.WithoutCancel(req.Context()), req.URL.String(), FlagNone) + c.Enqueue(context.WithoutCancel(req.Context()), req.URL.String()) // This URL was not processed, so we do not want to follow the redirect. return http.ErrUseLastResponse } } -func (c *Collector) IsVisitAllowed(in string, flags uint8) (bool, error) { +func (c *Collector) IsVisitAllowed(in string) (bool, error) { p, err := url.Parse(in) if err != nil { slog.Error("url parse error", in, "") return false, ErrCheckInternal } - // Internal URLs are always allowed. - if flags&FlagInternal != 0 { - return true, nil - } - checkDomain := func(u *url.URL) bool { // Ensure there is at least one domain in the allowlist. Do not treat an // empty allowlist as a wildcard. diff --git a/internal/collector/errors.go b/internal/collector/errors.go index c12a4d9..f1c8247 100644 --- a/internal/collector/errors.go +++ b/internal/collector/errors.go @@ -6,7 +6,9 @@ package collector -import "errors" +import ( + "errors" +) var ( ErrCheckInternal = errors.New("Internal check error") diff --git a/internal/collector/httpbackend.go b/internal/collector/httpbackend.go index 90d3d3c..4ccce8f 100644 --- a/internal/collector/httpbackend.go +++ b/internal/collector/httpbackend.go @@ -10,6 +10,7 @@ import ( "io" "net/http" "strings" + "time" "compress/gzip" ) @@ -26,6 +27,8 @@ func (h *HTTPBackend) WithCheckRedirect(fn checkRedirectFunc) { } func (h *HTTPBackend) Do(request *http.Request, bodySize int, checkHeadersFunc checkHeadersFunc) (*Response, error) { + start := time.Now() + res, err := h.Client.Do(request) if err != nil { return nil, err @@ -62,5 +65,6 @@ func (h *HTTPBackend) Do(request *http.Request, bodySize int, checkHeadersFunc c StatusCode: res.StatusCode, Body: body, Headers: &res.Header, + Took: time.Since(start), }, nil } diff --git a/internal/collector/request.go b/internal/collector/request.go index e0ec5e7..90a243a 100644 --- a/internal/collector/request.go +++ b/internal/collector/request.go @@ -102,12 +102,12 @@ func (r *Request) AbsoluteURL(u string) string { // Visit continues Collector's collecting job by creating a // request and preserves the Context of the previous request. 
// Visit also calls the previously provided callbacks -func (r *Request) Visit(rctx context.Context, URL string) error { +func (r *Request) Visit(rctx context.Context, URL string) (*Response, error) { return r.collector.scrape(rctx, r.AbsoluteURL(URL), "GET", r.Depth+1, nil, nil) } // Do submits the request -func (r *Request) Do() error { +func (r *Request) Do() (*Response, error) { return r.collector.scrape(context.TODO(), r.URL.String(), r.Method, r.Depth, r.Body, *r.Headers) } diff --git a/internal/collector/response.go b/internal/collector/response.go index 9695d23..a3cfe4c 100644 --- a/internal/collector/response.go +++ b/internal/collector/response.go @@ -14,6 +14,7 @@ import ( "net/http" "os" "strings" + "time" "github.com/saintfish/chardet" "golang.org/x/net/html/charset" @@ -29,6 +30,9 @@ type Response struct { Request *Request // Headers contains the Response's HTTP headers Headers *http.Header + + // How long it took to perform a request and get this response. + Took time.Duration } // Save writes response body to disk diff --git a/limiter.go b/limiter.go deleted file mode 100644 index bff930b..0000000 --- a/limiter.go +++ /dev/null @@ -1,67 +0,0 @@ -// Copyright 2024 Factorial GmbH. All rights reserved. - -package main - -import ( - "context" - "log/slog" - "net/url" - "strings" - "sync" - "time" - - "github.com/go-redis/redis_rate/v10" - "github.com/redis/go-redis/v9" - xrate "golang.org/x/time/rate" -) - -var ( - // memoryLimiters is a map of hostnames to rate limiters. - memoryLimiters map[string]*xrate.Limiter - memoryLimitersLock sync.Mutex - - redisLimiter *redis_rate.Limiter -) - -type LimiterAllowFn func(url string) (hasReservation bool, retryAfter time.Duration, err error) - -func CreateLimiter(ctx context.Context, redis *redis.Client, ratePerS int) LimiterAllowFn { - host := func(u string) string { - p, err := url.Parse(u) - if err != nil { - slog.Warn("Limiter: Failed to parse URL for host.", "url", u, "error", err) - return "" - } - return strings.TrimPrefix(p.Hostname(), "www.") - } - - if redis != nil { - slog.Debug("Using distributed rate limiter...") - redisLimiter = redis_rate.NewLimiter(redis) - - return func(url string) (bool, time.Duration, error) { - res, err := redisLimiter.Allow(ctx, host(url), redis_rate.PerSecond(ratePerS)) - return res.Allowed > 0, res.RetryAfter, err - } - } - - slog.Debug("Using in-memory rate limiter...") - memoryLimiters = make(map[string]*xrate.Limiter) - - return func(url string) (bool, time.Duration, error) { - key := host(url) - - var memoryLimiter *xrate.Limiter - memoryLimitersLock.Lock() - if v, ok := memoryLimiters[key]; ok { - memoryLimiter = v - } else { - memoryLimiter = xrate.NewLimiter(xrate.Limit(ratePerS), 1) - memoryLimiters[key] = memoryLimiter - } - memoryLimitersLock.Unlock() - - r := memoryLimiter.Reserve() - return r.OK(), r.Delay(), nil - } -} diff --git a/main.go b/main.go index 8302bda..fb8611b 100644 --- a/main.go +++ b/main.go @@ -17,37 +17,40 @@ import ( "strings" "time" + "github.com/prometheus/client_golang/prometheus/promhttp" "go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp" _ "go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp" "go.opentelemetry.io/contrib/instrumentation/runtime" ) +// These variables can be controlled via environment variables. var ( // Debug enables or disables debug mode, this can be controlled // via the environment variable TOBEY_DEBUG. Debug = false - // SkipCache disables caching when true. 
It can be controlled via the TOBEY_SKIP_CACHE environment variable.
+	// SkipCache disables caching when true. It can be controlled via the
+	// TOBEY_SKIP_CACHE environment variable.
 	SkipCache = false
 
 	// These can be controlled via the TOBEY_TELEMETRY environment variable.
 	UseTracing = false
 	UseMetrics = false
+	UsePulse   = false // High Frequency Metrics can be enabled by adding "pulse".
 
-	// MaxRequestsPerSecond specifies the maximum number of requests per second
-	// that are exectuted against a single host. Can be controlled via the TOBEY_REQS_PER_S environment variable.
-	MaxRequestsPerSecond int = 2
-)
+	// ListenHost is the host where the main HTTP server listens and the API is served,
+	// this can be controlled via the TOBEY_HOST environment variable. An empty
+	// string means "listen on all interfaces".
+	ListenHost string = ""
 
-const (
-	// The port where the main HTTP server listens and the API is served.
+	// The port where the main HTTP server listens and the API is served, this can
+	// be controlled via the TOBEY_PORT environment variable.
 	ListenPort int = 8080
+)
 
-	// The port where to ping for healthcheck.
-	HealthcheckListenPort int = 10241
-
+const (
 	// NumVisitWorkers hard codes the number of workers we start at startup.
-	NumVisitWorkers int = 10
+	NumVisitWorkers int = 5
 
 	// MaxParallelRuns specifies how many collectors we keep in memory, and thus
 	// limits the maximum number of parallel runs that we can perform.
@@ -58,16 +61,23 @@ const (
 	// sensitive information, so we should not keep it around for too long.
 	RunTTL = 24 * time.Hour
 
+	// HostTTL specifies the maximum time a host is kept in memory. After this
+	// time the host is evicted from memory and the cache. The TTL defaults to 365 days.
+	HostTTL = 365 * 24 * time.Hour
+
 	// UserAgent to be used with all HTTP requests. The value is set to a
 	// backwards compatible one. Some sites allowlist this specific user agent.
 	UserAgent = "WebsiteStandardsBot/1.0"
-)
 
-var (
-	// CachePath is the absolute or relative path (to the working directory) where we store the cache.
-	CachePath = "./cache"
+	// HTTPCachePath is the absolute or relative path (to the working
+	// directory) where we store the cache for HTTP responses.
+	HTTPCachePath = "./cache"
 
-	CacheTempPath = os.TempDir()
+	// The port to ping for healthchecks.
+	HealthcheckListenPort int = 10241
+
+	// PulseEndpoint is the endpoint where we send the high frequency metrics.
+	PulseEndpoint = "http://localhost:8090"
 )
 
 func configure() {
@@ -79,16 +89,6 @@ func configure() {
 		slog.Info("Skipping cache.")
 	}
 
-	if os.Getenv("TOBEY_REQS_PER_S") != "" {
-		n, err := strconv.Atoi(os.Getenv("TOBEY_REQS_PER_S"))
-		if err != nil {
-			panic(err)
-		} else {
-			MaxRequestsPerSecond = n
-			slog.Info("Setting MaxRequestsPerSecond.", "value", MaxRequestsPerSecond)
-		}
-	}
-
 	v := os.Getenv("TOBEY_TELEMETRY")
 	if strings.Contains(v, "traces") || strings.Contains(v, "tracing") {
 		UseTracing = true
@@ -98,6 +98,21 @@ func configure() {
 		UseMetrics = true
 		slog.Info("Metrics enabled.")
 	}
+	if strings.Contains(v, "pulse") {
+		UsePulse = true
+		slog.Info("High Frequency Metrics (Pulse) enabled.")
+	}
+
+	if v := os.Getenv("TOBEY_HOST"); v != "" {
+		ListenHost = v
+	}
+	if v := os.Getenv("TOBEY_PORT"); v != "" {
+		p, err := strconv.Atoi(v)
+		if err != nil {
+			panic(err)
+		}
+		ListenPort = p
+	}
 }
 
 func main() {
@@ -112,30 +127,34 @@ func main() {
 
 	// This sets up the main process context.
ctx, stop := signal.NotifyContext(context.Background(), os.Interrupt) + var err error // Setup Opentelemetry - //todo add opentelemetry logging - shutdown, erro := setupOTelSDK(ctx) - if erro != nil { - panic("ahh") + // TODO: Add opentelemetry logging + shutdown, err := setupOTelSDK(ctx) + if err != nil { + panic(err) } - err := runtime.Start(runtime.WithMinimumReadMemStatsInterval(time.Second)) + err = runtime.Start(runtime.WithMinimumReadMemStatsInterval(time.Second)) if err != nil { log.Fatal(err) } - redisconn, err := maybeRedis(ctx) - if err != nil { - panic(err) + if UsePulse { + startPulse(ctx) } - rabbitmqconn, err := maybeRabbitMQ(ctx) + + redisconn, err := maybeRedis(ctx) if err != nil { panic(err) } - runs := NewRunManager(redisconn) + robots := NewRobots() + sitemaps := NewSitemaps(robots) // Sitemaps will use Robots to discover sitemaps. + + runs := NewRunManager(redisconn, robots, sitemaps) - queue := CreateWorkQueue(rabbitmqconn) - if err := queue.Open(); err != nil { + queue := CreateWorkQueue(redisconn) + if err := queue.Open(ctx); err != nil { panic(err) } @@ -148,13 +167,10 @@ func main() { progress := MustStartProgressFromEnv(ctx) - limiter := CreateLimiter(ctx, redisconn, MaxRequestsPerSecond) - workers := CreateVisitWorkersPool( ctx, NumVisitWorkers, runs, - limiter, queue, progress, hooks, @@ -241,7 +257,7 @@ func main() { // we start publishing to the work queue. runs.Add(ctx, run) - run.Start(reqctx, queue, progress, hooks, req.GetURLs(true)) + go run.Start(reqctx, queue, progress, hooks, req.GetURLs(true)) result := &APIResponse{ Run: run.ID, @@ -250,9 +266,13 @@ func main() { json.NewEncoder(w).Encode(result) }) - slog.Info("Starting HTTP API server...", "port", ListenPort) + if UseMetrics { + apirouter.Handle("GET /metrics", promhttp.Handler()) + } + + slog.Info("Starting HTTP API server...", "host", ListenHost, "port", ListenPort) apiserver := &http.Server{ - Addr: fmt.Sprintf(":%d", ListenPort), + Addr: fmt.Sprintf("%s:%d", ListenHost, ListenPort), Handler: otelhttp.NewHandler(apirouter, "get_new_request"), } go func() { @@ -307,9 +327,6 @@ func main() { if redisconn != nil { redisconn.Close() } - if rabbitmqconn != nil { - rabbitmqconn.Close() - } shutdown(ctx) } diff --git a/observe.go b/observe.go new file mode 100644 index 0000000..70b58b5 --- /dev/null +++ b/observe.go @@ -0,0 +1,46 @@ +// Copyright 2024 Factorial GmbH. All rights reserved. + +package main + +import ( + "context" + "net/http" + "strconv" + "strings" + "sync/atomic" + "time" + + "github.com/prometheus/client_golang/prometheus" + "github.com/prometheus/client_golang/prometheus/promauto" +) + +// Metrics exposed for collection by Prometheus. +var ( + PromVisitTxns = promauto.NewCounter(prometheus.CounterOpts{ + Name: "visits_txns_total", + Help: "The total number of visits.", + }) +) + +// High Frequency metrics, these should be mutated through atomic operations. +var ( + PulseVisitTxns int32 // A gauge that is reset every second. +) + +// startPulse starts a go routine that pushes updates to the pulse endpoint. 
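+//
+// Every tick the counter below is flushed and sent as a plain-text integer,
+// e.g. a sampled value of 42 results in a POST to <PulseEndpoint>/rps with
+// the body "42".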
+func startPulse(ctx context.Context) {
+	ticker := time.NewTicker(1 * time.Second)
+
+	go func() {
+		for {
+			select {
+			case <-ctx.Done():
+				ticker.Stop()
+				return
+			case <-ticker.C:
+				v := atomic.LoadInt32(&PulseVisitTxns)
+				atomic.StoreInt32(&PulseVisitTxns, 0)
+
+				rb := strings.NewReader(strconv.Itoa(int(v)))
+				http.Post(PulseEndpoint+"/rps", "text/plain", rb)
+			}
+		}
+	}()
+}
diff --git a/otel.go b/otel.go
index 8afc5ca..fdd3f62 100644
--- a/otel.go
+++ b/otel.go
@@ -6,6 +6,7 @@ import (
 	"context"
 	"errors"
 	"fmt"
+	"os"
 	"time"
 
 	"go.opentelemetry.io/otel"
@@ -68,7 +69,7 @@ func setupOTelSDK(ctx context.Context) (shutdown func(context.Context) error, er
 	prop := newPropagator()
 	otel.SetTextMapPropagator(prop)
 
-	if UseTracing {
+	if UseTracing && os.Getenv("OTEL_EXPORTER_OTLP_TRACES_ENDPOINT") != "" {
 		// Set up trace provider.
 		tracerProvider, erro := newTraceProvider(ctx)
 		if erro != nil {
@@ -80,7 +81,7 @@ func setupOTelSDK(ctx context.Context) (shutdown func(context.Context) error, er
 		otel.SetTracerProvider(tracerProvider)
 	}
 
-	if UseMetrics {
+	if UseMetrics && os.Getenv("OTEL_EXPORTER_OTLP_METRICS_ENDPOINT") != "" {
 		// Set up meter provider.
 		meterProvider, erra := newMeterProvider(ctx)
 		if erra != nil {
diff --git a/progress.go b/progress.go
index 96cc2ab..fcebfeb 100644
--- a/progress.go
+++ b/progress.go
@@ -18,7 +18,7 @@ import (
 )
 
 const (
-	ProgressStage = "crawler"
+	ProgressDefaultStage = "crawler"
 
 	ProgressEndpointUpdate     = "api/status/update"
 	ProgressEndpointTransition = "api/status/transition-to"
@@ -34,6 +34,25 @@ const (
 	ProgressStateCancelled = "cancelled"
 )
 
+type Progressor struct {
+	manager Progress
+
+	Run string
+	URL string
+}
+
+func (p *Progressor) Update(ctx context.Context, status string) error {
+	return p.manager.Update(ProgressUpdateMessagePackage{
+		ctx: ctx,
+		payload: ProgressUpdateMessage{
+			Stage:  ProgressDefaultStage,
+			Status: status,
+			Run:    p.Run,
+			URL:    p.URL,
+		},
+	})
+}
+
 type ProgressUpdateMessagePackage struct {
 	ctx     context.Context
 	payload ProgressUpdateMessage
@@ -43,7 +62,7 @@ type ProgressUpdateMessage struct {
 	Stage  string `json:"stage"`
 	Status string `json:"status"` // Only constants allowed here.
 	Run    string `json:"run_uuid"` // uuid of the run
-	Url    string `json:"url"`
+	URL    string `json:"url"`
 }
 
 type ProgressManager struct {
@@ -79,6 +98,7 @@ func NewProgressManager() *ProgressManager {
 type Progress interface {
 	Update(update_message ProgressUpdateMessagePackage) error
 	Close() error
+	With(run string, url string) *Progressor
 }
 
 func (w *ProgressManager) startHandle(ctx context.Context, progressQueue chan ProgressUpdateMessagePackage, pnumber int) {
@@ -113,7 +133,7 @@ func (w *ProgressManager) startHandle(ctx context.Context, progressQueue chan Pr
 			if err != nil {
 				wlogger.Error("Progress Dispatcher: Sending update ultimately failed.", "error", err)
 			} else {
-				wlogger.Debug("Progress Dispatcher: Update succesfully sent.", "url", result.Url)
+				wlogger.Debug("Progress Dispatcher: Update successfully sent.", "url", result.URL)
 			}
 
 			parentSpan.End()
@@ -122,7 +142,7 @@
 }
 
 func (w *ProgressManager) sendProgressUpdate(ctx context.Context, msg ProgressUpdateMessage) error {
-	logger := slog.With("url", msg.Url, "status", msg.Status, "run", msg.Run)
+	logger := slog.With("url", msg.URL, "status", msg.Status, "run", msg.Run)
 	logger.Debug("Progress Dispatcher: Sending progress update...")
 
 	ctx_send_webhook, span := tracer.Start(ctx, "handle.progress.queue.send")
@@ -131,7 +151,7 @@ func (w *ProgressManager) sendProgressUpdate(ctx context.Context, msg ProgressUp
 	url := fmt.Sprintf("%v/%v", w.apiURL, ProgressEndpointUpdate)
 
 	span.SetAttributes(attribute.String("API_URL", url))
-	span.SetAttributes(attribute.String("url", msg.Url))
+	span.SetAttributes(attribute.String("url", msg.URL))
 	span.SetAttributes(attribute.String("status_update", msg.Status))
 
 	body, err := json.Marshal(msg)
@@ -199,6 +219,14 @@ func (p *NoopProgress) Close() error {
 	return nil
 }
 
+func (p *NoopProgress) With(run string, url string) *Progressor {
+	return &Progressor{
+		manager: p,
+		Run:     run,
+		URL:     url,
+	}
+}
+
 type BaseProgress struct {
 	progressQueue chan ProgressUpdateMessagePackage
 }
@@ -208,6 +236,14 @@ func (p *BaseProgress) Update(update_message ProgressUpdateMessagePackage) error
 	return nil
 }
 
+func (p *BaseProgress) With(run string, url string) *Progressor {
+	return &Progressor{
+		manager: p,
+		Run:     run,
+		URL:     url,
+	}
+}
+
 func (p *BaseProgress) Close() error {
 	close(p.progressQueue)
 	return nil
diff --git a/prometheus.yml b/prometheus.yml
new file mode 100644
index 0000000..8ef7b5d
--- /dev/null
+++ b/prometheus.yml
@@ -0,0 +1,6 @@
+scrape_configs:
+- job_name: tobey
+  scrape_interval: 1s
+  static_configs:
+  - targets:
+    - tobey:8080
diff --git a/robots.go b/robots.go
index c23851e..1f7587f 100644
--- a/robots.go
+++ b/robots.go
@@ -3,103 +3,137 @@
 package main
 
 import (
+	"errors"
+	"fmt"
 	"log/slog"
 	"net/http"
 	"net/url"
+	"strings"
 	"sync"
 
 	"github.com/temoto/robotstxt"
)
 
+func isProbablyRobots(url string) bool {
+	return strings.HasSuffix(url, "/robots.txt")
+}
+
 type RobotCheckFn func(agent string, u string) (bool, error)
 
+var (
+	// ErrRobotsUnavailable is returned when the robots.txt file is unavailable.
+	// Callers should decide themselves whether to allow or disallow the URL in
+	// this case.
+	ErrRobotsUnavailable = errors.New("robots.txt file is unavailable")
+)
+
 // Robots is a simple wrapper around robotstxt.RobotsData that caches the
-// robots.txt file for each host.
+// robots.txt file for each host in memory. It is meant to be set up once,
+// with the instance then passed into each Run.
+//
+// Robots is a store that provides information on what URLs are allowed to be
+// fetched by a given user agent on a per host basis.
+//
+// Results from a fetch are not shared across workers, so we might end up
+// fetching the same robots.txt file multiple times. This compromise is made in
+// order to keep the implementation simple and to avoid the need for dedicated
+// worker pools and work queues.
+//
+// Robots will blindly issue fetch requests for the control file, and not check
+// rate limit information prior. When a fetch request is denied by the rate
+// limit, Robots will retry until it succeeds. It is assumed that such fetch
+// requests have a high priority. Other requests, such as regular visit
+// requests to the same host, will be delayed until the rate limit is lifted.
+//
+// It expects the provided HTTP client to perform any necessary authentication
+// and caching. In addition to caching at the HTTP layer, Robots caches the
+// parsed robots.txt files in memory for a certain time.
 //
-// TODO: Need to handle refreshing and expiry of robot.txt files.
-// TODO: This is not shared across workers, so we might end up fetching the same robots.txt file multiple times.
+// FIXME: Need to handle forced expiry of robots.txt files from memory.
 type Robots struct {
 	sync.RWMutex
 
-	client *http.Client
-
+	// data is a map of Host IDs to the parsed robots.txt data.
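+	// Keys are the hex-encoded host hashes, including authentication
+	// information, see get().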
+	data map[string]*robotstxt.RobotsData
 }
 
-// We copy the http.client here so that it is not the same as in the collector,
-// as the collector changes the state and prohibits redirects.
-func NewRobots(client *http.Client) *Robots {
+func NewRobots() *Robots {
 	return &Robots{
-		client: client,
-		data:   make(map[string]*robotstxt.RobotsData),
+		data: make(map[string]*robotstxt.RobotsData),
 	}
 }
 
-func (r *Robots) Check(agent string, u string) (bool, error) {
-	robot, err := r.get(u)
-	if err != nil {
-		return false, err
-	}
-
+// Check checks whether the given URL is allowed to be fetched by the given user agent.
+func (r *Robots) Check(u string, getAuth GetAuthFn, agent string) (bool, error) {
 	p, err := url.Parse(u)
 	if err != nil {
 		return false, err
 	}
 
-	group := robot.FindGroup(agent)
-	if group == nil {
-		return true, nil
-	}
-
-	eu := p.EscapedPath()
-	if p.RawQuery != "" {
-		eu += "?" + p.Query().Encode()
+	robot, err := r.get(NewHostFromURL(p), getAuth)
+	if err != nil {
+		slog.Info("Robots: Failed to fetch robots.txt file.", "url", u, "error", err)
+	}
+	if robot == nil {
+		// No robots.txt data is available; err on the side of allowing the
+		// visit. The caller may still decide otherwise based on err.
+		return true, err
 	}
-	return group.Test(eu), nil
+	return robot.TestAgent(agent, u), err
 }
 
 // Sitemaps returns available sitemap URLs for the given host.
-func (r *Robots) Sitemaps(u string) ([]string, error) {
-	robot, err := r.get(u)
+func (r *Robots) Sitemaps(u string, getAuth GetAuthFn) ([]string, error) {
+	p, err := url.Parse(u)
+	if err != nil {
+		return nil, err
+	}
+
+	robot, err := r.get(NewHostFromURL(p), getAuth)
 	if err != nil {
 		return nil, err
 	}
 	return robot.Sitemaps, nil
 }
 
-// get ensures that the robots.txt file for the given host is fetched.
-func (r *Robots) get(u string) (*robotstxt.RobotsData, error) {
+// get ensures that the robots.txt file for the given host is fetched. It will
+// block until the fetch has completed.
+func (r *Robots) get(h *Host, getAuth GetAuthFn) (*robotstxt.RobotsData, error) {
 	var robot *robotstxt.RobotsData
+	var err error
+	var res *http.Response
 
-	p, err := url.Parse(u)
-	if err != nil {
-		return robot, err
-	}
+	// We need to ensure that we don't serve a robots.txt that was earlier
+	// retrieved from a private host to a request that doesn't provide
+	// authentication and treats it as a public host.
+	key := fmt.Sprintf("%x", h.Hash(getAuth))
 
 	r.RLock()
-	robot, ok := r.data[p.Host]
+	robot, ok := r.data[key]
 	r.RUnlock()
 
 	if ok {
 		return robot, nil
 	}
 
-	rurl := p.Scheme + "://" + p.Host + "/robots.txt"
-	slog.Debug("Fetching missing robots.txt file...", "url", rurl, "host", p.Host)
-	res, err := r.client.Get(rurl)
+	client := CreateRetryingHTTPClient(getAuth)
+
+	rurl := fmt.Sprintf("%s://%s/robots.txt", h.PreferredScheme, h.String())
+	hlogger := slog.With("url", rurl, "host.name", h.Name, "host.port", h.Port)
+
+	hlogger.Debug("Robots: Fetching missing robots.txt file...")
+
+	res, err = client.Get(rurl)
 	if err != nil {
-		return robot, err
+		// The retrying client gave up, there is no response to build robots
+		// data from.
+		return nil, ErrRobotsUnavailable
 	}
 	defer res.Body.Close()
+	hlogger.Debug("Robots: Fetched missing robots.txt file.")
 
+	// A failure status code is handled inside robotstxt.FromResponse, so it is
+	// safe to not handle it here.
 	robot, err = robotstxt.FromResponse(res)
-	if err != nil {
-		return robot, err
-	}
 
+	// Always cache the result, even if it is an error. This is to avoid
+	// fetching the same robots.txt file multiple times.
+	//
+	// FIXME: Errored robots.txt and empty files should be cached for a shorter
+	// time, currently they are cached forever.
r.Lock() - r.data[p.Host] = robot + r.data[key] = robot r.Unlock() - return robot, nil + return robot, err } diff --git a/robots_test.go b/robots_test.go new file mode 100644 index 0000000..f7c7fa4 --- /dev/null +++ b/robots_test.go @@ -0,0 +1,41 @@ +package main + +import ( + "net/http" + "net/http/httptest" + "testing" +) + +func TestCheckRobotsWith503(t *testing.T) { + errserver := httptest.NewServer(http.HandlerFunc(func(rw http.ResponseWriter, req *http.Request) { + rw.WriteHeader(503) + })) + defer errserver.Close() + + robots := NewRobots() + + ok, err := robots.Check(errserver.URL, NoAuthFn, "test-agent") + if err != nil { + t.Error(err) + } + if ok { + t.Errorf("expected not ok got %v", ok) + } +} + +func TestCheckRobotsWith429(t *testing.T) { + errserver := httptest.NewServer(http.HandlerFunc(func(rw http.ResponseWriter, req *http.Request) { + rw.WriteHeader(429) + })) + defer errserver.Close() + + robots := NewRobots() + + ok, err := robots.Check(errserver.URL, NoAuthFn, "test-agent") + if err != nil { + t.Error(err) + } + if !ok { + t.Errorf("expected ok got %v", ok) + } +} diff --git a/run.go b/run.go index ecdee1e..30d018b 100644 --- a/run.go +++ b/run.go @@ -7,6 +7,8 @@ import ( "log/slog" "net/http" "tobey/internal/collector" + + "go.opentelemetry.io/otel/attribute" ) // Run is a struct that represents a single run of the crawler. It contains @@ -18,8 +20,10 @@ import ( type Run struct { SerializableRun - store RunStore // Used to get a live list of seen URLs. - robots *Robots + store RunStore // Used to get a live list of seen URLs. + + robots *Robots + sitemaps *Sitemaps } // SerializableRun is a serializable version of the Run struct. It is used to @@ -48,33 +52,110 @@ type LiveRun struct { Seen []string } +func (r *Run) Configure(s RunStore, ro *Robots, si *Sitemaps) { + r.store = s + r.robots = ro + r.sitemaps = si +} + // GetClient configures and returns the http.Client for the Run. func (r *Run) GetClient() *http.Client { - return CreateCrawlerHTTPClient(func(ctx context.Context, host string) (string, bool) { - for _, auth := range r.AuthConfigs { - if auth.Host == host { - return auth.GetHeader() + return CreateCrawlerHTTPClient(r.getAuthFn()) +} + +// getAuthFn returns a GetAuthFn that can be used to get the auth configuration. +func (r *Run) getAuthFn() GetAuthFn { + return func(h *Host) (*AuthConfig, bool) { + for _, ac := range r.AuthConfigs { + if ac.Matches(h) { + return ac, true } } - return "", false - }) + return nil, false + } } -// ConfigureStore must be called before calling into one of the methods that -// access live data. -func (r *Run) ConfigureStore(s RunStore) { - r.store = s -} +func (r *Run) GetCollector(ctx context.Context, q VisitWorkQueue, p Progress, h *WebhookDispatcher) *collector.Collector { + // getEnqueueFn returns the enqueue function, that will enqueue a single URL to + // be crawled. The enqueue function is called whenever a new URL is discovered + // by that Collector, i.e. by looking at all links in a crawled page HTML. + getEnqueueFn := func(run *Run, q VisitWorkQueue, progress Progress) collector.EnqueueFn { + + // The returned function takes the run context. + return func(ctx context.Context, c *collector.Collector, url string) error { + logger := slog.With("run", run.ID, "url", url) + tctx, span := tracer.Start(ctx, "enqueue_element") + defer span.End() + + span.SetAttributes(attribute.String("URL", url)) + // Ensure we never publish a URL twice for a single run. 
Not only does
+			// this help us not put unnecessary load on the queue, it also helps with
+			// ensuring there will only (mostly) be one result for a page. There is a slight
+			// chance that two processes enter this function with the same run and url,
+			// before one of them is finished.
+			if ok, err := c.IsVisitAllowed(url); !ok {
+				if err == collector.ErrCheckInternal {
+					logger.Warn("Collector: Error checking if visit is allowed, not allowing visit.", "error", err)
+				}
+				logger.Debug("Collector: Not enqueuing visit, visit not allowed.", "error", err)
+				return nil
+			}
+			if run.HasSeenURL(tctx, url) {
+				// We don't need to enqueue a URL that has already been crawled;
+				// its response can be served from the cache.
+				// slog.Debug("Not enqueuing visit, URL already seen.", "run", c.Run, "url", url)
+				return nil
+			}
 
+			logger.Debug("Collector: Publishing URL...")
+			err := q.Publish(
+				context.WithoutCancel(tctx), // The captured crawl run context.
+				// Passing the run ID to identify the crawl run, so when
+				// consumed the URL the run can be reconstructed by the RunManager.
+				run.ID,
+				url,
+			)
+			if err != nil {
+				logger.Error("Collector: Error enqueuing visit.", "error", err)
+				return err
+			}
+
+			run.SawURL(tctx, url)
+			logger.Debug("Collector: URL marked as seen.")
+
+			progress.Update(ProgressUpdateMessagePackage{
+				context.WithoutCancel(tctx),
+				ProgressUpdateMessage{
+					ProgressDefaultStage,
+					ProgressStateQueuedForCrawling,
+					run.ID,
+					url,
+				},
+			})
+			return nil
+		}
+	}
+
+	// getCollectFn returns the collect function that is called once we have a
+	// result. It uses the information provided in the original crawl request,
+	// i.e. the WebhookConfig, that we have received via the queued message.
+	getCollectFn := func(run *Run, hooks *WebhookDispatcher) collector.CollectFn {
+
+		// The returned function takes the run context.
+		return func(ctx context.Context, c *collector.Collector, res *collector.Response) {
+			slog.Debug(
+				"Collect succeeded.",
+				"run", run.ID,
+				"url", res.Request.URL,
+				"response.body.length", len(res.Body),
+				"response.status", res.StatusCode,
+			)
+			if run.WebhookConfig != nil && run.WebhookConfig.Endpoint != "" {
+				hooks.Send(ctx, run.WebhookConfig, run.ID, res)
+			}
+		}
+	}
 
-func (r *Run) GetCollector(ctx context.Context, q WorkQueue, p Progress, h *WebhookDispatcher) *collector.Collector {
 	c := collector.NewCollector(
 		ctx,
 		// The collector.Collector will modify the http.Client passed to it, we
@@ -84,14 +165,14 @@ func (r *Run) GetCollector(ctx context.Context, q WorkQueue, p Progress, h *Webh
 		if r.SkipRobots {
 			return true, nil
 		}
-		return r.robots.Check(a, u)
+		return r.robots.Check(u, r.getAuthFn(), a)
 	},
 	getEnqueueFn(r, q, p),
 	getCollectFn(r, h),
 	)
 
+	// TODO: We should be able to pass these into the NewCollector constructor.
 	c.UserAgent = UserAgent
-
 	c.AllowDomains = r.AllowDomains
 	c.AllowPaths = r.AllowPaths
 	c.DenyPaths = r.DenyPaths
@@ -101,29 +182,29 @@ func (r *Run) GetCollector(ctx context.Context, q WorkQueue, p Progress, h *Webh
 
 // Start starts the crawl with the given URLs. It will discover sitemaps and
 // enqueue the URLs. From there on more URLs will be discovered and enqueued.
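+//
+// A sketch of a typical invocation, mirroring the API handler in main.go (the
+// values are illustrative):
+//
+//	go run.Start(ctx, queue, progress, hooks, []string{"https://example.org/"})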
-func (r *Run) Start(ctx context.Context, q WorkQueue, p Progress, h *WebhookDispatcher, urls []string) {
+func (r *Run) Start(ctx context.Context, q VisitWorkQueue, p Progress, h *WebhookDispatcher, urls []string) {
 	c := r.GetCollector(ctx, q, p, h)
 
+	// Decide where the initial URLs should go; users may provide sitemaps as
+	// well as URLs to plain web pages.
+	//
+	// FIXME: This doesn't yet support providing an alternative robots.txt.
 	for _, u := range urls {
-		if isProbablySitemap(u) {
-			c.EnqueueWithFlags(context.WithoutCancel(ctx), u, collector.FlagInternal)
+		if isProbablySitemap(u) || isProbablySiteindex(u) {
+			r.sitemaps.Drain(context.WithoutCancel(ctx), r.getAuthFn(), u, c.Enqueue)
 		} else {
 			c.Enqueue(context.WithoutCancel(ctx), u)
 		}
 	}
 
+	// This only skips *automatic* sitemap discovery; if the user provided
+	// sitemaps we still want to crawl them.
 	if !r.SkipSitemapDiscovery {
-		for _, u := range r.DiscoverSitemaps(ctx, urls) {
-			slog.Debug("Sitemaps: Enqueueing sitemap for crawling.", "url", u)
-			c.EnqueueWithFlags(context.WithoutCancel(ctx), u, collector.FlagInternal)
+		for _, u := range r.sitemaps.Discover(ctx, r.getAuthFn(), urls) {
+			r.sitemaps.Drain(context.WithoutCancel(ctx), r.getAuthFn(), u, c.Enqueue)
 		}
 	}
 }
 
-func (r *Run) DiscoverSitemaps(ctx context.Context, urls []string) []string {
-	return discoverSitemaps(ctx, urls, r.robots)
-}
-
 func (r *Run) SawURL(ctx context.Context, url string) {
 	r.store.SawURL(ctx, r.ID, url)
 }
diff --git a/runmanager.go b/runmanager.go
index e703a91..5267517 100644
--- a/runmanager.go
+++ b/runmanager.go
@@ -11,23 +11,32 @@ import (
 
 type RunManager struct {
 	entries *lru.LRU[string, *Run] // Cannot grow unbound.
-	store   RunStore
+
+	// Shared for all runs, materialized by the RunManager.
+	store    RunStore
+	robots   *Robots
+	sitemaps *Sitemaps
 }
 
-func NewRunManager(redis *redis.Client) *RunManager {
+func NewRunManager(redis *redis.Client, ro *Robots, si *Sitemaps) *RunManager {
 	m := &RunManager{}
 
 	m.entries = lru.NewLRU(MaxParallelRuns, m.onEviction, RunTTL)
-	m.store = CreateRunStore(redis)
+	m.store = CreateStore(redis)
+	m.robots = ro
+	m.sitemaps = si
 
 	return m
 }
 
 func (m *RunManager) Add(ctx context.Context, run *Run) bool {
-	m.store.Save(ctx, run)
+	m.store.SaveRun(ctx, run)
 
-	run.ConfigureStore(m.store)
-	run.ConfigureRobots()
+	run.Configure(
+		m.store,
+		m.robots,
+		m.sitemaps,
+	)
 
 	return m.entries.Add(run.ID, run)
 }
@@ -36,13 +45,16 @@
 	entry, ok := m.entries.Get(id)
 	if !ok {
-		run, ok := m.store.Load(ctx, id)
+		run, ok := m.store.LoadRun(ctx, id)
 		if !ok {
 			return nil, ok
 		}
 
-		run.ConfigureStore(m.store)
-		run.ConfigureRobots()
+		run.Configure(
+			m.store,
+			m.robots,
+			m.sitemaps,
+		)
 
 		m.entries.Add(run.ID, run)
 
@@ -52,5 +64,5 @@ func (m *RunManager) Get(ctx context.Context, id string) (*Run, bool) {
 }
 
 func (m *RunManager) onEviction(id string, v *Run) {
-	m.store.Clear(context.Background(), id)
+	m.store.DeleteRun(context.Background(), id)
 }
diff --git a/runstore.go b/runstore.go
deleted file mode 100644
index 7b8af83..0000000
--- a/runstore.go
+++ /dev/null
@@ -1,140 +0,0 @@
-// Copyright 2024 Factorial GmbH. All rights reserved.
-
-package main
-
-import (
-	"context"
-	"encoding/json"
-	"fmt"
-	"sync"
-
-	"github.com/redis/go-redis/v9"
-)
-
-// RunStore stores transient metadata about runs.
-type RunStore interface { - Save(context.Context, *Run) error - Load(context.Context, string) (*Run, bool) - - SawURL(context.Context, string, string) - HasSeenURL(context.Context, string, string) bool - - Clear(context.Context, string) -} - -func CreateRunStore(redis *redis.Client) RunStore { - if redis != nil { - return &RedisRunStore{conn: redis} - } else { - return &MemoryRunStore{ - static: make(map[string]SerializableRun), - live: make(map[string]LiveRun), - } - } -} - -type MemoryRunStore struct { - sync.RWMutex - - static map[string]SerializableRun - live map[string]LiveRun -} - -func (s *MemoryRunStore) Save(ctx context.Context, run *Run) error { - s.Lock() - defer s.Unlock() - - s.static[run.ID] = run.SerializableRun - return nil -} - -func (s *MemoryRunStore) Load(ctx context.Context, id string) (*Run, bool) { - s.RLock() - defer s.RUnlock() - - if _, ok := s.static[id]; !ok { - return nil, false - } - return &Run{ - SerializableRun: s.static[id], - }, true -} - -func (s *MemoryRunStore) SawURL(ctx context.Context, run string, url string) { - s.Lock() - defer s.Unlock() - - if _, ok := s.live[run]; !ok { - s.live[run] = LiveRun{ - Seen: make([]string, 0), - } - } - entry := s.live[run] - entry.Seen = append(entry.Seen, url) - - s.live[run] = entry -} - -func (s *MemoryRunStore) HasSeenURL(ctx context.Context, run string, url string) bool { - s.RLock() - defer s.RUnlock() - - if _, ok := s.live[run]; !ok { - return false - } - for _, v := range s.live[run].Seen { - if v == url { - return true - } - } - return false -} - -func (s *MemoryRunStore) Clear(ctx context.Context, id string) { - s.Lock() - defer s.Unlock() - - delete(s.static, id) - delete(s.live, id) -} - -type RedisRunStore struct { - conn *redis.Client -} - -func (s *RedisRunStore) Save(ctx context.Context, run *Run) error { - b, _ := json.Marshal(run.SerializableRun) - s.conn.Set(ctx, fmt.Sprintf("%s:static", run.ID), string(b), RunTTL) - - return nil -} - -// Load loads a Run from Redis. -func (s *RedisRunStore) Load(ctx context.Context, id string) (*Run, bool) { - var run *Run - - reply := s.conn.Get(ctx, fmt.Sprintf("%s:static", id)) - if err := reply.Err(); err != nil { - return nil, false - } - - json.Unmarshal([]byte(reply.Val()), &run) - return run, true -} - -func (s *RedisRunStore) SawURL(ctx context.Context, run string, url string) { - s.conn.SAdd(ctx, fmt.Sprintf("%s:live:seen", run), url) -} - -func (s *RedisRunStore) HasSeenURL(ctx context.Context, run string, url string) bool { - reply := s.conn.SIsMember(ctx, fmt.Sprintf("%s:live:seen", run), url) - return reply.Val() -} - -func (s *RedisRunStore) Clear(ctx context.Context, run string) { - s.conn.Del( - ctx, - fmt.Sprintf("%s:static", run), - fmt.Sprintf("%s:live:seen", run), - ) -} diff --git a/scripts/errorserver.js b/scripts/errorserver.js new file mode 100755 index 0000000..381dab6 --- /dev/null +++ b/scripts/errorserver.js @@ -0,0 +1,37 @@ +#!/usr/bin/env node + +const http = require("http"); + +// Define a server that simulates several kind of error conditions. + +// On the /503 route it returns a 503 status code with a Retry-After header +// that tells the client to retry after 120 seconds. + +// On the /slow route the server hangs the request for 120 seconds before +// responding with a 200 status code. 
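+//
+// Example (illustrative): `curl -i localhost:9090/503` answers with an
+// "HTTP/1.1 503" status line and a "Retry-After: 120" header.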
+
+const requestHandler = (req, res) => {
+  if (req.url === "/503") {
+    res.writeHead(503, {
+      "Retry-After": 120,
+    });
+    res.end("Service Unavailable");
+  } else if (req.url === "/slow") {
+    setTimeout(() => {
+      res.writeHead(200);
+      res.end("OK");
+    }, 120000);
+  } else {
+    res.writeHead(404);
+    res.end("Not Found");
+  }
+};
+
+// Create the server
+const server = http.createServer(requestHandler);
+
+// Start the server on port 9090
+const PORT = 9090;
+server.listen(PORT, () => {
+  console.log(`Server running on port ${PORT}`);
+});
diff --git a/sitemap.go b/sitemap.go
index 94ac13a..5a8c5ea 100644
--- a/sitemap.go
+++ b/sitemap.go
@@ -6,19 +6,42 @@ import (
 	"context"
 	"fmt"
 	"log/slog"
+	"net/http"
 	"net/url"
 	"slices"
 	"strings"
+	"sync"
+
+	sitemap "github.com/oxffaa/gopher-parse-sitemap"
 )
 
 func isProbablySitemap(url string) bool {
-	return strings.HasSuffix(url, "/sitemap.xml") || strings.HasSuffix(url, "/sitemap_index.xml")
+	return strings.HasSuffix(url, "/sitemap.xml")
+}
+
+func isProbablySiteindex(url string) bool {
+	return strings.HasSuffix(url, "/sitemap_index.xml")
+}
+
+// NewSitemaps creates a new Sitemaps instance.
+func NewSitemaps(robots *Robots) *Sitemaps {
+	return &Sitemaps{
+		robots: robots,
+	}
+}
+
+type Sitemaps struct {
+	sync.RWMutex
+
+	robots *Robots
+
+	// Per host fetched sitemap data, each host may have multiple sitemaps.
+	data map[string][]byte
 }
 
-// Discover sitemaps for the hosts; if the robots.txt has no
+// Discover sitemaps for the hosts. If the robots.txt has no
 // information about it, fall back to a well known location.
-func discoverSitemaps(ctx context.Context, urls []string, robots *Robots) []string {
-
+func (s *Sitemaps) Discover(ctx context.Context, getAuth GetAuthFn, urls []string) []string {
 	bases := make([]string, 0, len(urls))
 	for _, u := range urls {
 		p, err := url.Parse(u)
@@ -26,7 +49,7 @@ func discoverSitemaps(ctx context.Context, urls []string, robots *Robots) []stri
 			slog.Warn("Sitemaps: Failed to parse URL, skipping.", "url", u, "error", err)
 			continue
 		}
-		base := fmt.Sprintf("%s://%s", p.Scheme, p.Hostname())
+		base := fmt.Sprintf("%s://%s", p.Scheme, p.Host)
 
 		if slices.Index(bases, base) == -1 { // Ensure unique.
 			bases = append(bases, base)
@@ -35,7 +58,7 @@ func discoverSitemaps(ctx context.Context, urls []string, robots *Robots) []stri
 
 	sitemaps := make([]string, 0)
 	for _, base := range bases {
-		urls, err := robots.Sitemaps(base)
+		urls, err := s.robots.Sitemaps(base, getAuth) // This may block.
 		if err != nil {
 			slog.Error("Sitemaps: Failed to fetch sitemap URLs, taking a well known location.", "error", err)
@@ -49,3 +72,50 @@
 	}
 	return sitemaps
 }
+
+// Drain fetches the sitemap, parses it and yields the URLs to the yield
+// function. This also recursively resolves siteindexes to sitemaps. Async
+// function, returns immediately.
+//
+// FIXME: Implement this, might use FFI to use Stephan's Rust sitemap fetcher.
+// FIXME: Implement this as a work process and go through the work queue.
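+//
+// A minimal usage sketch (the yield function is illustrative):
+//
+//	s.Drain(ctx, NoAuthFn, "https://example.org/sitemap.xml",
+//		func(ctx context.Context, u string) error {
+//			slog.Info("Sitemaps: Got URL.", "url", u)
+//			return nil
+//		})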
+func (s *Sitemaps) Drain(ctx context.Context, getAuth GetAuthFn, url string, yieldu func(context.Context, string) error) { + client := CreateRetryingHTTPClient(getAuth) + + var resolve func(context.Context, string) error + resolve = func(ctx context.Context, url string) error { + select { + case <-ctx.Done(): + return nil + default: + } + slog.Debug("Sitemaps: Resolving...", "url", url) + + req, err := http.NewRequestWithContext(ctx, "GET", url, nil) + if err != nil { + return err + } + res, err := client.Do(req) + if err != nil { + return err + } + defer res.Body.Close() + + if isProbablySitemap(url) { + return sitemap.Parse(res.Body, func(e sitemap.Entry) error { + slog.Info("Sitemaps: Yield URL from sitemap.", "url", e.GetLocation()) + return yieldu(ctx, e.GetLocation()) + }) + } else if isProbablySiteindex(url) { + return sitemap.ParseIndex(res.Body, func(e sitemap.IndexEntry) error { + slog.Debug("Sitemaps: Resolving siteindex...", "url", e.GetLocation()) + return resolve(ctx, e.GetLocation()) + }) + } + return nil + } + go func(ctx context.Context) { + if err := resolve(ctx, url); err != nil { + slog.Error("Sitemaps: Failed to resolve sitemap/siteindex.", "url", url, "error", err) + } + }(ctx) +} diff --git a/store.go b/store.go new file mode 100644 index 0000000..2fe0e51 --- /dev/null +++ b/store.go @@ -0,0 +1,49 @@ +// Copyright 2024 Factorial GmbH. All rights reserved. + +package main + +import ( + "context" + + "github.com/redis/go-redis/v9" +) + +type Store interface { + RunStore + HostStore +} + +// RunStore stores transient metadata about runs. +type RunStore interface { + SaveRun(context.Context, *Run) error + LoadRun(context.Context, string) (*Run, bool) + DeleteRun(context.Context, string) + + SawURL(context.Context, string, string) + HasSeenURL(context.Context, string, string) bool +} + +// HostStore holds transient information about a host that is okay to be shared +// between runs. +type HostStore interface { + SaveHost(ctx context.Context, host *Host) bool + LoadHost(ctx context.Context, name string) *Host + DeleteHost(ctx context.Context, name string) +} + +func CreateStore(redis *redis.Client) Store { + // if redis != nil { + // return &RedisStore{conn: redis} + // } else { + return &MemoryStore{ + MemoryRunStore: MemoryRunStore{ + rstatic: make(map[string]SerializableRun), + rlive: make(map[string]LiveRun), + }, + MemoryHostStore: MemoryHostStore{ + hstatic: make(map[string]SerializableHost), + hlive: make(map[string]LiveHost), + }, + } + // } +} diff --git a/store_memory.go b/store_memory.go new file mode 100644 index 0000000..7bac7cc --- /dev/null +++ b/store_memory.go @@ -0,0 +1,122 @@ +// Copyright 2024 Factorial GmbH. All rights reserved. 
+
+package main
+
+import (
+	"context"
+	"fmt"
+	"sync"
+)
+
+type MemoryStore struct {
+	sync.RWMutex
+
+	MemoryRunStore
+	MemoryHostStore
+}
+
+type MemoryRunStore struct {
+	rstatic map[string]SerializableRun
+	rlive   map[string]LiveRun
+}
+
+type MemoryHostStore struct {
+	hstatic map[string]SerializableHost
+	hlive   map[string]LiveHost
+}
+
+func (s *MemoryStore) SaveRun(ctx context.Context, run *Run) error {
+	s.Lock()
+	defer s.Unlock()
+
+	s.rstatic[run.ID] = run.SerializableRun
+	return nil
+}
+
+func (s *MemoryStore) LoadRun(ctx context.Context, id string) (*Run, bool) {
+	s.RLock()
+	defer s.RUnlock()
+
+	if _, ok := s.rstatic[id]; !ok {
+		return nil, false
+	}
+	return &Run{
+		SerializableRun: s.rstatic[id],
+	}, true
+}
+
+func (s *MemoryStore) DeleteRun(ctx context.Context, id string) {
+	s.Lock()
+	defer s.Unlock()
+
+	delete(s.rstatic, id)
+	delete(s.rlive, id)
+}
+
+func (s *MemoryStore) SawURL(ctx context.Context, run string, url string) {
+	s.Lock()
+	defer s.Unlock()
+
+	if _, ok := s.rlive[run]; !ok {
+		s.rlive[run] = LiveRun{
+			Seen: make([]string, 0),
+		}
+	}
+	entry := s.rlive[run]
+	entry.Seen = append(entry.Seen, url)
+
+	s.rlive[run] = entry
+}
+
+func (s *MemoryStore) HasSeenURL(ctx context.Context, run string, url string) bool {
+	s.RLock()
+	defer s.RUnlock()
+
+	if _, ok := s.rlive[run]; !ok {
+		return false
+	}
+	for _, v := range s.rlive[run].Seen {
+		if v == url {
+			return true
+		}
+	}
+	return false
+}
+
+// SaveHost adds a host without authentication to the store. If the host
+// already exists it will be ignored. This makes the function idempotent: it
+// can be called without first using LoadHost to verify the preconditions.
+func (s *MemoryStore) SaveHost(ctx context.Context, host *Host) bool {
+	s.Lock()
+	defer s.Unlock()
+
+	key := fmt.Sprintf("%x", host.HashWithoutAuth())
+
+	if _, ok := s.hstatic[key]; ok {
+		return false // Already exists.
+	}
+
+	s.hstatic[key] = host.SerializableHost
+	return true
+}
+
+// LoadHost returns a host from the store; if the host does not exist it
+// returns nil.
+func (s *MemoryStore) LoadHost(ctx context.Context, id string) *Host {
+	s.RLock()
+	defer s.RUnlock()
+
+	if _, ok := s.hstatic[id]; !ok {
+		return nil
+	}
+	return &Host{
+		SerializableHost: s.hstatic[id],
+	}
+}
+
+func (s *MemoryStore) DeleteHost(ctx context.Context, id string) {
+	s.Lock()
+	defer s.Unlock()
+
+	delete(s.hstatic, id)
+}
diff --git a/store_redis.go b/store_redis.go
new file mode 100644
index 0000000..63b90a5
--- /dev/null
+++ b/store_redis.go
@@ -0,0 +1,52 @@
+// Copyright 2024 Factorial GmbH. All rights reserved.
+
+package main
+
+import (
+	"context"
+	"encoding/json"
+	"fmt"
+
+	"github.com/redis/go-redis/v9"
+)
+
+type RedisStore struct {
+	conn *redis.Client
+}
+
+func (s *RedisStore) SaveRun(ctx context.Context, run *Run) error {
+	b, _ := json.Marshal(run.SerializableRun)
+	s.conn.Set(ctx, fmt.Sprintf("%s:static", run.ID), string(b), RunTTL)
+
+	return nil
+}
+
+// LoadRun loads a Run from Redis.
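+// Runs are stored as JSON under the "<id>:static" key (see SaveRun above), so
+// a missing or expired key simply yields (nil, false).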
+func (s *RedisStore) LoadRun(ctx context.Context, id string) (*Run, bool) {
+ var run *Run
+
+ reply := s.conn.Get(ctx, fmt.Sprintf("%s:static", id))
+ if err := reply.Err(); err != nil {
+ return nil, false
+ }
+
+ json.Unmarshal([]byte(reply.Val()), &run)
+ return run, true
+}
+
+func (s *RedisStore) DeleteRun(ctx context.Context, run string) {
+ s.conn.Del(
+ ctx,
+ fmt.Sprintf("%s:static", run),
+ fmt.Sprintf("%s:live:seen", run),
+ )
+}
+
+func (s *RedisStore) SawURL(ctx context.Context, run string, url string) {
+ s.conn.SAdd(ctx, fmt.Sprintf("%s:live:seen", run), url)
+}
+
+func (s *RedisStore) HasSeenURL(ctx context.Context, run string, url string) bool {
+ reply := s.conn.SIsMember(ctx, fmt.Sprintf("%s:live:seen", run), url)
+ return reply.Val()
+}
diff --git a/runstore_test.go b/store_redis_test.go
similarity index 80%
rename from runstore_test.go
rename to store_redis_test.go
index ef80b01..458dc15 100644
--- a/runstore_test.go
+++ b/store_redis_test.go
@@ -10,7 +10,7 @@ import (
 "github.com/redis/go-redis/v9"
)
-func TestRedisRunStoreLoad(t *testing.T) {
+func TestRedisStoreLoadRun(t *testing.T) {
 ctx := context.Background()
 server := miniredis.RunT(t)
@@ -21,14 +21,14 @@ func TestRedisStoreLoadRun(t *testing.T) {
 })
 defer conn.Close()
- s := &RedisRunStore{conn}
+ s := &RedisStore{conn}
- s.Save(ctx, &Run{
+ s.SaveRun(ctx, &Run{
 SerializableRun: SerializableRun{
 ID: "1",
 },
 })
- run, ok := s.Load(ctx, "1")
+ run, ok := s.LoadRun(ctx, "1")
 if !ok {
 t.Fatal("run not found")
 }
diff --git a/visitworker.go b/visitworker.go
index 59526f5..96984c9 100644
--- a/visitworker.go
+++ b/visitworker.go
@@ -4,24 +4,30 @@ package main
import (
 "context"
+ "errors"
 "log/slog"
 "sync"
+ "sync/atomic"
 "time"
- "tobey/internal/collector"
-
 "go.opentelemetry.io/otel/attribute"
 "go.opentelemetry.io/otel/trace"
)
+const (
+ // The maximum number of retries for a job. Jobs are retried if they fail
+ // with an error that indicates a temporary issue, e.g. a 503 when a host
+ // is down for maintenance.
+ MaxJobRetries = 3
+)
+
// CreateVisitWorkersPool initializes a worker pool and fills it with a number
// of VisitWorker.
func CreateVisitWorkersPool(
 ctx context.Context,
 num int,
 runs *RunManager,
- limiter LimiterAllowFn,
- q WorkQueue,
+ q VisitWorkQueue,
 progress Progress,
 hooks *WebhookDispatcher,
) *sync.WaitGroup {
@@ -32,7 +38,7 @@ func CreateVisitWorkersPool(
 wg.Add(1)
 go func(id int) {
- if err := VisitWorker(ctx, id, runs, limiter, q, progress, hooks); err != nil {
+ if err := VisitWorker(ctx, id, runs, q, progress, hooks); err != nil {
 slog.Error("Visitor: Worker exited with error.", "worker.id", id, "error", err)
 } else {
 slog.Debug("Visitor: Worker exited cleanly.", "worker.id", id)
@@ -48,19 +54,18 @@ func VisitWorker(
 ctx context.Context,
 id int,
 runs *RunManager,
- limiter LimiterAllowFn,
- q WorkQueue,
+ q VisitWorkQueue,
 progress Progress,
 hooks *WebhookDispatcher,
) error {
 wlogger := slog.With("worker.id", id)
+ wlogger.Debug("Visitor: Starting...")
+
+ jobs, errs := q.Consume(ctx)
 for {
 var job *VisitJob
 wlogger.Debug("Visitor: Waiting for job...")
- jobs, errs := q.ConsumeVisit(ctx)
-
 select {
 // This allows us to stop a worker gracefully. 
case <-ctx.Done():
@@ -68,19 +73,24 @@ func VisitWorker(
 return nil
 case err := <-errs:
 _, span := tracer.Start(ctx, "handle.visit.queue.worker.error")
+ wlogger.Error("Visitor: Failed to consume from queue.", "error", err)
+
 span.RecordError(err)
- span.End()
+ span.End()
+
 return err
 case j := <-jobs:
 job = j
 }
- jlogger := wlogger.With("run", job.Run, "url", job.URL, "job.id", job.ID, "job.flags", job.Flags)
- jlogger.Debug("Visitor: Received job.")
+ jlogger := wlogger.With("run", job.Run, "url", job.URL, "job.id", job.ID)
+
+ p := progress.With(job.Run, job.URL)
 jctx, span := tracer.Start(job.Context, "process_visit_job")
 span.SetAttributes(attribute.String("Url", job.URL))
+ t := trace.WithAttributes(attribute.String("Url", job.URL))
+
+ jlogger.Debug("Visitor: Received job.")
 if _, err := job.Validate(); err != nil {
 jlogger.Error(err.Error())
@@ -96,97 +106,102 @@ func VisitWorker(
 r, _ := runs.Get(ctx, job.Run)
 c := r.GetCollector(ctx, q, progress, hooks)
- if !job.HasReservation {
- jlogger.Debug("Visitor: Job has no reservation.")
+ p.Update(jctx, ProgressStateCrawling)
- nowReserved, retryAfter, err := limiter(job.URL)
- if err != nil {
- slog.Error("Visitor: Error while checking rate limiter.", "error", err)
+ res, err := c.Visit(jctx, job.URL)
- span.End()
- return err
- }
- // Some limiters will always perform a reservation, others will ask
- // you to retry and reserve again later.
- job.HasReservation = nowReserved // Skip limiter next time.
+ if UseMetrics {
+ PromVisitTxns.Inc()
+ }
+ if UsePulse {
+ atomic.AddInt32(&PulseVisitTxns, 1)
+ }
+
+ if res != nil {
+ q.TakeRateLimitHeaders(jctx, job.URL, res.Headers)
+ q.TakeSample(jctx, job.URL, res.StatusCode, err, res.Took)
+ } else {
+ q.TakeSample(jctx, job.URL, 0, err, 0)
+ }
- if retryAfter > 0 {
- jlogger.Debug("Visitor: Delaying visit...", "delay", retryAfter)
+ if err == nil {
+ p.Update(jctx, ProgressStateCrawled)
- if err := q.DelayVisit(jctx, retryAfter, job.VisitMessage); err != nil {
- jlogger.Error("Visitor: Failed to schedule delayed message.")
+ jlogger.Info("Visitor: Visited URL.", "took.lifetime", time.Since(job.Created), "took.fetch", res.Took)
+ span.AddEvent("Visitor: Visited URL.", t)
+
+ // TODO: Notify the webhook.
+
+ span.End()
+ continue
+ }
- span.AddEvent("Failed to schedule delayed message", trace.WithAttributes(
- attribute.String("Url", job.URL),
- ))
+ if res != nil {
+ // We have response information; use it to determine the correct error handling in detail.
+ switch res.StatusCode {
+ case 302: // Redirect
+ // When a redirect is encountered, the visit errors out. This is in fact
+ // not an actual error, but just a skip.
+ p.Update(jctx, ProgressStateCancelled)
+
+ jlogger.Info("Visitor: Skipped URL, got redirected.")
+ span.AddEvent("Cancelled visiting URL", t)
+
+ span.End()
+ continue
+ case 404:
+ // FIXME: Probably want to lower the log level to Info for 404s here.
+ case 429: // Too Many Requests
+ fallthrough
+ case 503: // Service Unavailable
+ // Additionally we want to retrieve the Retry-After header here and wait
+ // for that amount of time. If we just have to wait, we don't error out
+ // but reschedule the job. To not do that infinitely, there is a maximum
+ // number of retries.
+
+ // Handling of the Retry-After header is optional, so errors here
+ // are not critical. 
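+ // Note that per RFC 9110 Retry-After may also carry an HTTP-date
+ // instead of a number of seconds; the parsing below only covers
+ // the delta-seconds form.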
+ if v := res.Headers.Get("Retry-After"); v != "" {
+ d, _ := time.ParseDuration(v + "s")
+ q.Pause(jctx, res.Request.URL.String(), d)
+ }
+
+ if job.Retries < MaxJobRetries {
+ if err := q.Republish(jctx, job); err != nil {
+ jlogger.Warn("Visitor: Republish failed, stopping retrying.", "error", err)
+ } else {
+ // Leave job in "Crawling" state.
+ span.End()
+ continue
+ }
+ } else {
+ jlogger.Warn("Visitor: Maximum number of retries reached.")
+ }
+ default:
+ // Noop, fallthrough to generic error handling.
+ }
+ } else if errors.Is(err, context.DeadlineExceeded) {
+ // We react to timeouts as a temporary issue and retry the job, similarly
+ // to 429 and 503 errors.
+ if job.Retries < MaxJobRetries {
+ if err := q.Republish(jctx, job); err != nil {
+ jlogger.Warn("Visitor: Republish failed, stopping retrying.", "error", err)
+ } else {
+ // Leave job in "Crawling" state.
 span.End()
 continue
 }
- span.End()
- continue
+ } else {
+ jlogger.Warn("Visitor: Maximum number of retries reached.")
 }
 }
- if job.Flags&collector.FlagInternal == 0 {
- progress.Update(ProgressUpdateMessagePackage{
- jctx,
- ProgressUpdateMessage{
- ProgressStage,
- ProgressStateCrawling,
- job.Run,
- job.URL,
- },
- })
- }
- if err := c.Visit(jctx, job.URL); err != nil {
- // When a redirect is encountered, the visit errors out, but with an .
- // errors.errorString{"Found"} This is in facto no an actual error, but.
- // just a skip .
- var state string
- if err.Error() == "Found" {
- state = ProgressStateCancelled
- jlogger.Info("Visitor: Skipped URL visit.")
- } else {
- state = ProgressStateErrored
+ p.Update(jctx, ProgressStateErrored)
- jlogger.Error("Visitor: Error visiting URL.", "error", err)
- span.AddEvent("Error visiting URL", trace.WithAttributes(
- attribute.String("Url", job.URL),
- ))
- }
- if job.Flags&collector.FlagInternal == 0 {
- progress.Update(ProgressUpdateMessagePackage{
- jctx,
- ProgressUpdateMessage{
- ProgressStage,
- state,
- job.Run,
- job.URL,
- },
- })
- }
- span.End()
- continue
- }
- jlogger.Info("Visitor: Visited URL.", "took", time.Since(job.Created))
-
- if job.Flags&collector.FlagInternal == 0 {
- progress.Update(ProgressUpdateMessagePackage{
- jctx,
- ProgressUpdateMessage{
- ProgressStage,
- ProgressStateCrawled,
- job.Run,
- job.URL,
- },
- })
- }
- span.AddEvent("Visitor: Visited URL.",
- trace.WithAttributes(
- attribute.String("Url", job.URL),
- ))
- span.End()
+ jlogger.Error("Visitor: Error visiting URL.", "error", err)
+ span.RecordError(err)
+
+ span.End()
+ continue
 }
 }
diff --git a/workqueue.go b/workqueue.go
index 0e55a7c..ee1b363 100644
--- a/workqueue.go
+++ b/workqueue.go
@@ -4,25 +4,70 @@ package main
import (
 "context"
- "encoding/json"
 "errors"
+ "hash/fnv"
 "log/slog"
+ "net/http"
+ "net/url"
+ "strings"
 "time"
- "github.com/google/uuid"
- amqp "github.com/rabbitmq/amqp091-go"
- "go.opentelemetry.io/otel"
- "go.opentelemetry.io/otel/propagation"
+ "github.com/redis/go-redis/v9"
)
-type WorkQueue interface {
- Open() error
+const (
+ // MinHostRPS specifies the minimum number of requests per
+ // second that are executed against a single host.
+ MinHostRPS float64 = 1
- // The following methods use the crawl run context.
- PublishURL(ctx context.Context, run string, url string, flags uint8) error
- ConsumeVisit(ctx context.Context) (<-chan *VisitJob, <-chan error)
- DelayVisit(ctx context.Context, delay time.Duration, j *VisitMessage) error
+ // MaxHostRPS specifies the maximum number of requests per
+ // second that are executed against a single host. 
+ MaxHostRPS float64 = 50
+)

+// VisitWorkQueue appears to producers and consumers as a single queue. Each
+// message in the work queue represents a job for a request to visit a single
+// URL and process the response.
+//
+// While producers publish a new VisitMessage immediately to the work queue,
+// consumers can only consume jobs at a certain rate. This rate is determined by
+// a per-host rate limiter. These rate limiters can be updated dynamically.
+type VisitWorkQueue interface {
+ // Open opens the work queue for use. It must be called before any other method.
+ Open(context.Context) error
+
+ // Publish creates a new VisitMessage for the given URL and enqueues the job to
+ // be retrieved later via Consume. The run ID must be specified in order to
+ // allow the consumer to find the right Collector to visit the URL.
+ Publish(ctx context.Context, run string, url string) error
+
+ // Republish is used to reschedule a job for later processing. This is useful
+ // if the job could not be processed due to a temporary error. The function
+ // should keep a count on how often a job is rescheduled.
+ Republish(ctx context.Context, job *VisitJob) error
+
+ // Consume is used by workers to retrieve a new VisitJob to process; reading from the
+ // returned channel will block until a job becomes available. Jobs are automatically acked
+ // when retrieved from the channel.
+ Consume(ctx context.Context) (<-chan *VisitJob, <-chan error)
+
+ // Pause pauses the consumption of jobs for a given host. This is useful if
+ // we see the host becoming unavailable, for example when it is down
+ // for maintenance.
+ Pause(ctx context.Context, url string, d time.Duration) error
+
+ // TakeSample informs the rate limiter how long it took to process a job so it
+ // can adjust accordingly. Seeing an increase in latency might indicate that we
+ // are overwhelming the target.
+ TakeSample(ctx context.Context, url string, statusCode int, err error, d time.Duration)
+
+ // TakeRateLimitHeaders allows the implementation to use the information provided
+ // through rate limit headers to inform the rate limiter.
+ // See https://docs.github.com/en/rest/using-the-rest-api/rate-limits-for-the-rest-api?apiVersion=2022-11-28#checking-the-status-of-your-rate-limit
+ TakeRateLimitHeaders(ctx context.Context, url string, hdr *http.Header)
+
+ // Close allows the implementation to release opened resources. After Close
+ // the work queue must not be used anymore.
 Close() error
}
@@ -32,19 +77,9 @@ type VisitMessage struct {
 URL string
- // Whether this visit has a valid reservation by a rate limiter.
- HasReservation bool
-
- Flags uint8
 Created time.Time
-}
-
-// visitPackage is used by the in-memory implementation of the WorkQueue.
-// Implementations that have a built-in mechanism to transport headers do not
-// need to use this.
-type visitPackage struct {
- Carrier propagation.MapCarrier
- Message *VisitMessage
+ // The number of times this job has been re-enqueued for a retry.
+ Retries uint32
}
// VisitJob is similar to an http.Request; it exists only for a certain time. It
@@ -70,279 +105,34 @@ func (j *VisitJob) Validate() (bool, error) {
 return true, nil
}
-func CreateWorkQueue(rabbitmq *amqp.Connection) WorkQueue {
- if rabbitmq != nil {
+func CreateWorkQueue(redis *redis.Client) VisitWorkQueue {
+ if redis != nil {
 slog.Debug("Using distributed work queue...")
- return &RabbitMQWorkQueue{conn: rabbitmq}
+ // TODO: Add support for redis work queue. 
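+ // Editor's sketch (not part of this patch): such an implementation
+ // could keep one Redis list per host ID and share pause and rate
+ // limit state via keys with a TTL, so all instances observe the
+ // same per-host limits.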
+ // return &RedisVisitWorkQueue{conn: redis} + return NewMemoryVisitWorkQueue() } else { slog.Debug("Using in-memory work queue...") - return &MemoryWorkQueue{} - } -} - -type MemoryWorkQueue struct { - // TODO: MaxSize - pkgs chan *visitPackage -} - -func (wq *MemoryWorkQueue) Open() error { - wq.pkgs = make(chan *visitPackage, 1_000_000) - return nil -} - -func (wq *MemoryWorkQueue) PublishURL(ctx context.Context, run string, url string, flags uint8) error { - // Extract tracing information from context, to transport it over the - // channel, without using a Context. - propagator := otel.GetTextMapPropagator() - carrier := propagation.MapCarrier{} - propagator.Inject(ctx, carrier) - - pkg := &visitPackage{ - Carrier: carrier, - Message: &VisitMessage{ - ID: uuid.New().ID(), - Created: time.Now(), - Run: run, - URL: url, - Flags: flags, - }, - } - select { - case wq.pkgs <- pkg: - slog.Debug("Work Queue: Message accepted.", "msg.id", pkg.Message.ID) - default: - slog.Warn("Work Queue: full, dropping message!", "msg", pkg.Message) - } - return nil -} - -// DelayVisit republishes a message with given delay. -func (wq *MemoryWorkQueue) DelayVisit(ctx context.Context, delay time.Duration, msg *VisitMessage) error { - go func() { - slog.Debug("Work Queue: Delaying message", "msg.id", msg.ID, "delay", delay.Seconds()) - - // Extract tracing information from context, to transport it over the - // channel, without using a Context. - propagator := otel.GetTextMapPropagator() - carrier := propagation.MapCarrier{} - propagator.Inject(ctx, carrier) - - time.Sleep(delay) - pkg := &visitPackage{ - Carrier: carrier, - Message: msg, - } - select { - case wq.pkgs <- pkg: - slog.Debug("Work Queue: Delayed message accepted.", "msg.id", pkg.Message.ID, "delay", delay.Seconds()) - default: - slog.Warn("Work Queue: full, dropping delayed message!", "msg", pkg.Message) - } - }() - return nil -} - -func (wq *MemoryWorkQueue) ConsumeVisit(ctx context.Context) (<-chan *VisitJob, <-chan error) { - reschan := make(chan *VisitJob) - errchan := make(chan error) - - go func() { - select { - case <-ctx.Done(): - slog.Debug("Work Queue: Consume context cancelled, closing channels.") - - close(reschan) - close(errchan) - return - case p := <-wq.pkgs: - slog.Debug("Work Queue: Received message, forwarding to results channel.", "msg.id", p.Message.ID) - - // Initializes the context for the job. Than extract the tracing - // information from the carrier into the job's context. - jctx := context.Background() - jctx = otel.GetTextMapPropagator().Extract(jctx, p.Carrier) - - reschan <- &VisitJob{ - VisitMessage: p.Message, - Context: jctx, - } - slog.Debug("Work Queue: Forwarded message to results channel.", "msg.id", p.Message.ID) - } - }() - - return reschan, errchan -} - -func (wq *MemoryWorkQueue) Close() error { - close(wq.pkgs) - return nil -} - -type RabbitMQWorkQueue struct { - conn *amqp.Connection - channel *amqp.Channel - queue amqp.Queue - receive <-chan amqp.Delivery -} - -// Open declares both sides (producer and consumer) of the work queue. -func (wq *RabbitMQWorkQueue) Open() error { - ch, err := wq.conn.Channel() - if err != nil { - return err - } - wq.channel = ch - - q, err := ch.QueueDeclare( - "tobey.urls", // name - true, // durable TODO: check meaning - false, // delete when unused - false, // exclusive TODO: check meaning - false, // no-wait TODO: check meaning - nil, // arguments - ) - if err != nil { - return err - } - wq.queue = q - - // This utilizes the delayed_message plugin. 
- ch.ExchangeDeclare("tobey.default", "x-delayed-message", true, false, false, false, amqp.Table{
- "x-delayed-type": "direct",
- })
-
- // Bind queue to delayed exchange.
- err = ch.QueueBind(wq.queue.Name, wq.queue.Name, "tobey.default", false, nil)
- if err != nil {
- return err
- }
-
- receive, err := wq.channel.Consume(
- wq.queue.Name, // queue
- "", // consumer
- true, // auto-ack
- false, // exclusive
- false, // no-local
- false, // no-wait
- nil, // args
- )
- if err != nil {
- return err
- }
- wq.receive = receive
-
- return nil
-}
-
-func (wq *RabbitMQWorkQueue) PublishURL(ctx context.Context, run string, url string, flags uint8) error {
- jmlctx, span := tracer.Start(ctx, "publish_url")
- defer span.End()
- msg := &VisitMessage{
- ID: uuid.New().ID(),
- Created: time.Now(),
- Run: run,
- URL: url,
- Flags: flags,
+ return NewMemoryVisitWorkQueue()
 }
-
- b, err := json.Marshal(msg)
- if err != nil {
- return err
- }
-
- table := make(amqp.Table)
-
- // Add tracing information into the RabbitMQ headers, so that
- // the consumer of the message can continue the trace.
- otel.GetTextMapPropagator().Inject(jmlctx, MapCarrierRabbitmq(table))
-
- return wq.channel.Publish(
- "tobey.default", // exchange
- wq.queue.Name, // routing key
- false, // mandatory TODO: check meaning
- false, // immediate TODO: check meaning
- amqp.Publishing{
- Headers: table,
- DeliveryMode: amqp.Persistent, // TODO: check meaning
- ContentType: "application/json",
- Body: b,
- },
- )
}
-// DelayVisit republishes a message with given delay.
-// Relies on: https://blog.rabbitmq.com/posts/2015/04/scheduling-messages-with-rabbitmq/
-func (wq *RabbitMQWorkQueue) DelayVisit(ctx context.Context, delay time.Duration, msg *VisitMessage) error {
- slog.Debug("Delaying message", "msg.id", msg.ID, "delay", delay.Seconds())
-
- b, err := json.Marshal(msg)
+// guessHost heuristically identifies the host for the given URL. The function
+// doesn't return the host name itself, which might not even exist, but an ID
+// derived from it.
+//
+// It does so by ignoring a www. prefix, leading to www.example.org and
+// example.org being considered the same host. It also ignores the port number,
+// so example.org:8080 and example.org:9090 are considered the same host as
+// well.
+//
+// Why FNV? https://softwareengineering.stackexchange.com/questions/49550
+func guessHost(u string) uint32 {
+ p, err := url.Parse(u)
 if err != nil {
- return err
+ return 0
 }
+ h := fnv.New32a()
- table := make(amqp.Table)
- table["x-delay"] = delay.Milliseconds()
-
- // Extract tracing information from context into headers. The tracing information
- // should already be present in the context of the caller.
- otel.GetTextMapPropagator().Inject(ctx, MapCarrierRabbitmq(table))
-
- return wq.channel.Publish(
- "tobey.default", // exchange
- wq.queue.Name, // routing key
- false, // mandatory TODO: check meaning
- false, // immediate TODO: check meaning
- amqp.Publishing{
- DeliveryMode: amqp.Persistent, // TODO: check meaning
- ContentType: "application/json",
- Body: b,
- Headers: table,
- },
- )
-}
-
-func (wq *RabbitMQWorkQueue) ConsumeVisit(ctx context.Context) (<-chan *VisitJob, <-chan error) {
- reschan := make(chan *VisitJob)
- errchan := make(chan error)
-
- go func() {
- var msg *VisitMessage
- var rawmsg amqp.Delivery
-
- select {
- case v := <-wq.receive: // Blocks until we have at least one message.
- rawmsg = v
- case <-ctx.Done(): // The worker's context.
- close(reschan)
- close(errchan)
- return // Exit if the context is cancelled. 
- }
-
- if err := json.Unmarshal(rawmsg.Body, &msg); err != nil {
- errchan <- err
- } else {
- // Initializes the context for the job. Than extract the tracing
- // information from the RabbitMQ headers into the job's context.
- jctx := otel.GetTextMapPropagator().Extract(context.Background(), MapCarrierRabbitmq(rawmsg.Headers))
-
- reschan <- &VisitJob{
- VisitMessage: msg,
- Context: jctx,
- }
- }
- }()
-
- return reschan, errchan
-}
-
-func (wq *RabbitMQWorkQueue) Close() error {
- var lasterr error
-
- if err := wq.channel.Close(); err != nil {
- lasterr = err
- }
- if err := wq.conn.Close(); err != nil {
- lasterr = err
- }
- return lasterr
+ h.Write([]byte(strings.TrimPrefix(p.Hostname(), "www.")))
+ return h.Sum32()
}
diff --git a/workqueue_memory.go b/workqueue_memory.go
new file mode 100644
index 0000000..64e9e09
--- /dev/null
+++ b/workqueue_memory.go
@@ -0,0 +1,472 @@
+package main
+
+import (
+ "context"
+ "errors"
+ "log/slog"
+ "net/http"
+ "net/url"
+ "strconv"
+ "strings"
+ "sync"
+ "sync/atomic"
+ "time"
+
+ "github.com/google/uuid"
+ "go.opentelemetry.io/otel"
+ "go.opentelemetry.io/otel/propagation"
+ xrate "golang.org/x/time/rate"
+)
+
+// The maximum number of messages that can exist in the in-memory work queue.
+const MemoryWorkQueueBufferSize = 1_000_000
+
+// visitMemoryPackage is used by the in-memory implementation of the WorkQueue.
+// Implementations that have a built-in mechanism to transport headers do not
+// need to use this.
+type visitMemoryPackage struct {
+ Carrier propagation.MapCarrier
+ Message *VisitMessage
+}
+
+type hostMemoryVisitWorkQueue struct {
+ ID uint32
+ Name string // For debugging purposes only.
+
+ Queue chan *visitMemoryPackage
+
+ Limiter *xrate.Limiter
+ HasReservation bool
+ IsAdaptive bool
+ PausedUntil time.Time
+}
+
+func NewMemoryVisitWorkQueue() *MemoryVisitWorkQueue {
+ return &MemoryVisitWorkQueue{
+ dqueue: make(chan *visitMemoryPackage, MemoryWorkQueueBufferSize),
+ hqueues: make(map[uint32]*hostMemoryVisitWorkQueue),
+ shoudlRecalc: make(chan bool),
+ }
+}
+
+type MemoryVisitWorkQueue struct {
+ mu sync.RWMutex
+
+ // This is where consumers read from.
+ dqueue chan *visitMemoryPackage
+
+ // This is where messages get published to. The key is the host ID as
+ // computed by guessHost, which ignores the port. It's okay to mix enqueued
+ // visits with and without authentication for the same host.
+ hqueues map[uint32]*hostMemoryVisitWorkQueue
+
+ // shoudlRecalc is checked by the promoter to see if it should recalculate.
+ // It is an unbuffered channel. If sending is blocked, this means there is
+ // a pending notification. As one notification is enough to trigger the
+ // recalculation, a failed send can be ignored. 
+ shoudlRecalc chan bool
+}
+
+func (wq *MemoryVisitWorkQueue) Open(ctx context.Context) error {
+ wq.startPromoter(ctx)
+ return nil
+}
+
+func (wq *MemoryVisitWorkQueue) lazyHostQueue(u string) *hostMemoryVisitWorkQueue {
+ p, _ := url.Parse(u)
+ key := guessHost(u)
+
+ // Hold the write lock over both the existence check and the insert, so
+ // concurrent callers cannot race on the map.
+ wq.mu.Lock()
+ defer wq.mu.Unlock()
+
+ if _, ok := wq.hqueues[key]; !ok {
+ wq.hqueues[key] = &hostMemoryVisitWorkQueue{
+ ID: key,
+ Name: strings.TrimPrefix(p.Hostname(), "www."),
+ PausedUntil: time.Time{},
+ HasReservation: false,
+ IsAdaptive: true,
+ Queue: make(chan *visitMemoryPackage, MemoryWorkQueueBufferSize),
+ Limiter: xrate.NewLimiter(xrate.Limit(MinHostRPS), 1),
+ }
+ }
+ return wq.hqueues[key]
+}
+
+func (wq *MemoryVisitWorkQueue) Publish(ctx context.Context, run string, url string) error {
+ defer wq.shouldRecalc() // Notify promoter that a new message is available.
+
+ hq := wq.lazyHostQueue(url)
+
+ // Extract tracing information from the context, so it can be transported
+ // over the channel without passing a Context.
+ propagator := otel.GetTextMapPropagator()
+ carrier := propagation.MapCarrier{}
+ propagator.Inject(ctx, carrier)
+
+ pkg := &visitMemoryPackage{
+ Carrier: carrier,
+ Message: &VisitMessage{
+ ID: uuid.New().ID(),
+ Run: run,
+ URL: url,
+ Created: time.Now(),
+ },
+ }
+
+ select {
+ case hq.Queue <- pkg:
+ slog.Debug("Work Queue: Message successfully published.", "msg.id", pkg.Message.ID)
+ default:
+ slog.Warn("Work Queue: full, not publishing message!", "msg", pkg.Message)
+ }
+ return nil
+}
+
+func (wq *MemoryVisitWorkQueue) Republish(ctx context.Context, job *VisitJob) error {
+ defer wq.shouldRecalc()
+
+ hq := wq.lazyHostQueue(job.URL)
+
+ // Extract tracing information from the context, so it can be transported
+ // over the channel without passing a Context.
+ propagator := otel.GetTextMapPropagator()
+ carrier := propagation.MapCarrier{}
+ propagator.Inject(job.Context, carrier)
+
+ pkg := &visitMemoryPackage{
+ Carrier: carrier,
+ Message: &VisitMessage{
+ ID: job.ID,
+ Run: job.Run,
+ URL: job.URL,
+ Created: job.Created,
+ Retries: job.Retries + 1,
+ },
+ }
+
+ select {
+ case hq.Queue <- pkg:
+ slog.Debug("Work Queue: Message successfully rescheduled.", "msg.id", pkg.Message.ID)
+ default:
+ slog.Warn("Work Queue: full, not rescheduling message!", "msg", pkg.Message)
+ }
+ return nil
+}
+
+// Consume returns the next available VisitJob from the default queue.
+func (wq *MemoryVisitWorkQueue) Consume(ctx context.Context) (<-chan *VisitJob, <-chan error) {
+ // We are unwrapping the VisitJob from the visitMemoryPackage.
+ reschan := make(chan *VisitJob)
+ errchan := make(chan error)
+
+ go func() {
+ for {
+ select {
+ case <-ctx.Done():
+ slog.Debug("Work Queue: Consume context cancelled, closing channels.")
+
+ close(reschan)
+ close(errchan)
+ return
+ case p := <-wq.dqueue:
+ // slog.Debug("Work Queue: Received message, forwarding to results channel.", "msg.id", p.Message.ID)
+
+ // Initialize the context for the job, then extract the tracing
+ // information from the carrier into the job's context. 
+ jctx := context.Background()
+ jctx = otel.GetTextMapPropagator().Extract(jctx, p.Carrier)
+
+ reschan <- &VisitJob{
+ VisitMessage: p.Message,
+ Context: jctx,
+ }
+ // slog.Debug("Work Queue: Forwarded message to results channel.", "msg.id", p.Message.ID)
+ }
+ }
+ }()
+
+ return reschan, errchan
+}
+
+func (wq *MemoryVisitWorkQueue) Pause(ctx context.Context, url string, d time.Duration) error {
+ t := time.Now().Add(d)
+ hq := wq.lazyHostQueue(url)
+
+ if hq.PausedUntil.IsZero() || !hq.PausedUntil.After(t) { // Pause can only increase.
+ hq.PausedUntil = t
+ }
+ return nil
+}
+
+func (wq *MemoryVisitWorkQueue) TakeRateLimitHeaders(ctx context.Context, url string, hdr *http.Header) {
+ hq := wq.lazyHostQueue(url)
+
+ if v := hdr.Get("X-RateLimit-Limit"); v != "" {
+ cur := float64(hq.Limiter.Limit())
+
+ parsed, _ := strconv.Atoi(v)
+ desired := max(float64(parsed)/60/60, MinHostRPS) // The header is assumed to be per hour; convert to per second.
+
+ if int(desired) != int(cur) {
+ slog.Debug("Work Queue: Rate limit updated.", "host", hq.Name, "now", desired, "change", desired-cur)
+ hq.Limiter.SetLimit(xrate.Limit(desired))
+ }
+ hq.IsAdaptive = false
+ }
+}
+
+// TakeSample implements an algorithm to adjust the rate limiter, to ensure maximum throughput within the bounds
+// set by the rate limiter, while not overwhelming the target. The algorithm will adjust the rate limiter
+// but not go below MinHostRPS and not above MaxHostRPS.
+func (wq *MemoryVisitWorkQueue) TakeSample(ctx context.Context, url string, statusCode int, err error, d time.Duration) {
+ hq := wq.lazyHostQueue(url)
+
+ cur := float64(hq.Limiter.Limit())
+ var desired float64
+
+ // If the status code indicates that the target is already overwhelmed, we
+ // don't increase the rate limit. We also don't lower it here, as we assume
+ // that on a 429 this has already been done via TakeRateLimitHeaders.
+
+ switch {
+ case statusCode == 429:
+ fallthrough
+ case statusCode == 503:
+ return
+ case statusCode == 500 || errors.Is(err, context.DeadlineExceeded):
+ // The server is starting to behave badly. We should slow down.
+ // Halve the rate limit, capped at MinHostRPS.
+ if cur/2 < MinHostRPS {
+ desired = MinHostRPS
+ } else {
+ desired = cur / 2
+ }
+
+ if int(desired) != int(cur) && hq.IsAdaptive {
+ slog.Debug("Work Queue: Lowering pressure on host.", "host", hq.Name, "now", desired, "change", desired-cur)
+ hq.Limiter.SetLimit(xrate.Limit(desired))
+ }
+ return
+ }
+
+ if d == 0 {
+ return
+ }
+
+ // The closer the latency gets to 1s, the more we decrease the rate limit,
+ // capped at MinHostRPS; the closer it gets to 0s, the more we increase it,
+ // capped at MaxHostRPS. For example a 200ms latency yields
+ // 50*0.8 + 1*0.2, i.e. roughly 40 RPS.
+
+ latency := min(1, d.Seconds()) // Everything above 1s is considered slow.
+ desired = MaxHostRPS*(1-latency) + MinHostRPS*latency
+
+ if int(desired) != int(cur) && hq.IsAdaptive {
+ slog.Debug("Work Queue: Rate limit fine tuned.", "host", hq.Name, "now", desired, "change", desired-cur)
+ hq.Limiter.SetLimit(xrate.Limit(desired))
+ }
+}
+
+// startPromoter starts the promoter goroutine that shovels messages from the
+// host queues into the default queue. The promoter is responsible for load
+// balancing the host queues.
+func (wq *MemoryVisitWorkQueue) startPromoter(ctx context.Context) {
+ go func() {
+ slog.Debug("Work Queue: Starting promoter...")
+
+ // Load balancing over the host queues is local to each tobey instance.
+ // The counter wraps around at 2^32. 
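+ // Relying on uint32 wrap-around is safe here, as the counter is
+ // only used modulo the number of candidates to pick the next host
+ // queue round-robin.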
+ var next uint32
+
+ for {
+ immediatecalc:
+ // slog.Debug("Work Queue: Calculating immediate queue candidates.")
+ immediate, shortestPauseUntil := wq.calcImmediateHostQueueCandidates()
+
+ // Check how long we have to wait until we can recalculate the immediate candidates.
+ if len(immediate) == 0 {
+ // Nothing to do yet; try again whenever
+ // 1. a new message is published,
+ // 2. the rate limit is adjusted, or
+ // 3. a queue becomes unpaused.
+ var delay time.Duration
+
+ if !shortestPauseUntil.IsZero() { // None may be paused.
+ delay = time.Until(shortestPauseUntil)
+ } else {
+ delay = 60 * time.Second
+ }
+
+ slog.Debug("Work Queue: No immediate queue candidates, waiting for a while...", "delay", delay)
+ select {
+ case <-time.After(delay):
+ // slog.Debug("Work Queue: Pause time passed.")
+ goto immediatecalc
+ case <-wq.shoudlRecalc:
+ slog.Debug("Work Queue: Got notification to re-calc immediate queue candidates.")
+ goto immediatecalc
+ case <-ctx.Done():
+ slog.Debug("Work Queue: Context cancelled, stopping promoter.")
+ return
+ }
+ }
+
+ // When we get here we have at least one host queue that can be
+ // queried; if we have multiple candidates we load balance over
+ // them.
+ slog.Debug("Work Queue: Final immediate queue candidates calculated.", "count", len(immediate))
+
+ n := atomic.AddUint32(&next, 1)
+
+ key := immediate[(int(n)-1)%len(immediate)]
+
+ wq.mu.RLock()
+ hq := wq.hqueues[key]
+ wq.mu.RUnlock()
+
+ // FIXME: The host queue might have gone poof in the meantime, we should
+ // check if the host queue is still there.
+
+ slog.Debug("Work Queue: Querying host queue for messages.", "queue.name", hq.Name)
+ select {
+ case pkg := <-hq.Queue:
+ // Now promote the pkg to the default queue.
+ select {
+ case wq.dqueue <- pkg:
+ slog.Debug("Work Queue: Message promoted.", "msg.id", pkg.Message.ID)
+
+ wq.mu.Lock()
+ wq.hqueues[hq.ID].HasReservation = false
+ wq.mu.Unlock()
+ default:
+ slog.Warn("Work Queue: full, dropping to-be-promoted message!", "msg", pkg.Message)
+ }
+ default:
+ // The channel may have become empty in the meantime; we only checked at the top of the loop.
+ // slog.Debug("Work Queue: Host queue empty, nothing to promote.", "queue.name", hq.Name)
+ case <-ctx.Done():
+ slog.Debug("Work Queue: Context cancelled, stopping promoter.")
+ return
+ }
+ }
+ }()
+}
+
+// calcImmediateHostQueueCandidates calculates which host queues are candidates
+// to be queried for messages to be promoted. The function modifies the host
+// queue properties and therefore takes the lock itself.
+//
+// The second return value, if non-zero, indicates the shortest time until a
+// paused host queue (not only among the candidates) becomes unpaused. When no
+// candidate is found, the caller should wait at least that long and then call
+// this function again.
+func (wq *MemoryVisitWorkQueue) calcImmediateHostQueueCandidates() ([]uint32, time.Time) {
+ // Host queue candidates that can be queried immediately.
+ immediate := make([]uint32, 0, len(wq.hqueues))
+ var shortestPauseUntil time.Time
+
+ wq.mu.Lock()
+ defer wq.mu.Unlock()
+
+ // First calculate which host queues are candidates for immediate
+ // querying. This is to avoid unnecessarily hitting the rate limiter for
+ // a host, as that can be more expensive than checking the PausedUntil
+ // time or the length of the queue.
+ //
+ // FIXME: It might be less expensive to first check for the PausedUntil
+ // time and then check the length of the queue, depending on the
+ // underlying implementation of the work queue. 
+ for k, hq := range wq.hqueues {
+ hlogger := slog.With("queue.name", hq.Name)
+ hlogger.Debug("Work Queue: Checking if host queue is a candidate.", "len", len(hq.Queue), "now", time.Now(), "pausedUntil", hq.PausedUntil)
+
+ if len(hq.Queue) == 0 {
+ // This host queue is empty, no messages to process for that queue,
+ // so don't include it in the immediate list.
+ continue
+ }
+ // hlogger.Debug("Work Queue: Host queue has messages to process.")
+
+ if hq.PausedUntil.IsZero() {
+ // This host queue was never paused before.
+ // hlogger.Debug("Work Queue: Host queue is not paused.")
+ } else {
+ if time.Now().Before(hq.PausedUntil) {
+ // This host queue is *still* paused.
+ // hlogger.Debug("Work Queue: Host queue is paused.")
+
+ // Check if this host queue's pause ends earlier than the shortest
+ // pause we've seen so far.
+ if shortestPauseUntil.IsZero() || hq.PausedUntil.Before(shortestPauseUntil) {
+ shortestPauseUntil = hq.PausedUntil
+ }
+
+ // As this host queue is still paused we don't include
+ // it in the immediate list. Continue to check if the
+ // next host queue is paused or not.
+ continue
+ } else {
+ // hlogger.Debug("Work Queue: Host queue is not paused anymore.")
+
+ // This host queue is not paused anymore, include it in
+ // the list. While not technically necessary, we reset
+ // the pause until time to zero, for the sake of tidiness.
+ wq.hqueues[k].PausedUntil = time.Time{}
+ }
+ }
+
+ // If we get here, the current host queue was either never
+ // paused, or it is now unpaused. This means we can try to get a
+ // token from the rate limiter, if we haven't already.
+ if !hq.HasReservation {
+ // hlogger.Debug("Work Queue: Host queue needs a reservation, checking rate limiter.")
+ res := hq.Limiter.Reserve()
+ if !res.OK() {
+ hlogger.Warn("Work Queue: Rate limiter cannot provide reservation in max wait time.")
+ continue
+ }
+
+ if res.Delay() > 0 {
+ hlogger.Debug("Work Queue: Host queue is rate limited, pausing the queue.", "delay", res.Delay())
+
+ // Pause the queue for a while; the limiter wants us to retry later.
+ wq.hqueues[k].PausedUntil = time.Now().Add(res.Delay())
+
+ if shortestPauseUntil.IsZero() || wq.hqueues[k].PausedUntil.Before(shortestPauseUntil) {
+ shortestPauseUntil = wq.hqueues[k].PausedUntil
+ }
+
+ wq.hqueues[k].HasReservation = true
+ continue
+ } else {
+ // Got a token from the limiter, we may act immediately.
+ // hlogger.Debug("Work Queue: Host queue is not rate limited, recording as candidate :)")
+ wq.hqueues[k].HasReservation = true
+ }
+ } else {
+ // hlogger.Debug("Work Queue: Host already has a reservation.")
+ }
+ immediate = append(immediate, hq.ID)
+ }
+ return immediate, shortestPauseUntil
+}
+
+// shouldRecalc notifies the promoter that it should recalculate the immediate
+// queue candidates. The underlying channel is unbuffered; if sending blocks,
+// there already is a pending notification. As one notification is enough to
+// trigger the recalculation, a failed send can be ignored.
+func (wq *MemoryVisitWorkQueue) shouldRecalc() {
+ select {
+ case wq.shoudlRecalc <- true:
+ default:
+ // A notification is already pending, no need to send another. 
+ } +} + +func (wq *MemoryVisitWorkQueue) Close() error { + close(wq.dqueue) + + for _, hq := range wq.hqueues { + close(hq.Queue) + } + return nil +} From 39cdd553ca6bf51e59d4725c6811104c7ea2307d Mon Sep 17 00:00:00 2001 From: Marius Wilms Date: Wed, 11 Sep 2024 11:03:47 +0200 Subject: [PATCH 02/57] Move prometheus to main --- prometheus.yml | 6 ------ 1 file changed, 6 deletions(-) delete mode 100644 prometheus.yml diff --git a/prometheus.yml b/prometheus.yml deleted file mode 100644 index 8ef7b5d..0000000 --- a/prometheus.yml +++ /dev/null @@ -1,6 +0,0 @@ -scrape_configs: -- job_name: tobey - scrape_interval: 1s - static_configs: - - targets: - - tobey:8080 From 2db65605fe8bfb0bc8721f216af95c81e48d930e Mon Sep 17 00:00:00 2001 From: Marius Wilms Date: Fri, 13 Sep 2024 15:06:19 +0200 Subject: [PATCH 03/57] Make hosting a hidden directory --- {hosting => .hosting}/scaffold/index.yml | 0 {hosting => .hosting}/scaffold/template/app/app-deployment.yml | 0 {hosting => .hosting}/scaffold/template/app/app-service.yml | 0 {hosting => .hosting}/scaffold/template/secrets/app-secrets.yml | 0 4 files changed, 0 insertions(+), 0 deletions(-) rename {hosting => .hosting}/scaffold/index.yml (100%) rename {hosting => .hosting}/scaffold/template/app/app-deployment.yml (100%) rename {hosting => .hosting}/scaffold/template/app/app-service.yml (100%) rename {hosting => .hosting}/scaffold/template/secrets/app-secrets.yml (100%) diff --git a/hosting/scaffold/index.yml b/.hosting/scaffold/index.yml similarity index 100% rename from hosting/scaffold/index.yml rename to .hosting/scaffold/index.yml diff --git a/hosting/scaffold/template/app/app-deployment.yml b/.hosting/scaffold/template/app/app-deployment.yml similarity index 100% rename from hosting/scaffold/template/app/app-deployment.yml rename to .hosting/scaffold/template/app/app-deployment.yml diff --git a/hosting/scaffold/template/app/app-service.yml b/.hosting/scaffold/template/app/app-service.yml similarity index 100% rename from hosting/scaffold/template/app/app-service.yml rename to .hosting/scaffold/template/app/app-service.yml diff --git a/hosting/scaffold/template/secrets/app-secrets.yml b/.hosting/scaffold/template/secrets/app-secrets.yml similarity index 100% rename from hosting/scaffold/template/secrets/app-secrets.yml rename to .hosting/scaffold/template/secrets/app-secrets.yml From 0fddebfecb399b7bdb415fe7f6be4800b31fda69 Mon Sep 17 00:00:00 2001 From: Marius Wilms Date: Fri, 13 Sep 2024 15:12:22 +0200 Subject: [PATCH 04/57] Add licenses --- LICENSE.txt | 239 +++++---------------------------- internal/collector/LICENSE.txt | 202 ++++++++++++++++++++++++++++ 2 files changed, 235 insertions(+), 206 deletions(-) create mode 100644 internal/collector/LICENSE.txt diff --git a/LICENSE.txt b/LICENSE.txt index 9374297..9535c8f 100644 --- a/LICENSE.txt +++ b/LICENSE.txt @@ -1,206 +1,33 @@ -This code base contains source code from Colly, it has been forked and modified -for use in Tobey. Where not otherwise stated the unmodified portions of the -Colly source code as found in the directories are licensed under the following -license: - - Apache License - Version 2.0, January 2004 - http://www.apache.org/licenses/ - - TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION - - 1. Definitions. - - "License" shall mean the terms and conditions for use, reproduction, - and distribution as defined by Sections 1 through 9 of this document. 
- - "Licensor" shall mean the copyright owner or entity authorized by - the copyright owner that is granting the License. - - "Legal Entity" shall mean the union of the acting entity and all - other entities that control, are controlled by, or are under common - control with that entity. For the purposes of this definition, - "control" means (i) the power, direct or indirect, to cause the - direction or management of such entity, whether by contract or - otherwise, or (ii) ownership of fifty percent (50%) or more of the - outstanding shares, or (iii) beneficial ownership of such entity. - - "You" (or "Your") shall mean an individual or Legal Entity - exercising permissions granted by this License. - - "Source" form shall mean the preferred form for making modifications, - including but not limited to software source code, documentation - source, and configuration files. - - "Object" form shall mean any form resulting from mechanical - transformation or translation of a Source form, including but - not limited to compiled object code, generated documentation, - and conversions to other media types. - - "Work" shall mean the work of authorship, whether in Source or - Object form, made available under the License, as indicated by a - copyright notice that is included in or attached to the work - (an example is provided in the Appendix below). - - "Derivative Works" shall mean any work, whether in Source or Object - form, that is based on (or derived from) the Work and for which the - editorial revisions, annotations, elaborations, or other modifications - represent, as a whole, an original work of authorship. For the purposes - of this License, Derivative Works shall not include works that remain - separable from, or merely link (or bind by name) to the interfaces of, - the Work and Derivative Works thereof. - - "Contribution" shall mean any work of authorship, including - the original version of the Work and any modifications or additions - to that Work or Derivative Works thereof, that is intentionally - submitted to Licensor for inclusion in the Work by the copyright owner - or by an individual or Legal Entity authorized to submit on behalf of - the copyright owner. For the purposes of this definition, "submitted" - means any form of electronic, verbal, or written communication sent - to the Licensor or its representatives, including but not limited to - communication on electronic mailing lists, source code control systems, - and issue tracking systems that are managed by, or on behalf of, the - Licensor for the purpose of discussing and improving the Work, but - excluding communication that is conspicuously marked or otherwise - designated in writing by the copyright owner as "Not a Contribution." - - "Contributor" shall mean Licensor and any individual or Legal Entity - on behalf of whom a Contribution has been received by Licensor and - subsequently incorporated within the Work. - - 2. Grant of Copyright License. Subject to the terms and conditions of - this License, each Contributor hereby grants to You a perpetual, - worldwide, non-exclusive, no-charge, royalty-free, irrevocable - copyright license to reproduce, prepare Derivative Works of, - publicly display, publicly perform, sublicense, and distribute the - Work and such Derivative Works in Source or Object form. - - 3. Grant of Patent License. 
Subject to the terms and conditions of - this License, each Contributor hereby grants to You a perpetual, - worldwide, non-exclusive, no-charge, royalty-free, irrevocable - (except as stated in this section) patent license to make, have made, - use, offer to sell, sell, import, and otherwise transfer the Work, - where such license applies only to those patent claims licensable - by such Contributor that are necessarily infringed by their - Contribution(s) alone or by combination of their Contribution(s) - with the Work to which such Contribution(s) was submitted. If You - institute patent litigation against any entity (including a - cross-claim or counterclaim in a lawsuit) alleging that the Work - or a Contribution incorporated within the Work constitutes direct - or contributory patent infringement, then any patent licenses - granted to You under this License for that Work shall terminate - as of the date such litigation is filed. - - 4. Redistribution. You may reproduce and distribute copies of the - Work or Derivative Works thereof in any medium, with or without - modifications, and in Source or Object form, provided that You - meet the following conditions: - - (a) You must give any other recipients of the Work or - Derivative Works a copy of this License; and - - (b) You must cause any modified files to carry prominent notices - stating that You changed the files; and - - (c) You must retain, in the Source form of any Derivative Works - that You distribute, all copyright, patent, trademark, and - attribution notices from the Source form of the Work, - excluding those notices that do not pertain to any part of - the Derivative Works; and - - (d) If the Work includes a "NOTICE" text file as part of its - distribution, then any Derivative Works that You distribute must - include a readable copy of the attribution notices contained - within such NOTICE file, excluding those notices that do not - pertain to any part of the Derivative Works, in at least one - of the following places: within a NOTICE text file distributed - as part of the Derivative Works; within the Source form or - documentation, if provided along with the Derivative Works; or, - within a display generated by the Derivative Works, if and - wherever such third-party notices normally appear. The contents - of the NOTICE file are for informational purposes only and - do not modify the License. You may add Your own attribution - notices within Derivative Works that You distribute, alongside - or as an addendum to the NOTICE text from the Work, provided - that such additional attribution notices cannot be construed - as modifying the License. - - You may add Your own copyright statement to Your modifications and - may provide additional or different license terms and conditions - for use, reproduction, or distribution of Your modifications, or - for any such Derivative Works as a whole, provided Your use, - reproduction, and distribution of the Work otherwise complies with - the conditions stated in this License. - - 5. Submission of Contributions. Unless You explicitly state otherwise, - any Contribution intentionally submitted for inclusion in the Work - by You to the Licensor shall be under the terms and conditions of - this License, without any additional terms or conditions. - Notwithstanding the above, nothing herein shall supersede or modify - the terms of any separate license agreement you may have executed - with Licensor regarding such Contributions. - - 6. Trademarks. 
This License does not grant permission to use the trade - names, trademarks, service marks, or product names of the Licensor, - except as required for reasonable and customary use in describing the - origin of the Work and reproducing the content of the NOTICE file. - - 7. Disclaimer of Warranty. Unless required by applicable law or - agreed to in writing, Licensor provides the Work (and each - Contributor provides its Contributions) on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or - implied, including, without limitation, any warranties or conditions - of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A - PARTICULAR PURPOSE. You are solely responsible for determining the - appropriateness of using or redistributing the Work and assume any - risks associated with Your exercise of permissions under this License. - - 8. Limitation of Liability. In no event and under no legal theory, - whether in tort (including negligence), contract, or otherwise, - unless required by applicable law (such as deliberate and grossly - negligent acts) or agreed to in writing, shall any Contributor be - liable to You for damages, including any direct, indirect, special, - incidental, or consequential damages of any character arising as a - result of this License or out of the use or inability to use the - Work (including but not limited to damages for loss of goodwill, - work stoppage, computer failure or malfunction, or any and all - other commercial damages or losses), even if such Contributor - has been advised of the possibility of such damages. - - 9. Accepting Warranty or Additional Liability. While redistributing - the Work or Derivative Works thereof, You may choose to offer, - and charge a fee for, acceptance of support, warranty, indemnity, - or other liability obligations and/or rights consistent with this - License. However, in accepting such obligations, You may act only - on Your own behalf and on Your sole responsibility, not on behalf - of any other Contributor, and only if You agree to indemnify, - defend, and hold each Contributor harmless for any liability - incurred by, or claims asserted against, such Contributor by reason - of your accepting any such warranty or additional liability. - - END OF TERMS AND CONDITIONS - - APPENDIX: How to apply the Apache License to your work. - - To apply the Apache License to your work, attach the following - boilerplate notice, with the fields enclosed by brackets "[]" - replaced with your own identifying information. (Don't include - the brackets!) The text should be enclosed in the appropriate - comment syntax for the file format. We also recommend that a - file or class name and description of purpose be included on the - same "printed page" as the copyright notice for easier - identification within third-party archives. - - Copyright [yyyy] [name of copyright owner] - - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. +BSD 3-Clause License + +Copyright (c) 2023, Factorial GmbH. +All rights reserved. 
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+
+* Redistributions of source code must retain the above copyright notice, this
+ list of conditions and the following disclaimer.
+
+* Redistributions in binary form must reproduce the above copyright notice,
+ this list of conditions and the following disclaimer in the documentation
+ and/or other materials provided with the distribution.
+
+* Neither the name of the copyright holder nor the names of its
+ contributors may be used to endorse or promote products derived from
+ this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+This code base also contains source code from Colly, which has been forked and
+modified for use in Tobey in internal/collector. The original license can be
+found at internal/collector/LICENSE.txt.
diff --git a/internal/collector/LICENSE.txt b/internal/collector/LICENSE.txt
new file mode 100644
index 0000000..d645695
--- /dev/null
+++ b/internal/collector/LICENSE.txt
@@ -0,0 +1,202 @@
+
+ Apache License
+ Version 2.0, January 2004
+ http://www.apache.org/licenses/
+
+ TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
+
+ 1. Definitions.
+
+ "License" shall mean the terms and conditions for use, reproduction,
+ and distribution as defined by Sections 1 through 9 of this document.
+
+ "Licensor" shall mean the copyright owner or entity authorized by
+ the copyright owner that is granting the License.
+
+ "Legal Entity" shall mean the union of the acting entity and all
+ other entities that control, are controlled by, or are under common
+ control with that entity. For the purposes of this definition,
+ "control" means (i) the power, direct or indirect, to cause the
+ direction or management of such entity, whether by contract or
+ otherwise, or (ii) ownership of fifty percent (50%) or more of the
+ outstanding shares, or (iii) beneficial ownership of such entity.
+
+ "You" (or "Your") shall mean an individual or Legal Entity
+ exercising permissions granted by this License.
+
+ "Source" form shall mean the preferred form for making modifications,
+ including but not limited to software source code, documentation
+ source, and configuration files.
+
+ "Object" form shall mean any form resulting from mechanical
+ transformation or translation of a Source form, including but
+ not limited to compiled object code, generated documentation,
+ and conversions to other media types.
+
+ "Work" shall mean the work of authorship, whether in Source or
+ Object form, made available under the License, as indicated by a
+ copyright notice that is included in or attached to the work
+ (an example is provided in the Appendix below). 
+ + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. 
You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. 
In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. + + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. + + Copyright [yyyy] [name of copyright owner] + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. From 370914a8f4c4d69ffef9c61515c1b208f5068b13 Mon Sep 17 00:00:00 2001 From: Marius Wilms Date: Fri, 13 Sep 2024 15:15:02 +0200 Subject: [PATCH 05/57] Remove feature stubs --- README.md | 34 ---------------------------------- 1 file changed, 34 deletions(-) diff --git a/README.md b/README.md index 2e2a166..e047cee 100644 --- a/README.md +++ b/README.md @@ -242,40 +242,6 @@ by providing the `auth` key: ] ``` -### Prioritites (not implemented) - -tbd - -### Sample Size (not implemented) - -When a sample size is given, and its threshold of crawled pages has been reached -the crawl request will stop fetching more pages. Please note that slightly more pages -than the sample size might be returned. - -```jsonc -{ - "url": "https://example.org", - "sample_size": 10 -} -``` - -### Oneshot Mode (not implemented) - -By default the URLs submitted are considered entrypoints, you can change this -behavior by providing the query parameter `oneshot`. 
This will only download the -resource as found under the URL and nothing more. Of course multiple URLs (see -below) are usable here as well. - -```sh -curl -X POST http://127.0.0.1:8080?oneshot # ... -``` - -```jsonc -{ - "url": "https://example.org/values" -} -``` - ### Using Webhook to state where results should go [Webhooks](https://mailchimp.com/en/marketing-glossary/webhook) are a technique to notify other services about a result, once its ready. From fa73bed5c53133fbd3aafe0b6d52354d6cb694ac Mon Sep 17 00:00:00 2001 From: Marius Wilms Date: Fri, 13 Sep 2024 15:20:49 +0200 Subject: [PATCH 06/57] Rewrite arch section --- README.md | 78 +++++++++++++++++++++++++++++++------------------------ 1 file changed, 44 insertions(+), 34 deletions(-) diff --git a/README.md b/README.md index e047cee..2cdfc5f 100644 --- a/README.md +++ b/README.md @@ -23,40 +23,7 @@ for easy load balancing. The instances will coordinate with each other via Redis - Per host rate limiting, even when multiple instances are used. - Full support for OpenTelemetry. -## Constraints - -- Also Tobey can be configured - on a per run basis - to crawl websites behind - HTTP basic auth, **it does not support fetching personalized content**. It is - expected that the website is generally publicly available, and that the content - is the same for all users. When HTTP basic auth is used by the website it must - only be so in order to prevent early access. - -## Architecture - -- The service optimizes for throughput per host. The rate limit and the requests - a host can handle in timely fashion is what mainly limits the throughput. In - order to maximize throughput we have to use the rate limit to its fullest. We - will also have to find out the maximum rate limit per host, for that we must be - able to adjust the rate limit per host dynamically. -- Runs are transient and they get evicted both from local memory and the store after a certain time, or whenever we hit a fixed limit. -- The instance must provide enough local memory and store so information about hosts that we access during runs can be kept and stored. -- However it can be assumed the number of hosts is sufficiently large enough, that we wouldn't be able - to keep a go routine which will hold a consumer for each host persistently. Go - routines are cheap, but hard to keep alive, when they interact with external - resources. -- For the same reason a worker process per host isn't suitable, also one worker per host wouldn't be enough. With a pool of workers - that could also handle many requests to a host at the same time we're better set up. -- We cannot pre-caclulate the delay when processing each incoming request is ok. As the rate-limit per host is dynamic and can change at any time, i.e. - when the host returns headers that allow us to adjust the rate limit. We want to do this as one of the main goals is throughput per host. -- Although the semantic correct way would be to have everything be scoped to a Run, i.e. Robots, Sitemap, etc. we will not do this. This approach - would (a) lead to a deep object graph (Run -> Host -> Robots, Sitemap, etc.) in which Run becomes kind of an god object and (b) it make hard - to share safe information between runs and prevent us from using a global cache. -- Information about the host's rate limiting state is not directly stored in the HostStore and passed to the work queue, instead the work queue will use the HostStore. The work queue hides - the dynamic adaption to the rate limit. Nobody else needs to know about it. 
-- Retrieved sitemaps and robot control files are not stored in the HostStore but in a global cache of the HTTP client.
-  Independent of the of the expiry set for a robot control file, it will be cached in-memory for a certain time, as we have
-  to check it for every request to a host. This adds another layer of caching. When changing the
-  robot control file, the cache can be invalidated by sending the XXX signal to all instances of tobey.
+## Running Tobey
 
 ```sh
 # In the first terminal start the service.
@@ -68,6 +35,49 @@ curl -X POST http://127.0.0.1:8080 \
   -d '{"url": "https://www.example.org/"}'
 ```
 
+
+## Architecture
+
+The Tobey Crawler architecture optimizes throughput per host by dynamically
+managing rate limits, ensuring that requests to each host are processed as
+efficiently as possible. The crawler does not impose static rate limits;
+instead, it adapts to each host's capabilities, adjusting the rate limit in real
+time based on feedback from headers or other factors.
+
+This dynamic adjustment is essential because maximizing throughput requires
+using the host’s capacity without overloading it. To manage these rate limits
+effectively, Tobey employs a rate-limited work queue that abstracts away the
+complexities of dynamic rate limiting from other parts of the system. The goal
+is to focus on maintaining a steady flow of requests without overwhelming
+individual hosts.
+
+The crawler is designed to handle a large, potentially infinite number of hosts,
+which presents challenges for managing resources like memory and concurrency.
+Keeping a persistent worker process or goroutine for each host would be
+inefficient and resource-intensive, particularly since external interactions
+can make it difficult to keep them alive. Instead, Tobey uses a pool of workers
+that can process multiple requests per host concurrently, balancing the workload
+across different hosts.
+
+Caching is a critical part of the architecture. The crawler uses a global cache
+for HTTP responses. Sitemaps and robot control files are also cached.
+While these files have expiration times, the crawler maintains an in-memory
+cache to quickly validate requests without constantly retrieving them. The cache
+is designed to be updated or invalidated as necessary, and a signal can be sent
+across all Tobey instances to ensure the latest robot control files are used,
+keeping the system responsive and compliant. This layered caching strategy,
+along with the dynamic rate limit adjustment, ensures that Tobey maintains high
+efficiency and adaptability during its crawling operations.
+
+## Trade-offs
+
+Although Tobey can be configured - on a per-run basis - to crawl websites behind
+HTTP basic auth, **it does not support fetching personalized content**. It is
+expected that the website is generally publicly available, and that the content
+is the same for all users. When HTTP basic auth is used by the website, it must
+only be used to prevent early access.
+
+
 ## Configuration
 
 The service is configured via environment variables.
The following environment

From 84ed09157e2400cccccabcb6835b792b4cc5e9e Mon Sep 17 00:00:00 2001
From: Marius Wilms 
Date: Tue, 22 Oct 2024 16:05:08 +0200
Subject: [PATCH 07/57] Ignore .env

---
 .gitignore | 1 +
 1 file changed, 1 insertion(+)

diff --git a/.gitignore b/.gitignore
index f53d1a0..9bdd304 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,2 +1,3 @@
 cache
 tobey
+.env

From 115dcf0da97d05155381e33c421e68c21184e916 Mon Sep 17 00:00:00 2001
From: Marius Wilms 
Date: Tue, 22 Oct 2024 16:18:31 +0200
Subject: [PATCH 08/57] Move promotion and rate limit optimization into their
 own files

---
 jobpromoter.go        | 192 ++++++++++++++++++++++++++++++++
 ratelimitoptimizer.go |  83 ++++++++++++++
 workqueue_memory.go   | 253 ------------------------------------------
 3 files changed, 275 insertions(+), 253 deletions(-)
 create mode 100644 jobpromoter.go
 create mode 100644 ratelimitoptimizer.go

diff --git a/jobpromoter.go b/jobpromoter.go
new file mode 100644
index 0000000..6dc2b5a
--- /dev/null
+++ b/jobpromoter.go
@@ -0,0 +1,192 @@
+package main
+
+// The job promoter looks at host queues in a round-robin fashion and promotes
+// messages from a host queue to the default queue.
+
+import (
+	"context"
+	"log/slog"
+	"sync/atomic"
+	"time"
+)
+
+// startPromoter starts the promoter goroutine that shovels messages from host
+// queues into the default queue. The promoter is responsible for load balancing
+// the host queues.
+func (wq *MemoryVisitWorkQueue) startPromoter(ctx context.Context) {
+	go func() {
+		slog.Debug("Work Queue: Starting promoter...")
+
+		// Load balancing queue querying is local to each tobey instance. This should wrap
+		// around at 2^32.
+		var next uint32
+
+		for {
+		immediatecalc:
+			// slog.Debug("Work Queue: Calculating immediate queue candidates.")
+			immediate, shortestPauseUntil := wq.calcImmediateHostQueueCandidates()
+
+			// Check how long we have to wait until we can recalc immediate candidates.
+			if len(immediate) == 0 {
+				// Nothing to do yet, try again whenever
+				// 1. a new message is published,
+				// 2. the rate limit is adjusted
+				// 3. a queue is now unpaused.
+				var delay time.Duration
+
+				if !shortestPauseUntil.IsZero() { // None may be paused.
+					delay = shortestPauseUntil.Sub(time.Now())
+				} else {
+					delay = 60 * time.Second
+				}
+
+				slog.Debug("Work Queue: No immediate queue candidates, waiting for a while...", "delay", delay)
+				select {
+				case <-time.After(delay):
+					// slog.Debug("Work Queue: Pause time passed.")
+					goto immediatecalc
+				case <-wq.shoudlRecalc:
+					slog.Debug("Work Queue: Got notification to re-calc immediate queue candidates.")
+					goto immediatecalc
+				case <-ctx.Done():
+					slog.Debug("Work Queue: Context cancelled, stopping promoter.")
+					return
+				}
+			}
+
+			// When we get here we have at least one host queue that can be
+			// queried, if we have multiple candidates we load balance over
+			// them.
+			slog.Debug("Work Queue: Final immediate queue candidates calculated.", "count", len(immediate))
+
+			n := atomic.AddUint32(&next, 1)
+
+			key := immediate[(int(n)-1)%len(immediate)]
+			hq, _ := wq.hqueues[key]
+
+			// FIXME: The host queue might have gone poof in the meantime, we should
+			// check if the host queue is still there.
+
+			slog.Debug("Work Queue: Querying host queue for messages.", "queue.name", hq.Name)
+			select {
+			case pkg := <-hq.Queue:
+				// Now promote the pkg to the default queue.
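+				// The nested select below hands the message over without
+				// blocking: if the default queue is full, the message is
+				// dropped and a warning is logged, rather than stalling the
+				// promoter.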
+				select {
+				case wq.dqueue <- pkg:
+					slog.Debug("Work Queue: Message promoted.", "msg.id", pkg.Message.ID)
+
+					wq.mu.Lock()
+					wq.hqueues[hq.ID].HasReservation = false
+					wq.mu.Unlock()
+				default:
+					slog.Warn("Work Queue: full, dropping to-be-promoted message!", "msg", pkg.Message)
+				}
+			default:
+				// The channel may have become empty in the meantime, we've checked at top of the loop.
+				// slog.Debug("Work Queue: Host queue empty, nothing to promote.", "queue.name", hq.Hostname)
+			case <-ctx.Done():
+				slog.Debug("Work Queue: Context cancelled, stopping promoter.")
+				return
+			}
+		}
+	}()
+}
+
+// calcImmediateHostQueueCandidates calculates which host queues are candidates
+// to be queried for messages to be promoted. The function modifies the host queue
+// properties, so it requires a lock to be held.
+//
+// The second return value, if non-zero, indicates the shortest time until a host
+// queue is paused (not only the candidates). When no candidate is found, the
+// caller should wait at least for that time and then try to call this function
+// again.
+func (wq *MemoryVisitWorkQueue) calcImmediateHostQueueCandidates() ([]uint32, time.Time) {
+	// Host queue candidates that can be queried immediately.
+	immediate := make([]uint32, 0, len(wq.hqueues))
+	var shortestPauseUntil time.Time
+
+	wq.mu.Lock()
+	defer wq.mu.Unlock()
+
+	// First calculate which host queues are candidates for immediate
+	// querying. This is to avoid unnecessarily hitting the rate limiter for
+	// that host, as this can be more expensive than checking the
+	// PausedUntil time, or the length of the queue.
+	//
+	// FIXME: It might be less expensive to first check for the PausedUntil
+	// time and then check the length of the queue, depending on the
+	// underlying implementation of the work queue.
+	for k, hq := range wq.hqueues {
+		hlogger := slog.With("queue.name", hq.Name)
+		hlogger.Debug("Work Queue: Checking if is candidate.", "len", len(hq.Queue), "now", time.Now(), "pausedUntil", hq.PausedUntil)
+
+		if len(hq.Queue) == 0 {
+			// This host queue is empty, no message to process for that queue,
+			// so don't include it in the immediate list.
+			continue
+		}
+		// hlogger.Debug("Work Queue: Host queue has messages to process.")
+
+		if hq.PausedUntil.IsZero() {
+			// This host queue was never paused before.
+			// hlogger.Debug("Work Queue: Host queue is not paused.")
+		} else {
+			if time.Now().Before(hq.PausedUntil) {
+				// This host queue is *still* paused.
+				// hlogger.Debug("Work Queue: Host queue is paused.")
+
+				// Check if this host queue is paused shorter than the shortest
+				// pause we've seen so far.
+				if shortestPauseUntil.IsZero() || hq.PausedUntil.Before(shortestPauseUntil) {
+					shortestPauseUntil = hq.PausedUntil
+				}
+
+				// As this host queue is still paused we don't include
+				// it in the immediate list. Continue to check if the
+				// next host queue is paused or not.
+				continue
+			} else {
+				// hlogger.Debug("Work Queue: Host queue is not paused anymore.")
+
+				// This host queue is not paused anymore, include it in
+				// the list. While not technically necessary, we reset
+				// the pause until time to zero, for the sake of tidiness.
+				wq.hqueues[k].PausedUntil = time.Time{}
+			}
+		}
+
+		// If we get here, the current host queue was either never
+		// paused, or it is now unpaused. This means we can try to get a
+		// token from the rate limiter, if we haven't already.
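+		//
+		// A reservation, once obtained, is held until the message is actually
+		// promoted; this way a token is not wasted when the default queue is
+		// momentarily full.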
+		if !hq.HasReservation {
+			// hlogger.Debug("Work Queue: Host queue needs a reservation, checking rate limiter.")
+			res := hq.Limiter.Reserve()
+			if !res.OK() {
+				hlogger.Warn("Work Queue: Rate limiter cannot provide reservation in max wait time.")
+				continue
+			}
+
+			if res.Delay() > 0 {
+				hlogger.Debug("Work Queue: Host queue is rate limited, pausing the queue.", "delay", res.Delay())
+
+				// Pause the tube for a while, the limiter wants us to retry later.
+				wq.hqueues[k].PausedUntil = time.Now().Add(res.Delay())
+
+				if shortestPauseUntil.IsZero() || wq.hqueues[k].PausedUntil.Before(shortestPauseUntil) {
+					shortestPauseUntil = wq.hqueues[k].PausedUntil
+				}
+
+				wq.hqueues[k].HasReservation = true
+				continue
+			} else {
+				// Got a token from the limiter, we may act immediately.
+				// hlogger.Debug("Work Queue: Host queue is not rate limited, recording as candidate :)")
+				wq.hqueues[k].HasReservation = true
+			}
+		} else {
+			// hlogger.Debug("Work Queue: Host already has a reservation.")
+		}
+		immediate = append(immediate, hq.ID)
+	}
+	return immediate, shortestPauseUntil
+}
diff --git a/ratelimitoptimizer.go b/ratelimitoptimizer.go
new file mode 100644
index 0000000..6a65e73
--- /dev/null
+++ b/ratelimitoptimizer.go
@@ -0,0 +1,83 @@
+package main
+
+// The rate limit optimizer takes samples and optimizes the rate limit, keeping
+// it as high as possible to maximize throughput without overwhelming a host.
+
+import (
+	"context"
+	"errors"
+	"log/slog"
+	"net/http"
+	"strconv"
+	"time"
+
+	xrate "golang.org/x/time/rate"
+)
+
+func (wq *MemoryVisitWorkQueue) TakeRateLimitHeaders(ctx context.Context, url string, hdr *http.Header) {
+	hq := wq.lazyHostQueue(url)
+
+	if v := hdr.Get("X-RateLimit-Limit"); v != "" {
+		cur := float64(hq.Limiter.Limit())
+
+		parsed, _ := strconv.Atoi(v)
+		desired := max(float64(parsed/60/60), MinHostRPS) // Convert to per second, is per hour.
+
+		if int(desired) != int(cur) {
+			slog.Debug("Work Queue: Rate limit updated.", "host", hq.Name, "now", desired, "change", desired-cur)
+			hq.Limiter.SetLimit(xrate.Limit(desired))
+		}
+		hq.IsAdaptive = false
+	}
+}
+
+// TakeSample implements an algorithm to adjust the rate limiter, to ensure maximum throughput within the bounds
+// set by the rate limiter, while not overwhelming the target. The algorithm will adjust the rate limiter
+// but not go below MinRequestsPerSecondPerHost and not above MaxRequestsPerSecondPerHost.
+func (wq *MemoryVisitWorkQueue) TakeSample(ctx context.Context, url string, statusCode int, err error, d time.Duration) {
+	hq := wq.lazyHostQueue(url)
+
+	cur := float64(hq.Limiter.Limit())
+	var desired float64
+
+	// If we have a status code that indicates the target is already overwhelmed, we don't
+	// increase the rate limit. We also don't adjust the rate limit as we
+	// assume on a 429 this has already been done using TakeRateLimitHeaders.
+
+	switch {
+	case statusCode == 429:
+		fallthrough
+	case statusCode == 503:
+		return
+	case statusCode == 500 || errors.Is(err, context.DeadlineExceeded):
+		// The server is starting to behave badly. We should slow down.
+		// Halve the rate limit, capped at MinRequestsPerSecondPerHost.
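+		// Backing off multiplicatively reacts quickly to repeated errors,
+		// while the MinHostRPS floor ensures we keep probing the host at a
+		// minimal rate.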
+		if cur/2 < MinHostRPS {
+			desired = MinHostRPS
+		} else {
+			desired = cur / 2
+		}
+
+		if int(desired) != int(cur) && hq.IsAdaptive {
+			slog.Debug("Work Queue: Lowering pressure on host.", "host", hq.Name, "now", desired, "change", desired-cur)
+			hq.Limiter.SetLimit(xrate.Limit(desired))
+		}
+		return
+	}
+
+	if d == 0 {
+		return
+	}
+
+	// The closer the latency gets to 1s, the more we want to decrease the
+	// rate limit, capped at MinHostRPS. The closer it gets to 0s, the more
+	// we want to increase the rate limit, capped at MaxHostRPS.
+
+	latency := min(1, d.Seconds()) // Everything above 1s is considered slow.
+	desired = MaxHostRPS*(1-latency) + MinHostRPS*latency
+
+	if int(desired) != int(cur) && hq.IsAdaptive {
+		slog.Debug("Work Queue: Rate limit fine tuned.", "host", hq.Name, "now", desired, "change", desired-cur)
+		hq.Limiter.SetLimit(xrate.Limit(desired))
+	}
+}
diff --git a/workqueue_memory.go b/workqueue_memory.go
index 64e9e09..46ed3cd 100644
--- a/workqueue_memory.go
+++ b/workqueue_memory.go
@@ -2,14 +2,10 @@ package main
 
 import (
 	"context"
-	"errors"
 	"log/slog"
-	"net/http"
 	"net/url"
-	"strconv"
 	"strings"
 	"sync"
-	"sync/atomic"
 	"time"
 
 	"github.com/google/uuid"
@@ -201,255 +197,6 @@ func (wq *MemoryVisitWorkQueue) Pause(ctx context.Context, url string, d time.Du
 	return nil
 }
 
-func (wq *MemoryVisitWorkQueue) TakeRateLimitHeaders(ctx context.Context, url string, hdr *http.Header) {
-	hq := wq.lazyHostQueue(url)
-
-	if v := hdr.Get("X-RateLimit-Limit"); v != "" {
-		cur := float64(hq.Limiter.Limit())
-
-		parsed, _ := strconv.Atoi(v)
-		desired := max(float64(parsed/60/60), MinHostRPS) // Convert to per second, is per hour.
-
-		if int(desired) != int(cur) {
-			slog.Debug("Work Queue: Rate limit updated.", "host", hq.Name, "now", desired, "change", desired-cur)
-			hq.Limiter.SetLimit(xrate.Limit(desired))
-		}
-		hq.IsAdaptive = false
-	}
-}
-
-// TakeSample implements an algorithm to adjust the rate limiter, to ensure maximum througput within the bounds
-// set by the rate limiter, while not overwhelming the target. The algorithm will adjust the rate limiter
-// but not go below MinRequestsPerSecondPerHost and not above MaxRequestsPerSecondPerHost.
-func (wq *MemoryVisitWorkQueue) TakeSample(ctx context.Context, url string, statusCode int, err error, d time.Duration) {
-	hq := wq.lazyHostQueue(url)
-
-	cur := float64(hq.Limiter.Limit())
-	var desired float64
-
-	// If we have a status code that the target is already overwhelmed, we don't
-	// increase the rate limit. We also don't adjust the rate limit as we
-	// assume on a 429 this has already been done using TakeRateLimitHeaders
-
-	switch {
-	case statusCode == 429:
-		fallthrough
-	case statusCode == 503:
-		return
-	case statusCode == 500 || errors.Is(err, context.DeadlineExceeded):
-		// The server is starting to behave badly. We should slow down.
-		// Half the rate limit, capped at MinRequestsPerSecondPerHost.
-		if cur/2 < MinHostRPS {
-			desired = MinHostRPS
-		} else {
-			desired = cur / 2
-		}
-
-		if int(desired) != int(cur) && hq.IsAdaptive {
-			slog.Debug("Work Queue: Lowering pressure on host.", "host", hq.Name, "now", desired, "change", desired-cur)
-			hq.Limiter.SetLimit(xrate.Limit(desired))
-		}
-		return
-	}
-
-	if d == 0 {
-		return
-	}
-
-	// The higher the latency comes close to 1s the more we want to decrease the
-	// rate limit, capped at MinHostRPS. The lower the latency comes clost to 0s
-	// the more we want to increase the rate limit, capped at MaxHostRPS.
- - latency := min(1, d.Seconds()) // Everything above 1s is considered slow. - desired = MaxHostRPS*(1-latency) + MinHostRPS*latency - - if int(desired) != int(cur) && hq.IsAdaptive { - slog.Debug("Work Queue: Rate limit fine tuned.", "host", hq.Name, "now", desired, "change", desired-cur) - hq.Limiter.SetLimit(xrate.Limit(desired)) - } -} - -// startPromoter starts a the promoter goroutine that shovels message from host -// queue into the default queue. The promoter is responsible for load balancing -// the host queues. -func (wq *MemoryVisitWorkQueue) startPromoter(ctx context.Context) { - go func() { - slog.Debug("Work Queue: Starting promoter...") - - // Load balancing queue querying is local to each tobey instance. This should wrap - // around at 2^32. - var next uint32 - - for { - immediatecalc: - // slog.Debug("Work Queue: Calculating immediate queue candidates.") - immediate, shortestPauseUntil := wq.calcImmediateHostQueueCandidates() - - // Check how long we have to wait until we can recalc immidiate candidates. - if len(immediate) == 0 { - // Nothin to do yet, try again whenever - // 1. a new message is published, - // 2. the rate limited is adjusted - // 3. a queue is now unpaused. - var delay time.Duration - - if !shortestPauseUntil.IsZero() { // None may be paused. - delay = shortestPauseUntil.Sub(time.Now()) - } else { - delay = 60 * time.Second - } - - slog.Debug("Work Queue: No immediate queue candidates, waiting for a while...", "delay", delay) - select { - case <-time.After(delay): - // slog.Debug("Work Queue: Pause time passed.") - goto immediatecalc - case <-wq.shoudlRecalc: - slog.Debug("Work Queue: Got notification to re-calc immediate queue candidates.") - goto immediatecalc - case <-ctx.Done(): - slog.Debug("Work Queue: Context cancelled, stopping promoter.") - return - } - } - - // When we get here we have a at least one host queue that can be - // queried, if we have multiple candidates we load balance over - // them. - slog.Debug("Work Queue: Final immediate queue candidates calculated.", "count", len(immediate)) - - n := atomic.AddUint32(&next, 1) - - key := immediate[(int(n)-1)%len(immediate)] - hq, _ := wq.hqueues[key] - - // FIXME: The host queue might haven gone poof in the meantime, we should - // check if the host queue is still there. - - slog.Debug("Work Queue: Querying host queue for messages.", "queue.name", hq.Name) - select { - case pkg := <-hq.Queue: - // Now promote the pkg to the default queue. - select { - case wq.dqueue <- pkg: - slog.Debug("Work Queue: Message promoted.", "msg.id", pkg.Message.ID) - - wq.mu.Lock() - wq.hqueues[hq.ID].HasReservation = false - wq.mu.Unlock() - default: - slog.Warn("Work Queue: full, dropping to-be-promoted message!", "msg", pkg.Message) - } - default: - // The channel may became empty in the meantime, we've checked at top of the loop. - // slog.Debug("Work Queue: Host queue empty, nothing to promote.", "queue.name", hq.Hostname) - case <-ctx.Done(): - slog.Debug("Work Queue: Context cancelled, stopping promoter.") - return - } - } - }() -} - -// calcImmediateHostQueueCandidates calculates which host queues are candidates -// to be queried for message to be promoted. The function modifies the host queue -// properties, so it requires a lock to be held. -// -// The second return value if non zero indicates the shortest time until a host -// queue is paused (non only the candidates). When no candidate is found, the -// callee should wait at least for that time and than try and call this function -// again. 
-func (wq *MemoryVisitWorkQueue) calcImmediateHostQueueCandidates() ([]uint32, time.Time) { - // Host queue candidates that can be queried immediately. - immediate := make([]uint32, 0, len(wq.hqueues)) - var shortestPauseUntil time.Time - - wq.mu.Lock() - defer wq.mu.Unlock() - - // First calculate which host queues are candidates for immediate - // querying. This is to unnecesarily hitting the rate limiter for - // that host, as this can be expensive. More than checking the - // PausedUntil time, or the length of the queue. - // - // FIXME: It might be less expensive to first check for the PausedUntil - // time and then check the length of the queue, depending on the - // underlying implementation of the work queue. - for k, hq := range wq.hqueues { - hlogger := slog.With("queue.name", hq.Name) - hlogger.Debug("Work Queue: Checking if is candidate.", "len", len(hq.Queue), "now", time.Now(), "pausedUntil", hq.PausedUntil) - - if len(hq.Queue) == 0 { - // This host queue is empty, no message to process for that queue, - // so don't include it in the immediate list. - continue - } - // hlogger.Debug("Work Queue: Host queue has messages to process.") - - if hq.PausedUntil.IsZero() { - // This host queue was never paused before. - // hlogger.Debug("Work Queue: Host queue is not paused.") - } else { - if time.Now().Before(hq.PausedUntil) { - // This host queue is *still* paused. - // hlogger.Debug("Work Queue: Host queue is paused.") - - // Check if this host queue is paused shorter thant the shortest - // pause we've seen so far. - if shortestPauseUntil.IsZero() || hq.PausedUntil.Before(shortestPauseUntil) { - shortestPauseUntil = hq.PausedUntil - } - - // As this host queue is still paused we don't include - // it in the imediate list. Continue to check if the - // next host queue is paused or not. - continue - } else { - // hlogger.Debug("Work Queue: Host queue is not paused anymore.") - - // This host queue is not paused anymore, include it in - // the list. While not technically necessary, we reset - // the pause until time to zero, for the sake of tidiness. - wq.hqueues[k].PausedUntil = time.Time{} - } - } - - // If we get here, the current host queue was either never - // paused, or it is now unpaused. This means we can try to get a - // token from the rate limiter, if we haven't already. - if !hq.HasReservation { - // hlogger.Debug("Work Queue: Host queue needs a reservation, checking rate limiter.") - res := hq.Limiter.Reserve() - if !res.OK() { - hlogger.Warn("Work Queue: Rate limiter cannot provide reservation in max wait time.") - continue - } - - if res.Delay() > 0 { - hlogger.Debug("Work Queue: Host queue is rate limited, pausing the queue.", "delay", res.Delay()) - - // Pause the tube for a while, the limiter wants us to retry later. - wq.hqueues[k].PausedUntil = time.Now().Add(res.Delay()) - - if shortestPauseUntil.IsZero() || wq.hqueues[k].PausedUntil.Before(shortestPauseUntil) { - shortestPauseUntil = wq.hqueues[k].PausedUntil - } - - wq.hqueues[k].HasReservation = true - continue - } else { - // Got a token from the limiter, we may act immediately. - // hlogger.Debug("Work Queue: Host queue is not rate limited, recording as candidate :)") - wq.hqueues[k].HasReservation = true - } - } else { - // hlogger.Debug("Work Queue: Host already has a reservation.") - } - immediate = append(immediate, hq.ID) - } - return immediate, shortestPauseUntil -} - // shoudlRecalc is checked by the promoter to see if it should recalculate. // It is an unbuffered channel. 
If sending is blocked, this means there is // a pending notification. As one notification is enough, to trigger the From a2e1f2cbf0827c060a938d0b0a2ae2c7da1825f6 Mon Sep 17 00:00:00 2001 From: Marius Wilms Date: Tue, 22 Oct 2024 16:21:03 +0200 Subject: [PATCH 09/57] Add license headers --- api.go | 3 +++ api_test.go | 3 +++ connections.go | 3 +++ getenv.go | 3 +++ host.go | 3 +++ host_test.go | 5 +++++ httpclient.go | 3 +++ httpclient_test.go | 3 +++ jobpromoter.go | 5 +++++ main.go | 3 +++ observe.go | 3 +++ otel.go | 3 +++ progress.go | 3 +++ ratelimitoptimizer.go | 5 +++++ robots.go | 3 +++ robots_test.go | 5 +++++ run.go | 3 +++ runmanager.go | 3 +++ sitemap.go | 3 +++ store.go | 3 +++ store_memory.go | 3 +++ store_redis.go | 3 +++ store_redis_test.go | 3 +++ visitworker.go | 3 +++ webhook.go | 3 +++ workqueue.go | 3 +++ workqueue_memory.go | 5 +++++ 27 files changed, 91 insertions(+) diff --git a/api.go b/api.go index acd9f5c..d338b74 100644 --- a/api.go +++ b/api.go @@ -1,4 +1,7 @@ // Copyright 2024 Factorial GmbH. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. package main diff --git a/api_test.go b/api_test.go index 40315ea..3ce8744 100644 --- a/api_test.go +++ b/api_test.go @@ -1,4 +1,7 @@ // Copyright 2024 Factorial GmbH. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. package main diff --git a/connections.go b/connections.go index 970efec..19025ff 100644 --- a/connections.go +++ b/connections.go @@ -1,4 +1,7 @@ // Copyright 2024 Factorial GmbH. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. package main diff --git a/getenv.go b/getenv.go index b203597..197702e 100644 --- a/getenv.go +++ b/getenv.go @@ -1,4 +1,7 @@ // Copyright 2024 Factorial GmbH. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. package main diff --git a/host.go b/host.go index e0f3d46..4831ce5 100644 --- a/host.go +++ b/host.go @@ -1,4 +1,7 @@ // Copyright 2024 Factorial GmbH. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. package main diff --git a/host_test.go b/host_test.go index 88b2aa1..3d910e7 100644 --- a/host_test.go +++ b/host_test.go @@ -1,3 +1,8 @@ +// Copyright 2024 Factorial GmbH. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + package main import ( diff --git a/httpclient.go b/httpclient.go index 2330d02..addfbb5 100644 --- a/httpclient.go +++ b/httpclient.go @@ -1,4 +1,7 @@ // Copyright 2024 Factorial GmbH. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. package main diff --git a/httpclient_test.go b/httpclient_test.go index b8ff49f..78c11e8 100644 --- a/httpclient_test.go +++ b/httpclient_test.go @@ -1,4 +1,7 @@ // Copyright 2024 Factorial GmbH. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. package main diff --git a/jobpromoter.go b/jobpromoter.go index 6dc2b5a..ac2044a 100644 --- a/jobpromoter.go +++ b/jobpromoter.go @@ -1,3 +1,8 @@ +// Copyright 2024 Factorial GmbH. All rights reserved. 
+// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + package main // The job promoter looks at host queues in a round-robin fashion and promotes diff --git a/main.go b/main.go index fb8611b..8f5d3a3 100644 --- a/main.go +++ b/main.go @@ -1,4 +1,7 @@ // Copyright 2024 Factorial GmbH. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. package main diff --git a/observe.go b/observe.go index 70b58b5..49b78ce 100644 --- a/observe.go +++ b/observe.go @@ -1,4 +1,7 @@ // Copyright 2024 Factorial GmbH. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. package main diff --git a/otel.go b/otel.go index fdd3f62..f39cff7 100644 --- a/otel.go +++ b/otel.go @@ -1,4 +1,7 @@ // Copyright 2024 Factorial GmbH. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. package main diff --git a/progress.go b/progress.go index fcebfeb..a1e01fd 100644 --- a/progress.go +++ b/progress.go @@ -1,4 +1,7 @@ // Copyright 2024 Factorial GmbH. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. package main diff --git a/ratelimitoptimizer.go b/ratelimitoptimizer.go index 6a65e73..6a8138a 100644 --- a/ratelimitoptimizer.go +++ b/ratelimitoptimizer.go @@ -1,3 +1,8 @@ +// Copyright 2024 Factorial GmbH. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + package main // The rate limit optimizer takes samples and optimizer the rate limit, keeping diff --git a/robots.go b/robots.go index 1f7587f..725dbb4 100644 --- a/robots.go +++ b/robots.go @@ -1,4 +1,7 @@ // Copyright 2024 Factorial GmbH. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. package main diff --git a/robots_test.go b/robots_test.go index f7c7fa4..217110a 100644 --- a/robots_test.go +++ b/robots_test.go @@ -1,3 +1,8 @@ +// Copyright 2024 Factorial GmbH. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + package main import ( diff --git a/run.go b/run.go index 30d018b..cc45448 100644 --- a/run.go +++ b/run.go @@ -1,4 +1,7 @@ // Copyright 2024 Factorial GmbH. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. package main diff --git a/runmanager.go b/runmanager.go index 5267517..df66e4e 100644 --- a/runmanager.go +++ b/runmanager.go @@ -1,4 +1,7 @@ // Copyright 2024 Factorial GmbH. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. package main diff --git a/sitemap.go b/sitemap.go index 5a8c5ea..ec432c8 100644 --- a/sitemap.go +++ b/sitemap.go @@ -1,4 +1,7 @@ // Copyright 2024 Factorial GmbH. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. package main diff --git a/store.go b/store.go index 2fe0e51..e13d455 100644 --- a/store.go +++ b/store.go @@ -1,4 +1,7 @@ // Copyright 2024 Factorial GmbH. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. 
package main diff --git a/store_memory.go b/store_memory.go index 7bac7cc..ffa75f3 100644 --- a/store_memory.go +++ b/store_memory.go @@ -1,4 +1,7 @@ // Copyright 2024 Factorial GmbH. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. package main diff --git a/store_redis.go b/store_redis.go index 63b90a5..fbf0ab4 100644 --- a/store_redis.go +++ b/store_redis.go @@ -1,4 +1,7 @@ // Copyright 2024 Factorial GmbH. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. package main diff --git a/store_redis_test.go b/store_redis_test.go index 458dc15..5eae2e3 100644 --- a/store_redis_test.go +++ b/store_redis_test.go @@ -1,4 +1,7 @@ // Copyright 2024 Factorial GmbH. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. package main diff --git a/visitworker.go b/visitworker.go index 96984c9..8b175af 100644 --- a/visitworker.go +++ b/visitworker.go @@ -1,4 +1,7 @@ // Copyright 2024 Factorial GmbH. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. package main diff --git a/webhook.go b/webhook.go index 6f77737..c426c84 100644 --- a/webhook.go +++ b/webhook.go @@ -1,4 +1,7 @@ // Copyright 2024 Factorial GmbH. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. package main diff --git a/workqueue.go b/workqueue.go index ee1b363..46803b2 100644 --- a/workqueue.go +++ b/workqueue.go @@ -1,4 +1,7 @@ // Copyright 2024 Factorial GmbH. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. package main diff --git a/workqueue_memory.go b/workqueue_memory.go index 46ed3cd..5e344d6 100644 --- a/workqueue_memory.go +++ b/workqueue_memory.go @@ -1,3 +1,8 @@ +// Copyright 2024 Factorial GmbH. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + package main import ( From 3a09b2487aeacfb6b7164e997430f9ac3da74395 Mon Sep 17 00:00:00 2001 From: Marius Wilms Date: Mon, 4 Nov 2024 12:00:04 +0100 Subject: [PATCH 10/57] Improve testability of queue components --- workqueue.go | 55 ++++++ jobpromoter.go => workqueue_jobpromoter.go | 124 +++++-------- workqueue_jobpromoter_test.go | 6 + workqueue_memory.go | 175 +++++++++--------- workqueue_ratelimit.go | 68 +++++++ ...izer.go => workqueue_ratelimitoptimizer.go | 47 ++--- 6 files changed, 284 insertions(+), 191 deletions(-) rename jobpromoter.go => workqueue_jobpromoter.go (56%) create mode 100644 workqueue_jobpromoter_test.go create mode 100644 workqueue_ratelimit.go rename ratelimitoptimizer.go => workqueue_ratelimitoptimizer.go (53%) diff --git a/workqueue.go b/workqueue.go index 46803b2..7db9616 100644 --- a/workqueue.go +++ b/workqueue.go @@ -8,14 +8,17 @@ package main import ( "context" "errors" + "fmt" "hash/fnv" "log/slog" "net/http" "net/url" "strings" + "sync/atomic" "time" "github.com/redis/go-redis/v9" + "go.opentelemetry.io/otel/propagation" ) const ( @@ -81,8 +84,13 @@ type VisitMessage struct { URL string Created time.Time + // The number of times this job has been retried to be enqueued. Retries uint32 + + // The carrier is used to pass tracing information from the job publisher to + // the job consumer. 
It is used to pass the TraceID and SpanID.
+	Carrier propagation.MapCarrier
 }
 
 // VisitJob is similar to a http.Request, it exists only for a certain time. It
@@ -120,6 +128,53 @@ func CreateWorkQueue(redis *redis.Client) VisitWorkQueue {
 	}
 }
 
+// ControlledQueue is a wrapped queue that is flow controllable by
+// pausing and resuming. It also has a rate limiter attached to it.
+type ControlledQueue struct {
+	ID   uint32
+	Name string // For debugging purposes only.
+
+	Queue chan *VisitMessage
+
+	Limiter Limiter
+
+	// Holds a Unix timestamp, instead of a time.Time so
+	// that we can operate on this without locks.
+	pausedUntil atomic.Int64
+
+	IsAdaptive bool
+}
+
+func (cq *ControlledQueue) String() string {
+	_, until := cq.IsPaused()
+	return fmt.Sprintf("ControlledQueue(%d, %s, %d, %s, %t)", cq.ID, cq.Name, len(cq.Queue), until, cq.Limiter.HoldsReservation())
+}
+
+func (cq *ControlledQueue) IsPaused() (bool, time.Time) {
+	now := time.Now().Unix()
+	until := cq.pausedUntil.Load()
+
+	if until == 0 {
+		return false, time.Time{}
+	}
+	return now < until, time.Unix(until, 0)
+}
+
+func (cq *ControlledQueue) Pause(d time.Duration) time.Time {
+	v := time.Now().Add(d).Unix()
+	o := cq.pausedUntil.Load()
+
+	if o == 0 || v > o { // Pause can only increase.
+		cq.pausedUntil.Store(v)
+		return time.Unix(v, 0)
+	}
+	return time.Unix(o, 0)
+}
+
+func (cq *ControlledQueue) Unpause() {
+	cq.pausedUntil.Store(0)
+}
+
 // guessHost heuristically identifies a host for the given URL. The function
 // doesn't return the host name directly, as it might not exist, but an ID.
 //
diff --git a/jobpromoter.go b/workqueue_jobpromoter.go
similarity index 56%
rename from jobpromoter.go
rename to workqueue_jobpromoter.go
index ac2044a..fb11966 100644
--- a/jobpromoter.go
+++ b/workqueue_jobpromoter.go
@@ -15,10 +15,10 @@ import (
 	"time"
 )
 
-// startPromoter starts the promoter goroutine that shovels messages from host
+// startJobPromoter starts the promoter goroutine that shovels messages from host
 // queues into the default queue. The promoter is responsible for load balancing
 // the host queues.
-func (wq *MemoryVisitWorkQueue) startPromoter(ctx context.Context) {
+func startJobPromoter(ctx context.Context, dqueue chan *VisitMessage, hqueues map[uint32]*ControlledQueue, shouldRecalc chan bool) {
 	go func() {
 		slog.Debug("Work Queue: Starting promoter...")
 
@@ -29,7 +29,7 @@ func (wq *MemoryVisitWorkQueue) startPromoter(ctx context.Context) {
 		for {
 		immediatecalc:
 			// slog.Debug("Work Queue: Calculating immediate queue candidates.")
-			immediate, shortestPauseUntil := wq.calcImmediateHostQueueCandidates()
+			immediate, shortestPauseUntil := calcImmediateHostQueueCandidates(hqueues)
 
 			// Check how long we have to wait until we can recalc immediate candidates.
 			if len(immediate) == 0 {
@@ -50,7 +50,7 @@ func (wq *MemoryVisitWorkQueue) startPromoter(ctx context.Context) {
 			case <-time.After(delay):
 				// slog.Debug("Work Queue: Pause time passed.")
 				goto immediatecalc
-			case <-wq.shoudlRecalc:
+			case <-shouldRecalc:
 				slog.Debug("Work Queue: Got notification to re-calc immediate queue candidates.")
 				goto immediatecalc
 			case <-ctx.Done():
@@ -65,38 +65,40 @@ func (wq *MemoryVisitWorkQueue) startPromoter(ctx context.Context) {
 		slog.Debug("Work Queue: Final immediate queue candidates calculated.", "count", len(immediate))
 
 		n := atomic.AddUint32(&next, 1)
-
 		key := immediate[(int(n)-1)%len(immediate)]
-		hq, _ := wq.hqueues[key]
+		hq, _ := hqueues[key]
 
 		// FIXME: The host queue might have gone poof in the meantime, we should
 		// check if the host queue is still there.
 
-		slog.Debug("Work Queue: Querying host queue for messages.", "queue.name", hq.Name)
-		select {
-		case pkg := <-hq.Queue:
-			// Now promote the pkg to the default queue.
-			select {
-			case wq.dqueue <- pkg:
-				slog.Debug("Work Queue: Message promoted.", "msg.id", pkg.Message.ID)
-
-				wq.mu.Lock()
-				wq.hqueues[hq.ID].HasReservation = false
-				wq.mu.Unlock()
-			default:
-				slog.Warn("Work Queue: full, dropping to-be-promoted message!", "msg", pkg.Message)
-			}
-		default:
-			// The channel may have become empty in the meantime, we've checked at top of the loop.
-			// slog.Debug("Work Queue: Host queue empty, nothing to promote.", "queue.name", hq.Hostname)
-		case <-ctx.Done():
-			slog.Debug("Work Queue: Context cancelled, stopping promoter.")
-			return
+		if promote(ctx, hq.Queue, dqueue) {
+			hq.Limiter.ReleaseReservation()
 		}
 	}
 }()
 }
 
+// promote promotes a message from one channel to another. The function returns
+// true when the message was successfully promoted, false otherwise. The
+// function will not block on an empty source channel or on an overfull target
+// channel, even when these are not buffered.
+func promote[V any](ctx context.Context, a <-chan V, b chan<- V) bool {
+	select {
+	case msg := <-a:
+		select {
+		case b <- msg:
+			return true
+		default:
+			return false
+		}
+	case <-ctx.Done():
+		return false
+	default:
+		return false
+	}
+	return false
+}
+
 // calcImmediateHostQueueCandidates calculates which host queues are candidates
 // to be queried for messages to be promoted. The function modifies the host queue
 // properties, so it requires a lock to be held.
 //
 // The second return value, if non-zero, indicates the shortest time until a host
 // queue is paused (not only the candidates). When no candidate is found, the
 // caller should wait at least for that time and then try to call this function
 // again.
-func (wq *MemoryVisitWorkQueue) calcImmediateHostQueueCandidates() ([]uint32, time.Time) {
+func calcImmediateHostQueueCandidates(hqueues map[uint32]*ControlledQueue) ([]uint32, time.Time) {
 	// Host queue candidates that can be queried immediately.
-	immediate := make([]uint32, 0, len(wq.hqueues))
+	immediate := make([]uint32, 0, len(hqueues))
 	var shortestPauseUntil time.Time
 
-	wq.mu.Lock()
-	defer wq.mu.Unlock()
-
 	// First calculate which host queues are candidates for immediate
 	// querying. This is to avoid unnecessarily hitting the rate limiter for
 	// that host, as this can be more expensive than checking the
 	// PausedUntil time, or the length of the queue.
 	//
 	// FIXME: It might be less expensive to first check for the PausedUntil
 	// time and then check the length of the queue, depending on the
 	// underlying implementation of the work queue.
-	for k, hq := range wq.hqueues {
+	for _, hq := range hqueues {
 		hlogger := slog.With("queue.name", hq.Name)
-		hlogger.Debug("Work Queue: Checking if is candidate.", "len", len(hq.Queue), "now", time.Now(), "pausedUntil", hq.PausedUntil)
+		hlogger.Debug("Work Queue: Checking if is candidate.", "Queue", hq.String())
 
 		if len(hq.Queue) == 0 {
 			// This host queue is empty, no message to process for that queue,
 			// so don't include it in the immediate list.
 			continue
 		}
 		// hlogger.Debug("Work Queue: Host queue has messages to process.")
 
-		if hq.PausedUntil.IsZero() {
-			// This host queue was never paused before.
-			// hlogger.Debug("Work Queue: Host queue is not paused.")
-		} else {
-			if time.Now().Before(hq.PausedUntil) {
-				// This host queue is *still* paused.
-				// hlogger.Debug("Work Queue: Host queue is paused.")
-
-				// Check if this host queue is paused shorter than the shortest
-				// pause we've seen so far.
-				if shortestPauseUntil.IsZero() || hq.PausedUntil.Before(shortestPauseUntil) {
-					shortestPauseUntil = hq.PausedUntil
-				}
-
-				// As this host queue is still paused we don't include
-				// it in the immediate list. Continue to check if the
-				// next host queue is paused or not.
-				continue
-			} else {
-				// hlogger.Debug("Work Queue: Host queue is not paused anymore.")
-
-				// This host queue is not paused anymore, include it in
-				// the list. While not technically necessary, we reset
-				// the pause until time to zero, for the sake of tidiness.
-				wq.hqueues[k].PausedUntil = time.Time{}
+		if paused, until := hq.IsPaused(); paused {
+			// Check if this host queue is paused shorter than the shortest
+			// pause we've seen so far.
+			if shortestPauseUntil.IsZero() || until.Before(shortestPauseUntil) {
+				shortestPauseUntil = until
 			}
+
+			// As this host queue is still paused we don't include
+			// it in the immediate list. Continue to check if the
+			// next host queue is paused or not.
+			continue
 		}
 
 		// If we get here, the current host queue was either never
 		// paused, or it is now unpaused. This means we can try to get a
 		// token from the rate limiter, if we haven't already.
-		if !hq.HasReservation {
+		if !hq.Limiter.HoldsReservation() {
 			// hlogger.Debug("Work Queue: Host queue needs a reservation, checking rate limiter.")
-			res := hq.Limiter.Reserve()
-			if !res.OK() {
+			ok, delay := hq.Limiter.Reserve()
+			if !ok {
 				hlogger.Warn("Work Queue: Rate limiter cannot provide reservation in max wait time.")
 				continue
 			}
 
-			if res.Delay() > 0 {
-				hlogger.Debug("Work Queue: Host queue is rate limited, pausing the queue.", "delay", res.Delay())
+			if delay > 0 {
+				hlogger.Debug("Work Queue: Host queue is rate limited, pausing the queue.", "delay", delay)
 
 				// Pause the tube for a while, the limiter wants us to retry later.
-				wq.hqueues[k].PausedUntil = time.Now().Add(res.Delay())
+				until := hq.Pause(delay)
 
-				if shortestPauseUntil.IsZero() || wq.hqueues[k].PausedUntil.Before(shortestPauseUntil) {
-					shortestPauseUntil = wq.hqueues[k].PausedUntil
+				if shortestPauseUntil.IsZero() || until.Before(shortestPauseUntil) {
+					shortestPauseUntil = until
 				}
-
-				wq.hqueues[k].HasReservation = true
 				continue
-			} else {
-				// Got a token from the limiter, we may act immediately.
- // hlogger.Debug("Work Queue: Host queue is not rate limited, recording as candidate :)") - wq.hqueues[k].HasReservation = true } - } else { - // hlogger.Debug("Work Queue: Host already has a reservation.") } immediate = append(immediate, hq.ID) } diff --git a/workqueue_jobpromoter_test.go b/workqueue_jobpromoter_test.go new file mode 100644 index 0000000..b4c13e2 --- /dev/null +++ b/workqueue_jobpromoter_test.go @@ -0,0 +1,6 @@ +// Copyright 2024 Factorial GmbH. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package main diff --git a/workqueue_memory.go b/workqueue_memory.go index 5e344d6..eda265d 100644 --- a/workqueue_memory.go +++ b/workqueue_memory.go @@ -8,6 +8,7 @@ package main import ( "context" "log/slog" + "net/http" "net/url" "strings" "sync" @@ -16,88 +17,66 @@ import ( "github.com/google/uuid" "go.opentelemetry.io/otel" "go.opentelemetry.io/otel/propagation" - xrate "golang.org/x/time/rate" ) // The maximum number of messages that can exists in the in-memory work queue. const MemoryWorkQueueBufferSize = 1_000_000 -// visitMemoryPackage is used by the in-memory implementation of the WorkQueue. -// Implementations that have a built-in mechanism to transport headers do not -// need to use this. -type visitMemoryPackage struct { - Carrier propagation.MapCarrier - Message *VisitMessage -} - -type hostMemoryVisitWorkQueue struct { - ID uint32 - Name string // For debugging purposes only. - - Queue chan *visitMemoryPackage - - Limiter *xrate.Limiter - HasReservation bool - IsAdaptive bool - PausedUntil time.Time -} - func NewMemoryVisitWorkQueue() *MemoryVisitWorkQueue { return &MemoryVisitWorkQueue{ - dqueue: make(chan *visitMemoryPackage, MemoryWorkQueueBufferSize), - hqueues: make(map[uint32]*hostMemoryVisitWorkQueue), - shoudlRecalc: make(chan bool), + dqueue: make(chan *VisitMessage, MemoryWorkQueueBufferSize), + shouldRecalc: make(chan bool), } } type MemoryVisitWorkQueue struct { - mu sync.RWMutex - // This where consumers read from. - dqueue chan *visitMemoryPackage + dqueue chan *VisitMessage // This is where message get published too. Key ist the hostname including // the port. It's okay to mix enqueue visits with and without authentication // for the same host. - hqueues map[uint32]*hostMemoryVisitWorkQueue + hqueues sync.Map // map[uint32]*ControlledQueue - // shoudlRecalc is checked by the promoter to see if it should recalculate. + // shouldRecalc is checked by the promoter to see if it should recalculate. // It is an unbuffered channel. If sending is blocked, this means there is // a pending notification. As one notification is enough, to trigger the // recalculation, a failed send can be ignored. 
- shoudlRecalc chan bool + shouldRecalc chan bool } func (wq *MemoryVisitWorkQueue) Open(ctx context.Context) error { - wq.startPromoter(ctx) + snap := make(map[uint32]*ControlledQueue) + + wq.hqueues.Range(func(key any, value any) bool { + snap[key.(uint32)] = value.(*ControlledQueue) + return true + }) + + startJobPromoter(ctx, wq.dqueue, snap, wq.shouldRecalc) return nil } -func (wq *MemoryVisitWorkQueue) lazyHostQueue(u string) *hostMemoryVisitWorkQueue { +func (wq *MemoryVisitWorkQueue) lazyHostQueue(u string) *ControlledQueue { p, _ := url.Parse(u) key := guessHost(u) - if _, ok := wq.hqueues[key]; !ok { - wq.mu.Lock() - wq.hqueues[key] = &hostMemoryVisitWorkQueue{ - ID: key, - Name: strings.TrimLeft(p.Hostname(), "www."), - PausedUntil: time.Time{}, - HasReservation: false, - IsAdaptive: true, - Queue: make(chan *visitMemoryPackage, MemoryWorkQueueBufferSize), - Limiter: xrate.NewLimiter(xrate.Limit(MinHostRPS), 1), - } - wq.mu.Unlock() + if _, ok := wq.hqueues.Load(key); !ok { + wq.hqueues.Store(key, &ControlledQueue{ + ID: key, + Name: strings.TrimLeft(p.Hostname(), "www."), + IsAdaptive: true, + Queue: make(chan *VisitMessage, MemoryWorkQueueBufferSize), + Limiter: NewMemoryLimiter(), + }) } - wq.mu.RLock() - defer wq.mu.RUnlock() - return wq.hqueues[key] + v, _ := wq.hqueues.Load(key) + return v.(*ControlledQueue) } func (wq *MemoryVisitWorkQueue) Publish(ctx context.Context, run string, url string) error { - defer wq.shouldRecalc() // Notify promoter that a new message is available. + defer wq.triggerRecalc() // Notify promoter that a new message is available. hq := wq.lazyHostQueue(url) @@ -107,28 +86,25 @@ func (wq *MemoryVisitWorkQueue) Publish(ctx context.Context, run string, url str carrier := propagation.MapCarrier{} propagator.Inject(ctx, carrier) - pkg := &visitMemoryPackage{ + msg := &VisitMessage{ + ID: uuid.New().ID(), + Run: run, + URL: url, + Created: time.Now(), Carrier: carrier, - Message: &VisitMessage{ - ID: uuid.New().ID(), - Run: run, - URL: url, - Created: time.Now(), - }, } select { - case hq.Queue <- pkg: - slog.Debug("Work Queue: Message successfully published.", "msg.id", pkg.Message.ID) + case hq.Queue <- msg: + slog.Debug("Work Queue: Message successfully published.", "msg.id", msg.ID) default: - slog.Warn("Work Queue: full, not publishing message!", "msg", pkg.Message) + slog.Warn("Work Queue: full, not publishing message!", "msg", msg) } return nil } func (wq *MemoryVisitWorkQueue) Republish(ctx context.Context, job *VisitJob) error { - defer wq.shouldRecalc() - + defer wq.triggerRecalc() hq := wq.lazyHostQueue(job.URL) // Extract tracing information from context, to transport it over the @@ -137,22 +113,20 @@ func (wq *MemoryVisitWorkQueue) Republish(ctx context.Context, job *VisitJob) er carrier := propagation.MapCarrier{} propagator.Inject(job.Context, carrier) - pkg := &visitMemoryPackage{ + msg := &VisitMessage{ + ID: job.ID, + Run: job.Run, + URL: job.URL, + Created: job.Created, + Retries: job.Retries + 1, Carrier: carrier, - Message: &VisitMessage{ - ID: job.ID, - Run: job.Run, - URL: job.URL, - Created: job.Created, - Retries: job.Retries + 1, - }, } select { - case hq.Queue <- pkg: - slog.Debug("Work Queue: Message successfully rescheduled.", "msg.id", pkg.Message.ID) + case hq.Queue <- msg: + slog.Debug("Work Queue: Message successfully rescheduled.", "msg.id", msg.ID) default: - slog.Warn("Work Queue: full, not rescheduling message!", "msg", pkg.Message) + slog.Warn("Work Queue: full, not rescheduling message!", "msg", msg) } return nil } 
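The `select` with a `default` arm used in `Publish` and `Republish` above is what keeps producers from blocking when a host queue's buffered channel is full. A standalone sketch of that pattern (the helper name is illustrative, not part of the patch):

```go
package main

import "fmt"

// tryEnqueue sketches the non-blocking send used by Publish and Republish:
// attempt the send, and report failure instead of blocking when the buffered
// channel has no room left. The caller decides whether to drop or log.
func tryEnqueue[T any](queue chan T, msg T) bool {
	select {
	case queue <- msg:
		return true
	default:
		return false
	}
}

func main() {
	q := make(chan int, 1)
	fmt.Println(tryEnqueue(q, 1)) // true: buffer has room
	fmt.Println(tryEnqueue(q, 2)) // false: buffer full, message not enqueued
}
```
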
@@ -172,16 +146,16 @@ func (wq *MemoryVisitWorkQueue) Consume(ctx context.Context) (<-chan *VisitJob,
 				close(reschan)
 				close(errchan)
 				return
-			case p := <-wq.dqueue:
+			case msg := <-wq.dqueue:
 				// slog.Debug("Work Queue: Received message, forwarding to results channel.", "msg.id", p.Message.ID)
 
 				// Initializes the context for the job. Than extract the tracing
 				// information from the carrier into the job's context.
 				jctx := context.Background()
-				jctx = otel.GetTextMapPropagator().Extract(jctx, p.Carrier)
+				jctx = otel.GetTextMapPropagator().Extract(jctx, msg.Carrier)
 
 				reschan <- &VisitJob{
-					VisitMessage: p.Message,
+					VisitMessage: msg,
 					Context:      jctx,
 				}
 				// slog.Debug("Work Queue: Forwarded message to results channel.", "msg.id", p.Message.ID)
@@ -193,22 +167,17 @@ func (wq *MemoryVisitWorkQueue) Consume(ctx context.Context) (<-chan *VisitJob,
 }
 
 func (wq *MemoryVisitWorkQueue) Pause(ctx context.Context, url string, d time.Duration) error {
-	t := time.Now().Add(d)
-	hq := wq.lazyHostQueue(url)
-
-	if hq.PausedUntil.IsZero() || !hq.PausedUntil.After(t) { // Pause can only increase.
-		hq.PausedUntil = t
-	}
+	wq.lazyHostQueue(url).Pause(d)
 	return nil
 }
 
-// shoudlRecalc is checked by the promoter to see if it should recalculate.
+// shouldRecalc is checked by the promoter to see if it should recalculate.
 // It is an unbuffered channel. If sending is blocked, this means there is
 // a pending notification. As one notification is enough, to trigger the
 // recalculation, a failed send can be ignored.
-func (wq *MemoryVisitWorkQueue) shouldRecalc() {
+func (wq *MemoryVisitWorkQueue) triggerRecalc() {
 	select {
-	case wq.shoudlRecalc <- true:
+	case wq.shouldRecalc <- true:
 	default:
 		// A notification is already pending, no need to send another.
 	}
@@ -217,8 +186,42 @@ func (wq *MemoryVisitWorkQueue) Close() error {
 	close(wq.dqueue)
 
-	for _, hq := range wq.hqueues {
-		close(hq.Queue)
-	}
+	wq.hqueues.Range(func(k any, v any) bool {
+		close(v.(*ControlledQueue).Queue)
+		return true
+	})
 	return nil
 }
+
+func (wq *MemoryVisitWorkQueue) TakeRateLimitHeaders(ctx context.Context, url string, hdr *http.Header) {
+	hq := wq.lazyHostQueue(url)
+
+	if !hq.IsAdaptive {
+		return
+	}
+	changed, desired := newRateLimitByHeaders(hq.Limiter.GetLimit(), hdr)
+	if !changed {
+		return
+	}
+
+	hq.Limiter.SetLimit(desired)
+	slog.Debug("Work Queue: Rate limit fine tuned.", "host", hq.Name, "desired", desired)
+}
+
+// TakeSample implements an algorithm to adjust the rate limiter, to ensure maximum throughput within the bounds
+// set by the rate limiter, while not overwhelming the target. The algorithm will adjust the rate limiter
+// but not go below MinHostRPS and not above MaxHostRPS.
+func (wq *MemoryVisitWorkQueue) TakeSample(ctx context.Context, url string, statusCode int, err error, d time.Duration) {
+	hq := wq.lazyHostQueue(url)
+
+	if !hq.IsAdaptive {
+		return
+	}
+	changed, desired := newRateLimitByLatency(hq.Limiter.GetLimit(), statusCode, err, d)
+	if !changed {
+		return
+	}
+
+	hq.Limiter.SetLimit(desired)
+	slog.Debug("Work Queue: Rate limit fine tuned.", "host", hq.Name, "desired", desired)
+}
diff --git a/workqueue_ratelimit.go b/workqueue_ratelimit.go
new file mode 100644
index 0000000..05b206e
--- /dev/null
+++ b/workqueue_ratelimit.go
@@ -0,0 +1,68 @@
+// Copyright 2024 Factorial GmbH. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package main
+
+import (
+	"sync/atomic"
+	"time"
+
+	xrate "golang.org/x/time/rate"
+)
+
+// Limiter is an interface that allows making a reservation and checking if one
+// is being held.
+type Limiter interface {
+	GetLimit() float64
+	SetLimit(float64)
+
+	Reserve() (ok bool, delay time.Duration)
+	HoldsReservation() bool
+	ReleaseReservation()
+}
+
+type Pauser interface {
+	IsPaused() bool
+	Pause(d time.Duration)
+	Unpause()
+}
+
+func NewMemoryLimiter() *MemoryLimiter {
+	return &MemoryLimiter{
+		Limiter: xrate.NewLimiter(xrate.Limit(MinHostRPS), 1),
+	}
+}
+
+// MemoryLimiter wraps xrate.Limiter to satisfy the Limiter interface and
+// provide management around held reservations. It is safe to use concurrently.
+type MemoryLimiter struct {
+	Limiter        *xrate.Limiter
+	hasReservation atomic.Bool
+}
+
+func (l *MemoryLimiter) GetLimit() float64 {
+	return float64(l.Limiter.Limit())
+}
+
+func (l *MemoryLimiter) SetLimit(limit float64) {
+	l.Limiter.SetLimit(xrate.Limit(limit))
+}
+
+func (l *MemoryLimiter) Reserve() (bool, time.Duration) {
+	r := l.Limiter.ReserveN(time.Now(), 1)
+
+	if r.OK() {
+		l.hasReservation.Store(true)
+	}
+	return r.OK(), r.Delay()
+}
+
+func (l *MemoryLimiter) HoldsReservation() bool {
+	return l.hasReservation.Load()
+}
+
+func (l *MemoryLimiter) ReleaseReservation() {
+	l.hasReservation.Store(false)
+}
diff --git a/ratelimitoptimizer.go b/workqueue_ratelimitoptimizer.go
similarity index 53%
rename from ratelimitoptimizer.go
rename to workqueue_ratelimitoptimizer.go
index 6a8138a..652b044 100644
--- a/ratelimitoptimizer.go
+++ b/workqueue_ratelimitoptimizer.go
@@ -11,38 +11,30 @@ package main
 import (
 	"context"
 	"errors"
-	"log/slog"
 	"net/http"
 	"strconv"
 	"time"
-
-	xrate "golang.org/x/time/rate"
 )
 
-func (wq *MemoryVisitWorkQueue) TakeRateLimitHeaders(ctx context.Context, url string, hdr *http.Header) {
-	hq := wq.lazyHostQueue(url)
-
+// newRateLimitByHeaders returns a new rate limit according to rate limiting headers. It ensures
+// that the new rate limit is at least MinHostRPS.
+func newRateLimitByHeaders(cur float64, hdr *http.Header) (bool, float64) {
 	if v := hdr.Get("X-RateLimit-Limit"); v != "" {
-		cur := float64(hq.Limiter.Limit())
-
 		parsed, _ := strconv.Atoi(v)
 		desired := max(float64(parsed/60/60), MinHostRPS) // Convert to per second, is per hour.
 
-		if int(desired) != int(cur) {
-			slog.Debug("Work Queue: Rate limit updated.", "host", hq.Name, "now", desired, "change", desired-cur)
-			hq.Limiter.SetLimit(xrate.Limit(desired))
-		}
-		hq.IsAdaptive = false
+		return int(desired) != int(cur), desired
 	}
+	return false, 0
 }
 
-// TakeSample implements an algorithm to adjust the rate limiter, to ensure maximum througput within the bounds
-// set by the rate limiter, while not overwhelming the target. The algorithm will adjust the rate limiter
-// but not go below MinRequestsPerSecondPerHost and not above MaxRequestsPerSecondPerHost.
-func (wq *MemoryVisitWorkQueue) TakeSample(ctx context.Context, url string, statusCode int, err error, d time.Duration) {
-	hq := wq.lazyHostQueue(url)
-
-	cur := float64(hq.Limiter.Limit())
+// newRateLimitByLatency returns a new rate limit according to the latency of
+// a request/response and its status or error. It implements an algorithm to adjust
+// the rate limiter, to ensure maximum throughput within the bounds set by the
+// rate limiter, while not overwhelming the target. The algorithm will adjust
+// the rate limiter but not go below MinHostRPS and not above
+// MaxHostRPS.
+func newRateLimitByLatency(cur float64, statusCode int, err error, d time.Duration) (bool, float64) {
 	var desired float64
 
 	// If we have a status code that the target is already overwhelmed, we don't
@@ -53,7 +45,7 @@ func (wq *MemoryVisitWorkQueue) TakeSample(ctx context.Context, url string, stat
 	case statusCode == 429:
 		fallthrough
 	case statusCode == 503:
-		return
+		return false, 0
 	case statusCode == 500 || errors.Is(err, context.DeadlineExceeded):
 		// The server is starting to behave badly. We should slow down.
 		// Half the rate limit, capped at MinRequestsPerSecondPerHost.
@@ -63,15 +55,11 @@ func (wq *MemoryVisitWorkQueue) TakeSample(ctx context.Context, url string, stat
 			desired = cur / 2
 		}
 
-		if int(desired) != int(cur) && hq.IsAdaptive {
-			slog.Debug("Work Queue: Lowering pressure on host.", "host", hq.Name, "now", desired, "change", desired-cur)
-			hq.Limiter.SetLimit(xrate.Limit(desired))
-		}
-		return
+		return int(desired) != int(cur), desired
 	}
 
 	if d == 0 {
-		return
+		return false, 0
 	}
 
 	// The higher the latency comes close to 1s the more we want to decrease the
@@ -81,8 +69,5 @@ func (wq *MemoryVisitWorkQueue) TakeSample(ctx context.Context, url string, stat
 	latency := min(1, d.Seconds()) // Everything above 1s is considered slow.
 	desired = MaxHostRPS*(1-latency) + MinHostRPS*latency
 
-	if int(desired) != int(cur) && hq.IsAdaptive {
-		slog.Debug("Work Queue: Rate limit fine tuned.", "host", hq.Name, "now", desired, "change", desired-cur)
-		hq.Limiter.SetLimit(xrate.Limit(desired))
-	}
+	return int(desired) != int(cur), desired
 }

From f72fb4b6a057f4eaa459187e77ad7c8e7ebdcb88 Mon Sep 17 00:00:00 2001
From: Marius Wilms
Date: Mon, 4 Nov 2024 14:39:37 +0100
Subject: [PATCH 11/57] Namespace queue

---
 internal/ctrlq/controlled.go                  |  59 +++++++
 internal/ctrlq/limiter.go                     |  27 +++
 .../ctrlq/limiter_memory.go                   |  19 +--
 workqueue.go => internal/ctrlq/main.go        | 113 +++----------
 .../ctrlq/memory.go                           |   6 +-
 .../ctrlq/optimizer.go                        |  12 +-
 .../ctrlq/promoter.go                         | 155 +++++++++---------
 internal/ctrlq/util.go                        |  32 ++++
 main.go                                       |   3 +-
 run.go                                        |   7 +-
 visitworker.go                                |   7 +-
 workqueue_jobpromoter_test.go                 |   6 -
 12 files changed, 239 insertions(+), 207 deletions(-)
 create mode 100644 internal/ctrlq/controlled.go
 create mode 100644 internal/ctrlq/limiter.go
 rename workqueue_ratelimit.go => internal/ctrlq/limiter_memory.go (76%)
 rename workqueue.go => internal/ctrlq/main.go (65%)
 rename workqueue_memory.go => internal/ctrlq/memory.go (98%)
 rename workqueue_ratelimitoptimizer.go => internal/ctrlq/optimizer.go (89%)
 rename workqueue_jobpromoter.go => internal/ctrlq/promoter.go (56%)
 create mode 100644 internal/ctrlq/util.go
 delete mode 100644 workqueue_jobpromoter_test.go

diff --git a/internal/ctrlq/controlled.go b/internal/ctrlq/controlled.go
new file mode 100644
index 0000000..f9b330e
--- /dev/null
+++ b/internal/ctrlq/controlled.go
@@ -0,0 +1,59 @@
+// Copyright 2024 Factorial GmbH. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package ctrlq
+
+import (
+	"fmt"
+	"sync/atomic"
+	"time"
+)
+
+// ControlledQueue is a wrapped queue that is flow controllable by
+// pausing and resuming. It also has a rate limiter attached to it.
+type ControlledQueue struct {
+	ID   uint32
+	Name string // For debugging purposes only.
+
+	Queue chan *VisitMessage
+
+	Limiter Limiter
+
+	// Holds a Unix timestamp, instead of a time.Time so
+	// that we can operate on this without locks.
+	pausedUntil atomic.Int64
+
+	IsAdaptive bool
+}
+
+func (cq *ControlledQueue) String() string {
+	_, until := cq.IsPaused()
+	return fmt.Sprintf("ControlledQueue(%d, %s, %d, %s, %v)", cq.ID, cq.Name, len(cq.Queue), until, cq.Limiter.HoldsReservation())
+}
+
+func (cq *ControlledQueue) IsPaused() (bool, time.Time) {
+	now := time.Now().Unix()
+	until := cq.pausedUntil.Load()
+
+	if until == 0 {
+		return false, time.Time{}
+	}
+	return now < until, time.Unix(until, 0)
+}
+
+func (cq *ControlledQueue) Pause(d time.Duration) time.Time {
+	v := time.Now().Add(d).Unix()
+	o := cq.pausedUntil.Load()
+
+	if o == 0 || v > o { // Pause can only increase.
+		cq.pausedUntil.Store(v)
+		return time.Unix(v, 0)
+	}
+	return time.Unix(o, 0)
+}
+
+func (cq *ControlledQueue) Unpause() {
+	cq.pausedUntil.Store(0)
+}
diff --git a/internal/ctrlq/limiter.go b/internal/ctrlq/limiter.go
new file mode 100644
index 0000000..edf41b7
--- /dev/null
+++ b/internal/ctrlq/limiter.go
@@ -0,0 +1,27 @@
+// Copyright 2024 Factorial GmbH. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package ctrlq
+
+import (
+	"time"
+)
+
+// Limiter is an interface that allows making a reservation and checking if one
+// is being held.
+type Limiter interface {
+	GetLimit() float64
+	SetLimit(float64)
+
+	Reserve() (ok bool, delay time.Duration)
+	HoldsReservation() bool
+	ReleaseReservation()
+}
+
+type Pauser interface {
+	IsPaused() bool
+	Pause(d time.Duration)
+	Unpause()
+}
diff --git a/workqueue_ratelimit.go b/internal/ctrlq/limiter_memory.go
similarity index 76%
rename from workqueue_ratelimit.go
rename to internal/ctrlq/limiter_memory.go
index 05b206e..17ecd8a 100644
--- a/workqueue_ratelimit.go
+++ b/internal/ctrlq/limiter_memory.go
@@ -3,7 +3,7 @@
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 
-package main
+package ctrlq
 
 import (
 	"sync/atomic"
@@ -12,23 +12,6 @@ import (
 	xrate "golang.org/x/time/rate"
 )
 
-// Limiter is an interface that allows making a reservation and checking if one
-// is being held.
-type Limiter interface {
-	GetLimit() float64
-	SetLimit(float64)
-
-	Reserve() (ok bool, delay time.Duration)
-	HoldsReservation() bool
-	ReleaseReservation()
-}
-
-type Pauser interface {
-	IsPaused() bool
-	Pause(d time.Duration)
-	Unpause()
-}
-
 func NewMemoryLimiter() *MemoryLimiter {
 	return &MemoryLimiter{
 		Limiter: xrate.NewLimiter(xrate.Limit(MinHostRPS), 1),
 	}
diff --git a/workqueue.go b/internal/ctrlq/main.go
similarity index 65%
rename from workqueue.go
rename to internal/ctrlq/main.go
index 7db9616..f24b281 100644
--- a/workqueue.go
+++ b/internal/ctrlq/main.go
@@ -3,33 +3,37 @@
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 
-package main
+// Package ctrlq provides a work queue for accepting and distributing requests
+// to visit a web resource to workers. It transparently handles rate limiting.
+// By providing runtime statistics to the queue it will adjust the rate
+// limiters accordingly.
+//
+// In its most basic form the queue is backed by memory, but it can be backed by
+// Redis if you want to have several instances coordinate over the queue.
+package ctrlq import ( "context" "errors" - "fmt" - "hash/fnv" "log/slog" "net/http" - "net/url" - "strings" - "sync/atomic" "time" "github.com/redis/go-redis/v9" "go.opentelemetry.io/otel/propagation" ) -const ( - // MinHostRPS specifies the minimum number of requests per - // second that are executed against a single host. - MinHostRPS float64 = 1 - - // MaxHostRPS specifies the maximum number of requests per - // second that are exectuted against a single host. - MaxHostRPS float64 = 50 -) +func CreateWorkQueue(redis *redis.Client) VisitWorkQueue { + if redis != nil { + slog.Debug("Using distributed work queue...") + // TODO: Add support for redis work queue. + // return &RedisVisitWorkQueue{conn: redis} + return NewMemoryVisitWorkQueue() + } else { + slog.Debug("Using in-memory work queue...") + return NewMemoryVisitWorkQueue() + } +} // VisitWorkQueue appears to produceres and consumers as a single queue. Each // message in the work queue represents a job for a request to visit a single @@ -115,82 +119,3 @@ func (j *VisitJob) Validate() (bool, error) { } return true, nil } - -func CreateWorkQueue(redis *redis.Client) VisitWorkQueue { - if redis != nil { - slog.Debug("Using distributed work queue...") - // TODO: Add support for redis work queue. - // return &RedisVisitWorkQueue{conn: redis} - return NewMemoryVisitWorkQueue() - } else { - slog.Debug("Using in-memory work queue...") - return NewMemoryVisitWorkQueue() - } -} - -// ControlledQueue is a wrapped queue that is flow controllable, by -// pausing and resuming. It also has a rate limiter attached to it. -type ControlledQueue struct { - ID uint32 - Name string // For debugging purposes only. - - Queue chan *VisitMessage - - Limiter Limiter - - // Holds an Unix timestamp, instead of a time.Time so - // that we can operate on this without locks. - pausedUntil atomic.Int64 - - IsAdaptive bool -} - -func (cq *ControlledQueue) String() string { - _, until := cq.IsPaused() - return fmt.Sprintf("ControlledQueue(%d, %s, %d, %s, %s)", cq.ID, cq.Name, len(cq.Queue), until, cq.Limiter.HoldsReservation()) -} - -func (cq *ControlledQueue) IsPaused() (bool, time.Time) { - now := time.Now().Unix() - until := cq.pausedUntil.Load() - - if until == 0 { - return false, time.Time{} - } - return now < until, time.Unix(until, 0) -} - -func (cq *ControlledQueue) Pause(d time.Duration) time.Time { - v := time.Now().Add(d).Unix() - o := cq.pausedUntil.Load() - - if o == 0 || v > o { // Pause can only increase. - cq.pausedUntil.Store(v) - return time.Unix(v, 0) - } - return time.Unix(o, 0) -} - -func (cq *ControlledQueue) Unpause() { - cq.pausedUntil.Store(0) -} - -// guessHost heuristically identifies a host for the given URL. The function -// doesn't return the host name directly, as it might not exist, but an ID. -// -// It does by by ignoring a www. prefix, leading to www.example.org and -// example.org being considered the same host. It also ignores the port number, -// so example.org:8080 and example.org:9090 are considered the same host as -// well. -// -// Why FNV? 
https://softwareengineering.stackexchange.com/questions/49550 -func guessHost(u string) uint32 { - p, err := url.Parse(u) - if err != nil { - return 0 - } - h := fnv.New32a() - - h.Write([]byte(strings.TrimLeft(p.Hostname(), "www."))) - return h.Sum32() -} diff --git a/workqueue_memory.go b/internal/ctrlq/memory.go similarity index 98% rename from workqueue_memory.go rename to internal/ctrlq/memory.go index eda265d..1ca6ecf 100644 --- a/workqueue_memory.go +++ b/internal/ctrlq/memory.go @@ -3,7 +3,7 @@ // Use of this source code is governed by a BSD-style // license that can be found in the LICENSE file. -package main +package ctrlq import ( "context" @@ -53,7 +53,9 @@ func (wq *MemoryVisitWorkQueue) Open(ctx context.Context) error { return true }) - startJobPromoter(ctx, wq.dqueue, snap, wq.shouldRecalc) + go func() { + promoteLoop(ctx, wq.dqueue, snap, wq.shouldRecalc) + }() return nil } diff --git a/workqueue_ratelimitoptimizer.go b/internal/ctrlq/optimizer.go similarity index 89% rename from workqueue_ratelimitoptimizer.go rename to internal/ctrlq/optimizer.go index 652b044..d2a16cf 100644 --- a/workqueue_ratelimitoptimizer.go +++ b/internal/ctrlq/optimizer.go @@ -3,7 +3,7 @@ // Use of this source code is governed by a BSD-style // license that can be found in the LICENSE file. -package main +package ctrlq // The rate limit optimizer takes samples and optimizer the rate limit, keeping // it as low as possible to maximize throughput without overhelming a host. @@ -16,6 +16,16 @@ import ( "time" ) +const ( + // MinHostRPS specifies the minimum number of requests per + // second that are executed against a single host. + MinHostRPS float64 = 1 + + // MaxHostRPS specifies the maximum number of requests per + // second that are exectuted against a single host. + MaxHostRPS float64 = 50 +) + // newRateLimitByHeaders returns a new rate limit according to rate limiting headers. It ensures // that the new rate limit is at least MinHostRPS. func newRateLimitByHeaders(cur float64, hdr *http.Header) (bool, float64) { diff --git a/workqueue_jobpromoter.go b/internal/ctrlq/promoter.go similarity index 56% rename from workqueue_jobpromoter.go rename to internal/ctrlq/promoter.go index fb11966..ea90209 100644 --- a/workqueue_jobpromoter.go +++ b/internal/ctrlq/promoter.go @@ -3,7 +3,7 @@ // Use of this source code is governed by a BSD-style // license that can be found in the LICENSE file. -package main +package ctrlq // The job promoter looks at host queues in a round-robin fashion and promotes // messages from a host queue to the default queue. @@ -15,99 +15,76 @@ import ( "time" ) -// startJobPromoter starts a the promoter goroutine that shovels message from host -// queue into the default queue. The promoter is responsible for load balancing -// the host queues. -func startJobPromoter(ctx context.Context, dqueue chan *VisitMessage, hqueues map[uint32]*ControlledQueue, shouldRecalc chan bool) { - go func() { - slog.Debug("Work Queue: Starting promoter...") - - // Load balancing queue querying is local to each tobey instance. This should wrap - // around at 2^32. - var next uint32 - - for { - immediatecalc: - // slog.Debug("Work Queue: Calculating immediate queue candidates.") - immediate, shortestPauseUntil := calcImmediateHostQueueCandidates(hqueues) - - // Check how long we have to wait until we can recalc immidiate candidates. - if len(immediate) == 0 { - // Nothin to do yet, try again whenever - // 1. a new message is published, - // 2. the rate limited is adjusted - // 3. 
a queue is now unpaused. - var delay time.Duration - - if !shortestPauseUntil.IsZero() { // None may be paused. - delay = shortestPauseUntil.Sub(time.Now()) - } else { - delay = 60 * time.Second - } +// Load balancing queue querying is local to each tobey instance. This should wrap +// around at 2^32. +var next uint32 + +// promoteLoop starts a loop that shovels message from host queue into the +// default queue. The promoter is responsible for load balancing the host +// queues. +func promoteLoop(ctx context.Context, dqueue chan *VisitMessage, hqueues map[uint32]*ControlledQueue, shouldRecalc chan bool) { + slog.Debug("Work Queue: Starting promoter...") + + for { + immediatecalc: + // slog.Debug("Work Queue: Calculating immediate queue candidates.") + immediate, shortestPauseUntil := candidates(hqueues) + + // Check how long we have to wait until we can recalc immidiate candidates. + if len(immediate) == 0 { + // Nothin to do yet, try again whenever + // 1. a new message is published, + // 2. the rate limited is adjusted + // 3. a queue is now unpaused. + var delay time.Duration + + if !shortestPauseUntil.IsZero() { // None may be paused. + delay = shortestPauseUntil.Sub(time.Now()) + } else { + delay = 60 * time.Second + } - slog.Debug("Work Queue: No immediate queue candidates, waiting for a while...", "delay", delay) - select { - case <-time.After(delay): - // slog.Debug("Work Queue: Pause time passed.") - goto immediatecalc - case <-shouldRecalc: - slog.Debug("Work Queue: Got notification to re-calc immediate queue candidates.") - goto immediatecalc - case <-ctx.Done(): - slog.Debug("Work Queue: Context cancelled, stopping promoter.") - return - } + slog.Debug("Work Queue: No immediate queue candidates, waiting for a while...", "delay", delay) + select { + case <-time.After(delay): + // slog.Debug("Work Queue: Pause time passed.") + goto immediatecalc + case <-shouldRecalc: + slog.Debug("Work Queue: Got notification to re-calc immediate queue candidates.") + goto immediatecalc + case <-ctx.Done(): + slog.Debug("Work Queue: Context cancelled, stopping promoter.") + return } + } - // When we get here we have a at least one host queue that can be - // queried, if we have multiple candidates we load balance over - // them. - slog.Debug("Work Queue: Final immediate queue candidates calculated.", "count", len(immediate)) + // When we get here we have a at least one host queue that can be + // queried, if we have multiple candidates we load balance over + // them. + slog.Debug("Work Queue: Final immediate queue candidates calculated.", "count", len(immediate)) - n := atomic.AddUint32(&next, 1) - key := immediate[(int(n)-1)%len(immediate)] + n := atomic.AddUint32(&next, 1) + key := immediate[(int(n)-1)%len(immediate)] - hq, _ := hqueues[key] - // FIXME: The host queue might haven gone poof in the meantime, we should - // check if the host queue is still there. + hq, _ := hqueues[key] + // FIXME: The host queue might haven gone poof in the meantime, we should + // check if the host queue is still there. - if promote(ctx, hq.Queue, dqueue) { - hq.Limiter.ReleaseReservation() - } + if promote(ctx, hq.Queue, dqueue) { + hq.Limiter.ReleaseReservation() } - }() -} - -// promote promotes a message from one channel to another. The function returns -// true when the message was successfully promoted, false otherwise. The -// function will not block on an empty source channel or on an overful target -// channel, even when these are not buffered. 
-func promote[V any](ctx context.Context, a <-chan V, b chan<- V) bool { - select { - case msg := <-a: - select { - case b <- msg: - return true - default: - return false - } - case <-ctx.Done(): - return false - default: - return false } - return false } -// calcImmediateHostQueueCandidates calculates which host queues are candidates -// to be queried for message to be promoted. The function modifies the host queue -// properties, so it requires a lock to be held. +// candidates calculates which host queues are candidates to be queried for +// message to be promoted, it will check for a reservation and if none is held +// already will try to acquire one. // // The second return value if non zero indicates the shortest time until a host // queue is paused (non only the candidates). When no candidate is found, the // callee should wait at least for that time and than try and call this function // again. -func calcImmediateHostQueueCandidates(hqueues map[uint32]*ControlledQueue) ([]uint32, time.Time) { +func candidates(hqueues map[uint32]*ControlledQueue) ([]uint32, time.Time) { // Host queue candidates that can be queried immediately. immediate := make([]uint32, 0, len(hqueues)) var shortestPauseUntil time.Time @@ -171,3 +148,23 @@ func calcImmediateHostQueueCandidates(hqueues map[uint32]*ControlledQueue) ([]ui } return immediate, shortestPauseUntil } + +// promote promotes a message from one channel (a) to another (b). The function returns +// true when the message was successfully promoted, false otherwise. The +// function will not block on an empty source channel or on an overful target +// channel, even when these are not buffered. +func promote[V any](ctx context.Context, a <-chan V, b chan<- V) bool { + select { + case msg := <-a: + select { + case b <- msg: + return true + default: + return false + } + case <-ctx.Done(): + return false + default: + return false + } +} diff --git a/internal/ctrlq/util.go b/internal/ctrlq/util.go new file mode 100644 index 0000000..08c2daa --- /dev/null +++ b/internal/ctrlq/util.go @@ -0,0 +1,32 @@ +// Copyright 2024 Factorial GmbH. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package ctrlq + +import ( + "hash/fnv" + "net/url" + "strings" +) + +// guessHost heuristically identifies a host for the given URL. The function +// doesn't return the host name directly, as it might not exist, but an ID. +// +// It does by by ignoring a www. prefix, leading to www.example.org and +// example.org being considered the same host. It also ignores the port number, +// so example.org:8080 and example.org:9090 are considered the same host as +// well. +// +// Why FNV? 
https://softwareengineering.stackexchange.com/questions/49550 +func guessHost(u string) uint32 { + p, err := url.Parse(u) + if err != nil { + return 0 + } + h := fnv.New32a() + + h.Write([]byte(strings.TrimLeft(p.Hostname(), "www."))) + return h.Sum32() +} diff --git a/main.go b/main.go index 8f5d3a3..92c973a 100644 --- a/main.go +++ b/main.go @@ -19,6 +19,7 @@ import ( "strconv" "strings" "time" + "tobey/internal/ctrlq" "github.com/prometheus/client_golang/prometheus/promhttp" "go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp" @@ -156,7 +157,7 @@ func main() { runs := NewRunManager(redisconn, robots, sitemaps) - queue := CreateWorkQueue(redisconn) + queue := ctrlq.CreateWorkQueue(redisconn) if err := queue.Open(ctx); err != nil { panic(err) } diff --git a/run.go b/run.go index cc45448..24d57cb 100644 --- a/run.go +++ b/run.go @@ -10,6 +10,7 @@ import ( "log/slog" "net/http" "tobey/internal/collector" + "tobey/internal/ctrlq" "go.opentelemetry.io/otel/attribute" ) @@ -78,11 +79,11 @@ func (r *Run) getAuthFn() GetAuthFn { } } -func (r *Run) GetCollector(ctx context.Context, q VisitWorkQueue, p Progress, h *WebhookDispatcher) *collector.Collector { +func (r *Run) GetCollector(ctx context.Context, q ctrlq.VisitWorkQueue, p Progress, h *WebhookDispatcher) *collector.Collector { // getEnqueueFn returns the enqueue function, that will enqueue a single URL to // be crawled. The enqueue function is called whenever a new URL is discovered // by that Collector, i.e. by looking at all links in a crawled page HTML. - getEnqueueFn := func(run *Run, q VisitWorkQueue, progress Progress) collector.EnqueueFn { + getEnqueueFn := func(run *Run, q ctrlq.VisitWorkQueue, progress Progress) collector.EnqueueFn { // The returned function takes the run context. return func(ctx context.Context, c *collector.Collector, url string) error { @@ -185,7 +186,7 @@ func (r *Run) GetCollector(ctx context.Context, q VisitWorkQueue, p Progress, h // Start starts the crawl with the given URLs. It will discover sitemaps and // enqueue the URLs. From there on more URLs will be discovered and enqueued. -func (r *Run) Start(ctx context.Context, q VisitWorkQueue, p Progress, h *WebhookDispatcher, urls []string) { +func (r *Run) Start(ctx context.Context, q ctrlq.VisitWorkQueue, p Progress, h *WebhookDispatcher, urls []string) { c := r.GetCollector(ctx, q, p, h) // Decide where the initial URLs should go, users may provide sitemaps and diff --git a/visitworker.go b/visitworker.go index 8b175af..d23c595 100644 --- a/visitworker.go +++ b/visitworker.go @@ -12,6 +12,7 @@ import ( "sync" "sync/atomic" "time" + "tobey/internal/ctrlq" "go.opentelemetry.io/otel/attribute" "go.opentelemetry.io/otel/trace" @@ -30,7 +31,7 @@ func CreateVisitWorkersPool( ctx context.Context, num int, runs *RunManager, - q VisitWorkQueue, + q ctrlq.VisitWorkQueue, progress Progress, hooks *WebhookDispatcher, ) *sync.WaitGroup { @@ -57,7 +58,7 @@ func VisitWorker( ctx context.Context, id int, runs *RunManager, - q VisitWorkQueue, + q ctrlq.VisitWorkQueue, progress Progress, hooks *WebhookDispatcher, ) error { @@ -66,7 +67,7 @@ func VisitWorker( jobs, errs := q.Consume(ctx) for { - var job *VisitJob + var job *ctrlq.VisitJob wlogger.Debug("Visitor: Waiting for job...") select { diff --git a/workqueue_jobpromoter_test.go b/workqueue_jobpromoter_test.go deleted file mode 100644 index b4c13e2..0000000 --- a/workqueue_jobpromoter_test.go +++ /dev/null @@ -1,6 +0,0 @@ -// Copyright 2024 Factorial GmbH. All rights reserved. 
-// -// Use of this source code is governed by a BSD-style -// license that can be found in the LICENSE file. - -package main From 93352bfc30c00c5db4de9a9a04f579b4e2e9b7b4 Mon Sep 17 00:00:00 2001 From: Marius Wilms Date: Mon, 4 Nov 2024 14:39:42 +0100 Subject: [PATCH 12/57] Add promoter tests --- internal/ctrlq/promoter_test.go | 165 ++++++++++++++++++++++++++++++++ 1 file changed, 165 insertions(+) create mode 100644 internal/ctrlq/promoter_test.go diff --git a/internal/ctrlq/promoter_test.go b/internal/ctrlq/promoter_test.go new file mode 100644 index 0000000..580d975 --- /dev/null +++ b/internal/ctrlq/promoter_test.go @@ -0,0 +1,165 @@ +// Copyright 2024 Factorial GmbH. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package ctrlq + +import ( + "context" + "testing" + "time" +) + +func TestPromoteSucceeds(t *testing.T) { + a := make(chan bool, 1) + b := make(chan bool, 1) + + a <- true + + if promote(context.Background(), a, b) != true { + t.Errorf("Expected true, got false") + } +} + +func TestPromoteNothingToRead(t *testing.T) { + a := make(chan bool) + b := make(chan bool, 1) + + if promote(context.Background(), a, b) != false { + t.Errorf("Expected false, got true") + } +} + +func TestCandidatesNothingToDo(t *testing.T) { + hqueues := make(map[uint32]*ControlledQueue) + + hqueues[1] = &ControlledQueue{ + ID: 1, + Name: "example.org", + IsAdaptive: false, + Queue: make(chan *VisitMessage, 10), + Limiter: NewMemoryLimiter(), + } + + candidates, retry := candidates(hqueues) + + if len(candidates) != 0 { + t.Errorf("Expected 0 candidates, got %d", len(candidates)) + } + if retry.IsZero() != true { + t.Errorf("Expected zero time, got %v", retry) + } +} + +func TestCandidatesSingle(t *testing.T) { + hqueues := make(map[uint32]*ControlledQueue) + + hqueues[1] = &ControlledQueue{ + ID: 1, + Name: "example.org", + IsAdaptive: false, + Queue: make(chan *VisitMessage, 10), + Limiter: NewMemoryLimiter(), + } + hqueues[1].Queue <- &VisitMessage{ID: 1} + + candidates, _ := candidates(hqueues) + + if len(candidates) != 1 { + t.Errorf("Expected 1 candidate, got %d", len(candidates)) + } + if candidates[0] != 1 { + t.Errorf("Expected ID 1, got %d", candidates[0]) + } +} + +func TestCandidatesAcquireReservation(t *testing.T) { + hqueues := make(map[uint32]*ControlledQueue) + + hqueues[1] = &ControlledQueue{ + ID: 1, + Name: "example1.org", + IsAdaptive: false, + Queue: make(chan *VisitMessage, 10), + Limiter: NewMemoryLimiter(), + } + hqueues[1].Queue <- &VisitMessage{ID: 1} + + hqueues[2] = &ControlledQueue{ + ID: 2, + Name: "example2.org", + IsAdaptive: false, + Queue: make(chan *VisitMessage, 10), + Limiter: NewMemoryLimiter(), + } + hqueues[2].Queue <- &VisitMessage{ID: 2} + + if hqueues[1].Limiter.HoldsReservation() != false { + t.Errorf("Expected no reservation to be held.") + } + if hqueues[2].Limiter.HoldsReservation() != false { + t.Errorf("Expected no reservation to be held.") + } + + candidates, _ := candidates(hqueues) + + if len(candidates) != 2 { + t.Errorf("Expected 2 candidates, got %d", len(candidates)) + } + + if hqueues[1].Limiter.HoldsReservation() != true { + t.Errorf("Expected reservation to be held.") + } + if hqueues[2].Limiter.HoldsReservation() != true { + t.Errorf("Expected reservation to be held.") + } +} + +func TestCandidatesPausedQueueDoesNotHitLimiterCalcShortest(t *testing.T) { + hqueues := make(map[uint32]*ControlledQueue) + + d1, _ := time.ParseDuration("1s") + d2, _ := 
time.ParseDuration("2s") + + t1 := time.Now().Add(d1).Unix() + t2 := time.Now().Add(d2).Unix() + + hqueues[1] = &ControlledQueue{ + ID: 1, + Name: "example1.org", + IsAdaptive: false, + Queue: make(chan *VisitMessage, 10), + Limiter: NewMemoryLimiter(), + } + hqueues[1].Queue <- &VisitMessage{ID: 1} + hqueues[1].pausedUntil.Store(t1) + + hqueues[2] = &ControlledQueue{ + ID: 2, + Name: "example2.org", + IsAdaptive: false, + Queue: make(chan *VisitMessage, 10), + Limiter: NewMemoryLimiter(), + } + hqueues[2].Queue <- &VisitMessage{ID: 2} + hqueues[2].pausedUntil.Store(t2) + + candidates, retry := candidates(hqueues) + + if retry.Unix() != t1 { + t.Errorf("Expected %v, got %v", t1, retry.Unix()) + } + if len(candidates) != 0 { + t.Errorf("Expected 0 candidates, got %d", len(candidates)) + } + + // Should not have hit rate limiter as we paused the queue and + // the queue's pause has not yet passed. + if hqueues[1].Limiter.HoldsReservation() != false { + t.Errorf("Expected no reservation to be held.") + } + if hqueues[2].Limiter.HoldsReservation() != false { + t.Errorf("Expected no reservation to be held.") + } +} From 3d747aa95157f10906eb871679fd38e0cac41833 Mon Sep 17 00:00:00 2001 From: Marius Wilms Date: Mon, 4 Nov 2024 16:18:10 +0100 Subject: [PATCH 13/57] Update README --- README.md | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/README.md b/README.md index 2cdfc5f..2a9ec08 100644 --- a/README.md +++ b/README.md @@ -44,8 +44,7 @@ efficiently as possible. The crawler does not impose static rate limits; instead, it adapts to each host's capabilities, adjusting the rate limit in real time based on feedback from headers or other factors. -This dynamic adjustment is essential because maximizing throughput requires -using the host’s capacity without overloading it. To manage these rate limits +This dynamic adjustment is essential. To manage these rate limits effectively, Tobey employs a rate-limited work queue that abstracts away the complexities of dynamic rate limiting from other parts of the system. The goal is to focus on maintaining a steady flow of requests without overwhelming From 3a5cf3f5633b1df3492e6ebddffe71bb5f9a4464 Mon Sep 17 00:00:00 2001 From: Marius Wilms Date: Mon, 4 Nov 2024 16:36:37 +0100 Subject: [PATCH 14/57] Drop obsolete config --- README.md | 1 - 1 file changed, 1 deletion(-) diff --git a/README.md b/README.md index 2a9ec08..f9ea8f9 100644 --- a/README.md +++ b/README.md @@ -88,7 +88,6 @@ variables are available: | `TOBEY_SKIP_CACHE` | `false` | `true`, `false` | Controls caching access. | | `TOBEY_REDIS_DSN` | empty | i.e. `redis://localhost:6379` | DSN to reach a Redis instance. Only needed when operating multiple instances. | | `TOBEY_PROGRESS_DSN` | empty | i.e. `http://localhost:9020` | DSN where to reach a progress service. When configured tobey will send progress updates there. | -| `TOBEY_REQS_PER_S` | 2 | i.e. `4` | Maximum number of allowed requests per second per host. | | `TOBEY_TELEMETRY` | empty | i.e. `metrics traces` | Space separated list of what kind of telemetry is emitted. 
|

On top of these variables, the service's telemetry

From ec2672d442a332e0503741e0a1ef17729819f03e Mon Sep 17 00:00:00 2001
From: Marius Wilms
Date: Mon, 4 Nov 2024 16:36:54 +0100
Subject: [PATCH 15/57] Prepare multiple output methods

---
 README.md | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

diff --git a/README.md b/README.md
index f9ea8f9..a764d02 100644
--- a/README.md
+++ b/README.md
@@ -250,9 +250,14 @@ by providing the `auth` key:
 ]
 ```
 
-### Using Webhook to state where results should go
+### Output Methods
 
-[Webhooks](https://mailchimp.com/en/marketing-glossary/webhook) are a technique to notify other services about a result, once its ready.
+Tobey currently supports one output method.
+
+#### Using Webhook to state where results should go
+
+With this output method tobey doesn't store any results by itself. It instead forwards
+the results to a configured webhook endpoint. [Webhooks](https://mailchimp.com/en/marketing-glossary/webhook) are a technique to notify other services about a result, once it's ready.
 
 Once the crawlwer has results for a resource, it will deliver them to a webhook, if one is configured via the `webhook` key. Using the `data` key you can pass

From 4cb9c07c1ea54461a90ae14630f71684afbdaef5 Mon Sep 17 00:00:00 2001
From: Marius Wilms
Date: Mon, 4 Nov 2024 16:43:17 +0100
Subject: [PATCH 16/57] Bring back results forwarding

---
 visitworker.go | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/visitworker.go b/visitworker.go
index d23c595..3a40bb5 100644
--- a/visitworker.go
+++ b/visitworker.go
@@ -134,7 +134,7 @@ func VisitWorker(
 			jlogger.Info("Visitor: Visited URL.", "took.lifetime", time.Since(job.Created), "took.fetch", res.Took)
 			span.AddEvent("Visitor: Visited URL.", t)
 
-			// TODO: Notify the webhook.
+			hooks.Send(jctx, r.WebhookConfig, r.ID, res)
 
 			span.End()
 			continue

From 49cf7dacbfe0fa506f3b66c9a8846b73b28c076b Mon Sep 17 00:00:00 2001
From: Marius Wilms
Date: Mon, 4 Nov 2024 17:15:18 +0100
Subject: [PATCH 17/57] Refactor webhook dispatcher

---
 main.go    |   8 +--
 otel.go    |   4 ++
 webhook.go | 193 +++++++++++------------------------------------------
 3 files changed, 45 insertions(+), 160 deletions(-)

diff --git a/main.go b/main.go
index 92c973a..2355c8a 100644
--- a/main.go
+++ b/main.go
@@ -162,13 +162,7 @@ func main() {
 		panic(err)
 	}
 
-	// Create Webhook Handling, TODO: this should always be non-blocking, as otherwise
-	// our visit workers will stand still.
-	hooksqueue := make(chan WebhookPayloadPackage, 1000)
-	hooksmgr := NewProcessWebhooksManager()
-	hooksmgr.Start(ctx, hooksqueue)
-	hooks := NewWebhookDispatcher(hooksqueue)
-
+	hooks := NewWebhookDispatcher(ctx)
 	progress := MustStartProgressFromEnv(ctx)
 
 	workers := CreateVisitWorkersPool(
diff --git a/otel.go b/otel.go
index f39cff7..bdf85d6 100644
--- a/otel.go
+++ b/otel.go
@@ -22,6 +22,10 @@ import (
 	semconv "go.opentelemetry.io/otel/semconv/v1.17.0"
 )
 
+var (
+	tracer = otel.Tracer("tobey")
+)
+
 // Transformer for Opentelemetry
 // medium for propagated key-value pairs.
diff --git a/webhook.go b/webhook.go index c426c84..b39881d 100644 --- a/webhook.go +++ b/webhook.go @@ -9,17 +9,9 @@ import ( "bytes" "context" "encoding/json" - "errors" "log/slog" "net/http" - "tobey/internal/collector" - - "go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp" - "go.opentelemetry.io/otel" - "go.opentelemetry.io/otel/attribute" - "go.opentelemetry.io/otel/codes" - "go.opentelemetry.io/otel/metric" ) type WebhookConfig struct { @@ -27,12 +19,6 @@ type WebhookConfig struct { Data interface{} `json:"data"` // Accept arbitrary data here. } -// Have a Package to handle metadata. -type WebhookPayloadPackage struct { - ctx context.Context - payload WebhookPayload -} - // The messages that should go over the wire. type WebhookPayload struct { Action string `json:"action"` @@ -41,170 +27,71 @@ type WebhookPayload struct { // we than want to nest all results data under Data as to prevent // collisions with Action and other fields. // TODO Talk about the interface variation - RequestURL string `json:"request_url"` - ResponseBody []byte `json:"response_body"` // Will be base64 encoded once marshalled. - ResponseStatusCode int `json:"response_status_code"` - // TODO: This should probably be just interface{} - Data *WebhookConfig `json:"data"` // Pass through arbitrary data here. + RequestURL string `json:"request_url"` + ResponseBody []byte `json:"response_body"` // Will be base64 encoded once marshalled. + ResponseStatusCode int `json:"response_status_code"` + Data interface{} `json:"data"` // Pass through arbitrary data here. } -var ( - tracer = otel.Tracer("call.webhook") - meter = otel.Meter("call.webhook") - numbe_of_exceptions metric.Int64Counter -) - -type ProcessWebhooksManager struct { - client *http.Client -} - -func NewProcessWebhooksManager() *ProcessWebhooksManager { - return &ProcessWebhooksManager{ - &http.Client{Transport: otelhttp.NewTransport(http.DefaultTransport)}, +func NewWebhookDispatcher(ctx context.Context) *WebhookDispatcher { + return &WebhookDispatcher{ + client: CreateRetryingHTTPClient(NoAuthFn), } } -func (w ProcessWebhooksManager) startHandle(ctx context.Context, webhookQueue chan WebhookPayloadPackage, pnumber int) { - wlogger := slog.With("worker.id", pnumber) - // todo handle empty buffered queue - for { - select { - case <-ctx.Done(): - wlogger.Debug("Webhook Dispatcher: Context cancelled, stopping worker.") - // The context is over, stop processing results - return - case result_package, ok1 := <-webhookQueue: - - // This context is dynamic because of the different items. 
- if result_package.ctx == nil { - result_package.ctx = context.Background() - } - - // Start the tracing - ctx_fresh, parentSpan := tracer.Start(result_package.ctx, "handle.webhook.queue.worker") - result := result_package.payload - - parentSpan.SetAttributes(attribute.Int("worker", pnumber)) - if !ok1 { - parentSpan.SetAttributes(attribute.Int("worker", pnumber)) - parentSpan.RecordError(errors.New("channel is closed")) - parentSpan.End() - return - } - - if result.RequestURL == "" { - wlogger.Error("Webhook Dispatcher: Missing URL") - - parentSpan.SetAttributes(attribute.Int("worker", pnumber)) - parentSpan.RecordError(errors.New("URL is empty on page")) - parentSpan.End() - continue - } - - err := w.sendWebhook(ctx_fresh, result, result.Data.Endpoint, "") - if err != nil { - wlogger.Info("Webhook Dispatcher: Sending webhook ultimately failed.", "error", err) - } else { - wlogger.Info("Webhook Dispatcher: Webhook succesfully sent.", "url", result.RequestURL) - } - - parentSpan.End() - } - } +type WebhookDispatcher struct { + client *http.Client } -func (w *ProcessWebhooksManager) sendWebhook(ctx context.Context, data WebhookPayload, url string, webhookId string) error { - logger := slog.With("url", url, "endpoint", data.Data.Endpoint, "run", data.Run) +// Send sends a webhook to the given endpoint. It returns immediately, and is not blocking. +func (wd *WebhookDispatcher) Send(ctx context.Context, webhook *WebhookConfig, run string, res *collector.Response) error { + logger := slog.With("endpoint", webhook.Endpoint, "run", run) logger.Debug("Webhook Dispatcher: Sending webhook...") - ctx_send_webhook, span := tracer.Start(ctx, "handle.webhook.queue.send") + ctx, span := tracer.Start(ctx, "output.webhook.send") defer span.End() - jsonBytes, err := json.Marshal(data) - if err != nil { - span.SetStatus(codes.Error, "json marshal failed") - span.SetAttributes(attribute.String("data", "TODO")) - span.RecordError(err) + payload := WebhookPayload{ + Action: "collector.response", + Run: run, - return err - } + RequestURL: res.Request.URL.String(), + ResponseBody: res.Body[:], + ResponseStatusCode: res.StatusCode, - span.SetAttributes(attribute.String("webhook_url", url)) - span.SetAttributes(attribute.String("request_url", data.RequestURL)) - req, err := http.NewRequestWithContext(ctx_send_webhook, "POST", url, bytes.NewBuffer(jsonBytes)) + // We pass through the data we received taking in the + // initial crawl request, verbatim. 
+		Data: webhook.Data,
+	}
+	body, err := json.Marshal(payload)
 	if err != nil {
-		span.SetStatus(codes.Error, "json marshal failed")
-		span.SetAttributes(attribute.String("data", "TODO"))
 		span.RecordError(err)
-
 		return err
 	}
+	buf := bytes.NewBuffer(body)
 
-	span.SetAttributes(attribute.String("webhook_url", url))
-	span.SetAttributes(attribute.String("request_url", data.RequestURL))
-	req, err := http.NewRequestWithContext(ctx_send_webhook, "POST", url, bytes.NewBuffer(jsonBytes))
+	req, err := http.NewRequestWithContext(ctx, "POST", webhook.Endpoint, buf)
 	if err != nil {
-		span.SetStatus(codes.Error, "cant create new request")
 		span.RecordError(err)
 		return err
 	}
+	req.Header.Set("Content-Type", "application/json")
 
-	req.Header.Set("Content-Type", "application/json")
-
-	// Send the webhook request
-	resp, err := w.client.Do(req)
-	if err != nil {
-		span.SetStatus(codes.Error, "Request failed")
-		span.RecordError(err)
-		return err
-	}
-	defer resp.Body.Close()
-
-	span.SetAttributes(attribute.String("status", resp.Status))
-
-	if resp.StatusCode != http.StatusOK {
-		err := errors.New("webhook was not accepted")
-
-		var body_bytes []byte
-		resp.Body.Read(body_bytes)
+	go func() {
+		res, err := wd.client.Do(req)
+		if err != nil {
+			logger.Error("Failed to send webhook.", "error", err)
+			span.RecordError(err)
+			return
+		}
+		defer res.Body.Close()
 
-		span.SetAttributes(attribute.String("Body", string(body_bytes[:])))
-		span.SetStatus(codes.Error, "operationThatCouldFail failed")
-		span.RecordError(err)
-
-		return err
-	}
-	return nil
-}
-
-func (w *ProcessWebhooksManager) Start(ctx context.Context, webhookQueue chan WebhookPayloadPackage) {
-	//todo add recovery
-	go func(ctx context.Context, webhookQueue chan WebhookPayloadPackage) {
-		count := GetEnvInt("TOBEY_WEBHOOK_WORKER", 4)
-		for i := 0; i < count; i++ {
-			go w.startHandle(ctx, webhookQueue, i)
+		if res.StatusCode != http.StatusOK {
+			logger.Error("Webhook was not accepted.", "status", res.Status)
+			return
 		}
-	}(ctx, webhookQueue)
-}
-
-type WebhookDispatcher struct {
-	webhookQueue chan WebhookPayloadPackage
-}
-
-func NewWebhookDispatcher(webhookQueue chan WebhookPayloadPackage) *WebhookDispatcher {
-	return &WebhookDispatcher{
-		webhookQueue: webhookQueue,
-	}
-}
-
-func (wd *WebhookDispatcher) Send(ctx context.Context, webhook *WebhookConfig, run string, res *collector.Response) error {
-	webhook_package := WebhookPayloadPackage{
-		ctx: ctx,
-		payload: WebhookPayload{
-			Action: "collector.response",
-			Run:    run,
-
-			RequestURL:         res.Request.URL.String(),
-			ResponseBody:       res.Body[:],
-			ResponseStatusCode: res.StatusCode,
-
-			// We pass through the data we received taking in the
-			// initial crawl request, verbatim.
-			Data: webhook,
-		},
-	}
-
-	wd.webhookQueue <- webhook_package
+	}()
 	return nil
 }

From 42511da2ee3e38603a3aafe897357e27dad67faf Mon Sep 17 00:00:00 2001
From: Marius Wilms
Date: Mon, 4 Nov 2024 17:25:17 +0100
Subject: [PATCH 18/57] Optimize startup

---
 main.go | 17 +++++++++--------
 otel.go |  4 ++--
 2 files changed, 11 insertions(+), 10 deletions(-)

diff --git a/main.go b/main.go
index 2355c8a..7c7968d 100644
--- a/main.go
+++ b/main.go
@@ -130,17 +130,18 @@ func main() {
 	// This sets up the main process context.
 	ctx, stop := signal.NotifyContext(context.Background(), os.Interrupt)
 
-	var err error
-
 	// Setup Opentelemetry
-	// TODO: Add opentelemetry logging
-	shutdown, err := setupOTelSDK(ctx)
+
+	shutdownOtel, err := StartOTel(ctx)
 	if err != nil {
 		panic(err)
 	}
-	err = runtime.Start(runtime.WithMinimumReadMemStatsInterval(time.Second))
-	if err != nil {
-		log.Fatal(err)
+
+	if UseMetrics {
+		err = runtime.Start(runtime.WithMinimumReadMemStatsInterval(time.Second))
+		if err != nil {
+			log.Fatal(err)
+		}
 	}
 
 	if UsePulse {
@@ -326,5 +327,5 @@ func main() {
 		redisconn.Close()
 	}
 
-	shutdown(ctx)
+	shutdownOtel(ctx)
 }
diff --git a/otel.go b/otel.go
index bdf85d6..e2f3907 100644
--- a/otel.go
+++ b/otel.go
@@ -50,9 +50,9 @@ func (c MapCarrierRabbitmq) Keys() []string {
 	return keys
 }
 
-// setupOTelSDK bootstraps the OpenTelemetry pipeline.
+// StartOTel bootstraps the OpenTelemetry pipeline.
 // If it does not return an error, make sure to call shutdown for proper cleanup.
-func setupOTelSDK(ctx context.Context) (shutdown func(context.Context) error, err error) {
+func StartOTel(ctx context.Context) (shutdown func(context.Context) error, err error) {
 	var shutdownFuncs []func(context.Context) error
 
 	// shutdown calls cleanup functions registered via shutdownFuncs.

From 2d2e6340c26124a6ac90ce7aa0d42d9fdd81004 Mon Sep 17 00:00:00 2001
From: Marius Wilms
Date: Mon, 4 Nov 2024 17:25:30 +0100
Subject: [PATCH 19/57] Do not send hook if no config

---
 visitworker.go | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/visitworker.go b/visitworker.go
index 3a40bb5..ffbdb57 100644
--- a/visitworker.go
+++ b/visitworker.go
@@ -134,7 +134,9 @@ func VisitWorker(
 			jlogger.Info("Visitor: Visited URL.", "took.lifetime", time.Since(job.Created), "took.fetch", res.Took)
 			span.AddEvent("Visitor: Visited URL.", t)
 
-			hooks.Send(jctx, r.WebhookConfig, r.ID, res)
+			if r.WebhookConfig != nil {
+				hooks.Send(jctx, r.WebhookConfig, r.ID, res)
+			}
 
 			span.End()
 			continue

From c65434118f16c419e95728e3163a390a7d132889 Mon Sep 17 00:00:00 2001
From: Marius Wilms
Date: Wed, 6 Nov 2024 11:06:26 +0100
Subject: [PATCH 20/57] Refactor progress

---
 main.go        |   3 -
 progress.go    | 258 ++++++++++++++----------------------------------
 run.go         |  18 ++--
 visitworker.go |   4 +-
 4 files changed, 81 insertions(+), 202 deletions(-)

diff --git a/main.go b/main.go
index 7c7968d..2319c67 100644
--- a/main.go
+++ b/main.go
@@ -320,9 +320,6 @@ func main() {
 	if queue != nil {
 		queue.Close()
 	}
-	if progress != nil {
-		progress.Close()
-	}
 	if redisconn != nil {
 		redisconn.Close()
 	}
diff --git a/progress.go b/progress.go
index a1e01fd..d3fa2f1 100644
--- a/progress.go
+++ b/progress.go
@@ -9,22 +9,9 @@ import (
 	"bytes"
 	"context"
 	"encoding/json"
-	"errors"
-	"fmt"
 	"log/slog"
 	"net/http"
 	"os"
-
-	"go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp"
-	"go.opentelemetry.io/otel/attribute"
-	"go.opentelemetry.io/otel/codes"
 )
 
-const (
-	ProgressDefaultStage = "crawler"
-
-	ProgressEndpointUpdate     = "api/status/update"
-	ProgressEndpointTransition = "api/status/transition-to"
-)
-
 // Constants to be used for indicating what state the progress is in.
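The refactor in the next hunk replaces the worker-pool progress manager with a small dispatcher interface. As a sketch of the contract it introduces (only `Progressor` and `ProgressUpdate` come from the patch; the logging dispatcher itself is hypothetical, e.g. for tests):

```go
// LogProgressDispatcher is a hypothetical ProgressDispatcher that just logs
// updates instead of sending them anywhere. It makes the With/Call split
// visible: With binds a run and URL, Call ships a single update.
type LogProgressDispatcher struct{}

func (d *LogProgressDispatcher) With(run string, url string) *Progressor {
	return &Progressor{dispatcher: d, Run: run, URL: url}
}

func (d *LogProgressDispatcher) Call(ctx context.Context, pu ProgressUpdate) error {
	slog.Info("Progress update.", "run", pu.Run, "url", pu.URL, "status", pu.Status)
	return nil
}
```
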
@@ -37,217 +24,118 @@ const (
 	ProgressStateCancelled = "cancelled"
 )
 
-type Progressor struct {
-	manager Progress
-
-	Run string
-	URL string
+func MustStartProgressFromEnv(ctx context.Context) ProgressDispatcher {
+	if dsn := os.Getenv("TOBEY_PROGRESS_DSN"); dsn != "" {
+		slog.Info("Using progress service for updates.", "dsn", dsn)
+		return &FactorialProgressServiceDispatcher{
+			client: CreateRetryingHTTPClient(NoAuthFn),
+			dsn:    dsn,
+		}
+	} else {
+		slog.Debug("Not sharing progress updates.")
+		return &NoopProgressDispatcher{}
+	}
 }
 
-func (p *Progressor) Update(ctx context.Context, status string) error {
-	return p.manager.Update(ProgressUpdateMessagePackage{
-		ctx: ctx,
-		payload: ProgressUpdateMessage{
-			Stage:  ProgressDefaultStage,
-			Status: status,
-			Run:    p.Run,
-			URL:    p.URL,
-		},
-	})
+type ProgressDispatcher interface {
+	With(run string, url string) *Progressor
+	Call(ctx context.Context, msg ProgressUpdate) error // Usually only called by the Progressor.
 }
 
-type ProgressUpdateMessagePackage struct {
-	ctx     context.Context
-	payload ProgressUpdateMessage
+type Progressor struct {
+	dispatcher ProgressDispatcher
+
+	stage string
+	Run   string
+	URL   string
 }
 
-type ProgressUpdateMessage struct {
+type ProgressUpdate struct {
 	Stage  string `json:"stage"`
 	Status string `json:"status"` // only constants allowed
 	Run    string `json:"run_uuid"` // uuid of the run
 	URL    string `json:"url"`
 }
 
-type ProgressManager struct {
-	apiURL string
-	client *http.Client
+func (p *Progressor) Update(ctx context.Context, status string) error {
+	return p.dispatcher.Call(ctx, ProgressUpdate{
+		Stage:  p.stage,
+		Run:    p.Run,
+		URL:    p.URL,
+		Status: status,
+	})
 }
 
-func MustStartProgressFromEnv(ctx context.Context) Progress {
-	if dsn := os.Getenv("TOBEY_PROGRESS_DSN"); dsn != "" {
-		slog.Info("Using progress service for updates.", "dsn", dsn)
-
-		// TODO: Make this always non-blocking as otherwise it can block the whole application.
-		queue := make(chan ProgressUpdateMessagePackage, 1000)
-		progress_manager := NewProgressManager()
-		progress_manager.Start(ctx, queue)
-
-		return &BaseProgress{
-			queue,
-		}
-	} else {
-		slog.Debug("Not sharing progress updates.")
-		return &NoopProgress{}
-	}
+type NoopProgressDispatcher struct {
 }
 
-func NewProgressManager() *ProgressManager {
-	return &ProgressManager{
-		GetEnvString("TOBEY_PROGRESS_DSN", "http://progress:9020"),
-		&http.Client{Transport: otelhttp.NewTransport(http.DefaultTransport)},
-	}
+func (p *NoopProgressDispatcher) With(run string, url string) *Progressor {
+	return &Progressor{dispatcher: p}
 }
 
-type Progress interface {
-	Update(update_message ProgressUpdateMessagePackage) error
-	Close() error
-	With(run string, url string) *Progressor
+func (p *NoopProgressDispatcher) Call(ctx context.Context, pu ProgressUpdate) error {
+	return nil
 }
 
-func (w *ProgressManager) startHandle(ctx context.Context, progressQueue chan ProgressUpdateMessagePackage, pnumber int) {
-	wlogger := slog.With("worker.id", pnumber)
+const (
+	// The progress service has the concept of stages, which are used to group
+	// progress updates. The default stage is "crawler".
+	FactorialProgressServiceDefaultStage = "crawler"
 
+	FactorialProgressEndpointUpdate = "api/status/update"
+	// FactorialProgressEndpointTransition = "api/status/transition-to" // Not yet implemented.
+) - // todo handle empty buffered queue - for { - select { - case <-ctx.Done(): - wlogger.Debug("Progress Dispatcher: Context cancelled, stopping worker.") - // The context is over, stop processing results - return - case result_package, ok1 := <-progressQueue: - - // This context is dynamic because of the different items. - if result_package.ctx == nil { - result_package.ctx = context.Background() - } - // Start the tracing - ctx_fresh, parentSpan := tracer.Start(result_package.ctx, "handle.progress.queue.worker") - result := result_package.payload - - parentSpan.SetAttributes(attribute.Int("worker", pnumber)) - if !ok1 { - parentSpan.SetAttributes(attribute.Int("worker", pnumber)) - parentSpan.RecordError(errors.New("channel is closed")) - parentSpan.End() - return - } - - err := w.sendProgressUpdate(ctx_fresh, result) - if err != nil { - wlogger.Error("Progress Dispatcher: Sending update ultimately failed.", "error", err) - } else { - wlogger.Debug("Progress Dispatcher: Update succesfully sent.", "url", result.URL) - } - - parentSpan.End() - } +// FactorialProgressServiceDispatcher is a dispatcher for the Factorial progress service. +type FactorialProgressServiceDispatcher struct { + client *http.Client +} + +func (p *FactorialProgressServiceDispatcher) With(run string, url string) *Progressor { + return &Progressor{ + dispatcher: p, + stage: FactorialProgressServiceDefaultStage, + Run: run, + URL: url, } } -func (w *ProgressManager) sendProgressUpdate(ctx context.Context, msg ProgressUpdateMessage) error { - logger := slog.With("url", msg.URL, "status", msg.Status, "run", msg.Run) - logger.Debug("Progress Dispatcher: Sending progress update...") +// Call sends the progress update over the wire, it implements a fire and forget approach. +func (p *FactorialProgressServiceDispatcher) Call(ctx context.Context, pu ProgressUpdate) error { + logger := slog.With("run", pu.Run, "url", pu.URL) + logger.Debug("Progress Dispatcher: Sending update...") - ctx_send_webhook, span := tracer.Start(ctx, "handle.progress.queue.send") + ctx, span := tracer.Start(ctx, "output.progress.send") defer span.End() - url := fmt.Sprintf("%v/%v", w.apiURL, ProgressEndpointUpdate) - - span.SetAttributes(attribute.String("API_URL", url)) - span.SetAttributes(attribute.String("url", msg.URL)) - span.SetAttributes(attribute.String("status_update", msg.Status)) - - body, err := json.Marshal(msg) + payload := pu + body, err := json.Marshal(payload) if err != nil { - span.SetStatus(codes.Error, "failed to marshal body") - span.SetAttributes(attribute.String("data", "TODO")) span.RecordError(err) - return err } + buf := bytes.NewBuffer(body) - req, err := http.NewRequestWithContext(ctx_send_webhook, "POST", url, bytes.NewBuffer(body)) + req, err := http.NewRequestWithContext(ctx, "POST", FactorialProgressEndpointUpdate, buf) if err != nil { - span.SetStatus(codes.Error, "failed to create request") span.RecordError(err) - return err } req.Header.Set("Content-Type", "application/json") - resp, err := w.client.Do(req) - if err != nil { - span.SetStatus(codes.Error, "performing request failed") - span.RecordError(err) + go func() { + res, err := p.client.Do(req) + defer res.Body.Close() - return err - } - defer resp.Body.Close() - - span.SetAttributes(attribute.Int("StatusCode", resp.StatusCode)) - - if resp.StatusCode != http.StatusAccepted { - err := errors.New("status update was not accepted") - - var body_bytes []byte - resp.Body.Read(body_bytes) - - span.SetAttributes(attribute.String("Body", string(body_bytes[:]))) - 
span.SetStatus(codes.Error, "update not accepted") - span.RecordError(err) - - return err - } - return nil -} - -func (w *ProgressManager) Start(ctx context.Context, progressQueue chan ProgressUpdateMessagePackage) { - //todo add recovery - go func(ctx context.Context, progressQueue chan ProgressUpdateMessagePackage) { - count := GetEnvInt("TOBEY_PROGRESS_WORKER", 4) - for i := 0; i < count; i++ { - go w.startHandle(ctx, progressQueue, i) + if err != nil { + logger.Error("Progress Dispatcher: Failed to send progress.", "error", err) + span.RecordError(err) + return } - }(ctx, progressQueue) -} - -type NoopProgress struct { -} - -func (p *NoopProgress) Update(update_message ProgressUpdateMessagePackage) error { - return nil -} - -func (p *NoopProgress) Close() error { - return nil -} - -func (p *NoopProgress) With(run string, url string) *Progressor { - return &Progressor{ - manager: p, - Run: run, - URL: url, - } -} - -type BaseProgress struct { - progressQueue chan ProgressUpdateMessagePackage -} - -func (p *BaseProgress) Update(update_message ProgressUpdateMessagePackage) error { - p.progressQueue <- update_message - return nil -} - -func (p *BaseProgress) With(run string, url string) *Progressor { - return &Progressor{ - manager: p, - Run: run, - URL: url, - } -} + if res.StatusCode != http.StatusOK { + logger.Error("Progress Dispatcher: Progress was not accepted.", "status", res.Status) + span.RecordError(err) + return + } + }() -func (p *BaseProgress) Close() error { - close(p.progressQueue) return nil } diff --git a/run.go b/run.go index 24d57cb..a473df1 100644 --- a/run.go +++ b/run.go @@ -79,11 +79,11 @@ func (r *Run) getAuthFn() GetAuthFn { } } -func (r *Run) GetCollector(ctx context.Context, q ctrlq.VisitWorkQueue, p Progress, h *WebhookDispatcher) *collector.Collector { +func (r *Run) GetCollector(ctx context.Context, q ctrlq.VisitWorkQueue, p ProgressDispatcher, h *WebhookDispatcher) *collector.Collector { // getEnqueueFn returns the enqueue function, that will enqueue a single URL to // be crawled. The enqueue function is called whenever a new URL is discovered // by that Collector, i.e. by looking at all links in a crawled page HTML. - getEnqueueFn := func(run *Run, q ctrlq.VisitWorkQueue, progress Progress) collector.EnqueueFn { + getEnqueueFn := func(run *Run, q ctrlq.VisitWorkQueue, progress ProgressDispatcher) collector.EnqueueFn { // The returned function takes the run context. return func(ctx context.Context, c *collector.Collector, url string) error { @@ -91,6 +91,8 @@ func (r *Run) GetCollector(ctx context.Context, q ctrlq.VisitWorkQueue, p Progre tctx, span := tracer.Start(ctx, "enqueue_element") defer span.End() + p := progress.With(run.ID, url) + span.SetAttributes(attribute.String("URL", url)) // Ensure we never publish a URL twice for a single run. Not only does // this help us not put unnecessary load on the queue, it also helps with @@ -127,15 +129,7 @@ func (r *Run) GetCollector(ctx context.Context, q ctrlq.VisitWorkQueue, p Progre run.SawURL(tctx, url) logger.Debug("Collector: URL marked as seen.") - progress.Update(ProgressUpdateMessagePackage{ - context.WithoutCancel(tctx), - ProgressUpdateMessage{ - ProgressDefaultStage, - ProgressStateQueuedForCrawling, - run.ID, - url, - }, - }) + p.Update(tctx, ProgressStateQueuedForCrawling) return nil } } @@ -186,7 +180,7 @@ func (r *Run) GetCollector(ctx context.Context, q ctrlq.VisitWorkQueue, p Progre // Start starts the crawl with the given URLs. It will discover sitemaps and // enqueue the URLs. 
From there on more URLs will be discovered and enqueued. -func (r *Run) Start(ctx context.Context, q ctrlq.VisitWorkQueue, p Progress, h *WebhookDispatcher, urls []string) { +func (r *Run) Start(ctx context.Context, q ctrlq.VisitWorkQueue, p ProgressDispatcher, h *WebhookDispatcher, urls []string) { c := r.GetCollector(ctx, q, p, h) // Decide where the initial URLs should go, users may provide sitemaps and diff --git a/visitworker.go b/visitworker.go index ffbdb57..3013d19 100644 --- a/visitworker.go +++ b/visitworker.go @@ -32,7 +32,7 @@ func CreateVisitWorkersPool( num int, runs *RunManager, q ctrlq.VisitWorkQueue, - progress Progress, + progress ProgressDispatcher, hooks *WebhookDispatcher, ) *sync.WaitGroup { var wg sync.WaitGroup @@ -59,7 +59,7 @@ func VisitWorker( id int, runs *RunManager, q ctrlq.VisitWorkQueue, - progress Progress, + progress ProgressDispatcher, hooks *WebhookDispatcher, ) error { wlogger := slog.With("worker.id", id) From 94751e86366e25185b85a5eeab8fbe09dae126d5 Mon Sep 17 00:00:00 2001 From: Marius Wilms Date: Wed, 6 Nov 2024 11:06:47 +0100 Subject: [PATCH 21/57] Update log messages for webhook --- webhook.go | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/webhook.go b/webhook.go index b39881d..a5f8bc2 100644 --- a/webhook.go +++ b/webhook.go @@ -43,7 +43,7 @@ type WebhookDispatcher struct { client *http.Client } -// Send sends a webhook to the given endpoint. It returns immediately, and is not blocking. +// Send sends a webhook to the given endpoint. It returns immediately, and is not blocking. It implements a fire and forget approach. func (wd *WebhookDispatcher) Send(ctx context.Context, webhook *WebhookConfig, run string, res *collector.Response) error { logger := slog.With("endpoint", webhook.Endpoint, "run", run) logger.Debug("Webhook Dispatcher: Sending webhook...") @@ -82,12 +82,12 @@ func (wd *WebhookDispatcher) Send(ctx context.Context, webhook *WebhookConfig, r defer res.Body.Close() if err != nil { - logger.Error("Failed to send webhook.", "error", err) + logger.Error("Webhook Dispatcher: Failed to send webhook.", "error", err) span.RecordError(err) return } if res.StatusCode != http.StatusOK { - logger.Error("Webhook was not accepted.", "status", res.Status) + logger.Error("Webhook Dispatcher: Webhook was not accepted.", "status", res.Status) span.RecordError(err) return } From 06b9f4c34b338eb4d5ef2395dae25ea8853c0a34 Mon Sep 17 00:00:00 2001 From: Marius Wilms Date: Mon, 18 Nov 2024 13:39:47 +0100 Subject: [PATCH 22/57] Use tears --- go.mod | 7 +++++-- go.sum | 2 ++ main.go | 34 ++++++++++++++++------------------ otel.go | 38 ++++++++------------------------------ 4 files changed, 31 insertions(+), 50 deletions(-) diff --git a/go.mod b/go.mod index a06323a..85b537f 100644 --- a/go.mod +++ b/go.mod @@ -1,6 +1,8 @@ module tobey -go 1.22 +go 1.23 + +toolchain go1.23.3 require ( github.com/PuerkitoBio/goquery v1.9.2 @@ -16,6 +18,7 @@ require ( github.com/hashicorp/golang-lru/v2 v2.0.7 github.com/kennygrant/sanitize v1.2.4 github.com/kos-v/dsnparser v1.1.0 + github.com/mariuswilms/tears v0.0.0-20241122112056-b272bc2cfcb1 github.com/nlnwa/whatwg-url v0.4.1 github.com/oxffaa/gopher-parse-sitemap v0.0.0-20191021113419-005d2eb1def4 github.com/peterbourgon/diskv v2.0.1+incompatible @@ -29,7 +32,6 @@ require ( go.opentelemetry.io/otel v1.28.0 go.opentelemetry.io/otel/exporters/otlp/otlpmetric/otlpmetrichttp v1.28.0 go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracehttp v1.28.0 - go.opentelemetry.io/otel/metric v1.28.0 
go.opentelemetry.io/otel/sdk v1.28.0 go.opentelemetry.io/otel/sdk/metric v1.28.0 go.opentelemetry.io/otel/trace v1.28.0 @@ -76,6 +78,7 @@ require ( github.com/xo/terminfo v0.0.0-20220910002029-abceb7e1c41e // indirect github.com/yuin/gopher-lua v1.1.1 // indirect go.opentelemetry.io/otel/exporters/otlp/otlptrace v1.28.0 // indirect + go.opentelemetry.io/otel/metric v1.28.0 // indirect go.opentelemetry.io/proto/otlp v1.3.1 // indirect golang.org/x/exp v0.0.0-20240823005443-9b4947da3948 // indirect golang.org/x/sync v0.8.0 // indirect diff --git a/go.sum b/go.sum index 6684d4b..0c48ead 100644 --- a/go.sum +++ b/go.sum @@ -86,6 +86,8 @@ github.com/kylelemons/godebug v1.1.0 h1:RPNrshWIDI6G2gRW9EHilWtl7Z6Sb1BR0xunSBf0 github.com/kylelemons/godebug v1.1.0/go.mod h1:9/0rRGxNHcop5bhtWyNeEfOS8JIWk580+fNqagV/RAw= github.com/lucasb-eyer/go-colorful v1.2.0 h1:1nnpGOrhyZZuNyfu1QjKiUICQ74+3FNCN69Aj6K7nkY= github.com/lucasb-eyer/go-colorful v1.2.0/go.mod h1:R4dSotOR9KMtayYi1e77YzuveK+i7ruzyGqttikkLy0= +github.com/mariuswilms/tears v0.0.0-20241122112056-b272bc2cfcb1 h1:6FYLaNR6uylYjHur3UAYuY+FG0FV7DlGkqEz/DiG1k0= +github.com/mariuswilms/tears v0.0.0-20241122112056-b272bc2cfcb1/go.mod h1:NxZ+NiX3oZdh2cYXzHRFhSqvxJoK06heFSSGRxEPSAA= github.com/mattn/go-colorable v0.1.13 h1:fFA4WZxdEF4tXPZVKMLwD8oUnCTTo08duU7wxecdEvA= github.com/mattn/go-colorable v0.1.13/go.mod h1:7S9/ev0klgBDR4GtXTXX8a3vIGJpMovkB8vQcUbaXHg= github.com/mattn/go-isatty v0.0.20 h1:xfD0iDuEKnDkl03q4limB+vH+GxLEtL/jb4xVJSWWEY= diff --git a/main.go b/main.go index 2319c67..46b72c0 100644 --- a/main.go +++ b/main.go @@ -21,6 +21,7 @@ import ( "time" "tobey/internal/ctrlq" + "github.com/mariuswilms/tears" "github.com/prometheus/client_golang/prometheus/promhttp" "go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp" _ "go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp" @@ -121,6 +122,8 @@ func configure() { func main() { slog.Info("Tobey starting...") + tear, down := tears.New() + configure() if Debug { @@ -131,11 +134,13 @@ func main() { // This sets up the main process context. ctx, stop := signal.NotifyContext(context.Background(), os.Interrupt) var err error + tear(stop) shutdownOtel, err := StartOTel(ctx) if err != nil { panic(err) } + tear(shutdownOtel).End() if UseMetrics { err = runtime.Start(runtime.WithMinimumReadMemStatsInterval(time.Second)) @@ -152,6 +157,9 @@ func main() { if err != nil { panic(err) } + if redisconn != nil { + tear(redisconn.Close) + } robots := NewRobots() sitemaps := NewSitemaps(robots) // Sitemaps will use Robots to discover sitemaps. @@ -162,6 +170,9 @@ func main() { if err := queue.Open(ctx); err != nil { panic(err) } + if queue != nil { + tear(queue.Close) + } hooks := NewWebhookDispatcher(ctx) progress := MustStartProgressFromEnv(ctx) @@ -174,6 +185,7 @@ func main() { progress, hooks, ) + tear(workers.Wait) apirouter := http.NewServeMux() @@ -280,6 +292,7 @@ func main() { } slog.Info("Stopped serving new API HTTP connections.") }() + tear(apiserver.Shutdown) slog.Info("Starting HTTP Healthcheck server...", "port", HealthcheckListenPort) hcrouter := http.NewServeMux() @@ -304,25 +317,10 @@ func main() { } slog.Info("Stopped serving new Healthcheck HTTP connections.") }() + tear(hcserver.Shutdown) <-ctx.Done() - slog.Info("Exiting...") - stop() // Exit everything that took the context. 
- - slog.Debug("Cleaning up...") - - workers.Wait() - slog.Debug("All visit workers stopped.") - apiserver.Shutdown(context.Background()) - hcserver.Shutdown(context.Background()) - - if queue != nil { - queue.Close() - } - if redisconn != nil { - redisconn.Close() - } - - shutdownOtel(ctx) + slog.Info("Exiting...") + down(context.Background()) } diff --git a/otel.go b/otel.go index e2f3907..ab96393 100644 --- a/otel.go +++ b/otel.go @@ -7,11 +7,11 @@ package main import ( "context" - "errors" "fmt" "os" "time" + "github.com/mariuswilms/tears" "go.opentelemetry.io/otel" "go.opentelemetry.io/otel/exporters/otlp/otlpmetric/otlpmetrichttp" "go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracehttp" @@ -52,25 +52,8 @@ func (c MapCarrierRabbitmq) Keys() []string { // StartOTel bootstraps the OpenTelemetry pipeline. // If it does not return an error, make sure to call shutdown for proper cleanup. -func StartOTel(ctx context.Context) (shutdown func(context.Context) error, err error) { - var shutdownFuncs []func(context.Context) error - - // shutdown calls cleanup functions registered via shutdownFuncs. - // The errors from the calls are joined. - // Each registered cleanup will be invoked once. - shutdown = func(ctx context.Context) error { - var err error - for _, fn := range shutdownFuncs { - err = errors.Join(err, fn(ctx)) - } - shutdownFuncs = nil - return err - } - - // handleErr calls shutdown for cleanup and makes sure that all errors are returned. - handleErr := func(inErr error) { - err = errors.Join(inErr, shutdown(ctx)) - } +func StartOTel(ctx context.Context) (tears.DownFn, error) { + tear, down := tears.New() // Set up propagator. prop := newPropagator() @@ -80,11 +63,10 @@ func StartOTel(ctx context.Context) (shutdown func(context.Context) error, err e // Set up trace provider. tracerProvider, erro := newTraceProvider(ctx) if erro != nil { - handleErr(err) - return + return down, erro } - shutdownFuncs = append(shutdownFuncs, tracerProvider.Shutdown) + tear(tracerProvider.Shutdown) otel.SetTracerProvider(tracerProvider) } @@ -92,17 +74,13 @@ func StartOTel(ctx context.Context) (shutdown func(context.Context) error, err e // Set up meter provider. meterProvider, erra := newMeterProvider(ctx) if erra != nil { - handleErr(erra) - return + return down, erra } - shutdownFuncs = append(shutdownFuncs, meterProvider.Shutdown) + tear(meterProvider.Shutdown) otel.SetMeterProvider(meterProvider) } - // TODO do some research - //otel.SetLogger(logger.GetBaseLogger().Logger) - - return + return down, nil } func newPropagator() propagation.TextMapPropagator { From 4375182c0284117e532c25d409a9099c901c5528 Mon Sep 17 00:00:00 2001 From: Marius Wilms Date: Fri, 7 Feb 2025 16:54:00 +0100 Subject: [PATCH 23/57] Introduce ResultStore interface and factory - Add Result type to represent crawl results - Add ResultStore interface for storing crawl results - Implement factory function to create stores from DSN - Support disk, webhook, and noop store types --- results.go | 82 ++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 82 insertions(+) create mode 100644 results.go diff --git a/results.go b/results.go new file mode 100644 index 0000000..65e02cb --- /dev/null +++ b/results.go @@ -0,0 +1,82 @@ +// Copyright 2024 Factorial GmbH. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. 
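The tears helper adopted in the previous patch condenses exactly this setup/teardown bookkeeping: cleanups are registered next to the code that creates the resource and run together at exit. A compact sketch follows, assuming only the calls visible in the hunks above (`tears.New`, `tear(...)`, `down(...)`); the `conn` type is illustrative.

```go
package main

import (
	"context"

	"github.com/mariuswilms/tears"
)

// conn stands in for any resource with a cleanup, e.g. a Redis connection.
type conn struct{}

func (c *conn) Close() error { return nil }

func run() {
	tear, down := tears.New()

	c := &conn{}
	tear(c.Close) // register the cleanup right where the resource is created

	// ... serve until done ...

	down(context.Background()) // invoke all registered cleanups at exit
}
```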
+ +package main + +import ( + "context" + "fmt" + "net/url" + "runtime" + "tobey/internal/collector" +) + +// Result represents a crawl result that can be stored by any ResultsStore implementation +type Result struct { + Run string `json:"run_uuid"` + RequestURL string `json:"request_url"` + ResponseBody []byte `json:"response_body"` // Will be base64 encoded when marshalled + ResponseStatusCode int `json:"response_status_code"` + Data interface{} `json:"data,omitempty"` // Optional additional data +} + +// NewResult creates a Result from a collector.Response and optional data +func NewResult(run string, res *collector.Response, data interface{}) *Result { + return &Result{ + Run: run, + RequestURL: res.Request.URL.String(), + ResponseBody: res.Body[:], + ResponseStatusCode: res.StatusCode, + Data: data, + } +} + +// ResultStore defines how crawl results are stored +type ResultStore interface { + Save(ctx context.Context, config ResultStoreConfig, run string, res *collector.Response) error +} + +// ResultStoreConfig is the base configuration interface that all result store configs must implement +type ResultStoreConfig interface { + Validate() error +} + +// CreateResultStore creates a ResultsStore based on the provided DSN +func CreateResultStore(dsn string) (ResultStore, error) { + if dsn == "" { + return &NoopResultStore{}, nil + } + + u, err := url.Parse(dsn) + if err != nil { + return nil, fmt.Errorf("invalid results DSN: %w", err) + } + + switch u.Scheme { + case "disk": + path := u.Path + if runtime.GOOS == "windows" && len(path) > 0 && path[0] == '/' { + path = path[1:] // Remove leading slash on Windows + } + config := DiskStoreConfig{ + OutputDir: path, + } + store, err := NewDiskResultStore(config) + if err != nil { + return nil, fmt.Errorf("failed to create disk store: %w", err) + } + return store, nil + case "webhook": + if u.Host == "" { + return nil, fmt.Errorf("webhook results store requires a valid host (e.g., webhook://example.com/results)") + } + endpoint := fmt.Sprintf("%s://%s%s", "https", u.Host, u.Path) + return NewWebhookResultStore(context.Background(), endpoint), nil + case "noop": + return &NoopResultStore{}, nil + default: + return nil, fmt.Errorf("unsupported results store type: %s", u.Scheme) + } +} From d7d37ae35a6917aa75cd46bdd2f67aa5999f55ff Mon Sep 17 00:00:00 2001 From: Marius Wilms Date: Fri, 7 Feb 2025 16:54:06 +0100 Subject: [PATCH 24/57] Implement result store types - Add disk-based result store implementation - Add webhook result store implementation - Add no-op result store implementation --- results_disk.go | 123 +++++++++++++++++++++++++++++++++++++++++++++ results_noop.go | 19 +++++++ results_webhook.go | 112 +++++++++++++++++++++++++++++++++++++++++ 3 files changed, 254 insertions(+) create mode 100644 results_disk.go create mode 100644 results_noop.go create mode 100644 results_webhook.go diff --git a/results_disk.go b/results_disk.go new file mode 100644 index 0000000..7fdd14e --- /dev/null +++ b/results_disk.go @@ -0,0 +1,123 @@ +// Copyright 2024 Factorial GmbH. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. 
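Before the disk implementation below, a hypothetical call site for the `CreateResultStore` factory defined above; the DSN values are examples only and mirror the schemes handled in its switch.

```go
package main

import "log"

// mustResultStore is a hypothetical helper; the disk DSN is an example only.
func mustResultStore() ResultStore {
	store, err := CreateResultStore("disk:///var/lib/tobey/results")
	if err != nil {
		log.Fatal(err)
	}
	// Per the switch above, these would work as well:
	//   CreateResultStore("webhook://example.org/results") // forward via HTTP
	//   CreateResultStore("noop://")                       // discard results
	//   CreateResultStore("")                              // default, also a no-op
	return store
}
```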
+ +package main + +import ( + "context" + "encoding/json" + "fmt" + "log/slog" + "os" + "path/filepath" + "tobey/internal/collector" +) + +// DiskStoreConfig holds configuration for DiskResultsStore +type DiskStoreConfig struct { + OutputDir string `json:"output_dir"` +} + +func (c *DiskStoreConfig) Validate() error { + // No validation needed for now, but we could add checks for write permissions, etc. + return nil +} + +// DiskResultStore implements ResultsStore by saving results to files on disk +type DiskResultStore struct { + defaultConfig DiskStoreConfig +} + +// NewDiskResultStore creates a new DiskResultStore +func NewDiskResultStore(config DiskStoreConfig) (*DiskResultStore, error) { + // Create default output directory if it doesn't exist + if config.OutputDir != "" { + if err := os.MkdirAll(config.OutputDir, 0755); err != nil { + return nil, fmt.Errorf("failed to create output directory: %w", err) + } + } + + return &DiskResultStore{ + defaultConfig: config, + }, nil +} + +// Save implements ResultStore.Save by writing results to a file +func (drs *DiskResultStore) Save(ctx context.Context, config ResultStoreConfig, run string, res *collector.Response) error { + logger := slog.With("run", run, "url", res.Request.URL) + logger.Debug("DiskResultStore: Saving result to file...") + + // Use per-call config if provided, otherwise use default config + outputDir := drs.defaultConfig.OutputDir + var webhookData interface{} + + if config != nil { + if diskConfig, ok := config.(*DiskStoreConfig); ok { + if diskConfig.OutputDir != "" { + outputDir = diskConfig.OutputDir + // Create directory if it doesn't exist + if err := os.MkdirAll(outputDir, 0755); err != nil { + return fmt.Errorf("failed to create output directory: %w", err) + } + } + } + if whConfig, ok := config.(*WebhookResultStoreConfig); ok { + webhookData = whConfig.Data + } + } + + // Create result using common Result type + result := NewResult(run, res, webhookData) + + // Create a filename based on URL and run ID + urlPath := sanitizeFilename(res.Request.URL.Path) + if urlPath == "" { + urlPath = "root" + } + + filename := fmt.Sprintf("%s_%s.json", run, urlPath) + filepath := filepath.Join(outputDir, filename) + + jsonData, err := json.MarshalIndent(result, "", " ") + if err != nil { + logger.Error("DiskResultsStore: Failed to marshal result", "error", err) + return fmt.Errorf("failed to marshal result: %w", err) + } + + if err := os.WriteFile(filepath, jsonData, 0644); err != nil { + logger.Error("DiskResultsStore: Failed to write file", "error", err, "path", filepath) + return fmt.Errorf("failed to write file: %w", err) + } + + logger.Debug("DiskResultsStore: Successfully saved result", "path", filepath) + return nil +} + +// sanitizeFilename creates a safe filename from a URL path +func sanitizeFilename(path string) string { + // Remove leading slash + path = filepath.Clean(path) + if path == "/" || path == "." { + return "" + } + if path[0] == '/' { + path = path[1:] + } + + // Replace remaining slashes with underscores + path = filepath.ToSlash(path) + for i := 0; i < len(path); i++ { + if path[i] == '/' { + path = path[:i] + "_" + path[i+1:] + } + } + + // Limit filename length + if len(path) > 100 { + path = path[:100] + } + + return path +} diff --git a/results_noop.go b/results_noop.go new file mode 100644 index 0000000..6b03860 --- /dev/null +++ b/results_noop.go @@ -0,0 +1,19 @@ +// Copyright 2024 Factorial GmbH. All rights reserved. 
+// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package main + +import ( + "context" + "tobey/internal/collector" +) + +// NoopResultStore implements ResultsStore but discards all results +type NoopResultStore struct{} + +// Save implements ResultsStore.Save by discarding the result +func (n *NoopResultStore) Save(ctx context.Context, config ResultStoreConfig, run string, res *collector.Response) error { + return nil +} diff --git a/results_webhook.go b/results_webhook.go new file mode 100644 index 0000000..a0f58ee --- /dev/null +++ b/results_webhook.go @@ -0,0 +1,112 @@ +// Copyright 2024 Factorial GmbH. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package main + +import ( + "bytes" + "context" + "encoding/json" + "fmt" + "log/slog" + "net/http" + "tobey/internal/collector" +) + +// WebhookResultStoreConfig defines the configuration for webhook endpoints +type WebhookResultStoreConfig struct { + Endpoint string `json:"endpoint"` + Data interface{} `json:"data"` // Accept arbitrary data here. +} + +func (c *WebhookResultStoreConfig) Validate() error { + return nil +} + +func (c *WebhookResultStoreConfig) GetWebhook() *WebhookResultStoreConfig { + return c +} + +// WebhookResultStore implements ResultsStore by sending results to a webhook endpoint. +// It sends results in a non-blocking way, following a fire-and-forget approach. +type WebhookResultStore struct { + client *http.Client + defaultEndpoint string +} + +func NewWebhookResultStore(ctx context.Context, endpoint string) *WebhookResultStore { + return &WebhookResultStore{ + client: CreateRetryingHTTPClient(NoAuthFn), + defaultEndpoint: endpoint, + } +} + +// Save implements ResultsStore.Save by sending results to a webhook endpoint +func (wrs *WebhookResultStore) Save(ctx context.Context, config ResultStoreConfig, run string, res *collector.Response) error { + endpoint := wrs.defaultEndpoint + var webhook *WebhookResultStoreConfig + + if config != nil { + if whConfig, ok := config.(*WebhookResultStoreConfig); ok { + webhook = whConfig + if whConfig.Endpoint != "" { + endpoint = whConfig.Endpoint + } + } + } + + if endpoint == "" { + return fmt.Errorf("no webhook endpoint configured") + } + + logger := slog.With("endpoint", endpoint, "run", run) + logger.Debug("WebhookResultStore: Sending webhook...") + + ctx, span := tracer.Start(ctx, "output.webhook.send") + defer span.End() + + // Create result using common Result type and wrap it in a webhook payload + result := NewResult(run, res, webhook.Data) + + payload := struct { + Action string `json:"action"` + *Result + }{ + Action: "collector.response", + Result: result, + } + + body, err := json.Marshal(payload) + if err != nil { + span.RecordError(err) + return err + } + buf := bytes.NewBuffer(body) + + req, err := http.NewRequestWithContext(ctx, "POST", endpoint, buf) + if err != nil { + span.RecordError(err) + return err + } + req.Header.Set("Content-Type", "application/json") + + go func() { + res, err := wrs.client.Do(req) + defer res.Body.Close() + + if err != nil { + logger.Error("WebhookResultStore: Failed to send webhook.", "error", err) + span.RecordError(err) + return + } + if res.StatusCode != http.StatusOK { + logger.Error("WebhookResultStore: Webhook was not accepted.", "status", res.Status) + span.RecordError(err) + return + } + }() + + return nil +} From ea14521188dfea1df7844344a74889706c379f60 Mon Sep 17 00:00:00 2001 
From: Marius Wilms Date: Fri, 7 Feb 2025 16:54:14 +0100 Subject: [PATCH 25/57] Improve progress tracking - Introduce ProgressStatus type for better type safety - Extract progress implementations into separate files - Simplify progress creation with factory function --- progress.go | 116 ++++++++---------------------------------- progress_factorial.go | 78 ++++++++++++++++++++++++++++ progress_noop.go | 19 +++++++ 3 files changed, 118 insertions(+), 95 deletions(-) create mode 100644 progress_factorial.go create mode 100644 progress_noop.go diff --git a/progress.go b/progress.go index d3fa2f1..31326bd 100644 --- a/progress.go +++ b/progress.go @@ -6,34 +6,34 @@ package main import ( - "bytes" "context" - "encoding/json" "log/slog" - "net/http" - "os" ) +// ProgressStatus represents the valid states for progress updates +type ProgressStatus string + // Constants to be used for indicating what state the progress is in. const ( - ProgressStateQueuedForCrawling = "queued_for_crawling" // Used when an URL has been enqueued, see collector.Collector.EnqueueFn. - ProgressStateCrawling = "crawling" // Used when actively crawling an URL, i.e. right before collector.Collector.Visit. - ProgressStateCrawled = "crawled" - ProgressStateSucceeded = "succeeded" // When crawling has been successful. - ProgressStateErrored = "errored" - ProgressStateCancelled = "cancelled" + ProgressStateQueuedForCrawling ProgressStatus = "queued_for_crawling" // Used when an URL has been enqueued, see collector.Collector.EnqueueFn. + ProgressStateCrawling ProgressStatus = "crawling" // Used when actively crawling an URL, i.e. right before collector.Collector.Visit. + ProgressStateCrawled ProgressStatus = "crawled" + ProgressStateSucceeded ProgressStatus = "succeeded" // When crawling has been successful. + ProgressStateErrored ProgressStatus = "errored" + ProgressStateCancelled ProgressStatus = "cancelled" ) -func MustStartProgressFromEnv(ctx context.Context) ProgressDispatcher { - if dsn := os.Getenv("TOBEY_PROGRESS_DSN"); dsn != "" { +// CreateProgress creates a new progress dispatcher based on the provided DSN. +// If dsn is empty, it returns a NoopProgressDispatcher. 
+func CreateProgress(dsn string) ProgressDispatcher { + if dsn != "" { slog.Info("Using progress service for updates.", "dsn", dsn) return &FactorialProgressServiceDispatcher{ client: CreateRetryingHTTPClient(NoAuthFn), } - } else { - slog.Debug("Not sharing progress updates.") - return &NoopProgressDispatcher{} } + slog.Debug("Not sharing progress updates.") + return &NoopProgressDispatcher{} } type ProgressDispatcher interface { @@ -50,13 +50,14 @@ type Progressor struct { } type ProgressUpdate struct { - Stage string `json:"stage"` - Status string `json:"status"` // only constants allowed - Run string `json:"run_uuid"` // uuid of the run - URL string `json:"url"` + Stage string `json:"stage"` + Status ProgressStatus `json:"status"` + Run string `json:"run_uuid"` // uuid of the run + URL string `json:"url"` } -func (p *Progressor) Update(ctx context.Context, status string) error { +// Update updates the progress with a new status +func (p *Progressor) Update(ctx context.Context, status ProgressStatus) error { return p.dispatcher.Call(ctx, ProgressUpdate{ Stage: p.stage, Run: p.Run, @@ -64,78 +65,3 @@ func (p *Progressor) Update(ctx context.Context, status string) error { Status: status, }) } - -type NoopProgressDispatcher struct { -} - -func (p *NoopProgressDispatcher) With(run string, url string) *Progressor { - return &Progressor{dispatcher: p} -} - -func (p *NoopProgressDispatcher) Call(ctx context.Context, pu ProgressUpdate) error { - return nil -} - -const ( - // The progress service has the concept of stages, which are used to group - // progress updates. The default stage is "crawler". - FactorialProgressServiceDefaultStage = "crawler" - FactorialProgressEndpointUpdate = "api/status/update" - // FactorialProgressEndpointTransition = "api/status/transition-to" // Not yet implemented. -) - -// FactorialProgressServiceDispatcher is a dispatcher for the Factorial progress service. -type FactorialProgressServiceDispatcher struct { - client *http.Client -} - -func (p *FactorialProgressServiceDispatcher) With(run string, url string) *Progressor { - return &Progressor{ - dispatcher: p, - stage: FactorialProgressServiceDefaultStage, - Run: run, - URL: url, - } -} - -// Call sends the progress update over the wire, it implements a fire and forget approach. -func (p *FactorialProgressServiceDispatcher) Call(ctx context.Context, pu ProgressUpdate) error { - logger := slog.With("run", pu.Run, "url", pu.URL) - logger.Debug("Progress Dispatcher: Sending update...") - - ctx, span := tracer.Start(ctx, "output.progress.send") - defer span.End() - - payload := pu - body, err := json.Marshal(payload) - if err != nil { - span.RecordError(err) - return err - } - buf := bytes.NewBuffer(body) - - req, err := http.NewRequestWithContext(ctx, "POST", FactorialProgressEndpointUpdate, buf) - if err != nil { - span.RecordError(err) - return err - } - req.Header.Set("Content-Type", "application/json") - - go func() { - res, err := p.client.Do(req) - defer res.Body.Close() - - if err != nil { - logger.Error("Progress Dispatcher: Failed to send progress.", "error", err) - span.RecordError(err) - return - } - if res.StatusCode != http.StatusOK { - logger.Error("Progress Dispatcher: Progress was not accepted.", "status", res.Status) - span.RecordError(err) - return - } - }() - - return nil -} diff --git a/progress_factorial.go b/progress_factorial.go new file mode 100644 index 0000000..26a667f --- /dev/null +++ b/progress_factorial.go @@ -0,0 +1,78 @@ +// Copyright 2024 Factorial GmbH. All rights reserved. 
+// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package main + +import ( + "bytes" + "context" + "encoding/json" + "log/slog" + "net/http" +) + +const ( + // The progress service has the concept of stages, which are used to group + // progress updates. The default stage is "crawler". + FactorialProgressServiceDefaultStage = "crawler" + FactorialProgressEndpointUpdate = "api/status/update" + // FactorialProgressEndpointTransition = "api/status/transition-to" // Not yet implemented. +) + +// FactorialProgressServiceDispatcher is a dispatcher for the Factorial progress service. +type FactorialProgressServiceDispatcher struct { + client *http.Client +} + +func (p *FactorialProgressServiceDispatcher) With(run string, url string) *Progressor { + return &Progressor{ + dispatcher: p, + stage: FactorialProgressServiceDefaultStage, + Run: run, + URL: url, + } +} + +// Call sends the progress update over the wire, it implements a fire and forget approach. +func (p *FactorialProgressServiceDispatcher) Call(ctx context.Context, pu ProgressUpdate) error { + logger := slog.With("run", pu.Run, "url", pu.URL) + logger.Debug("Progress Dispatcher: Sending update...") + + ctx, span := tracer.Start(ctx, "output.progress.send") + defer span.End() + + payload := pu + body, err := json.Marshal(payload) + if err != nil { + span.RecordError(err) + return err + } + buf := bytes.NewBuffer(body) + + req, err := http.NewRequestWithContext(ctx, "POST", FactorialProgressEndpointUpdate, buf) + if err != nil { + span.RecordError(err) + return err + } + req.Header.Set("Content-Type", "application/json") + + go func() { + res, err := p.client.Do(req) + defer res.Body.Close() + + if err != nil { + logger.Error("Progress Dispatcher: Failed to send progress.", "error", err) + span.RecordError(err) + return + } + if res.StatusCode != http.StatusOK { + logger.Error("Progress Dispatcher: Progress was not accepted.", "status", res.Status) + span.RecordError(err) + return + } + }() + + return nil +} diff --git a/progress_noop.go b/progress_noop.go new file mode 100644 index 0000000..8f1e78e --- /dev/null +++ b/progress_noop.go @@ -0,0 +1,19 @@ +// Copyright 2024 Factorial GmbH. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. 
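To show how the dispatcher API factored out above is meant to be driven, here is a sketch of a caller; the run UUID and URL are example values, and any `ProgressDispatcher` works, including the no-op one defined next.

```go
package main

import (
	"context"
	"log/slog"
)

// reportCrawling is a hypothetical caller, not part of this patch.
func reportCrawling(ctx context.Context, d ProgressDispatcher) {
	// With binds the run and URL once; Update then only carries the status.
	p := d.With("0033085c-685b-432a-9aa4-0aca59cc3e12", "https://example.org/")
	if err := p.Update(ctx, ProgressStateCrawling); err != nil {
		slog.Error("Progress update failed.", "error", err)
	}
}
```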
+ +package main + +import "context" + +type NoopProgressDispatcher struct { +} + +func (p *NoopProgressDispatcher) With(run string, url string) *Progressor { + return &Progressor{dispatcher: p} +} + +func (p *NoopProgressDispatcher) Call(ctx context.Context, pu ProgressUpdate) error { + return nil +} \ No newline at end of file From ea62433de9a625aa171bfcf321799c48892d9d65 Mon Sep 17 00:00:00 2001 From: Marius Wilms Date: Fri, 7 Feb 2025 16:54:21 +0100 Subject: [PATCH 26/57] Integrate result store - Replace webhook dispatcher with result store - Update API to use WebhookResultStoreConfig - Integrate result store in main, run and visit worker - Remove deprecated webhook.go --- api.go | 6 ++-- main.go | 16 +++++---- run.go | 16 ++++----- visitworker.go | 10 +++--- webhook.go | 97 -------------------------------------------------- 5 files changed, 26 insertions(+), 119 deletions(-) delete mode 100644 webhook.go diff --git a/api.go b/api.go index d338b74..9ca2cde 100644 --- a/api.go +++ b/api.go @@ -67,7 +67,7 @@ type APIRequest struct { AllowPaths []string `json:"paths"` DenyPaths []string `json:"!paths"` - WebhookConfig *WebhookConfig `json:"webhook"` + WebhookResultStoreConfig *WebhookResultStoreConfig `json:"webhook"` // If true, we'll bypass the robots.txt check, however we'll still // download the file to look for sitemaps. @@ -205,8 +205,8 @@ func (req *APIRequest) Validate() bool { } } } - if req.WebhookConfig != nil { - if req.WebhookConfig.Endpoint == "" { + if req.WebhookResultStoreConfig != nil { + if req.WebhookResultStoreConfig.Endpoint == "" { return false } } diff --git a/main.go b/main.go index 46b72c0..846c1ec 100644 --- a/main.go +++ b/main.go @@ -79,7 +79,7 @@ const ( HTTPCachePath = "./cache" // The port where to ping for healthcheck. - HealthcheckListenPort int = 10241 + HealthcheckListenPort = 10241 // PulseEndpoint is the endpoint where we send the high frequency metrics. PulseEndpoint = "http://localhost:8090" @@ -174,8 +174,12 @@ func main() { tear(queue.Close) } - hooks := NewWebhookDispatcher(ctx) - progress := MustStartProgressFromEnv(ctx) + rs, err := CreateResultStore(os.Getenv("TOBEY_RESULTS_DSN")) + if err != nil { + panic(err) + } + + progress := CreateProgress(os.Getenv("TOBEY_PROGRESS_DSN")) workers := CreateVisitWorkersPool( ctx, @@ -183,7 +187,7 @@ func main() { runs, queue, progress, - hooks, + rs, ) tear(workers.Wait) @@ -260,7 +264,7 @@ func main() { SkipRobots: req.SkipRobots, SkipSitemapDiscovery: req.SkipSitemapDiscovery, - WebhookConfig: req.WebhookConfig, + WebhookConfig: req.WebhookResultStoreConfig, }, } @@ -268,7 +272,7 @@ func main() { // we start publishing to the work queue. runs.Add(ctx, run) - go run.Start(reqctx, queue, progress, hooks, req.GetURLs(true)) + go run.Start(reqctx, queue, progress, rs, req.GetURLs(true)) result := &APIResponse{ Run: run.ID, diff --git a/run.go b/run.go index a473df1..0bb200b 100644 --- a/run.go +++ b/run.go @@ -47,7 +47,7 @@ type SerializableRun struct { SkipRobots bool SkipSitemapDiscovery bool - WebhookConfig *WebhookConfig + WebhookConfig *WebhookResultStoreConfig } // LiveRun is a live version of the Run struct. 
It contains data that should not @@ -79,7 +79,7 @@ func (r *Run) getAuthFn() GetAuthFn { } } -func (r *Run) GetCollector(ctx context.Context, q ctrlq.VisitWorkQueue, p ProgressDispatcher, h *WebhookDispatcher) *collector.Collector { +func (r *Run) GetCollector(ctx context.Context, q ctrlq.VisitWorkQueue, p ProgressDispatcher, rs ResultStore) *collector.Collector { // getEnqueueFn returns the enqueue function, that will enqueue a single URL to // be crawled. The enqueue function is called whenever a new URL is discovered // by that Collector, i.e. by looking at all links in a crawled page HTML. @@ -137,19 +137,19 @@ func (r *Run) GetCollector(ctx context.Context, q ctrlq.VisitWorkQueue, p Progre // getCollectFn returns the collect function that is called once we have a // result. Uses the information provided in the original crawl request, i.e. the // WebhookConfig, that we have received via the queued message. - getCollectFn := func(run *Run, hooks *WebhookDispatcher) collector.CollectFn { + getCollectFn := func(run *Run, rs ResultStore) collector.CollectFn { // The returned function takes the run context. return func(ctx context.Context, c *collector.Collector, res *collector.Response) { slog.Debug( - "Collect suceeded.", + "Collect succeeded.", "run", run.ID, "url", res.Request.URL, "response.body.length", len(res.Body), "response.status", res.StatusCode, ) if run.WebhookConfig != nil && run.WebhookConfig.Endpoint != "" { - hooks.Send(ctx, run.WebhookConfig, run.ID, res) + rs.Save(ctx, run.WebhookConfig, run.ID, res) } } } @@ -166,7 +166,7 @@ func (r *Run) GetCollector(ctx context.Context, q ctrlq.VisitWorkQueue, p Progre return r.robots.Check(u, r.getAuthFn(), a) }, getEnqueueFn(r, q, p), - getCollectFn(r, h), + getCollectFn(r, rs), ) // TODO: We should be able to pass these into the NewCollector constructor. @@ -180,8 +180,8 @@ func (r *Run) GetCollector(ctx context.Context, q ctrlq.VisitWorkQueue, p Progre // Start starts the crawl with the given URLs. It will discover sitemaps and // enqueue the URLs. From there on more URLs will be discovered and enqueued. -func (r *Run) Start(ctx context.Context, q ctrlq.VisitWorkQueue, p ProgressDispatcher, h *WebhookDispatcher, urls []string) { - c := r.GetCollector(ctx, q, p, h) +func (r *Run) Start(ctx context.Context, q ctrlq.VisitWorkQueue, p ProgressDispatcher, rs ResultStore, urls []string) { + c := r.GetCollector(ctx, q, p, rs) // Decide where the initial URLs should go, users may provide sitemaps and // just URLs to web pages. diff --git a/visitworker.go b/visitworker.go index 3013d19..fb6c41f 100644 --- a/visitworker.go +++ b/visitworker.go @@ -33,7 +33,7 @@ func CreateVisitWorkersPool( runs *RunManager, q ctrlq.VisitWorkQueue, progress ProgressDispatcher, - hooks *WebhookDispatcher, + rs ResultStore, ) *sync.WaitGroup { var wg sync.WaitGroup @@ -42,7 +42,7 @@ func CreateVisitWorkersPool( wg.Add(1) go func(id int) { - if err := VisitWorker(ctx, id, runs, q, progress, hooks); err != nil { + if err := VisitWorker(ctx, id, runs, q, progress, rs); err != nil { slog.Error("Visitor: Worker exited with error.", "worker.id", id, "error", err) } else { slog.Debug("Visitor: Worker exited cleanly.", "worker.id", id) @@ -60,7 +60,7 @@ func VisitWorker( runs *RunManager, q ctrlq.VisitWorkQueue, progress ProgressDispatcher, - hooks *WebhookDispatcher, + rs ResultStore, ) error { wlogger := slog.With("worker.id", id) wlogger.Debug("Visitor: Starting...") @@ -108,7 +108,7 @@ func VisitWorker( // yet have a collector available via the Manager. 
Please note that Collectors // are not shared by the Manager across tobey instances. r, _ := runs.Get(ctx, job.Run) - c := r.GetCollector(ctx, q, progress, hooks) + c := r.GetCollector(ctx, q, progress, rs) p.Update(jctx, ProgressStateCrawling) @@ -135,7 +135,7 @@ func VisitWorker( span.AddEvent("Visitor: Visited URL.", t) if r.WebhookConfig != nil { - hooks.Send(jctx, r.WebhookConfig, r.ID, res) + rs.Save(jctx, r.WebhookConfig, r.ID, res) } span.End() diff --git a/webhook.go b/webhook.go deleted file mode 100644 index a5f8bc2..0000000 --- a/webhook.go +++ /dev/null @@ -1,97 +0,0 @@ -// Copyright 2024 Factorial GmbH. All rights reserved. -// -// Use of this source code is governed by a BSD-style -// license that can be found in the LICENSE file. - -package main - -import ( - "bytes" - "context" - "encoding/json" - "log/slog" - "net/http" - "tobey/internal/collector" -) - -type WebhookConfig struct { - Endpoint string `json:"endpoint"` - Data interface{} `json:"data"` // Accept arbitrary data here. -} - -// The messages that should go over the wire. -type WebhookPayload struct { - Action string `json:"action"` - Run string `json:"run_uuid"` - // TODO: Figure out if we want to use "Standard Webhook" and/or if - // we than want to nest all results data under Data as to prevent - // collisions with Action and other fields. - // TODO Talk about the interface variation - RequestURL string `json:"request_url"` - ResponseBody []byte `json:"response_body"` // Will be base64 encoded once marshalled. - ResponseStatusCode int `json:"response_status_code"` - Data interface{} `json:"data"` // Pass through arbitrary data here. -} - -func NewWebhookDispatcher(ctx context.Context) *WebhookDispatcher { - return &WebhookDispatcher{ - client: CreateRetryingHTTPClient(NoAuthFn), - } -} - -type WebhookDispatcher struct { - client *http.Client -} - -// Send sends a webhook to the given endpoint. It returns immediately, and is not blocking. It implements a fire and forget approach. -func (wd *WebhookDispatcher) Send(ctx context.Context, webhook *WebhookConfig, run string, res *collector.Response) error { - logger := slog.With("endpoint", webhook.Endpoint, "run", run) - logger.Debug("Webhook Dispatcher: Sending webhook...") - - ctx, span := tracer.Start(ctx, "output.webhook.send") - defer span.End() - - payload := WebhookPayload{ - Action: "collector.response", - Run: run, - - RequestURL: res.Request.URL.String(), - ResponseBody: res.Body[:], - ResponseStatusCode: res.StatusCode, - - // We pass through the data we received taking in the - // initial crawl request, verbatim. 
- Data: webhook.Data, - } - body, err := json.Marshal(payload) - if err != nil { - span.RecordError(err) - return err - } - buf := bytes.NewBuffer(body) - - req, err := http.NewRequestWithContext(ctx, "POST", webhook.Endpoint, buf) - if err != nil { - span.RecordError(err) - return err - } - req.Header.Set("Content-Type", "application/json") - - go func() { - res, err := wd.client.Do(req) - defer res.Body.Close() - - if err != nil { - logger.Error("Webhook Dispatcher: Failed to send webhook.", "error", err) - span.RecordError(err) - return - } - if res.StatusCode != http.StatusOK { - logger.Error("Webhook Dispatcher: Webhook was not accepted.", "status", res.Status) - span.RecordError(err) - return - } - }() - - return nil -} From e27486d0801d855000df51739767d0664b2ec295 Mon Sep 17 00:00:00 2001 From: Marius Wilms Date: Fri, 7 Feb 2025 17:22:18 +0100 Subject: [PATCH 27/57] Update README --- README.md | 93 +++++++++++++++++++++++++++++++++-------------------- main.go | 5 ++- progress.go | 28 ++++++++++++---- 3 files changed, 84 insertions(+), 42 deletions(-) diff --git a/README.md b/README.md index a764d02..c91348b 100644 --- a/README.md +++ b/README.md @@ -1,42 +1,71 @@ # Tobey, a robust and scalable Crawler -The service is entirely stateless and receives requests to crawl a website via a -simple HTTP API. Once a resources has been downloaded, forwards the results to a -webhook, if one is configured. - -In its simplest form the service just receives a root URL of the website to be -crawled. - -The service vertical scaling can be controlled by the number of workers used for -crawling. The service is horizontally scalable by adding more instances on nodes -in a cluster. In horizontal scaling, any instances can receive crawl requests, -for easy load balancing. The instances will coordinate with each other via Redis. - -## Features - -- No configuration required. -- Simple HTTP API to submit crawl requests. -- Scalable, horizontally and vertically. -- Stateless, no data store required, as nothing is persisted. -- No further service dependencies, when operating as a single instance. -- Detects and uses a sitemap and robots.txt automatically (can be disabled). -- Per host rate limiting, even when multiple instances are used. -- Full support for OpenTelemetry. +Tobey is a throughput optimizing web crawler, that is scalable from a single instance to a cluster. It features intelligent +rate limiting, distributed coordination, and flexible deployment options. ## Running Tobey +Start the service. ```sh -# In the first terminal start the service. go run . +``` + +In its simplest form the service just receives a root URL of the website to be +crawled. -# In another terminal, submit a crawl request. +```sh curl -X POST http://127.0.0.1:8080 \ -H 'Content-Type: application/json' \ -d '{"url": "https://www.example.org/"}' ``` +## Deployment Options + +### Dependency Free + +By default Tobey runs without any depdencies on any other service. It this mode +the service will not coordinate with other instances. It will store results locally +on disk, but not report any progress. If you are tryint out tobey this is the +easiest way to get started. + +```sh +TOBEY_RESULTS_DSN=disk:///path/to/results go run . +``` + +### Stateless Operation + +It is possible to configure and use Tobey in a stateless manner. In this operation mode +you'll specificy configuration on a per-run basis, and not statically via a configuration file. 
Choosing +the webhook results store will forward results to a webhook endpoint without storing them locally. + +```sh +TOBEY_RESULTS_DSN=webhook://example.org/webhook go run . +``` + +### Distributed Operation + +The service is horizontally scalable by adding more instances on nodes +in a cluster. In horizontal scaling, any instances can receive crawl requests, +for easy load balancing. The instances will coordinate with each other via Redis. + +```sh +TOBEY_REDIS_DSN=redis://localhost:6379 go run . +``` + +## Scaling -## Architecture +Tobey can be scaled vertically by increasing the number of workers, via the `WORKERS` environment variable, or horizontally +by adding more instances in a cluster, see the [Distributed Operation](#distributed-operation) section for more details. + +The crawler is designed to handle a large potentially infinite number of hosts, +which presents challenges for managing resources like memory and concurrency. +Keeping a persistent worker process or goroutine for each host would be +inefficient and resource-intensive, particularly since external interactions +can make it difficult to keep them alive. Instead, Tobey uses a pool of workers +that can process multiple requests per host concurrently, balancing the workload +across different hosts. + +## Smart Rate Limiting The Tobey Crawler architecture optimizes throughput per host by dynamically managing rate limits, ensuring that requests to each host are processed as @@ -50,13 +79,7 @@ complexities of dynamic rate limiting from other parts of the system. The goal is to focus on maintaining a steady flow of requests without overwhelming individual hosts. -The crawler is designed to handle a large potentially infinite number of hosts, -which presents challenges for managing resources like memory and concurrency. -Keeping a persistent worker process or goroutine for each host would be -inefficient and resource-intensive, particularly since external interactions -can make it difficult to keep them alive. Instead, Tobey uses a pool of workers -that can process multiple requests per host concurrently, balancing the workload -across different hosts. +## Caching Caching is a critical part of the architecture. The crawler uses a global cache, for HTTP responses. Access to sitemaps and robot control files are also cached. @@ -68,7 +91,7 @@ keeping the system responsive and compliant. This layered caching strategy, along with the dynamic rate limit adjustment, ensures that Tobey maintains high efficiency and adaptability during its crawling operations. -## Trade-offs +## Limitations Also Tobey can be configured - on a per run basis - to crawl websites behind HTTP basic auth, **it does not support fetching personalized content**. It is @@ -76,7 +99,6 @@ expected that the website is generally publicly available, and that the content is the same for all users. When HTTP basic auth is used by the website it must only be so in order to prevent early access. - ## Configuration The service is configured via environment variables. The following environment @@ -87,7 +109,8 @@ variables are available: | `TOBEY_DEBUG` | `false` | `true`, `false` | Controls debug mode. | | `TOBEY_SKIP_CACHE` | `false` | `true`, `false` | Controls caching access. | | `TOBEY_REDIS_DSN` | empty | i.e. `redis://localhost:6379` | DSN to reach a Redis instance. Only needed when operating multiple instances. | -| `TOBEY_PROGRESS_DSN` | empty | i.e. `http://localhost:9020` | DSN where to reach a progress service. 
When configured tobey will send progress updates there. | +| `TOBEY_PROGRESS_DSN` | empty | `factorial://host:port`, `noop://` | DSN for progress reporting service. When configured, Tobey will send progress updates there. The factorial scheme enables progress updates to a Factorial progress service. Use noop:// to explicitly disable progress updates. | +| `TOBEY_RESULTS_DSN` | empty | `disk:///path`, `webhook://host/path`, `noop://` | DSN specifying where crawl results should be stored. Use disk:// for local filesystem storage, webhook:// to forward results to an HTTP endpoint, or noop:// to discard results. | | `TOBEY_TELEMETRY` | empty | i.e. `metrics traces` | Space separated list of what kind of telemetry is emitted. | On top of these variables, the service's telemetry diff --git a/main.go b/main.go index 846c1ec..601d3fc 100644 --- a/main.go +++ b/main.go @@ -179,7 +179,10 @@ func main() { panic(err) } - progress := CreateProgress(os.Getenv("TOBEY_PROGRESS_DSN")) + progress, err := CreateProgress(os.Getenv("TOBEY_PROGRESS_DSN")) + if err != nil { + panic(err) + } workers := CreateVisitWorkersPool( ctx, diff --git a/progress.go b/progress.go index 31326bd..0148359 100644 --- a/progress.go +++ b/progress.go @@ -7,7 +7,9 @@ package main import ( "context" + "fmt" "log/slog" + "net/url" ) // ProgressStatus represents the valid states for progress updates @@ -25,15 +27,29 @@ const ( // CreateProgress creates a new progress dispatcher based on the provided DSN. // If dsn is empty, it returns a NoopProgressDispatcher. -func CreateProgress(dsn string) ProgressDispatcher { - if dsn != "" { - slog.Info("Using progress service for updates.", "dsn", dsn) +func CreateProgress(dsn string) (ProgressDispatcher, error) { + if dsn == "" { + slog.Debug("Not sharing progress updates.") + return &NoopProgressDispatcher{}, nil + } + + u, err := url.Parse(dsn) + if err != nil { + return nil, fmt.Errorf("invalid progress DSN: %w", err) + } + + switch u.Scheme { + case "factorial": + slog.Info("Using Factorial progress service for updates.", "dsn", dsn) return &FactorialProgressServiceDispatcher{ client: CreateRetryingHTTPClient(NoAuthFn), - } + }, nil + case "noop": + slog.Debug("Using noop progress dispatcher.") + return &NoopProgressDispatcher{}, nil + default: + return nil, fmt.Errorf("unsupported progress dispatcher type: %s", u.Scheme) } - slog.Debug("Not sharing progress updates.") - return &NoopProgressDispatcher{} } type ProgressDispatcher interface { From 1e3fee4aa711fcdd682913219c59f0800202d074 Mon Sep 17 00:00:00 2001 From: Marius Wilms Date: Mon, 10 Feb 2025 11:53:19 +0100 Subject: [PATCH 28/57] Attach metadata to run --- README.md | 178 +++++++++++++++++++++++++++------------------ api.go | 3 + main.go | 3 +- results.go | 8 +- results_disk.go | 8 +- results_noop.go | 2 +- results_webhook.go | 6 +- run.go | 5 +- visitworker.go | 2 +- 9 files changed, 128 insertions(+), 87 deletions(-) diff --git a/README.md b/README.md index c91348b..f6f3843 100644 --- a/README.md +++ b/README.md @@ -23,9 +23,9 @@ curl -X POST http://127.0.0.1:8080 \ ### Dependency Free -By default Tobey runs without any depdencies on any other service. It this mode +By default Tobey runs without any dependencies on any other service. In this mode the service will not coordinate with other instances. It will store results locally -on disk, but not report any progress. If you are tryint out tobey this is the +on disk, but not report any progress. If you are trying out tobey this is the easiest way to get started. 
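In this mode the disk store from results_disk.go writes one JSON file per visited URL, named `<run-uuid>_<sanitized-path>.json` (with `root` standing in for the start page). A quick way to inspect the output, assuming the results path from above and that `jq` is installed:

```sh
ls /path/to/results
jq .request_url /path/to/results/*_root.json
```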

```sh
TOBEY_RESULTS_DSN=disk:///path/to/results go run .
```
 
 ### Stateless Operation
 
 It is possible to configure and use Tobey in a stateless manner. In this operation mode
-you'll specificy configuration on a per-run basis, and not statically via a configuration file. Choosing
+you'll specify configuration on a per-run basis, and not statically via a configuration file. Choosing
 the webhook results store will forward results to a webhook endpoint without storing them locally.
 
 ```sh
-TOBEY_RESULTS_DSN=webhook://example.org/webhook go run .
+TOBEY_RESULTS_DSN=webhook://example.org/webhook?enable_dynamic_config=true go run .
 ```
 
 ### Distributed Operation
@@ -117,7 +117,9 @@ On top of these variables, the service's telemetry
 feature can be configured via the commonly known [OpenTelemetry environment
 variables](https://opentelemetry.io/docs/languages/sdk-configuration/otlp-exporter/).
 
-## Submitting a Basic Crawl Request
+## Providing Crawl Targets
+
+### Submitting a Basic Crawl Request
 
 Tobey currently has a single API endpoint to receive crawl requests: `/`.
 
@@ -132,9 +134,46 @@ extracting links for content of the webpages.
 }
 ```
 
-### Constraining Crawling
+### Multiple URLs
 
-#### Domains
+Multiple URLs either as entrypoints or for oneshot downloading work as well,
+using the `urls` key:
+
+```jsonc
+{
+  "urls": [
+    "https://example.org/blog",
+    "https://example.org/values"
+  ]
+}
+```
+
+
+### Authentication
+
+When the target you want to crawl requires authentication, you can provide
+the credentials for HTTP basic auth in the URL. The crawler will use these
+credentials for all resources under the same domain for that run.
+
+```jsonc
+{
+  "url": "https://foo:secret@example.org"
+}
+```
+
+When you want to provide the credentials in a more structured way, you can do so
+by providing the `auth` key:
+
+```jsonc
+{
+  "url": "https://example.org",
+  "auth": [
+    { host: "example.org", method: "basic", username: "foo", password: "secret" }
+  ]
+}
+```
+
+### Domain Constraints
 
 By default and when crawling a whole website tobey will only download resources
 from the host as provided in the URL, this is so we don't end up downloading the
@@ -156,7 +195,7 @@ allow the naked domain (and all its subdomains).
 }
 ```
 
-### Paths
+### Path Constraints
 
 To skip resources with certain paths, you may provide a list of literal path
 segments to include or skip via the `paths` or `!paths` key. The path segments
@@ -178,38 +217,6 @@ you may also use regular expressions.
 
 As you can see positive and negative path constraints can be combined. With the
 options given above, `/en/about-us/` would be crawled, but not `/en/search/` and
 not `/blog/article`.
 
-### Run Identifiers
-
-Each time you submit a URL to be crawled, a "run" is internally created. Tobey
-automatically creates a unique run UUID for you, when you don't submit one
-yourself. You'll receive that created run UUID in the response when submitting a
-URL to be crawled.
-
-When you already have a run UUID yourself, you may as well submit in the crawl
-request, than your run UUID will be used internally and visible when results are
-dispatched. 
-
-```jsonc
-{
-  "url": "https://example.org",
-  "run_uuid": "0033085c-685b-432a-9aa4-0aca59cc3e12"
-}
-```
-
-### Multiple URLs
-
-Multiple URLs either as entrypoints or for oneshot downloading work a well,
-using the `urls` key:
-
-```jsonc
-{
-  "urls": [
-    "https://example.org/blog",
-    "https://example.org/values"
-  ]
-}
-```
-
 ### Bypassing robots.txt
 
 When running certain tests you might want to bypass the robots.txt file. You can
@@ -250,68 +257,95 @@ the URL under the `url` key alongside the entrypoint:
 }
 ```
 
-### Authentication
+## Triggering Runs
 
-When the resource you want to download requires authentication, you can provide
-the credentials for HTTP basic auth in the URL. The crawler will use these
-credientials for all resources under the same domain.
+Each time you submit a URL to be crawled, a _Run_ is internally created. Tobey
+automatically creates a unique run UUID as **a run identifier** for you. You may
+specify your own run UUID as well.
 
 ```jsonc
 {
-  "url": "https://foo:secret@example.org"
+  "url": "https://example.org",
+  "run_uuid": "0033085c-685b-432a-9aa4-0aca59cc3e12" // optional
+  // ...
 }
 ```
 
-When you want to provide the credentials in a more structured way, you can do so
-by providing the `auth` key:
+You may also attach metadata to a run. This metadata will be attached to all results
+from that run.
 
 ```jsonc
 {
-  "url": "https://example.org"
-  "auth": [
-    { host: "example.org", method: "basic", username: "foo", password: "secret" }
-  ]
+  "url": "https://example.org",
+  // ...
+  "metadata": {
+    "internal_project_reference": 42,
+    "triggered_by": "user@example.org"
+  }
 }
 ```
 
-### Output Methods
+## Collecting Results
+
+Tobey currently supports multiple methods to handle results. You can either store
+them locally on disk, or forward them to a webhook endpoint. Additionally, results
+can also be discarded; this is useful for testing.
 
-Tobey currently supports one output method.
+When you configure the crawler to **store results on disk**, it will save the results
+to the local filesystem, beneath the directory configured in the DSN.
 
-#### Using Webhook to state where results should go
+```sh
+TOBEY_RESULTS_DSN=disk:///path/to/results
+```
 
-With this output method tobey doesn't store any results by itself. It instead forwards
+When you configure the crawler to **forward results to a webhook**, it will deliver
 the results to a configured webhook endpoint.
 [Webhooks](https://mailchimp.com/en/marketing-glossary/webhook) are a technique to
 notify other services about a result, once its ready.
 
-Once the crawlwer has results for a resource, it will deliver them to a webhook,
-if one is configured via the `webhook` key. Using the `data` key you can pass
-through additional information to the target of the webhook.
+```sh
+TOBEY_RESULTS_DSN=webhook://example.org/webhook
+```
+
+For the webhook method, **dynamic re-configuration** is supported. This means that you can
+configure the webhook endpoint on a per-request basis. Dynamic re-configuration is disabled
+by default, and can be enabled by adding `enable_dynamic_config` to the DSN.
 
-```jsonc
+```sh
+TOBEY_RESULTS_DSN=webhook://example.org/webhook?enable_dynamic_config
+```
+
+You can then specify the webhook endpoint in the crawl request:
+
+```jsonc
 {
-  // ...
   "url": "https://example.org",
-  // ...
-  "webhook": {
-    "endpoint": "https://metatags.example.org/accept-webhook",
-    "data": { // Any additional data that you want the hook to receive. 
-
-      "magic_number": 12
-    }
-  }
+  "results_dsn": "webhook://example.org/webhook"
 }
 ```
 
-This is how the payload will look like, and how it is received by the target:
+When you configure the crawler to **discard results**, it will not store any results
+by itself. This is useful for testing and is **the default behavior**.
+
+```sh
+TOBEY_RESULTS_DSN=noop://
+```
+
+### Results Format
+
+A _Result object_ is a JSON object that contains the result of a crawl request alongside
+the metadata of the run; see _Runs_ above for more details.
 
 ```jsonc
 {
   "action": "collector.response",
   "run_uuid": "0033085c-685b-432a-9aa4-0aca59cc3e12",
-  // ...
+  "run_metadata": {
+    "internal_project_reference": 42,
+    "triggered_by": "user@example.org"
+  },
   "request_url": "http://...",
   "response_body": "...", // Base64 encoded raw response body received when downloading the resource.
   // ...
-  "data": { // Passed-through data.
-    "magic_number": 12
-  },
 }
 ```
+
diff --git a/api.go b/api.go
index 9ca2cde..745a07f 100644
--- a/api.go
+++ b/api.go
@@ -60,6 +60,9 @@ type APIRequest struct {
 	// empty, we'll generate one.
 	Run string `json:"run_uuid"`
 
+	// Metadata associated with this run that will be included in all results
+	RunMetadata interface{} `json:"run_metadata,omitempty"`
+
 	URL  string   `json:"url"`
 	URLs []string `json:"urls"`
 
diff --git a/main.go b/main.go
index 601d3fc..bae98f0 100644
--- a/main.go
+++ b/main.go
@@ -254,7 +254,8 @@ func main() {
 
 	run := &Run{
 		SerializableRun: SerializableRun{
-			ID: id,
+			ID:       id,
+			Metadata: req.RunMetadata,
 
 			URLs: req.GetURLs(true),
 
diff --git a/results.go b/results.go
index 65e02cb..b72fda4 100644
--- a/results.go
+++ b/results.go
@@ -16,6 +16,7 @@ import (
 // Result represents a crawl result that can be stored by any ResultsStore implementation
 type Result struct {
 	Run                string      `json:"run_uuid"`
+	RunMetadata        interface{} `json:"run_metadata,omitempty"`
 	RequestURL         string      `json:"request_url"`
 	ResponseBody       []byte      `json:"response_body"` // Will be base64 encoded when marshalled
 	ResponseStatusCode int         `json:"response_status_code"`
@@ -23,9 +24,10 @@ type Result struct {
 }
 
 // NewResult creates a Result from a collector.Response and optional data
-func NewResult(run string, res *collector.Response, data interface{}) *Result {
+func NewResult(run *Run, res *collector.Response, data interface{}) *Result {
 	return &Result{
-		Run:                run,
+		Run:                run.ID,
+		RunMetadata:        run.Metadata,
 		RequestURL:         res.Request.URL.String(),
 		ResponseBody:       res.Body[:],
 		ResponseStatusCode: res.StatusCode,
@@ -35,7 +37,7 @@ func NewResult(run string, res *collector.Response, data interface{}) *Result {
 
 // ResultStore defines how crawl results are stored
 type ResultStore interface {
-	Save(ctx context.Context, config ResultStoreConfig, run string, res *collector.Response) error
+	Save(ctx context.Context, config ResultStoreConfig, run *Run, res *collector.Response) error
 }
 
 // ResultStoreConfig is the base configuration interface that all result store configs must implement
diff --git a/results_disk.go b/results_disk.go
index 7fdd14e..9e15081 100644
--- a/results_disk.go
+++ b/results_disk.go
@@ -45,8 +45,8 @@ func NewDiskResultStore(config DiskStoreConfig) (*DiskResultStore, error) {
 }
 
 // Save implements ResultStore.Save by writing results to a file
-func (drs *DiskResultStore) Save(ctx context.Context, config ResultStoreConfig, run string, res *collector.Response) error {
-	logger := slog.With("run", run, "url", res.Request.URL)
+func (drs *DiskResultStore) Save(ctx context.Context, config ResultStoreConfig, run *Run, res 
*collector.Response) error { + logger := slog.With("run", run.ID, "url", res.Request.URL) logger.Debug("DiskResultStore: Saving result to file...") // Use per-call config if provided, otherwise use default config @@ -68,7 +68,7 @@ func (drs *DiskResultStore) Save(ctx context.Context, config ResultStoreConfig, } } - // Create result using common Result type + // Create result using run metadata result := NewResult(run, res, webhookData) // Create a filename based on URL and run ID @@ -77,7 +77,7 @@ func (drs *DiskResultStore) Save(ctx context.Context, config ResultStoreConfig, urlPath = "root" } - filename := fmt.Sprintf("%s_%s.json", run, urlPath) + filename := fmt.Sprintf("%s_%s.json", run.ID, urlPath) filepath := filepath.Join(outputDir, filename) jsonData, err := json.MarshalIndent(result, "", " ") diff --git a/results_noop.go b/results_noop.go index 6b03860..4524f58 100644 --- a/results_noop.go +++ b/results_noop.go @@ -14,6 +14,6 @@ import ( type NoopResultStore struct{} // Save implements ResultsStore.Save by discarding the result -func (n *NoopResultStore) Save(ctx context.Context, config ResultStoreConfig, run string, res *collector.Response) error { +func (n *NoopResultStore) Save(ctx context.Context, config ResultStoreConfig, run *Run, res *collector.Response) error { return nil } diff --git a/results_webhook.go b/results_webhook.go index a0f58ee..bc7a509 100644 --- a/results_webhook.go +++ b/results_webhook.go @@ -44,7 +44,7 @@ func NewWebhookResultStore(ctx context.Context, endpoint string) *WebhookResultS } // Save implements ResultsStore.Save by sending results to a webhook endpoint -func (wrs *WebhookResultStore) Save(ctx context.Context, config ResultStoreConfig, run string, res *collector.Response) error { +func (wrs *WebhookResultStore) Save(ctx context.Context, config ResultStoreConfig, run *Run, res *collector.Response) error { endpoint := wrs.defaultEndpoint var webhook *WebhookResultStoreConfig @@ -61,13 +61,13 @@ func (wrs *WebhookResultStore) Save(ctx context.Context, config ResultStoreConfi return fmt.Errorf("no webhook endpoint configured") } - logger := slog.With("endpoint", endpoint, "run", run) + logger := slog.With("endpoint", endpoint, "run", run.ID) logger.Debug("WebhookResultStore: Sending webhook...") ctx, span := tracer.Start(ctx, "output.webhook.send") defer span.End() - // Create result using common Result type and wrap it in a webhook payload + // Create result using run metadata result := NewResult(run, res, webhook.Data) payload := struct { diff --git a/run.go b/run.go index 0bb200b..4b9287c 100644 --- a/run.go +++ b/run.go @@ -34,7 +34,8 @@ type Run struct { // store the Run in the RunStore. It contains only static data. "Live" data, // like seen URLs are not kept in this struct. 
type SerializableRun struct { - ID string + ID string + Metadata interface{} URLs []string @@ -149,7 +150,7 @@ func (r *Run) GetCollector(ctx context.Context, q ctrlq.VisitWorkQueue, p Progre "response.status", res.StatusCode, ) if run.WebhookConfig != nil && run.WebhookConfig.Endpoint != "" { - rs.Save(ctx, run.WebhookConfig, run.ID, res) + rs.Save(ctx, run.WebhookConfig, run, res) } } } diff --git a/visitworker.go b/visitworker.go index fb6c41f..dbb8e35 100644 --- a/visitworker.go +++ b/visitworker.go @@ -135,7 +135,7 @@ func VisitWorker( span.AddEvent("Visitor: Visited URL.", t) if r.WebhookConfig != nil { - rs.Save(jctx, r.WebhookConfig, r.ID, res) + rs.Save(jctx, r.WebhookConfig, r, res) } span.End() From c8693254400a64b23f730f34329a87aee6028027 Mon Sep 17 00:00:00 2001 From: Marius Wilms Date: Mon, 10 Feb 2025 12:00:20 +0100 Subject: [PATCH 29/57] Allow dynamic configuration of the webhook endpoint --- results.go | 2 +- results_webhook.go | 29 ++++++++++++++++++++++++----- 2 files changed, 25 insertions(+), 6 deletions(-) diff --git a/results.go b/results.go index b72fda4..6de82f9 100644 --- a/results.go +++ b/results.go @@ -74,7 +74,7 @@ func CreateResultStore(dsn string) (ResultStore, error) { if u.Host == "" { return nil, fmt.Errorf("webhook results store requires a valid host (e.g., webhook://example.com/results)") } - endpoint := fmt.Sprintf("%s://%s%s", "https", u.Host, u.Path) + endpoint := fmt.Sprintf("%s://%s%s?%s", "https", u.Host, u.Path, u.RawQuery) return NewWebhookResultStore(context.Background(), endpoint), nil case "noop": return &NoopResultStore{}, nil diff --git a/results_webhook.go b/results_webhook.go index bc7a509..8711beb 100644 --- a/results_webhook.go +++ b/results_webhook.go @@ -12,6 +12,7 @@ import ( "fmt" "log/slog" "net/http" + "net/url" "tobey/internal/collector" ) @@ -32,14 +33,30 @@ func (c *WebhookResultStoreConfig) GetWebhook() *WebhookResultStoreConfig { // WebhookResultStore implements ResultsStore by sending results to a webhook endpoint. // It sends results in a non-blocking way, following a fire-and-forget approach. type WebhookResultStore struct { - client *http.Client - defaultEndpoint string + client *http.Client + defaultEndpoint string + allowDynamicConfig bool } func NewWebhookResultStore(ctx context.Context, endpoint string) *WebhookResultStore { + u, err := url.Parse(endpoint) + if err != nil { + return &WebhookResultStore{ + client: CreateRetryingHTTPClient(NoAuthFn), + defaultEndpoint: endpoint, + allowDynamicConfig: false, + } + } + + allowDynamic := u.Query().Get("enable_dynamic_config") != "" + + u.RawQuery = "" + cleanEndpoint := u.String() + return &WebhookResultStore{ - client: CreateRetryingHTTPClient(NoAuthFn), - defaultEndpoint: endpoint, + client: CreateRetryingHTTPClient(NoAuthFn), + defaultEndpoint: cleanEndpoint, + allowDynamicConfig: allowDynamic, } } @@ -51,8 +68,10 @@ func (wrs *WebhookResultStore) Save(ctx context.Context, config ResultStoreConfi if config != nil { if whConfig, ok := config.(*WebhookResultStoreConfig); ok { webhook = whConfig - if whConfig.Endpoint != "" { + if whConfig.Endpoint != "" && wrs.allowDynamicConfig { endpoint = whConfig.Endpoint + } else if whConfig.Endpoint != "" && !wrs.allowDynamicConfig { + slog.Warn("Dynamic webhook configuration is disabled. 
Ignoring custom endpoint.")
+			}
 		}
 	}
 
From b8003533ab8639b542041cc2df18ab0ccd9c3222 Mon Sep 17 00:00:00 2001
From: Marius Wilms
Date: Mon, 10 Feb 2025 12:02:38 +0100
Subject: [PATCH 30/57] Allow to run only with dynamic config

---
 .env.example       |  1 +
 README.md          |  5 ++++-
 results.go         |  5 +++--
 results_webhook.go | 19 ++++++++++++++-----
 4 files changed, 22 insertions(+), 8 deletions(-)

diff --git a/.env.example b/.env.example
index 7a8f019..b4e9f68 100644
--- a/.env.example
+++ b/.env.example
@@ -2,6 +2,7 @@
 # TOBEY_SKIP_CACHE=false
 
 TOBEY_PROGRESS_DSN=http://progress:8080
+TOBEY_RESULTS_DSN=webhook://?enable_dynamic_config
 TOBEY_REDIS_DSN=redis:6379/0
 
 # A space separated list of telemetry to send. Available telemetry: metrics,
diff --git a/README.md b/README.md
index f6f3843..4e3ce36 100644
--- a/README.md
+++ b/README.md
@@ -311,7 +311,9 @@ configure the webhook endpoint on a per-request basis. Dynamic re-configuration
 by default, and can be enabled by adding `enable_dynamic_config` to the DSN.
 
 ```sh
-TOBEY_RESULTS_DSN=webhook://example.org/webhook?enable_dynamic_config
+TOBEY_RESULTS_DSN=webhook://example.org/webhook?enable_dynamic_config # with default endpoint
+TOBEY_RESULTS_DSN=webhook://?enable_dynamic_config # without default endpoint, requires dynamic
+                                                   # re-configuration in each crawl request
 ```
 
 You can then specify the webhook endpoint in the crawl request:
diff --git a/results.go b/results.go
index 6de82f9..a799300 100644
--- a/results.go
+++ b/results.go
@@ -71,8 +71,9 @@ func CreateResultStore(dsn string) (ResultStore, error) {
 		}
 		return store, nil
 	case "webhook":
-		if u.Host == "" {
-			return nil, fmt.Errorf("webhook results store requires a valid host (e.g., webhook://example.com/results)")
+		// Only require host if dynamic config is not enabled
+		if u.Host == "" && u.Query().Get("enable_dynamic_config") == "" {
+			return nil, fmt.Errorf("webhook results store requires a valid host (e.g., webhook://example.com/results) unless dynamic configuration is enabled")
 		}
 		endpoint := fmt.Sprintf("%s://%s%s?%s", "https", u.Host, u.Path, u.RawQuery)
 		return NewWebhookResultStore(context.Background(), endpoint), nil
diff --git a/results_webhook.go b/results_webhook.go
index 8711beb..6b636ae 100644
--- a/results_webhook.go
+++ b/results_webhook.go
@@ -34,7 +34,7 @@ func (c *WebhookResultStoreConfig) GetWebhook() *WebhookResultStoreConfig {
 // It sends results in a non-blocking way, following a fire-and-forget approach. 
type WebhookResultStore struct {
 	client             *http.Client
-	defaultEndpoint    string
+	defaultEndpoint    string // Can be empty when only using dynamic config
 	allowDynamicConfig bool
 }
 
@@ -50,8 +50,12 @@ func NewWebhookResultStore(ctx context.Context, endpoint string) *WebhookResultS
 
 	allowDynamic := u.Query().Get("enable_dynamic_config") != ""
 
-	u.RawQuery = ""
-	cleanEndpoint := u.String()
+	// If dynamic config is enabled, we don't require a default endpoint
+	var cleanEndpoint string
+	if u.Host != "" {
+		u.RawQuery = ""
+		cleanEndpoint = u.String()
+	}
 
 	return &WebhookResultStore{
 		client:             CreateRetryingHTTPClient(NoAuthFn),
@@ -62,7 +66,7 @@ func NewWebhookResultStore(ctx context.Context, endpoint string) *WebhookResultS
 
 // Save implements ResultsStore.Save by sending results to a webhook endpoint
 func (wrs *WebhookResultStore) Save(ctx context.Context, config ResultStoreConfig, run *Run, res *collector.Response) error {
-	endpoint := wrs.defaultEndpoint
+	var endpoint string
 	var webhook *WebhookResultStoreConfig
 
 	if config != nil {
@@ -76,8 +80,13 @@ func (wrs *WebhookResultStore) Save(ctx context.Context, config ResultStoreConfi
 		}
 	}
 
+	// If no dynamic endpoint, fall back to default
+	if endpoint == "" {
+		endpoint = wrs.defaultEndpoint
+	}
+
 	if endpoint == "" {
-		return fmt.Errorf("no webhook endpoint configured")
+		return fmt.Errorf("no webhook endpoint configured - must provide either default endpoint or dynamic configuration")
 	}
 
From fe22d07787843574380fb9a3ae75b553c53a2477 Mon Sep 17 00:00:00 2001
From: Marius Wilms
Date: Mon, 10 Feb 2025 12:10:34 +0100
Subject: [PATCH 31/57] Work on .env

---
 .env.example           | 12 ++++++------
 .env.example.factorial | 14 ++++++++++++++
 2 files changed, 20 insertions(+), 6 deletions(-)
 create mode 100644 .env.example.factorial

diff --git a/.env.example b/.env.example
index b4e9f68..f1f864e 100644
--- a/.env.example
+++ b/.env.example
@@ -1,14 +1,14 @@
 # TOBEY_DEBUG=false
 # TOBEY_SKIP_CACHE=false
 
-TOBEY_PROGRESS_DSN=http://progress:8080
-TOBEY_RESULTS_DSN=webhook://?enable_dynamic_config
-TOBEY_REDIS_DSN=redis:6379/0
+TOBEY_RESULTS_DSN=disk://results
+# TOBEY_PROGRESS_DSN=
+# TOBEY_REDIS_DSN=
 
 # A space separated list of telemetry to send. Available telemetry: metrics,
 # traces, pulse. To disable telemetry provide an empty value. When enabling telemetry
 # appropriate OTLP endpoints must provided as well.
-TOBEY_TELEMETRY="traces"
+# TOBEY_TELEMETRY=
 
-OTEL_EXPORTER_OTLP_TRACES_ENDPOINT=http://jaeger:4318/v1/traces
-OTEL_EXPORTER_OTLP_METRICS_ENDPOINT=
+# OTEL_EXPORTER_OTLP_TRACES_ENDPOINT=
+# OTEL_EXPORTER_OTLP_METRICS_ENDPOINT=
diff --git a/.env.example.factorial b/.env.example.factorial
new file mode 100644
index 0000000..aeb5c86
--- /dev/null
+++ b/.env.example.factorial
@@ -0,0 +1,14 @@
+# TOBEY_DEBUG=false
+# TOBEY_SKIP_CACHE=false
+
+TOBEY_PROGRESS_DSN=factorial://progress:8080
+TOBEY_RESULTS_DSN=webhook://?enable_dynamic_config
+TOBEY_REDIS_DSN=redis:6379/0
+
+# A space separated list of telemetry to send. Available telemetry: metrics,
+# traces, pulse. To disable telemetry provide an empty value. When enabling telemetry
+# appropriate OTLP endpoints must be provided as well. 
+TOBEY_TELEMETRY="traces" + +OTEL_EXPORTER_OTLP_TRACES_ENDPOINT=http://jaeger:4318/v1/traces +OTEL_EXPORTER_OTLP_METRICS_ENDPOINT= From 27d7cb696cb7eab42ff32ec0a171cd501eefdbde Mon Sep 17 00:00:00 2001 From: Marius Wilms Date: Mon, 10 Feb 2025 12:28:05 +0100 Subject: [PATCH 32/57] Move service specific in progress, allow to pass run metadata. --- progress.go | 42 ++++++++++++++++++++++++------------------ progress_factorial.go | 39 +++++++++++++++++++++++++++++++++++++-- progress_noop.go | 10 +++++++--- run.go | 2 +- visitworker.go | 3 +-- 5 files changed, 70 insertions(+), 26 deletions(-) diff --git a/progress.go b/progress.go index 0148359..6fbaf23 100644 --- a/progress.go +++ b/progress.go @@ -12,17 +12,17 @@ import ( "net/url" ) -// ProgressStatus represents the valid states for progress updates -type ProgressStatus string +// ProgressStatus represents the valid states for progress updates. +type ProgressStatus int // Constants to be used for indicating what state the progress is in. const ( - ProgressStateQueuedForCrawling ProgressStatus = "queued_for_crawling" // Used when an URL has been enqueued, see collector.Collector.EnqueueFn. - ProgressStateCrawling ProgressStatus = "crawling" // Used when actively crawling an URL, i.e. right before collector.Collector.Visit. - ProgressStateCrawled ProgressStatus = "crawled" - ProgressStateSucceeded ProgressStatus = "succeeded" // When crawling has been successful. - ProgressStateErrored ProgressStatus = "errored" - ProgressStateCancelled ProgressStatus = "cancelled" + ProgressStateQueuedForCrawling ProgressStatus = iota // Used when an URL has been enqueued, see collector.Collector.EnqueueFn. + ProgressStateCrawling // Used when actively crawling an URL, i.e. right before collector.Collector.Visit. + ProgressStateCrawled // Used when a URL has been crawled. + ProgressStateSucceeded // When crawling has been successful. + ProgressStateErrored // When crawling has failed. + ProgressStateCancelled // When crawling has been cancelled. ) // CreateProgress creates a new progress dispatcher based on the provided DSN. @@ -53,31 +53,37 @@ func CreateProgress(dsn string) (ProgressDispatcher, error) { } type ProgressDispatcher interface { - With(run string, url string) *Progressor + With(run *Run, url string) *Progressor Call(ctx context.Context, msg ProgressUpdate) error // Usually only called by the Progressor. } +// Progressor is a helper struct that is used to update the progress of a run. +// It is returned by the With method of the ProgressDispatcher. It allows for +// cleaner code when updating the progress of a run, multiple times in the same +// function. 
type Progressor struct { dispatcher ProgressDispatcher stage string - Run string + Run *Run URL string } type ProgressUpdate struct { - Stage string `json:"stage"` - Status ProgressStatus `json:"status"` - Run string `json:"run_uuid"` // uuid of the run - URL string `json:"url"` + Stage string + Status ProgressStatus + Run string + URL string + Metadata interface{} } // Update updates the progress with a new status func (p *Progressor) Update(ctx context.Context, status ProgressStatus) error { return p.dispatcher.Call(ctx, ProgressUpdate{ - Stage: p.stage, - Run: p.Run, - URL: p.URL, - Status: status, + Stage: p.stage, + Run: p.Run.ID, + URL: p.URL, + Status: status, + Metadata: p.Run.Metadata, }) } diff --git a/progress_factorial.go b/progress_factorial.go index 26a667f..2fa231e 100644 --- a/progress_factorial.go +++ b/progress_factorial.go @@ -21,12 +21,40 @@ const ( // FactorialProgressEndpointTransition = "api/status/transition-to" // Not yet implemented. ) +// factorialProgressStatus maps internal ProgressStatus to Factorial API string representations. +func factorialProgressStatus(status ProgressStatus) string { + switch status { + case ProgressStateQueuedForCrawling: + return "queued_for_crawling" + case ProgressStateCrawling: + return "crawling" + case ProgressStateCrawled: + return "crawled" + case ProgressStateSucceeded: + return "succeeded" + case ProgressStateErrored: + return "errored" + case ProgressStateCancelled: + return "cancelled" + default: + return "unknown" + } +} + +type FactorialProgressUpdatePayload struct { + Stage string `json:"stage"` + Status string `json:"status"` // Changed to string since we're using string representations + Run string `json:"run_uuid"` + URL string `json:"url"` + // FIXME: If the service starts supporting accepting run metadata, we can add it here. +} + // FactorialProgressServiceDispatcher is a dispatcher for the Factorial progress service. 
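+// It sends each update as a JSON POST to the service in a fire-and-forget
+// fashion, using the shared retrying HTTP client.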
type FactorialProgressServiceDispatcher struct {
 	client *http.Client
 }
 
-func (p *FactorialProgressServiceDispatcher) With(run string, url string) *Progressor {
+func (p *FactorialProgressServiceDispatcher) With(run *Run, url string) *Progressor {
 	return &Progressor{
 		dispatcher: p,
 		stage:      FactorialProgressServiceDefaultStage,
@@ -43,7 +71,14 @@ func (p *FactorialProgressServiceDispatcher) Call(ctx context.Context, pu Progre
 	ctx, span := tracer.Start(ctx, "output.progress.send")
 	defer span.End()
 
-	payload := pu
+	// Convert generic ProgressUpdate to Factorial-specific payload
+	payload := FactorialProgressUpdatePayload{
+		Stage:  pu.Stage,
+		Status: factorialProgressStatus(pu.Status),
+		Run:    pu.Run,
+		URL:    pu.URL,
+	}
+
 	body, err := json.Marshal(payload)
 	if err != nil {
 		span.RecordError(err)
diff --git a/progress_noop.go b/progress_noop.go
index 8f1e78e..1674024 100644
--- a/progress_noop.go
+++ b/progress_noop.go
@@ -10,10 +10,14 @@ import "context"
 type NoopProgressDispatcher struct {
 }
 
-func (p *NoopProgressDispatcher) With(run string, url string) *Progressor {
-	return &Progressor{dispatcher: p}
+func (p *NoopProgressDispatcher) With(run *Run, url string) *Progressor {
+	return &Progressor{
+		dispatcher: p,
+		Run:        run,
+		URL:        url,
+	}
 }
 
 func (p *NoopProgressDispatcher) Call(ctx context.Context, pu ProgressUpdate) error {
 	return nil
-}
\ No newline at end of file
+}
diff --git a/run.go b/run.go
index 4b9287c..0f86539 100644
--- a/run.go
+++ b/run.go
@@ -92,7 +92,7 @@ func (r *Run) GetCollector(ctx context.Context, q ctrlq.VisitWorkQueue, p Progre
 		tctx, span := tracer.Start(ctx, "enqueue_element")
 		defer span.End()
 
-		p := progress.With(run.ID, url)
+		p := progress.With(run, url)
 
 		span.SetAttributes(attribute.String("URL", url))
 		// Ensure we never publish a URL twice for a single run. Not only does
diff --git a/visitworker.go b/visitworker.go
index dbb8e35..cf5b161 100644
--- a/visitworker.go
+++ b/visitworker.go
@@ -88,8 +88,6 @@ func VisitWorker(
 		}
 		jlogger := wlogger.With("run", job.Run, "url", job.URL, "job.id", job.ID)
 
-		p := progress.With(job.Run, job.URL)
-
 		jctx, span := tracer.Start(job.Context, "process_visit_job")
 		span.SetAttributes(attribute.String("Url", job.URL))
 		t := trace.WithAttributes(attribute.String("Url", job.URL))
@@ -109,6 +107,7 @@ func VisitWorker(
 		// are not shared by the Manager across tobey instances.
 		r, _ := runs.Get(ctx, job.Run)
 		c := r.GetCollector(ctx, q, progress, rs)
+		p := progress.With(r, job.URL)
 
 		p.Update(jctx, ProgressStateCrawling)
 

From 3478ca80a8f02c386d267517aed2cbfb58f5fe2c Mon Sep 17 00:00:00 2001
From: Marius Wilms
Date: Mon, 10 Feb 2025 12:30:50 +0100
Subject: [PATCH 33/57] Describe Progress Reporting feature

---
 README.md | 10 ++++++++++
 1 file changed, 10 insertions(+)

diff --git a/README.md b/README.md
index 4e3ce36..c7927ac 100644
--- a/README.md
+++ b/README.md
@@ -352,3 +352,13 @@ the metadata of the run; see _Runs_ above for more details.
 }
 ```
 
+## Progress Reporting
+
+Tobey can report progress while it's crawling. This is useful for monitoring the
+progress of a crawl, for debugging, and for determining when a crawl has finished. By default no progress reporting is enabled. Currently, only one progress reporting service is supported: the Factorial Progress Service. It is configured via the `TOBEY_PROGRESS_DSN` environment
+variable. 
+ +```sh +TOBEY_PROGRESS_DSN=factorial://host:port +``` + From 963a30eee2989bd7bdf5e06535a7fce5da957049 Mon Sep 17 00:00:00 2001 From: Marius Wilms Date: Mon, 10 Feb 2025 12:50:35 +0100 Subject: [PATCH 34/57] Continue work on results stores and dyn config --- results.go | 12 +++--------- results_disk.go | 40 +++++++++------------------------------- results_noop.go | 2 +- results_webhook.go | 39 ++++++++++++++++----------------------- 4 files changed, 29 insertions(+), 64 deletions(-) diff --git a/results.go b/results.go index a799300..5023d35 100644 --- a/results.go +++ b/results.go @@ -20,29 +20,23 @@ type Result struct { RequestURL string `json:"request_url"` ResponseBody []byte `json:"response_body"` // Will be base64 encoded when marshalled ResponseStatusCode int `json:"response_status_code"` - Data interface{} `json:"data,omitempty"` // Optional additional data + Metadata interface{} `json:"metadata,omitempty"` // Optional additional data } // NewResult creates a Result from a collector.Response and optional data -func NewResult(run *Run, res *collector.Response, data interface{}) *Result { +func NewResult(run *Run, res *collector.Response) *Result { return &Result{ Run: run.ID, RunMetadata: run.Metadata, RequestURL: res.Request.URL.String(), ResponseBody: res.Body[:], ResponseStatusCode: res.StatusCode, - Data: data, } } // ResultStore defines how crawl results are stored type ResultStore interface { - Save(ctx context.Context, config ResultStoreConfig, run *Run, res *collector.Response) error -} - -// ResultStoreConfig is the base configuration interface that all result store configs must implement -type ResultStoreConfig interface { - Validate() error + Save(ctx context.Context, config any, run *Run, res *collector.Response) error } // CreateResultStore creates a ResultsStore based on the provided DSN diff --git a/results_disk.go b/results_disk.go index 9e15081..eed2677 100644 --- a/results_disk.go +++ b/results_disk.go @@ -20,14 +20,9 @@ type DiskStoreConfig struct { OutputDir string `json:"output_dir"` } -func (c *DiskStoreConfig) Validate() error { - // No validation needed for now, but we could add checks for write permissions, etc. - return nil -} - // DiskResultStore implements ResultsStore by saving results to files on disk type DiskResultStore struct { - defaultConfig DiskStoreConfig + outputDir string } // NewDiskResultStore creates a new DiskResultStore @@ -40,36 +35,19 @@ func NewDiskResultStore(config DiskStoreConfig) (*DiskResultStore, error) { } return &DiskResultStore{ - defaultConfig: config, + outputDir: config.OutputDir, }, nil } -// Save implements ResultStore.Save by writing results to a file -func (drs *DiskResultStore) Save(ctx context.Context, config ResultStoreConfig, run *Run, res *collector.Response) error { +// Save implements ResultStore.Save by writing results to a file. +// +// We accept per-call config in the signature, to satisfy the ResultsStore interface, +// but we don't use it here, as we don't allow dynamic config for this store. 
+func (drs *DiskResultStore) Save(ctx context.Context, config any, run *Run, res *collector.Response) error { logger := slog.With("run", run.ID, "url", res.Request.URL) logger.Debug("DiskResultStore: Saving result to file...") - // Use per-call config if provided, otherwise use default config - outputDir := drs.defaultConfig.OutputDir - var webhookData interface{} - - if config != nil { - if diskConfig, ok := config.(*DiskStoreConfig); ok { - if diskConfig.OutputDir != "" { - outputDir = diskConfig.OutputDir - // Create directory if it doesn't exist - if err := os.MkdirAll(outputDir, 0755); err != nil { - return fmt.Errorf("failed to create output directory: %w", err) - } - } - } - if whConfig, ok := config.(*WebhookResultStoreConfig); ok { - webhookData = whConfig.Data - } - } - - // Create result using run metadata - result := NewResult(run, res, webhookData) + result := NewResult(run, res) // Create a filename based on URL and run ID urlPath := sanitizeFilename(res.Request.URL.Path) @@ -78,7 +56,7 @@ func (drs *DiskResultStore) Save(ctx context.Context, config ResultStoreConfig, } filename := fmt.Sprintf("%s_%s.json", run.ID, urlPath) - filepath := filepath.Join(outputDir, filename) + filepath := filepath.Join(drs.outputDir, filename) jsonData, err := json.MarshalIndent(result, "", " ") if err != nil { diff --git a/results_noop.go b/results_noop.go index 4524f58..775a24b 100644 --- a/results_noop.go +++ b/results_noop.go @@ -14,6 +14,6 @@ import ( type NoopResultStore struct{} // Save implements ResultsStore.Save by discarding the result -func (n *NoopResultStore) Save(ctx context.Context, config ResultStoreConfig, run *Run, res *collector.Response) error { +func (n *NoopResultStore) Save(ctx context.Context, config any, run *Run, res *collector.Response) error { return nil } diff --git a/results_webhook.go b/results_webhook.go index 6b636ae..64e1a01 100644 --- a/results_webhook.go +++ b/results_webhook.go @@ -18,16 +18,7 @@ import ( // WebhookResultStoreConfig defines the configuration for webhook endpoints type WebhookResultStoreConfig struct { - Endpoint string `json:"endpoint"` - Data interface{} `json:"data"` // Accept arbitrary data here. -} - -func (c *WebhookResultStoreConfig) Validate() error { - return nil -} - -func (c *WebhookResultStoreConfig) GetWebhook() *WebhookResultStoreConfig { - return c + Endpoint string `json:"endpoint"` } // WebhookResultStore implements ResultsStore by sending results to a webhook endpoint. @@ -65,26 +56,28 @@ func NewWebhookResultStore(ctx context.Context, endpoint string) *WebhookResultS } // Save implements ResultsStore.Save by sending results to a webhook endpoint -func (wrs *WebhookResultStore) Save(ctx context.Context, config ResultStoreConfig, run *Run, res *collector.Response) error { +func (wrs *WebhookResultStore) Save(ctx context.Context, config any, run *Run, res *collector.Response) error { var endpoint string - var webhook *WebhookResultStoreConfig + var webhook *WebhookResultStoreConfig if config != nil { - if whConfig, ok := config.(*WebhookResultStoreConfig); ok { - webhook = whConfig - if whConfig.Endpoint != "" && wrs.allowDynamicConfig { - endpoint = whConfig.Endpoint - } else if whConfig.Endpoint != "" && !wrs.allowDynamicConfig { - slog.Warn("Dynamic webhook configuration is disabled. 
Ignoring custom endpoint.") - } + var ok bool + webhook, ok = config.(*WebhookResultStoreConfig) + if !ok { + return fmt.Errorf("invalid webhook configuration: %T", config) } } - - // If no dynamic endpoint, fall back to default + if webhook != nil { + if webhook.Endpoint != "" && wrs.allowDynamicConfig { + endpoint = webhook.Endpoint + } else if webhook.Endpoint != "" && !wrs.allowDynamicConfig { + slog.Warn("Dynamic webhook configuration is disabled. Ignoring custom endpoint.") + } + } + // If no dynamic endpoint, fall back to default. if endpoint == "" { endpoint = wrs.defaultEndpoint } - if endpoint == "" { return fmt.Errorf("no webhook endpoint configured - must provide either default endpoint or dynamic configuration") } @@ -96,7 +89,7 @@ func (wrs *WebhookResultStore) Save(ctx context.Context, config ResultStoreConfi defer span.End() // Create result using run metadata - result := NewResult(run, res, webhook.Data) + result := NewResult(run, res) payload := struct { Action string `json:"action"` From 8863d5b4b7bf3ea1a3f576ba6c7b62cc512848c6 Mon Sep 17 00:00:00 2001 From: Marius Wilms Date: Mon, 10 Feb 2025 13:30:19 +0100 Subject: [PATCH 35/57] Optimize disk storage --- results_disk.go | 46 ++++++++++------------------------------------ 1 file changed, 10 insertions(+), 36 deletions(-) diff --git a/results_disk.go b/results_disk.go index eed2677..1b6c367 100644 --- a/results_disk.go +++ b/results_disk.go @@ -7,6 +7,8 @@ package main import ( "context" + "crypto/sha256" + "encoding/hex" "encoding/json" "fmt" "log/slog" @@ -49,53 +51,25 @@ func (drs *DiskResultStore) Save(ctx context.Context, config any, run *Run, res result := NewResult(run, res) - // Create a filename based on URL and run ID - urlPath := sanitizeFilename(res.Request.URL.Path) - if urlPath == "" { - urlPath = "root" + // MkdirAll ignores errors where the directory exists. + runDir := filepath.Join(drs.outputDir, run.ID) + if err := os.MkdirAll(runDir, 0755); err != nil { + return fmt.Errorf("failed to create run directory: %w", err) } - filename := fmt.Sprintf("%s_%s.json", run.ID, urlPath) - filepath := filepath.Join(drs.outputDir, filename) + hash := sha256.New() + hash.Write([]byte(res.Request.URL.String())) + filename := fmt.Sprintf("%s.json", hex.EncodeToString(hash.Sum(nil))) + filepath := filepath.Join(runDir, filename) jsonData, err := json.MarshalIndent(result, "", " ") if err != nil { - logger.Error("DiskResultsStore: Failed to marshal result", "error", err) return fmt.Errorf("failed to marshal result: %w", err) } if err := os.WriteFile(filepath, jsonData, 0644); err != nil { - logger.Error("DiskResultsStore: Failed to write file", "error", err, "path", filepath) return fmt.Errorf("failed to write file: %w", err) } - logger.Debug("DiskResultsStore: Successfully saved result", "path", filepath) return nil } - -// sanitizeFilename creates a safe filename from a URL path -func sanitizeFilename(path string) string { - // Remove leading slash - path = filepath.Clean(path) - if path == "/" || path == "." 
{ - return "" - } - if path[0] == '/' { - path = path[1:] - } - - // Replace remaining slashes with underscores - path = filepath.ToSlash(path) - for i := 0; i < len(path); i++ { - if path[i] == '/' { - path = path[:i] + "_" + path[i+1:] - } - } - - // Limit filename length - if len(path) > 100 { - path = path[:100] - } - - return path -} From 582b97a4d57ba56a56de2dde1ec1dddab5b379c0 Mon Sep 17 00:00:00 2001 From: Marius Wilms Date: Mon, 10 Feb 2025 15:49:44 +0100 Subject: [PATCH 36/57] Rename progress dispatcher to reporter --- main.go | 2 +- progress.go | 30 +++++++++++++++--------------- progress_factorial.go | 34 +++++++++++++++++----------------- progress_noop.go | 17 +++++++++-------- results_webhook.go | 10 +++++----- run.go | 6 +++--- visitworker.go | 4 ++-- 7 files changed, 52 insertions(+), 51 deletions(-) diff --git a/main.go b/main.go index bae98f0..895aef6 100644 --- a/main.go +++ b/main.go @@ -179,7 +179,7 @@ func main() { panic(err) } - progress, err := CreateProgress(os.Getenv("TOBEY_PROGRESS_DSN")) + progress, err := CreateProgressReporter(os.Getenv("TOBEY_PROGRESS_DSN")) if err != nil { panic(err) } diff --git a/progress.go b/progress.go index 6fbaf23..40b122c 100644 --- a/progress.go +++ b/progress.go @@ -25,12 +25,12 @@ const ( ProgressStateCancelled // When crawling has been cancelled. ) -// CreateProgress creates a new progress dispatcher based on the provided DSN. +// CreateProgressReporter creates a new progress dispatcher based on the provided DSN. // If dsn is empty, it returns a NoopProgressDispatcher. -func CreateProgress(dsn string) (ProgressDispatcher, error) { +func CreateProgressReporter(dsn string) (ProgressReporter, error) { if dsn == "" { - slog.Debug("Not sharing progress updates.") - return &NoopProgressDispatcher{}, nil + slog.Debug("Progress Reporting: Disabled, not sharing progress updates.") + return &NoopProgressReporter{}, nil } u, err := url.Parse(dsn) @@ -40,29 +40,29 @@ func CreateProgress(dsn string) (ProgressDispatcher, error) { switch u.Scheme { case "factorial": - slog.Info("Using Factorial progress service for updates.", "dsn", dsn) - return &FactorialProgressServiceDispatcher{ + slog.Info("Progress Reporting: Enabled, using Factorial progress service for updates.", "dsn", dsn) + return &FactorialProgressReporter{ client: CreateRetryingHTTPClient(NoAuthFn), }, nil case "noop": - slog.Debug("Using noop progress dispatcher.") - return &NoopProgressDispatcher{}, nil + slog.Debug("Progress Reporting: Disabled, not sharing progress updates.") + return &NoopProgressReporter{}, nil default: return nil, fmt.Errorf("unsupported progress dispatcher type: %s", u.Scheme) } } -type ProgressDispatcher interface { - With(run *Run, url string) *Progressor +type ProgressReporter interface { + With(run *Run, url string) *Progress Call(ctx context.Context, msg ProgressUpdate) error // Usually only called by the Progressor. } -// Progressor is a helper struct that is used to update the progress of a run. +// Progress is a helper struct that is used to update the progress of a run. // It is returned by the With method of the ProgressDispatcher. It allows for // cleaner code when updating the progress of a run, multiple times in the same // function. 
-type Progressor struct { - dispatcher ProgressDispatcher +type Progress struct { + reporter ProgressReporter stage string Run *Run @@ -78,8 +78,8 @@ type ProgressUpdate struct { } // Update updates the progress with a new status -func (p *Progressor) Update(ctx context.Context, status ProgressStatus) error { - return p.dispatcher.Call(ctx, ProgressUpdate{ +func (p *Progress) Update(ctx context.Context, status ProgressStatus) error { + return p.reporter.Call(ctx, ProgressUpdate{ Stage: p.stage, Run: p.Run.ID, URL: p.URL, diff --git a/progress_factorial.go b/progress_factorial.go index 2fa231e..9a3326a 100644 --- a/progress_factorial.go +++ b/progress_factorial.go @@ -21,6 +21,14 @@ const ( // FactorialProgressEndpointTransition = "api/status/transition-to" // Not yet implemented. ) +type FactorialProgressUpdatePayload struct { + Stage string `json:"stage"` + Status string `json:"status"` // Changed to string since we're using string representations + Run string `json:"run_uuid"` + URL string `json:"url"` + // FIXME: If the service starts supporting accepting run metadata, we can add it here. +} + // factorialProgressStatus maps internal ProgressStatus to Factorial API string representations. func factorialProgressStatus(status ProgressStatus) string { switch status { @@ -41,30 +49,22 @@ func factorialProgressStatus(status ProgressStatus) string { } } -type FactorialProgressUpdatePayload struct { - Stage string `json:"stage"` - Status string `json:"status"` // Changed to string since we're using string representations - Run string `json:"run_uuid"` - URL string `json:"url"` - // FIXME: If the service starts supporting accepting run metadata, we can add it here. -} - -// FactorialProgressServiceDispatcher is a dispatcher for the Factorial progress service. -type FactorialProgressServiceDispatcher struct { +// FactorialProgressReporter is a reporter for the Factorial progress service. +type FactorialProgressReporter struct { client *http.Client } -func (p *FactorialProgressServiceDispatcher) With(run *Run, url string) *Progressor { - return &Progressor{ - dispatcher: p, - stage: FactorialProgressServiceDefaultStage, - Run: run, - URL: url, +func (p *FactorialProgressReporter) With(run *Run, url string) *Progress { + return &Progress{ + reporter: p, + stage: FactorialProgressServiceDefaultStage, + Run: run, + URL: url, } } // Call sends the progress update over the wire, it implements a fire and forget approach. -func (p *FactorialProgressServiceDispatcher) Call(ctx context.Context, pu ProgressUpdate) error { +func (p *FactorialProgressReporter) Call(ctx context.Context, pu ProgressUpdate) error { logger := slog.With("run", pu.Run, "url", pu.URL) logger.Debug("Progress Dispatcher: Sending update...") diff --git a/progress_noop.go b/progress_noop.go index 1674024..e221815 100644 --- a/progress_noop.go +++ b/progress_noop.go @@ -7,17 +7,18 @@ package main import "context" -type NoopProgressDispatcher struct { -} +// NoopProgressReporter is a no-op implementation of the ProgressReporter interface, it +// is used as the default when no progress reporting is configured. 
+type NoopProgressReporter struct{} -func (p *NoopProgressDispatcher) With(run *Run, url string) *Progressor { - return &Progressor{ - dispatcher: p, - Run: run, - URL: url, +func (p *NoopProgressReporter) With(run *Run, url string) *Progress { + return &Progress{ + reporter: p, + Run: run, + URL: url, } } -func (p *NoopProgressDispatcher) Call(ctx context.Context, pu ProgressUpdate) error { +func (p *NoopProgressReporter) Call(ctx context.Context, pu ProgressUpdate) error { return nil } diff --git a/results_webhook.go b/results_webhook.go index 64e1a01..5470d12 100644 --- a/results_webhook.go +++ b/results_webhook.go @@ -39,8 +39,6 @@ func NewWebhookResultStore(ctx context.Context, endpoint string) *WebhookResultS } } - allowDynamic := u.Query().Get("enable_dynamic_config") != "" - // If dynamic config is enabled, we don't require a default endpoint var cleanEndpoint string if u.Host != "" { @@ -49,9 +47,11 @@ func NewWebhookResultStore(ctx context.Context, endpoint string) *WebhookResultS } return &WebhookResultStore{ - client: CreateRetryingHTTPClient(NoAuthFn), - defaultEndpoint: cleanEndpoint, - allowDynamicConfig: allowDynamic, + client: CreateRetryingHTTPClient(NoAuthFn), + defaultEndpoint: cleanEndpoint, + // Presence of the query parameter is sufficient to enable dynamic config. This is, + // so we don't need to check what counts as boolean true, i.e. "true", "1", "yes", etc. + allowDynamicConfig: u.Query().Get("enable_dynamic_config") != "", } } diff --git a/run.go b/run.go index 0f86539..616ea18 100644 --- a/run.go +++ b/run.go @@ -80,11 +80,11 @@ func (r *Run) getAuthFn() GetAuthFn { } } -func (r *Run) GetCollector(ctx context.Context, q ctrlq.VisitWorkQueue, p ProgressDispatcher, rs ResultStore) *collector.Collector { +func (r *Run) GetCollector(ctx context.Context, q ctrlq.VisitWorkQueue, p ProgressReporter, rs ResultStore) *collector.Collector { // getEnqueueFn returns the enqueue function, that will enqueue a single URL to // be crawled. The enqueue function is called whenever a new URL is discovered // by that Collector, i.e. by looking at all links in a crawled page HTML. - getEnqueueFn := func(run *Run, q ctrlq.VisitWorkQueue, progress ProgressDispatcher) collector.EnqueueFn { + getEnqueueFn := func(run *Run, q ctrlq.VisitWorkQueue, progress ProgressReporter) collector.EnqueueFn { // The returned function takes the run context. return func(ctx context.Context, c *collector.Collector, url string) error { @@ -181,7 +181,7 @@ func (r *Run) GetCollector(ctx context.Context, q ctrlq.VisitWorkQueue, p Progre // Start starts the crawl with the given URLs. It will discover sitemaps and // enqueue the URLs. From there on more URLs will be discovered and enqueued. 
-func (r *Run) Start(ctx context.Context, q ctrlq.VisitWorkQueue, p ProgressDispatcher, rs ResultStore, urls []string) { +func (r *Run) Start(ctx context.Context, q ctrlq.VisitWorkQueue, p ProgressReporter, rs ResultStore, urls []string) { c := r.GetCollector(ctx, q, p, rs) // Decide where the initial URLs should go, users may provide sitemaps and diff --git a/visitworker.go b/visitworker.go index cf5b161..701cfe9 100644 --- a/visitworker.go +++ b/visitworker.go @@ -32,7 +32,7 @@ func CreateVisitWorkersPool( num int, runs *RunManager, q ctrlq.VisitWorkQueue, - progress ProgressDispatcher, + progress ProgressReporter, rs ResultStore, ) *sync.WaitGroup { var wg sync.WaitGroup @@ -59,7 +59,7 @@ func VisitWorker( id int, runs *RunManager, q ctrlq.VisitWorkQueue, - progress ProgressDispatcher, + progress ProgressReporter, rs ResultStore, ) error { wlogger := slog.With("worker.id", id) From 9b4d1f2f75467a4b663f891524b96ee687893adb Mon Sep 17 00:00:00 2001 From: Marius Wilms Date: Mon, 10 Feb 2025 16:00:07 +0100 Subject: [PATCH 37/57] Refactor results store --- results.go | 31 ++++--------------------------- results_disk.go | 28 +++++++++++++++++++++++----- results_webhook.go | 21 +++++++++++++++++---- 3 files changed, 44 insertions(+), 36 deletions(-) diff --git a/results.go b/results.go index 5023d35..d22936b 100644 --- a/results.go +++ b/results.go @@ -13,33 +13,6 @@ import ( "tobey/internal/collector" ) -// Result represents a crawl result that can be stored by any ResultsStore implementation -type Result struct { - Run string `json:"run_uuid"` - RunMetadata interface{} `json:"run_metadata,omitempty"` - RequestURL string `json:"request_url"` - ResponseBody []byte `json:"response_body"` // Will be base64 encoded when marshalled - ResponseStatusCode int `json:"response_status_code"` - Metadata interface{} `json:"metadata,omitempty"` // Optional additional data -} - -// NewResult creates a Result from a collector.Response and optional data -func NewResult(run *Run, res *collector.Response) *Result { - return &Result{ - Run: run.ID, - RunMetadata: run.Metadata, - RequestURL: res.Request.URL.String(), - ResponseBody: res.Body[:], - ResponseStatusCode: res.StatusCode, - } -} - -// ResultStore defines how crawl results are stored -type ResultStore interface { - Save(ctx context.Context, config any, run *Run, res *collector.Response) error -} - -// CreateResultStore creates a ResultsStore based on the provided DSN func CreateResultStore(dsn string) (ResultStore, error) { if dsn == "" { return &NoopResultStore{}, nil @@ -77,3 +50,7 @@ func CreateResultStore(dsn string) (ResultStore, error) { return nil, fmt.Errorf("unsupported results store type: %s", u.Scheme) } } + +type ResultStore interface { + Save(ctx context.Context, config any, run *Run, res *collector.Response) error +} diff --git a/results_disk.go b/results_disk.go index 1b6c367..f0e197b 100644 --- a/results_disk.go +++ b/results_disk.go @@ -17,19 +17,32 @@ import ( "tobey/internal/collector" ) -// DiskStoreConfig holds configuration for DiskResultsStore type DiskStoreConfig struct { OutputDir string `json:"output_dir"` } -// DiskResultStore implements ResultsStore by saving results to files on disk +// DiskResultStore stores results on disk as JSON files. Results are grouped by run +// in a run specific directory. The directory structure is as follows: +// +// / +// / +// .json +// +// The is the SHA-256 hash of the request URL, encoded as a hex string. +// The JSON file contains the result as a JSON object. 
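+// For example, a result for a URL fetched in run 0033085c-... is written to
+// <output_dir>/0033085c-.../<hex-encoded SHA-256 of the request URL>.json.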
type DiskResultStore struct { outputDir string } -// NewDiskResultStore creates a new DiskResultStore +type DiskResult struct { + Run string `json:"run_uuid"` + RunMetadata interface{} `json:"run_metadata,omitempty"` + RequestURL string `json:"request_url"` + ResponseBody []byte `json:"response_body"` // Will be base64 encoded when JSON marshalled. + ResponseStatusCode int `json:"response_status_code"` +} + func NewDiskResultStore(config DiskStoreConfig) (*DiskResultStore, error) { - // Create default output directory if it doesn't exist if config.OutputDir != "" { if err := os.MkdirAll(config.OutputDir, 0755); err != nil { return nil, fmt.Errorf("failed to create output directory: %w", err) @@ -49,7 +62,12 @@ func (drs *DiskResultStore) Save(ctx context.Context, config any, run *Run, res logger := slog.With("run", run.ID, "url", res.Request.URL) logger.Debug("DiskResultStore: Saving result to file...") - result := NewResult(run, res) + result := &DiskResult{ + Run: run.ID, + RequestURL: res.Request.URL.String(), + ResponseBody: res.Body[:], + ResponseStatusCode: res.StatusCode, + } // MkdirAll ignores errors where the directory exists. runDir := filepath.Join(drs.outputDir, run.ID) diff --git a/results_webhook.go b/results_webhook.go index 5470d12..87d4039 100644 --- a/results_webhook.go +++ b/results_webhook.go @@ -29,6 +29,14 @@ type WebhookResultStore struct { allowDynamicConfig bool } +type WebhookResult struct { + Run string `json:"run_uuid"` + RunMetadata interface{} `json:"run_metadata,omitempty"` + RequestURL string `json:"request_url"` + ResponseBody []byte `json:"response_body"` // Will be base64 encoded when JSON marshalled. + ResponseStatusCode int `json:"response_status_code"` +} + func NewWebhookResultStore(ctx context.Context, endpoint string) *WebhookResultStore { u, err := url.Parse(endpoint) if err != nil { @@ -89,14 +97,19 @@ func (wrs *WebhookResultStore) Save(ctx context.Context, config any, run *Run, r defer span.End() // Create result using run metadata - result := NewResult(run, res) + result := &WebhookResult{ + Run: run.ID, + RequestURL: res.Request.URL.String(), + ResponseBody: res.Body[:], + ResponseStatusCode: res.StatusCode, + } payload := struct { Action string `json:"action"` - *Result + *WebhookResult }{ - Action: "collector.response", - Result: result, + Action: "collector.response", + WebhookResult: result, } body, err := json.Marshal(payload) From b65dadf77852e54f5203c2fbaa661b875100a792 Mon Sep 17 00:00:00 2001 From: Marius Wilms Date: Mon, 10 Feb 2025 16:04:28 +0100 Subject: [PATCH 38/57] Rename result store to result reporter --- api.go | 2 +- main.go | 2 +- results.go | 24 ++++++++++++++++-------- results_disk.go | 20 ++++++++++---------- results_noop.go | 8 ++++---- results_webhook.go | 33 ++++++++++++++++++--------------- run.go | 10 +++++----- visitworker.go | 6 +++--- 8 files changed, 58 insertions(+), 47 deletions(-) diff --git a/api.go b/api.go index 745a07f..cad8c9b 100644 --- a/api.go +++ b/api.go @@ -70,7 +70,7 @@ type APIRequest struct { AllowPaths []string `json:"paths"` DenyPaths []string `json:"!paths"` - WebhookResultStoreConfig *WebhookResultStoreConfig `json:"webhook"` + WebhookResultStoreConfig *WebhookResultReporterConfig `json:"webhook"` // If true, we'll bypass the robots.txt check, however we'll still // download the file to look for sitemaps. 
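The remaining hunks of this patch only adapt call sites to the renamed `ResultReporter` interface. As a reading aid, here is a minimal sketch of such a call site; `handleResult` is a hypothetical name, while `Accept`, `Run.WebhookConfig`, and `collector.Response` follow the definitions in this series:

```go
// Hypothetical glue, not part of this patch: forwards the per-run webhook
// config to a reporter. A nil config makes the reporter fall back to its
// default endpoint; a custom endpoint is only honored when the reporter
// was created with enable_dynamic_config.
func handleResult(ctx context.Context, rs ResultReporter, run *Run, res *collector.Response) error {
	return rs.Accept(ctx, run.WebhookConfig, run, res)
}
```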
diff --git a/main.go b/main.go index 895aef6..2734896 100644 --- a/main.go +++ b/main.go @@ -174,7 +174,7 @@ func main() { tear(queue.Close) } - rs, err := CreateResultStore(os.Getenv("TOBEY_RESULTS_DSN")) + rs, err := CreateResultReporter(os.Getenv("TOBEY_RESULTS_DSN")) if err != nil { panic(err) } diff --git a/results.go b/results.go index d22936b..aa2f39d 100644 --- a/results.go +++ b/results.go @@ -8,14 +8,16 @@ package main import ( "context" "fmt" + "log/slog" "net/url" "runtime" "tobey/internal/collector" ) -func CreateResultStore(dsn string) (ResultStore, error) { +func CreateResultReporter(dsn string) (ResultReporter, error) { if dsn == "" { - return &NoopResultStore{}, nil + slog.Debug("Result Reporter: Disabled, using noop reporter") + return &NoopResultReporter{}, nil } u, err := url.Parse(dsn) @@ -29,13 +31,15 @@ func CreateResultStore(dsn string) (ResultStore, error) { if runtime.GOOS == "windows" && len(path) > 0 && path[0] == '/' { path = path[1:] // Remove leading slash on Windows } - config := DiskStoreConfig{ + config := DiskResultReporterConfig{ OutputDir: path, } - store, err := NewDiskResultStore(config) + store, err := NewDiskResultReporter(config) if err != nil { return nil, fmt.Errorf("failed to create disk store: %w", err) } + + slog.Debug("Result Reporter: Enabled, using disk store", "dsn", dsn) return store, nil case "webhook": // Only require host if dynamic config is not enabled @@ -43,14 +47,18 @@ func CreateResultStore(dsn string) (ResultStore, error) { return nil, fmt.Errorf("webhook results store requires a valid host (e.g., webhook://example.com/results) unless dynamic configuration is enabled") } endpoint := fmt.Sprintf("%s://%s%s?%s", "https", u.Host, u.Path, u.RawQuery) - return NewWebhookResultStore(context.Background(), endpoint), nil + + slog.Debug("Result Reporter: Enabled, using webhook reporter", "dsn", dsn) + return NewWebhookResultReporter(context.Background(), endpoint), nil case "noop": - return &NoopResultStore{}, nil + + slog.Debug("Result Reporter: Disabled, using noop reporter") + return &NoopResultReporter{}, nil default: return nil, fmt.Errorf("unsupported results store type: %s", u.Scheme) } } -type ResultStore interface { - Save(ctx context.Context, config any, run *Run, res *collector.Response) error +type ResultReporter interface { + Accept(ctx context.Context, config any, run *Run, res *collector.Response) error } diff --git a/results_disk.go b/results_disk.go index f0e197b..02565b2 100644 --- a/results_disk.go +++ b/results_disk.go @@ -17,11 +17,11 @@ import ( "tobey/internal/collector" ) -type DiskStoreConfig struct { +type DiskResultReporterConfig struct { OutputDir string `json:"output_dir"` } -// DiskResultStore stores results on disk as JSON files. Results are grouped by run +// DiskResultReporter stores results on disk as JSON files. Results are grouped by run // in a run specific directory. The directory structure is as follows: // // / @@ -30,7 +30,7 @@ type DiskStoreConfig struct { // // The is the SHA-256 hash of the request URL, encoded as a hex string. // The JSON file contains the result as a JSON object. 
-type DiskResultStore struct { +type DiskResultReporter struct { outputDir string } @@ -42,25 +42,25 @@ type DiskResult struct { ResponseStatusCode int `json:"response_status_code"` } -func NewDiskResultStore(config DiskStoreConfig) (*DiskResultStore, error) { +func NewDiskResultReporter(config DiskResultReporterConfig) (*DiskResultReporter, error) { if config.OutputDir != "" { if err := os.MkdirAll(config.OutputDir, 0755); err != nil { return nil, fmt.Errorf("failed to create output directory: %w", err) } } - return &DiskResultStore{ + return &DiskResultReporter{ outputDir: config.OutputDir, }, nil } -// Save implements ResultStore.Save by writing results to a file. +// Accept implements ResultStore.Accept by writing results to a file. // -// We accept per-call config in the signature, to satisfy the ResultsStore interface, -// but we don't use it here, as we don't allow dynamic config for this store. -func (drs *DiskResultStore) Save(ctx context.Context, config any, run *Run, res *collector.Response) error { +// We accept per-call config in the signature, to satisfy the ResultReporter interface, +// but we don't use it here, as we don't allow dynamic config for this reporter. +func (drs *DiskResultReporter) Accept(ctx context.Context, config any, run *Run, res *collector.Response) error { logger := slog.With("run", run.ID, "url", res.Request.URL) - logger.Debug("DiskResultStore: Saving result to file...") + logger.Debug("DiskResultReporter: Saving result to file...") result := &DiskResult{ Run: run.ID, diff --git a/results_noop.go b/results_noop.go index 775a24b..881421b 100644 --- a/results_noop.go +++ b/results_noop.go @@ -10,10 +10,10 @@ import ( "tobey/internal/collector" ) -// NoopResultStore implements ResultsStore but discards all results -type NoopResultStore struct{} +// NoopResultReporter implements ResultsStore but discards all results +type NoopResultReporter struct{} -// Save implements ResultsStore.Save by discarding the result -func (n *NoopResultStore) Save(ctx context.Context, config any, run *Run, res *collector.Response) error { +// Accept implements ResultsStore.Accept by discarding the result +func (n *NoopResultReporter) Accept(ctx context.Context, config any, run *Run, res *collector.Response) error { return nil } diff --git a/results_webhook.go b/results_webhook.go index 87d4039..39086a7 100644 --- a/results_webhook.go +++ b/results_webhook.go @@ -16,16 +16,18 @@ import ( "tobey/internal/collector" ) -// WebhookResultStoreConfig defines the configuration for webhook endpoints -type WebhookResultStoreConfig struct { +// WebhookResultReporterConfig defines the configuration for webhook endpoints +type WebhookResultReporterConfig struct { Endpoint string `json:"endpoint"` } -// WebhookResultStore implements ResultsStore by sending results to a webhook endpoint. +// WebhookResultReporter implements ResultsStore by sending results to a webhook endpoint. // It sends results in a non-blocking way, following a fire-and-forget approach. -type WebhookResultStore struct { - client *http.Client - defaultEndpoint string // Can be empty when only using dynamic config +type WebhookResultReporter struct { + client *http.Client + // defaultEndppoint may be empty when always using dynamic config. It may be overriden + // on a per-call basis when allowDynamicConfig is true. 
+	defaultEndpoint    string
 	allowDynamicConfig bool
 }
 
@@ -37,25 +39,26 @@ type WebhookResult struct {
 	ResponseStatusCode int    `json:"response_status_code"`
 }
 
-func NewWebhookResultStore(ctx context.Context, endpoint string) *WebhookResultStore {
+func NewWebhookResultReporter(ctx context.Context, endpoint string) *WebhookResultReporter {
 	u, err := url.Parse(endpoint)
 	if err != nil {
-		return &WebhookResultStore{
+		return &WebhookResultReporter{
 			client:             CreateRetryingHTTPClient(NoAuthFn),
 			defaultEndpoint:    endpoint,
 			allowDynamicConfig: false,
 		}
 	}
 
-	// If dynamic config is enabled, we don't require a default endpoint
+	// If dynamic config is enabled, we don't require a default endpoint.
 	var cleanEndpoint string
 	if u.Host != "" {
 		u.RawQuery = ""
 		cleanEndpoint = u.String()
 	}
 
-	return &WebhookResultStore{
-		client: CreateRetryingHTTPClient(NoAuthFn),
+	return &WebhookResultReporter{
+		client:          CreateRetryingHTTPClient(NoAuthFn),
+		defaultEndpoint: cleanEndpoint,
 
 		// Presence of the query parameter is sufficient to enable dynamic config. This is,
 		// so we don't need to check what counts as boolean true, i.e. "true", "1", "yes", etc.
@@ -63,14 +66,14 @@ func NewWebhookResultStore(ctx context.Context, endpoint string) *WebhookResultS
 	}
 }
 
-// Save implements ResultsStore.Save by sending results to a webhook endpoint
-func (wrs *WebhookResultStore) Save(ctx context.Context, config any, run *Run, res *collector.Response) error {
+// Accept implements ResultReporter.Accept by sending results to a webhook endpoint
+func (wrs *WebhookResultReporter) Accept(ctx context.Context, config any, run *Run, res *collector.Response) error {
 	var endpoint string
 
-	var webhook *WebhookResultStoreConfig
+	var webhook *WebhookResultReporterConfig
 	if config != nil {
 		var ok bool
-		webhook, ok = config.(*WebhookResultStoreConfig)
+		webhook, ok = config.(*WebhookResultReporterConfig)
 		if !ok {
 			return fmt.Errorf("invalid webhook configuration: %T", config)
 		}
diff --git a/run.go b/run.go
index 616ea18..65e3724 100644
--- a/run.go
+++ b/run.go
@@ -48,7 +48,7 @@ type SerializableRun struct {
 	SkipRobots           bool
 	SkipSitemapDiscovery bool
 
-	WebhookConfig *WebhookResultStoreConfig
+	WebhookConfig *WebhookResultReporterConfig
 }
 
 // LiveRun is a live version of the Run struct. It contains data that should not
@@ -80,7 +80,7 @@ func (r *Run) getAuthFn() GetAuthFn {
 	}
 }
 
-func (r *Run) GetCollector(ctx context.Context, q ctrlq.VisitWorkQueue, p ProgressReporter, rs ResultStore) *collector.Collector {
+func (r *Run) GetCollector(ctx context.Context, q ctrlq.VisitWorkQueue, p ProgressReporter, rs ResultReporter) *collector.Collector {
 	// getEnqueueFn returns the enqueue function, that will enqueue a single URL to
 	// be crawled. The enqueue function is called whenever a new URL is discovered
 	// by that Collector, i.e. by looking at all links in a crawled page HTML.
@@ -138,7 +138,7 @@ func (r *Run) GetCollector(ctx context.Context, q ctrlq.VisitWorkQueue, p Progre
 	// getCollectFn returns the collect function that is called once we have a
 	// result. Uses the information provided in the original crawl request, i.e. the
 	// WebhookConfig, that we have received via the queued message.
-	getCollectFn := func(run *Run, rs ResultStore) collector.CollectFn {
+	getCollectFn := func(run *Run, rs ResultReporter) collector.CollectFn {
 		// The returned function takes the run context.
return func(ctx context.Context, c *collector.Collector, res *collector.Response) { @@ -150,7 +150,7 @@ func (r *Run) GetCollector(ctx context.Context, q ctrlq.VisitWorkQueue, p Progre "response.status", res.StatusCode, ) if run.WebhookConfig != nil && run.WebhookConfig.Endpoint != "" { - rs.Save(ctx, run.WebhookConfig, run, res) + rs.Accept(ctx, run.WebhookConfig, run, res) } } } @@ -181,7 +181,7 @@ func (r *Run) GetCollector(ctx context.Context, q ctrlq.VisitWorkQueue, p Progre // Start starts the crawl with the given URLs. It will discover sitemaps and // enqueue the URLs. From there on more URLs will be discovered and enqueued. -func (r *Run) Start(ctx context.Context, q ctrlq.VisitWorkQueue, p ProgressReporter, rs ResultStore, urls []string) { +func (r *Run) Start(ctx context.Context, q ctrlq.VisitWorkQueue, p ProgressReporter, rs ResultReporter, urls []string) { c := r.GetCollector(ctx, q, p, rs) // Decide where the initial URLs should go, users may provide sitemaps and diff --git a/visitworker.go b/visitworker.go index 701cfe9..ae82c3d 100644 --- a/visitworker.go +++ b/visitworker.go @@ -33,7 +33,7 @@ func CreateVisitWorkersPool( runs *RunManager, q ctrlq.VisitWorkQueue, progress ProgressReporter, - rs ResultStore, + rs ResultReporter, ) *sync.WaitGroup { var wg sync.WaitGroup @@ -60,7 +60,7 @@ func VisitWorker( runs *RunManager, q ctrlq.VisitWorkQueue, progress ProgressReporter, - rs ResultStore, + rs ResultReporter, ) error { wlogger := slog.With("worker.id", id) wlogger.Debug("Visitor: Starting...") @@ -134,7 +134,7 @@ func VisitWorker( span.AddEvent("Visitor: Visited URL.", t) if r.WebhookConfig != nil { - rs.Save(jctx, r.WebhookConfig, r, res) + rs.Accept(jctx, r.WebhookConfig, r, res) } span.End() From c6f627a60ee2d2ec20a8c0c1bd77b78046cb0be3 Mon Sep 17 00:00:00 2001 From: Marius Wilms Date: Mon, 10 Feb 2025 16:07:54 +0100 Subject: [PATCH 39/57] Prepare otel for refactor --- getenv.go | 67 ------------------------------------------------------- otel.go | 57 ++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 57 insertions(+), 67 deletions(-) delete mode 100644 getenv.go diff --git a/getenv.go b/getenv.go deleted file mode 100644 index 197702e..0000000 --- a/getenv.go +++ /dev/null @@ -1,67 +0,0 @@ -// Copyright 2024 Factorial GmbH. All rights reserved. -// -// Use of this source code is governed by a BSD-style -// license that can be found in the LICENSE file. - -package main - -import ( - "log/slog" - "os" - "strconv" -) - -// GetEnvString gets the environment variable for a key and if that env-var hasn't been set it returns the default value -func GetEnvString(key string, defaultVal string) string { - value := os.Getenv(key) - if len(value) == 0 { - value = defaultVal - } - - slog.Debug("Set Environment ", "key", key, "value", value) - - return value -} - -// GetEnvBool gets the environment variable for a key and if that env-var hasn't been set it returns the default value -func GetEnvBool(key string, defaultVal bool) bool { - envvalue := os.Getenv(key) - value, err := strconv.ParseBool(envvalue) - if len(envvalue) == 0 || err != nil { - value := defaultVal - return value - } - - slog.Debug("Set Environment ", "key", key, "value", value) - - return value -} - -// GetEnvInt gets the environment variable for a key and if that env-var hasn't been set it returns the default value. 
This function is equivalent to ParseInt(s, 10, 0) to convert env-vars to type int -func GetEnvInt(key string, defaultVal int) int { - envvalue := os.Getenv(key) - value, err := strconv.Atoi(envvalue) - - if len(envvalue) == 0 || err != nil { - value := defaultVal - return value - } - - slog.Debug("Set Environment ", "key", key, "value", value) - - return value -} - -// GetEnvFloat gets the environment variable for a key and if that env-var hasn't been set it returns the default value. This function uses bitSize of 64 to convert string to float64. -func GetEnvFloat(key string, defaultVal float64) float64 { - envvalue := os.Getenv(key) - value, err := strconv.ParseFloat(envvalue, 64) - if len(envvalue) == 0 || err != nil { - value := defaultVal - return value - } - - slog.Debug("Set Environment ", "key", key, "value", value) - - return value -} diff --git a/otel.go b/otel.go index ab96393..12a7fc8 100644 --- a/otel.go +++ b/otel.go @@ -8,7 +8,9 @@ package main import ( "context" "fmt" + "log/slog" "os" + "strconv" "time" "github.com/mariuswilms/tears" @@ -131,3 +133,58 @@ func newMeterProvider(ctx context.Context) (*metric.MeterProvider, error) { ) return meterProvider, nil } + +// GetEnvString gets the environment variable for a key and if that env-var hasn't been set it returns the default value +func GetEnvString(key string, defaultVal string) string { + value := os.Getenv(key) + if len(value) == 0 { + value = defaultVal + } + + slog.Debug("Set Environment ", "key", key, "value", value) + + return value +} + +// GetEnvBool gets the environment variable for a key and if that env-var hasn't been set it returns the default value +func GetEnvBool(key string, defaultVal bool) bool { + envvalue := os.Getenv(key) + value, err := strconv.ParseBool(envvalue) + if len(envvalue) == 0 || err != nil { + value := defaultVal + return value + } + + slog.Debug("Set Environment ", "key", key, "value", value) + + return value +} + +// GetEnvInt gets the environment variable for a key and if that env-var hasn't been set it returns the default value. This function is equivalent to ParseInt(s, 10, 0) to convert env-vars to type int +func GetEnvInt(key string, defaultVal int) int { + envvalue := os.Getenv(key) + value, err := strconv.Atoi(envvalue) + + if len(envvalue) == 0 || err != nil { + value := defaultVal + return value + } + + slog.Debug("Set Environment ", "key", key, "value", value) + + return value +} + +// GetEnvFloat gets the environment variable for a key and if that env-var hasn't been set it returns the default value. This function uses bitSize of 64 to convert string to float64. +func GetEnvFloat(key string, defaultVal float64) float64 { + envvalue := os.Getenv(key) + value, err := strconv.ParseFloat(envvalue, 64) + if len(envvalue) == 0 || err != nil { + value := defaultVal + return value + } + + slog.Debug("Set Environment ", "key", key, "value", value) + + return value +} From 051ceb5f9a5f71b9741188c86f84e72ea0e26efa Mon Sep 17 00:00:00 2001 From: Marius Wilms Date: Mon, 10 Feb 2025 16:18:44 +0100 Subject: [PATCH 40/57] Work on .env.example --- .env.example | 32 ++++++++++++++++++++++---------- .env.example.factorial | 24 ++++++++++++++++++------ internal/ctrlq/main.go | 4 ++-- 3 files changed, 42 insertions(+), 18 deletions(-) diff --git a/.env.example b/.env.example index f1f864e..1bc4982 100644 --- a/.env.example +++ b/.env.example @@ -1,14 +1,26 @@ -# TOBEY_DEBUG=false -# TOBEY_SKIP_CACHE=false +# Controls debug mode, uncomment to enable debug mode, by default it is disabled. 
+# TOBEY_DEBUG=true
 
-TOBEY_RESULTS_DSN=disk://results
-# TOBEY_PROGRESS_DSN=
-# TOBEY_REDIS_DSN=
+# Controls caching access, uncomment to disable caching. By default caching is enabled.
+# TOBEY_SKIP_CACHE=true
 
-# A space separated list of telemetry to send. Available telemetry: metrics,
-# traces, pulse. To disable telemetry provide an empty value. When enabling telemetry
-# appropriate OTLP endpoints must provided as well.
-# TOBEY_TELEMETRY=
+# DSN specifying where crawl results should be stored, by default no results are stored. Here we store results
+# in the "results" directory, relative to the current working directory.
+TOBEY_RESULTS_DSN=disk://results
+# TOBEY_RESULTS_DSN=webhook://host/path
+
+# DSN for progress reporting. By default no progress is reported, uncomment to report progress to the
+# Factorial service.
+# TOBEY_PROGRESS_DSN=factorial://localhost:8080
+
+# If you have multiple instances of Tobey, you can use a Redis instance to coordinate the work queue. By default
+# no coordination is done, uncomment to enable coordination.
+# TOBEY_REDIS_DSN=redis://localhost:6379
+
+# A space separated list of telemetry to send. Available telemetry: metrics, traces, pulse. By default no telemetry
+# is sent. Uncomment to enable metrics and traces.
+# TOBEY_TELEMETRY=metrics traces
+
+# In order to send telemetry you need to provide the OTLP endpoints as well.
 # OTEL_EXPORTER_OTLP_TRACES_ENDPOINT=
-# OTEL_EXPORTER_OTLP_METRICS_ENDPOINT=
+# OTEL_EXPORTER_OTLP_METRICS_ENDPOINT=
\ No newline at end of file
diff --git a/.env.example.factorial b/.env.example.factorial
index aeb5c86..3ce7378 100644
--- a/.env.example.factorial
+++ b/.env.example.factorial
@@ -1,14 +1,26 @@
-# TOBEY_DEBUG=false
-# TOBEY_SKIP_CACHE=false
+# Controls debug mode, uncomment to enable debug mode, by default it is disabled.
+# TOBEY_DEBUG=true
 
-TOBEY_PROGRESS_DSN=factorial://progress:8080
+# Controls caching access, uncomment to disable caching. By default caching is enabled.
+# TOBEY_SKIP_CACHE=true
+
+# DSN specifying where crawl results should be stored, by default no results are stored. Here we store results
+# in the "results" directory, relative to the current working directory.
+# TOBEY_RESULTS_DSN=disk://results
 TOBEY_RESULTS_DSN=webhook://?enable_dynamic_config
+
+# DSN for progress reporting. By default no progress is reported, uncomment to report progress to the
+# Factorial service.
+TOBEY_PROGRESS_DSN=factorial://localhost:8080
+
+# If you have multiple instances of Tobey, you can use a Redis instance to coordinate the work queue. By default
+# no coordination is done, uncomment to enable coordination.
 TOBEY_REDIS_DSN=redis:6379/0
 
-# A space separated list of telemetry to send. Available telemetry: metrics,
-# traces, pulse. To disable telemetry provide an empty value. When enabling telemetry
-# appropriate OTLP endpoints must provided as well.
+# A space separated list of telemetry to send. Available telemetry: metrics, traces, pulse. By default no telemetry
+# is sent. Uncomment to enable metrics and traces.
 TOBEY_TELEMETRY="traces"
 
+# In order to send telemetry you need to provide the OTLP endpoints as well.
OTEL_EXPORTER_OTLP_TRACES_ENDPOINT=http://jaeger:4318/v1/traces OTEL_EXPORTER_OTLP_METRICS_ENDPOINT= diff --git a/internal/ctrlq/main.go b/internal/ctrlq/main.go index f24b281..8023801 100644 --- a/internal/ctrlq/main.go +++ b/internal/ctrlq/main.go @@ -25,12 +25,12 @@ import ( func CreateWorkQueue(redis *redis.Client) VisitWorkQueue { if redis != nil { - slog.Debug("Using distributed work queue...") + slog.Debug("Work Queue: Using distributed work queue") // TODO: Add support for redis work queue. // return &RedisVisitWorkQueue{conn: redis} return NewMemoryVisitWorkQueue() } else { - slog.Debug("Using in-memory work queue...") + slog.Debug("Work Queue: Using in-memory work queue") return NewMemoryVisitWorkQueue() } } From fc6831517ca37b6c98aeff860bd678da66f2e5d7 Mon Sep 17 00:00:00 2001 From: Marius Wilms Date: Mon, 10 Feb 2025 16:20:48 +0100 Subject: [PATCH 41/57] WIP --- README.md | 163 +++++++++++++++++++++++++++--------------------------- 1 file changed, 81 insertions(+), 82 deletions(-) diff --git a/README.md b/README.md index c7927ac..c3361c2 100644 --- a/README.md +++ b/README.md @@ -19,90 +19,11 @@ curl -X POST http://127.0.0.1:8080 \ -d '{"url": "https://www.example.org/"}' ``` -## Deployment Options - -### Dependency Free - -By default Tobey runs without any dependencies on any other service. In this mode -the service will not coordinate with other instances. It will store results locally -on disk, but not report any progress. If you are trying out tobey this is the -easiest way to get started. - -```sh -TOBEY_RESULTS_DSN=disk:///path/to/results go run . -``` - -### Stateless Operation - -It is possible to configure and use Tobey in a stateless manner. In this operation mode -you'll specify configuration on a per-run basis, and not statically via a configuration file. Choosing -the webhook results store will forward results to a webhook endpoint without storing them locally. - -```sh -TOBEY_RESULTS_DSN=webhook://example.org/webhook?enable_dynamic_config=true go run . -``` - -### Distributed Operation - -The service is horizontally scalable by adding more instances on nodes -in a cluster. In horizontal scaling, any instances can receive crawl requests, -for easy load balancing. The instances will coordinate with each other via Redis. - -```sh -TOBEY_REDIS_DSN=redis://localhost:6379 go run . -``` - -## Scaling - -Tobey can be scaled vertically by increasing the number of workers, via the `WORKERS` environment variable, or horizontally -by adding more instances in a cluster, see the [Distributed Operation](#distributed-operation) section for more details. - -The crawler is designed to handle a large potentially infinite number of hosts, -which presents challenges for managing resources like memory and concurrency. -Keeping a persistent worker process or goroutine for each host would be -inefficient and resource-intensive, particularly since external interactions -can make it difficult to keep them alive. Instead, Tobey uses a pool of workers -that can process multiple requests per host concurrently, balancing the workload -across different hosts. - -## Smart Rate Limiting - -The Tobey Crawler architecture optimizes throughput per host by dynamically -managing rate limits, ensuring that requests to each host are processed as -efficiently as possible. The crawler does not impose static rate limits; -instead, it adapts to each host's capabilities, adjusting the rate limit in real -time based on feedback from headers or other factors. - -This dynamic adjustment is essential. 
To manage these rate limits -effectively, Tobey employs a rate-limited work queue that abstracts away the -complexities of dynamic rate limiting from other parts of the system. The goal -is to focus on maintaining a steady flow of requests without overwhelming -individual hosts. - -## Caching - -Caching is a critical part of the architecture. The crawler uses a global cache, -for HTTP responses. Access to sitemaps and robot control files are also cached. -While these files have expiration times, the crawler maintains an in-memory -cache to quickly validate requests without constantly retrieving them. The cache -is designed to be updated or invalidated as necessary, and a signal can be sent -across all Tobey instances to ensure the latest robot control files are used, -keeping the system responsive and compliant. This layered caching strategy, -along with the dynamic rate limit adjustment, ensures that Tobey maintains high -efficiency and adaptability during its crawling operations. - -## Limitations - -Also Tobey can be configured - on a per run basis - to crawl websites behind -HTTP basic auth, **it does not support fetching personalized content**. It is -expected that the website is generally publicly available, and that the content -is the same for all users. When HTTP basic auth is used by the website it must -only be so in order to prevent early access. - ## Configuration The service is configured via environment variables. The following environment -variables are available: +variables are available. Please also see `.env.example` for a working +example configuration that should get you started. | Variable Name | Default Value | Supported Values | Description | |----------------|----------------|------------------|----------------------------------| @@ -115,7 +36,7 @@ variables are available: On top of these variables, the service's telemetry feature can be configured via the commonly known -[OpenTelemetry environment variables](https://opentelemetry.io/docs/languages/sdk-configuration/otlp-exporter/). +[OpenTelemetry environment variables](https://opentelemetry.io/docs/languages/sdk-configuration/otlp-exporter/). ## Providing Crawl Targets @@ -362,3 +283,81 @@ variable. TOBEY_PROGRESS_DSN=factorial://host:port ``` +## Deployment Options + +### Dependency Free + +By default Tobey runs without any dependencies on any other service. In this mode +the service will not coordinate with other instances. It will store results locally +on disk, but not report any progress. If you are trying out tobey this is the +easiest way to get started. + +```sh +TOBEY_RESULTS_DSN=disk:///path/to/results go run . +``` + +### Stateless Operation + +It is possible to configure and use Tobey in a stateless manner. In this operation mode +you'll specify configuration on a per-run basis, and not statically via a configuration file. Choosing +the webhook results store will forward results to a webhook endpoint without storing them locally. + +```sh +TOBEY_RESULTS_DSN=webhook://example.org/webhook?enable_dynamic_config=true go run . +``` + +### Distributed Operation + +The service is horizontally scalable by adding more instances on nodes +in a cluster. In horizontal scaling, any instances can receive crawl requests, +for easy load balancing. The instances will coordinate with each other via Redis. + +```sh +TOBEY_REDIS_DSN=redis://localhost:6379 go run . 
+```
+## Scaling
+
+Tobey can be scaled vertically by increasing the number of workers, via the `TOBEY_WORKERS` environment variable, or horizontally
+by adding more instances in a cluster; see the [Distributed Operation](#distributed-operation) section for more details.
+
+The crawler is designed to handle a large, potentially infinite number of hosts,
+which presents challenges for managing resources like memory and concurrency.
+Keeping a persistent worker process or goroutine for each host would be
+inefficient and resource-intensive, particularly since external interactions
+can make it difficult to keep them alive. Instead, Tobey uses a pool of workers
+that can process multiple requests per host concurrently, balancing the workload
+across different hosts.
+
+## Smart Rate Limiting
+
+The Tobey Crawler architecture optimizes throughput per host by dynamically
+managing rate limits, ensuring that requests to each host are processed as
+efficiently as possible. The crawler does not impose static rate limits;
+instead, it adapts to each host's capabilities, adjusting the rate limit in real
+time based on feedback from headers or other factors.
+
+This dynamic adjustment is essential. To manage these rate limits
+effectively, Tobey employs a rate-limited work queue that abstracts away the
+complexities of dynamic rate limiting from other parts of the system. The goal
+is to focus on maintaining a steady flow of requests without overwhelming
+individual hosts.
+
+## Caching
+
+Caching is a critical part of the architecture. The crawler uses a global cache
+for HTTP responses. Access to sitemaps and robot control files is also cached.
+While these files have expiration times, the crawler maintains an in-memory
+cache to quickly validate requests without constantly retrieving them. The cache
+is designed to be updated or invalidated as necessary, and a signal can be sent
+across all Tobey instances to ensure the latest robot control files are used,
+keeping the system responsive and compliant. This layered caching strategy,
+along with the dynamic rate limit adjustment, ensures that Tobey maintains high
+efficiency and adaptability during its crawling operations.
+
+## Limitations
+
+Although Tobey can be configured - on a per run basis - to crawl websites behind
+HTTP basic auth, **it does not support fetching personalized content**. It is
+expected that the website is generally publicly available, and that the content
+is the same for all users. When HTTP basic auth is used by the website it must
+only be so in order to prevent early access.
\ No newline at end of file

From 61dd3903cf09ba74a5b004b38062d4ec98c301ec Mon Sep 17 00:00:00 2001
From: Marius Wilms
Date: Mon, 10 Feb 2025 17:51:56 +0100
Subject: [PATCH 42/57] Minor improvements

---
 Makefile           |  2 +-
 main.go            |  5 +++++
 results_disk.go    |  2 +-
 results_webhook.go |  2 ++
 sitemap.go         |  2 +-
 visitworker.go     | 22 +++++++++++++---------
 6 files changed, 23 insertions(+), 12 deletions(-)

diff --git a/Makefile b/Makefile
index 83cbde0..7eaba8e 100644
--- a/Makefile
+++ b/Makefile
@@ -1,6 +1,6 @@
 .PHONY: dev
 dev:
-	TOBEY_SKIP_CACHE=true TOBEY_DEBUG=true TOBEY_HOST=127.0.0.1 go run .
+	TOBEY_SKIP_CACHE=true TOBEY_DEBUG=true TOBEY_RESULTS_DSN=disk:///tmp/tobey TOBEY_HOST=127.0.0.1 go run .
.PHONY: pulse pulse: diff --git a/main.go b/main.go index 2734896..22ca249 100644 --- a/main.go +++ b/main.go @@ -174,6 +174,11 @@ func main() { tear(queue.Close) } + if _, ok := os.LookupEnv("TOBEY_RESULTS_DSN"); !ok { + if _, ok := os.LookupEnv("TOBEY_RESULT_DSN"); ok { + slog.Debug("You have a typo in your env var: TOBEY_RESULTS_DSN is not set, but TOBEY_RESULT_DSN is set. Please use TOBEY_RESULTS_DSN instead.") + } + } rs, err := CreateResultReporter(os.Getenv("TOBEY_RESULTS_DSN")) if err != nil { panic(err) diff --git a/results_disk.go b/results_disk.go index 02565b2..110c5d9 100644 --- a/results_disk.go +++ b/results_disk.go @@ -60,7 +60,7 @@ func NewDiskResultReporter(config DiskResultReporterConfig) (*DiskResultReporter // but we don't use it here, as we don't allow dynamic config for this reporter. func (drs *DiskResultReporter) Accept(ctx context.Context, config any, run *Run, res *collector.Response) error { logger := slog.With("run", run.ID, "url", res.Request.URL) - logger.Debug("DiskResultReporter: Saving result to file...") + logger.Debug("Result reporter: Saving result to file...") result := &DiskResult{ Run: run.ID, diff --git a/results_webhook.go b/results_webhook.go index 39086a7..dc4faa4 100644 --- a/results_webhook.go +++ b/results_webhook.go @@ -70,6 +70,8 @@ func NewWebhookResultReporter(ctx context.Context, endpoint string) *WebhookResu func (wrs *WebhookResultReporter) Accept(ctx context.Context, config any, run *Run, res *collector.Response) error { var endpoint string + slog.Debug("Result reporter: Forwarding result...") + var webhook *WebhookResultReporterConfig if config != nil { var ok bool diff --git a/sitemap.go b/sitemap.go index ec432c8..61e329d 100644 --- a/sitemap.go +++ b/sitemap.go @@ -105,7 +105,7 @@ func (s *Sitemaps) Drain(ctx context.Context, getAuth GetAuthFn, url string, yie if isProbablySitemap(url) { return sitemap.Parse(res.Body, func(e sitemap.Entry) error { - slog.Info("Sitemaps: Yield URL from sitemap.", "url", e.GetLocation()) + slog.Info("Sitemaps: Yield URL.", "url", e.GetLocation()) return yieldu(ctx, e.GetLocation()) }) } else if isProbablySiteindex(url) { diff --git a/visitworker.go b/visitworker.go index ae82c3d..f23232e 100644 --- a/visitworker.go +++ b/visitworker.go @@ -32,8 +32,8 @@ func CreateVisitWorkersPool( num int, runs *RunManager, q ctrlq.VisitWorkQueue, - progress ProgressReporter, - rs ResultReporter, + pr ProgressReporter, + rr ResultReporter, ) *sync.WaitGroup { var wg sync.WaitGroup @@ -42,7 +42,7 @@ func CreateVisitWorkersPool( wg.Add(1) go func(id int) { - if err := VisitWorker(ctx, id, runs, q, progress, rs); err != nil { + if err := VisitWorker(ctx, id, runs, q, pr, rr); err != nil { slog.Error("Visitor: Worker exited with error.", "worker.id", id, "error", err) } else { slog.Debug("Visitor: Worker exited cleanly.", "worker.id", id) @@ -59,8 +59,8 @@ func VisitWorker( id int, runs *RunManager, q ctrlq.VisitWorkQueue, - progress ProgressReporter, - rs ResultReporter, + pr ProgressReporter, + rr ResultReporter, ) error { wlogger := slog.With("worker.id", id) wlogger.Debug("Visitor: Starting...") @@ -106,8 +106,8 @@ func VisitWorker( // yet have a collector available via the Manager. Please note that Collectors // are not shared by the Manager across tobey instances. 
r, _ := runs.Get(ctx, job.Run)
-			c := r.GetCollector(ctx, q, progress, rs)
-			p := progress.With(r, job.URL)
+			c := r.GetCollector(ctx, q, pr, rr)
+			p := pr.With(r, job.URL)
 
 			p.Update(jctx, ProgressStateCrawling)
 
@@ -133,8 +133,12 @@ func VisitWorker(
 			jlogger.Info("Visitor: Visited URL.", "took.lifetime", time.Since(job.Created), "took.fetch", res.Took)
 			span.AddEvent("Visitor: Visited URL.", t)
 
-			if r.WebhookConfig != nil {
-				rs.Accept(jctx, r.WebhookConfig, r, res)
+			// Pass the per-call config depending on the ResultReporter type.
+			switch rr := rr.(type) {
+			case *WebhookResultReporter:
+				rr.Accept(jctx, r.WebhookConfig, r, res)
+			case *DiskResultReporter:
+				rr.Accept(jctx, nil, r, res)
 			}
 
 			span.End()

From 0465fee308be4118493618deadd4ea0eeccfb06c Mon Sep 17 00:00:00 2001
From: Marius Wilms
Date: Mon, 10 Feb 2025 18:11:33 +0100
Subject: [PATCH 43/57] Add console progress reporter

---
 .env.example          |  5 +++--
 README.md             |  7 ++++---
 main.go               | 14 +++++++++++---
 progress.go           | 27 ++++++++++++++-------------
 progress_console.go   | 24 ++++++++++++++++++++++++
 progress_factorial.go |  4 ++--
 results.go            |  8 ++++----
 run.go                |  9 +++++++++
 8 files changed, 71 insertions(+), 27 deletions(-)
 create mode 100644 progress_console.go

diff --git a/.env.example b/.env.example
index 1bc4982..0ad8041 100644
--- a/.env.example
+++ b/.env.example
@@ -9,9 +9,10 @@
 TOBEY_RESULTS_DSN=disk://results
 # TOBEY_RESULTS_DSN=webhook://host/path
 
-# DSN for progress reporting. By default no progress is reported, uncomment to report progress to the
-# Factorial service.
+# DSN for progress reporting. By default, a console progress reporter is used. Uncomment to report progress to the
+# Factorial service or to disable progress reporting.
 # TOBEY_PROGRESS_DSN=factorial://localhost:8080
+# TOBEY_PROGRESS_DSN=
 
 # If you have multiple instances of Tobey, you can use a Redis instance to coordinate the work queue. By default
 # no coordination is done, uncomment to enable coordination.
diff --git a/README.md b/README.md
index c3361c2..90618d2 100644
--- a/README.md
+++ b/README.md
@@ -276,11 +276,12 @@ the metadata of the run, see _Runs_ above for more details.
 ## Progress Reporting
 
 Tobey can report progress while it's crawling. This is useful for monitoring the
-progress of a crawl and for debugging and determine when a crawl has finished. By default no progress reporting is enabled. There is currently only one progress reporting service that is supported, the Factorial Progress Service. It is configured via the `TOBEY_PROGRESS_DSN` environment
-variable.
+progress of a crawl, for debugging, and for determining when a crawl has finished. By default the console progress reporter is used.
 
 ```sh
-TOBEY_PROGRESS_DSN=factorial://host:port
+TOBEY_PROGRESS_DSN=factorial://host:port # To report progress to the Factorial progress service.
+TOBEY_PROGRESS_DSN=console # To report progress to the console, this is the default.
+TOBEY_PROGRESS_DSN= # To disable progress reporting.
``` ## Deployment Options diff --git a/main.go b/main.go index 22ca249..05e09c7 100644 --- a/main.go +++ b/main.go @@ -184,9 +184,17 @@ func main() { panic(err) } - progress, err := CreateProgressReporter(os.Getenv("TOBEY_PROGRESS_DSN")) - if err != nil { - panic(err) + var progress ProgressReporter + if v, ok := os.LookupEnv("TOBEY_PROGRESS_DSN"); !ok { + progress, err = CreateProgressReporter("console://") + if err != nil { + panic(err) + } + } else { + progress, err = CreateProgressReporter(v) + if err != nil { + panic(err) + } } workers := CreateVisitWorkersPool( diff --git a/progress.go b/progress.go index 40b122c..93da296 100644 --- a/progress.go +++ b/progress.go @@ -29,7 +29,7 @@ const ( // If dsn is empty, it returns a NoopProgressDispatcher. func CreateProgressReporter(dsn string) (ProgressReporter, error) { if dsn == "" { - slog.Debug("Progress Reporting: Disabled, not sharing progress updates.") + slog.Info("Progress Reporting: Disabled, not sharing progress updates.") return &NoopProgressReporter{}, nil } @@ -39,13 +39,16 @@ func CreateProgressReporter(dsn string) (ProgressReporter, error) { } switch u.Scheme { + case "console": + slog.Info("Progress Reporting: Using Console for progress updates.") + return &ConsoleProgressReporter{}, nil case "factorial": slog.Info("Progress Reporting: Enabled, using Factorial progress service for updates.", "dsn", dsn) return &FactorialProgressReporter{ client: CreateRetryingHTTPClient(NoAuthFn), }, nil case "noop": - slog.Debug("Progress Reporting: Disabled, not sharing progress updates.") + slog.Info("Progress Reporting: Disabled, not sharing progress updates.") return &NoopProgressReporter{}, nil default: return nil, fmt.Errorf("unsupported progress dispatcher type: %s", u.Scheme) @@ -64,26 +67,24 @@ type ProgressReporter interface { type Progress struct { reporter ProgressReporter - stage string Run *Run URL string + Stage string } type ProgressUpdate struct { - Stage string - Status ProgressStatus - Run string - URL string - Metadata interface{} + Run *Run + URL string + Stage string + Status ProgressStatus } // Update updates the progress with a new status func (p *Progress) Update(ctx context.Context, status ProgressStatus) error { return p.reporter.Call(ctx, ProgressUpdate{ - Stage: p.stage, - Run: p.Run.ID, - URL: p.URL, - Status: status, - Metadata: p.Run.Metadata, + Run: p.Run, + URL: p.URL, + Stage: p.Stage, + Status: status, }) } diff --git a/progress_console.go b/progress_console.go new file mode 100644 index 0000000..7198472 --- /dev/null +++ b/progress_console.go @@ -0,0 +1,24 @@ +package main + +import ( + "context" + "fmt" + "log/slog" +) + +type ConsoleProgressReporter struct{} + +func (c *ConsoleProgressReporter) With(run *Run, url string) *Progress { + return &Progress{ + reporter: c, + Stage: "initial", + Run: run, + URL: url, + } +} + +// Call outputs the progress update to the console. 
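+// Each update is logged as a single structured line via slog.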
+func (c *ConsoleProgressReporter) Call(ctx context.Context, pu ProgressUpdate) error { + slog.Info(fmt.Sprintf("Progress Update: -> %d", pu.Status), "run", pu.Run.ShortID(), "url", pu.URL) + return nil +} diff --git a/progress_factorial.go b/progress_factorial.go index 9a3326a..f89cad7 100644 --- a/progress_factorial.go +++ b/progress_factorial.go @@ -57,7 +57,7 @@ type FactorialProgressReporter struct { func (p *FactorialProgressReporter) With(run *Run, url string) *Progress { return &Progress{ reporter: p, - stage: FactorialProgressServiceDefaultStage, + Stage: FactorialProgressServiceDefaultStage, Run: run, URL: url, } @@ -75,7 +75,7 @@ func (p *FactorialProgressReporter) Call(ctx context.Context, pu ProgressUpdate) payload := FactorialProgressUpdatePayload{ Stage: pu.Stage, Status: factorialProgressStatus(pu.Status), - Run: pu.Run, + Run: pu.Run.ID, URL: pu.URL, } diff --git a/results.go b/results.go index aa2f39d..dcab3c7 100644 --- a/results.go +++ b/results.go @@ -16,7 +16,7 @@ import ( func CreateResultReporter(dsn string) (ResultReporter, error) { if dsn == "" { - slog.Debug("Result Reporter: Disabled, using noop reporter") + slog.Info("Result Reporter: Disabled, using noop reporter") return &NoopResultReporter{}, nil } @@ -39,7 +39,7 @@ func CreateResultReporter(dsn string) (ResultReporter, error) { return nil, fmt.Errorf("failed to create disk store: %w", err) } - slog.Debug("Result Reporter: Enabled, using disk store", "dsn", dsn) + slog.Info("Result Reporter: Enabled, using disk store", "dsn", dsn) return store, nil case "webhook": // Only require host if dynamic config is not enabled @@ -48,11 +48,11 @@ func CreateResultReporter(dsn string) (ResultReporter, error) { } endpoint := fmt.Sprintf("%s://%s%s?%s", "https", u.Host, u.Path, u.RawQuery) - slog.Debug("Result Reporter: Enabled, using webhook reporter", "dsn", dsn) + slog.Info("Result Reporter: Enabled, using webhook reporter", "dsn", dsn) return NewWebhookResultReporter(context.Background(), endpoint), nil case "noop": - slog.Debug("Result Reporter: Disabled, using noop reporter") + slog.Info("Result Reporter: Disabled, using noop reporter") return &NoopResultReporter{}, nil default: return nil, fmt.Errorf("unsupported results store type: %s", u.Scheme) diff --git a/run.go b/run.go index 65e3724..1f9ae40 100644 --- a/run.go +++ b/run.go @@ -63,6 +63,15 @@ func (r *Run) Configure(s RunStore, ro *Robots, si *Sitemaps) { r.sitemaps = si } +// ShortID returns a human-readable version of the run's ID. +func (r *Run) ShortID() string { + // Here we can implement a simple transformation, for example, taking the first 8 characters. + if len(r.ID) > 8 { + return r.ID[:8] // Return the first 8 characters of the ID + } + return r.ID // Return the full ID if it's shorter than 8 characters +} + // GetClient configures and returns the http.Client for the Run. func (r *Run) GetClient() *http.Client { return CreateCrawlerHTTPClient(r.getAuthFn()) From 4484beb43b87b8c51ce9d1a20cf820c9901baba0 Mon Sep 17 00:00:00 2001 From: Marius Wilms Date: Tue, 11 Feb 2025 09:19:26 +0100 Subject: [PATCH 44/57] Add back WORKERS, document, default to progress noop --- .env.example | 3 +++ README.md | 59 +++++++++++++++++----------------------------------- main.go | 56 +++++++++++++++++++++++++++---------------------- 3 files changed, 53 insertions(+), 65 deletions(-) diff --git a/.env.example b/.env.example index 0ad8041..376579c 100644 --- a/.env.example +++ b/.env.example @@ -4,6 +4,9 @@ # Controls caching access, uncomment to disabled caching. 
By default caching is enabled.
 # TOBEY_SKIP_CACHE=true
 
+# Controls the number of workers per instance, by default 5.
+# TOBEY_WORKERS=5
+
 # DSN specifying where crawl results should be stored, by default no results are stored. Here we store results
 # in the "results" directory, relative to the current working directory.
 TOBEY_RESULTS_DSN=disk://results
diff --git a/README.md b/README.md
index 90618d2..e5b39b1 100644
--- a/README.md
+++ b/README.md
@@ -1,37 +1,29 @@
-# Tobey, a robust and scalable Crawler
+# Tobey, a throughput optimizing web crawler
 
 Tobey is a throughput optimizing web crawler, that is scalable from a single
 instance to a cluster. It features intelligent rate limiting, distributed
 coordination, and flexible deployment options.
 
 ## Running Tobey
 
-Start the service.
 ```sh
-go run .
-```
-
-In its simplest form the service just receives a root URL of the website to be
-crawled.
-
-```sh
-curl -X POST http://127.0.0.1:8080 \
-  -H 'Content-Type: application/json' \
-  -d '{"url": "https://www.example.org/"}'
+go run . # Start the crawler.
+curl -X POST -d 'https://www.example.org/' http://127.0.0.1:8080 # Submit a crawl request.
 ```
 
 ## Configuration
 
-The service is configured via environment variables. The following environment
-variables are available. Please also see `.env.example` for a working
+Tobey is configured with sane defaults, which means it will work out of the box. If you need to configure it, you
+can do so via environment variables. The following variables are available. Please also see `.env.example` for a working
 example configuration that should get you started.
 
 | Variable Name | Default Value | Supported Values | Description |
 |----------------|----------------|------------------|----------------------------------|
 | `TOBEY_DEBUG` | `false` | `true`, `false` | Controls debug mode. |
 | `TOBEY_SKIP_CACHE` | `false` | `true`, `false` | Controls caching access. |
+| `TOBEY_WORKERS` | `5` | `1-128` | Controls the number of workers per instance. |
 | `TOBEY_REDIS_DSN` | empty | i.e. `redis://localhost:6379` | DSN to reach a Redis instance. Only needed when operating multiple instances. |
-| `TOBEY_PROGRESS_DSN` | empty | `factorial://host:port`, `noop://` | DSN for progress reporting service. When configured, Tobey will send progress updates there. The factorial scheme enables progress updates to a Factorial progress service. Use noop:// to explicitly disable progress updates. |
-| `TOBEY_RESULTS_DSN` | empty | `disk:///path`, `webhook://host/path`, `noop://` | DSN specifying where crawl results should be stored. Use disk:// for local filesystem storage, webhook:// to forward results to an HTTP endpoint, or noop:// to discard results. |
+| `TOBEY_PROGRESS_DSN` | empty | `factorial://host:port` | DSN for progress reporting service. When configured, Tobey will send progress updates there. The factorial scheme enables progress updates to a Factorial progress service. Use noop:// to explicitly disable progress updates. |
+| `TOBEY_RESULTS_DSN` | `disk://results` | `disk:///path`, `webhook://host/path` | DSN specifying where crawl results should be stored. Use disk:// for local filesystem storage, webhook:// to forward results to an HTTP endpoint, or noop:// to discard results. |
 | `TOBEY_TELEMETRY` | empty | i.e. `metrics traces` | Space separated list of what kind of telemetry is emitted. |
 
 On top of these variables, the service's telemetry
 feature can be configured via the commonly known
-[OpenTelemetry environment variables](https://opentelemetry.io/docs/languages/sdk-configuration/otlp-exporter/).
+[OpenTelemetry environment variables](https://opentelemetry.io/docs/languages/sdk-configuration/otlp-exporter/). 
 
 ## Providing Crawl Targets
 
@@ -174,7 +166,7 @@ the URL under the `url` key algonside the entrypoint:
     "https://example.org",
     "https://example.org/sitemap.xml"
   ],
-  "skip_auto_sitemaps": true
+  "skip_sitemap_discovery": true
 }
 ```
 
@@ -227,15 +219,14 @@ the results to a configured webhook endpoint. [Webhooks](https://mailchimp.com/e
 TOBEY_RESULTS_DSN=webhook://example.org/webhook
 ```
 
-For the webhook method, **dynamic re-configuration** is supported. This means that you can
+For the webhook method, **dynamic re-configuration** is supported. This means that you
 configure the webhook endpoint on a per-request basis. Dynamic re-configuration is disabled
-by default, and can be enabled by adding `enable_dynamic_config` to the DSN.
+by default, for security reasons. It can be enabled by adding `enable_dynamic_config` to the DSN, if
+you can trust the users that submit the crawl requests, e.g. if Tobey is deployed as an internal service.
 
 ```sh
 TOBEY_RESULTS_DSN=webhook://example.org/webhook?enable_dynamic_config # with default endpoint
-TOBEY_RESULTS_DSN=webhook://?enable_dynamic_config # without default endpoint, requires dynamic
-                                                   # rconfiguration in each crawl request
-                                                   # request
+TOBEY_RESULTS_DSN=webhook://?enable_dynamic_config # without default endpoint
 ```
 
 You can than specify the webhook endpoint in the crawl request:
@@ -247,21 +238,13 @@ You can than specify the webhook endpoint in the crawl request:
 }
 ```
 
-When you configure the crawler to **discard results**, it will not store any results
-by itself. This is useful for testing and **the default behavior**.
-
-```sh
-TOBEY_RESULTS_DSN=noop://
-```
-
 ### Results Format
 
-A _Result object_ is a JSON object that contains the result of a crawl request alongside
+A _Result_ is an object that contains the result of a crawl request alongside
 the metadata of the run, see _Runs_ above for more details.
 
 ```jsonc
 {
-  "action": "collector.response",
   "run_uuid": "0033085c-685b-432a-9aa4-0aca59cc3e12",
   "run_metadata": {
     "internal_project_reference": 42,
@@ -276,12 +259,12 @@ the metadata of the run, see _Runs_ above for more details.
 ## Progress Reporting
 
 Tobey can report progress while it's crawling. This is useful for monitoring the
-progress of a crawl, for debugging, and for determining when a crawl has finished. By default the console progress reporter is used.
+progress of a crawl, for debugging, and for determining when a crawl has finished. By
+default this feature is disabled.
 
 ```sh
 TOBEY_PROGRESS_DSN=factorial://host:port # To report progress to the Factorial progress service.
-TOBEY_PROGRESS_DSN=console # To report progress to the console, this is the default.
-TOBEY_PROGRESS_DSN= # To disable progress reporting.
+TOBEY_PROGRESS_DSN=console # To report progress to the console.
 ```
 
 ## Deployment Options
 
@@ -293,10 +276,6 @@ the service will not coordinate with other instances. It will store results loca
 on disk, but not report any progress. If you are trying out tobey this is the
 easiest way to get started.
 
-```sh
-TOBEY_RESULTS_DSN=disk:///path/to/results go run .
-```
-
 ### Stateless Operation
 
 It is possible to configure and use Tobey in a stateless manner. In this operation mode
 you'll specify configuration on a per-run basis, and not statically via a configuration file. Choosing
 the webhook results store will forward results to a webhook endpoint without storing them locally.
 
 ```sh
-TOBEY_RESULTS_DSN=webhook://example.org/webhook?enable_dynamic_config=true go run .
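+# Note that the mere presence of the `enable_dynamic_config` query parameter
+# enables dynamic config, it does not need to be assigned a value.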
+TOBEY_RESULTS_DSN=webhook://example.org/webhook?enable_dynamic_config ``` ### Distributed Operation @@ -314,7 +293,7 @@ in a cluster. In horizontal scaling, any instances can receive crawl requests, for easy load balancing. The instances will coordinate with each other via Redis. ```sh -TOBEY_REDIS_DSN=redis://localhost:6379 go run . +TOBEY_REDIS_DSN=redis://localhost:6379 ``` ## Scaling diff --git a/main.go b/main.go index 05e09c7..b90251d 100644 --- a/main.go +++ b/main.go @@ -6,6 +6,7 @@ package main import ( + "bytes" "context" "encoding/json" "errors" @@ -43,6 +44,9 @@ var ( UseMetrics = false UsePulse = false // High Frequency Metrics can be enabled by adding "pulse". + // NumVisitWorkers hard codes the number of workers we start at startup. + NumVisitWorkers int = 5 + // ListenHost is the host where the main HTTP server listens and the API is served, // this can be controlled via the TOBEY_HOST environment variable. An empty // string means "listen on all interfaces". @@ -54,9 +58,6 @@ var ( ) const ( - // NumVisitWorkers hard codes the number of workers we start at startup. - NumVisitWorkers int = 5 - // MaxParallelRuns specifies how many collectors we keep in memory, and thus // limits the maximum number of parrallel runs that we can perform. MaxParallelRuns int = 128 @@ -93,7 +94,13 @@ func configure() { SkipCache = true slog.Info("Skipping cache.") } - + if v := os.Getenv("TOBEY_WORKERS"); v != "" { + p, err := strconv.Atoi(v) + if err != nil { + panic(err) + } + NumVisitWorkers = p + } v := os.Getenv("TOBEY_TELEMETRY") if strings.Contains(v, "traces") || strings.Contains(v, "tracing") { UseTracing = true @@ -184,17 +191,9 @@ func main() { panic(err) } - var progress ProgressReporter - if v, ok := os.LookupEnv("TOBEY_PROGRESS_DSN"); !ok { - progress, err = CreateProgressReporter("console://") - if err != nil { - panic(err) - } - } else { - progress, err = CreateProgressReporter(v) - if err != nil { - panic(err) - } + progress, err := CreateProgressReporter(os.Getenv("TOBEY_PROGRESS_DSN")) + if err != nil { + panic(err) } workers := CreateVisitWorkersPool( @@ -220,7 +219,6 @@ func main() { body, _ := ioutil.ReadAll(r.Body) r.Body.Close() - w.Header().Set("Content-Type", "application/json") slog.Debug("Handling incoming request for crawl run...") // The context of the HTTP request might contain OpenTelemetry information, @@ -230,17 +228,25 @@ func main() { // This ends the very first span in handling the crawl run. It ends the HTTP handling span. defer span.End() - var req APIRequest - err := json.Unmarshal(body, &req) - if err != nil { - slog.Error("Failed to parse incoming JSON.", "error", err) + w.Header().Set("Content-Type", "application/json") - result := &APIError{ - Message: fmt.Sprintf("%s", err), + var req APIRequest + if bytes.HasPrefix(body, []byte("http://")) || bytes.HasPrefix(body, []byte("https://")) { + // As a special case, and to support minimalism, we allow directly + // posting a single URL. 
+ req.URL = string(body) + } else { + err := json.Unmarshal(body, &req) + if err != nil { + slog.Error("Failed to parse incoming JSON.", "error", err) + + result := &APIError{ + Message: fmt.Sprintf("%s", err), + } + w.WriteHeader(http.StatusBadRequest) + json.NewEncoder(w).Encode(result) + return } - w.WriteHeader(http.StatusBadRequest) - json.NewEncoder(w).Encode(result) - return } if ok := req.Validate(); !ok { From a0075878f5ec7597b0c298a5341a8715cc6b8552 Mon Sep 17 00:00:00 2001 From: Marius Wilms Date: Tue, 11 Feb 2025 09:23:03 +0100 Subject: [PATCH 45/57] Default to store results in './results' --- results.go | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/results.go b/results.go index dcab3c7..9b68312 100644 --- a/results.go +++ b/results.go @@ -16,8 +16,15 @@ import ( func CreateResultReporter(dsn string) (ResultReporter, error) { if dsn == "" { - slog.Info("Result Reporter: Disabled, using noop reporter") - return &NoopResultReporter{}, nil + config := DiskResultReporterConfig{ + OutputDir: "results", // Relative to the current working directory. + } + store, err := NewDiskResultReporter(config) + if err != nil { + return nil, fmt.Errorf("failed to setup disk result reporter: %w", err) + } + slog.Info("Result Reporter: Enabled, using disk store", "dsn", dsn) + return store, nil } u, err := url.Parse(dsn) @@ -36,7 +43,7 @@ func CreateResultReporter(dsn string) (ResultReporter, error) { } store, err := NewDiskResultReporter(config) if err != nil { - return nil, fmt.Errorf("failed to create disk store: %w", err) + return nil, fmt.Errorf("failed to setup disk result reporter: %w", err) } slog.Info("Result Reporter: Enabled, using disk store", "dsn", dsn) @@ -51,7 +58,6 @@ func CreateResultReporter(dsn string) (ResultReporter, error) { slog.Info("Result Reporter: Enabled, using webhook reporter", "dsn", dsn) return NewWebhookResultReporter(context.Background(), endpoint), nil case "noop": - slog.Info("Result Reporter: Disabled, using noop reporter") return &NoopResultReporter{}, nil default: From a05aef08c8588de176b65ea14ed65395c4f489f4 Mon Sep 17 00:00:00 2001 From: Marius Wilms Date: Tue, 11 Feb 2025 15:36:35 +0100 Subject: [PATCH 46/57] Rename connections to coordinate --- connections.go => coordinate.go | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename connections.go => coordinate.go (100%) diff --git a/connections.go b/coordinate.go similarity index 100% rename from connections.go rename to coordinate.go From 4adfde0667377a70ffd3408b1c58f88325255fcb Mon Sep 17 00:00:00 2001 From: Marius Wilms Date: Thu, 13 Feb 2025 15:02:38 +0100 Subject: [PATCH 47/57] Surface and improve host/port config --- README.md | 2 ++ main.go | 19 +++++++++++++++++-- 2 files changed, 19 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index e5b39b1..968a8cf 100644 --- a/README.md +++ b/README.md @@ -21,6 +21,8 @@ example configuration that should get you started. | `TOBEY_DEBUG` | `false` | `true`, `false` | Controls debug mode. | | `TOBEY_SKIP_CACHE` | `false` | `true`, `false` | Controls caching access. | | `TOBEY_WORKERS` | `5` | `1-128` | Controls the number of workers per instance. | +| `TOBEY_HOST` | empty | i.e. `localhost`, `127.0.0.1` | Host interface to bind the HTTP server to. Empty means listen on all interfaces. Alternatively you can use the `-host` command line flag. | +| `TOBEY_PORT` | `8080` | `1-65535` | Port to bind the HTTP server to. Alternatively you can use the `-port` command line flag. | | `TOBEY_REDIS_DSN` | empty | i.e. 
`redis://localhost:6379` | DSN to reach a Redis instance. Only needed when operating multiple instances. | | `TOBEY_PROGRESS_DSN` | empty | `factorial://host:port` | DSN for progress reporting service. When configured, Tobey will send progress updates there. The factorial scheme enables progress updates to a Factorial progress service. Use noop:// to explicitly disable progress updates. | | `TOBEY_RESULTS_DSN` | `disk://results` | `disk:///path`, `webhook://host/path` | DSN specifying where crawl results should be stored. Use disk:// for local filesystem storage, webhook:// to forward results to an HTTP endpoint, or noop:// to discard results. | diff --git a/main.go b/main.go index b90251d..1f668df 100644 --- a/main.go +++ b/main.go @@ -10,6 +10,7 @@ import ( "context" "encoding/json" "errors" + "flag" "fmt" "io/ioutil" "log" @@ -87,6 +88,14 @@ const ( ) func configure() { + // Add command line flag parsing + var flagHost string + var flagPort int + + flag.StringVar(&flagHost, "host", "", "Host interface to bind the HTTP server to") + flag.IntVar(&flagPort, "port", 0, "Port to bind the HTTP server to") + flag.Parse() + if os.Getenv("TOBEY_DEBUG") == "true" { Debug = true } @@ -115,10 +124,16 @@ func configure() { slog.Info("High Frequency Metrics (Pulse) enabled.") } - if v := os.Getenv("TOBEY_HOST"); v != "" { + // First check command line args, then fall back to env vars + if flagHost != "" { + ListenHost = flagHost + } else if v := os.Getenv("TOBEY_HOST"); v != "" { ListenHost = v } - if v := os.Getenv("TOBEY_PORT"); v != "" { + + if flagPort != 0 { + ListenPort = flagPort + } else if v := os.Getenv("TOBEY_PORT"); v != "" { p, err := strconv.Atoi(v) if err != nil { panic(err) From ea7bb8583b6c6a29742264e9dc56afae1ca71d0c Mon Sep 17 00:00:00 2001 From: Marius Wilms Date: Thu, 13 Feb 2025 15:08:19 +0100 Subject: [PATCH 48/57] Extract routes --- main.go | 126 ++------------------------------------------------ routes.go | 136 ++++++++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 141 insertions(+), 121 deletions(-) create mode 100644 routes.go diff --git a/main.go b/main.go index 1f668df..77b7530 100644 --- a/main.go +++ b/main.go @@ -6,13 +6,10 @@ package main import ( - "bytes" "context" - "encoding/json" "errors" "flag" "fmt" - "io/ioutil" "log" "log/slog" "net/http" @@ -24,8 +21,6 @@ import ( "tobey/internal/ctrlq" "github.com/mariuswilms/tears" - "github.com/prometheus/client_golang/prometheus/promhttp" - "go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp" _ "go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp" "go.opentelemetry.io/contrib/instrumentation/runtime" ) @@ -189,6 +184,7 @@ func main() { runs := NewRunManager(redisconn, robots, sitemaps) queue := ctrlq.CreateWorkQueue(redisconn) + if err := queue.Open(ctx); err != nil { panic(err) } @@ -221,112 +217,11 @@ func main() { ) tear(workers.Wait) - apirouter := http.NewServeMux() - - apirouter.HandleFunc("GET /", func(w http.ResponseWriter, r *http.Request) { - w.Header().Set("Content-Type", "text/plain") - r.Body.Close() - - fmt.Fprint(w, "Hello from Tobey.") - }) - - apirouter.HandleFunc("POST /", func(w http.ResponseWriter, r *http.Request) { - body, _ := ioutil.ReadAll(r.Body) - r.Body.Close() - - slog.Debug("Handling incoming request for crawl run...") - - // The context of the HTTP request might contain OpenTelemetry information, - // i.e. SpanID or TraceID. If this is the case the line below creates - // a sub span. Otherwise we'll start a new root span here. 
- reqctx, span := tracer.Start(r.Context(), "receive_crawl_request") - // This ends the very first span in handling the crawl run. It ends the HTTP handling span. - defer span.End() - - w.Header().Set("Content-Type", "application/json") - - var req APIRequest - if bytes.HasPrefix(body, []byte("http://")) || bytes.HasPrefix(body, []byte("https://")) { - // As a special case, and to support minimalism, we allow directly - // posting a single URL. - req.URL = string(body) - } else { - err := json.Unmarshal(body, &req) - if err != nil { - slog.Error("Failed to parse incoming JSON.", "error", err) - - result := &APIError{ - Message: fmt.Sprintf("%s", err), - } - w.WriteHeader(http.StatusBadRequest) - json.NewEncoder(w).Encode(result) - return - } - } - - if ok := req.Validate(); !ok { - result := &APIError{ - Message: "Invalid request.", - } - w.WriteHeader(http.StatusBadRequest) - json.NewEncoder(w).Encode(result) - return - } - - id, err := req.GetRun() - if err != nil { - slog.Error("Failed to parse given run as UUID.", "run", req.Run) - - result := &APIError{ - Message: fmt.Sprintf("Failed to parse given run (%s) as UUID or number.", req.Run), - } - - w.WriteHeader(http.StatusBadRequest) - json.NewEncoder(w).Encode(result) - return - } - - run := &Run{ - SerializableRun: SerializableRun{ - ID: id, - Metadata: req.RunMetadata, - - URLs: req.GetURLs(true), - - AuthConfigs: req.GetAuthConfigs(), - - AllowDomains: req.GetAllowDomains(), - AllowPaths: req.GetAllowPaths(), - DenyPaths: req.GetDenyPaths(), - - SkipRobots: req.SkipRobots, - SkipSitemapDiscovery: req.SkipSitemapDiscovery, - - WebhookConfig: req.WebhookResultStoreConfig, - }, - } - - // Ensure we make the run configuration available in the store, before - // we start publishing to the work queue. - runs.Add(ctx, run) - - go run.Start(reqctx, queue, progress, rs, req.GetURLs(true)) - - result := &APIResponse{ - Run: run.ID, - } - w.WriteHeader(http.StatusOK) - json.NewEncoder(w).Encode(result) - }) - - if UseMetrics { - apirouter.Handle("GET /metrics", promhttp.Handler()) - } - + // Set up and start the main API server slog.Info("Starting HTTP API server...", "host", ListenHost, "port", ListenPort) apiserver := &http.Server{ Addr: fmt.Sprintf("%s:%d", ListenHost, ListenPort), - Handler: otelhttp.NewHandler(apirouter, "get_new_request"), + Handler: setupRoutes(runs, queue, progress, rs), } go func() { if err := apiserver.ListenAndServe(); !errors.Is(err, http.ErrServerClosed) { @@ -336,22 +231,11 @@ func main() { }() tear(apiserver.Shutdown) + // Set up and start the healthcheck server slog.Info("Starting HTTP Healthcheck server...", "port", HealthcheckListenPort) - hcrouter := http.NewServeMux() - - // Supports HEAD requests as well. - hcrouter.HandleFunc("GET /healthz", func(w http.ResponseWriter, r *http.Request) { - w.Header().Set("Content-Type", "text/plain") - r.Body.Close() - - // TODO: Add actual healthchecking logic here. 
- - fmt.Fprint(w, "OK") - }) - hcserver := &http.Server{ Addr: fmt.Sprintf(":%d", HealthcheckListenPort), - Handler: hcrouter, + Handler: setupHealthcheckRoutes(), } go func() { if err := hcserver.ListenAndServe(); !errors.Is(err, http.ErrServerClosed) { diff --git a/routes.go b/routes.go new file mode 100644 index 0000000..f4decb0 --- /dev/null +++ b/routes.go @@ -0,0 +1,136 @@ +package main + +import ( + "bytes" + "encoding/json" + "fmt" + "io/ioutil" + "log/slog" + "net/http" + "tobey/internal/ctrlq" + + "github.com/prometheus/client_golang/prometheus/promhttp" + "go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp" +) + +func setupRoutes(runs *RunManager, queue ctrlq.VisitWorkQueue, progress ProgressReporter, rs ResultReporter) http.Handler { + apirouter := http.NewServeMux() + + apirouter.HandleFunc("GET /", func(w http.ResponseWriter, r *http.Request) { + w.Header().Set("Content-Type", "text/plain") + r.Body.Close() + + fmt.Fprint(w, "Hello from Tobey.") + }) + + apirouter.HandleFunc("POST /", func(w http.ResponseWriter, r *http.Request) { + body, _ := ioutil.ReadAll(r.Body) + r.Body.Close() + + slog.Debug("Handling incoming request for crawl run...") + + // The context of the HTTP request might contain OpenTelemetry information, + // i.e. SpanID or TraceID. If this is the case the line below creates + // a sub span. Otherwise we'll start a new root span here. + reqctx, span := tracer.Start(r.Context(), "receive_crawl_request") + // This ends the very first span in handling the crawl run. It ends the HTTP handling span. + defer span.End() + + w.Header().Set("Content-Type", "application/json") + + var req APIRequest + if bytes.HasPrefix(body, []byte("http://")) || bytes.HasPrefix(body, []byte("https://")) { + // As a special case, and to support minimalism, we allow directly + // posting a single URL. + req.URL = string(body) + } else { + err := json.Unmarshal(body, &req) + if err != nil { + slog.Error("Failed to parse incoming JSON.", "error", err) + + result := &APIError{ + Message: fmt.Sprintf("%s", err), + } + w.WriteHeader(http.StatusBadRequest) + json.NewEncoder(w).Encode(result) + return + } + } + + if ok := req.Validate(); !ok { + result := &APIError{ + Message: "Invalid request.", + } + w.WriteHeader(http.StatusBadRequest) + json.NewEncoder(w).Encode(result) + return + } + + id, err := req.GetRun() + if err != nil { + slog.Error("Failed to parse given run as UUID.", "run", req.Run) + + result := &APIError{ + Message: fmt.Sprintf("Failed to parse given run (%s) as UUID or number.", req.Run), + } + + w.WriteHeader(http.StatusBadRequest) + json.NewEncoder(w).Encode(result) + return + } + + run := &Run{ + SerializableRun: SerializableRun{ + ID: id, + Metadata: req.RunMetadata, + + URLs: req.GetURLs(true), + + AuthConfigs: req.GetAuthConfigs(), + + AllowDomains: req.GetAllowDomains(), + AllowPaths: req.GetAllowPaths(), + DenyPaths: req.GetDenyPaths(), + + SkipRobots: req.SkipRobots, + SkipSitemapDiscovery: req.SkipSitemapDiscovery, + + WebhookConfig: req.WebhookResultStoreConfig, + }, + } + + // Ensure we make the run configuration available in the store, before + // we start publishing to the work queue. 
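+		// Otherwise a worker might dequeue a visit job for a run that it
+		// cannot yet look up.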
+		runs.Add(reqctx, run)
+
+		go run.Start(reqctx, queue, progress, rs, req.GetURLs(true))
+
+		result := &APIResponse{
+			Run: run.ID,
+		}
+		w.WriteHeader(http.StatusOK)
+		json.NewEncoder(w).Encode(result)
+	})
+
+	if UseMetrics {
+		apirouter.Handle("GET /metrics", promhttp.Handler())
+	}
+
+	return otelhttp.NewHandler(apirouter, "get_new_request")
+}
+
+func setupHealthcheckRoutes() http.Handler {
+	hcrouter := http.NewServeMux()
+
+	// Supports HEAD requests as well.
+	hcrouter.HandleFunc("GET /healthz", func(w http.ResponseWriter, r *http.Request) {
+		w.Header().Set("Content-Type", "text/plain")
+		r.Body.Close()
+
+		// TODO: Add actual healthchecking logic here.
+
+		fmt.Fprint(w, "OK")
+	})
+
+	return hcrouter
+}

From d0e7b550f44ab5bdbeb73799113b846b90de4b4b Mon Sep 17 00:00:00 2001
From: Marius Wilms
Date: Thu, 13 Feb 2025 15:33:47 +0100
Subject: [PATCH 49/57] Fix usage of snapshot

---
 internal/ctrlq/main_test.go     | 60 +++++++++++++++++++++++++++++++++
 internal/ctrlq/memory.go        |  8 +----
 internal/ctrlq/promoter.go      | 32 +++++++++++-------
 internal/ctrlq/promoter_test.go | 54 ++++++++++++++++------------
 4 files changed, 110 insertions(+), 44 deletions(-)
 create mode 100644 internal/ctrlq/main_test.go

diff --git a/internal/ctrlq/main_test.go b/internal/ctrlq/main_test.go
new file mode 100644
index 0000000..65c0f18
--- /dev/null
+++ b/internal/ctrlq/main_test.go
@@ -0,0 +1,60 @@
+package ctrlq
+
+import (
+	"context"
+	"testing"
+	"time"
+)
+
+// TestVisitWorkQueueLifecycle tests the lifecycle of a VisitWorkQueue by first
+// publishing a URL and then consuming it.
+func TestVisitWorkQueueLifecycle(t *testing.T) {
+	queue := NewMemoryVisitWorkQueue()
+
+	ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second)
+	defer cancel()
+
+	// Open the queue
+	if err := queue.Open(ctx); err != nil {
+		t.Fatal(err)
+	}
+	defer queue.Close()
+
+	// Give the queue and the promoter some time to initialize.
+	time.Sleep(500 * time.Millisecond)
+
+	testURL := "https://example.com"
+	testRun := "test-run-1"
+
+	// First publish a URL
+	if err := queue.Publish(ctx, testRun, testURL); err != nil {
+		t.Fatal(err)
+	}
+
+	jobs, errs := queue.Consume(ctx)
+	timeout := time.After(4 * time.Second)
+
+	select {
+	case job := <-jobs:
+		if job == nil {
+			t.Fatal("received nil job")
+		}
+		// Verify the job details
+		if job.URL != testURL {
+			t.Errorf("expected URL %q, got %q", testURL, job.URL)
+		}
+		if job.Run != testRun {
+			t.Errorf("expected run %q, got %q", testRun, job.Run)
+		}
+
+	case err := <-errs:
+		t.Error(err)
+
+	case <-timeout:
+		t.Error("test timed out waiting for job")
+
+	case <-ctx.Done():
+		t.Error("context deadline exceeded")
+	}
+}
diff --git a/internal/ctrlq/memory.go b/internal/ctrlq/memory.go
index 1ca6ecf..9e89cb5 100644
--- a/internal/ctrlq/memory.go
+++ b/internal/ctrlq/memory.go
@@ -46,15 +46,9 @@ type MemoryVisitWorkQueue struct {
 }
 
 func (wq *MemoryVisitWorkQueue) Open(ctx context.Context) error {
-	snap := make(map[uint32]*ControlledQueue)
-
-	wq.hqueues.Range(func(key any, value any) bool {
-		snap[key.(uint32)] = value.(*ControlledQueue)
-		return true
-	})
 
 	go func() {
-		promoteLoop(ctx, wq.dqueue, snap, wq.shouldRecalc)
+		promoteLoop(ctx, wq.dqueue, &wq.hqueues, wq.shouldRecalc)
 	}()
 	return nil
 }
diff --git a/internal/ctrlq/promoter.go b/internal/ctrlq/promoter.go
index ea90209..1c3a0a5 100644
--- a/internal/ctrlq/promoter.go
+++ b/internal/ctrlq/promoter.go
@@ -11,6 +11,7 @@ package ctrlq
 import (
 	"context"
 	"log/slog"
+	"sync"
 	"sync/atomic"
 	"time"
 )
@@ -22,7 +23,7 @@ var next uint32
 
 // promoteLoop starts a loop that shovels messages from the host queues into
 // the default queue. The promoter is responsible for load balancing the host
 // queues.
-func promoteLoop(ctx context.Context, dqueue chan *VisitMessage, hqueues map[uint32]*ControlledQueue, shouldRecalc chan bool) {
+func promoteLoop(ctx context.Context, dqueue chan *VisitMessage, hqueues *sync.Map, shouldRecalc chan bool) {
 	slog.Debug("Work Queue: Starting promoter...")
 
 	for {
@@ -66,9 +67,12 @@
 			n := atomic.AddUint32(&next, 1)
 			key := immediate[(int(n)-1)%len(immediate)]
 
-			hq, _ := hqueues[key]
-			// FIXME: The host queue might haven gone poof in the meantime, we should
-			// check if the host queue is still there.
+			value, ok := hqueues.Load(key)
+			if !ok {
+				// Queue has gone *poof* in the meantime, recalculate.
+				goto immediatecalc
+			}
+			hq := value.(*ControlledQueue)
 
 			if promote(ctx, hq.Queue, dqueue) {
 				hq.Limiter.ReleaseReservation()
@@ -84,9 +88,9 @@
 // queue is paused (not only the candidates). When no candidate is found, the
 // caller should wait at least for that time and then try and call this function
 // again.
-func candidates(hqueues map[uint32]*ControlledQueue) ([]uint32, time.Time) {
+func candidates(hqueues *sync.Map) ([]uint32, time.Time) {
 	// Host queue candidates that can be queried immediately.
-	immediate := make([]uint32, 0, len(hqueues))
+	immediate := make([]uint32, 0)
 	var shortestPauseUntil time.Time
 
 	// First calculate which host queues are candidates for immediate
@@ -97,14 +101,15 @@
 	// FIXME: It might be less expensive to first check for the PausedUntil
 	// time and then check the length of the queue, depending on the
 	// underlying implementation of the work queue.
-	for _, hq := range hqueues {
+	hqueues.Range(func(key, value interface{}) bool {
+		hq := value.(*ControlledQueue)
 		hlogger := slog.With("queue.name", hq.Name)
 		hlogger.Debug("Work Queue: Checking if queue is a candidate.", "Queue", hq.String())
 
 		if len(hq.Queue) == 0 {
 			// This host queue is empty, no message to process for that queue,
 			// so don't include it in the immediate list.
-			continue
+			return true // continue iteration
 		}
 
 		// hlogger.Debug("Work Queue: Host queue has messages to process.")
@@ -118,7 +123,7 @@
 				// As this host queue is still paused we don't include
 				// it in the immediate list. Continue to check if the
 				// next host queue is paused or not.
- continue + return true // continue iteration } // If we get here, the current host queue was either never @@ -129,7 +134,7 @@ func candidates(hqueues map[uint32]*ControlledQueue) ([]uint32, time.Time) { ok, delay := hq.Limiter.Reserve() if !ok { hlogger.Warn("Work Queue: Rate limiter cannot provide reservation in max wait time.") - continue + return true // continue iteration } if delay > 0 { @@ -141,11 +146,12 @@ func candidates(hqueues map[uint32]*ControlledQueue) ([]uint32, time.Time) { if shortestPauseUntil.IsZero() || until.Before(shortestPauseUntil) { shortestPauseUntil = until } - continue + return true // continue iteration } } - immediate = append(immediate, hq.ID) - } + immediate = append(immediate, key.(uint32)) + return true + }) return immediate, shortestPauseUntil } diff --git a/internal/ctrlq/promoter_test.go b/internal/ctrlq/promoter_test.go index 580d975..fb9da16 100644 --- a/internal/ctrlq/promoter_test.go +++ b/internal/ctrlq/promoter_test.go @@ -7,6 +7,7 @@ package ctrlq import ( "context" + "sync" "testing" "time" ) @@ -32,15 +33,15 @@ func TestPromoteNothingToRead(t *testing.T) { } func TestCandidatesNothingToDo(t *testing.T) { - hqueues := make(map[uint32]*ControlledQueue) + hqueues := &sync.Map{} - hqueues[1] = &ControlledQueue{ + hqueues.Store(uint32(1), &ControlledQueue{ ID: 1, Name: "example.org", IsAdaptive: false, Queue: make(chan *VisitMessage, 10), Limiter: NewMemoryLimiter(), - } + }) candidates, retry := candidates(hqueues) @@ -53,16 +54,17 @@ func TestCandidatesNothingToDo(t *testing.T) { } func TestCandidatesSingle(t *testing.T) { - hqueues := make(map[uint32]*ControlledQueue) + hqueues := &sync.Map{} - hqueues[1] = &ControlledQueue{ + queue := &ControlledQueue{ ID: 1, Name: "example.org", IsAdaptive: false, Queue: make(chan *VisitMessage, 10), Limiter: NewMemoryLimiter(), } - hqueues[1].Queue <- &VisitMessage{ID: 1} + hqueues.Store(uint32(1), queue) + queue.Queue <- &VisitMessage{ID: 1} candidates, _ := candidates(hqueues) @@ -75,30 +77,32 @@ func TestCandidatesSingle(t *testing.T) { } func TestCandidatesAcquireReservation(t *testing.T) { - hqueues := make(map[uint32]*ControlledQueue) + hqueues := &sync.Map{} - hqueues[1] = &ControlledQueue{ + queue1 := &ControlledQueue{ ID: 1, Name: "example1.org", IsAdaptive: false, Queue: make(chan *VisitMessage, 10), Limiter: NewMemoryLimiter(), } - hqueues[1].Queue <- &VisitMessage{ID: 1} + queue1.Queue <- &VisitMessage{ID: 1} + hqueues.Store(uint32(1), queue1) - hqueues[2] = &ControlledQueue{ + queue2 := &ControlledQueue{ ID: 2, Name: "example2.org", IsAdaptive: false, Queue: make(chan *VisitMessage, 10), Limiter: NewMemoryLimiter(), } - hqueues[2].Queue <- &VisitMessage{ID: 2} + queue2.Queue <- &VisitMessage{ID: 2} + hqueues.Store(uint32(2), queue2) - if hqueues[1].Limiter.HoldsReservation() != false { + if queue1.Limiter.HoldsReservation() != false { t.Errorf("Expected no reservation to be held.") } - if hqueues[2].Limiter.HoldsReservation() != false { + if queue2.Limiter.HoldsReservation() != false { t.Errorf("Expected no reservation to be held.") } @@ -108,16 +112,16 @@ func TestCandidatesAcquireReservation(t *testing.T) { t.Errorf("Expected 2 candidates, got %d", len(candidates)) } - if hqueues[1].Limiter.HoldsReservation() != true { + if queue1.Limiter.HoldsReservation() != true { t.Errorf("Expected reservation to be held.") } - if hqueues[2].Limiter.HoldsReservation() != true { + if queue2.Limiter.HoldsReservation() != true { t.Errorf("Expected reservation to be held.") } } func 
TestCandidatesPausedQueueDoesNotHitLimiterCalcShortest(t *testing.T) { - hqueues := make(map[uint32]*ControlledQueue) + hqueues := &sync.Map{} d1, _ := time.ParseDuration("1s") d2, _ := time.ParseDuration("2s") @@ -125,25 +129,27 @@ func TestCandidatesPausedQueueDoesNotHitLimiterCalcShortest(t *testing.T) { t1 := time.Now().Add(d1).Unix() t2 := time.Now().Add(d2).Unix() - hqueues[1] = &ControlledQueue{ + queue1 := &ControlledQueue{ ID: 1, Name: "example1.org", IsAdaptive: false, Queue: make(chan *VisitMessage, 10), Limiter: NewMemoryLimiter(), } - hqueues[1].Queue <- &VisitMessage{ID: 1} - hqueues[1].pausedUntil.Store(t1) + queue1.Queue <- &VisitMessage{ID: 1} + queue1.pausedUntil.Store(t1) + hqueues.Store(uint32(1), queue1) - hqueues[2] = &ControlledQueue{ + queue2 := &ControlledQueue{ ID: 2, Name: "example2.org", IsAdaptive: false, Queue: make(chan *VisitMessage, 10), Limiter: NewMemoryLimiter(), } - hqueues[2].Queue <- &VisitMessage{ID: 2} - hqueues[2].pausedUntil.Store(t2) + queue2.Queue <- &VisitMessage{ID: 2} + queue2.pausedUntil.Store(t2) + hqueues.Store(uint32(2), queue2) candidates, retry := candidates(hqueues) @@ -156,10 +162,10 @@ func TestCandidatesPausedQueueDoesNotHitLimiterCalcShortest(t *testing.T) { // Should not have hit rate limiter as we paused the queue and // the queue's pause has not yet passed. - if hqueues[1].Limiter.HoldsReservation() != false { + if queue1.Limiter.HoldsReservation() != false { t.Errorf("Expected no reservation to be held.") } - if hqueues[2].Limiter.HoldsReservation() != false { + if queue2.Limiter.HoldsReservation() != false { t.Errorf("Expected no reservation to be held.") } } From d6ca068a350315bf508c60ec8ae138fcc58859a8 Mon Sep 17 00:00:00 2001 From: Marius Wilms Date: Thu, 13 Feb 2025 15:38:17 +0100 Subject: [PATCH 50/57] Use flag --- Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile b/Makefile index 7eaba8e..65cec62 100644 --- a/Makefile +++ b/Makefile @@ -1,6 +1,6 @@ .PHONY: dev dev: - TOBEY_SKIP_CACHE=true TOBEY_DEBUG=true TOBEY_RESULTS_DSN=disk:///tmp/tobey TOBEY_HOST=127.0.0.1 go run . + TOBEY_SKIP_CACHE=true TOBEY_DEBUG=true TOBEY_RESULTS_DSN=disk:///tmp/tobey go run . -host 127.0.0.1 .PHONY: pulse pulse: From e587ee43c206ab673aadb04f330bc591e3ed0b00 Mon Sep 17 00:00:00 2001 From: Marius Wilms Date: Thu, 13 Feb 2025 15:39:16 +0100 Subject: [PATCH 51/57] Remove our hosting config stubs for now --- .fabfile.yaml | 29 ---------- .hosting/scaffold/index.yml | 8 --- .../scaffold/template/app/app-deployment.yml | 53 ------------------- .../scaffold/template/app/app-service.yml | 17 ------ .../scaffold/template/secrets/app-secrets.yml | 18 ------- 5 files changed, 125 deletions(-) delete mode 100644 .fabfile.yaml delete mode 100644 .hosting/scaffold/index.yml delete mode 100644 .hosting/scaffold/template/app/app-deployment.yml delete mode 100644 .hosting/scaffold/template/app/app-service.yml delete mode 100644 .hosting/scaffold/template/secrets/app-secrets.yml diff --git a/.fabfile.yaml b/.fabfile.yaml deleted file mode 100644 index 04eb123..0000000 --- a/.fabfile.yaml +++ /dev/null @@ -1,29 +0,0 @@ -name: tobey - -inheritsFrom: - - ../.fabfile.yaml - -hosts: - ci: - scripts: - lint:tobey: - context: docker-image - image: golangci/golangci-lint:v1.56.2 - script: - - export GOCACHE=/tmp && export GOLANGCI_LINT_CACHE=/tmp && golangci-lint run -v --timeout 600s - - test:tobey: - context: docker-image - image: golang:1.22 - script: - - export GOCACHE=/tmp && go test . 
- - build:tobey: - defaults: - user: gitlab-ci-token - script: - - docker login -u %arguments.user% -p %secret.ci-build-token% %settings.gitlab.registry% - - execute(script, build:service, --arguments service=tobey) - push:tobey: - script: - - execute(script, push:service, --arguments service=tobey) diff --git a/.hosting/scaffold/index.yml b/.hosting/scaffold/index.yml deleted file mode 100644 index 15ce516..0000000 --- a/.hosting/scaffold/index.yml +++ /dev/null @@ -1,8 +0,0 @@ -questions: [] -assets: - - template/app/app-service.yml - - template/app/app-deployment.yml - - template/secrets/app-secrets.yml -scaffold: - - copy_assets(%rootFolder%, assets) - - log_message(info, scaffold message to display) diff --git a/.hosting/scaffold/template/app/app-deployment.yml b/.hosting/scaffold/template/app/app-deployment.yml deleted file mode 100644 index 90c3188..0000000 --- a/.hosting/scaffold/template/app/app-deployment.yml +++ /dev/null @@ -1,53 +0,0 @@ -apiVersion: apps/v1 -kind: Deployment -metadata: - labels: - service_name: tobey - environment: {{ environment }} - app: tobey-deployment - type: backend - name: tobey -spec: - replicas: 2 - selector: - matchLabels: - service_name: tobey - strategy: - type: RollingUpdate - rollingUpdate: - maxSurge: 0 - maxUnavailable: 1 - template: - metadata: - annotations: - lastScaffold: {{ scaffoldTimestamp }} - labels: - service_name: tobey - environment: {{ environment }} - app: tobey-pod - type: backend - spec: - containers: - - image: {{ tobey.image }}:{{ tobey.tag }} - imagePullPolicy: Always - name: tobey - ports: - - containerPort: 8080 - envFrom: - - secretRef: - name: tobey-secret - livenessProbe: - httpGet: - path: /.lagoonhealthz - port: 8080 - timeoutSeconds: 3 - startupProbe: - httpGet: - path: /.lagoonhealthz - port: 8080 - periodSeconds: 10 - failureThreshold: 20 - imagePullSecrets: - - name: {{ registryCredentials }} - restartPolicy: Always - serviceAccountName: "" \ No newline at end of file diff --git a/.hosting/scaffold/template/app/app-service.yml b/.hosting/scaffold/template/app/app-service.yml deleted file mode 100644 index 2c3a3fb..0000000 --- a/.hosting/scaffold/template/app/app-service.yml +++ /dev/null @@ -1,17 +0,0 @@ -apiVersion: v1 -kind: Service -metadata: - labels: - service: tobey - environment: {{ environment }} - app: tobey-service - type: backend - name: tobey -spec: - selector: - service_name: tobey - ports: - - name: http - protocol: TCP - port: 80 - targetPort: 8080 \ No newline at end of file diff --git a/.hosting/scaffold/template/secrets/app-secrets.yml b/.hosting/scaffold/template/secrets/app-secrets.yml deleted file mode 100644 index beac1ae..0000000 --- a/.hosting/scaffold/template/secrets/app-secrets.yml +++ /dev/null @@ -1,18 +0,0 @@ -apiVersion: v1 -kind: Secret -metadata: - name: tobey-secret - labels: - environment: {{ environment }} - app: tobey-service - type: backend -type: Opaque -stringData: - TOBEY_DSN: "http://tobey:9010" - TOBEY_RABBITMQ_DSN: "amqp://{{tobey.rabbitmq.user|raw}}:{{tobey.rabbitmq.pass|raw}}@{{tobey.rabbitmq.host|raw}}:5672/" - TOBEY_REDIS_DSN: "redis:6379/0" - TOBEY_PROGRESS_DSN: "http://progress" - OTEL_EXPORTER_OTLP_TRACES_ENDPOINT: "http://jaeger:4318/v1/traces" - LOG_LEVEL: "ERROR" - TOBEY_ENABLE_TRACING: "true" - TOBEY_DEBUG: "false" \ No newline at end of file From f641d400f7f3bd97cf971afb3ee16dccc50e5ad7 Mon Sep 17 00:00:00 2001 From: Marius Wilms Date: Thu, 13 Feb 2025 15:54:23 +0100 Subject: [PATCH 52/57] Move compose to examples --- compose.yml => examples/compose.yml | 9 
 compose.yml => examples/compose.yml | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)
 rename compose.yml => examples/compose.yml (78%)

diff --git a/compose.yml b/examples/compose.yml
similarity index 78%
rename from compose.yml
rename to examples/compose.yml
index cedbc9f..b641316 100644
--- a/compose.yml
+++ b/examples/compose.yml
@@ -1,7 +1,7 @@
 # A docker compose file that sets up a Tobey instance that is instrumented and
 # exposes metrics to Prometheus via /metrics endpoint. The metrics are then
 # scraped by prometheus. Grafana is used to visualize the metrics scraped by
-# Prometheus.
+# Prometheus. It also configures Redis for coordination.
 
 services:
   tobey:
@@ -11,9 +11,16 @@ services:
       context: .
     environment:
       - TOBEY_TELEMETRY=metrics
+      - TOBEY_REDIS_URL=redis://redis:6379
+      - TOBEY_RESULTS_URL=disk:///tmp/tobey
     volumes:
      - tobey-cache:/cache
 
+  redis:
+    image: redis:latest
+    ports:
+      - "6379:6379"
+
   prometheus:
     image: prom/prometheus
     ports:
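A note on the metrics wiring above: the compose example assumes the instance exposes a Prometheus endpoint, which the `routes.go` from earlier in this series mounts via `promhttp.Handler()` under `GET /metrics`. A minimal self-contained sketch of that pattern follows; the counter name and port are illustrative, not Tobey's actual metrics:

```go
package main

import (
	"log"
	"net/http"

	"github.com/prometheus/client_golang/prometheus"
	"github.com/prometheus/client_golang/prometheus/promhttp"
)

func main() {
	// Illustrative metric; Tobey's real metric names may differ.
	visits := prometheus.NewCounter(prometheus.CounterOpts{
		Name: "example_visits_total",
		Help: "Total number of visits performed.",
	})
	prometheus.MustRegister(visits)
	visits.Inc() // Increment whenever a unit of work completes.

	// promhttp.Handler() serves the default registry; this is the
	// endpoint the Prometheus container in the compose file scrapes.
	http.Handle("/metrics", promhttp.Handler())
	log.Fatal(http.ListenAndServe(":8080", nil))
}
```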
From 47ab61a5eee4834a22feea948649d282148c1b12 Mon Sep 17 00:00:00 2001
From: Marius Wilms
Date: Thu, 13 Feb 2025 15:54:30 +0100
Subject: [PATCH 53/57] Add a full integration test

---
 main_test.go | 139 +++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 139 insertions(+)
 create mode 100644 main_test.go

diff --git a/main_test.go b/main_test.go
new file mode 100644
index 0000000..8987275
--- /dev/null
+++ b/main_test.go
@@ -0,0 +1,139 @@
+package main
+
+import (
+	"bytes"
+	"context"
+	"encoding/json"
+	"net/http"
+	"net/http/httptest"
+	"testing"
+	"time"
+	"tobey/internal/ctrlq"
+)
+
+// TestCrawlRequestSubmission performs a full integration test: on one side we
+// have an instance of tobey, on the other a crawl target. We then start the
+// crawl and check that the results are as expected.
+func TestCrawlRequestSubmission(t *testing.T) {
+	if testing.Short() {
+		t.Skip()
+	}
+
+	ctx, cancel := context.WithCancel(context.Background())
+	defer cancel()
+
+	hits := make(chan string, 100)
+	defer close(hits)
+
+	// Create a test target server that simulates a website to crawl.
+	targets := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
+		hits <- r.URL.Path
+
+		switch r.URL.Path {
+		case "/":
+			w.Write([]byte(`
+				<html>
+				<body>
+					<a href="/page1">Page 1</a>
+					<a href="/page2">Page 2</a>
+				</body>
+				</html>
+			`))
+		case "/page1":
+			w.Write([]byte(`
+				<html>
+				<body>
+					<h1>Page 1</h1>
+					<a href="/page2">Go to Page 2</a>
+				</body>
+				</html>
+			`))
+		case "/page2":
+			w.Write([]byte(`
+				<html>
+				<body>
+					<h1>Page 2</h1>
+					<a href="/page1">Back to Page 1</a>
+				</body>
+				</html>
+			`))
+		default:
+			w.WriteHeader(http.StatusNotFound)
+		}
+	}))
+	defer targets.Close()
+
+	robots := NewRobots()
+	sitemaps := NewSitemaps(robots)
+	runs := NewRunManager(nil, robots, sitemaps)
+
+	queue := ctrlq.CreateWorkQueue(nil)
+	if err := queue.Open(ctx); err != nil {
+		t.Fatal(err)
+	}
+
+	rs, err := CreateResultReporter("noop://")
+	if err != nil {
+		t.Fatal(err)
+	}
+
+	progress, err := CreateProgressReporter("noop://")
+	if err != nil {
+		t.Fatal(err)
+	}
+
+	CreateVisitWorkersPool(
+		ctx,
+		1,
+		runs,
+		queue,
+		progress,
+		rs,
+	)
+
+	tobeys := httptest.NewServer(setupRoutes(runs, queue, progress, rs))
+	defer tobeys.Close()
+
+	client := &http.Client{}
+	req, err := http.NewRequestWithContext(ctx, "POST", tobeys.URL, bytes.NewBufferString(targets.URL))
+	if err != nil {
+		t.Fatal(err)
+	}
+
+	res, err := client.Do(req)
+	if err != nil {
+		t.Fatal(err)
+	}
+	defer res.Body.Close()
+
+	if res.StatusCode != http.StatusOK {
+		t.Errorf("Expected status OK, got %v", res.Status)
+	}
+
+	var apires APIResponse
+	if err := json.NewDecoder(res.Body).Decode(&apires); err != nil {
+		t.Fatal(err)
+	}
+
+	if apires.Run == "" {
+		t.Error("Expected non-empty run ID")
+	}
+
+	// Wait a moment for the crawl to be performed.
+	time.Sleep(100 * time.Millisecond)
+
+	// We now want to check that exactly 5 requests were made, making sure we
+	// don't block on reading from the channel:
+	// 3 page requests + 1 robots.txt request + 1 sitemap.xml request = 5 requests.
+	for i := 0; i < 5; i++ {
+		select {
+		case hit := <-hits:
+			if hit == "/" || hit == "/page1" || hit == "/page2" || hit == "/robots.txt" || hit == "/sitemap.xml" {
+				continue
+			} else {
+				t.Errorf("Unexpected hit: %s", hit)
+			}
+		case <-time.After(1 * time.Second):
+			t.Errorf("Expected 5 hits, got %d", i)
+		}
+	}
+}
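With the integration test above in place, the same submission flow can be exercised manually against a running instance. Below is a hedged sketch of both request forms the `POST /` handler accepts: a bare URL body, and a JSON body. The address and the JSON field name are assumptions for illustration; check the `APIRequest` struct tags for the actual names.

```go
package main

import (
	"bytes"
	"fmt"
	"net/http"
)

func main() {
	endpoint := "http://127.0.0.1:8080/" // Assumed listen address; adjust to your config.

	// Form 1: a bare URL as the body. The handler detects the
	// "http://"/"https://" prefix and treats it as a single-URL run.
	res, err := http.Post(endpoint, "text/plain", bytes.NewBufferString("https://example.org"))
	if err != nil {
		panic(err)
	}
	res.Body.Close()
	fmt.Println("bare URL submission:", res.Status)

	// Form 2: a JSON body, decoded into APIRequest. The "url" field
	// name is hypothetical and used here for illustration only.
	res, err = http.Post(endpoint, "application/json", bytes.NewBufferString(`{"url": "https://example.org"}`))
	if err != nil {
		panic(err)
	}
	res.Body.Close()
	fmt.Println("JSON submission:", res.Status)
}
```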
From d1521eae9147e5e770618776b3429e8bd0cbf3c1 Mon Sep 17 00:00:00 2001
From: Marius Wilms
Date: Thu, 13 Feb 2025 16:11:46 +0100
Subject: [PATCH 54/57] Add test pipeline

---
 .github/workflows/test.yml | 22 ++++++++++++++++++++++
 1 file changed, 22 insertions(+)
 create mode 100644 .github/workflows/test.yml

diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml
new file mode 100644
index 0000000..95cb0b1
--- /dev/null
+++ b/.github/workflows/test.yml
@@ -0,0 +1,22 @@
+name: Tests
+
+on:
+  push:
+    branches: [ main, next ]
+  pull_request:
+    branches: [ main, next ]
+
+jobs:
+  test:
+    runs-on: ubuntu-latest
+
+    steps:
+    - uses: actions/checkout@v4
+
+    - name: Set up Go
+      uses: actions/setup-go@v4
+      with:
+        go-version: '1.21'
+
+    - name: Run Tests
+      run: go test -v ./...
\ No newline at end of file
From 295063e46060f10fdc4dbcd1ccbe26282a943297 Mon Sep 17 00:00:00 2001
From: Marius Wilms
Date: Thu, 13 Feb 2025 16:16:33 +0100
Subject: [PATCH 55/57] Run all tests

---
 Makefile | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Makefile b/Makefile
index 65cec62..dd70ef6 100644
--- a/Makefile
+++ b/Makefile
@@ -8,7 +8,7 @@ pulse:
 
 .PHONY: test
 test:
-	TOBEY_SKIP_CACHE=true TOBEY_DEBUG=true go test
+	TOBEY_SKIP_CACHE=true TOBEY_DEBUG=true go test -v ./...
 
 .PHONY: clean
 clean:

From c9576cdb2f7880b558dc711e62dc3d90abd24348 Mon Sep 17 00:00:00 2001
From: Marius Wilms
Date: Thu, 13 Feb 2025 16:21:53 +0100
Subject: [PATCH 56/57] Check for closed channel

---
 internal/ctrlq/memory.go | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/internal/ctrlq/memory.go b/internal/ctrlq/memory.go
index 9e89cb5..608dc84 100644
--- a/internal/ctrlq/memory.go
+++ b/internal/ctrlq/memory.go
@@ -145,6 +145,14 @@ func (wq *MemoryVisitWorkQueue) Consume(ctx context.Context) (<-chan *VisitJob,
 			case msg := <-wq.dqueue:
 				// slog.Debug("Work Queue: Received message, forwarding to results channel.", "msg.id", p.Message.ID)
 
+				if msg == nil {
+					slog.Debug("Work Queue: Detected closed channel, closing channels.")
+
+					close(reschan)
+					close(errchan)
+					return
+				}
+
 				// Initialize the context for the job. Then extract the tracing
 				// information from the carrier into the job's context.
 				jctx := context.Background()

From 1d9ca19924eb52068315480615155623bc8c8609 Mon Sep 17 00:00:00 2001
From: Marius Wilms
Date: Thu, 13 Feb 2025 16:26:04 +0100
Subject: [PATCH 57/57] Add release workflow

---
 .github/workflows/release.yml | 53 +++++++++++++++++++++++++++++++++
 1 file changed, 53 insertions(+)
 create mode 100644 .github/workflows/release.yml

diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml
new file mode 100644
index 0000000..6edf172
--- /dev/null
+++ b/.github/workflows/release.yml
@@ -0,0 +1,53 @@
+name: Release
+
+on:
+  push:
+    tags:
+      - 'v*'
+
+jobs:
+  test:
+    runs-on: ubuntu-latest
+    steps:
+    - uses: actions/checkout@v4
+
+    - name: Set up Go
+      uses: actions/setup-go@v4
+      with:
+        go-version: '1.21'
+
+    - name: Run Tests
+      run: go test -v ./...
+
+  release:
+    needs: test
+    runs-on: ubuntu-latest
+
+    steps:
+    - uses: actions/checkout@v4
+
+    - name: Set up Go
+      uses: actions/setup-go@v4
+      with:
+        go-version: '1.21'
+
+    - name: Build for Linux
+      run: |
+        GOOS=linux GOARCH=amd64 go build -o tobey-linux-amd64
+        GOOS=linux GOARCH=arm64 go build -o tobey-linux-arm64
+
+    - name: Build for macOS
+      run: |
+        GOOS=darwin GOARCH=amd64 go build -o tobey-darwin-amd64
+        GOOS=darwin GOARCH=arm64 go build -o tobey-darwin-arm64
+
+    - name: Create Release
+      uses: softprops/action-gh-release@v1
+      with:
+        files: |
+          tobey-linux-amd64
+          tobey-linux-arm64
+          tobey-darwin-amd64
+          tobey-darwin-arm64
+      env:
+        GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
\ No newline at end of file
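A closing note on the closed-channel check in PATCH 56/57: a receive from a closed channel never blocks and yields the element type's zero value, which for a `chan *VisitMessage` is `nil`. That is what the `msg == nil` guard detects. A standalone sketch of the pattern (the types are illustrative):

```go
package main

import "fmt"

type msg struct{ id int }

func main() {
	ch := make(chan *msg, 1)
	ch <- &msg{id: 1}
	close(ch)

	// Buffered messages are still delivered after close.
	fmt.Println(<-ch) // &{1}

	// Subsequent receives return nil (the zero value) immediately.
	if m := <-ch; m == nil {
		fmt.Println("channel closed, shutting down consumer")
	}

	// The comma-ok form distinguishes closure from an explicitly sent
	// nil value; the plain nil check suffices in Consume assuming the
	// queue never publishes nil messages.
	if _, ok := <-ch; !ok {
		fmt.Println("confirmed closed")
	}
}
```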