Allow customization of UA, default to new UA
mariuswilms committed Feb 17, 2025
1 parent eb2ff1a commit fe76bbb
Showing 13 changed files with 80 additions and 33 deletions.
3 changes: 3 additions & 0 deletions .env.example
@@ -7,6 +7,9 @@
# Controls the number of workers per instance, by default 5.
# TOBEY_WORKERS=5

# Controls the user agent string, by default "Tobey/0".
# TOBEY_USER_AGENT="Tobey/0"

# DSN specifying where crawl results should be stored, by default no results are stored. Here we store results
# in the "results" directory, relative to the current working directory.
TOBEY_RESULTS_DSN=disk://results
18 changes: 17 additions & 1 deletion README.md
@@ -21,6 +21,7 @@ example configuration that should get you started.
| `TOBEY_DEBUG` | `false` | `true`, `false` | Controls debug mode. |
| `TOBEY_SKIP_CACHE` | `false` | `true`, `false` | Controls caching access. |
| `TOBEY_WORKERS` | `5` | `1-128` | Controls the number of workers per instance. |
| `TOBEY_USER_AGENT` | `Tobey/0` | any string | Default User-Agent string used for requests. |
| `TOBEY_HOST` | empty | i.e. `localhost`, `127.0.0.1` | Host interface to bind the HTTP server to. Empty means listen on all interfaces. Alternatively you can use the `-host` command line flag. |
| `TOBEY_PORT` | `8080` | `1-65535` | Port to bind the HTTP server to. Alternatively you can use the `-port` command line flag. |
| `TOBEY_REDIS_DSN` | empty | i.e. `redis://localhost:6379` | DSN to reach a Redis instance. Only needed when operating multiple instances. |
@@ -172,6 +173,18 @@ the URL under the `url` key alongside the entrypoint:
}
```

### User Agents

If a website requires a specific user agent string, you can override the user
agent for an individual run via the `user_agent` field in the crawl request:

```jsonc
{
"url": "https://example.org",
"user_agent": "CustomBot/1.0" // Overrides the default user agent for this run
}
```
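
For illustration, a minimal Go sketch of submitting such a request. The port matches the `TOBEY_PORT` default above; the root endpoint path is an assumption, so adjust it to how your instance exposes the API:

```go
package main

import (
	"bytes"
	"log"
	"net/http"
)

func main() {
	// Hypothetical example: submit a run with a per-run user agent to a
	// local Tobey instance listening on the default port 8080.
	body := []byte(`{"url": "https://example.org", "user_agent": "CustomBot/1.0"}`)

	resp, err := http.Post("http://127.0.0.1:8080/", "application/json", bytes.NewReader(body))
	if err != nil {
		log.Fatal(err)
	}
	defer resp.Body.Close()

	log.Println("submitted run, status:", resp.Status)
}
```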

## Triggering Runs

Each time you submit a URL to be crawled, a _Run_ is internally created. Tobey
@@ -342,4 +355,7 @@ Tobey can also be configured - on a per-run basis - to crawl websites behind
HTTP basic auth, but **it does not support fetching personalized content**. It is
expected that the website is generally publicly available, and that the content
is the same for all users. When HTTP basic auth is used by the website, it must
only be so in order to prevent early access.

Per-run user agent strings are only used when visiting a website; they are not
used, for example, when forwarding results to a webhook.
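
To make this concrete, a minimal sketch using names from `run.go` and `results_webhook.go` in this commit; the helper itself is hypothetical:

```go
// clientsForRun is a hypothetical helper, shown only to illustrate which
// User-Agent value each client receives after this change.
func clientsForRun(run *Run) (crawler, webhook *http.Client) {
	// Crawling client: the per-run value wins; APIRequest.GetUserAgent
	// already falls back to the global default when user_agent is empty.
	crawler = CreateCrawlerHTTPClient(run.getAuthFn(), run.UserAgent)

	// Webhook (and progress) clients: always use the instance-wide UserAgent.
	webhook = CreateRetryingHTTPClient(NoAuthFn, UserAgent)
	return crawler, webhook
}
```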
9 changes: 9 additions & 0 deletions api.go
@@ -70,6 +70,8 @@ type APIRequest struct {
AllowPaths []string `json:"paths"`
DenyPaths []string `json:"!paths"`

UserAgent string `json:"user_agent"`

WebhookResultStoreConfig *WebhookResultReporterConfig `json:"webhook"`

// If true, we'll bypass the robots.txt check, however we'll still
@@ -186,6 +188,13 @@ func (req *APIRequest) GetAuthConfigs() []*AuthConfig {
return configs
}

func (req *APIRequest) GetUserAgent() string {
if req.UserAgent != "" {
return req.UserAgent
}
return UserAgent
}

func (req *APIRequest) Validate() bool {
if req.Run != "" {
_, err := uuid.Parse(req.Run)
7 changes: 7 additions & 0 deletions .env.example.factorial → examples/.env.example.factorial
@@ -4,6 +4,13 @@
# Controls caching access, uncomment to disable caching. By default caching is enabled.
# TOBEY_SKIP_CACHE=true

# Controls the number of workers per instance, by default 5.
# TOBEY_WORKERS=5

# The value is set to a backwards compatible one. Some sites allowlist this
# specific user agent.
TOBEY_USER_AGENT="WebsiteStandardsBot/1.0"

# DSN specifying where crawl results should be stored, by default no results are stored. Here we store results
# in the "results" directory, relative to the current working directory.
# TOBEY_RESULTS_DSN=disk://results
18 changes: 11 additions & 7 deletions httpclient.go
@@ -60,14 +60,14 @@ type getAuthHeaderFn func(context.Context, *url.URL) (string, bool)

// CreateCrawlerHTTPClient creates a new HTTP client configured and optimized for use
// in crawling actions. It adds caching, tracing, metrics, and authentication support.
func CreateCrawlerHTTPClient(getAuth GetAuthFn) *http.Client {
func CreateCrawlerHTTPClient(getAuth GetAuthFn, ua string) *http.Client {
return &http.Client{
Timeout: 10 * time.Second,
Transport: withMiddlewares(http.DefaultTransport, getAuth),
Transport: withMiddlewares(http.DefaultTransport, getAuth, ua),
}
}

func CreateRetryingHTTPClient(getAuth GetAuthFn) *http.Client {
func CreateRetryingHTTPClient(getAuth GetAuthFn, ua string) *http.Client {
rc := retryablehttp.NewClient()

// Fail a little quicker, as the caller might block until
@@ -83,7 +83,7 @@ func CreateRetryingHTTPClient(getAuth GetAuthFn) *http.Client {
Timeout: 10 * time.Second,
Transport: withMiddlewares(&retryablehttp.RoundTripper{
Client: rc,
}, getAuth),
}, getAuth, ua),
}
}

@@ -98,7 +98,7 @@ func CreateRetryingHTTPClient(getAuth GetAuthFn) *http.Client {
// -> CachingTransport
// -> t (usually http.DefaultTransport)
// [endpoint]
func withMiddlewares(t http.RoundTripper, getAuth GetAuthFn) http.RoundTripper {
func withMiddlewares(t http.RoundTripper, getAuth GetAuthFn, ua string) http.RoundTripper {
if !SkipCache {
// Adds caching support to the client. Please note that the cache is a
// private cache and will store responses that required authentication
@@ -133,7 +133,10 @@ func withMiddlewares(t http.RoundTripper, getAuth GetAuthFn) http.RoundTripper {

// Add the User-Agent header to the transport; these headers should be added
// before going through the caching transport.
t = &UserAgentTransport{Transport: t}
t = &UserAgentTransport{
Transport: t,
UserAgent: ua,
}

// Any request independent if cached or not should be traced
// and have metrics collected.
@@ -161,9 +164,10 @@ func (t *AuthTransport) RoundTrip(req *http.Request) (*http.Response, error) {
// header to each request.
type UserAgentTransport struct {
Transport http.RoundTripper
UserAgent string
}

func (t *UserAgentTransport) RoundTrip(req *http.Request) (*http.Response, error) {
req.Header.Add("User-Agent", UserAgent)
req.Header.Add("User-Agent", t.UserAgent)
return t.Transport.RoundTrip(req)
}
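
For illustration, a minimal sketch of wiring `UserAgentTransport` by hand with the new `UserAgent` field (not part of this commit; it reuses the type defined above and bypasses the rest of the middleware chain):

```go
// Build a bare client whose requests carry a custom User-Agent header.
client := &http.Client{
	Timeout: 10 * time.Second,
	Transport: &UserAgentTransport{
		Transport: http.DefaultTransport,
		UserAgent: "CustomBot/1.0",
	},
}

resp, err := client.Get("https://example.org")
if err != nil {
	log.Fatal(err)
}
defer resp.Body.Close()
```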
2 changes: 1 addition & 1 deletion httpclient_test.go
@@ -38,7 +38,7 @@ func TestRetryingHTTPClientReturnsResponseOn503(t *testing.T) {
}))
defer errserver.Close()

client := CreateRetryingHTTPClient(NoAuthFn)
client := CreateRetryingHTTPClient(NoAuthFn, "test")

resp, err := client.Get(errserver.URL)
if err != nil {
13 changes: 9 additions & 4 deletions main.go
@@ -51,6 +51,10 @@
// The port where the main HTTP server listens and the API is served. This can
// be controlled via the TOBEY_PORT environment variable.
ListenPort int = 8080

// UserAgent to be used with all HTTP requests unless overridden per run.
// Can be controlled via TOBEY_USER_AGENT env var.
UserAgent = "Tobey/0"
)

const (
@@ -67,10 +71,6 @@ const (
// time the host is evicted from memory and the cache. The TTL defaults to 365 days.
HostTTL = 365 * 24 * time.Hour

// UserAgent to be used with all HTTP requests. The value is set to a
// backwards compatible one. Some sites allowlist this specific user agent.
UserAgent = "WebsiteStandardsBot/1.0"

// HTTPCachePath is the absolute or relative path (to the working
// directory) where we store the cache for HTTP responses.
HTTPCachePath = "./cache"
@@ -119,6 +119,11 @@ func configure() {
slog.Info("High Frequency Metrics (Pulse) enabled.")
}

if v := os.Getenv("TOBEY_USER_AGENT"); v != "" {
UserAgent = v
slog.Info("Using custom user agent.", "user_agent", UserAgent)
}

// First check command line args, then fall back to env vars
if flagHost != "" {
ListenHost = flagHost
2 changes: 1 addition & 1 deletion progress.go
@@ -45,7 +45,7 @@ func CreateProgressReporter(dsn string) (ProgressReporter, error) {
case "factorial":
slog.Info("Progress Reporting: Enabled, using Factorial progress service for updates.", "dsn", dsn)
return &FactorialProgressReporter{
client: CreateRetryingHTTPClient(NoAuthFn),
client: CreateRetryingHTTPClient(NoAuthFn, UserAgent),
}, nil
case "noop":
slog.Info("Progress Reporting: Disabled, not sharing progress updates.")
4 changes: 2 additions & 2 deletions results_webhook.go
@@ -43,7 +43,7 @@ func NewWebhookResultReporter(ctx context.Context, endpoint string) *WebhookResu
u, err := url.Parse(endpoint)
if err != nil {
return &WebhookResultReporter{
client: CreateRetryingHTTPClient(NoAuthFn),
client: CreateRetryingHTTPClient(NoAuthFn, UserAgent),
defaultEndpoint: endpoint,
allowDynamicConfig: false,
}
@@ -57,7 +57,7 @@ }
}

return &WebhookResultReporter{
client: CreateRetryingHTTPClient(NoAuthFn),
client: CreateRetryingHTTPClient(NoAuthFn, UserAgent),

defaultEndpoint: cleanEndpoint,
// Presence of the query parameter is sufficient to enable dynamic config. This is,
14 changes: 7 additions & 7 deletions robots.go
@@ -66,35 +66,35 @@ func NewRobots() *Robots {
}

// Check checks whether the given URL is allowed to be fetched by the given user agent.
func (r *Robots) Check(u string, getAuth GetAuthFn, agent string) (bool, error) {
func (r *Robots) Check(u string, getAuth GetAuthFn, ua string) (bool, error) {
p, err := url.Parse(u)
if err != nil {
return false, err
}

robot, err := r.get(NewHostFromURL(p), getAuth)
robot, err := r.get(NewHostFromURL(p), getAuth, ua)
if err != nil {
slog.Info("Robots: Failed to fetch robots.txt file.", "url", u, "error", err)
}
return robot.TestAgent(agent, u), err
return robot.TestAgent(ua, u), err
}

// Sitemaps returns available sitemap URLs for the given host.
func (r *Robots) Sitemaps(u string, getAuth GetAuthFn) ([]string, error) {
func (r *Robots) Sitemaps(u string, getAuth GetAuthFn, ua string) ([]string, error) {
p, err := url.Parse(u)
if err != nil {
return nil, err
}

robot, err := r.get(NewHostFromURL(p), getAuth)
robot, err := r.get(NewHostFromURL(p), getAuth, ua)
if err != nil {
return nil, err
}
return robot.Sitemaps, nil
}

// get ensures that the robots.txt file for the given host is fetched. It will block until the file has been retrieved.
func (r *Robots) get(h *Host, getAuth GetAuthFn) (*robotstxt.RobotsData, error) {
func (r *Robots) get(h *Host, getAuth GetAuthFn, ua string) (*robotstxt.RobotsData, error) {
var robot *robotstxt.RobotsData
var err error
var res *http.Response
@@ -112,7 +112,7 @@ func (r *Robots) get(h *Host, getAuth GetAuthFn) (*robotstxt.RobotsData, error)
return robot, nil
}

client := CreateRetryingHTTPClient(getAuth)
client := CreateRetryingHTTPClient(getAuth, ua)

rurl := fmt.Sprintf("%s://%s/robots.txt", h.PreferredScheme, h.String())
hlogger := slog.With("url", rurl, "host.name", h.Name, "host.port", h.Port)
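
As a usage sketch of the new `ua` parameter (a hypothetical call site; `NoAuthFn` is the no-op auth helper seen elsewhere in this commit):

```go
robots := NewRobots()

// Evaluate robots.txt rules against the same agent string that will be
// used for the actual fetches of this run.
allowed, err := robots.Check("https://example.org/page", NoAuthFn, "CustomBot/1.0")
if err != nil {
	slog.Warn("Could not fetch robots.txt, continuing with the returned verdict.", "error", err)
}
if allowed {
	// ... visit the URL ...
}
```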
2 changes: 2 additions & 0 deletions routes.go
@@ -92,6 +92,8 @@ func setupRoutes(runs *RunManager, queue ctrlq.VisitWorkQueue, progress Progress
AllowPaths: req.GetAllowPaths(),
DenyPaths: req.GetDenyPaths(),

UserAgent: req.GetUserAgent(),

SkipRobots: req.SkipRobots,
SkipSitemapDiscovery: req.SkipSitemapDiscovery,

13 changes: 7 additions & 6 deletions run.go
@@ -45,6 +45,8 @@ type SerializableRun struct {
AllowPaths []string
DenyPaths []string

UserAgent string

SkipRobots bool
SkipSitemapDiscovery bool

@@ -74,7 +76,7 @@ func (r *Run) ShortID() string {

// GetClient configures and returns the http.Client for the Run.
func (r *Run) GetClient() *http.Client {
return CreateCrawlerHTTPClient(r.getAuthFn())
return CreateCrawlerHTTPClient(r.getAuthFn(), r.UserAgent)
}

// getAuthFn returns a GetAuthFn that can be used to get the auth configuration.
@@ -179,8 +181,7 @@ func (r *Run) GetCollector(ctx context.Context, q ctrlq.VisitWorkQueue, p Progre
getCollectFn(r, rs),
)

// TODO: We should be able to pass these into the NewCollector constructor.
c.UserAgent = UserAgent
c.UserAgent = r.UserAgent
c.AllowDomains = r.AllowDomains
c.AllowPaths = r.AllowPaths
c.DenyPaths = r.DenyPaths
@@ -199,16 +200,16 @@ func (r *Run) Start(ctx context.Context, q ctrlq.VisitWorkQueue, p ProgressRepor
// FIXME: This doesn't yet support providing an alternative robots.txt.
for _, u := range urls {
if isProbablySitemap(u) || isProbablySiteindex(u) {
r.sitemaps.Drain(context.WithoutCancel(ctx), r.getAuthFn(), u, c.Enqueue)
r.sitemaps.Drain(context.WithoutCancel(ctx), r.getAuthFn(), r.UserAgent, u, c.Enqueue)
} else {
c.Enqueue(context.WithoutCancel(ctx), u)
}
}

// This only skips *automatic* sitemap discovery; if the user provided sitemaps, we still want to crawl them.
if !r.SkipSitemapDiscovery {
for _, u := range r.sitemaps.Discover(ctx, r.getAuthFn(), urls) {
r.sitemaps.Drain(context.WithoutCancel(ctx), r.getAuthFn(), u, c.Enqueue)
for _, u := range r.sitemaps.Discover(ctx, r.getAuthFn(), r.UserAgent, urls) {
r.sitemaps.Drain(context.WithoutCancel(ctx), r.getAuthFn(), r.UserAgent, u, c.Enqueue)
}
}
}
8 changes: 4 additions & 4 deletions sitemap.go
@@ -44,7 +44,7 @@ type Sitemaps struct {

// Discover discovers sitemaps for the hosts; if the robots.txt has no
// information about them, it falls back to a well-known location.
func (s *Sitemaps) Discover(ctx context.Context, getAuth GetAuthFn, urls []string) []string {
func (s *Sitemaps) Discover(ctx context.Context, getAuth GetAuthFn, ua string, urls []string) []string {
bases := make([]string, 0, len(urls))
for _, u := range urls {
p, err := url.Parse(u)
@@ -61,7 +61,7 @@

sitemaps := make([]string, 0)
for _, base := range bases {
urls, err := s.robots.Sitemaps(base, getAuth) // This may block.
urls, err := s.robots.Sitemaps(base, getAuth, ua) // This may block.

if err != nil {
slog.Error("Sitemaps: Failed to fetch sitemap URLs, taking a well known location.", "error", err)
@@ -81,8 +81,8 @@ //
//
// FIXME: Implement this, might use FFI to use Stephan's Rust sitemap fetcher.
// FIXME: Implement this as a work process and go through the work queue.
func (s *Sitemaps) Drain(ctx context.Context, getAuth GetAuthFn, url string, yieldu func(context.Context, string) error) {
client := CreateRetryingHTTPClient(getAuth)
func (s *Sitemaps) Drain(ctx context.Context, getAuth GetAuthFn, ua string, url string, yieldu func(context.Context, string) error) {
client := CreateRetryingHTTPClient(getAuth, ua)

var resolve func(context.Context, string) error
resolve = func(ctx context.Context, url string) error {
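
And finally a sketch of how the threaded `ua` parameter flows through sitemap discovery and draining (the helper is hypothetical; the `Sitemaps` constructor is not part of this diff):

```go
// enqueueSitemapURLs discovers sitemaps for the given URLs and yields every
// URL contained in them, passing the run's user agent all the way down.
func enqueueSitemapURLs(ctx context.Context, s *Sitemaps, ua string, urls []string) {
	for _, sm := range s.Discover(ctx, NoAuthFn, ua, urls) {
		s.Drain(ctx, NoAuthFn, ua, sm, func(ctx context.Context, u string) error {
			slog.Info("Sitemaps: Yielded URL.", "url", u)
			return nil
		})
	}
}
```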
