Allow customization of UA, default to new UA
mariuswilms committed Feb 17, 2025
1 parent eb2ff1a commit fe76bbb
Showing 13 changed files with 80 additions and 33 deletions.
3 changes: 3 additions & 0 deletions .env.example
@@ -7,6 +7,9 @@
# Controls the number of workers per instance, by default 5.
# TOBEY_WORKERS=5

# Controls the user agent string, by default "Tobey/0".
# TOBEY_USER_AGENT="Tobey/0"

# DSN specifying where crawl results should be stored, by default no results are stored. Here we store results
# in the "results" directory, relative to the current working directory.
TOBEY_RESULTS_DSN=disk://results
18 changes: 17 additions & 1 deletion README.md
@@ -21,6 +21,7 @@ example configuration that should get you started.
| `TOBEY_DEBUG` | `false` | `true`, `false` | Controls debug mode. |
| `TOBEY_SKIP_CACHE` | `false` | `true`, `false` | Controls caching access. |
| `TOBEY_WORKERS` | `5` | `1-128` | Controls the number of workers per instance. |
| `TOBEY_USER_AGENT` | `Tobey/0` | any string | Default User-Agent string used for requests. |
| `TOBEY_HOST` | empty | i.e. `localhost`, `127.0.0.1` | Host interface to bind the HTTP server to. Empty means listen on all interfaces. Alternatively you can use the `-host` command line flag. |
| `TOBEY_PORT` | `8080` | `1-65535` | Port to bind the HTTP server to. Alternatively you can use the `-port` command line flag. |
| `TOBEY_REDIS_DSN` | empty | i.e. `redis://localhost:6379` | DSN to reach a Redis instance. Only needed when operating multiple instances. |
@@ -172,6 +173,18 @@ the URL under the `url` key alongside the entrypoint:
}
```

### User Agents

If a website requires a specific user agent string, you can override the user
agent for an individual run via the `user_agent` field in the crawl request:

```jsonc
{
"url": "https://example.org",
"user_agent": "CustomBot/1.0" // Overrides the default user agent for this run
}
```
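
For illustration, a minimal Go sketch of submitting such a request. The port matches the `TOBEY_PORT` default above; the root endpoint path is an assumption, so adjust it to how your instance exposes the API:

```go
package main

import (
	"bytes"
	"log"
	"net/http"
)

func main() {
	// Hypothetical example: submit a run with a per-run user agent to a
	// local Tobey instance listening on the default port 8080.
	body := []byte(`{"url": "https://example.org", "user_agent": "CustomBot/1.0"}`)

	resp, err := http.Post("http://127.0.0.1:8080/", "application/json", bytes.NewReader(body))
	if err != nil {
		log.Fatal(err)
	}
	defer resp.Body.Close()

	log.Println("submitted run, status:", resp.Status)
}
```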

## Triggering Runs

Each time you submit a URL to be crawled, a _Run_ is internally created. Tobey
@@ -342,4 +355,7 @@ Tobey can also be configured - on a per-run basis - to crawl websites behind
HTTP basic auth, but **it does not support fetching personalized content**. It is
expected that the website is generally publicly available, and that the content
is the same for all users. When HTTP basic auth is used by the website, it must
only be so in order to prevent early access.

Per-run user agent strings are only used when visiting a website; they are not
used, for example, when forwarding results to a webhook.
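
To make this concrete, a minimal sketch using names from `run.go` and `results_webhook.go` in this commit; the helper itself is hypothetical:

```go
// clientsForRun is a hypothetical helper, shown only to illustrate which
// User-Agent value each client receives after this change.
func clientsForRun(run *Run) (crawler, webhook *http.Client) {
	// Crawling client: the per-run value wins; APIRequest.GetUserAgent
	// already falls back to the global default when user_agent is empty.
	crawler = CreateCrawlerHTTPClient(run.getAuthFn(), run.UserAgent)

	// Webhook (and progress) clients: always use the instance-wide UserAgent.
	webhook = CreateRetryingHTTPClient(NoAuthFn, UserAgent)
	return crawler, webhook
}
```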
9 changes: 9 additions & 0 deletions api.go
@@ -70,6 +70,8 @@ type APIRequest struct {
AllowPaths []string `json:"paths"`
DenyPaths []string `json:"!paths"`

UserAgent string `json:"user_agent"`

WebhookResultStoreConfig *WebhookResultReporterConfig `json:"webhook"`

// If true, we'll bypass the robots.txt check, however we'll still
@@ -186,6 +188,13 @@ func (req *APIRequest) GetAuthConfigs() []*AuthConfig {
return configs
}

func (req *APIRequest) GetUserAgent() string {
if req.UserAgent != "" {
return req.UserAgent
}
return UserAgent
}

func (req *APIRequest) Validate() bool {
if req.Run != "" {
_, err := uuid.Parse(req.Run)
7 changes: 7 additions & 0 deletions .env.example.factorial → examples/.env.example.factorial
@@ -4,6 +4,13 @@
# Controls caching access, uncomment to disable caching. By default caching is enabled.
# TOBEY_SKIP_CACHE=true

# Controls the number of workers per instance, by default 5.
# TOBEY_WORKERS=5

# The value is set to a backwards compatible one. Some sites allowlist this
# specific user agent.
TOBEY_USER_AGENT="WebsiteStandardsBot/1.0"

# DSN specifying where crawl results should be stored, by default no results are stored. Here we store results
# in the "results" directory, relative to the current working directory.
# TOBEY_RESULTS_DSN=disk://results
18 changes: 11 additions & 7 deletions httpclient.go
@@ -60,14 +60,14 @@ type getAuthHeaderFn func(context.Context, *url.URL) (string, bool)

// CreateCrawlerHTTPClient creates a new HTTP client configured and optimized for use
// in crawling actions. It adds caching, tracing, metrics, and authentication support.
func CreateCrawlerHTTPClient(getAuth GetAuthFn) *http.Client {
func CreateCrawlerHTTPClient(getAuth GetAuthFn, ua string) *http.Client {
return &http.Client{
Timeout: 10 * time.Second,
Transport: withMiddlewares(http.DefaultTransport, getAuth),
Transport: withMiddlewares(http.DefaultTransport, getAuth, ua),
}
}

func CreateRetryingHTTPClient(getAuth GetAuthFn) *http.Client {
func CreateRetryingHTTPClient(getAuth GetAuthFn, ua string) *http.Client {
rc := retryablehttp.NewClient()

// Fail a little quicker, as the caller might block until
@@ -83,7 +83,7 @@ func CreateRetryingHTTPClient(getAuth GetAuthFn) *http.Client {
Timeout: 10 * time.Second,
Transport: withMiddlewares(&retryablehttp.RoundTripper{
Client: rc,
}, getAuth),
}, getAuth, ua),
}
}

@@ -98,7 +98,7 @@ func CreateRetryingHTTPClient(getAuth GetAuthFn) *http.Client {
// -> CachingTransport
// -> t (usually http.DefaultTransport)
// [endpoint]
func withMiddlewares(t http.RoundTripper, getAuth GetAuthFn) http.RoundTripper {
func withMiddlewares(t http.RoundTripper, getAuth GetAuthFn, ua string) http.RoundTripper {
if !SkipCache {
// Adds caching support to the client. Please note that the cache is a
// private cache and will store responses that required authentication
@@ -133,7 +133,10 @@ func withMiddlewares(t http.RoundTripper, getAuth GetAuthFn) http.RoundTripper {

// Add the User-Agent header to the transport; these headers should be added
// before going through the caching transport.
t = &UserAgentTransport{Transport: t}
t = &UserAgentTransport{
Transport: t,
UserAgent: ua,
}

// Any request independent if cached or not should be traced
// and have metrics collected.
@@ -161,9 +164,10 @@ func (t *AuthTransport) RoundTrip(req *http.Request) (*http.Response, error) {
// header to each request.
type UserAgentTransport struct {
Transport http.RoundTripper
UserAgent string
}

func (t *UserAgentTransport) RoundTrip(req *http.Request) (*http.Response, error) {
req.Header.Add("User-Agent", UserAgent)
req.Header.Add("User-Agent", t.UserAgent)
return t.Transport.RoundTrip(req)
}
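
For illustration, a minimal sketch of wiring `UserAgentTransport` by hand with the new `UserAgent` field (not part of this commit; it reuses the type defined above and bypasses the rest of the middleware chain):

```go
// Build a bare client whose requests carry a custom User-Agent header.
client := &http.Client{
	Timeout: 10 * time.Second,
	Transport: &UserAgentTransport{
		Transport: http.DefaultTransport,
		UserAgent: "CustomBot/1.0",
	},
}

resp, err := client.Get("https://example.org")
if err != nil {
	log.Fatal(err)
}
defer resp.Body.Close()
```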
2 changes: 1 addition & 1 deletion httpclient_test.go
@@ -38,7 +38,7 @@ func TestRetryingHTTPClientReturnsResponseOn503(t *testing.T) {
}))
defer errserver.Close()

client := CreateRetryingHTTPClient(NoAuthFn)
client := CreateRetryingHTTPClient(NoAuthFn, "test")

resp, err := client.Get(errserver.URL)
if err != nil {
13 changes: 9 additions & 4 deletions main.go
@@ -51,6 +51,10 @@
// The port where the main HTTP server listens and the API is served. This can
// be controlled via the TOBEY_PORT environment variable.
ListenPort int = 8080

// UserAgent to be used with all HTTP requests unless overridden per run.
// Can be controlled via TOBEY_USER_AGENT env var.
UserAgent = "Tobey/0"
)

const (
@@ -67,10 +71,6 @@ const (
// time the host is evicted from memory and the cache. The TTL defaults to 365 days.
HostTTL = 365 * 24 * time.Hour

// UserAgent to be used with all HTTP requests. The value is set to a
// backwards compatible one. Some sites allowlist this specific user agent.
UserAgent = "WebsiteStandardsBot/1.0"

// HTTPCachePath is the absolute or relative path (to the working
// directory) where we store the cache for HTTP responses.
HTTPCachePath = "./cache"
@@ -119,6 +119,11 @@ func configure() {
slog.Info("High Frequency Metrics (Pulse) enabled.")
}

if v := os.Getenv("TOBEY_USER_AGENT"); v != "" {
UserAgent = v
slog.Info("Using custom user agent.", "user_agent", UserAgent)
}

// First check command line args, then fall back to env vars
if flagHost != "" {
ListenHost = flagHost
2 changes: 1 addition & 1 deletion progress.go
@@ -45,7 +45,7 @@ func CreateProgressReporter(dsn string) (ProgressReporter, error) {
case "factorial":
slog.Info("Progress Reporting: Enabled, using Factorial progress service for updates.", "dsn", dsn)
return &FactorialProgressReporter{
client: CreateRetryingHTTPClient(NoAuthFn),
client: CreateRetryingHTTPClient(NoAuthFn, UserAgent),
}, nil
case "noop":
slog.Info("Progress Reporting: Disabled, not sharing progress updates.")
4 changes: 2 additions & 2 deletions results_webhook.go
@@ -43,7 +43,7 @@ func NewWebhookResultReporter(ctx context.Context, endpoint string) *WebhookResu
u, err := url.Parse(endpoint)
if err != nil {
return &WebhookResultReporter{
client: CreateRetryingHTTPClient(NoAuthFn),
client: CreateRetryingHTTPClient(NoAuthFn, UserAgent),
defaultEndpoint: endpoint,
allowDynamicConfig: false,
}
@@ -57,7 +57,7 @@ }
}

return &WebhookResultReporter{
client: CreateRetryingHTTPClient(NoAuthFn),
client: CreateRetryingHTTPClient(NoAuthFn, UserAgent),

defaultEndpoint: cleanEndpoint,
// Presence of the query parameter is sufficient to enable dynamic config. This is,
14 changes: 7 additions & 7 deletions robots.go
@@ -66,35 +66,35 @@ func NewRobots() *Robots {
}

// Check checks whether the given URL is allowed to be fetched by the given user agent.
func (r *Robots) Check(u string, getAuth GetAuthFn, agent string) (bool, error) {
func (r *Robots) Check(u string, getAuth GetAuthFn, ua string) (bool, error) {
p, err := url.Parse(u)
if err != nil {
return false, err
}

robot, err := r.get(NewHostFromURL(p), getAuth)
robot, err := r.get(NewHostFromURL(p), getAuth, ua)
if err != nil {
slog.Info("Robots: Failed to fetch robots.txt file.", "url", u, "error", err)
}
return robot.TestAgent(agent, u), err
return robot.TestAgent(ua, u), err
}

// Sitemaps returns available sitemap URLs for the given host.
func (r *Robots) Sitemaps(u string, getAuth GetAuthFn) ([]string, error) {
func (r *Robots) Sitemaps(u string, getAuth GetAuthFn, ua string) ([]string, error) {
p, err := url.Parse(u)
if err != nil {
return nil, err
}

robot, err := r.get(NewHostFromURL(p), getAuth)
robot, err := r.get(NewHostFromURL(p), getAuth, ua)
if err != nil {
return nil, err
}
return robot.Sitemaps, nil
}

// get ensures that the robots.txt file for the given host is fetched. It will block until the file has been retrieved.
func (r *Robots) get(h *Host, getAuth GetAuthFn) (*robotstxt.RobotsData, error) {
func (r *Robots) get(h *Host, getAuth GetAuthFn, ua string) (*robotstxt.RobotsData, error) {
var robot *robotstxt.RobotsData
var err error
var res *http.Response
@@ -112,7 +112,7 @@ func (r *Robots) get(h *Host, getAuth GetAuthFn) (*robotstxt.RobotsData, error)
return robot, nil
}

client := CreateRetryingHTTPClient(getAuth)
client := CreateRetryingHTTPClient(getAuth, ua)

rurl := fmt.Sprintf("%s://%s/robots.txt", h.PreferredScheme, h.String())
hlogger := slog.With("url", rurl, "host.name", h.Name, "host.port", h.Port)
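
As a usage sketch of the new `ua` parameter (a hypothetical call site; `NoAuthFn` is the no-op auth helper seen elsewhere in this commit):

```go
robots := NewRobots()

// Evaluate robots.txt rules against the same agent string that will be
// used for the actual fetches of this run.
allowed, err := robots.Check("https://example.org/page", NoAuthFn, "CustomBot/1.0")
if err != nil {
	slog.Warn("Could not fetch robots.txt, continuing with the returned verdict.", "error", err)
}
if allowed {
	// ... visit the URL ...
}
```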
2 changes: 2 additions & 0 deletions routes.go
@@ -92,6 +92,8 @@ func setupRoutes(runs *RunManager, queue ctrlq.VisitWorkQueue, progress Progress
AllowPaths: req.GetAllowPaths(),
DenyPaths: req.GetDenyPaths(),

UserAgent: req.GetUserAgent(),

SkipRobots: req.SkipRobots,
SkipSitemapDiscovery: req.SkipSitemapDiscovery,

13 changes: 7 additions & 6 deletions run.go
@@ -45,6 +45,8 @@ type SerializableRun struct {
AllowPaths []string
DenyPaths []string

UserAgent string

SkipRobots bool
SkipSitemapDiscovery bool

@@ -74,7 +76,7 @@ func (r *Run) ShortID() string {

// GetClient configures and returns the http.Client for the Run.
func (r *Run) GetClient() *http.Client {
return CreateCrawlerHTTPClient(r.getAuthFn())
return CreateCrawlerHTTPClient(r.getAuthFn(), r.UserAgent)
}

// getAuthFn returns a GetAuthFn that can be used to get the auth configuration.
@@ -179,8 +181,7 @@ func (r *Run) GetCollector(ctx context.Context, q ctrlq.VisitWorkQueue, p Progre
getCollectFn(r, rs),
)

// TODO: We should be able to pass these into the NewCollector constructor.
c.UserAgent = UserAgent
c.UserAgent = r.UserAgent
c.AllowDomains = r.AllowDomains
c.AllowPaths = r.AllowPaths
c.DenyPaths = r.DenyPaths
@@ -199,16 +200,16 @@ func (r *Run) Start(ctx context.Context, q ctrlq.VisitWorkQueue, p ProgressRepor
// FIXME: This doesn't yet support providing an alternative robots.txt.
for _, u := range urls {
if isProbablySitemap(u) || isProbablySiteindex(u) {
r.sitemaps.Drain(context.WithoutCancel(ctx), r.getAuthFn(), u, c.Enqueue)
r.sitemaps.Drain(context.WithoutCancel(ctx), r.getAuthFn(), r.UserAgent, u, c.Enqueue)
} else {
c.Enqueue(context.WithoutCancel(ctx), u)
}
}

// This only skips *automatic* sitemap discovery; if the user provided sitemaps, we still want to crawl them.
if !r.SkipSitemapDiscovery {
for _, u := range r.sitemaps.Discover(ctx, r.getAuthFn(), urls) {
r.sitemaps.Drain(context.WithoutCancel(ctx), r.getAuthFn(), u, c.Enqueue)
for _, u := range r.sitemaps.Discover(ctx, r.getAuthFn(), r.UserAgent, urls) {
r.sitemaps.Drain(context.WithoutCancel(ctx), r.getAuthFn(), r.UserAgent, u, c.Enqueue)
}
}
}
8 changes: 4 additions & 4 deletions sitemap.go
@@ -44,7 +44,7 @@ type Sitemaps struct {

// Discover discovers sitemaps for the hosts; if the robots.txt has no
// information about them, it falls back to a well-known location.
func (s *Sitemaps) Discover(ctx context.Context, getAuth GetAuthFn, urls []string) []string {
func (s *Sitemaps) Discover(ctx context.Context, getAuth GetAuthFn, ua string, urls []string) []string {
bases := make([]string, 0, len(urls))
for _, u := range urls {
p, err := url.Parse(u)
@@ -61,7 +61,7 @@

sitemaps := make([]string, 0)
for _, base := range bases {
urls, err := s.robots.Sitemaps(base, getAuth) // This may block.
urls, err := s.robots.Sitemaps(base, getAuth, ua) // This may block.

if err != nil {
slog.Error("Sitemaps: Failed to fetch sitemap URLs, taking a well known location.", "error", err)
@@ -81,8 +81,8 @@ //
//
// FIXME: Implement this, might use FFI to use Stephan's Rust sitemap fetcher.
// FIXME: Implement this as a work process and go through the work queue.
func (s *Sitemaps) Drain(ctx context.Context, getAuth GetAuthFn, url string, yieldu func(context.Context, string) error) {
client := CreateRetryingHTTPClient(getAuth)
func (s *Sitemaps) Drain(ctx context.Context, getAuth GetAuthFn, ua string, url string, yieldu func(context.Context, string) error) {
client := CreateRetryingHTTPClient(getAuth, ua)

var resolve func(context.Context, string) error
resolve = func(ctx context.Context, url string) error {
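
And finally a sketch of how the threaded `ua` parameter flows through sitemap discovery and draining (the helper is hypothetical; the `Sitemaps` constructor is not part of this diff):

```go
// enqueueSitemapURLs discovers sitemaps for the given URLs and yields every
// URL contained in them, passing the run's user agent all the way down.
func enqueueSitemapURLs(ctx context.Context, s *Sitemaps, ua string, urls []string) {
	for _, sm := range s.Discover(ctx, NoAuthFn, ua, urls) {
		s.Drain(ctx, NoAuthFn, ua, sm, func(ctx context.Context, u string) error {
			slog.Info("Sitemaps: Yielded URL.", "url", u)
			return nil
		})
	}
}
```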
