Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
65 changes: 65 additions & 0 deletions .github/workflows/test.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -94,6 +94,38 @@ jobs:
fi
}

# Like assert_header, but follows redirects (curl -L) and asserts on the
# Content-Type of the final response. Use for URLs that legitimately
# redirect before serving the content (e.g. child repo homepages).
# Only Content-Type is supported because the value is read from curl's
# %{content_type} write-out variable rather than from the raw headers.
#
# Arguments:
#   $1  - URL to request
#   $2  - pattern expected in the final Content-Type; it is handed to grep,
#         so it is a regular expression matched anywhere in the value
#   $3+ - extra arguments forwarded verbatim to curl (e.g. -H "Accept: ...")
# Outputs: the observed Content-Type, plus an error message on mismatch.
# Exits the script with status 1 on mismatch.
function assert_final_content_type() {
  local url=$1
  local expected=$2
  shift 2
  local actual
  # After the shift, "$@" holds only the extra curl arguments; expanding it
  # directly is safe even when it is empty.
  actual=$(curl -s -L -o /dev/null -w "%{content_type}" "$@" "$url")
  echo "→ $url → final Content-Type: $actual"
  # Fail with if/exit in the current shell: an `exit` inside a ( ... )
  # subshell only leaves that subshell, so the script would keep running
  # unless `set -e` happened to be active.
  if ! printf '%s\n' "$actual" | grep -q -- "$expected"; then
    echo "❌ Expected '$expected' in final Content-Type for $url, got '$actual'"
    exit 1
  fi
}

# Asserts that the HTML at $url contains a
#   <link rel="alternate" type="text/markdown" href="$expected_href">
# tag. Attribute order and quoting may vary (the type value may be
# double-quoted, single-quoted or unquoted); we only require
# type=text/markdown and the expected href on the same <link> element.
# AI crawlers rely on this tag to find the .md counterpart of an HTML page.
#
# Arguments:
#   $1 - URL of the HTML page to fetch
#   $2 - href expected inside the matching <link> tag (matched as a fixed
#        string, not as a regular expression)
# Outputs: the matching tag(s) or "no match", plus an error message on failure.
# Exits the script with status 1 when no matching tag is found.
function assert_html_md_alternate() {
  local url=$1
  local expected_href=$2
  local matched
  # The first grep extracts every <link ...> tag that declares the
  # text/markdown type; the second keeps only tags containing the expected
  # href. `|| true` stops a no-match from failing the pipeline so that the
  # check below can report it with a readable message.
  matched=$(curl -s "$url" | grep -oE "<link [^>]*type=[\"']?text/markdown[^>]*>" | grep -F -- "$expected_href" || true)
  echo "→ $url → ${matched:-no match}"
  if [ -z "$matched" ]; then
    echo "❌ Expected <link rel=\"alternate\" type=\"text/markdown\" href=\"$expected_href\"> in HTML for $url"
    exit 1
  fi
}

echo "🧪 Checking open redirect protection..."
# Backslash URLs must not produce redirects (the redirect Location
# would contain \, which browsers normalize to /, creating
Expand All @@ -118,35 +150,68 @@ jobs:
assert_header "http://localhost:8080/llms.txt" "Content-Type" "text/markdown"
assert_header "http://localhost:8080/llms-full.txt" "Content-Type" "text/markdown"

echo "🧪 Checking .md counterparts of pages that are excluded from llms.txt..."
# Regression test for
# https://github.com/apify/apify-docs/pull/2470#discussion_r3161627392:
# pages that are kept out of the llms.txt index (see
# scripts/joinLlmsFiles.mjs) must still serve their .md markdown
# counterparts. Each path below is requested as <path>.md.
for excluded_page in \
  sdk \
  open-source \
  api/v2/actor-builds-get \
  api/v2/dataset-get \
  academy/tutorials; do
  assert_header "http://localhost:8080/${excluded_page}.md" "Content-Type" "text/markdown"
done

echo "🧪 Checking HTML alternate links to .md counterparts..."
# Every page's HTML version (pages excluded from the llms.txt index
# included) has to advertise its .md counterpart through
# <link rel="alternate" type="text/markdown" href="..."> so that AI
# crawlers can discover the markdown version.
# The final entry is a sanity check: a regular (non-excluded) page must
# carry the alternate link as well.
for html_page in \
  sdk \
  open-source \
  api/v2/actor-builds-get \
  api/v2/dataset-get \
  academy/tutorials \
  platform/proxy/usage; do
  assert_html_md_alternate "http://localhost:8080/${html_page}" "https://docs.apify.com/${html_page}.md"
done

# Runs the standard Nginx response checks for one child repo served under
# http://localhost:8080/<repo path>:
#   - the repo homepage is checked with assert_final_content_type (which
#     follows redirects): text/html by default, text/markdown when
#     "Accept: text/markdown" is sent;
#   - one docs page is checked as HTML, as its .md counterpart, and with
#     "Accept: text/markdown";
#   - llms.txt and llms-full.txt must be served as text/markdown.
# Arguments:
#   $1 - repo label printed in the log line
#   $2 - URL path of the repo (no leading slash)
#   $3 - path of the docs page, relative to the repo path
function check_child_repo_responses() {
  local repo_label=$1
  local repo_root="http://localhost:8080/$2"
  local docs_page="${repo_root}/$3"

  echo "🧪 Checking Nginx responses... (${repo_label})"
  assert_final_content_type "$repo_root" "text/html"
  assert_final_content_type "$repo_root" "text/markdown" -H "Accept: text/markdown"
  assert_header "$docs_page" "Content-Type" "text/html"
  assert_header "${docs_page}.md" "Content-Type" "text/markdown"
  assert_header "$docs_page" "Content-Type" "text/markdown" -H "Accept: text/markdown"
  assert_header "${repo_root}/llms.txt" "Content-Type" "text/markdown"
  assert_header "${repo_root}/llms-full.txt" "Content-Type" "text/markdown"
}

check_child_repo_responses "apify-sdk-js" "sdk/js" "docs/introduction/quick-start"
check_child_repo_responses "apify-sdk-python" "sdk/python" "docs/changelog"
check_child_repo_responses "apify-client-js" "api/client/js" "docs/changelog"
check_child_repo_responses "apify-client-python" "api/client/python" "docs/changelog"

echo "🧪 Checking Nginx responses... (apify-cli)"
# The homepage goes through assert_final_content_type, which follows
# redirects; the changelog page is checked as HTML, as its .md
# counterpart, and with "Accept: text/markdown".
cli_home="http://localhost:8080/cli"
cli_changelog="${cli_home}/docs/changelog"
assert_final_content_type "$cli_home" "text/html"
assert_final_content_type "$cli_home" "text/markdown" -H "Accept: text/markdown"
assert_header "$cli_changelog" "Content-Type" "text/html"
assert_header "${cli_changelog}.md" "Content-Type" "text/markdown"
assert_header "$cli_changelog" "Content-Type" "text/markdown" -H "Accept: text/markdown"
Expand Down
65 changes: 6 additions & 59 deletions docusaurus.config.js
Original file line number Diff line number Diff line change
Expand Up @@ -324,67 +324,14 @@ module.exports = {
},
},
},
// NOTE: Do not list pages here just to keep them out of llms.txt -
// anything in `excludeRoutes` also loses its .md counterpart at
// /<route>.md (see https://github.com/signalwire/docusaurus-plugins).
// To exclude a page from the llms.txt index while keeping its .md
// file accessible, add it to LLMS_INDEX_EXCLUDE_PATTERNS in
// scripts/joinLlmsFiles.mjs instead.
excludeRoutes: [
'/',
// API: exclude all deprecated act-* endpoints
'/api/v2/act-*',
// API: exclude individual CRUD endpoint pages (keep Introduction pages)
'/api/v2/actor-build-abort-post',
'/api/v2/actor-build-delete',
'/api/v2/actor-build-get',
'/api/v2/actor-build-log-get',
'/api/v2/actor-build-openapi-json-get',
'/api/v2/actor-builds-get',
'/api/v2/actor-run-*',
'/api/v2/actor-runs-get',
'/api/v2/actor-task-*',
'/api/v2/actor-tasks-get',
'/api/v2/actor-tasks-post',
'/api/v2/acts-get',
'/api/v2/acts-post',
'/api/v2/dataset-*',
'/api/v2/datasets-*',
'/api/v2/key-value-store-*',
'/api/v2/key-value-stores-*',
'/api/v2/log-get',
'/api/v2/post-*',
'/api/v2/request-queue-*',
'/api/v2/request-queues-*',
'/api/v2/schedule-*',
'/api/v2/schedules-get',
'/api/v2/schedules-post',
'/api/v2/store-get',
'/api/v2/tools-*',
'/api/v2/user-get',
'/api/v2/users-me-*',
'/api/v2/webhook-*',
'/api/v2/webhooks-get',
'/api/v2/webhooks-post',
// Academy: exclude legacy JS course
'/academy/scraping-basics-javascript/legacy',
'/academy/scraping-basics-javascript/legacy/**',
// Academy: exclude individual Node.js tutorials (keep index)
'/academy/node-js/*',
// Academy: exclude individual Python tutorials (keep index)
'/academy/python/*',
// Academy: exclude exercise solutions
'/academy/expert-scraping-with-apify/solutions',
'/academy/expert-scraping-with-apify/solutions/**',
// Academy: exclude legacy scraper tutorials (keep index)
'/academy/apify-scrapers/*',
// Academy: exclude marketing playbook deep pages
'/academy/actor-marketing-playbook/**',
// Academy: exclude misc
'/academy/tutorials',
'/academy/php/**',
// Legal: exclude outdated docs
'/legal/old/**',
'/legal/fair-share-program-terms-and-conditions',
'/legal/challenge-terms-and-conditions',
'/legal/candidate-referral-program-terms',
// Misc singleton pages
'/open-source',
'/sdk',
'/search',
],
routeRules: [
Expand Down
1 change: 1 addition & 0 deletions package.json
Original file line number Diff line number Diff line change
Expand Up @@ -74,6 +74,7 @@
"@docusaurus/preset-classic": "~3.9.2",
"@docusaurus/theme-common": "~3.9.2",
"@docusaurus/theme-mermaid": "~3.9.2",
"@docusaurus/utils": "~3.9.2",
"@redocly/cli": "^2.0.0",
"@signalwire/docusaurus-plugin-llms-txt": "^1.2.1",
"clsx": "^2.0.0",
Expand Down
4 changes: 4 additions & 0 deletions pnpm-lock.yaml

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

Loading
Loading