Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions .github/workflows/test.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,9 @@ jobs:
INTERCOM_APP_ID: ${{ secrets.INTERCOM_APP_ID }}
SEGMENT_TOKEN: ${{ secrets.SEGMENT_TOKEN }}

- name: Check llms.txt size
run: npm run test:llms-size

- name: Install Nginx
run: |
sudo apt-get update
Expand Down
62 changes: 61 additions & 1 deletion docusaurus.config.js
Original file line number Diff line number Diff line change
Expand Up @@ -275,7 +275,7 @@ module.exports = {
'@signalwire/docusaurus-plugin-llms-txt',
/** @type {import('@signalwire/docusaurus-plugin-llms-txt').PluginOptions} */
({
siteDescription: 'The entire content of Apify documentation is available in a single Markdown file at https://docs.apify.com/llms-full.txt',
siteDescription: 'Apify is the largest marketplace of tools for AI. 25,000 ready-made Actors to automate your business. Get real-time web data, track competitors, generate leads, analyze sentiment, and orchestrate your apps. Actors are created by a global community of builders earning over $1M every month. Apify takes care of infrastructure, billing, and distribution.\n\nThe entire content of Apify documentation is available in a single Markdown file at https://docs.apify.com/llms-full.txt\n\nFor pricing details, see https://apify.com/pricing',
content: {
includeVersionedDocs: false,
enableLlmsFullTxt: true,
Expand Down Expand Up @@ -326,6 +326,66 @@ module.exports = {
},
excludeRoutes: [
'/',
// API: exclude all deprecated act-* endpoints
'/api/v2/act-*',
// API: exclude individual CRUD endpoint pages (keep Introduction pages)
'/api/v2/actor-build-abort-post',
'/api/v2/actor-build-delete',
'/api/v2/actor-build-get',
'/api/v2/actor-build-log-get',
'/api/v2/actor-build-openapi-json-get',
'/api/v2/actor-builds-get',
'/api/v2/actor-run-*',
'/api/v2/actor-runs-get',
'/api/v2/actor-task-*',
'/api/v2/actor-tasks-get',
'/api/v2/actor-tasks-post',
'/api/v2/acts-get',
'/api/v2/acts-post',
'/api/v2/dataset-*',
'/api/v2/datasets-*',
'/api/v2/key-value-store-*',
'/api/v2/key-value-stores-*',
'/api/v2/log-get',
'/api/v2/post-*',
'/api/v2/request-queue-*',
'/api/v2/request-queues-*',
'/api/v2/schedule-*',
'/api/v2/schedules-get',
'/api/v2/schedules-post',
'/api/v2/store-get',
'/api/v2/tools-*',
'/api/v2/user-get',
'/api/v2/users-me-*',
'/api/v2/webhook-*',
'/api/v2/webhooks-get',
'/api/v2/webhooks-post',
// Academy: exclude legacy JS course
'/academy/scraping-basics-javascript/legacy',
'/academy/scraping-basics-javascript/legacy/**',
// Academy: exclude individual Node.js tutorials (keep index)
'/academy/node-js/*',
// Academy: exclude individual Python tutorials (keep index)
'/academy/python/*',
// Academy: exclude exercise solutions
'/academy/expert-scraping-with-apify/solutions',
'/academy/expert-scraping-with-apify/solutions/**',
// Academy: exclude legacy scraper tutorials (keep index)
'/academy/apify-scrapers/*',
// Academy: exclude marketing playbook deep pages
'/academy/actor-marketing-playbook/**',
// Academy: exclude misc
'/academy/tutorials',
'/academy/php/**',
// Legal: exclude outdated docs
'/legal/old/**',
'/legal/fair-share-program-terms-and-conditions',
'/legal/challenge-terms-and-conditions',
'/legal/candidate-referral-program-terms',
// Misc singleton pages
'/open-source',
'/sdk',
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@marcel-rbro @TC-MO By including the pages here, the markdown variants themselves were removed. (e.g. https://docs.apify.com/sdk.md now returns 404)
I just want to verify whether this is OK and whether it was intended.

Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Good catch. Definitely not okay; all pages need to have their Markdown counterpart (+ .md) working. Can we add integration tests to ensure these pages work?

Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The tests are not checking the homepages of the child repos, but the setup is there already:

https://github.com/apify/apify-docs/actions/runs/25109134352/workflow#L67

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This could fix it: #2480

'/search',
],
routeRules: [
{
Expand Down
1 change: 1 addition & 0 deletions package.json
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,7 @@
"lint:code": "eslint .",
"lint:code:fix": "eslint . --fix",
"test:academy": "bats --print-output-on-failure -r .",
"test:llms-size": "node ./scripts/checkLlmsSize.mjs",
"postinstall": "patch-package",
"postbuild": "node ./scripts/joinLlmsFiles.mjs && node ./scripts/indentLlmsFile.mjs"
},
Expand Down
43 changes: 43 additions & 0 deletions scripts/checkLlmsSize.mjs
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
import fs from 'node:fs/promises';
import path from 'node:path';

// Directory expected to contain llms.txt and llms-full.txt; resolved against the current working directory.
const BUILD_DIR = path.resolve('build');
// llms.txt length above which a warning is printed (the exit code is not changed).
const WARN_LIMIT = 90_000;
// llms.txt length above which an error is printed and the exit code is set to 1.
const ERROR_LIMIT = 100_000;

/**
 * Measure the length of a UTF-8 text file.
 *
 * The file is read directly rather than probed with `fs.access` first: a separate existence
 * check can go stale before the read happens, and it lets other read failures (for example
 * the path being a directory) escape as an unhandled rejection. Here every failure takes the
 * same controlled path.
 *
 * @param {string} filePath - Path of the file to measure.
 * @returns {Promise<number | null>} The `length` of the decoded content (UTF-16 code units,
 *   not bytes), or `null` when the file could not be read. On failure an error is printed
 *   and `process.exitCode` is set to 1.
 */
async function checkFile(filePath) {
    let content;
    try {
        content = await fs.readFile(filePath, 'utf8');
    } catch (err) {
        // Keep the original wording for the common "file is missing" case.
        const reason = err?.code === 'ENOENT'
            ? 'does not exist'
            : `could not be read (${err?.message ?? err})`;
        console.error(`ERROR: ${filePath} ${reason}`);
        process.exitCode = 1;
        return null;
    }
    // String length counts UTF-16 code units, so multi-byte UTF-8 sequences are not over-counted.
    return content.length;
}

// Locations of the two generated files inside the build directory.
const llmsPath = path.join(BUILD_DIR, 'llms.txt');
const llmsFullPath = path.join(BUILD_DIR, 'llms-full.txt');

// Measure both files concurrently; a null entry means that file could not be measured.
const [llmsChars, llmsFullChars] = await Promise.all(
    [llmsPath, llmsFullPath].map((filePath) => checkFile(filePath)),
);

// Nothing meaningful can be reported when either file is unavailable.
const hasMissingFile = [llmsChars, llmsFullChars].includes(null);
if (hasMissingFile) {
    process.exit(1);
}

console.log(`llms.txt: ${llmsChars.toLocaleString()} characters`);
console.log(`llms-full.txt: ${llmsFullChars.toLocaleString()} characters`);

// Only llms.txt is held to the size limits; llms-full.txt is reported for information.
const isOverErrorLimit = llmsChars > ERROR_LIMIT;
const isOverWarnLimit = llmsChars > WARN_LIMIT;

if (!isOverErrorLimit && !isOverWarnLimit) {
    console.log(`\nOK (under ${WARN_LIMIT.toLocaleString()} character target)`);
} else if (isOverErrorLimit) {
    console.error(`\nERROR: llms.txt exceeds ${ERROR_LIMIT.toLocaleString()} character limit`);
    process.exitCode = 1;
} else {
    console.warn(`\nWARNING: llms.txt exceeds ${WARN_LIMIT.toLocaleString()} characters — consider reducing`);
}
114 changes: 81 additions & 33 deletions scripts/joinLlmsFiles.mjs
Original file line number Diff line number Diff line change
Expand Up @@ -2,31 +2,23 @@ import fs from 'node:fs/promises';
import path from 'node:path';

const BUILD_DIR = path.resolve('build');
const CURATED_FILE = path.resolve('scripts/llms-external-curated.txt');

const FILES_ROUTES = {
'llms.txt': [
'https://docs.apify.com/api/client/js/llms.txt',
'https://docs.apify.com/api/client/python/llms.txt',
'https://docs.apify.com/sdk/js/llms.txt',
'https://docs.apify.com/sdk/python/llms.txt',
'https://docs.apify.com/cli/llms.txt',
],
'llms-full.txt': [
'https://docs.apify.com/api/client/js/llms-full.txt',
'https://docs.apify.com/api/client/python/llms-full.txt',
'https://docs.apify.com/sdk/js/llms-full.txt',
'https://docs.apify.com/sdk/python/llms-full.txt',
'https://docs.apify.com/cli/llms-full.txt',
'https://raw.githubusercontent.com/apify/actor-whitepaper/refs/heads/master/README.md',
'https://raw.githubusercontent.com/apify/actor-whitepaper/refs/heads/master/pages/ACTOR_FILE.md',
'https://raw.githubusercontent.com/apify/actor-whitepaper/refs/heads/master/pages/DATASET_SCHEMA.md',
'https://raw.githubusercontent.com/apify/actor-whitepaper/refs/heads/master/pages/IDEAS.md',
'https://raw.githubusercontent.com/apify/actor-whitepaper/refs/heads/master/pages/INPUT_SCHEMA.md',
'https://raw.githubusercontent.com/apify/actor-whitepaper/refs/heads/master/pages/KEY_VALUE_STORE_SCHEMA.md',
'https://raw.githubusercontent.com/apify/actor-whitepaper/refs/heads/master/pages/OUTPUT_SCHEMA.md',
'https://raw.githubusercontent.com/apify/actor-whitepaper/refs/heads/master/pages/REQUEST_QUEUE_SCHEMA.md',
],
};
// Remote documents that joinFiles() fetches and appends to build/llms-full.txt:
// the llms-full.txt files published under docs.apify.com for the API clients, SDKs and CLI,
// followed by Markdown pages from the apify/actor-whitepaper repository.
const EXTERNAL_FETCH_URLS = [
'https://docs.apify.com/api/client/js/llms-full.txt',
'https://docs.apify.com/api/client/python/llms-full.txt',
'https://docs.apify.com/sdk/js/llms-full.txt',
'https://docs.apify.com/sdk/python/llms-full.txt',
'https://docs.apify.com/cli/llms-full.txt',
'https://raw.githubusercontent.com/apify/actor-whitepaper/refs/heads/master/README.md',
'https://raw.githubusercontent.com/apify/actor-whitepaper/refs/heads/master/pages/ACTOR_FILE.md',
'https://raw.githubusercontent.com/apify/actor-whitepaper/refs/heads/master/pages/DATASET_SCHEMA.md',
'https://raw.githubusercontent.com/apify/actor-whitepaper/refs/heads/master/pages/IDEAS.md',
'https://raw.githubusercontent.com/apify/actor-whitepaper/refs/heads/master/pages/INPUT_SCHEMA.md',
'https://raw.githubusercontent.com/apify/actor-whitepaper/refs/heads/master/pages/KEY_VALUE_STORE_SCHEMA.md',
'https://raw.githubusercontent.com/apify/actor-whitepaper/refs/heads/master/pages/OUTPUT_SCHEMA.md',
'https://raw.githubusercontent.com/apify/actor-whitepaper/refs/heads/master/pages/REQUEST_QUEUE_SCHEMA.md',
];

async function fetchFile(route) {
try {
Expand All @@ -39,16 +31,70 @@ async function fetchFile(route) {
}
}

// Desired section order for llms.txt — sections not listed here appear at the end in original order
const SECTION_ORDER = [
    'Platform documentation',
    'Apify API',
    'Apify API client for JavaScript',
    'Apify API client for Python',
    'Apify SDK for JavaScript',
    'Apify SDK for Python',
    'Apify CLI',
    'Apify academy',
    'Legal documents',
];

/**
 * Reorder the `## ` sections of an llms.txt document.
 *
 * Everything before the first `## ` heading is treated as the header and stays in place.
 * Sections whose heading text appears in `order` come first, in that order; all remaining
 * sections follow in their original relative order. Each section is emitted with exactly one
 * trailing newline, and sections are separated by a blank line.
 *
 * @param {string} content - Full text of the document.
 * @param {string[]} [order=SECTION_ORDER] - Heading texts (without the `## ` prefix) in the desired order.
 * @returns {string} The document with its sections reordered, or `content` unchanged when
 *   it contains no `## ` heading.
 */
function reorderSections(content, order = SECTION_ORDER) {
    const firstHeading = /^## .+$/m.exec(content);
    if (!firstHeading) return content;

    const header = content.slice(0, firstHeading.index);
    const body = content.slice(firstHeading.index);

    // Split right before every line starting with "## " so each part begins with its own heading.
    const sections = body.split(/^(?=## )/gm).map((text) => {
        const heading = text.match(/^## (.+)$/m);
        const name = heading ? heading[1].trim() : '';
        const position = order.indexOf(name);
        // Unlisted sections all share one rank past the end of the list. The rank is computed
        // once here so the comparator never has to inspect the array while it is being sorted.
        return { text, rank: position === -1 ? order.length : position };
    });

    // Array.prototype.sort is stable, so sections with equal rank keep their original order.
    sections.sort((a, b) => a.rank - b.rank);

    return header + sections.map((section) => section.text.replace(/\n*$/, '\n')).join('\n');
}

/**
 * Assemble the final llms.txt and llms-full.txt files inside BUILD_DIR.
 *
 * Creates BUILD_DIR if needed, appends the curated file to build/llms.txt and rewrites it with
 * its sections reordered, then appends the fetched external documents to build/llms-full.txt.
 *
 * NOTE(review): this listing appears to come from a diff view and seems to interleave the
 * previous FILES_ROUTES loop with the code that replaces it — confirm which statements are live.
 *
 * @returns {Promise<void>}
 */
async function joinFiles() {
await fs.mkdir(BUILD_DIR, { recursive: true });
// For each target file, fetch all of its routes in parallel and append the non-empty results,
// separated by blank lines. (filter(Boolean) suggests fetchFile can yield a falsy value — its
// body is not visible here, TODO confirm.)
for (const [llmsFile, files] of Object.entries(FILES_ROUTES)) {
const contents = await Promise.all(
files.map((route) => fetchFile(route)),
);
const joined = contents.filter(Boolean).join('\n\n');
await fs.appendFile(path.join(BUILD_DIR, llmsFile), joined, 'utf8');
console.log(`Wrote ${llmsFile} to build/`);
}
const llmsPath = path.join(BUILD_DIR, 'llms.txt');

// llms.txt: append curated static content, then reorder all sections
const curatedContent = await fs.readFile(CURATED_FILE, 'utf8');
await fs.appendFile(llmsPath, curatedContent, 'utf8');

// Read the file back so the reordering sees both the existing and the appended content.
const combined = await fs.readFile(llmsPath, 'utf8');
await fs.writeFile(llmsPath, reorderSections(combined), 'utf8');
console.log('Wrote and reordered build/llms.txt');

// llms-full.txt: fetch and append full content from external repos (unchanged behavior)
const contents = await Promise.all(
EXTERNAL_FETCH_URLS.map((route) => fetchFile(route)),
);
// Drop falsy results before joining with blank lines.
const joined = contents.filter(Boolean).join('\n\n');
await fs.appendFile(path.join(BUILD_DIR, 'llms-full.txt'), joined, 'utf8');
console.log('Wrote llms-full.txt to build/');
}

async function sanitizeFile(filePath) {
Expand All @@ -63,4 +109,6 @@ joinFiles().catch((err) => {
process.exit(1);
});

Object.keys(FILES_ROUTES).forEach((llmsFile) => sanitizeFile(path.join(BUILD_DIR, llmsFile)));
for (const llmsFile of ['llms.txt', 'llms-full.txt']) {
sanitizeFile(path.join(BUILD_DIR, llmsFile));
}
Loading