Skip to content

Commit 7d940f8

Browse files
Feat: Add Cheerio content crawler
2 parents a9d87a7 + eb5260c commit 7d940f8

File tree

14 files changed

+377
-207
lines changed

14 files changed

+377
-207
lines changed

.actor/Dockerfile

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -58,5 +58,8 @@ COPY --from=builder --chown=myuser /home/myuser/dist ./dist
5858
# for most source file changes.
5959
COPY --chown=myuser . ./
6060

61+
# Disable experimental feature warning from Node.js
62+
ENV NODE_NO_WARNINGS=1
63+
6164
# Run the image.
6265
CMD npm run start:prod --silent

.actor/input_schema.json

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -136,6 +136,15 @@
136136
"description": "If enabled, the Actor attempts to close or remove cookie consent dialogs to improve the quality of extracted text. Note that this setting increases the latency.",
137137
"default": true
138138
},
139+
"scrapingTool": {
140+
"title": "Which scraping tool to use",
141+
"type": "string",
142+
"description": "Choose what scraping tool to use for extracting the target web pages. The Browser tool is more powerful and can handle JavaScript heavy websites. While the Plain HTML tool is about two times faster.",
143+
"editor": "select",
144+
"default": "browser-playwright",
145+
"enum": ["browser-playwright", "raw-http"],
146+
"enumTitles": ["Browser (uses Playwright)", "Raw HTTP"]
147+
},
139148
"debugMode": {
140149
"title": "Enable debug mode",
141150
"type": "boolean",

README.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -116,6 +116,7 @@ The `/search` GET HTTP endpoint accepts the following query parameters:
116116
| `dynamicContentWaitSecs` | number | `10` | The maximum time in seconds to wait for dynamic page content to load. The Actor considers the web page as fully loaded once this time elapses or when the network becomes idle. |
117117
| `removeCookieWarnings` | boolean | `true` | If enabled, removes cookie consent dialogs to improve text extraction accuracy. This might increase latency. |
118118
| `removeElementsCssSelector` | string | `see input` | A CSS selector matching HTML elements that will be removed from the DOM, before converting it to text, Markdown, or saving as HTML. This is useful to skip irrelevant page content. The value must be a valid CSS selector as accepted by the `document.querySelectorAll()` function. \n\nBy default, the Actor removes common navigation elements, headers, footers, modals, scripts, and inline images. You can disable the removal by setting this value to some non-existent CSS selector like `dummy_keep_everything`. |
119+
| `scrapingTool` | string | `browser-playwright` | Selects which scraping tool is used to extract the target websites. `browser-playwright` uses a browser and can handle complex, JavaScript-heavy websites. Meanwhile, `raw-http` uses a simple HTTP request to fetch the HTML of the URL; it can't handle websites that rely on JavaScript, but it's about two times faster. |
119120
| `debugMode` | boolean | `false` | If enabled, the Actor will store debugging information in the dataset's debug field. |
120121

121122
<!-- TODO: we should probably add proxyConfiguration -->

src/const.ts

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
import inputSchema from '../.actor/input_schema.json' assert { type: 'json' };
1+
import inputSchema from '../.actor/input_schema.json' with { type: 'json' };
22

33
export enum ContentCrawlerStatus {
44
PENDING = 'pending',
@@ -12,6 +12,11 @@ export enum Routes {
1212
MESSAGE = '/message',
1313
}
1414

15+
export enum ContentCrawlerTypes {
16+
PLAYWRIGHT = 'playwright',
17+
CHEERIO = 'cheerio',
18+
}
19+
1520
export const PLAYWRIGHT_REQUEST_TIMEOUT_NORMAL_MODE_SECS = 60;
1621

1722
// Default values parsed from input_schema.json
@@ -38,4 +43,5 @@ export const defaults = {
3843
serpMaxRetries: inputSchema.properties.serpMaxRetries.default,
3944
serpMaxRetriesMax: inputSchema.properties.serpMaxRetries.maximum,
4045
serpProxyGroup: inputSchema.properties.serpProxyGroup.default,
46+
scrapingTool: inputSchema.properties.scrapingTool.default,
4147
};

src/crawlers.ts

Lines changed: 65 additions & 54 deletions
Original file line numberDiff line numberDiff line change
@@ -12,10 +12,11 @@ import {
1212
RequestOptions,
1313
} from 'crawlee';
1414

15+
import { ContentCrawlerTypes } from './const.js';
1516
import { scrapeOrganicResults } from './google-search/google-extractors-urls.js';
16-
import { failedRequestHandlerPlaywright, requestHandlerPlaywright } from './playwright-req-handler.js';
17+
import { failedRequestHandler, requestHandlerCheerio, requestHandlerPlaywright } from './request-handler.js';
1718
import { addEmptyResultToResponse, sendResponseError } from './responses.js';
18-
import type { PlaywrightCrawlerUserData, SearchCrawlerUserData } from './types.js';
19+
import type { ContentCrawlerOptions, ContentCrawlerUserData, SearchCrawlerUserData } from './types.js';
1920
import { addTimeMeasureEvent, createRequest } from './utils.js';
2021

2122
const crawlers = new Map<string, CheerioCrawler | PlaywrightCrawler>();
@@ -25,42 +26,22 @@ export function getCrawlerKey(crawlerOptions: CheerioCrawlerOptions | Playwright
2526
return JSON.stringify(crawlerOptions);
2627
}
2728

28-
/**
29-
* Creates and starts a Google search crawler and Playwright content crawler with the provided configurations.
30-
* A crawler won't be created if it already exists.
31-
*/
32-
export async function createAndStartCrawlers(
33-
cheerioCrawlerOptions: CheerioCrawlerOptions,
34-
playwrightCrawlerOptions: PlaywrightCrawlerOptions,
35-
startCrawlers: boolean = true,
36-
) {
37-
const { crawler: searchCrawler } = await createAndStartSearchCrawler(
38-
cheerioCrawlerOptions,
39-
startCrawlers,
40-
);
41-
const { key: playwrightCrawlerKey, crawler: playwrightCrawler } = await createAndStartCrawlerPlaywright(
42-
playwrightCrawlerOptions,
43-
startCrawlers,
44-
);
45-
return { searchCrawler, playwrightCrawler, playwrightCrawlerKey };
46-
}
47-
4829
/**
4930
* Creates and starts a Google search crawler with the provided configuration.
5031
* A crawler won't be created if it already exists.
5132
*/
52-
async function createAndStartSearchCrawler(
53-
cheerioCrawlerOptions: CheerioCrawlerOptions,
33+
export async function createAndStartSearchCrawler(
34+
searchCrawlerOptions: CheerioCrawlerOptions,
5435
startCrawler: boolean = true,
5536
) {
56-
const key = getCrawlerKey(cheerioCrawlerOptions);
37+
const key = getCrawlerKey(searchCrawlerOptions);
5738
if (crawlers.has(key)) {
5839
return { key, crawler: crawlers.get(key) };
5940
}
6041

6142
log.info(`Creating new cheerio crawler with key ${key}`);
6243
const crawler = new CheerioCrawler({
63-
...(cheerioCrawlerOptions as CheerioCrawlerOptions),
44+
...(searchCrawlerOptions as CheerioCrawlerOptions),
6445
requestQueue: await RequestQueue.open(key, { storageClient: client }),
6546
requestHandler: async ({ request, $: _$ }: CheerioCrawlingContext<SearchCrawlerUserData>) => {
6647
// NOTE: we need to cast this to fix `cheerio` type errors
@@ -92,10 +73,10 @@ async function createAndStartSearchCrawler(
9273
request.userData.query,
9374
result,
9475
responseId,
95-
request.userData.playwrightScraperSettings!,
76+
request.userData.contentScraperSettings!,
9677
request.userData.timeMeasures!,
9778
);
98-
await addPlaywrightCrawlRequest(r, responseId, request.userData.playwrightCrawlerKey!);
79+
await addContentCrawlRequest(r, responseId, request.userData.contentCrawlerKey!);
9980
}
10081
},
10182
failedRequestHandler: async ({ request }, err) => {
@@ -118,50 +99,78 @@ async function createAndStartSearchCrawler(
11899
}
119100

120101
/**
121-
* Creates and starts a Playwright content crawler with the provided configuration.
102+
* Creates and starts a content crawler with the provided configuration.
103+
* Either Playwright or Cheerio crawler will be created based on the provided crawler options.
122104
* A crawler won't be created if it already exists.
123105
*/
124-
async function createAndStartCrawlerPlaywright(
125-
crawlerOptions: PlaywrightCrawlerOptions,
106+
export async function createAndStartContentCrawler(
107+
contentCrawlerOptions: ContentCrawlerOptions,
126108
startCrawler: boolean = true,
127109
) {
110+
const { type: crawlerType, crawlerOptions } = contentCrawlerOptions;
111+
128112
const key = getCrawlerKey(crawlerOptions);
129113
if (crawlers.has(key)) {
130114
return { key, crawler: crawlers.get(key) };
131115
}
132116

133-
log.info(`Creating new playwright crawler with key ${key}`);
134-
const crawler = new PlaywrightCrawler({
135-
...(crawlerOptions as PlaywrightCrawlerOptions),
136-
keepAlive: crawlerOptions.keepAlive,
137-
requestQueue: await RequestQueue.open(key, { storageClient: client }),
138-
requestHandler: async (context: PlaywrightCrawlingContext) => {
139-
await requestHandlerPlaywright(context as unknown as PlaywrightCrawlingContext<PlaywrightCrawlerUserData>);
140-
},
141-
failedRequestHandler: ({ request }, err) => failedRequestHandlerPlaywright(request, err),
142-
});
117+
const crawler = crawlerType === 'playwright'
118+
? await createPlaywrightContentCrawler(crawlerOptions, key)
119+
: await createCheerioContentCrawler(crawlerOptions, key);
143120

144121
if (startCrawler) {
145122
crawler.run().then(
146-
() => log.warning(`Crawler playwright has finished`),
123+
() => log.warning(`Crawler ${crawlerType} has finished`),
147124
() => {},
148125
);
149-
log.info('Crawler playwright has started 💪🏼');
126+
log.info(`Crawler ${crawlerType} has started 💪🏼`);
150127
}
151128
crawlers.set(key, crawler);
152129
log.info(`Number of crawlers ${crawlers.size}`);
153130
return { key, crawler };
154131
}
155132

133+
async function createPlaywrightContentCrawler(
134+
crawlerOptions: PlaywrightCrawlerOptions,
135+
key: string,
136+
): Promise<PlaywrightCrawler> {
137+
log.info(`Creating new playwright crawler with key ${key}`);
138+
return new PlaywrightCrawler({
139+
...crawlerOptions,
140+
keepAlive: crawlerOptions.keepAlive,
141+
requestQueue: await RequestQueue.open(key, { storageClient: client }),
142+
requestHandler: async (context) => {
143+
await requestHandlerPlaywright(context as unknown as PlaywrightCrawlingContext<ContentCrawlerUserData>);
144+
},
145+
failedRequestHandler: ({ request }, err) => failedRequestHandler(request, err, ContentCrawlerTypes.PLAYWRIGHT),
146+
});
147+
}
148+
149+
async function createCheerioContentCrawler(
150+
crawlerOptions: CheerioCrawlerOptions,
151+
key: string,
152+
): Promise<CheerioCrawler> {
153+
log.info(`Creating new cheerio crawler with key ${key}`);
154+
return new CheerioCrawler({
155+
...crawlerOptions,
156+
keepAlive: crawlerOptions.keepAlive,
157+
requestQueue: await RequestQueue.open(key, { storageClient: client }),
158+
requestHandler: async (context) => {
159+
await requestHandlerCheerio(context as unknown as CheerioCrawlingContext<ContentCrawlerUserData>);
160+
},
161+
failedRequestHandler: ({ request }, err) => failedRequestHandler(request, err, ContentCrawlerTypes.CHEERIO),
162+
});
163+
}
164+
156165
/**
157166
* Adds a search request to the Google search crawler.
158167
* Create a response for the request and set the desired number of results (maxResults).
159168
*/
160169
export const addSearchRequest = async (
161-
request: RequestOptions<PlaywrightCrawlerUserData>,
162-
cheerioCrawlerOptions: CheerioCrawlerOptions,
170+
request: RequestOptions<ContentCrawlerUserData>,
171+
searchCrawlerOptions: CheerioCrawlerOptions,
163172
) => {
164-
const key = getCrawlerKey(cheerioCrawlerOptions);
173+
const key = getCrawlerKey(searchCrawlerOptions);
165174
const crawler = crawlers.get(key);
166175

167176
if (!crawler) {
@@ -174,26 +183,28 @@ export const addSearchRequest = async (
174183
};
175184

176185
/**
177-
* Adds a content crawl request to the Playwright content crawler.
186+
* Adds a content crawl request to selected content crawler.
178187
* Get existing crawler based on crawlerOptions and scraperSettings, if not present -> create new
179188
*/
180-
export const addPlaywrightCrawlRequest = async (
181-
request: RequestOptions<PlaywrightCrawlerUserData>,
189+
export const addContentCrawlRequest = async (
190+
request: RequestOptions<ContentCrawlerUserData>,
182191
responseId: string,
183-
playwrightCrawlerKey: string,
192+
contentCrawlerKey: string,
184193
) => {
185-
const crawler = crawlers.get(playwrightCrawlerKey);
194+
const crawler = crawlers.get(contentCrawlerKey);
195+
const name = crawler instanceof PlaywrightCrawler ? 'playwright' : 'cheerio';
196+
186197
if (!crawler) {
187-
log.error(`Playwright crawler not found: key ${playwrightCrawlerKey}`);
198+
log.error(`Content crawler not found: key ${contentCrawlerKey}`);
188199
return;
189200
}
190201
try {
191202
await crawler.requestQueue!.addRequest(request);
192203
// create an empty result in search request response
193204
// do not use request.uniqueKey as responseId as it is not id of a search request
194205
addEmptyResultToResponse(responseId, request);
195-
log.info(`Added request to the playwright-content-crawler: ${request.url}`);
206+
log.info(`Added request to the ${name}-content-crawler: ${request.url}`);
196207
} catch (err) {
197-
log.error(`Error adding request to playwright-content-crawler: ${request.url}, error: ${err}`);
208+
log.error(`Error adding request to ${name}-content-crawler: ${request.url}, error: ${err}`);
198209
}
199210
};

0 commit comments

Comments
 (0)