-
Notifications
You must be signed in to change notification settings - Fork 765
Commit
Adds a Camoufox-based crawler template (`camoufox-ts`). Compared to the basic `playwright-ts` template, `camoufox-ts` uses the `camoufox-js` package, which finds the correct latest Camoufox binary in [GitHub Releases](https://github.com/daijro/camoufox/releases) assets, downloads it and passes the correct launch options to it. The `main.ts` script is modified to run the downloaded binary with the correct `launchOptions`. Related to #2836
- Loading branch information
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,8 @@ | ||
# configurations | ||
.idea | ||
|
||
# crawlee storage folder | ||
storage | ||
|
||
# installed files | ||
node_modules |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,7 @@ | ||
# This file tells Git which files shouldn't be added to source control | ||
|
||
.idea | ||
dist | ||
node_modules | ||
storage | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,52 @@ | ||
# Specify the base Docker image. You can read more about | ||
# the available images at https://crawlee.dev/docs/guides/docker-images | ||
# You can also use any other image from Docker Hub. | ||
FROM apify/actor-node-playwright-chrome:20 AS builder | ||
|
||
# Copy just package.json and package-lock.json | ||
# to speed up the build using Docker layer cache. | ||
COPY --chown=myuser package*.json ./ | ||
|
||
# Install all dependencies. Don't audit to speed up the installation. | ||
RUN npm install --include=dev --audit=false | ||
|
||
# Next, copy the source files using the user set | ||
# in the base image. | ||
COPY --chown=myuser . ./ | ||
|
||
# Build the project (the `build` script runs `tsc`, | ||
# which compiles the sources into the `dist` folder). | ||
RUN npm run build | ||
|
||
# Create final image | ||
FROM apify/actor-node-playwright-chrome:20 | ||
|
||
# Copy only built JS files from builder image | ||
COPY --from=builder --chown=myuser /home/myuser/dist ./dist | ||
|
||
# Copy just package.json and package-lock.json | ||
# to speed up the build using Docker layer cache. | ||
COPY --chown=myuser package*.json ./ | ||
|
||
# Install NPM packages, skip optional and development dependencies to | ||
# keep the image small. Avoid logging too much and print the dependency | ||
# tree for debugging | ||
RUN npm --quiet set progress=false \ | ||
&& npm install --omit=dev \ | ||
&& echo "Installed NPM packages:" \ | ||
&& (npm list --omit=dev --all || true) \ | ||
&& echo "Node.js version:" \ | ||
&& node --version \ | ||
&& echo "NPM version:" \ | ||
&& npm --version | ||
|
||
RUN npm run get-binaries | ||
|
||
# Next, copy the remaining files and directories with the source code. | ||
# Since we do this after NPM install, quick build will be really fast | ||
# for most source file changes. | ||
COPY --chown=myuser . ./ | ||
|
||
# Run the image. If you know you won't need headful browsers, | ||
# you can remove the XVFB start script for a micro perf gain. | ||
CMD ./start_xvfb_and_run_cmd.sh && npm run start:prod --silent |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,8 @@ | ||
# Crawlee + PlaywrightCrawler + Camoufox + TypeScript project | ||
|
||
This template is a production-ready boilerplate for developing with `PlaywrightCrawler` and Camoufox. Use this to bootstrap your projects using the most up-to-date code. | ||
|
||
If you're looking for examples or want to learn more, visit: | ||
|
||
- [Documentation](https://crawlee.dev/api/playwright-crawler/class/PlaywrightCrawler) | ||
- [Examples](https://crawlee.dev/docs/examples/playwright-crawler) |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,29 @@ | ||
{ | ||
"name": "crawlee-camoufox-ts", | ||
"version": "0.0.1", | ||
"type": "module", | ||
"description": "This is an example of a Crawlee project.", | ||
"dependencies": { | ||
"camoufox-js": "^0.1.3", | ||
"crawlee": "^3.0.0", | ||
"playwright": "*" | ||
}, | ||
"devDependencies": { | ||
"@apify/tsconfig": "^0.1.0", | ||
"@types/fs-extra": "^11", | ||
"@types/node": "^22.0.0", | ||
"fs-extra": "^11.3.0", | ||
"tsx": "^4.4.0", | ||
"typescript": "~5.7.0" | ||
}, | ||
"scripts": { | ||
"start": "npm run start:dev", | ||
"start:prod": "node dist/main.js", | ||
"start:dev": "tsx src/main.ts", | ||
"build": "tsc", | ||
"test": "echo \"Error: oops, the actor has no tests yet, sad!\" && exit 1", | ||
"get-binaries": "camoufox-js fetch" | ||
This comment has been minimized.
Sorry, something went wrong.
This comment has been minimized.
Sorry, something went wrong.
barjin
Author
Contributor
|
||
}, | ||
"author": "It's not you it's me", | ||
"license": "ISC" | ||
} |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,27 @@ | ||
// For more information, see https://crawlee.dev/ | ||
import { launchOptions } from 'camoufox-js'; | ||
import { PlaywrightCrawler, ProxyConfiguration } from 'crawlee'; | ||
import { firefox } from 'playwright'; | ||
|
||
import { router } from './routes.js'; | ||
|
||
// URLs passed to crawler.run() below as the starting points of the crawl. | ||
const startUrls = ['https://crawlee.dev']; | ||
|
||
// Crawler that handles every request with the imported router and launches | ||
// the browser through Playwright's `firefox` launcher, using the launch | ||
// options returned by `launchOptions()` from camoufox-js. | ||
const crawler = new PlaywrightCrawler({ | ||
// proxyConfiguration: new ProxyConfiguration({ proxyUrls: ['...'] }), | ||
requestHandler: router, | ||
// Comment this option to scrape the full website. | ||
maxRequestsPerCrawl: 20, | ||
launchContext: { | ||
launcher: firefox, | ||
// `launchOptions()` is awaited inline (top-level await), so it resolves | ||
// before the PlaywrightCrawler constructor receives its options. | ||
launchOptions: await launchOptions({ | ||
// Non-headless mode. NOTE(review): presumably why the Dockerfile CMD | ||
// starts XVFB before running the script - confirm. | ||
headless: false, | ||
// Pass your own Camoufox parameters here... | ||
// block_images: true, | ||
// fonts: ['Times New Roman'], | ||
// ... | ||
}), | ||
}, | ||
}); | ||
|
||
// Start the crawl from startUrls and wait for run() to resolve. | ||
await crawler.run(startUrls); |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,21 @@ | ||
import { createPlaywrightRouter } from 'crawlee'; | ||
|
||
// Router created with createPlaywrightRouter(); exported so another module | ||
// can use it as the crawler's request handler. | ||
export const router = createPlaywrightRouter(); | ||
|
||
// Default handler: enqueues links matching the crawlee.dev glob and gives | ||
// them the 'detail' label, i.e. the label the handler below is registered for. | ||
router.addDefaultHandler(async ({ enqueueLinks, log }) => { | ||
log.info(`enqueueing new URLs`); | ||
await enqueueLinks({ | ||
globs: ['https://crawlee.dev/**'], | ||
label: 'detail', | ||
}); | ||
}); | ||
|
||
// 'detail' handler: reads the page title, logs it together with the loaded | ||
// URL, and pushes a { url, title } record via pushData. | ||
router.addHandler('detail', async ({ request, page, log, pushData }) => { | ||
const title = await page.title(); | ||
log.info(`${title}`, { url: request.loadedUrl }); | ||
|
||
await pushData({ | ||
url: request.loadedUrl, | ||
title, | ||
}); | ||
}); |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,12 @@ | ||
{ | ||
"extends": "@apify/tsconfig", | ||
"compilerOptions": { | ||
"module": "NodeNext", | ||
"moduleResolution": "NodeNext", | ||
"target": "ES2022", | ||
"outDir": "dist", | ||
"noUnusedLocals": false, | ||
"lib": ["DOM"] | ||
}, | ||
"include": ["./src/**/*"] | ||
} |
I guess we should add the postinstall hook here as well.