Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add cookie popup blocking via adblock-rs #187

Draft
wants to merge 1 commit into
base: main
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
11 changes: 10 additions & 1 deletion Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -3,9 +3,15 @@ ARG BROWSER_VERSION=105

FROM ${BROWSER_IMAGE_BASE}:${BROWSER_VERSION}

# TODO: Move this into base image
RUN apt-get update && apt-get install -y jq

ENV RUSTUP_HOME=/rust
ENV CARGO_HOME=/cargo
ENV PATH=/cargo/bin:/rust/bin:$PATH

RUN echo "(curl https://sh.rustup.rs -sSf | sh -s -- -y --default-toolchain nightly --no-modify-path)" > /install-rust.sh && chmod 755 /install-rust.sh
RUN /install-rust.sh

# needed to add args to main build stage
ARG BROWSER_VERSION

Expand Down Expand Up @@ -36,6 +42,9 @@ RUN mkdir -p /tmp/ads && cd /tmp/ads && \
cat ad-hosts.txt | grep '^0.0.0.0 '| awk '{ print $2; }' | grep -v '0.0.0.0' | jq --raw-input --slurp 'split("\n")' > /app/ad-hosts.json && \
rm /tmp/ads/ad-hosts.txt

# Add cookie popup blocklist
RUN curl -vs -o /app/easylist-cookies.txt https://secure.fanboy.co.nz/fanboy-cookiemonster.txt

RUN yarn install

ADD *.js /app/
Expand Down
28 changes: 15 additions & 13 deletions crawler.js
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@ import { getBrowserExe, loadProfile, chromeArgs, getDefaultUA, evaluateWithCLI }

import { BEHAVIOR_LOG_FUNC, HTML_TYPES, DEFAULT_SELECTORS } from "./util/constants.js";

import { AdBlockRules, BlockRules } from "./util/blockrules.js";
import { BlockRules } from "./util/blockrules.js";

// to ignore HTTPS error for HEAD check
import { Agent as HTTPAgent } from "http";
Expand Down Expand Up @@ -98,7 +98,6 @@ export class Crawler {
this.pagesFile = path.join(this.pagesDir, "pages.jsonl");

this.blockRules = null;
this.adBlockRules = null;

this.errorCount = 0;

Expand Down Expand Up @@ -501,6 +500,10 @@ export class Crawler {
}
}

blockEnabled() {
return (this.params.blockRules && this.params.blockRules.length) || this.params.blockAds || this.params.blockCookiePopups;
}

async serializeAndExit() {
await this.serializeConfig();
process.exit(0);
Expand Down Expand Up @@ -577,12 +580,15 @@ export class Crawler {

await this.initPages();

if (this.params.blockAds) {
this.adBlockRules = new AdBlockRules(this.captureBasePrefix, this.params.adBlockMessage, (text) => this.debugLog(text));
}

if (this.params.blockRules && this.params.blockRules.length) {
this.blockRules = new BlockRules(this.params.blockRules, this.captureBasePrefix, this.params.blockMessage, (text) => this.debugLog(text));
if (this.blockEnabled()) {
this.blockRules = new BlockRules(
this.params.blockRules,
this.captureBasePrefix,
this.params.blockMessage,
this.params.blockAds,
this.params.adBlockMessage,
this.params.blockCookiePopups,
(text) => this.debugLog(text));
}

this.screencaster = this.initScreenCaster();
Expand Down Expand Up @@ -760,11 +766,7 @@ export class Crawler {
}
}

if (this.adBlockRules) {
await this.adBlockRules.initPage(page);
}

if (this.blockRules) {
if (this.blockEnabled()) {
await this.blockRules.initPage(page);
}

Expand Down
2 changes: 2 additions & 0 deletions package.json
Original file line number Diff line number Diff line change
Expand Up @@ -12,11 +12,13 @@
},
"dependencies": {
"abort-controller": "^3.0.0",
"adblock-rs": "^0.5.8",
"browsertrix-behaviors": "^0.3.4",
"get-folder-size": "^4.0.0",
"ioredis": "^4.27.1",
"js-yaml": "^4.1.0",
"minio": "7.0.26",
"neon-cli": "^0.10.1",
"puppeteer-cluster": "github:ikreymer/puppeteer-cluster#async-job-queue",
"puppeteer-core": "^17.1.2",
"request": "^2.88.2",
Expand Down
7 changes: 7 additions & 0 deletions util/argParser.js
Original file line number Diff line number Diff line change
Expand Up @@ -118,6 +118,13 @@ class ArgParser {
default: false,
},

"blockCookiePopups": {
alias: "blockcookiepopups",
describe: "If set, block cookie, GDPR, and privacy notice pop-ups (based on EasyList Cookie List)",
type: "boolean",
default: false,
},

"adBlockMessage": {
describe: "If specified, when an ad is blocked, a record with this error message is added instead",
type: "string",
Expand Down
112 changes: 58 additions & 54 deletions util/blockrules.js
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
import fs from "fs";
import * as AdBlockClient from "adblock-rs";

const RULE_TYPES = ["block", "allowOnly"];

Expand All @@ -9,7 +10,8 @@ const BlockState = {
BLOCK_PAGE_NAV: "page",
BLOCK_IFRAME_NAV: "iframe",
BLOCK_OTHER: "resource",
BLOCK_AD: "advertisement"
BLOCK_AD: "advertisement",
BLOCK_COOKIE_POPUP: "cookie pop-up"
};


Expand Down Expand Up @@ -47,12 +49,23 @@ ${this.frameTextMatch ? "Frame Text Regex: " + this.frameTextMatch : ""}
// ===========================================================================
export class BlockRules
{
constructor(blockRules, blockPutUrl, blockErrMsg, debugLog) {
constructor(blockRules, blockPutUrl, blockErrMsg, blockAds, adBlockErrMsg, blockCookiePopups, debugLog) {
this.rules = [];
this.blockPutUrl = blockPutUrl;
this.blockErrMsg = blockErrMsg;
this.blockAds = blockAds;
this.adBlockErrMsg = adBlockErrMsg;
this.blockCookiePopups = blockCookiePopups;
this.debugLog = debugLog;

this.adhosts = JSON.parse(fs.readFileSync(new URL("../ad-hosts.json", import.meta.url)));
if (this.blockCookiePopups) {
const easylistRules = fs.readFileSync(new URL("../easylist-cookies.txt", import.meta.url), { encoding: "utf-8" }).split("\n");
const filterSet = new AdBlockClient.FilterSet(true);
filterSet.addFilters(easylistRules);
this.cookieBlockClient = new AdBlockClient.Engine(filterSet, true);
}

this.blockedUrlSet = new Set();

for (const ruleData of blockRules) {
Expand All @@ -68,10 +81,6 @@ export class BlockRules
}

async initPage(page) {
if (!this.rules.length) {
return;
}

if (page._btrix_interceptionAdded) {
return true;
}
Expand All @@ -95,14 +104,25 @@ export class BlockRules
let blockState;

try {
blockState = await this.shouldBlock(request, url);

if (blockState === BlockState.ALLOW) {
await request.continue();
} else {
await request.abort("blockedbyclient");
if (this.blockAds) {
blockState = await this.shouldBlockAd(request, url);
if (blockState === BlockState.BLOCK_AD) {
return await request.abort("blockedbyclient");
}
}

if (this.blockCookiePopups) {
blockState = await this.shouldBlockCookiePopup(request, url);
if (blockState === BlockState.BLOCK_COOKIE_POPUP) {
return await request.abort("blockedbyclient");
}
}
if (this.rules.length) {
blockState = await this.shouldBlock(request, url);
if (blockState !== BlockState.ALLOW) {
return await request.abort("blockedbyclient");
}
}
await request.continue();
} catch (e) {
this.debugLog(`Block: (${blockState}) Failed On: ${url} Reason: ${e}`);
}
Expand Down Expand Up @@ -208,7 +228,27 @@ export class BlockRules
}
}

async recordBlockMsg(url) {
async shouldBlockAd(request, url) {
const fragments = url.split("/");
const domain = fragments.length > 2 ? fragments[2] : null;
if (this.adhosts.includes(domain)) {
this.debugLog(`URL blocked for being an ad: ${url}`);
await this.recordBlockMsg(url, true);
return BlockState.BLOCK_AD;
}
return BlockState.ALLOW;
}

async shouldBlockCookiePopup(request, url) {
const checkResult = this.cookieBlockClient.check(url, "", "");
if (checkResult != false) {
this.debugLog(`URL blocked for being a cookie pop-up: ${url}`);
return BlockState.BLOCK_COOKIE_POPUP;
}
return BlockState.ALLOW;
}

async recordBlockMsg(url, ad=false) {
if (this.blockedUrlSet.has(url)) {
return;
}
Expand All @@ -219,48 +259,12 @@ export class BlockRules
return;
}

const body = this.blockErrMsg;
let body = this.blockErrMsg;
if (ad) {
body = this.adBlockErrMessage;
}
const putUrl = new URL(this.blockPutUrl);
putUrl.searchParams.set("url", url);
await fetch(putUrl.href, {method: "PUT", headers: {"Content-Type": "text/html"}, body});
}
}


// ===========================================================================
/**
 * BlockRules variant that blocks requests whose host appears in a JSON list
 * of ad hosts. It is constructed with an empty custom rule list and installs
 * its own request interception on each page.
 */
export class AdBlockRules extends BlockRules
{
  /**
   * @param {string} blockPutUrl - forwarded to the BlockRules constructor
   * @param {string} blockErrMsg - forwarded to the BlockRules constructor
   * @param {Function} debugLog - forwarded to the BlockRules constructor
   * @param {string} adhostsFilePath - path of the ad-host JSON file,
   *   resolved relative to this module
   */
  constructor(blockPutUrl, blockErrMsg, debugLog, adhostsFilePath = "../ad-hosts.json") {
    super([], blockPutUrl, blockErrMsg, debugLog);

    const adhostsUrl = new URL(adhostsFilePath, import.meta.url);
    const adhostsJson = fs.readFileSync(adhostsUrl);
    this.adhosts = JSON.parse(adhostsJson);
  }

  /**
   * Enable request interception on a page and route every request through
   * `handleRequest` (presumably inherited from BlockRules - confirm).
   *
   * @param {object} page - page to instrument; flagged with
   *   `_btrix_adInterceptionAdded` so it is only set up once
   * @returns {Promise<true|undefined>} `true` if the page was already set up
   */
  async initPage(page) {
    const alreadyAdded = page._btrix_adInterceptionAdded;
    if (alreadyAdded) {
      return true;
    }

    page._btrix_adInterceptionAdded = true;

    await page.setRequestInterception(true);

    // Errors from the handler are only warned about, never rethrown.
    const onRequest = async (request) => {
      try {
        await this.handleRequest(request);
      } catch (err) {
        console.warn(err);
      }
    };

    page.on("request", onRequest);
  }

  /**
   * Block the request when the third "/"-separated segment of its URL is
   * listed in `this.adhosts`.
   *
   * @param {object} request - the intercepted request (not inspected here)
   * @param {string} url - URL of the intercepted request
   * @returns {Promise<string>} `BlockState.BLOCK_AD` or `BlockState.ALLOW`
   */
  async shouldBlock(request, url) {
    // The third segment is the authority part of "scheme://host/..." URLs;
    // it defaults to null when the URL has fewer than three segments.
    const [, , domain = null] = url.split("/");

    if (!this.adhosts.includes(domain)) {
      return BlockState.ALLOW;
    }

    this.debugLog(`URL blocked for being an ad: ${url}`);
    await this.recordBlockMsg(url);
    return BlockState.BLOCK_AD;
  }
}
Loading