-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathhttparchive.js
41 lines (39 loc) · 1.04 KB
/
httparchive.js
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
// Register each crawl staging table with `declare` under the `crawl_staging` schema.
// Staging tables source: https://github.com/HTTPArchive/crawl/blob/main/crawl.py
for (const stagingTable of ['pages', 'requests', 'parsed_css']) {
  declare({ schema: 'crawl_staging', name: stagingTable });
}
// Assertion `corrupted_technology_values`, tagged `crawl_complete`.
//
// For the month in `constants.currentMonth`, the query lists (date, client, tech)
// groups from the staging `pages` table where one of the following holds:
//   - the technology name is not in the Wappalyzer technologies table,
//   - a category is not in the Wappalyzer categories table,
//   - the technology has an empty category list.
// Each group reports the number of distinct pages and up to three sample pages.
// Dataform treats any returned row as an assertion failure.
//
// All tables are resolved through `ctx.ref`, so the Wappalyzer lookups are
// qualified the same way as the staging table instead of being hard-coded.
// NOTE(review): the LEFT JOINs presumably flatten the repeated `technologies`
// and `categories` fields — confirm against the staging schema.
assert('corrupted_technology_values')
  .tags(['crawl_complete'])
  .query(ctx => `
SELECT
  date,
  client,
  tech,
  COUNT(DISTINCT page) AS cnt_pages,
  ARRAY_AGG(DISTINCT page LIMIT 3) AS sample_pages
FROM ${ctx.ref('crawl_staging', 'pages')} AS pages
LEFT JOIN pages.technologies AS tech
LEFT JOIN tech.categories AS category
WHERE
  date = '${constants.currentMonth}' AND
  (
    tech.technology NOT IN (SELECT DISTINCT name FROM ${ctx.ref('wappalyzer', 'technologies')})
    OR category NOT IN (SELECT DISTINCT name FROM ${ctx.ref('wappalyzer', 'categories')})
    OR ARRAY_LENGTH(tech.categories) = 0
  )
GROUP BY
  date,
  client,
  tech
ORDER BY cnt_pages DESC
`);
// Register each Wappalyzer table with `declare` under the `wappalyzer` schema.
// Wappalyzer tables source: https://github.com/HTTPArchive/wappalyzer/blob/main/.github/workflows/upload.yml
for (const wappalyzerTable of ['technologies', 'categories']) {
  declare({ schema: 'wappalyzer', name: wappalyzerTable });
}