-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathharvest.js
117 lines (92 loc) · 3.38 KB
/
harvest.js
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
import axios from 'axios'
import { JSDOM } from 'jsdom'
import readline from 'readline'
import yargs from 'yargs'
import { hideBin } from 'yargs/helpers'
import { Harvester } from 'corvee-harvester'
import { saveRecords } from './utils/index.js'
import { console, inspect } from 'corvee-core'
import config from './config/harvester.js'
// Default job id: today's local calendar date, formatted as YYYY-MM-DD.
const today = new Date();
const year = today.getFullYear();
// getMonth() is zero-based, hence the +1. Month and day are left-padded to two digits.
const month = String(today.getMonth() + 1).padStart(2, '0');
const day = String(today.getDate()).padStart(2, '0');
const defaultJob = [year, month, day].join('-');
// Command-line interface:
//   --job / -j     id of the harvesting job (defaults to today's date).
//   --resume / -r  call the harvester's `resume` method instead of `run`.
const argv = yargs(hideBin(process.argv))
  .options({
    job: {
      alias: 'j',
      default: defaultJob,
      describe: `Job id. Defaults to today's date: ${defaultJob}`,
      type: 'string',
    },
    resume: {
      alias: 'r',
      default: false,
      type: 'boolean',
      describe: 'Resumes a previously stopped job. Requires the --job option.',
      implies: 'j',
    },
  })
  .help()
  .parseSync();

// Job id for this session; it is logged at start-up and handed to saveRecords().
const job = argv.job;

// Distinct request URLs seen during the harvest, split on the request's
// `extern` flag. The sets are only mutated, never reassigned, hence `const`.
const internLinks = new Set();
const externLinks = new Set();
/**
 * Binds keyboard shortcuts to a harvester: Ctrl+P toggles between
 * `pause()` and `resume()` based on `isPaused`, Ctrl+C exits the process.
 *
 * Does nothing when stdin is not a TTY (piped or redirected input), because
 * `setRawMode()` is only available on TTY streams and would throw otherwise.
 *
 * @param {Harvester} harvester - The harvester instance to control.
 */
function bindKeyboardControls(harvester) {
  if (!process.stdin.isTTY) {
    return;
  }

  readline.emitKeypressEvents(process.stdin);
  // Raw mode delivers key combinations as keypress events; in this mode
  // Ctrl+C no longer raises SIGINT, so it is handled explicitly below.
  process.stdin.setRawMode(true);

  process.stdin.on('keypress', (str, key) => {
    if (key.ctrl && key.name === 'p') {
      if (harvester.isPaused) {
        harvester.resume();
      } else {
        harvester.pause();
      }
    }
    if (key.ctrl && key.name === 'c') {
      process.exit();
    }
  });
}

/**
 * Downloads the LibGuides database list and turns every result title link
 * into a seed entry for the harvester.
 *
 * The endpoint answers with a JSON payload whose `data.html` property holds
 * an HTML fragment; that fragment is parsed with JSDOM and queried for the
 * title anchors.
 *
 * @returns {Promise<Array<{url: string, text: string, urlData: string | null, isNavigationRequest: boolean}>>}
 *   One entry per anchor: resolved `href`, text of the anchor's first child
 *   node (empty string when missing), and the raw `href` attribute.
 */
async function fetchDatabaseLinks() {
  const { data: azData } = await axios('https://libguides.bib.umontreal.ca/process/az/dblist?subject_id=&type_id=&vendor_id=&content_id=0&search=&site_id=18643&is_widget=0');
  const { document: azDOM } = new JSDOM(azData.data.html).window;
  const anchors = /** @type {NodeListOf<HTMLAnchorElement>} */ (
    azDOM.querySelectorAll('#s-lg-az-results .s-lg-az-result > .s-lg-az-result-title > a[href]')
  );

  return Array.from(anchors, (link) => ({
    url: link.href,
    text: link.firstChild?.textContent || '',
    urlData: link.getAttribute('href'),
    isNavigationRequest: true,
  }));
}

/**
 * Script entry point: seeds a harvester with the database list links,
 * attaches logging and record saving, then runs the job, or resumes it when
 * `--resume` was given.
 *
 * On a failed run the error is logged and the process exits with status 1
 * so that callers (shell scripts, schedulers) can detect the failure.
 *
 * @returns {Promise<void>}
 */
async function harvest() {
  console.log(`Using job ${job}`);

  const links = await fetchDatabaseLinks();

  const harvester = new Harvester(config);

  // Keyboard shortcuts are bound only once the harvester exists, so the
  // keypress handler can never touch an uninitialised binding.
  bindKeyboardControls(harvester);

  await harvester.addUrl(links);

  // Log every request and record its URL as intern or extern.
  harvester.on('request', function onRequest(request) {
    console.info(`[${request.retryCount}] Request url: ${request.url}`);
    if (request.extern) {
      externLinks.add(request.url);
    } else {
      internLinks.add(request.url);
    }
  });

  saveRecords(harvester, job);

  // Name of the harvester method to invoke.
  const task = argv.resume ? 'resume' : 'run';

  console.info(`Running with config: ${inspect(harvester.config)}`);

  harvester.on('start', function onStart() {
    console.info(`Running with run options: ${inspect(harvester.runOptions)}`);
  });

  harvester.on('end', function onEnd() {
    console.info(`Found ${internLinks.size} intern pages.`);
    console.info(`Found ${externLinks.size} extern pages.`);
  });

  console.log(`${task === 'resume' ? 'Resuming' : 'Running'} harvesting.`);

  try {
    await harvester[task]();
  } catch (e) {
    console.error(e);
    // Exit on the next tick, with a non-zero status to signal the failure.
    process.nextTick(() => {
      process.exit(1);
    });
  }
}
// Start the script. harvest() only guards its final run/resume call, so any
// earlier rejection (HTTP fetch, DOM parsing, URL seeding) is handled here:
// log the error and exit with a failure status instead of leaving the
// promise floating.
harvest().catch((error) => {
  console.error(error);
  process.exit(1);
});