-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathindex.js
120 lines (103 loc) · 4.28 KB
/
index.js
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
const axios = require('axios');
const cheerio = require('cheerio');
const xml2js = require('xml2js');
const json2csv = require('json2csv').parse;
/**
 * Scrapes text from a web page based on the provided URL and target selector.
 *
 * The page is requested with axios, loaded into cheerio, and the trimmed
 * result of cheerio's `.text()` for `targetClass` is extracted. Failures are
 * not thrown to the caller: they are reported as `{ error }` in the resolved
 * value (exceptions raised by the `onError`, `beforeRetry` or `afterRetry`
 * hooks themselves still reject the promise).
 *
 * @async
 * @param {string} url - The URL of the web page to scrape.
 * @param {string} targetClass - The selector of the target element(s); passed verbatim to cheerio.
 * @param {Object} [options={}] - Optional configurations for the scraping process.
 * @param {Function} [options.beforeRequest] - Called with `url` before the HTTP request is sent.
 * @param {Function} [options.afterRequest] - Called with the axios response after it is received.
 * @param {Function} [options.beforeResponse] - Called with the axios response before its body is processed.
 * @param {Function} [options.beforeParse] - Called with the loaded cheerio instance before extraction.
 * @param {Function} [options.afterParse] - Called with the extracted (unformatted) text.
 * @param {Function} [options.onError] - Called with the error whenever an attempt fails.
 * @param {Function} [options.beforeRetry] - Called with the error right before a retry attempt starts.
 * @param {Function} [options.afterRetry] - Called with the triggering error once a retry attempt has finished.
 * @param {number} [options.retry=0] - Number of additional attempts to make after a failed one.
 * @param {string} [options.format] - Output format: 'xml' or 'csv'; any other value yields the plain text.
 * @param {number} [options.timeout=5000] - The timeout duration for the HTTP request in milliseconds.
 * @param {Object} [options.headers={}] - Additional headers to include in the HTTP request.
 * @returns {Promise<Object>} Resolves with `{ data }` (formatted according to
 *   `options.format`) on success, or `{ error }` with a message on failure.
 */
async function scrapeData(url, targetClass, options = {}) {
  const {
    beforeRequest,
    afterRequest,
    onError,
    beforeParse,
    afterParse,
    beforeRetry,
    afterRetry,
    beforeResponse,
    timeout = 5000,
    headers = {},
    format,
    retry = 0,
  } = options;

  try {
    if (typeof beforeRequest === 'function') {
      beforeRequest(url);
    }

    // Headers are copied so the caller's object is never shared with axios.
    const response = await axios({
      url,
      headers: { ...headers },
      timeout,
    });

    if (typeof afterRequest === 'function') {
      afterRequest(response);
    }

    // Documented hook that was previously accepted but never invoked.
    if (typeof beforeResponse === 'function') {
      beforeResponse(response);
    }

    const $ = cheerio.load(response.data);

    if (typeof beforeParse === 'function') {
      beforeParse($);
    }

    const data = $(targetClass).text().trim();

    if (typeof afterParse === 'function') {
      afterParse(data);
    }

    // `format` must come from options: reading it as a free identifier threw
    // a ReferenceError on every successful request.
    let formattedData;
    if (format === 'xml') {
      formattedData = await generateXML({ data });
    } else if (format === 'csv') {
      formattedData = generateCSV([{ data }]);
    } else {
      formattedData = data;
    }

    // Return the formatted payload; with no format this equals the raw text.
    return { data: formattedData };
  } catch (error) {
    if (typeof onError === 'function') {
      onError(error);
    }

    // Retry hooks only fire when a retry actually takes place.
    if (retry > 0) {
      if (typeof beforeRetry === 'function') {
        beforeRetry(error);
      }
      console.log(`Retrying... Attempts left: ${retry}`);
      const retryResult = await scrapeData(url, targetClass, { ...options, retry: retry - 1 });
      if (typeof afterRetry === 'function') {
        afterRetry(error);
      }
      return retryResult;
    }

    // Map the failure onto the error object handed back to the caller.
    if (error.response) {
      // axios attaches `response` when the server answered with an error status.
      return { error: 'HTTP request failed' };
    }
    if (error.message === 'Something went wrong in Cheerio') {
      return { error: 'Cheerio parsing error' };
    }
    return { error: error.message };
  }
}
/**
 * Builds an XML document from the given value using xml2js.
 *
 * The value is nested under a `data` key before it is handed to the builder.
 *
 * @async
 * @param {*} data - The value to serialize.
 * @returns {Promise<*>} Resolves with whatever `Builder#buildObject` produces.
 */
async function generateXML(data) {
  const payload = { data };
  return new xml2js.Builder().buildObject(payload);
}
/**
 * Converts the given rows to CSV by delegating to json2csv's `parse`.
 *
 * @param {*} data - The rows to convert; passed through unchanged.
 * @returns {*} The value returned by json2csv.
 */
function generateCSV(data) {
  return json2csv(data);
}
// Public API of this module: only scrapeData is exported.
module.exports = { scrapeData };