Skip to content

Commit 27a451e

Browse files
committed
support to scrape example from doc
1 parent c352f80 commit 27a451e

File tree

4 files changed

+141
-73
lines changed

4 files changed

+141
-73
lines changed

tool/scrape/Dockerfile

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@ WORKDIR /app
55
RUN npm i -g nodemon typescript
66
RUN npm i
77
RUN tsc
8-
RUN apk --update add git
8+
RUN apk --update --no-cache add git
99
RUN git clone https://github.com/tencentcloudstack/terraform-provider-tencentcloud.git
10-
CMD node index.js
10+
# CMD node index.js
11+
ENTRYPOINT [ "node", "index.js" ]

tool/scrape/build.sh

Lines changed: 9 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -4,18 +4,21 @@ workdir=$(
44
pwd
55
)
66

7-
if [ $# == 0 ]; then
7+
if [ $# == 0 ]; then # default
88
echo "docker build with no params..."
9-
docker build -f ${workdir}/Dockerfile ${workdir}/
10-
else
9+
docker build -f ${workdir}/Dockerfile ${workdir}/ -t tiat/terraform-scrape:latest
10+
else # with image:tag
1111
if [ ! -n "${2}" ]; then
1212
echo "docker build with [specified image]...[${1}][${2}][${workdir}]"
1313
docker build -f ${workdir}/Dockerfile ${workdir}/ -t ${1}
1414

1515
echo "docker run [${1}] to generate tiat-resources.json."
16-
docker run -it ${1} >../../config/tips/tiat-resources.json
17-
else
16+
docker run -it ${1} resource >../../config/tips/tiat-resources.json
17+
18+
echo "docker run [${1}] to generate tiat-resources.json."
19+
docker run -it ${1} example >../../config/tips/tiat-resources.json
20+
else # with args
1821
echo "docker build with [build-arg]...[${1}][${2}][${workdir}]"
19-
docker build --build-arg base=${2} -f ${workdir}/Dockerfile ${workdir}/ -t ${1}
22+
docker build --build-arg args=${2} -f ${workdir}/Dockerfile ${workdir}/ -t ${1}
2023
fi
2124
fi

tool/scrape/index.ts

Lines changed: 127 additions & 64 deletions
Original file line numberDiff line numberDiff line change
@@ -25,29 +25,34 @@ interface UrlPathNodes {
2525

2626
interface Resource {
2727
name: string;
28-
args: Variable[];
29-
attrs: Variable[];
28+
args: variable[];
29+
attrs: variable[];
3030
url: string;
3131
}
3232

33-
interface Variable {
33+
interface Snippet {
34+
name: string;
35+
example: string;
36+
}
37+
38+
interface variable {
3439
name: string;
3540
description: string;
3641
}
3742

38-
const resourcesPath = "terraform-provider-tencentcloud/website/docs/r/";
39-
const dataSourcePath = "terraform-provider-tencentcloud/website/docs/d/";
40-
const indexPath = "terraform-provider-tencentcloud/website/tencentcloud.erb";
43+
const resourcesPath = "/Users/luoyin/Code/terraform-provider-tencentcloud/website/docs/r/";
44+
const dataSourcePath = "/Users/luoyin/Code/terraform-provider-tencentcloud/website/docs/d/";
45+
const indexPath = "/Users/luoyin/Code/terraform-provider-tencentcloud/website/tencentcloud.erb";
4146
// ================================== FUNCTIONS ==================================
4247

4348
function getParsed(filename: string): Promise<cheerio.Root> {
4449
return new Promise((resolve, reject) => {
45-
var file = fs.readFileSync(resourcesPath + filename) + "";
50+
let file = fs.readFileSync(resourcesPath + filename) + "";
4651
marked(file, (err, result) => {
4752
if (err) {
4853
return reject(err);
4954
}
50-
var $ = load(result);
55+
let $ = load(result);
5156
resolve($);
5257
});
5358
});
@@ -60,7 +65,7 @@ function getAllParsed(files: string[]): Promise<cheerio.Root>[] {
6065
}
6166

6267
function getNumWithArgumentReference($s: cheerio.Root[]): number {
63-
var result = _.map($s, $ => {
68+
let result = _.map($s, $ => {
6469
return $("h2").filter((z, el) => {
6570
return $(el).text() === "Argument Reference";
6671
}).length;
@@ -69,7 +74,7 @@ function getNumWithArgumentReference($s: cheerio.Root[]): number {
6974
}
7075

7176
function getNumWithAttributesReference($s: cheerio.Root[]): number {
72-
var result = _.map($s, $ => {
77+
let result = _.map($s, $ => {
7378
return $("h2").filter((z, el) => {
7479
return $(el).text() === "Attributes Reference";
7580
}).length;
@@ -83,19 +88,19 @@ function getNumWithAttributesReference($s: cheerio.Root[]): number {
8388
* @param {*} $ - The full page as a cheerio object
8489
*/
8590
function extractArgumentsContent($: cheerio.Root): ArgumentNodes {
86-
var argsH2 = $("h2").filter((z, el) => {
91+
let argsH2 = $("h2").filter((z, el) => {
8792
return $(el).text() === "Argument Reference";
8893
});
8994
if (argsH2.length !== 1) {
9095
throw "Didn't find correct number of h2 > Arguments Reference";
9196
}
92-
var nodes = [];
93-
var currentNode: any = argsH2[0];
97+
let nodes = [];
98+
let currentNode: any = argsH2[0];
9499
while (true) {
95100
if (!(currentNode.type === "text" && currentNode["data"] === "\n")) {
96101
nodes.push(currentNode);
97102
}
98-
var nextSibling = _.get(currentNode, "nextSibling");
103+
let nextSibling = _.get(currentNode, "nextSibling");
99104
if (!nextSibling || _.get(nextSibling, "name") === "h2") {
100105
break;
101106
}
@@ -105,21 +110,21 @@ function extractArgumentsContent($: cheerio.Root): ArgumentNodes {
105110
}
106111

107112
function extractAttributesContent($: cheerio.Root): AttributeNodes {
108-
var argsH2 = $("h2").filter((z, el) => {
113+
let argsH2 = $("h2").filter((z, el) => {
109114
return $(el).text() === "Attribute Reference" || $(el).text() === "Attributes Reference";
110115
});
111116
if (argsH2.length !== 1) {
112117
console.error(`Didn't find any attributes on ${extractResourceName($)}`);
113118
return { attributeNodes: [] };
114119
// throw `Didn't find correct number of h2 > Attributes Reference on ${extractResourceName($)}`;
115120
}
116-
var nodes = [];
117-
var currentNode: any = argsH2[0];
121+
let nodes = [];
122+
let currentNode: any = argsH2[0];
118123
while (true) {
119124
if (!(currentNode.type === "text" && currentNode["data"] === "\n")) {
120125
nodes.push(currentNode);
121126
}
122-
var nextSibling = _.get(currentNode, "nextSibling");
127+
let nextSibling = _.get(currentNode, "nextSibling");
123128
if (!nextSibling || _.get(nextSibling, "name") === "h2") {
124129
break;
125130
}
@@ -128,33 +133,62 @@ function extractAttributesContent($: cheerio.Root): AttributeNodes {
128133
return { attributeNodes: nodes };
129134
}
130135

131-
// function extractExampleContents($: cheerio.Root): SnippetNodes {
132-
// var argsH2 = $("h2").filter((z, el) => {
133-
// return $(el).text() == "Example Usage";
134-
// });
135-
// if (argsH2.length != 1) {
136-
// console.error(`Didn't find any example on ${extractResourceName($)}`);
137-
// throw "Didn't find correct number of h2 > Example Usage";
138-
// }
139-
// }
140-
141-
// function extractExamples(argNodes: SnippetNodes, $: cheerio.Root): Variable[] {
142-
// if (argNodes.snippetNodes.length == 0) return [];
143-
144-
// let nodes = argNodes.snippetNodes;
145-
146-
// // Find the first ul
147-
// var firstUl = _.find(nodes, (o: any) => o.name == "ul");
148-
// if (!firstUl) {
149-
// console.error(`Didn't find a UL when searching through snippets on ${extractResourceName($)}`);
150-
// }
151-
// }
152-
153-
function extractArguments(argNodes: ArgumentNodes, $: cheerio.Root): Variable[] {
136+
function extractExampleContent($: cheerio.Root): string {
137+
let expH2 = $("h2").filter((z, el) => {
138+
return $(el).text() === "Example Usage";
139+
});
140+
if (expH2.length !== 1) {
141+
console.error(`Didn't find any example on ${extractResourceName($)}`);
142+
return "";
143+
// throw "Didn't find correct number of h2 > Example Usage";
144+
}
145+
let content = "This example will be ready later.";
146+
let currentNode: any = expH2[0];
147+
while (true) {
148+
const nextSibling = _.get(currentNode, 'nextSibling');
149+
if (!nextSibling || _.get(nextSibling, 'name') === 'h2') {
150+
break;
151+
}
152+
153+
currentNode = _.get(currentNode, 'nextSibling');
154+
155+
// exsit multiple example
156+
if (currentNode.type === 'tag' && currentNode.name === 'h3') {
157+
currentNode = _.get(currentNode, 'nextSibling');
158+
continue;
159+
}
160+
161+
// only extract the first one
162+
if (currentNode.type === 'tag' && currentNode.name === 'pre') {
163+
content = $(currentNode).text();
164+
break;
165+
}
166+
}
167+
168+
return content;
169+
}
170+
171+
function extractExamples(expNodes: SnippetNodes, $: cheerio.Root): string {
172+
let nodes = expNodes.snippetNodes;
173+
174+
// const snippetText = nodes.map((nn) => $(nn).text()).join('');
175+
const textArray = _.map(nodes, node => {
176+
return $(node).text();
177+
});
178+
179+
console.debug("[DEBUG]#### len:[%d], textArray:[%v]", textArray.length, textArray);
180+
181+
const snippetTexts = textArray.join("");
182+
console.debug("[DEBUG]#### snippetTexts:[%s]", snippetTexts);
183+
184+
return snippetTexts;
185+
}
186+
187+
function extractArguments(argNodes: ArgumentNodes, $: cheerio.Root): variable[] {
154188
let nodes = argNodes.argumentNodes;
155189

156190
// Find the first ul
157-
var firstUl = _.find(nodes, (o: any) => o.name === "ul");
191+
let firstUl = _.find(nodes, (o: any) => o.name === "ul");
158192

159193
if (!firstUl) {
160194
// throw "Didn't find a UL when searching through arguments";
@@ -165,7 +199,7 @@ function extractArguments(argNodes: ArgumentNodes, $: cheerio.Root): Variable[]
165199
let text = $(li).text();
166200
let regex = /([a-zA-Z0-9_]+) (.+)/;
167201
let result = text.match(regex);
168-
var name, description;
202+
let name, description;
169203
if (!result) {
170204
name = text;
171205
//console.error(`Didn't find a description for ${text} on ${extractResourceName($)}`);
@@ -178,14 +212,14 @@ function extractArguments(argNodes: ArgumentNodes, $: cheerio.Root): Variable[]
178212
});
179213
}
180214

181-
function extractAttributes(argNodes: AttributeNodes, $: cheerio.Root): Variable[] {
215+
function extractAttributes(argNodes: AttributeNodes, $: cheerio.Root): variable[] {
182216
if (argNodes.attributeNodes.length === 0) {
183217
return [];
184218
}
185219

186220
let nodes = argNodes.attributeNodes;
187221
// Find the first ul
188-
var firstUl = _.find(nodes, (o: any) => o.name === "ul");
222+
let firstUl = _.find(nodes, (o: any) => o.name === "ul");
189223
if (!firstUl) {
190224
// console.error(`Didn't find a UL when searching through attributes on ${extractResourceName($)}`);
191225
return [];
@@ -194,7 +228,7 @@ function extractAttributes(argNodes: AttributeNodes, $: cheerio.Root): Variable[
194228
let text = $(li).text();
195229
let regex = /([a-zA-Z0-9_]+) (.+)/;
196230
let result = text.match(regex);
197-
var name, description;
231+
let name, description;
198232
if (!result) {
199233
name = text;
200234
// console.error(`Didn't find a description for ${text} on ${extractResourceName($)}`);
@@ -272,23 +306,52 @@ function extractResourceUrl(html: string): Map<string, string> {
272306
const files = fs.readdirSync(resourcesPath);
273307
const indexHtml = fs.readFileSync(indexPath, "utf-8");
274308
Promise.all(getAllParsed(files)).then($s => {
275-
const resIndexMap = extractResourceUrl(indexHtml);
276-
const resources: Resource[] = _.map($s, $ => {
277-
const resName = extractResourceName($);
278-
return {
279-
name: resName,
280-
args: extractArguments(extractArgumentsContent($), $),
281-
attrs: extractAttributes(extractAttributesContent($), $),
282-
url: resIndexMap.get(resName)
283-
};
284-
});
285-
let transformed = _.transform(resources, (result, value, key) => {
286-
result[value.name] = {
287-
args: value.args,
288-
attrs: value.attrs,
289-
url: value.url
290-
};
291-
}, {});
309+
const args = process.argv.slice(2);
310+
let type = "";
311+
if (args.length > 0) {
312+
type = args[0];
313+
}
314+
315+
let transformed: any;
316+
if (type === "example") {
317+
// example collection
318+
const examples: Snippet[] = _.map($s, $ => {
319+
const resName = extractResourceName($);
320+
return {
321+
name: resName,
322+
example: extractExampleContent($),
323+
};
324+
});
325+
transformed = _.transform(examples, (result, value) => {
326+
result[value.name] = {
327+
example: value.example
328+
};
329+
}, {});
330+
331+
} else {
332+
// resource collection
333+
const resIndexMap = extractResourceUrl(indexHtml);
334+
const resources: Resource[] = _.map($s, $ => {
335+
const resName = extractResourceName($);
336+
return {
337+
name: resName,
338+
args: extractArguments(extractArgumentsContent($), $),
339+
attrs: extractAttributes(extractAttributesContent($), $),
340+
url: resIndexMap.get(resName)
341+
};
342+
});
343+
transformed = _.transform(resources, (result, value, key) => {
344+
result[value.name] = {
345+
args: value.args,
346+
attrs: value.attrs,
347+
url: value.url
348+
};
349+
}, {});
350+
}
292351

293352
console.log(JSON.stringify(transformed));
294-
});
353+
});
354+
355+
function fun(v: any, i: any, array: any): (value: string, index: number, array: string[]) => void {
356+
throw new Error("Function not implemented.");
357+
}

tool/scrape/package.json

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,8 @@
44
"description": "",
55
"main": "index.js",
66
"scripts": {
7-
"test": "echo \"Error: no test specified\" && exit 1"
7+
"test": "echo \"Error: no test specified\" && exit 1",
8+
"watch_compile": "tsc -watch -p ./"
89
},
910
"author": "",
1011
"license": "ISC",

0 commit comments

Comments
 (0)