-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathTheoPlace.yaml
397 lines (316 loc) · 11.5 KB
/
TheoPlace.yaml
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
# This "Generic HTML" scraper for TheoPlace is provided as an example of both the power of generic scrapers and their
# limitations.
# In particular, this generic scraper has the following limitations:
# - In some bibles, major and normal sections will be mixed up because the website's formatting of those is inconsistent.
#   The generic parser cannot include complex logic based on sibling comparisons to cope with such inconsistencies.
# - Chapters and books that are missing from some bibles will still be included as empty, because generic parsers do
#   not provide a mechanism to ignore pages after parsing them.
# For these reasons, when extracting bibles for actual use, the native TheoPlace scraper should be used instead!
# Human-readable description of this scraper configuration.
description: "Bibles multiples de theo.place"
# Declares BIBLE as an input. Its value presumably fills every {BIBLE} placeholder below (systemName, title and
# both URL patterns) — confirm the accepted values against the scraper documentation.
inputs: [ BIBLE ]
bible:
# Metadata of the extracted bible; systemName and title both embed the BIBLE input.
metadata:
language: fr
systemName: "fre{BIBLE}tp"
title: "{BIBLE} extraite de TheoPlace"
# Named patterns. {BOOK_NB} and {BOOK_NAME} are defined per book in the "books" list; {CHAPTER} is defined in the
# "args" section that follows the patterns.
patterns:
# By using the default name "bookUrl" and leaving the pagePattern unset, this pattern will be implicitly used at book level.
bookUrl: "https://theo.place/intro-livre-{BIBLE}-{BOOK_NB}-{BOOK_NAME}"
# By using the default name "chapterUrl" and leaving the pagePattern unset, this pattern will be implicitly used at chapter level.
chapterUrl: "https://theo.place/bible-{BIBLE}-{BOOK_NB}-{CHAPTER}"
# By using the default name "chapterPublishedNumber" and leaving the pagePattern unset, this pattern will be implicitly used at chapter level.
chapterPublishedNumber: "{CHAPTER}"
# Arguments shared by all books; individual books add their own BOOK_NB / BOOK_NAME args.
args:
# By default, the CHAPTER argument should be evaluated only within chapter sequence, and default to $i, the OSIS chapter number.
# NOTE(review): the leading "=" presumably marks the value as an expression to evaluate rather than a literal — confirm.
CHAPTER: = $i
# Book list. Each entry provides:
# - osis: the OSIS book identifier;
# - args: BOOK_NB and BOOK_NAME, the per-book values referenced by the bookUrl / chapterUrl patterns;
# - chapters: the chapter sequence, either a { from, to } range or a single { at } chapter.
# As stated in the header of this file, chapters or books missing from a given bible are still included (as empty).
books:
# --- Old Testament (BOOK_NB 1 to 39) ---
- osis: Gen
args: { BOOK_NB: 1, BOOK_NAME: genese }
chapters: [ { from: 1, to: 50 } ]
- osis: Exod
args: { BOOK_NB: 2, BOOK_NAME: exode }
chapters: [ { from: 1, to: 40 } ]
- osis: Lev
args: { BOOK_NB: 3, BOOK_NAME: levitique }
chapters: [ { from: 1, to: 27 } ]
- osis: Num
args: { BOOK_NB: 4, BOOK_NAME: nombres }
chapters: [ { from: 1, to: 36 } ]
- osis: Deut
args: { BOOK_NB: 5, BOOK_NAME: deuteronome }
chapters: [ { from: 1, to: 34 } ]
- osis: Josh
args: { BOOK_NB: 6, BOOK_NAME: josue }
chapters: [ { from: 1, to: 24 } ]
- osis: Judg
args: { BOOK_NB: 7, BOOK_NAME: juges }
chapters: [ { from: 1, to: 21 } ]
- osis: Ruth
args: { BOOK_NB: 8, BOOK_NAME: ruth }
chapters: [ { from: 1, to: 4 } ]
- osis: 1Sam
args: { BOOK_NB: 9, BOOK_NAME: 1samuel }
chapters: [ { from: 1, to: 31 } ]
- osis: 2Sam
args: { BOOK_NB: 10, BOOK_NAME: 2samuel }
chapters: [ { from: 1, to: 24 } ]
- osis: 1Kgs
args: { BOOK_NB: 11, BOOK_NAME: 1rois }
chapters: [ { from: 1, to: 22 } ]
- osis: 2Kgs
args: { BOOK_NB: 12, BOOK_NAME: 2rois }
chapters: [ { from: 1, to: 25 } ]
- osis: 1Chr
args: { BOOK_NB: 13, BOOK_NAME: 1chroniques }
chapters: [ { from: 1, to: 29 } ]
- osis: 2Chr
args: { BOOK_NB: 14, BOOK_NAME: 2chroniques }
chapters: [ { from: 1, to: 36 } ]
- osis: Ezra
args: { BOOK_NB: 15, BOOK_NAME: esdras }
chapters: [ { from: 1, to: 10 } ]
- osis: Neh
args: { BOOK_NB: 16, BOOK_NAME: nehemie }
chapters: [ { from: 1, to: 13 } ]
- osis: Esth
args: { BOOK_NB: 17, BOOK_NAME: esther }
chapters: [ { from: 1, to: 10 } ]
- osis: Job
args: { BOOK_NB: 18, BOOK_NAME: job }
chapters: [ { from: 1, to: 42 } ]
- osis: Ps
args: { BOOK_NB: 19, BOOK_NAME: psaumes }
chapters: [ { from: 1, to: 150 } ]
- osis: Prov
args: { BOOK_NB: 20, BOOK_NAME: proverbes }
chapters: [ { from: 1, to: 31 } ]
- osis: Eccl
args: { BOOK_NB: 21, BOOK_NAME: ecclesiaste }
chapters: [ { from: 1, to: 12 } ]
- osis: Song
args: { BOOK_NB: 22, BOOK_NAME: cantique }
chapters: [ { from: 1, to: 8 } ]
- osis: Isa
args: { BOOK_NB: 23, BOOK_NAME: esaie }
chapters: [ { from: 1, to: 66 } ]
- osis: Jer
args: { BOOK_NB: 24, BOOK_NAME: jeremie }
chapters: [ { from: 1, to: 52 } ]
- osis: Lam
args: { BOOK_NB: 25, BOOK_NAME: lamentations }
chapters: [ { from: 1, to: 5 } ]
- osis: Ezek
args: { BOOK_NB: 26, BOOK_NAME: ezechiel }
chapters: [ { from: 1, to: 48 } ]
# NOTE(review): Daniel is listed with 14 chapters; chapters 13-14 presumably only exist in bibles that carry the
# Greek additions, so they would come out empty for the others — confirm.
- osis: Dan
args: { BOOK_NB: 27, BOOK_NAME: daniel }
chapters: [ { from: 1, to: 14 } ]
- osis: Hos
args: { BOOK_NB: 28, BOOK_NAME: osee }
chapters: [ { from: 1, to: 14 } ]
# BOOK_NAME is the percent-encoded (UTF-8) form of "joël", since the value is inserted as-is into the book URL.
- osis: Joel
args: { BOOK_NB: 29, BOOK_NAME: jo%C3%ABl }
chapters: [ { from: 1, to: 4 } ]
- osis: Amos
args: { BOOK_NB: 30, BOOK_NAME: amos }
chapters: [ { from: 1, to: 9 } ]
# Single-chapter book: declared with "at" instead of a from/to range.
- osis: Obad
args: { BOOK_NB: 31, BOOK_NAME: abdias }
chapters: [ { at: 1 } ]
- osis: Jonah
args: { BOOK_NB: 32, BOOK_NAME: jonas }
chapters: [ { from: 1, to: 4 } ]
- osis: Mic
args: { BOOK_NB: 33, BOOK_NAME: michee }
chapters: [ { from: 1, to: 7 } ]
- osis: Nah
args: { BOOK_NB: 34, BOOK_NAME: nahum }
chapters: [ { from: 1, to: 3 } ]
- osis: Hab
args: { BOOK_NB: 35, BOOK_NAME: habakuk }
chapters: [ { from: 1, to: 3 } ]
- osis: Zeph
args: { BOOK_NB: 36, BOOK_NAME: sophonie }
chapters: [ { from: 1, to: 3 } ]
- osis: Hag
args: { BOOK_NB: 37, BOOK_NAME: aggee }
chapters: [ { from: 1, to: 2 } ]
- osis: Zech
args: { BOOK_NB: 38, BOOK_NAME: zacharie }
chapters: [ { from: 1, to: 14 } ]
- osis: Mal
args: { BOOK_NB: 39, BOOK_NAME: malachie }
chapters: [ { from: 1, to: 4 } ]
# --- New Testament (BOOK_NB 40 to 66) ---
# Same entry layout as the preceding books: OSIS identifier, per-book URL args, chapter sequence.
- osis: Matt
args: { BOOK_NB: 40, BOOK_NAME: matthieu }
chapters: [ { from: 1, to: 28 } ]
- osis: Mark
args: { BOOK_NB: 41, BOOK_NAME: marc }
chapters: [ { from: 1, to: 16 } ]
- osis: Luke
args: { BOOK_NB: 42, BOOK_NAME: luc }
chapters: [ { from: 1, to: 24 } ]
- osis: John
args: { BOOK_NB: 43, BOOK_NAME: jean }
chapters: [ { from: 1, to: 21 } ]
- osis: Acts
args: { BOOK_NB: 44, BOOK_NAME: actes }
chapters: [ { from: 1, to: 28 } ]
- osis: Rom
args: { BOOK_NB: 45, BOOK_NAME: romains }
chapters: [ { from: 1, to: 16 } ]
- osis: 1Cor
args: { BOOK_NB: 46, BOOK_NAME: 1corinthiens }
chapters: [ { from: 1, to: 16 } ]
- osis: 2Cor
args: { BOOK_NB: 47, BOOK_NAME: 2corinthiens }
chapters: [ { from: 1, to: 13 } ]
- osis: Gal
args: { BOOK_NB: 48, BOOK_NAME: galates }
chapters: [ { from: 1, to: 6 } ]
- osis: Eph
args: { BOOK_NB: 49, BOOK_NAME: ephesiens }
chapters: [ { from: 1, to: 6 } ]
- osis: Phil
args: { BOOK_NB: 50, BOOK_NAME: philippiens }
chapters: [ { from: 1, to: 4 } ]
- osis: Col
args: { BOOK_NB: 51, BOOK_NAME: colossiens }
chapters: [ { from: 1, to: 4 } ]
- osis: 1Thess
args: { BOOK_NB: 52, BOOK_NAME: 1thessaloniciens }
chapters: [ { from: 1, to: 5 } ]
- osis: 2Thess
args: { BOOK_NB: 53, BOOK_NAME: 2thessaloniciens }
chapters: [ { from: 1, to: 3 } ]
- osis: 1Tim
args: { BOOK_NB: 54, BOOK_NAME: 1timothee }
chapters: [ { from: 1, to: 6 } ]
- osis: 2Tim
args: { BOOK_NB: 55, BOOK_NAME: 2timothee }
chapters: [ { from: 1, to: 4 } ]
- osis: Titus
args: { BOOK_NB: 56, BOOK_NAME: tite }
chapters: [ { from: 1, to: 3 } ]
# Single-chapter book: declared with "at" instead of a from/to range.
- osis: Phlm
args: { BOOK_NB: 57, BOOK_NAME: philemon }
chapters: [ { at: 1 } ]
- osis: Heb
args: { BOOK_NB: 58, BOOK_NAME: hebreux }
chapters: [ { from: 1, to: 13 } ]
- osis: Jas
args: { BOOK_NB: 59, BOOK_NAME: jacques }
chapters: [ { from: 1, to: 5 } ]
- osis: 1Pet
args: { BOOK_NB: 60, BOOK_NAME: 1pierre }
chapters: [ { from: 1, to: 5 } ]
- osis: 2Pet
args: { BOOK_NB: 61, BOOK_NAME: 2pierre }
chapters: [ { from: 1, to: 3 } ]
- osis: 1John
args: { BOOK_NB: 62, BOOK_NAME: 1jean }
chapters: [ { from: 1, to: 5 } ]
# 2John, 3John and Jude are single-chapter books ("at: 1").
- osis: 2John
args: { BOOK_NB: 63, BOOK_NAME: 2jean }
chapters: [ { at: 1 } ]
- osis: 3John
args: { BOOK_NB: 64, BOOK_NAME: 3jean }
chapters: [ { at: 1 } ]
- osis: Jude
args: { BOOK_NB: 65, BOOK_NAME: jude }
chapters: [ { at: 1 } ]
- osis: Rev
args: { BOOK_NB: 66, BOOK_NAME: apocalypse }
chapters: [ { from: 1, to: 22 } ]
# --- Deuterocanonical books (BOOK_NB 67 to 73), numbered after Revelation ---
# Same entry layout as the preceding books: OSIS identifier, per-book URL args, chapter sequence.
- osis: Bar
args: { BOOK_NB: 67, BOOK_NAME: baruch }
chapters: [ { from: 1, to: 6 } ]
- osis: Tob
args: { BOOK_NB: 68, BOOK_NAME: tobie }
chapters: [ { from: 1, to: 14 } ]
- osis: Jdt
args: { BOOK_NB: 69, BOOK_NAME: judith }
chapters: [ { from: 1, to: 16 } ]
- osis: 1Macc
args: { BOOK_NB: 70, BOOK_NAME: 1maccabees }
chapters: [ { from: 1, to: 16 } ]
- osis: 2Macc
args: { BOOK_NB: 71, BOOK_NAME: 2maccabees }
chapters: [ { from: 1, to: 15 } ]
- osis: Wis
args: { BOOK_NB: 72, BOOK_NAME: sagesse }
chapters: [ { from: 1, to: 19 } ]
# NOTE(review): the only range starting at 0 — presumably chapter 0 is the prologue page of Sirach; confirm.
- osis: Sir
args: { BOOK_NB: 73, BOOK_NAME: ecclesiastique }
chapters: [ { from: 0, to: 51 } ]
# HTML parsing rules. Each rule pairs a CSS "selector" with the typed "contexts" declared for matching elements;
# "descendants" lists contexts nested under the one declaring them, and "op" names how a value is read from the element.
html:
elements:
# Page heading (h1.mb-3): BOOK_TITLE with a TEXT descendant read through the "ownText" op
# (presumably the element's own text, excluding its child elements — confirm).
- selector: "h1.mb-3"
contexts:
- type: BOOK_TITLE
descendants:
- type: TEXT
op: ownText
# Paragraphs directly under #logos, only considered with a BOOK_INTRO ancestor:
# a PARAGRAPH_BREAK followed by the paragraph's TEXT.
- selector: "#logos > p"
withAncestors: [ BOOK_INTRO ]
contexts:
- type: PARAGRAPH_BREAK
- type: TEXT
op: text
# h2 directly under #logos: major section title.
- selector: "#logos > h2"
contexts:
- type: MAJOR_SECTION_TITLE
descendants:
- type: TEXT
op: text
# h3 directly under #logos WITHOUT a direct <em> child: regular section title.
- selector: "#logos > h3:not(:has(> em))"
contexts:
- type: SECTION_TITLE
descendants:
- type: TEXT
op: text
# h3 directly under #logos WITH a direct <em> child: minor section title.
- selector: "#logos > h3:has(> em)"
contexts:
- type: MINOR_SECTION_TITLE
descendants:
- type: TEXT
op: text
# span.verset directly under #logos opens a VERSE; its value is read with the "text" op
# (presumably the verse number displayed by the site — confirm).
- selector: "#logos > span.verset"
contexts:
- type: VERSE
op: text
# Parsing of the verse content held by span[data-verset] elements directly under #logos.
# NOTE(review): withAncestors / withoutAncestors presumably require a VERSE context to be open and none of
# STRUCTURE_MARKER, NOTE or MARKUP — confirm the exact matching semantics against the scraper documentation.
externalParsers:
- selector: "#logos > span[data-verset]"
withAncestors: [ VERSE ]
withoutAncestors: [ STRUCTURE_MARKER, NOTE, MARKUP ]
nodeParser:
# Text nodes: emitted as TEXT, using a regexp made of a single catch-all capture group.
nodes:
- contexts:
- type: TEXT
regexp: (.*)
elements:
# Any direct child element of the verse span, except footnote buttons and line breaks: emitted as TEXT.
- selector: "#logos > span[data-verset] > *:not(button.footnote):not(br)"
contexts:
- type: TEXT
op: text
# Footnote buttons anywhere under #logos: NOTE whose TEXT is read from the "data-bs-content" attribute.
- selector: "#logos button.footnote"
contexts:
- type: NOTE
descendants:
- type: TEXT
op: attribute=data-bs-content
# theo.place does not have explicit poetry markup, but uses line breaks inside verses.
# The following rules interpret that in a "best-effort" mode, such that verses which contain line breaks are treated as poetry.
# It leads to some false positives: when the poetry starts or ends in the middle of a verse, the full verse will be tagged as poetry anyway.
# See Num 21:14 in the "tob" bible for example.
# A verse span with at least one direct <br> child starts a poetry line (literal value 1).
- selector: "#logos > span[data-verset]:has(> br)"
contexts:
- type: POETRY_LINE_START
op: literal=1
# Each <br> directly inside a verse span starts a new poetry line (literal value 1).
- selector: "#logos > span[data-verset] > br"
contexts:
- type: POETRY_LINE_START
op: literal=1
# A verse span without <br> that immediately follows the sibling chain "verse span with <br>, <br>, span.verset"
# marks a stanza break, i.e. the first non-poetry verse right after a poetry verse.
- selector: "#logos > span[data-verset]:has(> br) + br + span.verset + span[data-verset]:not(:has(> br))"
contexts:
- type: POETRY_STANZA_BREAK