18
18
RE_PATTERN_COMMA_SEPARATED_NUM = r"\b\d{1,3}(?:,\d{3})*\b"
19
19
20
20
21
- def get_text_html_tag (html_tag ):
21
+ def get_html_tag_as_text (html_tag ):
22
22
"""Get the text from an HTML tag or return NULL_VALUE if the tag is None or has no text.
23
23
24
24
Args:
@@ -31,7 +31,7 @@ def get_text_html_tag(html_tag):
31
31
return html_tag_text or NULL_VALUE
32
32
33
33
34
- def get_text_html_tag_attr (html_tag , attr ):
34
+ def get_html_tag_attribute_as_text (html_tag , attr ):
35
35
"""Get the attribute text from an HTML tag or return NULL_VALUE if the tag is None or has no text.
36
36
37
37
Args:
@@ -52,8 +52,7 @@ class Chrono24:
52
52
page_size = 120
53
53
54
54
def __init__ (self , query ):
55
- """
56
- Initialize a chrono24 object with a query.
55
+ """Initialize a chrono24 object with a query.
57
56
58
57
Args:
59
58
query (str): The search query to be performed.
@@ -212,8 +211,7 @@ class Listings:
212
211
"""A class representing a collection of listings extracted from HTML content."""
213
212
214
213
def __init__ (self , html ):
215
- """
216
- Initialize the Listings object with HTML content.
214
+ """Initialize the Listings object with HTML content.
217
215
218
216
Args:
219
217
html (bs4.element.ResultSet): The HTML content containing listings.
@@ -282,21 +280,21 @@ def json(self):
282
280
dict: A dictionary containing the extracted listing information.
283
281
"""
284
282
return {
285
- "id" : get_text_html_tag_attr (self .html , "data-article-id" ),
286
- "url" : BASE_URL + get_text_html_tag_attr (self .html , "href" ),
287
- "manufacturer" : get_text_html_tag_attr (self .html , "data-manufacturer" ),
288
- "certification_status" : get_text_html_tag_attr (
283
+ "id" : get_html_tag_attribute_as_text (self .html , "data-article-id" ),
284
+ "url" : BASE_URL + get_html_tag_attribute_as_text (self .html , "href" ),
285
+ "manufacturer" : get_html_tag_attribute_as_text (self .html , "data-manufacturer" ),
286
+ "certification_status" : get_html_tag_attribute_as_text (
289
287
self .html , "data-watch-certification-status"
290
288
),
291
- "title" : get_text_html_tag (
289
+ "title" : get_html_tag_as_text (
292
290
self .html .find (
293
291
"div" , class_ = lambda x : x and "text-bold" in x and "text-ellipsis" in x
294
292
)
295
293
),
296
- "description" : get_text_html_tag (
294
+ "description" : get_html_tag_as_text (
297
295
self .html .find ("div" , class_ = lambda x : x and "m-b-2" in x and "text-ellipsis" in x )
298
296
),
299
- "price" : get_text_html_tag (
297
+ "price" : get_html_tag_as_text (
300
298
(lambda x : x .parent if x else x )(self .html .find ("span" , {"class" : "currency" }))
301
299
),
302
300
"shipping_price" : self ._shipping_price ,
@@ -314,7 +312,7 @@ def _shipping_price(self):
314
312
str: The shipping price extracted from the content, formatted as a string ('$X' format).
315
313
"""
316
314
# Extract comma-separated shipping price
317
- shipping_price_text = get_text_html_tag (
315
+ shipping_price_text = get_html_tag_as_text (
318
316
self .html .find ("div" , {"class" : "text-muted text-sm" })
319
317
)
320
318
match = re .search (RE_PATTERN_COMMA_SEPARATED_NUM , shipping_price_text )
@@ -328,7 +326,7 @@ def _location_and_merchant_name(self):
328
326
Returns:
329
327
tuple: A tuple containing the location and merchant name extracted from the content.
330
328
"""
331
- location = get_text_html_tag_attr (
329
+ location = get_html_tag_attribute_as_text (
332
330
self .html .find ("button" , {"class" : "js-tooltip" }), "data-content"
333
331
)
334
332
# Possible merchant names found in listings page
@@ -352,7 +350,7 @@ def _image_urls(self):
352
350
image_divs = self .html .find_all ("div" , {"class" : "js-carousel-cell" })
353
351
# Modify URLs to select for extra large images
354
352
return [
355
- get_text_html_tag_attr (image_div .find ("img" ), "data-lazy-sweet-spot-master-src" )
353
+ get_html_tag_attribute_as_text (image_div .find ("img" ), "data-lazy-sweet-spot-master-src" )
356
354
.lower ()
357
355
.replace ("square_size_" , "ExtraLarge" )
358
356
for image_div in image_divs
@@ -366,7 +364,7 @@ def _badge(self):
366
364
str: The badge information related to the listing.
367
365
"""
368
366
badge = self .html .find ("span" , {"class" : "article-item-article-badge" })
369
- return get_text_html_tag (badge )
367
+ return get_html_tag_as_text (badge )
370
368
371
369
372
370
class DetailedListing :
@@ -402,16 +400,16 @@ def _product_details(self):
402
400
details = [section .find_all ("td" ) for section in detail_section .find_all ("tr" )]
403
401
for idx , detail in enumerate (details ):
404
402
# Get detail key and set default detail value
405
- detail_key = get_text_html_tag (detail [0 ]).lower ().replace (" " , "_" )
403
+ detail_key = get_html_tag_as_text (detail [0 ]).lower ().replace (" " , "_" )
406
404
detail_description = NULL_VALUE
407
405
try :
408
- detail_description = get_text_html_tag (detail [1 ])
406
+ detail_description = get_html_tag_as_text (detail [1 ])
409
407
product_details [detail_key ] = self ._tidy_product_detail (detail_description )
410
408
except IndexError :
411
409
# Check if `detail` is a header above description column or description body
412
410
# We want to map description headers to their bodies
413
411
if idx + 1 != len (details ) and len (details [idx + 1 ]) == 1 :
414
- detail_description = get_text_html_tag (details [idx + 1 ][0 ])
412
+ detail_description = get_html_tag_as_text (details [idx + 1 ][0 ])
415
413
product_details [detail_key ] = self ._tidy_product_detail (detail_description )
416
414
417
415
return product_details
@@ -476,7 +474,7 @@ def _anticipated_delivery(self):
476
474
str: The anticipated delivery details for the listing.
477
475
"""
478
476
anticipated_delivery = self .html .find ("span" , {"class" : "js-shipping-time" })
479
- return get_text_html_tag (anticipated_delivery ).replace ("Anticipated delivery: " , "" )
477
+ return get_html_tag_as_text (anticipated_delivery ).replace ("Anticipated delivery: " , "" )
480
478
481
479
@property
482
480
def _merchant_name (self ):
@@ -486,7 +484,7 @@ def _merchant_name(self):
486
484
str: The name of the merchant associated with the listing.
487
485
"""
488
486
merchant_name = self .html .find ("button" , {"class" : "js-link-merchant-name" })
489
- return get_text_html_tag (merchant_name )
487
+ return get_html_tag_as_text (merchant_name )
490
488
491
489
@property
492
490
def _merchant_rating (self ):
@@ -496,7 +494,7 @@ def _merchant_rating(self):
496
494
str: The rating of the merchant associated with the listing.
497
495
"""
498
496
rating = self .html .find ("span" , {"class" : "rating" })
499
- return get_text_html_tag (rating )
497
+ return get_html_tag_as_text (rating )
500
498
501
499
@property
502
500
def _merchant_reviews (self ):
@@ -506,7 +504,7 @@ def _merchant_reviews(self):
506
504
str: The number of reviews for the merchant associated with the listing.
507
505
"""
508
506
num_reviews = self .html .find ("button" , {"class" : "js-link-merchant-reviews" })
509
- return get_text_html_tag (num_reviews )
507
+ return get_html_tag_as_text (num_reviews )
510
508
511
509
@property
512
510
def _merchant_badges (self ):
@@ -520,6 +518,6 @@ def _merchant_badges(self):
520
518
badge_html = BeautifulSoup (badge .get ("data-content" ), "html.parser" )
521
519
badge_text = badge_html .find ("span" , {"class" : "" })
522
520
if badge_text :
523
- badges .append (get_text_html_tag (badge_text ))
521
+ badges .append (get_html_tag_as_text (badge_text ))
524
522
525
523
return badges
0 commit comments