-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathhelpers.py
93 lines (73 loc) · 2.4 KB
/
helpers.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
from datetime import datetime
import re
from enum import Enum
def extract_integer(s):
    """Return the first integer found in ``s``, or ``None`` if there is none.

    Both comma-grouped numbers ("1,473") and plain digit runs ("12345") are
    recognised; thousands separators are dropped before conversion.

    Args:
        s: Text to search.

    Returns:
        The first number in ``s`` as an ``int``, or ``None`` when ``s``
        contains no digits.
    """
    # Try a properly comma-grouped number first, then fall back to a plain
    # run of digits so ungrouped numbers longer than three digits are not
    # truncated to their first three digits.
    match = re.search(r"\d{1,3}(?:,\d{3})+|\d+", s)
    if match is None:
        return None
    return int(match.group(0).replace(",", ""))
def extract_float_from_phrase(s):
    """Return the first number in ``s`` as a float, or ``None`` if absent.

    A decimal such as "4.5" is preferred over a bare integer at the same
    position; a bare integer such as "5" is returned as ``5.0``.
    """
    found = re.search(r"\d+\.\d+|\d+", s)
    return float(found.group()) if found else None
def parse_review_date_and_country(date_text):
    """Parse a review byline into its country and date.

    Two layouts are recognised:
    "Reviewed in <country> on <date>" and "Reviewed on <date>".

    Args:
        date_text: The byline text; may be ``None`` or empty.

    Returns:
        A dict with keys ``"country"`` (``"Unknown"`` when the byline has no
        country part) and ``"date"`` (a naive ``datetime``), or ``None`` when
        the text is empty, matches neither layout, or the date is not in
        ``"%B %d, %Y"`` form (e.g. "January 5, 2024"). A date-format failure
        is also reported on stdout.
    """
    if not date_text:
        return None

    # Prefer the layout that carries a country; fall back to the bare one.
    match = re.search(r"Reviewed in (.+?) on (.+)", date_text)
    if match:
        country, date_str = match.group(1), match.group(2)
    else:
        match = re.search(r"Reviewed on (.+)", date_text)
        if not match:
            return None
        country, date_str = "Unknown", match.group(1)

    try:
        # Strip surrounding whitespace: strptime rejects it, which would turn
        # an otherwise valid byline into a parse failure.
        # NOTE: %B month names depend on the active locale.
        review_date = datetime.strptime(date_str.strip(), "%B %d, %Y")
    except ValueError as e:
        print(f"Date parsing error: {e} for text: {date_text}")
        return None

    return {"country": country.strip(), "date": review_date}
def parse_reviews_count(phrase: str) -> int:
    """
    Extract the "with reviews" count from a ratings summary phrase.

    Args:
        phrase (str): Summary text, e.g. "1,473 total ratings, 520 with reviews".

    Returns:
        int: The number immediately preceding " with reviews" (thousands
        separators removed), or 0 when the phrase has no such number.
    """
    found = re.search(r"(\d[\d,]*) with reviews", phrase)
    if found is None:
        return 0
    # Drop thousands separators before converting.
    digits = found.group(1).replace(",", "")
    return int(digits)
class AmazonFilterMediaType(Enum):
    """Media-type filter options; each member's value is its string token.

    NOTE(review): presumably these tokens match the filter values expected
    by the Amazon reviews page — confirm against the caller.
    """

    # Disabled option, kept for reference:
    # MEDIA_REVIEWS_ONLY = "media_reviews_only"
    ALL_CONTENTS = "all_contents"
class AmazonFilterSortBy(Enum):
    """Sort-order filter options; each member's value is its string token.

    NOTE(review): presumably these tokens match the sort values expected by
    the Amazon reviews page — confirm against the caller.
    """

    RECENT = "recent"
    HELPFUL = "helpful"
class AmazonFilterStarRating(Enum):
    """Star-rating filter options; each member's value is its string token.

    Only the five single-star-level options are active; the aggregate
    options below are commented out.

    NOTE(review): presumably these tokens match the star filter values
    expected by the Amazon reviews page — confirm against the caller.
    """

    # Disabled option, kept for reference:
    # ALL_STAR = "all_star"
    FIVE_STAR = "five_star"
    FOUR_STAR = "four_star"
    THREE_STAR = "three_star"
    TWO_STAR = "two_star"
    ONE_STAR = "one_star"
    # Disabled options, kept for reference:
    # POSITIVE = "positive"
    # CRITICAL = "critical"
class AmazonFilterFormatType(Enum):
    """Format-type filter options; each member's value is its string token.

    NOTE(review): presumably these tokens match the format filter values
    expected by the Amazon reviews page — confirm against the caller.
    """

    ALL_FORMATS = "all_formats"
    CURRENT_FORMAT = "current_format"