Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
27 changes: 17 additions & 10 deletions price_parser/parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,6 @@ def fromstring(cls, price: Optional[str],
Given price and currency text extracted from HTML elements, return
``Price`` instance, which provides a clean currency symbol and
price amount as a Decimal number.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

What do you have against paragraphs? 😛

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

That is probably due to PyCharm's auto-indentation. We can definitely undo it. 😄

``currency_hint`` is optional; you can pass value of some element
which may contain currency, as a hint. If currency is present in
``price`` string, it could be **preferred** over a value extracted
Expand Down Expand Up @@ -162,18 +161,15 @@ def extract_price_text(price: str) -> Optional[str]:
maybe some other text. If multiple price-looking substrings are present,
the first is returned (FIXME: it is better to return a number
which is near a currency symbol).

>>> extract_price_text("price: $12.99")
'12.99'
>>> extract_price_text("Free")
'0'
>>> extract_price_text("Foo")
>>> extract_price_text("1,235 USD")
'1,235'

In addition to numbers, it has a limited support for a case where
currency symbol (currently only euro) is a decimal separator:

>>> extract_price_text("99 €, 79 €")
'99'
>>> extract_price_text("99 € 79 €")
Expand Down Expand Up @@ -201,11 +197,18 @@ def extract_price_text(price: str) -> Optional[str]:
m = re.search(r"""
(\d[\d\s.,]*) # number, probably with thousand separators
\s*? # skip whitespace
([m|M,b|B]il\w*)? # check million* or billion*
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Copy link
Contributor Author

@rpalsaxena rpalsaxena Mar 12, 2020

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Thanks @Gallaecio for flagging this!
I'll update it with: ([mMbB]il\w*)?

Let me know if anything else still needs to be improved.

(?:[^%\d]|$) # capture next symbol - it shouldn't be %
""", price, re.VERBOSE)

if m:
return m.group(1).strip(',.').strip()
subprice = m.group(2)
if 'bil' in subprice:
price = m.group(1).strip(',.').strip()+' x 10\u2079'
elif 'mil' in subprice:
price = m.group(1).strip(',.').strip()+' x 10\u2076'
return price

if 'free' in price.lower():
return '0'
return None
Expand All @@ -226,7 +229,6 @@ def extract_price_text(price: str) -> Optional[str]:
def get_decimal_separator(price: str) -> Optional[str]:
""" Return decimal separator symbol or None if there
is no decimal separator.

>>> get_decimal_separator("1000")
>>> get_decimal_separator("12.99")
'.'
Expand All @@ -247,7 +249,6 @@ def parse_number(num: str,
decimal_separator: Optional[str] = None) -> Optional[Decimal]:
""" Parse a string with a number to a Decimal, guessing its format:
decimal separator, thousand separator. Return None if parsing fails.

>>> parse_number("1,234")
Decimal('1234')
>>> parse_number("12,34")
Expand Down Expand Up @@ -281,7 +282,12 @@ def parse_number(num: str,
"""
if not num:
return None
num = num.strip().replace(' ', '')
num_copy = num
if 'x' in num:
num = num.split()[0]
else:
num = num.strip().replace(' ', '')

decimal_separator = decimal_separator or get_decimal_separator(num)
# NOTE: Keep supported separators in sync with _search_decimal_sep
if decimal_separator is None:
Expand All @@ -294,6 +300,7 @@ def parse_number(num: str,
assert decimal_separator == '€'
num = num.replace('.', '').replace(',', '').replace('€', '.')
try:
return Decimal(num)
num = num+num_copy.split()[1]+num_copy.split()[2] if 'x' in num_copy else ''
return num
except InvalidOperation:
return None
return None
2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
name='price-parser',
version='0.3.1',
description='Extract price and currency from a raw string',
long_description=open('README.rst').read() + "\n\n" + open('CHANGES.rst').read(),
long_description=open('README.rst', encoding="utf8").read() + "\n\n" + open('CHANGES.rst').read(),
author='Mikhail Korobov',
author_email='[email protected]',
url='https://github.com/scrapinghub/price-parser',
Expand Down