-
Notifications
You must be signed in to change notification settings - Fork 51
Fixes issue #1 #36
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: master
Are you sure you want to change the base?
Fixes issue #1 #36
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -31,7 +31,6 @@ def fromstring(cls, price: Optional[str], | |
| Given price and currency text extracted from HTML elements, return | ||
| ``Price`` instance, which provides a clean currency symbol and | ||
| price amount as a Decimal number. | ||
|
|
||
| ``currency_hint`` is optional; you can pass value of some element | ||
| which may contain currency, as a hint. If currency is present in | ||
| ``price`` string, it could be **preferred** over a value extracted | ||
|
|
@@ -162,18 +161,15 @@ def extract_price_text(price: str) -> Optional[str]: | |
| maybe some other text. If multiple price-looking substrings are present, | ||
| the first is returned (FIXME: it is better to return a number | ||
| which is near a currency symbol). | ||
|
|
||
| >>> extract_price_text("price: $12.99") | ||
| '12.99' | ||
| >>> extract_price_text("Free") | ||
| '0' | ||
| >>> extract_price_text("Foo") | ||
| >>> extract_price_text("1,235 USD") | ||
| '1,235' | ||
|
|
||
| In addition to numbers, it has a limited support for a case where | ||
| currency symbol (currently only euro) is a decimal separator: | ||
|
|
||
| >>> extract_price_text("99 €, 79 €") | ||
| '99' | ||
| >>> extract_price_text("99 € 79 €") | ||
|
|
@@ -201,11 +197,18 @@ def extract_price_text(price: str) -> Optional[str]: | |
| m = re.search(r""" | ||
| (\d[\d\s.,]*) # number, probably with thousand separators | ||
| \s*? # skip whitespace | ||
| ([m|M,b|B]il\w*)? # check million* or billion* | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Please check out how `[]` (a character class) works in regular expressions.
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Thanks @Gallaecio for flagging this! Let me know if there's still some improvement that needs to be made. |
||
| (?:[^%\d]|$) # capture next symbol - it shouldn't be % | ||
| """, price, re.VERBOSE) | ||
|
|
||
| if m: | ||
| return m.group(1).strip(',.').strip() | ||
| subprice = m.group(2) | ||
| if 'bil' in subprice: | ||
| price = m.group(1).strip(',.').strip()+' x 10\u2079' | ||
| elif 'mil' in subprice: | ||
| price = m.group(1).strip(',.').strip()+' x 10\u2076' | ||
| return price | ||
|
|
||
| if 'free' in price.lower(): | ||
| return '0' | ||
| return None | ||
|
|
@@ -226,7 +229,6 @@ def extract_price_text(price: str) -> Optional[str]: | |
| def get_decimal_separator(price: str) -> Optional[str]: | ||
| """ Return decimal separator symbol or None if there | ||
| is no decimal separator. | ||
|
|
||
| >>> get_decimal_separator("1000") | ||
| >>> get_decimal_separator("12.99") | ||
| '.' | ||
|
|
@@ -247,7 +249,6 @@ def parse_number(num: str, | |
| decimal_separator: Optional[str] = None) -> Optional[Decimal]: | ||
| """ Parse a string with a number to a Decimal, guessing its format: | ||
| decimal separator, thousand separator. Return None if parsing fails. | ||
|
|
||
| >>> parse_number("1,234") | ||
| Decimal('1234') | ||
| >>> parse_number("12,34") | ||
|
|
@@ -281,7 +282,12 @@ def parse_number(num: str, | |
| """ | ||
| if not num: | ||
| return None | ||
| num = num.strip().replace(' ', '') | ||
| num_copy = num | ||
| if 'x' in num: | ||
| num = num.split()[0] | ||
| else: | ||
| num = num.strip().replace(' ', '') | ||
|
|
||
| decimal_separator = decimal_separator or get_decimal_separator(num) | ||
| # NOTE: Keep supported separators in sync with _search_decimal_sep | ||
| if decimal_separator is None: | ||
|
|
@@ -294,6 +300,7 @@ def parse_number(num: str, | |
| assert decimal_separator == '€' | ||
| num = num.replace('.', '').replace(',', '').replace('€', '.') | ||
| try: | ||
| return Decimal(num) | ||
| num = num+num_copy.split()[1]+num_copy.split()[2] if 'x' in num_copy else '' | ||
| return num | ||
| except InvalidOperation: | ||
| return None | ||
| return None | ||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -6,7 +6,7 @@ | |
| name='price-parser', | ||
| version='0.3.1', | ||
| description='Extract price and currency from a raw string', | ||
| long_description=open('README.rst').read() + "\n\n" + open('CHANGES.rst').read(), | ||
| long_description=open('README.rst', encoding="utf8").read() + "\n\n" + open('CHANGES.rst').read(), | ||
| author='Mikhail Korobov', | ||
| author_email='[email protected]', | ||
| url='https://github.com/scrapinghub/price-parser', | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
What do you have against paragraphs? 😛
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
That might be due to PyCharm's auto-indentation. We can definitely undo it. 😄