Skip to content

Commit

Permalink
scraper: skip price info request for invalid SKUs
Browse files Browse the repository at this point in the history
Some SKUs seem to have been removed from the CT database. Unfortunately,
they are still listed by the API endpoints we use to list products and
SKUs.

When we attempt a PriceAvailability request for a batch of SKUs that
contains an invalid SKU, the service replies with the following kind of
error:

  {
    "statusCode": 400,
    "errors": [
        "An error on the server occurred"
    ],
    "isCdsError": true,
    "error": "Bad Request",
    "errCode": "0",
    "xRequestId": null,
    "requestId": "3706a044-a8d1-4691-97cf-bdb78eae08c7",
    "correlationId": "f5ebd64d-8c62-4c6a-bb18-1989391cfa1a"
  }

I have not found anything in the SKU listing response that allows us to
determine which SKUs are no longer valid.

For the moment, this provides a work-around by falling back to
individually querying for the price of each SKU when a batch fails with
the '400' status code.

Signed-off-by: Jérémie Galarneau <[email protected]>
  • Loading branch information
jgalar committed Feb 24, 2024
1 parent 46b5867 commit 78214e3
Showing 1 changed file with 58 additions and 12 deletions.
70 changes: 58 additions & 12 deletions src/canadiantracker/triangle.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
import time
from collections.abc import Iterable, Iterator, Sequence
from datetime import datetime
from typing import Callable, Generator, Tuple
from typing import Callable, Generator, Optional, Tuple

import latest_user_agents
import requests
Expand Down Expand Up @@ -358,7 +358,13 @@ def __iter__(self):

# A non-200 HTTP response when querying prices.
class _PriceQueryException(Exception):
    """Signal that a price query could not be completed.

    The exception optionally records the HTTP status code associated with
    the failure so that callers can react differently depending on it.
    """

    def __init__(self, msg: str, request_status_code: Optional[int] = None):
        # Record the status first; the base class only needs the message.
        self._request_status_code = request_status_code
        super().__init__(msg)

    @property
    def request_status_code(self) -> Optional[int]:
        """Status code supplied at construction, or ``None`` when absent."""
        return self._request_status_code


class ProductLedger(Iterable):
Expand All @@ -379,9 +385,7 @@ def _batches(it: Iterator, batch_max_size: int) -> Generator[list, None, None]:
yield batch

@staticmethod
def _get_price_infos(
sku_codes: Sequence[str],
) -> Sequence[PriceInfo]:
def _request_price_infos(sku_codes: Sequence[str]) -> requests.Response:
for ntry in range(5):
url = "https://apim.canadiantire.ca/v1/product/api/v1/product/sku/PriceAvailability/?lang=en_CA&storeId=64"
headers = _base_headers.copy()
Expand All @@ -398,27 +402,69 @@ def _get_price_infos(
]
}

logger.debug("requested {} price infos".format(len(sku_codes)))
logger.debug(
f"Sending batched price info query request: ntry={ntry} batch_size={len(sku_codes)} sku_codes={sku_codes}"
)
response = requests.post(url, headers=headers, json=body)

if response.status_code != 200:
# Wait a bit before retrying, in case the admin is restarting the container.
logger.error(f"Got status code {response.status_code} on try {ntry}")
logger.error(response.text)

if "Request failed with status code 404" in response.text:
break
raise _PriceQueryException("Failed to get product info", 404)
elif response.status_code == 400:
raise _PriceQueryException(
"Failed to get product info", response.status_code
)

time.sleep(5)
continue

response = response.json(parse_float=decimal.Decimal)
response_skus = response["skus"]
logger.debug("received {} price infos".format(len(response_skus)))
return response

raise _PriceQueryException("Failed to get product info")

@staticmethod
def _get_price_infos(
    sku_codes: Sequence[str],
) -> Sequence[PriceInfo]:
    """Return the price information for a batch of SKU codes.

    The whole batch is requested at once through
    ``ProductLedger._request_price_infos``. If that request fails with
    status code 400 and the batch holds more than one SKU, every SKU is
    queried on its own and the ones that fail with status code 400 again
    are skipped.

    Args:
        sku_codes: SKU codes to query.

    Returns:
        One ``PriceInfo`` per SKU entry returned by the service. SKUs that
        were skipped during the item-by-item fallback are absent.

    Raises:
        _PriceQueryException: the query failed with a status other than
            400, or a single-SKU query failed with status 400 (the
            item-by-item fallback catches the latter and skips the SKU).
    """
    try:
        response_skus = ProductLedger._request_price_infos(sku_codes).json(
            parse_float=decimal.Decimal
        )["skus"]
        logger.debug("Received {} price infos".format(len(response_skus)))
        return [PriceInfo(price_info) for price_info in response_skus]
    except _PriceQueryException as batch_query_exception:
        logger.warning(
            f"Price info query failed with status {batch_query_exception.request_status_code}"
        )
        if batch_query_exception.request_status_code != 400 or len(sku_codes) <= 1:
            # Not the "invalid SKU in batch" situation, or nothing left to
            # split: let the caller deal with the failure.
            raise

        # Some SKUs are retired and probing their price will cause the server
        # to return an "internal error" if they are part of the requested
        # batch. In those cases, fall back to requesting the prices one by
        # one.
        logger.debug(
            "Attempting to process failed price info query batch item by item"
        )
        price_infos = []
        for code in sku_codes:
            try:
                # extend() rather than indexing [0]: an empty reply for a
                # single SKU must not abort the rest of the batch.
                price_infos.extend(ProductLedger._get_price_infos([code]))
            except _PriceQueryException as single_query_exception:
                # Report the status of the individual query, not the
                # status of the batch that triggered the fallback.
                logger.warning(
                    f"Individual price info query failed with status {single_query_exception.request_status_code}"
                )
                if single_query_exception.request_status_code != 400:
                    raise
                logger.debug(f"Skipping price info query for sku '{code}'")

        return price_infos

def __iter__(self) -> Iterator[PriceInfo]:
# The API limits requests to 50 products
Expand Down

0 comments on commit 78214e3

Please sign in to comment.