Skip to content

Commit b1b26a6

Browse files
authored
Fix sitemap issue and add a sitemap limit, fix playground (#43)
1 parent 23fb541 commit b1b26a6

File tree

4 files changed

+16
-4
lines changed

4 files changed

+16
-4
lines changed

apps/tasks.py

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -134,7 +134,11 @@ def extract_urls_task(url):
134134
url_content_type = get_url_content_type(url=url)
135135
url_content_type_parts = url_content_type.split(';')
136136
mime_type = url_content_type_parts[0]
137-
if mime_type != 'text/html' or is_youtube_video_url(url):
137+
138+
if is_youtube_video_url(url):
139+
return [url]
140+
141+
if mime_type != 'text/html' and not is_sitemap_url(url):
138142
return [url]
139143

140144
# Get url domain

common/utils/crawlers.py

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@
1111
from scrapy.spiders import Rule
1212
from scrapy.spiders import SitemapSpider
1313
from unstructured.partition.auto import partition_html
14+
from scrapy.exceptions import CloseSpider
1415

1516
from django.conf import settings
1617

@@ -32,13 +33,17 @@ def get_domain(url):
3233
class SitemapXMLSpider(SitemapSpider):
3334
name = 'sitemap_spider'
3435

35-
def __init__(self, url, output, *args, **kwargs):
36+
def __init__(self, url, output, max_urls = 20, *args, **kwargs):
3637
self.sitemap_urls = [url]
3738
self.output = output
39+
self.max_urls = max_urls
3840
super(SitemapXMLSpider, self).__init__(*args, **kwargs)
3941

4042
def parse(self, response):
4143
data = {}
44+
if len(self.output) > self.max_urls:
45+
raise CloseSpider('Reached maximum number of crawled URLs')
46+
4247
# Extract data from the page using CSS or XPath selectors
4348
data['title'] = response.css('title::text').get()
4449
data['url'] = response.url

common/utils/utils.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -143,7 +143,7 @@ def get_url_content_type(url):
143143
def is_sitemap_url(url):
144144
try:
145145
content_type = get_url_content_type(url)
146-
if 'application/xml' in content_type or 'text/xml' in content_type or 'text/plain' in content_type:
146+
if 'application/xml' in content_type or 'text/xml' in content_type or 'text/plain' in content_type or 'application/rss+xml' in content_type:
147147
return True
148148
else:
149149
return False

processors/apis.py

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -454,7 +454,10 @@ async def stream_output():
454454
logger.exception(e)
455455
raise Exception(f'Error starting coordinator: {e}')
456456

457-
return {'output': json.loads(output)} if 'errors' not in output else output
457+
if isinstance(output, dict) and 'errors' in output:
458+
return output
459+
else:
460+
return {'output': json.loads(output)}
458461

459462

460463
class ApiProviderViewSet(viewsets.ViewSet):

0 commit comments

Comments
 (0)