Skip to content

Commit

Permalink
Pull sitemaps from disk
Browse files Browse the repository at this point in the history
* Allow a directory of XML sitemaps on disk to be supplied as an option
  when initializing SitemapToContentDatabase
* Allow for multiple sitemap urls
  • Loading branch information
medoror committed Aug 4, 2024
1 parent 318412b commit d7c19f2
Show file tree
Hide file tree
Showing 2 changed files with 35 additions and 6 deletions.
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@ following:
from contentmap.sitemap import SitemapToContentDatabase

database = SitemapToContentDatabase(
sitemap_url="https://yourblog.com/sitemap.xml",
sitemap_sources=["https://yourblog.com/sitemap.xml"],
concurrency=10,
include_vss=True
)
Expand Down
39 changes: 34 additions & 5 deletions contentmap/sitemap.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
import asyncio
import logging
from typing import Literal
import requests
import os

import aiohttp
import trafilatura
Expand All @@ -11,10 +13,17 @@


class SitemapToContentDatabase:
SOURCE_TYPE_URL: Literal['url'] = 'url'
SOURCE_TYPE_DISK: Literal['disk'] = 'disk'
SourceType = Literal['url', 'disk']

def __init__(self, sitemap_url, seconds_timeout=10, concurrency=None,
def __init__(self, sitemap_sources: list,
source_type: SourceType = SOURCE_TYPE_URL,
seconds_timeout=10,
concurrency=None,
include_vss=False):
self.sitemap_url = sitemap_url
self.sitemap_sources = sitemap_sources
self.source_type = source_type
self.semaphore = asyncio.Semaphore(concurrency) if concurrency is not None else None
self.timeout = aiohttp.ClientTimeout(
sock_connect=seconds_timeout,
Expand All @@ -30,13 +39,33 @@ def build(self):
cm.build()

def get_urls(self):
    """Collect page URLs from every configured sitemap source.

    ``self.sitemap_sources`` may be a list of sources or a single
    string/path, which is treated as a one-element list. How each source
    is interpreted depends on ``self.source_type``:

    * ``SOURCE_TYPE_URL``: each source is a sitemap URL that is fetched
      with ``get_urls_from_url``.
    * ``SOURCE_TYPE_DISK``: each source is a directory; every ``*.xml``
      file directly inside it is parsed with ``get_urls_from_disk``.

    Returns:
        A flat list with the URLs of all sources, in source order. An
        unrecognised ``source_type`` yields an empty list.
    """
    sources = self.sitemap_sources
    # A bare string would otherwise be iterated character by character
    # (URL mode), while a list cannot be handed to os.listdir (disk mode).
    # Normalising to a list makes both call styles work.
    if isinstance(sources, (str, os.PathLike)):
        sources = [sources]

    all_urls = []
    if self.source_type == self.SOURCE_TYPE_URL:
        for sitemap_url in sources:
            all_urls.extend(self.get_urls_from_url(sitemap_url))
    elif self.source_type == self.SOURCE_TYPE_DISK:
        for directory in sources:
            # os.listdir returns entries in arbitrary order; sort so the
            # resulting URL order is reproducible between runs.
            for filename in sorted(os.listdir(directory)):
                if filename.endswith('.xml'):
                    filepath = os.path.join(directory, filename)
                    all_urls.extend(self.get_urls_from_disk(filepath))
    return all_urls

def get_urls_from_url(self, sitemap_url):
    """Download the sitemap at *sitemap_url* and return the URLs it lists.

    The response body is parsed as XML and handed to
    ``extract_urls_from_tree``.

    NOTE(review): no timeout is passed to ``requests.get`` and the HTTP
    status is not checked here; confirm whether that is intended.
    """
    r = requests.get(sitemap_url)
    # Parse the raw bytes (not r.text) so the XML parser can honour the
    # encoding declared in the document itself.
    tree = etree.fromstring(r.content)
    return self.extract_urls_from_tree(tree)

def get_urls_from_disk(self, filepath):
    """Parse the sitemap XML file at *filepath* and return the URLs it lists.

    The parsed document is passed to ``extract_urls_from_tree``, which
    does the actual ``<loc>`` lookup.
    """
    return self.extract_urls_from_tree(etree.parse(filepath))

def extract_urls_from_tree(self, tree):
    """Return the text of every sitemap ``<loc>`` element found in *tree*.

    Args:
        tree: A parsed XML element or document exposing ``findall``.

    Returns:
        A list of URL strings in document order. Surrounding whitespace
        is stripped, and ``<loc>`` elements that are empty or contain
        only whitespace are skipped.
    """
    loc_path = ".//{http://www.sitemaps.org/schemas/sitemap/0.9}loc"
    urls = []
    for loc in tree.findall(loc_path):
        # An empty <loc/> has text None; pretty-printed sitemaps often wrap
        # the URL in newlines and indentation.
        text = (loc.text or "").strip()
        if text:
            urls.append(text)
    return urls

async def get_contents(self, urls):
async with aiohttp.ClientSession(timeout=self.timeout) as session:
Expand Down

0 comments on commit d7c19f2

Please sign in to comment.