-
Notifications
You must be signed in to change notification settings - Fork 2
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
feat: Add content map creator from XML sitemap
- Loading branch information
1 parent
2963079
commit 514cd16
Showing
6 changed files
with
944 additions
and
9 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,3 +1,6 @@ | ||
contentmap.db | ||
|
||
|
||
# Byte-compiled / optimized / DLL files | ||
__pycache__/ | ||
*.py[cod] | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,21 +1,53 @@ | ||
from typing import List, Dict, TypedDict | ||
from datetime import datetime | ||
import sqlite3 | ||
import importlib.metadata | ||
|
||
|
||
class ContentMapCreator:
    """Store page contents and build metadata in a SQLite database.

    Two tables are used: ``content`` (columns ``url``, ``content``) receives
    one row per page, and ``config`` (columns ``cat``, ``value``) receives
    key/value metadata describing the build.
    """

    def __init__(
        self,
        contents: List[Dict[str, str]],
        database: str = "contentmap.db"
    ):
        """Open the target database.

        Args:
            contents: Rows to store; each row is a mapping with ``url`` and
                ``content`` keys.
            database: Path (or ``":memory:"``) passed to ``sqlite3.connect``.
        """
        self.contents = contents
        self.db = sqlite3.connect(database)
        # Rows come back as sqlite3.Row so columns can be read by name.
        self.db.row_factory = sqlite3.Row
        self.cursor = self.db.cursor()

    def init_db(self):
        """Create the ``content`` and ``config`` tables if they are missing."""
        self.cursor.execute("CREATE TABLE IF NOT EXISTS content (url, content)")
        self.cursor.execute("CREATE TABLE IF NOT EXISTS config (cat, value)")
        self.db.commit()

    def add_config(self):
        """Insert the build metadata rows into the ``config`` table."""
        data = [
            {"cat": "Version:", "value": "1"},
            {"cat": "Generated with:", "value": "Contentmap lib"},
            {
                "cat": "Date:",
                "value": datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
            },
            {"cat": "Embeddings:", "value": "mistral-embed"},
            {"cat": "FTSE:", "value": None},
        ]
        self.cursor.executemany("INSERT INTO config VALUES (:cat, :value)", data)
        self.db.commit()

    def build(self):
        """Create the tables, then write the metadata and all content rows.

        Every call appends a fresh set of ``config`` and ``content`` rows;
        existing rows are not replaced.
        """
        self.init_db()
        self.add_config()
        # A failed download may be reported as None instead of a row; binding
        # None as a parameter set makes executemany raise, so skip those.
        rows = [row for row in self.contents if row is not None]
        self.cursor.executemany(
            "INSERT INTO content VALUES (:url, :content)",
            rows
        )
        self.db.commit()

    def close(self):
        """Close the underlying database connection."""
        self.db.close()
|
||
|
||
if __name__ == "__main__":
    # Manual smoke run: write one sample page into the default database file.
    contentmap = ContentMapCreator(contents=[
        {"url": "https://adrianbeaumont.net", "content": "Hello world"}
    ])
    contentmap.build()
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,69 @@ | ||
import asyncio | ||
import logging | ||
import requests | ||
|
||
import aiohttp | ||
import trafilatura | ||
from tqdm.asyncio import tqdm_asyncio | ||
from lxml import etree | ||
|
||
from contentmap.core import ContentMapCreator | ||
|
||
|
||
class SitemapToContentDatabase:
    """Download every page listed in an XML sitemap and store its main text.

    The sitemap is fetched with ``requests``, the pages are downloaded
    concurrently with ``aiohttp``, their text is extracted with
    ``trafilatura`` and the results are written through ``ContentMapCreator``.
    """

    def __init__(self, sitemap_url, seconds_timeout=10, concurrency=None):
        """Configure the loader.

        Args:
            sitemap_url: URL of the XML sitemap to read.
            seconds_timeout: Connect/read timeout in seconds, used for the
                sitemap request and for every page download.
            concurrency: Maximum number of simultaneous page downloads, or
                ``None`` for no limit.
        """
        self.sitemap_url = sitemap_url
        self.seconds_timeout = seconds_timeout
        self.concurrency = concurrency
        # Created in get_contents() so the semaphore belongs to the event
        # loop that actually runs the downloads.
        self.semaphore = None
        self.timeout = aiohttp.ClientTimeout(
            sock_connect=seconds_timeout,
            sock_read=seconds_timeout
        )

    def load(self):
        """Fetch the sitemap, download all pages and write them to the database.

        Returns:
            The list of ``{"url", "content"}`` rows that were stored; pages
            whose download failed are left out.
        """
        urls = self.get_urls()
        results = asyncio.run(self.get_contents(urls))
        contents = self._drop_failed(results)
        cm = ContentMapCreator(contents)
        cm.build()
        return contents

    def get_urls(self):
        """Return the text of every ``<loc>`` element in the sitemap."""
        r = requests.get(self.sitemap_url, timeout=self.seconds_timeout)
        tree = etree.fromstring(r.content)
        locs = tree.findall(".//{http://www.sitemaps.org/schemas/sitemap/0.9}loc")
        # <loc> text is often padded with whitespace and may be empty.
        return [loc.text.strip() for loc in locs if loc.text and loc.text.strip()]

    async def get_contents(self, urls):
        """Download all *urls* concurrently.

        Returns:
            One entry per URL, in order: a ``{"url", "content"}`` dict, or
            ``None`` when that download failed.
        """
        if self.concurrency is not None:
            self.semaphore = asyncio.Semaphore(self.concurrency)
        async with aiohttp.ClientSession(timeout=self.timeout) as session:
            tasks = [self.fetch_content(session, url) for url in urls]
            return await tqdm_asyncio.gather(*tasks)

    async def fetch_content(self, session, url):
        """Download one page and extract its main text.

        Returns:
            ``{"url": url, "content": text}``, or ``None`` if the request
            failed (the error is logged).
        """
        try:
            if self.semaphore is None:
                raw = await self._download(session, url)
            else:
                async with self.semaphore:
                    raw = await self._download(session, url)
        except (aiohttp.ClientError, asyncio.TimeoutError) as e:
            # One unreachable page must not abort the whole crawl.
            logging.error("Error while fetching %s: %r", url, e)
            return None
        content = trafilatura.extract(raw)
        return {"url": url, "content": content}

    @staticmethod
    async def _download(session, url):
        """Return the response body of *url* as text."""
        async with session.get(url) as response:
            return await response.text()

    @staticmethod
    def _drop_failed(results):
        """Return *results* without the ``None`` entries of failed downloads."""
        return [item for item in results if item is not None]
|
||
|
||
if __name__ == "__main__":
    from datetime import datetime

    # Manual smoke run: crawl a sample sitemap and report the elapsed time.
    started = datetime.now()
    loader = SitemapToContentDatabase(
        "https://adrianbeaumont.net/sitemap-1.xml",
        concurrency=20,
    )
    contents = loader.load()
    finished = datetime.now()
    print(finished - started)
    print(contents)
|
This file was deleted.
Oops, something went wrong.
Oops, something went wrong.