Skip to content

Commit

Permalink
feat: Add content map creator from XML sitemap
Browse files Browse the repository at this point in the history
  • Loading branch information
philippe2803 committed Jan 9, 2024
1 parent 2963079 commit 514cd16
Show file tree
Hide file tree
Showing 6 changed files with 944 additions and 9 deletions.
3 changes: 3 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -1,3 +1,6 @@
contentmap.db


# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
Expand Down
44 changes: 38 additions & 6 deletions contentmap/core.py
Original file line number Diff line number Diff line change
@@ -1,21 +1,53 @@
from typing import List, Dict, TypedDict
from datetime import datetime
import sqlite3
import importlib.metadata


class ContentMapCreator:
    """Write page contents and a small set of build metadata to SQLite.

    The database holds two untyped tables:

    * ``content (url, content)`` -- one row per entry of ``contents``.
    * ``config (cat, value)`` -- key/value metadata written by
      :meth:`add_config`.
    """

    def __init__(
        self,
        contents: List[Dict[str, str]],
        database: str = "contentmap.db"
    ):
        """Open the target database.

        Args:
            contents: Rows to store. Each mapping must provide the ``url``
                and ``content`` keys, which are bound by name in
                :meth:`build`.
            database: Database path handed to ``sqlite3.connect``.
        """
        self.contents = contents
        self.db = sqlite3.connect(database)
        # Rows come back as sqlite3.Row so columns can be read by name.
        self.db.row_factory = sqlite3.Row
        self.cursor = self.db.cursor()

    def init_db(self):
        """Create the ``content`` and ``config`` tables if they are missing."""
        self.cursor.execute("CREATE TABLE IF NOT EXISTS content (url, content)")
        self.cursor.execute("CREATE TABLE IF NOT EXISTS config (cat, value)")
        self.db.commit()

    def add_config(self):
        """Insert the metadata rows describing this build into ``config``.

        Rows are appended unconditionally, so calling this more than once on
        the same database adds the same categories again.
        """
        config = {
            "Version:": "1",
            "Generated with:": "Contentmap lib",
            # Naive local time at the moment of the call.
            "Date:": datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
            "Embeddings:": "mistral-embed",
            # NOTE(review): stored as NULL; the intended meaning of this
            # entry is not visible in this file -- confirm with the author.
            "FTSE:": None,
        }
        rows = [{"cat": cat, "value": value} for cat, value in config.items()]
        self.cursor.executemany("INSERT INTO config VALUES (:cat, :value)", rows)
        self.db.commit()

    def build(self):
        """Create the schema, record the metadata and store every content row."""
        self.init_db()
        self.add_config()
        self.cursor.executemany(
            "INSERT INTO content VALUES (:url, :content)",
            self.contents
        )
        self.db.commit()


if __name__ == "__main__":
    # Manual smoke run: stores one sample page in the default database file.
    contentmap = ContentMapCreator(contents=[
        {"url": "https://adrianbeaumont.net", "content": "Hello world"}
    ])
    contentmap.build()
69 changes: 69 additions & 0 deletions contentmap/sitemap.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,69 @@
import asyncio
import logging
import requests

import aiohttp
import trafilatura
from tqdm.asyncio import tqdm_asyncio
from lxml import etree

from contentmap.core import ContentMapCreator


class SitemapToContentDatabase:
    """Fetch every page listed in an XML sitemap and store its extracted text.

    Page URLs are read from the sitemap's ``<loc>`` elements, downloaded
    concurrently with aiohttp, reduced to text with ``trafilatura.extract``
    and written to the database through ``ContentMapCreator``.
    """

    def __init__(self, sitemap_url, seconds_timeout=10, concurrency=None):
        """Configure the crawl.

        Args:
            sitemap_url: URL of the XML sitemap to read.
            seconds_timeout: Timeout in seconds, used for the sitemap
                download and as the aiohttp socket connect/read timeout.
            concurrency: Maximum number of page downloads in flight at once;
                ``None`` means no limit is applied.
        """
        self.sitemap_url = sitemap_url
        self.seconds_timeout = seconds_timeout
        self.semaphore = asyncio.Semaphore(concurrency) if concurrency is not None else None
        self.timeout = aiohttp.ClientTimeout(
            sock_connect=seconds_timeout,
            sock_read=seconds_timeout
        )

    def load(self):
        """Crawl the sitemap and persist the pages that could be fetched.

        Returns:
            The list of ``{"url": ..., "content": ...}`` rows that were
            written to the database.
        """
        urls = self.get_urls()
        # NOTE(review): asyncio.get_event_loop() without a running loop is
        # deprecated in recent Python releases; kept because the semaphore is
        # created in __init__, before any loop is started here.
        loop = asyncio.get_event_loop()
        fetched = loop.run_until_complete(self.get_contents(urls))
        # fetch_content() yields None for failed downloads; those entries
        # cannot be bound as SQL parameters, so they are dropped here.
        contents = [page for page in fetched if page is not None]
        cm = ContentMapCreator(contents)
        cm.build()
        return contents

    def get_urls(self):
        """Download the sitemap and return the URLs of its ``<loc>`` elements."""
        # Without a timeout an unresponsive server would block forever.
        r = requests.get(self.sitemap_url, timeout=self.seconds_timeout)
        tree = etree.fromstring(r.content)
        locations = tree.findall(
            ".//{http://www.sitemaps.org/schemas/sitemap/0.9}loc"
        )
        # Pretty-printed sitemaps may pad the URL with whitespace, and an
        # empty <loc/> has no text at all.
        return [
            loc.text.strip() for loc in locations
            if loc.text and loc.text.strip()
        ]

    async def get_contents(self, urls):
        """Fetch all ``urls`` concurrently.

        Returns:
            One result per URL: a ``{"url", "content"}`` dict, or ``None``
            where the download failed.
        """
        async with aiohttp.ClientSession(timeout=self.timeout) as session:
            tasks = [self.fetch_content(session, url) for url in urls]
            return await tqdm_asyncio.gather(*tasks)

    async def _download(self, session, url):
        """Return the response body of ``url`` as text."""
        async with session.get(url) as response:
            return await response.text()

    async def fetch_content(self, session, url):
        """Download one page and extract its main text.

        Returns:
            ``{"url": url, "content": extracted_text}``, or ``None`` when the
            request fails with an aiohttp client error or a timeout (the
            failure is logged).
        """
        try:
            if self.semaphore is None:
                raw = await self._download(session, url)
            else:
                # Hold a semaphore slot for the duration of the request.
                async with self.semaphore:
                    raw = await self._download(session, url)
        except (aiohttp.ClientError, asyncio.TimeoutError) as e:
            logging.error(f"Error while fetching {url}: {e!r}")
            return None

        content = trafilatura.extract(raw)
        return {"url": url, "content": content}


if __name__ == "__main__":
    from datetime import datetime

    # Manual run: crawl a sample sitemap with at most 20 concurrent
    # downloads, then report the elapsed time and whatever load() returned.
    started_at = datetime.now()
    loader = SitemapToContentDatabase(
        "https://adrianbeaumont.net/sitemap-1.xml",
        concurrency=20,
    )
    result = loader.load()
    elapsed = datetime.now() - started_at
    print(elapsed)
    print(result)

2 changes: 0 additions & 2 deletions contentmap/test.py

This file was deleted.

Loading

0 comments on commit 514cd16

Please sign in to comment.