Skip to content

Commit

Permalink
adds testing
Browse files Browse the repository at this point in the history
  • Loading branch information
medoror committed Aug 9, 2024
1 parent d7c19f2 commit 5bcd5ba
Show file tree
Hide file tree
Showing 4 changed files with 101 additions and 11 deletions.
23 changes: 12 additions & 11 deletions contentmap/sitemap.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,26 +42,27 @@ def get_urls(self):
all_urls = []
if self.source_type == self.SOURCE_TYPE_URL:
for sitemap_url in self.sitemap_sources:
urls = self.get_urls_from_url(sitemap_url)
urls = self._get_urls_from_url(sitemap_url)
all_urls.extend(urls)
elif self.source_type == self.SOURCE_TYPE_DISK:
for filename in os.listdir(self.sitemap_sources):
if filename.endswith('.xml'):
filepath = os.path.join(self.sitemap_sources, filename)
urls = self.get_urls_from_disk(filepath)
all_urls.extend(urls)
for directory in self.sitemap_sources:
for filename in os.listdir(directory):
if filename.endswith('.xml'):
filepath = os.path.join(directory, filename)
urls = self._get_urls_from_disk(filepath)
all_urls.extend(urls)
return all_urls

def get_urls_from_url(self, sitemap_url):
def _get_urls_from_url(self, sitemap_url):
r = requests.get(sitemap_url)
tree = etree.fromstring(r.content)
return self.extract_urls_from_tree(tree)
return self._extract_urls_from_tree(tree)

def get_urls_from_disk(self, filepath):
def _get_urls_from_disk(self, filepath):
tree = etree.parse(filepath)
return self.extract_urls_from_tree(tree)
return self._extract_urls_from_tree(tree)

def extract_urls_from_tree(self, tree):
def _extract_urls_from_tree(self, tree):
return [
url.text for url
in tree.findall(".//{http://www.sitemaps.org/schemas/sitemap/0.9}loc")
Expand Down
9 changes: 9 additions & 0 deletions tests/fixtures/sitemap_folder_a/sitemap_a.xml
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
<?xml version="1.0"?>
<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9" xmlns:xhtml="http://www.w3.org/1999/xhtml">
<url>
<loc>https://www.example.com/docs/en/example/?topic=testing</loc>
</url>
<url>
<loc>https://www.example.com/docs/en/example/?topic=contact-us</loc>
</url>
</urlset>
9 changes: 9 additions & 0 deletions tests/fixtures/sitemap_folder_b/sitemap_b.xml
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
<?xml version="1.0"?>
<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9" xmlns:xhtml="http://www.w3.org/1999/xhtml">
<url>
<loc>https://www.example.com/docs/en/example/?topic=library-overview</loc>
</url>
<url>
<loc>https://www.example.com/docs/en/example/?topic=about-this-content</loc>
</url>
</urlset>
71 changes: 71 additions & 0 deletions tests/test_sitemap.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,71 @@
import os
import unittest
import pytest

from unittest.mock import patch, MagicMock
from contentmap.sitemap import SitemapToContentDatabase


class TestSitemapToContentDatabase(unittest.TestCase):
    """Exercise ``SitemapToContentDatabase.get_urls`` for URL and disk sources."""

    # The sitemap protocol namespace uses the ``http`` scheme. XML namespace
    # URIs are compared as plain strings, so an ``https`` variant does not
    # match a ``{http://www.sitemaps.org/schemas/sitemap/0.9}loc`` lookup and
    # the URL-based tests would receive an empty list.
    SITEMAP_NS = 'http://www.sitemaps.org/schemas/sitemap/0.9'
    XHTML_NS = 'http://www.w3.org/1999/xhtml'

    def create_mock_response(self, content):
        """Return a stand-in for a ``requests`` response carrying *content*.

        ``requests`` exposes the response body as bytes on ``.content``, so a
        ``str`` argument is UTF-8 encoded to mirror what production code sees.
        """
        if isinstance(content, str):
            content = content.encode('utf-8')
        mock_response = MagicMock()
        mock_response.content = content
        return mock_response

    def generate_sample_sitemap_xml(self, url):
        """Build a minimal sitemap document containing a single ``<loc>``.

        *url* is interpolated as-is (no XML escaping), so it must not contain
        characters such as ``&`` or ``<``.
        """
        return (
            f'<urlset xmlns="{self.SITEMAP_NS}" xmlns:xhtml="{self.XHTML_NS}">\n'
            '<url>\n'
            f'<loc>{url}</loc>\n'
            '</url>\n'
            '</urlset>'
        )

    def fixture_dir(self, folder_name):
        """Return the absolute path of a folder under ``tests/fixtures``."""
        return os.path.join(os.path.dirname(__file__), 'fixtures', folder_name)

    @patch('contentmap.sitemap.requests.get')
    def test_get_urls_given_one_sitemap_url(self, mock_get):
        """A single remote sitemap yields exactly the URLs it lists."""
        page_url = 'https://www.example.com/docs/en/example/?topic=testing'
        mock_get.return_value = self.create_mock_response(
            self.generate_sample_sitemap_xml(page_url)
        )

        sitemap_db = SitemapToContentDatabase(
            sitemap_sources=['https://example.com/sitemap.xml'],
            source_type='url',
        )
        urls = sitemap_db.get_urls()

        self.assertEqual(urls, [page_url])
        mock_get.assert_called_once_with('https://example.com/sitemap.xml')

    @patch('contentmap.sitemap.requests.get')
    def test_get_urls_given_multiple_sitemap_urls(self, mock_get):
        """URLs from several remote sitemaps are concatenated in source order."""
        first_page = 'https://www.example.com/docs/en/example/?topic=testing'
        second_page = 'https://www.anotherexample.com/docs/en/example/?topic=contact-us'
        # side_effect hands out one response per successive requests.get call.
        mock_get.side_effect = [
            self.create_mock_response(self.generate_sample_sitemap_xml(first_page)),
            self.create_mock_response(self.generate_sample_sitemap_xml(second_page)),
        ]

        sitemap_db = SitemapToContentDatabase(
            sitemap_sources=[
                'https://example.com/sitemap.xml',
                'https://anotherexample.com/sitemap.xml',
            ],
            source_type='url',
        )
        urls = sitemap_db.get_urls()

        self.assertEqual(urls, [first_page, second_page])
        mock_get.assert_any_call('https://example.com/sitemap.xml')
        mock_get.assert_any_call('https://anotherexample.com/sitemap.xml')
        self.assertEqual(mock_get.call_count, 2)

    def test_get_urls_given_one_location_on_disk(self):
        """A single fixture directory yields the URLs of its sitemap file."""
        sitemap_db = SitemapToContentDatabase(
            sitemap_sources=[self.fixture_dir('sitemap_folder_a')],
            source_type='disk',
        )
        urls = sitemap_db.get_urls()

        self.assertEqual(urls, [
            'https://www.example.com/docs/en/example/?topic=testing',
            'https://www.example.com/docs/en/example/?topic=contact-us',
        ])

    def test_get_urls_given_multiple_locations_on_disk(self):
        """URLs from several fixture directories are concatenated in source order."""
        sitemap_db = SitemapToContentDatabase(
            sitemap_sources=[
                self.fixture_dir('sitemap_folder_a'),
                self.fixture_dir('sitemap_folder_b'),
            ],
            source_type='disk',
        )
        urls = sitemap_db.get_urls()

        self.assertEqual(urls, [
            'https://www.example.com/docs/en/example/?topic=testing',
            'https://www.example.com/docs/en/example/?topic=contact-us',
            'https://www.example.com/docs/en/example/?topic=library-overview',
            'https://www.example.com/docs/en/example/?topic=about-this-content',
        ])

0 comments on commit 5bcd5ba

Please sign in to comment.