From 6898821aeb51d9471d57897dd7a6b6e1c57f761e Mon Sep 17 00:00:00 2001 From: Andreas Jaeger Date: Fri, 28 Jul 2017 09:36:29 +0200 Subject: [PATCH] Change priorities for sitemap Downgrade old EOL releases, give current maintained releases and development versions a higher priority. This needs changes to the test framework, update mocking. Also, rename URL to be docs.openstack.org and not .com. Change-Id: I2c2c0408e203a65a9541baaf55ffe60694463975 --- sitemap/generator/spiders/sitemap_file.py | 35 ++++++++++++++++------- test/test_sitemap_file.py | 31 ++++++++++++++------ 2 files changed, 48 insertions(+), 18 deletions(-) diff --git a/sitemap/generator/spiders/sitemap_file.py b/sitemap/generator/spiders/sitemap_file.py index ab2ac101..b573c471 100644 --- a/sitemap/generator/spiders/sitemap_file.py +++ b/sitemap/generator/spiders/sitemap_file.py @@ -10,6 +10,7 @@ # License for the specific language governing permissions and limitations # under the License. +import re import time try: import urlparse @@ -31,7 +32,8 @@ class SitemapItem(item.Item): class SitemapSpider(spiders.CrawlSpider): name = 'sitemap' - old_releases = tuple(["/%s" % old_release for old_release in [ + + EOL_SERIES = [ 'austin', 'bexar', 'cactus', @@ -44,10 +46,16 @@ class SitemapSpider(spiders.CrawlSpider): 'juno', 'kilo', 'liberty', - 'mitaka', + 'mitaka' + ] + EOL_RELEASES_PAT = re.compile('^/(' + '|'.join(EOL_SERIES) + ')/') + MAINT_SERIES = [ 'newton', - 'ocata' - ]]) + 'ocata', + 'pike' + ] + MAINT_RELEASES_PAT = re.compile('^/(' + '|'.join(MAINT_SERIES) + ')/') + LATEST_PAT = re.compile('^/latest/') rules = [ spiders.Rule( @@ -62,9 +70,6 @@ class SitemapSpider(spiders.CrawlSpider): deny=[ r'/trunk/', r'/draft/', - r'/api/', - r'/juno/', - r'/icehouse/' ] ), follow=True, callback='parse_item' @@ -86,11 +91,21 @@ class SitemapSpider(spiders.CrawlSpider): item['loc'] = response.url path = urlparse.urlsplit(response.url).path - if path.startswith(self.old_releases): - # weekly changefrequency and 
lower priority for old files - item['priority'] = '0.5' + + if self.MAINT_RELEASES_PAT.match(path): + # weekly changefrequency and highest prio for maintained release + item['priority'] = '1.0' item['changefreq'] = 'weekly' + elif self.LATEST_PAT.match(path): + # daily changefrequency and high priority for current files + item['priority'] = '0.8' + item['changefreq'] = 'daily' + elif self.EOL_RELEASES_PAT.match(path): + # yearly changefrequency and lowest priority for old stable files + item['priority'] = '0.1' + item['changefreq'] = 'yearly' else: + # These are unversioned documents # daily changefrequency and highest priority for current files item['priority'] = '1.0' item['changefreq'] = 'daily' diff --git a/test/test_sitemap_file.py b/test/test_sitemap_file.py index 030e92fc..68e59ca0 100644 --- a/test/test_sitemap_file.py +++ b/test/test_sitemap_file.py @@ -58,10 +58,17 @@ class TestSitemapSpider(unittest.TestCase): def test_parse_items_inits_sitemap(self): response = mock.MagicMock() + path = sitemap_file.urlparse.SplitResult( + scheme='https', + netloc='docs.openstack.org', + path='/ocata/something.html', + query='', + fragment='' + ) with mock.patch.object(sitemap_file, 'SitemapItem') as mocked_sitemap_item: - with mock.patch.object(sitemap_file.urlparse, - 'urlsplit'): + with mock.patch.object(sitemap_file.urlparse, 'urlsplit', + return_value=path): with mock.patch.object(sitemap_file, 'time'): self.spider.parse_item(response) @@ -69,9 +76,17 @@ class TestSitemapSpider(unittest.TestCase): def test_parse_items_gets_path(self): response = mock.MagicMock() + path = sitemap_file.urlparse.SplitResult( + scheme='https', + netloc='docs.openstackorg', + path='/ocata/something.html', + query='', + fragment='' + ) with mock.patch.object(sitemap_file, 'SitemapItem'): with mock.patch.object(sitemap_file.urlparse, - 'urlsplit') as mocked_urlsplit: + 'urlsplit', + return_value=path) as mocked_urlsplit: with mock.patch.object(sitemap_file, 'time'): 
self.spider.parse_item(response) @@ -81,8 +96,8 @@ class TestSitemapSpider(unittest.TestCase): response = mock.MagicMock() path = sitemap_file.urlparse.SplitResult( scheme='https', - netloc='docs.openstack.com', - path='/mitaka', + netloc='docs.openstack.org', + path='/ocata/something.html', query='', fragment='' ) @@ -91,14 +106,14 @@ class TestSitemapSpider(unittest.TestCase): with mock.patch.object(sitemap_file, 'time'): returned_item = self.spider.parse_item(response) - self.assertEqual('0.5', returned_item['priority']) + self.assertEqual('1.0', returned_item['priority']) self.assertEqual('weekly', returned_item['changefreq']) def test_parse_items_high_priority_daily_freq(self): response = mock.MagicMock() path = sitemap_file.urlparse.SplitResult( scheme='https', - netloc='docs.openstack.com', + netloc='docs.openstack.org', path='/contributor-guide', query='', fragment='' @@ -115,7 +130,7 @@ class TestSitemapSpider(unittest.TestCase): response = mock.MagicMock() path = sitemap_file.urlparse.SplitResult( scheme='https', - netloc='docs.openstack.com', + netloc='docs.openstack.org', path='/ocata', query='', fragment=''