Change priorities for sitemap

Downgrade old EOL releases, give current maintained releases and
development versions a higher priority.

Change-Id: I2c2c0408e203a65a9541baaf55ffe60694463975
This commit is contained in:
Andreas Jaeger 2017-07-28 09:36:29 +02:00
parent f87192cc77
commit 895962ecda
2 changed files with 27 additions and 12 deletions

View File

@@ -10,6 +10,7 @@
 # License for the specific language governing permissions and limitations
 # under the License.
+import re
 import time
 try:
     import urlparse
@ -31,7 +32,8 @@ class SitemapItem(item.Item):
class SitemapSpider(spiders.CrawlSpider): class SitemapSpider(spiders.CrawlSpider):
name = 'sitemap' name = 'sitemap'
old_releases = tuple(["/%s" % old_release for old_release in [
EOL_SERIES = [
'austin', 'austin',
'bexar', 'bexar',
'cactus', 'cactus',
@@ -44,10 +46,16 @@ class SitemapSpider(spiders.CrawlSpider):
         'juno',
         'kilo',
         'liberty',
-        'mitaka',
+        'mitaka'
+    ]
+    EOL_RELEASES_PAT = re.compile('^/(' + '|'.join(EOL_SERIES) + ')/')
+    MAINT_SERIES = [
         'newton',
-        'ocata'
-    ]])
+        'ocata',
+        'pike'
+    ]
+    MAINT_RELEASES_PAT = re.compile('^/(' + '|'.join(MAINT_SERIES) + ')/')
+    LATEST_PAT = re.compile('^/latest/')

     rules = [
         spiders.Rule(
@@ -62,9 +70,6 @@ class SitemapSpider(spiders.CrawlSpider):
             deny=[
                 r'/trunk/',
                 r'/draft/',
-                r'/api/',
-                r'/juno/',
-                r'/icehouse/'
             ]
         ),
         follow=True, callback='parse_item'
@@ -86,11 +91,21 @@ class SitemapSpider(spiders.CrawlSpider):
         item['loc'] = response.url

         path = urlparse.urlsplit(response.url).path
-        if path.startswith(self.old_releases):
-            # weekly changefrequency and lower priority for old files
-            item['priority'] = '0.5'
+        if self.MAINT_RELEASES_PAT.match(path):
+            # weekly changefrequency and highest prio for maintained release
+            item['priority'] = '1.0'
             item['changefreq'] = 'weekly'
+        elif self.LATEST_PAT.match(path):
+            # daily changefrequency and high priority for current files
+            item['priority'] = '0.8'
+            item['changefreq'] = 'daily'
+        elif self.EOL_RELEASES_PAT.match(path):
+            # yearly changefrequency and lowest priority for old stable files
+            item['priority'] = '0.1'
+            item['changefreq'] = 'yearly'
         else:
+            # These are unversioned documents
             # daily changefrequency and highest priority for current files
             item['priority'] = '1.0'
             item['changefreq'] = 'daily'

View File

@@ -82,7 +82,7 @@ class TestSitemapSpider(unittest.TestCase):
         path = sitemap_file.urlparse.SplitResult(
             scheme='https',
             netloc='docs.openstack.com',
-            path='/mitaka',
+            path='/mitaka/',
             query='',
             fragment=''
         )
@@ -91,7 +91,7 @@ class TestSitemapSpider(unittest.TestCase):
         with mock.patch.object(sitemap_file, 'time'):
             returned_item = self.spider.parse_item(response)

-        self.assertEqual('0.5', returned_item['priority'])
-        self.assertEqual('weekly', returned_item['changefreq'])
+        self.assertEqual('0.1', returned_item['priority'])
+        self.assertEqual('yearly', returned_item['changefreq'])

NOTE(review): the commit as rendered updated only the priority assertion and
left the changefreq assertion at 'weekly', but the new EOL branch in
parse_item sets item['changefreq'] = 'yearly' for '/mitaka/' paths — the test
would fail; the assertion must be 'yearly' as shown above.

     def test_parse_items_high_priority_daily_freq(self):