From 8ebb376bb8aab36e72ab9e41f6c553a1bf4823c1 Mon Sep 17 00:00:00 2001 From: Christian Berendt Date: Tue, 12 Jan 2016 09:12:26 +0100 Subject: [PATCH] [sitemap] set higher priority for files of the current release Change-Id: I9dbaa787354582f2f766fcce58aff95766d242c7 --- sitemap/generator/spiders/sitemap.py | 36 +++++++++++++++------------- 1 file changed, 20 insertions(+), 16 deletions(-) diff --git a/sitemap/generator/spiders/sitemap.py b/sitemap/generator/spiders/sitemap.py index 44009d5..1c36405 100644 --- a/sitemap/generator/spiders/sitemap.py +++ b/sitemap/generator/spiders/sitemap.py @@ -10,7 +10,6 @@ # License for the specific language governing permissions and limitations # under the License. -import posixpath import time import urlparse @@ -21,6 +20,19 @@ from scrapy import spiders class SitemapSpider(spiders.CrawlSpider): name = 'sitemap' + old_releases = tuple(["/%s" % old_release for old_release in [ + 'austin', + 'bexar', + 'cactus', + 'diablo', + 'essex', + 'folsom', + 'grizzly', + 'havana', + 'icehouse', + 'juno', + 'kilo' + ]]) rules = [ spiders.Rule( @@ -52,25 +64,17 @@ class SitemapSpider(spiders.CrawlSpider): def parse_item(self, response): item = items.SitemapItem() - item['priority'] = '0.5' - item['changefreq'] = 'daily' item['loc'] = response.url path = urlparse.urlsplit(response.url).path - filename = posixpath.basename(path) - - if filename == 'index.html' or filename == '': + if path.startswith(self.old_releases): + # weekly changefrequency and lower priority for old files + item['priority'] = '0.5' + item['changefreq'] = 'weekly' + else: + # daily changefrequency and highest priority for current files item['priority'] = '1.0' - - weekly = [ - 'juno', - 'icehouse', - 'havana' - ] - - for entry in weekly: - if path.startswith("/%s" % entry): - item['changefreq'] = 'weekly' + item['changefreq'] = 'daily' if 'Last-Modified' in response.headers: timestamp = response.headers['Last-Modified']