[sitemap] set higher priority for files of the current release

Change-Id: I9dbaa787354582f2f766fcce58aff95766d242c7
This commit is contained in:
Christian Berendt 2016-01-12 09:12:26 +01:00
parent b60e29dcf1
commit 8ebb376bb8
1 changed files with 20 additions and 16 deletions

View File

@ -10,7 +10,6 @@
# License for the specific language governing permissions and limitations
# under the License.
import posixpath
import time
import urlparse
@ -21,6 +20,19 @@ from scrapy import spiders
class SitemapSpider(spiders.CrawlSpider):
name = 'sitemap'
# URL path prefixes ("/<release>") of the old releases. Stored as a tuple
# so the whole collection can be handed to str.startswith() in one call.
old_releases = tuple("/%s" % series for series in (
    'austin',
    'bexar',
    'cactus',
    'diablo',
    'essex',
    'folsom',
    'grizzly',
    'havana',
    'icehouse',
    'juno',
    'kilo',
))
rules = [
spiders.Rule(
@ -52,25 +64,17 @@ class SitemapSpider(spiders.CrawlSpider):
def parse_item(self, response):
item = items.SitemapItem()
item['priority'] = '0.5'
item['changefreq'] = 'daily'
item['loc'] = response.url
path = urlparse.urlsplit(response.url).path
filename = posixpath.basename(path)
if filename == 'index.html' or filename == '':
if path.startswith(self.old_releases):
# weekly change frequency and lower priority for old files
item['priority'] = '0.5'
item['changefreq'] = 'weekly'
else:
# daily change frequency and highest priority for current files
item['priority'] = '1.0'
weekly = [
'juno',
'icehouse',
'havana'
]
for entry in weekly:
if path.startswith("/%s" % entry):
item['changefreq'] = 'weekly'
item['changefreq'] = 'daily'
if 'Last-Modified' in response.headers:
timestamp = response.headers['Last-Modified']