Change priorities for sitemap
Downgrade old EOL releases, and give currently maintained releases and development versions a higher priority.
Change-Id: I2c2c0408e203a65a9541baaf55ffe60694463975
This commit is contained in:
parent
f87192cc77
commit
ff1377b8cc
|
@ -10,6 +10,7 @@
|
|||
# License for the specific language governing permissions and limitations
|
||||
# under the License.
|
||||
|
||||
import re
|
||||
import time
|
||||
try:
|
||||
import urlparse
|
||||
|
@ -20,7 +21,6 @@ from scrapy import item
|
|||
from scrapy.linkextractors import LinkExtractor
|
||||
from scrapy import spiders
|
||||
|
||||
|
||||
class SitemapItem(item.Item):
|
||||
'''Class to represent an item in the sitemap.'''
|
||||
loc = item.Field()
|
||||
|
@ -31,7 +31,8 @@ class SitemapItem(item.Item):
|
|||
|
||||
class SitemapSpider(spiders.CrawlSpider):
|
||||
name = 'sitemap'
|
||||
old_releases = tuple(["/%s" % old_release for old_release in [
|
||||
|
||||
EOL_SERIES = [
|
||||
'austin',
|
||||
'bexar',
|
||||
'cactus',
|
||||
|
@ -44,10 +45,16 @@ class SitemapSpider(spiders.CrawlSpider):
|
|||
'juno',
|
||||
'kilo',
|
||||
'liberty',
|
||||
'mitaka',
|
||||
'mitaka'
|
||||
]
|
||||
EOL_RELEASES_PAT = re.compile('^/(' + '|'.join(EOL_SERIES) + ')/')
|
||||
MAINT_SERIES = [
|
||||
'newton',
|
||||
'ocata'
|
||||
]])
|
||||
'ocata',
|
||||
'pike'
|
||||
]
|
||||
MAINT_RELEASES_PAT = re.compile('^/(' + '|'.join(MAINT_SERIES) + ')/')
|
||||
LATEST_PAT = re.compile('^/latest/')
|
||||
|
||||
rules = [
|
||||
spiders.Rule(
|
||||
|
@ -62,9 +69,6 @@ class SitemapSpider(spiders.CrawlSpider):
|
|||
deny=[
|
||||
r'/trunk/',
|
||||
r'/draft/',
|
||||
r'/api/',
|
||||
r'/juno/',
|
||||
r'/icehouse/'
|
||||
]
|
||||
),
|
||||
follow=True, callback='parse_item'
|
||||
|
@ -86,11 +90,21 @@ class SitemapSpider(spiders.CrawlSpider):
|
|||
item['loc'] = response.url
|
||||
|
||||
path = urlparse.urlsplit(response.url).path
|
||||
if path.startswith(self.old_releases):
|
||||
# weekly changefrequency and lower priority for old files
|
||||
item['priority'] = '0.5'
|
||||
|
||||
if self.MAINT_RELEASES_PAT.match(path):
|
||||
# weekly changefrequency and highest prio for maintained release
|
||||
item['priority'] = '1.0'
|
||||
item['changefreq'] = 'weekly'
|
||||
elif self.LATEST_PAT.match(path):
|
||||
# daily changefrequency and high priority for current files
|
||||
item['priority'] = '0.8'
|
||||
item['changefreq'] = 'daily'
|
||||
elif self.EOL_RELEASES_PAT.match(path):
|
||||
# yearly changefrequency and lowest priority for old stable files
|
||||
item['priority'] = '0.1'
|
||||
item['changefreq'] = 'yearly'
|
||||
else:
|
||||
# These are unversioned documents
|
||||
# daily changefrequency and highest priority for current files
|
||||
item['priority'] = '1.0'
|
||||
item['changefreq'] = 'daily'
|
||||
|
|
|
@ -91,7 +91,7 @@ class TestSitemapSpider(unittest.TestCase):
|
|||
with mock.patch.object(sitemap_file, 'time'):
|
||||
returned_item = self.spider.parse_item(response)
|
||||
|
||||
self.assertEqual('0.5', returned_item['priority'])
|
||||
self.assertEqual('0.1', returned_item['priority'])
|
||||
self.assertEqual('weekly', returned_item['changefreq'])
|
||||
|
||||
def test_parse_items_high_priority_daily_freq(self):
|
||||
|
|
Loading…
Reference in New Issue