Update sitemap

* Update releases: Newton is unmaintained; train and ussuri are released
* We have redirects to some sites, ignore them
* Remove python2 code
* Use https as starting point

Switch repo also to victoria PTI jobs.

Change-Id: I744e725b32a797b0e208ac7e81afe4b8d3578751
This commit is contained in:
Andreas Jaeger 2020-06-21 17:57:32 +02:00
parent 7ee77130c4
commit 8cca4479b6
3 changed files with 23 additions and 12 deletions

View File

@ -2,6 +2,6 @@
templates: templates:
- check-requirements - check-requirements
- release-notes-jobs-python3 - release-notes-jobs-python3
- openstack-python3-ussuri-jobs - openstack-python3-victoria-jobs
- publish-openstack-docs-pti - publish-openstack-docs-pti
- openstack-lower-constraints-jobs - openstack-lower-constraints-jobs

View File

@ -12,10 +12,7 @@
import re import re
import time import time
try: import urllib.parse as urlparse
import urlparse
except ImportError:
import urllib.parse as urlparse
from scrapy import item from scrapy import item
from scrapy import linkextractors from scrapy import linkextractors
@ -34,12 +31,13 @@ class SitemapSpider(spiders.CrawlSpider):
name = 'sitemap' name = 'sitemap'
MAINT_SERIES = [ MAINT_SERIES = [
'newton',
'ocata', 'ocata',
'pike', 'pike',
'queens', 'queens',
'rocky', 'rocky',
'stein', 'stein',
'train',
'ussuri',
] ]
MAINT_RELEASES_PAT = re.compile('^.*/(' + '|'.join(MAINT_SERIES) + ')/') MAINT_RELEASES_PAT = re.compile('^.*/(' + '|'.join(MAINT_SERIES) + ')/')
LATEST_PAT = re.compile('^.*/latest/') LATEST_PAT = re.compile('^.*/latest/')
@ -69,7 +67,16 @@ class SitemapSpider(spiders.CrawlSpider):
r'/juno/', r'/juno/',
r'/kilo/', r'/kilo/',
r'/liberty/', r'/liberty/',
r'/mitaka/' r'/mitaka/',
r'/newton/',
],
deny_domains=[
# docs.o.o redirects to a few sites, filter
# them out
'docs.opendev.org',
'opendev.org',
'releases.openstack.org',
'zuul-ci.org',
] ]
), ),
follow=True, callback='parse_item' follow=True, callback='parse_item'
@ -80,7 +87,7 @@ class SitemapSpider(spiders.CrawlSpider):
super(SitemapSpider, self).__init__(*args, **kwargs) super(SitemapSpider, self).__init__(*args, **kwargs)
self.domain = domain self.domain = domain
self.allowed_domains = [domain] self.allowed_domains = [domain]
self.start_urls = ['http://%s' % domain] self.start_urls = ['https://%s' % domain]
for url in urls.split(','): for url in urls.split(','):
if not url: if not url:
continue continue
@ -90,13 +97,17 @@ class SitemapSpider(spiders.CrawlSpider):
item = SitemapItem() item = SitemapItem()
item['loc'] = response.url item['loc'] = response.url
path = urlparse.urlsplit(response.url).path components = urlparse.urlsplit(response.url)
if self.MAINT_RELEASES_PAT.match(path): # Filter out any redirected URLs to other domains
if self.domain != components.netloc:
return
if self.MAINT_RELEASES_PAT.match(components.path):
# weekly changefrequency and highest prio for maintained release # weekly changefrequency and highest prio for maintained release
item['priority'] = '1.0' item['priority'] = '1.0'
item['changefreq'] = 'weekly' item['changefreq'] = 'weekly'
elif self.LATEST_PAT.match(path): elif self.LATEST_PAT.match(components.path):
# daily changefrequency and normal priority for current files # daily changefrequency and normal priority for current files
item['priority'] = '0.5' item['priority'] = '0.5'
item['changefreq'] = 'daily' item['changefreq'] = 'daily'

View File

@ -44,7 +44,7 @@ class TestSitemapSpider(unittest.TestCase):
domain = 'docs.openstack.org' domain = 'docs.openstack.org'
self.assertEqual(self.spider.domain, domain) self.assertEqual(self.spider.domain, domain)
self.assertEqual(self.spider.allowed_domains, [domain]) self.assertEqual(self.spider.allowed_domains, [domain])
self.assertEqual(self.spider.start_urls, ['http://%s' % domain]) self.assertEqual(self.spider.start_urls, ['https://%s' % domain])
def test_start_urls_get_appended(self): def test_start_urls_get_appended(self):
urls = 'new.openstack.org, old.openstack.org' urls = 'new.openstack.org, old.openstack.org'