Update sitemap
* Update releases: Newton is unmaintained; Train and Ussuri are released * We have redirects to some sites; ignore them * Remove Python 2 code * Use https as starting point. Switch repo also to Victoria PTI jobs. Change-Id: I744e725b32a797b0e208ac7e81afe4b8d3578751
This commit is contained in:
parent
7ee77130c4
commit
8cca4479b6
|
@ -2,6 +2,6 @@
|
||||||
templates:
|
templates:
|
||||||
- check-requirements
|
- check-requirements
|
||||||
- release-notes-jobs-python3
|
- release-notes-jobs-python3
|
||||||
- openstack-python3-ussuri-jobs
|
- openstack-python3-victoria-jobs
|
||||||
- publish-openstack-docs-pti
|
- publish-openstack-docs-pti
|
||||||
- openstack-lower-constraints-jobs
|
- openstack-lower-constraints-jobs
|
||||||
|
|
|
@ -12,10 +12,7 @@
|
||||||
|
|
||||||
import re
|
import re
|
||||||
import time
|
import time
|
||||||
try:
|
import urllib.parse as urlparse
|
||||||
import urlparse
|
|
||||||
except ImportError:
|
|
||||||
import urllib.parse as urlparse
|
|
||||||
|
|
||||||
from scrapy import item
|
from scrapy import item
|
||||||
from scrapy import linkextractors
|
from scrapy import linkextractors
|
||||||
|
@ -34,12 +31,13 @@ class SitemapSpider(spiders.CrawlSpider):
|
||||||
name = 'sitemap'
|
name = 'sitemap'
|
||||||
|
|
||||||
MAINT_SERIES = [
|
MAINT_SERIES = [
|
||||||
'newton',
|
|
||||||
'ocata',
|
'ocata',
|
||||||
'pike',
|
'pike',
|
||||||
'queens',
|
'queens',
|
||||||
'rocky',
|
'rocky',
|
||||||
'stein',
|
'stein',
|
||||||
|
'train',
|
||||||
|
'ussuri',
|
||||||
]
|
]
|
||||||
MAINT_RELEASES_PAT = re.compile('^.*/(' + '|'.join(MAINT_SERIES) + ')/')
|
MAINT_RELEASES_PAT = re.compile('^.*/(' + '|'.join(MAINT_SERIES) + ')/')
|
||||||
LATEST_PAT = re.compile('^.*/latest/')
|
LATEST_PAT = re.compile('^.*/latest/')
|
||||||
|
@ -69,7 +67,16 @@ class SitemapSpider(spiders.CrawlSpider):
|
||||||
r'/juno/',
|
r'/juno/',
|
||||||
r'/kilo/',
|
r'/kilo/',
|
||||||
r'/liberty/',
|
r'/liberty/',
|
||||||
r'/mitaka/'
|
r'/mitaka/',
|
||||||
|
r'/newton/',
|
||||||
|
],
|
||||||
|
deny_domains=[
|
||||||
|
# docs.o.o redirects to a few sites, filter
|
||||||
|
# them out
|
||||||
|
'docs.opendev.org',
|
||||||
|
'opendev.org',
|
||||||
|
'releases.openstack.org',
|
||||||
|
'zuul-ci.org',
|
||||||
]
|
]
|
||||||
),
|
),
|
||||||
follow=True, callback='parse_item'
|
follow=True, callback='parse_item'
|
||||||
|
@ -80,7 +87,7 @@ class SitemapSpider(spiders.CrawlSpider):
|
||||||
super(SitemapSpider, self).__init__(*args, **kwargs)
|
super(SitemapSpider, self).__init__(*args, **kwargs)
|
||||||
self.domain = domain
|
self.domain = domain
|
||||||
self.allowed_domains = [domain]
|
self.allowed_domains = [domain]
|
||||||
self.start_urls = ['http://%s' % domain]
|
self.start_urls = ['https://%s' % domain]
|
||||||
for url in urls.split(','):
|
for url in urls.split(','):
|
||||||
if not url:
|
if not url:
|
||||||
continue
|
continue
|
||||||
|
@ -90,13 +97,17 @@ class SitemapSpider(spiders.CrawlSpider):
|
||||||
item = SitemapItem()
|
item = SitemapItem()
|
||||||
item['loc'] = response.url
|
item['loc'] = response.url
|
||||||
|
|
||||||
path = urlparse.urlsplit(response.url).path
|
components = urlparse.urlsplit(response.url)
|
||||||
|
|
||||||
if self.MAINT_RELEASES_PAT.match(path):
|
# Filter out any redirected URLs to other domains
|
||||||
|
if self.domain != components.netloc:
|
||||||
|
return
|
||||||
|
|
||||||
|
if self.MAINT_RELEASES_PAT.match(components.path):
|
||||||
# weekly changefrequency and highest prio for maintained release
|
# weekly changefrequency and highest prio for maintained release
|
||||||
item['priority'] = '1.0'
|
item['priority'] = '1.0'
|
||||||
item['changefreq'] = 'weekly'
|
item['changefreq'] = 'weekly'
|
||||||
elif self.LATEST_PAT.match(path):
|
elif self.LATEST_PAT.match(components.path):
|
||||||
# daily changefrequency and normal priority for current files
|
# daily changefrequency and normal priority for current files
|
||||||
item['priority'] = '0.5'
|
item['priority'] = '0.5'
|
||||||
item['changefreq'] = 'daily'
|
item['changefreq'] = 'daily'
|
||||||
|
|
|
@ -44,7 +44,7 @@ class TestSitemapSpider(unittest.TestCase):
|
||||||
domain = 'docs.openstack.org'
|
domain = 'docs.openstack.org'
|
||||||
self.assertEqual(self.spider.domain, domain)
|
self.assertEqual(self.spider.domain, domain)
|
||||||
self.assertEqual(self.spider.allowed_domains, [domain])
|
self.assertEqual(self.spider.allowed_domains, [domain])
|
||||||
self.assertEqual(self.spider.start_urls, ['http://%s' % domain])
|
self.assertEqual(self.spider.start_urls, ['https://%s' % domain])
|
||||||
|
|
||||||
def test_start_urls_get_appended(self):
|
def test_start_urls_get_appended(self):
|
||||||
urls = 'new.openstack.org, old.openstack.org'
|
urls = 'new.openstack.org, old.openstack.org'
|
||||||
|
|
Loading…
Reference in New Issue