Update sitemap

* Update releases: Newton is unmaintained; Train and Ussuri are released
* docs.openstack.org redirects to a few sites; ignore them
* Remove python2 code

Change-Id: I744e725b32a797b0e208ac7e81afe4b8d3578751
Andreas Jaeger 2020-06-21 17:57:32 +02:00
parent 7ee77130c4
commit 2fbc9666d4
1 changed file with 14 additions and 7 deletions


@@ -12,10 +12,7 @@
 import re
 import time
-try:
-    import urlparse
-except ImportError:
-    import urllib.parse as urlparse
+import urllib.parse as urlparse
 
 from scrapy import item
 from scrapy import linkextractors
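
With the try/except gone the module is Python 3 only; aliasing urllib.parse to the old name keeps the existing call sites untouched. A minimal sketch of why the alias works (the urljoin/urlsplit calls are illustrative, not taken from this file):

import urllib.parse as urlparse

# The Python 3 module is bound under the old Python 2 name, so the
# spider's existing urlparse.* call sites need no further changes.
base = 'https://docs.openstack.org/ussuri/'
print(urlparse.urljoin(base, 'install/index.html'))
# https://docs.openstack.org/ussuri/install/index.html
print(urlparse.urlsplit(base).netloc)
# docs.openstack.org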
@@ -34,12 +31,13 @@ class SitemapSpider(spiders.CrawlSpider):
     name = 'sitemap'
     MAINT_SERIES = [
-        'newton',
         'ocata',
         'pike',
         'queens',
         'rocky',
         'stein',
+        'train',
+        'ussuri',
     ]
     MAINT_RELEASES_PAT = re.compile('^.*/(' + '|'.join(MAINT_SERIES) + ')/')
     LATEST_PAT = re.compile('^.*/latest/')
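
Dropping 'newton' and adding 'train'/'ussuri' means MAINT_RELEASES_PAT now only matches maintained series. A quick sanity check of the rebuilt pattern (the sample URLs are made up):

import re

MAINT_SERIES = ['ocata', 'pike', 'queens', 'rocky', 'stein', 'train', 'ussuri']
MAINT_RELEASES_PAT = re.compile('^.*/(' + '|'.join(MAINT_SERIES) + ')/')

# Maintained series match ...
assert MAINT_RELEASES_PAT.match('https://docs.openstack.org/nova/ussuri/')
assert MAINT_RELEASES_PAT.match('https://docs.openstack.org/train/admin/')
# ... while newton now falls through to the unmaintained handling.
assert not MAINT_RELEASES_PAT.match('https://docs.openstack.org/nova/newton/')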
@@ -69,7 +67,16 @@ class SitemapSpider(spiders.CrawlSpider):
                 r'/juno/',
                 r'/kilo/',
                 r'/liberty/',
-                r'/mitaka/'
+                r'/mitaka/',
+                r'/newton/',
             ],
+            deny_domains=[
+                # docs.o.o redirects to a few sites, filter
+                # them out
+                'docs.opendev.org',
+                'opendev.org',
+                'releases.openstack.org',
+                'zuul-ci.org',
+            ]
         ),
         follow=True, callback='parse_item'
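
deny_domains is a standard LinkExtractor argument: any extracted link whose host falls under one of the listed domains is dropped before it is followed, which keeps the redirect targets out of the sitemap. A self-contained sketch of the behaviour (the sample HTML is invented):

from scrapy import linkextractors
from scrapy.http import HtmlResponse

body = b'''
<a href="https://docs.openstack.org/ussuri/">kept</a>
<a href="https://releases.openstack.org/">dropped</a>
<a href="https://zuul-ci.org/docs/">dropped</a>
'''
response = HtmlResponse(url='https://docs.openstack.org/',
                        body=body, encoding='utf-8')
extractor = linkextractors.LinkExtractor(
    deny_domains=['docs.opendev.org', 'opendev.org',
                  'releases.openstack.org', 'zuul-ci.org'])
for link in extractor.extract_links(response):
    print(link.url)
# only https://docs.openstack.org/ussuri/ is printed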
@@ -80,7 +87,7 @@ class SitemapSpider(spiders.CrawlSpider):
         super(SitemapSpider, self).__init__(*args, **kwargs)
         self.domain = domain
         self.allowed_domains = [domain]
-        self.start_urls = ['http://%s' % domain]
+        self.start_urls = ['https://%s' % domain]
         for url in urls.split(','):
             if not url:
                 continue
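
Since domain and urls arrive as plain keyword arguments, the spider can be driven the usual scrapy way. A hedged sketch of an invocation (the import path for SitemapSpider is hypothetical; the trailing comma in urls exercises the "if not url: continue" guard):

from scrapy.crawler import CrawlerProcess

# Hypothetical module path; adjust to wherever SitemapSpider lives.
from sitemap_file import SitemapSpider

process = CrawlerProcess()
process.crawl(SitemapSpider,
              domain='docs.openstack.org',
              urls='https://docs.openstack.org/ussuri/,')
process.start()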