Update sitemap
* Update releases: Newton is unmaintained; Train and Ussuri are released * We have redirects to some sites, ignore them * Remove python2 code * Use https as starting point Change-Id: I744e725b32a797b0e208ac7e81afe4b8d3578751
This commit is contained in:
parent
7ee77130c4
commit
cc3c0347a1
|
@ -12,10 +12,7 @@
|
|||
|
||||
import re
|
||||
import time
|
||||
try:
|
||||
import urlparse
|
||||
except ImportError:
|
||||
import urllib.parse as urlparse
|
||||
import urllib.parse as urlparse
|
||||
|
||||
from scrapy import item
|
||||
from scrapy import linkextractors
|
||||
|
@ -34,12 +31,13 @@ class SitemapSpider(spiders.CrawlSpider):
|
|||
name = 'sitemap'
|
||||
|
||||
MAINT_SERIES = [
|
||||
'newton',
|
||||
'ocata',
|
||||
'pike',
|
||||
'queens',
|
||||
'rocky',
|
||||
'stein',
|
||||
'train',
|
||||
'ussuri',
|
||||
]
|
||||
MAINT_RELEASES_PAT = re.compile('^.*/(' + '|'.join(MAINT_SERIES) + ')/')
|
||||
LATEST_PAT = re.compile('^.*/latest/')
|
||||
|
@ -69,7 +67,16 @@ class SitemapSpider(spiders.CrawlSpider):
|
|||
r'/juno/',
|
||||
r'/kilo/',
|
||||
r'/liberty/',
|
||||
r'/mitaka/'
|
||||
r'/mitaka/',
|
||||
r'/newton/',
|
||||
],
|
||||
deny_domains = [
|
||||
# docs.o.o redirects to a few sites, filter
|
||||
# them out
|
||||
'docs.opendev.org',
|
||||
'opendev.org',
|
||||
'releases.openstack.org',
|
||||
'zuul-ci.org',
|
||||
]
|
||||
),
|
||||
follow=True, callback='parse_item'
|
||||
|
@ -80,7 +87,7 @@ class SitemapSpider(spiders.CrawlSpider):
|
|||
super(SitemapSpider, self).__init__(*args, **kwargs)
|
||||
self.domain = domain
|
||||
self.allowed_domains = [domain]
|
||||
self.start_urls = ['http://%s' % domain]
|
||||
self.start_urls = ['https://%s' % domain]
|
||||
for url in urls.split(','):
|
||||
if not url:
|
||||
continue
|
||||
|
|
Loading…
Reference in New Issue