openstack-doc-tools/sitemap/generator/spiders/sitemap_file.py

117 lines
3.6 KiB
Python

# Licensed under the Apache License, Version 2.0 (the "License"); you may
# not use this file except in compliance with the License. You may obtain
# a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
# License for the specific language governing permissions and limitations
# under the License.
import re
import time
try:
import urlparse
except ImportError:
import urllib.parse as urlparse
from scrapy import item
from scrapy import linkextractors
from scrapy import spiders
class SitemapItem(item.Item):
'''Class to represent an item in the sitemap.'''
loc = item.Field()
lastmod = item.Field()
priority = item.Field()
changefreq = item.Field()
class SitemapSpider(spiders.CrawlSpider):
name = 'sitemap'
MAINT_SERIES = [
'newton',
'ocata',
'pike',
'queens',
'rocky',
'stein',
]
MAINT_RELEASES_PAT = re.compile('^.*/(' + '|'.join(MAINT_SERIES) + ')/')
LATEST_PAT = re.compile('^.*/latest/')
rules = [
spiders.Rule(
linkextractors.LinkExtractor(
allow=[
r'.*\.html',
r'.*\.pdf',
r'.*\.xml',
r'.*\.txt',
r'.*/',
],
deny=[
r'/trunk/',
r'/draft/',
r'/austin/',
r'/bexar/',
r'/cactus/',
r'/diablo/',
r'/essex/',
r'/folsom/',
r'/grizzly/',
r'/havana/',
r'/icehouse/',
r'/juno/',
r'/kilo/',
r'/liberty/',
r'/mitaka/'
]
),
follow=True, callback='parse_item'
)
]
def __init__(self, domain='docs.openstack.org', urls='', *args, **kwargs):
super(SitemapSpider, self).__init__(*args, **kwargs)
self.domain = domain
self.allowed_domains = [domain]
self.start_urls = ['http://%s' % domain]
for url in urls.split(','):
if not url:
continue
self.start_urls.append(url)
def parse_item(self, response):
item = SitemapItem()
item['loc'] = response.url
path = urlparse.urlsplit(response.url).path
if self.MAINT_RELEASES_PAT.match(path):
# weekly changefrequency and highest prio for maintained release
item['priority'] = '1.0'
item['changefreq'] = 'weekly'
elif self.LATEST_PAT.match(path):
# daily changefrequency and normal priority for current files
item['priority'] = '0.5'
item['changefreq'] = 'daily'
else:
# These are unversioned documents
# daily changefrequency and highest priority for current files
item['priority'] = '1.0'
item['changefreq'] = 'daily'
if 'Last-Modified' in response.headers:
timestamp = response.headers['Last-Modified']
else:
timestamp = response.headers['Date']
lastmod = time.strptime(timestamp.decode("utf-8"),
"%a, %d %b %Y %H:%M:%S %Z")
item['lastmod'] = time.strftime("%Y-%m-%dT%H:%M:%S%z", lastmod)
return item