# Tools used by OpenStack Documentation
# Licensed under the Apache License, Version 2.0 (the "License"); you may
# not use this file except in compliance with the License. You may obtain
# a copy of the License at
#
#      http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
# License for the specific language governing permissions and limitations
# under the License.
import posixpath
import time
import urlparse
from scrapy.contrib.linkextractors import sgml
from scrapy.contrib import spiders
from generator import items
# Crawl spider registered under the name 'sitemap'. For every response passed
# to parse_item it builds one items.SitemapItem carrying the page URL ('loc'),
# a priority, a change frequency and a last-modification timestamp.
class SitemapSpider(spiders.CrawlSpider):
name = 'sitemap'
# NOTE(review): the rule definition looks truncated in this view -- only the
# follow/callback keyword arguments are visible; whatever wraps them (and the
# closing brackets) is not shown. Presumably these are arguments to a crawl
# rule built around a link extractor from the imported `sgml` module -- confirm
# against the full file.
rules = [
# The callback is referenced by name; it matches the parse_item method below.
follow=True, callback='parse_item'
# Set up the spider for a single host.
#
# domain: host name stored on the instance, used as the only entry of
#         allowed_domains and interpolated into the start URL
#         'http://<domain>/index.html'.
# *args, **kwargs: forwarded unchanged to the parent class constructor.
#
# NOTE(review): with the default domain='' the start URL becomes
# 'http:///index.html' -- callers presumably always pass a domain; confirm.
def __init__(self, domain='', *args, **kwargs):
super(SitemapSpider, self).__init__(*args, **kwargs)
self.domain = domain
self.allowed_domains = [domain]
# NOTE(review): the closing bracket of this list is not visible in this view.
self.start_urls = [
'http://%s/index.html' % domain,
# Build the sitemap entry for one response.
#
# response: object exposing `url` and a `headers` mapping that is indexed
#           with 'Last-Modified'.
# Returns the populated items.SitemapItem.
def parse_item(self, response):
item = items.SitemapItem()
# Defaults; both may be overridden further down.
item['priority'] = '0.5'
item['changefreq'] = 'daily'
item['loc'] = response.url
# Only the path component of the URL drives the priority/changefreq decisions.
path = urlparse.urlsplit(response.url).path
filename = posixpath.basename(path)
# An explicit index.html, or a path with an empty last component (e.g. one
# ending in '/'), gets the highest priority.
if filename == 'index.html' or filename == '':
item['priority'] = '1.0'
# Path prefixes whose pages are marked as changing weekly instead of daily.
# NOTE(review): the list entries and closing bracket are not visible in this
# view.
weekly = [
for entry in weekly:
# The comparison is against "/<entry>", i.e. the start of the URL path.
if path.startswith("/%s" % entry):
item['changefreq'] = 'weekly'
# Parse the Last-Modified header with the day-name/date/time/zone-name pattern
# below, then re-emit it as "YYYY-MM-DDTHH:MM:SS" followed by the %z offset.
# NOTE(review): the header is indexed directly, so a response without
# Last-Modified presumably fails here -- confirm how `headers` behaves.
# NOTE(review): %z is formatted from the parsed struct_time; verify that the
# resulting offset reflects the header's zone rather than the local zone.
lastmod = time.strptime(response.headers['Last-Modified'],
"%a, %d %b %Y %H:%M:%S %Z")
item['lastmod'] = time.strftime("%Y-%m-%dT%H:%M:%S%z", lastmod)
return item