# Source: openstack-doc-tools/sitemap/generator/spiders/sitemap.py (79 lines, 2.3 KiB, Python)

# Licensed under the Apache License, Version 2.0 (the "License"); you may
# not use this file except in compliance with the License. You may obtain
# a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
# License for the specific language governing permissions and limitations
# under the License.
import posixpath
import time
import urlparse
from generator import items
from scrapy.linkextractors import sgml
from scrapy import spiders
class SitemapSpider(spiders.CrawlSpider):
    """Crawl a site and produce one ``SitemapItem`` per followed page.

    The crawl starts at ``http://<domain>/index.html`` and is restricted to
    ``domain``.  Links matching the ``allow`` patterns are followed unless
    they match one of the ``deny`` patterns, and every followed response is
    passed to :meth:`parse_item`.
    """

    name = 'sitemap'

    # Path prefixes whose pages are reported with a 'weekly' change
    # frequency instead of the default 'daily'.
    _weekly_prefixes = ('/juno', '/icehouse', '/havana')

    # Response headers consulted, in order of preference, for the
    # ``lastmod`` value, and the date format they are parsed with.
    _timestamp_headers = ('Last-Modified', 'Date')
    _http_date_format = "%a, %d %b %Y %H:%M:%S %Z"

    rules = [
        spiders.Rule(
            sgml.SgmlLinkExtractor(
                allow=[
                    r'.*\.html',
                    r'.*\.pdf',
                    r'.*\.xml',
                    r'.*\.txt',
                    r'.*/',
                ],
                deny=[
                    r'/trunk/',
                    r'/draft/',
                    r'/api/'
                ]
            ),
            follow=True, callback='parse_item'
        )
    ]

    def __init__(self, domain='docs.openstack.org', *args, **kwargs):
        """Set up the crawl for ``domain``.

        :param domain: host name to crawl; also the only allowed domain.
        :param args: positional arguments forwarded to the base spider.
        :param kwargs: keyword arguments forwarded to the base spider.
        """
        super(SitemapSpider, self).__init__(*args, **kwargs)
        self.domain = domain
        self.allowed_domains = [domain]
        self.start_urls = [
            'http://%s/index.html' % domain,
        ]

    def parse_item(self, response):
        """Build the sitemap entry for a crawled page.

        :param response: the response handed over by the crawl rule.
        :returns: a ``SitemapItem`` with ``loc``, ``priority``,
                  ``changefreq`` and ``lastmod`` filled in.
        """
        item = items.SitemapItem()
        item['priority'] = '0.5'
        item['changefreq'] = 'daily'
        item['loc'] = response.url

        path = urlparse.urlsplit(response.url).path
        filename = posixpath.basename(path)

        # Index pages (explicit index.html or a bare directory URL) get the
        # highest priority.
        if filename in ('index.html', ''):
            item['priority'] = '1.0'

        # str.startswith accepts a tuple, so one call covers every prefix.
        if path.startswith(self._weekly_prefixes):
            item['changefreq'] = 'weekly'

        # NOTE(review): %z is rendered from the parsed struct_time and may
        # reflect the local timezone rather than the header's GMT -- confirm
        # the crawler host runs in UTC before relying on the offset.
        item['lastmod'] = time.strftime("%Y-%m-%dT%H:%M:%S%z",
                                        self._last_modified(response))
        return item

    def _last_modified(self, response):
        """Return the best available modification time as a struct_time.

        ``Last-Modified`` is preferred.  Responses without it previously
        made the callback fail with a lookup error, so the ``Date`` header
        is tried next and the current UTC time is the last resort.  A header
        whose value cannot be parsed is skipped in the same way.
        """
        for header in self._timestamp_headers:
            if header not in response.headers:
                continue
            try:
                return time.strptime(response.headers[header],
                                     self._http_date_format)
            except ValueError:
                # Malformed date; fall through to the next candidate.
                continue
        return time.gmtime()