79 lines
2.3 KiB
Python
79 lines
2.3 KiB
Python
# Licensed under the Apache License, Version 2.0 (the "License"); you may
# not use this file except in compliance with the License. You may obtain
# a copy of the License at
#
#      http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
# License for the specific language governing permissions and limitations
# under the License.
|
|
|
|
import posixpath
|
|
import time
|
|
import urlparse
|
|
|
|
from generator import items
|
|
from scrapy.linkextractors import sgml
|
|
from scrapy import spiders
|
|
|
|
|
|
class SitemapSpider(spiders.CrawlSpider):
    """Crawl a documentation site and build one sitemap item per response.

    The crawl starts at ``http://<domain>/index.html``. Links selected by
    ``rules`` are followed, and each response is passed to ``parse_item``,
    which fills an ``items.SitemapItem`` with ``loc``, ``priority``,
    ``changefreq`` and ``lastmod``.
    """

    name = 'sitemap'

    # Pages whose path starts with one of these prefixes get a 'weekly'
    # changefreq; every other page gets 'daily'.
    _WEEKLY_PREFIXES = ('/juno', '/icehouse', '/havana')

    # Layout expected for the Last-Modified / Date response headers,
    # e.g. "Tue, 15 Nov 1994 12:45:26 GMT".
    _HTTP_DATE_FORMAT = "%a, %d %b %Y %H:%M:%S %Z"

    # NOTE(review): if the extractor applies these patterns as unanchored
    # searches, r'.*/' matches every absolute URL and the other 'allow'
    # entries add nothing -- confirm that this is intended.
    rules = [
        spiders.Rule(
            sgml.SgmlLinkExtractor(
                allow=[
                    r'.*\.html',
                    r'.*\.pdf',
                    r'.*\.xml',
                    r'.*\.txt',
                    r'.*/',
                ],
                deny=[
                    r'/trunk/',
                    r'/draft/',
                    r'/api/'
                ]
            ),
            follow=True, callback='parse_item'
        )
    ]

    def __init__(self, domain='docs.openstack.org', *args, **kwargs):
        """Configure the spider for a single domain.

        :param domain: host name stored on ``self.domain``, used as the only
            entry of ``allowed_domains`` and to build the start URL.
        :param args: extra positional arguments passed to the base class.
        :param kwargs: extra keyword arguments passed to the base class.
        """
        super(SitemapSpider, self).__init__(*args, **kwargs)
        self.domain = domain
        self.allowed_domains = [domain]
        self.start_urls = [
            'http://%s/index.html' % domain,
        ]

    def parse_item(self, response):
        """Build the sitemap item for one crawled response.

        :param response: crawled response; its ``url`` and ``headers``
            attributes are read.
        :returns: an ``items.SitemapItem`` with ``priority``, ``changefreq``,
            ``loc`` and ``lastmod`` set.
        """
        item = items.SitemapItem()
        item['priority'] = '0.5'
        item['changefreq'] = 'daily'
        item['loc'] = response.url

        path = urlparse.urlsplit(response.url).path
        filename = posixpath.basename(path)

        # Index pages, including bare directory URLs whose basename is
        # empty, get the highest priority.
        if filename in ('index.html', ''):
            item['priority'] = '1.0'

        if path.startswith(self._WEEKLY_PREFIXES):
            item['changefreq'] = 'weekly'

        item['lastmod'] = self._lastmod(response.headers)
        return item

    def _lastmod(self, headers):
        """Return the ``lastmod`` string derived from response headers.

        ``Last-Modified`` is preferred, then ``Date``. If neither header is
        present and parseable, the current UTC time is used, so a missing
        or malformed header no longer makes the callback fail.

        :param headers: mapping-like response headers providing ``get``.
        :returns: timestamp formatted as ``YYYY-MM-DDTHH:MM:SS+0000``.
        """
        for header_name in ('Last-Modified', 'Date'):
            raw = headers.get(header_name)
            if not raw:
                continue
            # On Python 3 header values are bytes; on Python 2 ``bytes`` is
            # ``str`` and nothing needs decoding.
            if isinstance(raw, bytes) and not isinstance(raw, str):
                raw = raw.decode('latin-1')
            try:
                stamp = time.strptime(raw, self._HTTP_DATE_FORMAT)
            except ValueError:
                # Unparseable value: try the next candidate header.
                continue
            break
        else:
            # No usable header at all; fall back to "now" in UTC.
            stamp = time.gmtime()

        # HTTP dates are always expressed in GMT, so the offset is written
        # literally. "%z" applied to a parsed struct_time is
        # platform-dependent and may not reflect the parsed zone.
        return time.strftime("%Y-%m-%dT%H:%M:%S", stamp) + "+0000"
|