Tools used by OpenStack Documentation
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.
 
 
 
 

75 lines
2.2 KiB

  1. # Licensed under the Apache License, Version 2.0 (the "License"); you may
  2. # not use this file except in compliance with the License. You may obtain
  3. # a copy of the License at
  4. #
  5. # http://www.apache.org/licenses/LICENSE-2.0
  6. #
  7. # Unless required by applicable law or agreed to in writing, software
  8. # distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
  9. # WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
  10. # License for the specific language governing permissions and limitations
  11. # under the License.
  12. import posixpath
  13. import time
  14. import urlparse
  15. from scrapy.contrib.linkextractors import sgml
  16. from scrapy.contrib import spiders
  17. from generator import items
  18. class SitemapSpider(spiders.CrawlSpider):
  19. name = 'sitemap'
  20. rules = [
  21. spiders.Rule(
  22. sgml.SgmlLinkExtractor(
  23. allow=[
  24. r'.*\.html',
  25. r'.*\.pdf',
  26. r'.*\.xml',
  27. r'.*\.txt',
  28. r'.*/',
  29. ]
  30. ),
  31. follow=True, callback='parse_item'
  32. )
  33. ]
  34. def __init__(self, domain='docs.openstack.org', *args, **kwargs):
  35. super(SitemapSpider, self).__init__(*args, **kwargs)
  36. self.domain = domain
  37. self.allowed_domains = [domain]
  38. self.start_urls = [
  39. 'http://%s/index.html' % domain,
  40. ]
  41. def parse_item(self, response):
  42. item = items.SitemapItem()
  43. item['priority'] = '0.5'
  44. item['changefreq'] = 'daily'
  45. item['loc'] = response.url
  46. path = urlparse.urlsplit(response.url).path
  47. filename = posixpath.basename(path)
  48. if filename == 'index.html' or filename == '':
  49. item['priority'] = '1.0'
  50. weekly = [
  51. 'icehouse',
  52. 'havana',
  53. 'grizzly'
  54. ]
  55. for entry in weekly:
  56. if path.startswith("/%s" % entry):
  57. item['changefreq'] = 'weekly'
  58. lastmod = time.strptime(response.headers['Last-Modified'],
  59. "%a, %d %b %Y %H:%M:%S %Z")
  60. item['lastmod'] = time.strftime("%Y-%m-%dT%H:%M:%S%z", lastmod)
  61. return item