Tools used by OpenStack Documentation
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

82 lines
2.4KB

# Licensed under the Apache License, Version 2.0 (the "License"); you may
# not use this file except in compliance with the License. You may obtain
# a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
# License for the specific language governing permissions and limitations
# under the License.
  12. import posixpath
  13. import time
  14. import urlparse
  15. from generator import items
  16. from scrapy.linkextractors import LinkExtractor
  17. from scrapy import spiders
  18. class SitemapSpider(spiders.CrawlSpider):
  19. name = 'sitemap'
  20. rules = [
  21. spiders.Rule(
  22. LinkExtractor(
  23. allow=[
  24. r'.*\.html',
  25. r'.*\.pdf',
  26. r'.*\.xml',
  27. r'.*\.txt',
  28. r'.*/',
  29. ],
  30. deny=[
  31. r'/trunk/',
  32. r'/draft/',
  33. r'/api/'
  34. ]
  35. ),
  36. follow=True, callback='parse_item'
  37. )
  38. ]
  39. def __init__(self, domain='docs.openstack.org', urls='', *args, **kwargs):
  40. super(SitemapSpider, self).__init__(*args, **kwargs)
  41. self.domain = domain
  42. self.allowed_domains = [domain]
  43. self.start_urls = ['http://%s/index.html' % domain]
  44. for url in urls.split(','):
  45. self.start_urls.append(url)
  46. def parse_item(self, response):
  47. item = items.SitemapItem()
  48. item['priority'] = '0.5'
  49. item['changefreq'] = 'daily'
  50. item['loc'] = response.url
  51. path = urlparse.urlsplit(response.url).path
  52. filename = posixpath.basename(path)
  53. if filename == 'index.html' or filename == '':
  54. item['priority'] = '1.0'
  55. weekly = [
  56. 'juno',
  57. 'icehouse',
  58. 'havana'
  59. ]
  60. for entry in weekly:
  61. if path.startswith("/%s" % entry):
  62. item['changefreq'] = 'weekly'
  63. if 'Last-Modified' in response.headers:
  64. timestamp = response.headers['Last-Modified']
  65. else:
  66. timestamp = response.headers['Date']
  67. lastmod = time.strptime(timestamp, "%a, %d %b %Y %H:%M:%S %Z")
  68. item['lastmod'] = time.strftime("%Y-%m-%dT%H:%M:%S%z", lastmod)
  69. return item