Tools used by OpenStack Documentation

# Licensed under the Apache License, Version 2.0 (the "License"); you may
# not use this file except in compliance with the License. You may obtain
# a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
# License for the specific language governing permissions and limitations
# under the License.

import time
import urlparse

from scrapy import item
from scrapy.linkextractors import LinkExtractor
from scrapy import spiders


class SitemapItem(item.Item):
    '''Class to represent an item in the sitemap.'''
    loc = item.Field()
    lastmod = item.Field()
    priority = item.Field()
    changefreq = item.Field()


class SitemapSpider(spiders.CrawlSpider):
    name = 'sitemap'

    # Paths of releases that are no longer current; pages under these
    # prefixes get a lower priority and a weekly change frequency.
    old_releases = tuple(["/%s" % old_release for old_release in [
        'austin',
        'bexar',
        'cactus',
        'diablo',
        'essex',
        'folsom',
        'grizzly',
        'havana',
        'icehouse',
        'juno',
        'kilo',
        'liberty'
    ]])

    rules = [
        spiders.Rule(
            LinkExtractor(
                allow=[
                    r'.*\.html',
                    r'.*\.pdf',
                    r'.*\.xml',
                    r'.*\.txt',
                    r'.*/',
                ],
                deny=[
                    r'/trunk/',
                    r'/draft/',
                    r'/api/',
                    r'/juno/',
                    r'/icehouse/'
                ]
            ),
            follow=True, callback='parse_item'
        )
    ]

    def __init__(self, domain='docs.openstack.org', urls='', *args, **kwargs):
        super(SitemapSpider, self).__init__(*args, **kwargs)
        self.domain = domain
        self.allowed_domains = [domain]
        self.start_urls = ['http://%s' % domain]
        # Optional comma-separated list of extra start URLs.
        for url in urls.split(','):
            if not url:
                continue
            self.start_urls.append(url)

    def parse_item(self, response):
        item = SitemapItem()
        item['loc'] = response.url

        path = urlparse.urlsplit(response.url).path
        if path.startswith(self.old_releases):
            # weekly change frequency and lower priority for old files
            item['priority'] = '0.5'
            item['changefreq'] = 'weekly'
        else:
            # daily change frequency and highest priority for current files
            item['priority'] = '1.0'
            item['changefreq'] = 'daily'

        # Prefer the server's Last-Modified header; fall back to Date.
        if 'Last-Modified' in response.headers:
            timestamp = response.headers['Last-Modified']
        else:
            timestamp = response.headers['Date']
        lastmod = time.strptime(timestamp, "%a, %d %b %Y %H:%M:%S %Z")
        item['lastmod'] = time.strftime("%Y-%m-%dT%H:%M:%S%z", lastmod)
        return item
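
For reference, a minimal sketch of how one might drive this spider programmatically, assuming Scrapy 1.x on Python 2 to match the imports above. CrawlerProcess and the FEED_FORMAT/FEED_URI settings are standard Scrapy API; the user agent string and output filename here are illustrative assumptions, not taken from this repository. Note that Scrapy's XML feed export wraps items in its own <items>/<item> elements, so the file still needs to be transformed into the final <urlset> sitemap format; that step is outside this spider.

    # A minimal sketch, not the repository's own runner script.
    from scrapy.crawler import CrawlerProcess

    process = CrawlerProcess({
        'USER_AGENT': 'sitemap-generator',               # assumed value
        'FEED_FORMAT': 'xml',                            # export items as XML
        'FEED_URI': 'sitemap_docs.openstack.org.xml',    # assumed filename
    })
    # Keyword arguments are passed to SitemapSpider.__init__ (domain, urls).
    process.crawl(SitemapSpider, domain='docs.openstack.org')
    process.start()  # blocks until the crawl finishes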