Tools used by OpenStack Documentation
# Licensed under the Apache License, Version 2.0 (the "License"); you may
# not use this file except in compliance with the License. You may obtain
# a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
# License for the specific language governing permissions and limitations
# under the License.
import re
import time

try:
    import urlparse
except ImportError:
    import urllib.parse as urlparse

from scrapy import item
from scrapy import linkextractors
from scrapy import spiders

class SitemapItem(item.Item):
    '''Class to represent an item in the sitemap.'''
    loc = item.Field()
    lastmod = item.Field()
    priority = item.Field()
    changefreq = item.Field()

class SitemapSpider(spiders.CrawlSpider):
    name = 'sitemap'

    MAINT_SERIES = [
        'newton',
        'ocata',
        'pike',
        'queens',
        'rocky',
        'stein',
    ]
    # These patterns are used with re.match(), which anchors at the start of
    # the string, so allow arbitrary leading path components before the
    # series name (e.g. /nova/pike/ or /nova/latest/).
    MAINT_RELEASES_PAT = re.compile('^.*/(' + '|'.join(MAINT_SERIES) + ')/')
    LATEST_PAT = re.compile('^.*/latest/')
    rules = [
        spiders.Rule(
            linkextractors.LinkExtractor(
                allow=[
                    r'.*\.html',
                    r'.*\.pdf',
                    r'.*\.xml',
                    r'.*\.txt',
                    r'.*/',
                ],
                deny=[
                    r'/trunk/',
                    r'/draft/',
                    r'/austin/',
                    r'/bexar/',
                    r'/cactus/',
                    r'/diablo/',
                    r'/essex/',
                    r'/folsom/',
                    r'/grizzly/',
                    r'/havana/',
                    r'/icehouse/',
                    r'/juno/',
                    r'/kilo/',
                    r'/liberty/',
                    r'/mitaka/',
                ]
            ),
            follow=True, callback='parse_item'
        )
    ]
    def __init__(self, domain='docs.openstack.org', urls='', *args, **kwargs):
        super(SitemapSpider, self).__init__(*args, **kwargs)
        self.domain = domain
        self.allowed_domains = [domain]
        self.start_urls = ['http://%s' % domain]
        for url in urls.split(','):
            if not url:
                continue
            self.start_urls.append(url)
    def parse_item(self, response):
        item = SitemapItem()
        item['loc'] = response.url

        path = urlparse.urlsplit(response.url).path
        if self.MAINT_RELEASES_PAT.match(path):
            # Weekly change frequency and highest priority for
            # maintained releases.
            item['priority'] = '1.0'
            item['changefreq'] = 'weekly'
        elif self.LATEST_PAT.match(path):
            # Daily change frequency and normal priority for current files.
            item['priority'] = '0.5'
            item['changefreq'] = 'daily'
        else:
            # These are unversioned documents: daily change frequency and
            # highest priority.
            item['priority'] = '1.0'
            item['changefreq'] = 'daily'

        if 'Last-Modified' in response.headers:
            timestamp = response.headers['Last-Modified']
        else:
            timestamp = response.headers['Date']
        # Scrapy returns header values as bytes on Python 3; decode before
        # parsing the timestamp.
        if isinstance(timestamp, bytes):
            timestamp = timestamp.decode('utf-8')
        lastmod = time.strptime(timestamp, "%a, %d %b %Y %H:%M:%S %Z")
        item['lastmod'] = time.strftime("%Y-%m-%dT%H:%M:%S%z", lastmod)
        return item
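
To produce a sitemap from this spider, it can be run programmatically with Scrapy's CrawlerProcess. The snippet below is a minimal sketch, assuming the file above is saved as sitemap_file.py and a Scrapy version that supports the FEEDS setting (2.1 or later) is installed; the module name, feed file name, and user agent are illustrative, not part of the file above.

    # Minimal runner sketch: crawl docs.openstack.org and write the collected
    # SitemapItems to a JSON feed. Assumes the spider above is importable as
    # sitemap_file; adjust the module name to match your layout.
    from scrapy import crawler

    from sitemap_file import SitemapSpider

    process = crawler.CrawlerProcess(settings={
        'FEEDS': {'sitemap_items.json': {'format': 'json'}},
        'USER_AGENT': 'sitemap-generator',  # placeholder user agent
    })
    # Keyword arguments are forwarded to SitemapSpider.__init__().
    process.crawl(SitemapSpider, domain='docs.openstack.org')
    process.start()  # blocks until the crawl finishes

The JSON feed then holds one object per crawled page with loc, lastmod, priority, and changefreq fields, which a separate step can render into sitemap XML.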