Tools used by OpenStack Documentation

# Licensed under the Apache License, Version 2.0 (the "License"); you may
# not use this file except in compliance with the License. You may obtain
# a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
# License for the specific language governing permissions and limitations
# under the License.

import re
import time

# urlparse moved to urllib.parse in Python 3; support both interpreters.
try:
    import urlparse
except ImportError:
    import urllib.parse as urlparse

from scrapy import item
from scrapy import linkextractors
from scrapy import spiders
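

# Scrapy item representing one <url> entry in the generated sitemap; its
# fields map to the loc, lastmod, changefreq and priority elements of the
# sitemaps.org protocol.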
class SitemapItem(item.Item):
    '''Class to represent an item in the sitemap.'''
    loc = item.Field()
    lastmod = item.Field()
    priority = item.Field()
    changefreq = item.Field()
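

# Crawls a documentation site and yields one SitemapItem per page. Pages
# under maintained release series are ranked differently from /latest/
# and unversioned pages; end-of-life series are skipped entirely.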
class SitemapSpider(spiders.CrawlSpider):
    name = 'sitemap'

    # Release series that still receive updates. Paths matching
    # MAINT_RELEASES_PAT belong to a maintained release; paths matching
    # LATEST_PAT belong to the continuously rebuilt /latest/ docs.
    MAINT_SERIES = [
        'newton',
        'ocata',
        'pike',
        'queens',
        'rocky',
        'stein',
    ]
    MAINT_RELEASES_PAT = re.compile('^.*/(' + '|'.join(MAINT_SERIES) + ')/')
    LATEST_PAT = re.compile('^.*/latest/')
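
    # A single crawl rule: follow HTML, PDF, XML and text files as well as
    # directory URLs, but never descend into draft, trunk or end-of-life
    # release directories. Every fetched page is handed to parse_item().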
    rules = [
        spiders.Rule(
            linkextractors.LinkExtractor(
                allow=[
                    r'.*\.html',
                    r'.*\.pdf',
                    r'.*\.xml',
                    r'.*\.txt',
                    r'.*/',
                ],
                deny=[
                    r'/trunk/',
                    r'/draft/',
                    r'/austin/',
                    r'/bexar/',
                    r'/cactus/',
                    r'/diablo/',
                    r'/essex/',
                    r'/folsom/',
                    r'/grizzly/',
                    r'/havana/',
                    r'/icehouse/',
                    r'/juno/',
                    r'/kilo/',
                    r'/liberty/',
                    r'/mitaka/',
                ]
            ),
            follow=True, callback='parse_item'
        )
    ]
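
    # domain and urls can be overridden from the command line with
    # Scrapy's -a option (e.g. -a domain=docs.openstack.org), since spider
    # arguments are passed through as constructor keyword arguments.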
    def __init__(self, domain='docs.openstack.org', urls='', *args, **kwargs):
        super(SitemapSpider, self).__init__(*args, **kwargs)
        self.domain = domain
        self.allowed_domains = [domain]
        self.start_urls = ['http://%s' % domain]
        for url in urls.split(','):
            if not url:
                continue
            self.start_urls.append(url)
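
    # Build the sitemap entry for one crawled page: the URL itself, a
    # priority/changefreq pair derived from the URL path, and the
    # last-modification time taken from the HTTP response headers.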
    def parse_item(self, response):
        item = SitemapItem()
        item['loc'] = response.url

        path = urlparse.urlsplit(response.url).path
        if self.MAINT_RELEASES_PAT.match(path):
            # weekly changefreq and highest priority for maintained releases
            item['priority'] = '1.0'
            item['changefreq'] = 'weekly'
        elif self.LATEST_PAT.match(path):
            # daily changefreq and normal priority for current files
            item['priority'] = '0.5'
            item['changefreq'] = 'daily'
        else:
            # These are unversioned documents:
            # daily changefreq and highest priority for current files
            item['priority'] = '1.0'
            item['changefreq'] = 'daily'

        if 'Last-Modified' in response.headers:
            timestamp = response.headers['Last-Modified']
        else:
            timestamp = response.headers['Date']
        # Header values are bytes under Python 3; decode before parsing the
        # HTTP date so time.strptime() receives a str.
        if isinstance(timestamp, bytes):
            timestamp = timestamp.decode('utf-8')
        lastmod = time.strptime(timestamp, "%a, %d %b %Y %H:%M:%S %Z")
        item['lastmod'] = time.strftime("%Y-%m-%dT%H:%M:%S%z", lastmod)
        return item
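
A minimal sketch of how the spider is typically invoked, assuming the file
lives in a Scrapy project's spiders directory (the -a and -o flags are
standard Scrapy CLI options; the output filename is illustrative):

    scrapy crawl sitemap -a domain=docs.openstack.org -o items.json

The -a options are forwarded to SitemapSpider.__init__() as keyword
arguments, and -o exports the collected SitemapItem objects with one of
Scrapy's built-in feed exporters; writing the final sitemap.xml is left to
an item pipeline configured in the project settings.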