Tools used by OpenStack Documentation
You cannot select more than 25 topics. Topic names must start with a letter or a number, may contain dashes ('-'), and can be up to 35 characters long.

113 lines
3.5KB

  1. # Licensed under the Apache License, Version 2.0 (the "License"); you may
  2. # not use this file except in compliance with the License. You may obtain
  3. # a copy of the License at
  4. #
  5. # http://www.apache.org/licenses/LICENSE-2.0
  6. #
  7. # Unless required by applicable law or agreed to in writing, software
  8. # distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
  9. # WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
  10. # License for the specific language governing permissions and limitations
  11. # under the License.
  12. import re
  13. import time
  14. try:
  15. import urlparse
  16. except ImportError:
  17. import urllib.parse as urlparse
  18. from scrapy import item
  19. from scrapy import linkextractors
  20. from scrapy import spiders
class SitemapItem(item.Item):
    """Class to represent an item in the sitemap.

    Each field corresponds to an element of a sitemap <url> entry
    as defined by the sitemaps.org protocol.
    """
    # Absolute URL of the crawled page.
    loc = item.Field()
    # Last modification time, formatted by parse_item as W3C datetime.
    lastmod = item.Field()
    # Crawl priority hint as a string, e.g. '0.5' or '1.0'.
    priority = item.Field()
    # Expected change frequency, e.g. 'daily' or 'weekly'.
    changefreq = item.Field()
  27. class SitemapSpider(spiders.CrawlSpider):
  28. name = 'sitemap'
  29. MAINT_SERIES = [
  30. 'newton',
  31. 'ocata',
  32. 'pike'
  33. ]
  34. MAINT_RELEASES_PAT = re.compile('^/(' + '|'.join(MAINT_SERIES) + ')/')
  35. LATEST_PAT = re.compile('^/latest/')
  36. rules = [
  37. spiders.Rule(
  38. linkextractors.LinkExtractor(
  39. allow=[
  40. r'.*\.html',
  41. r'.*\.pdf',
  42. r'.*\.xml',
  43. r'.*\.txt',
  44. r'.*/',
  45. ],
  46. deny=[
  47. r'/trunk/',
  48. r'/draft/',
  49. r'/austin/',
  50. r'/bexar/',
  51. r'/cactus/',
  52. r'/diablo/',
  53. r'/essex/',
  54. r'/folsom/',
  55. r'/grizzly/',
  56. r'/havana/',
  57. r'/icehouse/',
  58. r'/juno/',
  59. r'/kilo/',
  60. r'/liberty/',
  61. r'/mitaka/'
  62. ]
  63. ),
  64. follow=True, callback='parse_item'
  65. )
  66. ]
  67. def __init__(self, domain='docs.openstack.org', urls='', *args, **kwargs):
  68. super(SitemapSpider, self).__init__(*args, **kwargs)
  69. self.domain = domain
  70. self.allowed_domains = [domain]
  71. self.start_urls = ['http://%s' % domain]
  72. for url in urls.split(','):
  73. if not url:
  74. continue
  75. self.start_urls.append(url)
  76. def parse_item(self, response):
  77. item = SitemapItem()
  78. item['loc'] = response.url
  79. path = urlparse.urlsplit(response.url).path
  80. if self.MAINT_RELEASES_PAT.match(path):
  81. # weekly changefrequency and highest prio for maintained release
  82. item['priority'] = '1.0'
  83. item['changefreq'] = 'weekly'
  84. elif self.LATEST_PAT.match(path):
  85. # daily changefrequency and normal priority for current files
  86. item['priority'] = '0.5'
  87. item['changefreq'] = 'daily'
  88. else:
  89. # These are unversioned documents
  90. # daily changefrequency and highest priority for current files
  91. item['priority'] = '1.0'
  92. item['changefreq'] = 'daily'
  93. if 'Last-Modified' in response.headers:
  94. timestamp = response.headers['Last-Modified']
  95. else:
  96. timestamp = response.headers['Date']
  97. lastmod = time.strptime(timestamp, "%a, %d %b %Y %H:%M:%S %Z")
  98. item['lastmod'] = time.strftime("%Y-%m-%dT%H:%M:%S%z", lastmod)
  99. return item