Tools used by OpenStack Documentation
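
Below is the source of the sitemap generator spider: a Scrapy CrawlSpider that crawls docs.openstack.org (or any domain passed in) and emits one sitemap entry per page crawled.
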
# Licensed under the Apache License, Version 2.0 (the "License"); you may
# not use this file except in compliance with the License. You may obtain
# a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
# License for the specific language governing permissions and limitations
# under the License.

import re
import time

try:
    import urlparse
except ImportError:
    import urllib.parse as urlparse

from scrapy import item
from scrapy import linkextractors
from scrapy import spiders


class SitemapItem(item.Item):
    '''Class to represent an item in the sitemap.'''
    loc = item.Field()
    lastmod = item.Field()
    priority = item.Field()
    changefreq = item.Field()
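
Each page the spider keeps becomes one of these items, and the four fields correspond one-to-one to the child elements of a <url> entry in the sitemaps.org protocol. A single exported entry would look roughly like this (URL and date are illustrative):

    <url>
      <loc>https://docs.openstack.org/pike/install/</loc>
      <lastmod>2018-02-01T10:15:30+0000</lastmod>
      <changefreq>weekly</changefreq>
      <priority>1.0</priority>
    </url>
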
class SitemapSpider(spiders.CrawlSpider):
    name = 'sitemap'

    # Currently maintained release series; these get the highest priority
    # and a weekly change frequency in parse_item() below.
    MAINT_SERIES = [
        'newton',
        'ocata',
        'pike',
    ]
    MAINT_RELEASES_PAT = re.compile('^/(' + '|'.join(MAINT_SERIES) + ')/')
    LATEST_PAT = re.compile('^/latest/')

    rules = [
        spiders.Rule(
            linkextractors.LinkExtractor(
                allow=[
                    r'.*\.html',
                    r'.*\.pdf',
                    r'.*\.xml',
                    r'.*\.txt',
                    r'.*/',
                ],
                # Skip drafts, trunk builds, and end-of-life release series.
                deny=[
                    r'/trunk/',
                    r'/draft/',
                    r'/austin/',
                    r'/bexar/',
                    r'/cactus/',
                    r'/diablo/',
                    r'/essex/',
                    r'/folsom/',
                    r'/grizzly/',
                    r'/havana/',
                    r'/icehouse/',
                    r'/juno/',
                    r'/kilo/',
                    r'/liberty/',
                    r'/mitaka/'
                ]
            ),
            follow=True, callback='parse_item'
        )
    ]

    def __init__(self, domain='docs.openstack.org', urls='', *args, **kwargs):
        super(SitemapSpider, self).__init__(*args, **kwargs)
        self.domain = domain
        self.allowed_domains = [domain]
        self.start_urls = ['http://%s' % domain]
        # Additional start URLs can be passed as a comma-separated list.
        for url in urls.split(','):
            if not url:
                continue
            self.start_urls.append(url)
    def parse_item(self, response):
        item = SitemapItem()
        item['loc'] = response.url

        path = urlparse.urlsplit(response.url).path
        if self.MAINT_RELEASES_PAT.match(path):
            # weekly changefrequency and highest prio for maintained releases
            item['priority'] = '1.0'
            item['changefreq'] = 'weekly'
        elif self.LATEST_PAT.match(path):
            # daily changefrequency and normal priority for current files
            item['priority'] = '0.5'
            item['changefreq'] = 'daily'
        else:
            # These are unversioned documents:
            # daily changefrequency and highest priority for current files
            item['priority'] = '1.0'
            item['changefreq'] = 'daily'

        if 'Last-Modified' in response.headers:
            timestamp = response.headers['Last-Modified']
        else:
            timestamp = response.headers['Date']
        # Scrapy header values are bytes on Python 3; decode before parsing
        # the HTTP date and re-formatting it as the W3C datetime sitemaps use.
        if isinstance(timestamp, bytes):
            timestamp = timestamp.decode('utf-8')
        lastmod = time.strptime(timestamp, "%a, %d %b %Y %H:%M:%S %Z")
        item['lastmod'] = time.strftime("%Y-%m-%dT%H:%M:%S%z", lastmod)
        return item
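
Running the spider requires the Scrapy project it ships with (the sitemap generator in openstack-doc-tools), which also configures how the items are serialized to a sitemap file. Assuming that layout, an invocation looks like the following, where -a feeds the constructor arguments above and -o names the output file:

    $ scrapy crawl sitemap -a domain=docs.openstack.org -o sitemap_docs.openstack.org.xml

With the patterns above, pages under a maintained series (/newton/, /ocata/, /pike/) are marked weekly with priority 1.0, /latest/ pages daily with priority 0.5, and unversioned pages daily with priority 1.0.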