Tools used by OpenStack Documentation
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.
 
 
 
 

91 lines
2.8 KiB

  1. # Licensed under the Apache License, Version 2.0 (the "License"); you may
  2. # not use this file except in compliance with the License. You may obtain
  3. # a copy of the License at
  4. #
  5. # http://www.apache.org/licenses/LICENSE-2.0
  6. #
  7. # Unless required by applicable law or agreed to in writing, software
  8. # distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
  9. # WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
  10. # License for the specific language governing permissions and limitations
  11. # under the License.
  12. import time
  13. import urlparse
  14. from scrapy.linkextractors import LinkExtractor
  15. from scrapy import spiders
  16. from sitemap.generator import items
  17. class SitemapSpider(spiders.CrawlSpider):
  18. name = 'sitemap'
  19. old_releases = tuple(["/%s" % old_release for old_release in [
  20. 'austin',
  21. 'bexar',
  22. 'cactus',
  23. 'diablo',
  24. 'essex',
  25. 'folsom',
  26. 'grizzly',
  27. 'havana',
  28. 'icehouse',
  29. 'juno',
  30. 'kilo',
  31. 'liberty'
  32. ]])
  33. rules = [
  34. spiders.Rule(
  35. LinkExtractor(
  36. allow=[
  37. r'.*\.html',
  38. r'.*\.pdf',
  39. r'.*\.xml',
  40. r'.*\.txt',
  41. r'.*/',
  42. ],
  43. deny=[
  44. r'/trunk/',
  45. r'/draft/',
  46. r'/api/',
  47. r'/juno/',
  48. r'/icehouse/'
  49. ]
  50. ),
  51. follow=True, callback='parse_item'
  52. )
  53. ]
  54. def __init__(self, domain='docs.openstack.org', urls='', *args, **kwargs):
  55. super(SitemapSpider, self).__init__(*args, **kwargs)
  56. self.domain = domain
  57. self.allowed_domains = [domain]
  58. self.start_urls = ['http://%s' % domain]
  59. for url in urls.split(','):
  60. if not url:
  61. continue
  62. self.start_urls.append(url)
  63. def parse_item(self, response):
  64. item = items.SitemapItem()
  65. item['loc'] = response.url
  66. path = urlparse.urlsplit(response.url).path
  67. if path.startswith(self.old_releases):
  68. # weekly changefrequency and lower priority for old files
  69. item['priority'] = '0.5'
  70. item['changefreq'] = 'weekly'
  71. else:
  72. # daily changefrequency and highest priority for current files
  73. item['priority'] = '1.0'
  74. item['changefreq'] = 'daily'
  75. if 'Last-Modified' in response.headers:
  76. timestamp = response.headers['Last-Modified']
  77. else:
  78. timestamp = response.headers['Date']
  79. lastmod = time.strptime(timestamp, "%a, %d %b %Y %H:%M:%S %Z")
  80. item['lastmod'] = time.strftime("%Y-%m-%dT%H:%M:%S%z", lastmod)
  81. return item