Tools used by OpenStack Documentation

# Licensed under the Apache License, Version 2.0 (the "License"); you may
# not use this file except in compliance with the License. You may obtain
# a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
# License for the specific language governing permissions and limitations
# under the License.

import os

import lxml.etree
import scrapy.exceptions
import scrapy.signals
from scrapy import exporters


class SitemapItemExporter(exporters.XmlItemExporter):
    '''XmlItemExporter with adjusted attributes for the root element.'''

    def start_exporting(self):
        '''Set namespace / schema attributes for the root element.'''
        self.xg.startDocument()
        self.xg.startElement(self.root_element, {
            "xmlns": "http://www.sitemaps.org/schemas/sitemap/0.9",
            "xmlns:xsi": "http://www.w3.org/2001/XMLSchema-instance",
            "xsi:schemaLocation":
                "http://www.sitemaps.org/schemas/sitemap/0.9 "
                "http://www.sitemaps.org/schemas/sitemap/0.9/sitemap.xsd"
        })


class IgnoreDuplicateUrls(object):
    '''Ignore duplicated URLs.'''

    def __init__(self):
        self.processed = set()

    def process_item(self, item, spider):
        '''Check if a URL was already found.'''
        if item['loc'] in self.processed:
            raise scrapy.exceptions.DropItem("Duplicate URL found: %s."
                                             % item['loc'])
        else:
            self.processed.add(item['loc'])
        return item


class ExportSitemap(object):
    '''Write found URLs to a sitemap file.

    Based on http://doc.scrapy.org/en/latest/topics/exporters.html.
    '''

    def __init__(self):
        self.files = {}
        # A single exporter is kept, so this pipeline assumes one spider
        # per crawler process.
        self.exporter = None

    @classmethod
    def from_crawler(cls, crawler):
        pipeline = cls()
        crawler.signals.connect(pipeline.spider_opened,
                                scrapy.signals.spider_opened)
        crawler.signals.connect(pipeline.spider_closed,
                                scrapy.signals.spider_closed)
        return pipeline

    def spider_opened(self, spider):
        output = open(os.path.join(os.getcwd(), 'sitemap_%s.xml'
                                   % spider.domain), 'w')
        self.files[spider] = output
        self.exporter = SitemapItemExporter(output, item_element='url',
                                            root_element='urlset')
        self.exporter.start_exporting()

    def spider_closed(self, spider):
        self.exporter.finish_exporting()
        output = self.files.pop(spider)
        output.close()
        # Re-read the generated file and rewrite it pretty-printed for
        # readability.
        tree = lxml.etree.parse(os.path.join(os.getcwd(), "sitemap_%s.xml"
                                             % spider.domain))
        with open(os.path.join(os.getcwd(), "sitemap_%s.xml" % spider.domain),
                  'w') as pretty:
            pretty.write(lxml.etree.tostring(tree, pretty_print=True,
                                             encoding='unicode'))

    def process_item(self, item, spider):
        self.exporter.export_item(item)
        return item
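
These pipelines only take effect once they are enabled in the crawler's Scrapy
settings. A minimal sketch follows, assuming the Scrapy project package is
named "generator"; the module path is an assumption and must match the actual
project layout:

# settings.py (sketch; the package name "generator" is an assumption)
ITEM_PIPELINES = {
    # Drop items whose 'loc' URL has already been seen.
    'generator.pipelines.IgnoreDuplicateUrls': 300,
    # Write the remaining URLs to sitemap_<domain>.xml.
    'generator.pipelines.ExportSitemap': 800,
}

Lower numbers run first, so duplicates are filtered out before the exporter
writes the sitemap entries.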
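
The pipelines also make two assumptions about the spider feeding them: it
exposes a domain attribute (used to name sitemap_<domain>.xml) and every
yielded item carries a loc field. A hypothetical spider satisfying those
expectations could look like the sketch below; the actual spider shipped with
the tool is not part of this file:

import scrapy


class DocsSpider(scrapy.Spider):
    '''Hypothetical spider; only shows the attributes the pipelines
    above rely on (spider.domain and item['loc']).'''
    name = 'docs'
    domain = 'docs.openstack.org'  # used in the sitemap file name
    start_urls = ['https://docs.openstack.org/']

    def parse(self, response):
        # Each crawled page becomes one <url><loc>...</loc></url> entry.
        yield {'loc': response.url}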