Tools used by OpenStack Documentation

pipelines.py (3.1 KB): Scrapy item pipelines that drop duplicate URLs and export the crawled URLs as a sitemap.org-compliant XML sitemap.

# Licensed under the Apache License, Version 2.0 (the "License"); you may
# not use this file except in compliance with the License. You may obtain
# a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
# License for the specific language governing permissions and limitations
# under the License.
import os

# 'import lxml' alone does not make the lxml.etree submodule available,
# so import it explicitly; likewise for the scrapy submodules used below.
import lxml.etree
import scrapy.exceptions
import scrapy.signals
from scrapy import exporters

class SitemapItemExporter(exporters.XmlItemExporter):
    '''XmlItemExporter with adjusted attributes for the root element.'''

    def start_exporting(self):
        '''Set namespace / schema attributes for the root element.'''
        self.xg.startDocument()
        self.xg.startElement(self.root_element, {
            "xmlns": "http://www.sitemaps.org/schemas/sitemap/0.9",
            "xmlns:xsi": "http://www.w3.org/2001/XMLSchema-instance",
            "xsi:schemaLocation":
                "http://www.sitemaps.org/schemas/sitemap/0.9 "
                "http://www.sitemaps.org/schemas/sitemap/0.9/sitemap.xsd"
        })

class IgnoreDuplicateUrls(object):
    '''Ignore duplicated URLs.'''

    def __init__(self):
        self.processed = set()

    def process_item(self, item, spider):
        '''Check if a URL was already found.'''
        if item['loc'] in self.processed:
            raise scrapy.exceptions.DropItem("Duplicate URL found: %s."
                                             % item['loc'])
        else:
            self.processed.add(item['loc'])
            return item

class ExportSitemap(object):
    '''Write found URLs to a sitemap file.

    Based on http://doc.scrapy.org/en/latest/topics/exporters.html.
    '''

    def __init__(self):
        self.files = {}
        self.exporter = None

    @classmethod
    def from_crawler(cls, crawler):
        pipeline = cls()
        crawler.signals.connect(pipeline.spider_opened,
                                scrapy.signals.spider_opened)
        crawler.signals.connect(pipeline.spider_closed,
                                scrapy.signals.spider_closed)
        return pipeline

    def spider_opened(self, spider):
        # Scrapy's XML exporter writes bytes, so the file must be
        # opened in binary mode.
        output = open(os.path.join(os.getcwd(), 'sitemap_%s.xml'
                                   % spider.domain), 'wb')
        self.files[spider] = output
        self.exporter = SitemapItemExporter(output, item_element='url',
                                            root_element='urlset')
        self.exporter.start_exporting()

    def spider_closed(self, spider):
        self.exporter.finish_exporting()
        output = self.files.pop(spider)
        output.close()
        # Re-read the finished sitemap and rewrite it pretty-printed.
        # lxml.etree.tostring() returns bytes, so write in binary mode.
        tree = lxml.etree.parse(os.path.join(os.getcwd(), "sitemap_%s.xml"
                                             % spider.domain))
        with open(os.path.join(os.getcwd(), "sitemap_%s.xml" % spider.domain),
                  'wb') as pretty:
            pretty.write(lxml.etree.tostring(tree, pretty_print=True))

    def process_item(self, item, spider):
        self.exporter.export_item(item)
        return item
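
These pipelines only take effect once they are registered in the crawler's settings. A minimal sketch of that wiring, assuming the module is importable as generator.pipelines (that module path and the order values are illustrative assumptions, not taken from this file):

# settings.py -- illustrative sketch; 'generator.pipelines' and the
# order values are assumptions, not part of the file above.
ITEM_PIPELINES = {
    # Drop duplicate URLs before they reach the exporter.
    'generator.pipelines.IgnoreDuplicateUrls': 300,
    # Export the surviving items to sitemap_<domain>.xml.
    'generator.pipelines.ExportSitemap': 400,
}

Scrapy runs item pipelines in ascending order of these values, and the DropItem raised by IgnoreDuplicateUrls stops a duplicate item before ExportSitemap ever sees it. Note that both pipelines expect items with a 'loc' field, and ExportSitemap expects the spider to expose a domain attribute used in the output file name.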