Christian Berendt 94d69f1868 [sitemap] resolve remaining ScrapyDeprecationWarning exceptions
* SgmlLinkExtractor is deprecated
* Module `scrapy.contrib.exporter` is deprecated

Change-Id: Ie3207a537abfd2b75602000fb4fef8f4c1e8ba56
2015-10-02 10:34:13 +02:00

91 lines
3.1 KiB
Python

# Licensed under the Apache License, Version 2.0 (the "License"); you may
# not use this file except in compliance with the License. You may obtain
# a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
# License for the specific language governing permissions and limitations
# under the License.
import os
import lxml
import scrapy
from scrapy import exporters
class SitemapItemExporter(exporters.XmlItemExporter):
    '''XmlItemExporter whose root element carries the sitemap attributes.'''

    def start_exporting(self):
        '''Start the document and open the root element.

        The root element is written with the sitemaps.org 0.9 namespace,
        the XML Schema instance namespace and the schema location.
        '''
        sitemap_ns = "http://www.sitemaps.org/schemas/sitemap/0.9"
        xsi_ns = "http://www.w3.org/2001/XMLSchema-instance"
        # Namespace and XSD URL, separated by a single space.
        schema_location = (
            "http://www.sitemaps.org/schemas/sitemap/0.9 "
            "http://www.sitemaps.org/schemas/sitemap/0.9/sitemap.xsd"
        )
        root_attrs = {
            "xmlns": sitemap_ns,
            "xmlns:xsi": xsi_ns,
            "xsi:schemaLocation": schema_location,
        }

        self.xg.startDocument()
        self.xg.startElement(self.root_element, root_attrs)
class IgnoreDuplicateUrls(object):
    '''Pipeline that drops items whose URL was already seen.'''

    def __init__(self):
        # 'loc' values of every item that has passed through so far.
        self.processed = set()

    def process_item(self, item, spider):
        '''Let a first-seen URL through and reject a repeated one.

        The URL is read from item['loc']. A new URL is recorded and the
        item is returned unchanged; a URL that was recorded before raises
        scrapy.exceptions.DropItem.
        '''
        url = item['loc']
        if url not in self.processed:
            self.processed.add(url)
            return item
        raise scrapy.exceptions.DropItem("Duplicate URL found: %s." % url)
class ExportSitemap(object):
    '''Write found URLs to a sitemap file.

    Based on http://doc.scrapy.org/en/latest/topics/exporters.html.

    A file named sitemap_<spider.domain>.xml is created in the current
    working directory when a spider opens; when the spider closes the
    file is re-read and written back pretty-printed.
    '''

    def __init__(self):
        # Open output files, keyed by the spider writing to them.
        self.files = {}
        # Exporter created by the most recent spider_opened() call.
        self.exporter = None

    @classmethod
    def from_crawler(cls, crawler):
        '''Build the pipeline and connect it to the spider signals.'''
        pipeline = cls()
        crawler.signals.connect(pipeline.spider_opened,
                                scrapy.signals.spider_opened)
        crawler.signals.connect(pipeline.spider_closed,
                                scrapy.signals.spider_closed)
        return pipeline

    @staticmethod
    def _sitemap_path(spider):
        '''Return the path of the sitemap file for the given spider.'''
        return os.path.join(os.getcwd(), 'sitemap_%s.xml' % spider.domain)

    def spider_opened(self, spider):
        '''Open the sitemap file for the spider and start the export.'''
        # Binary mode: the exporter writes encoded bytes, so a text-mode
        # file would raise TypeError on Python 3.
        output = open(self._sitemap_path(spider), 'wb')
        self.files[spider] = output
        self.exporter = SitemapItemExporter(output, item_element='url',
                                            root_element='urlset')
        self.exporter.start_exporting()

    def spider_closed(self, spider):
        '''Finish the export and rewrite the sitemap pretty-printed.'''
        # A bare "import lxml" does not load the etree submodule, so make
        # sure it is available before it is used below.
        import lxml.etree

        try:
            self.exporter.finish_exporting()
        finally:
            # Close the file even when finishing the export fails.
            self.files.pop(spider).close()

        path = self._sitemap_path(spider)
        tree = lxml.etree.parse(path)
        # tostring() returns bytes here, hence binary mode again.
        with open(path, 'wb') as pretty:
            pretty.write(lxml.etree.tostring(tree, pretty_print=True))

    def process_item(self, item, spider):
        '''Export the item through the active exporter and return it.'''
        self.exporter.export_item(item)
        return item