From 815fcb5bb6ffd395515aa6890c773bdbeea5f248 Mon Sep 17 00:00:00 2001
From: Christian Berendt
Date: Fri, 16 Oct 2015 07:52:15 +0200
Subject: [PATCH] [sitemap] introduce attribute to define start URLs

Change-Id: I09018a824310ef48f6b62f04f09d6fd704808119
Closes-bug: #1506601
---
 sitemap/README.rst                   |  5 +++++
 sitemap/generator/spiders/sitemap.py | 17 +++++++++++++----
 2 files changed, 18 insertions(+), 4 deletions(-)

diff --git a/sitemap/README.rst b/sitemap/README.rst
index 42f5386b..72c571c1 100644
--- a/sitemap/README.rst
+++ b/sitemap/README.rst
@@ -24,6 +24,11 @@ The result will be available in the file ``sitemap_developer.openstack.org.xml``
 
 To write log messages into a file append the parameter ``-s LOG_FILE=scrapy.log``.
 
+It is possible to define a set of additional start URLs using the attribute
+``urls``. Separate multiple URLs with ``,``.
+
+    $ scrapy crawl sitemap -a domain=developer.openstack.org -a urls="http://developer.openstack.org/de/api-guide/quick-start/"
+
 Dependencies
 ============
 
diff --git a/sitemap/generator/spiders/sitemap.py b/sitemap/generator/spiders/sitemap.py
index 4650966d..44009d56 100644
--- a/sitemap/generator/spiders/sitemap.py
+++ b/sitemap/generator/spiders/sitemap.py
@@ -42,13 +42,22 @@ class SitemapSpider(spiders.CrawlSpider):
         )
     ]
 
-    def __init__(self, domain='docs.openstack.org', *args, **kwargs):
+    def __init__(self, domain='docs.openstack.org', urls='', *args, **kwargs):
+        """Set up the crawl domain and the list of start URLs.
+
+        :param domain: domain to crawl; it is also the only allowed domain.
+        :param urls: comma-separated string of additional start URLs.
+        """
         super(SitemapSpider, self).__init__(*args, **kwargs)
         self.domain = domain
         self.allowed_domains = [domain]
-        self.start_urls = [
-            'http://%s/index.html' % domain,
-        ]
+        self.start_urls = ['http://%s/index.html' % domain]
+        # ''.split(',') returns [''], so blank entries have to be skipped;
+        # otherwise an empty start URL is added whenever ``urls`` is unset.
+        for url in urls.split(','):
+            url = url.strip()
+            if url:
+                self.start_urls.append(url)
 
     def parse_item(self, response):
         item = items.SitemapItem()