Browse Source

[sitemap] introduce attribute to define start URLs

Change-Id: I09018a824310ef48f6b62f04f09d6fd704808119
Closes-bug: #1506601
tags/0.32.0
Christian Berendt 4 years ago
parent
commit
815fcb5bb6
2 changed files with 9 additions and 4 deletions
  1. 5
    0
      sitemap/README.rst
  2. 4
    4
      sitemap/generator/spiders/sitemap.py

+ 5
- 0
sitemap/README.rst View File

@@ -24,6 +24,11 @@ The result will be available in the file ``sitemap_developer.openstack.org.xml``
24 24
 
25 25
 To write log messages into a file append the parameter ``-s LOG_FILE=scrapy.log``.
26 26
 
27
+It is possible to define a set of additional start URLs using the attribute
28
+``urls``. Separate multiple URLs with a comma (``,``).
29
+
30
+    $ scrapy crawl sitemap -a domain=developer.openstack.org -a urls="http://developer.openstack.org/de/api-guide/quick-start/"
31
+
27 32
 Dependencies
28 33
 ============
29 34
 

+ 4
- 4
sitemap/generator/spiders/sitemap.py View File

@@ -42,13 +42,13 @@ class SitemapSpider(spiders.CrawlSpider):
42 42
         )
43 43
     ]
44 44
 
45
-    def __init__(self, domain='docs.openstack.org', *args, **kwargs):
45
+    def __init__(self, domain='docs.openstack.org', urls='', *args, **kwargs):
46 46
         super(SitemapSpider, self).__init__(*args, **kwargs)
47 47
         self.domain = domain
48 48
         self.allowed_domains = [domain]
49
-        self.start_urls = [
50
-            'http://%s/index.html' % domain,
51
-        ]
49
+        self.start_urls = ['http://%s/index.html' % domain]
50
+        for url in urls.split(','):
51
+            self.start_urls.append(url)
52 52
 
53 53
     def parse_item(self, response):
54 54
         item = items.SitemapItem()

Loading…
Cancel
Save