
Script to generate the sitemap.xml for docs.openstack.org

This script crawls all available pages on http://docs.openstack.org and extracts
all URLs. Based on these URLs the script generates a sitemap for search engines
according to the protocol described at http://www.sitemaps.org/protocol.html.

Change-Id: Id7839d2048989da503d31e436455aea9bb4cdc1d
tags/0.16
Christian Berendt, 5 years ago
commit 31c25891c1

sitemap/README.md (+39, -0)

@@ -0,0 +1,39 @@
+= Sitemap Generator
+
+This script crawls all available pages on http://docs.openstack.org and extracts
+all URLs. Based on these URLs the script generates a sitemap for search engines
+according to the protocol described at http://www.sitemaps.org/protocol.html.
+
+== Usage
+
+To generate a new sitemap file, simply run the spider using the
+following command. It will take several minutes to crawl all available pages
+on http://docs.openstack.org. The result will be available in the file
+```sitemap_docs.openstack.org.xml```.
+
+```
+$ scrapy crawl sitemap
+```
+
+It's also possible to crawl other sites using the spider argument ```domain```.
+
+For example, to crawl http://developer.openstack.org use the following command.
+The result will be available in the file ```sitemap_developer.openstack.org.xml```.
+
+```
+$ scrapy crawl sitemap -a domain=developer.openstack.org
+```
+
+To write log messages into a file, append the parameter ```-s LOG_FILE=scrapy.log```.
+
+== Dependencies
+
+* Scrapy (https://pypi.python.org/pypi/Scrapy)
+
+To install the needed modules you can use pip or the package management system
+included in your distribution. When using the package management system, the
+package names may differ. When using pip, it may be necessary to install some
+development packages first.
+
+```
+$ pip install scrapy
+```
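For reference, the generated file follows the sitemap protocol linked above. A single entry would look roughly like this (illustrative values only, not actual output of a crawl):

```
<?xml version="1.0" encoding="UTF-8"?>
<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
  <url>
    <loc>http://docs.openstack.org/index.html</loc>
    <lastmod>2014-05-01T12:00:00+0000</lastmod>
    <changefreq>daily</changefreq>
    <priority>1.0</priority>
  </url>
</urlset>
```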

sitemap/generator/__init__.py (+0, -0)


sitemap/generator/items.py (+21, -0)

@@ -0,0 +1,21 @@
+# Licensed under the Apache License, Version 2.0 (the "License"); you may
+# not use this file except in compliance with the License. You may obtain
+# a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+# License for the specific language governing permissions and limitations
+# under the License.
+
+import scrapy
+
+
+class SitemapItem(scrapy.item.Item):
+    '''Class to represent an item in the sitemap.'''
+    loc = scrapy.item.Field()
+    lastmod = scrapy.item.Field()
+    priority = scrapy.item.Field()
+    changefreq = scrapy.item.Field()
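The four fields mirror the child elements of a ```url``` entry in the sitemap protocol. As a minimal sketch of how such an item is filled (hypothetical values; the real values are set by the spider further down):

```
from generator import items

item = items.SitemapItem()
item['loc'] = 'http://docs.openstack.org/index.html'  # page URL
item['lastmod'] = '2014-05-01T12:00:00+0000'          # last modification time
item['changefreq'] = 'daily'                          # expected change frequency
item['priority'] = '1.0'                              # relative importance, 0.0 to 1.0
```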

sitemap/generator/pipelines.py (+89, -0)

@@ -0,0 +1,89 @@
+# Licensed under the Apache License, Version 2.0 (the "License"); you may
+# not use this file except in compliance with the License. You may obtain
+# a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+# License for the specific language governing permissions and limitations
+# under the License.
+
+import os
+
+import lxml.etree
+import scrapy
+from scrapy.contrib import exporter
+
+
+class SitemapItemExporter(exporter.XmlItemExporter):
+    '''XmlItemExporter with adjusted attributes for the root element.'''
+
+    def start_exporting(self):
+        '''Set namespace / schema attributes for the root element.'''
+        self.xg.startDocument()
+        self.xg.startElement(self.root_element, {
+            "xmlns": "http://www.sitemaps.org/schemas/sitemap/0.9",
+            "xmlns:xsi": "http://www.w3.org/2001/XMLSchema-instance",
+            "xsi:schemaLocation":
+            "http://www.sitemaps.org/schemas/sitemap/0.9 "
+            "http://www.sitemaps.org/schemas/sitemap/0.9/sitemap.xsd"
+        })
+
+
+class IgnoreDuplicateUrls(object):
+    '''Ignore duplicated URLs.'''
+
+    def __init__(self):
+        self.processed = set()
+
+    def process_item(self, item, spider):
+        '''Drop the item if its URL was already found, pass it on otherwise.'''
+        if item['loc'] in self.processed:
+            raise scrapy.exceptions.DropItem("Duplicate URL found: %s."
+                                             % item['loc'])
+        else:
+            self.processed.add(item['loc'])
+            return item
+
+
+class ExportSitemap(object):
+    '''Write found URLs to a sitemap file, based on
+    http://doc.scrapy.org/en/latest/topics/exporters.html.
+    '''
+
+    def __init__(self):
+        self.files = {}
+        self.exporter = None
+
+    @classmethod
+    def from_crawler(cls, crawler):
+        pipeline = cls()
+        crawler.signals.connect(pipeline.spider_opened,
+                                scrapy.signals.spider_opened)
+        crawler.signals.connect(pipeline.spider_closed,
+                                scrapy.signals.spider_closed)
+        return pipeline
+
+    def spider_opened(self, spider):
+        output = open(os.path.join(os.getcwd(), 'sitemap_%s.xml'
+                      % spider.domain), 'w')
+        self.files[spider] = output
+        self.exporter = SitemapItemExporter(output, item_element='url',
+                                            root_element='urlset')
+        self.exporter.start_exporting()
+
+    def spider_closed(self, spider):
+        self.exporter.finish_exporting()
+        output = self.files.pop(spider)
+        output.close()
+        # Parse the finished file again and rewrite it with indentation.
+        tree = lxml.etree.parse(os.path.join(os.getcwd(), "sitemap_%s.xml"
+                                % spider.domain))
+        with open(os.path.join(os.getcwd(), "sitemap_%s.xml" % spider.domain),
+                  'w') as pretty:
+            pretty.write(lxml.etree.tostring(tree, pretty_print=True))
+
+    def process_item(self, item, spider):
+        self.exporter.export_item(item)
+        return item
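To illustrate the behaviour of ```IgnoreDuplicateUrls```: the first occurrence of a URL passes through unchanged, and any later occurrence raises ```DropItem```, so it never reaches ```ExportSitemap```. A minimal sketch, with a plain dict standing in for a real item and ```None``` for the spider (both hypothetical; ```process_item``` only reads ```item['loc']```):

```
from scrapy.exceptions import DropItem

from generator.pipelines import IgnoreDuplicateUrls

pipeline = IgnoreDuplicateUrls()
item = {'loc': 'http://docs.openstack.org/index.html'}

pipeline.process_item(item, None)      # first occurrence: returned unchanged
try:
    pipeline.process_item(item, None)  # same URL again
except DropItem:
    pass                               # duplicate dropped before export
```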

sitemap/generator/settings.py (+34, -0)

@@ -0,0 +1,34 @@
+# Licensed under the Apache License, Version 2.0 (the "License"); you may
+# not use this file except in compliance with the License. You may obtain
+# a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+# License for the specific language governing permissions and limitations
+# under the License.
+
+# Configuration variables used inside Scrapy to enable modules/pipelines
+# and to affect the behavior of several parts.
+#
+# All available configuration variables are documented at
+# http://doc.scrapy.org/en/latest/topics/settings.html.
+
+from scrapy import linkextractor
+
+BOT_NAME = 'sitemap'
+SPIDER_MODULES = ['generator.spiders']
+# Pipelines run in ascending order of their value, so duplicate URLs are
+# dropped before the exporter writes the sitemap file.
+ITEM_PIPELINES = {
+    'generator.pipelines.IgnoreDuplicateUrls': 100,
+    'generator.pipelines.ExportSitemap': 500,
+}
+CONCURRENT_REQUESTS = 32
+CONCURRENT_REQUESTS_PER_DOMAIN = 32
+CONCURRENT_REQUESTS_PER_IP = 32
+LOG_LEVEL = 'INFO'
+LOGGING_ENABLED = True
+RANDOMIZE_DOWNLOAD_DELAY = False
+TELNETCONSOLE_ENABLED = False
+# Crawl PDF files as well; Scrapy ignores them by default.
+linkextractor.IGNORED_EXTENSIONS.remove('pdf')
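Any of these settings can also be overridden per run with Scrapy's ```-s``` flag instead of editing this file, as the README already does for ```LOG_FILE```. For example (values chosen only for illustration):

```
$ scrapy crawl sitemap -s LOG_LEVEL=DEBUG -s CONCURRENT_REQUESTS=8
```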

sitemap/generator/spiders/__init__.py (+0, -0)


sitemap/generator/spiders/sitemap.py (+74, -0)

@@ -0,0 +1,74 @@
+# Licensed under the Apache License, Version 2.0 (the "License"); you may
+# not use this file except in compliance with the License. You may obtain
+# a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+# License for the specific language governing permissions and limitations
+# under the License.
+
+import posixpath
+import time
+import urlparse
+
+from scrapy.contrib.linkextractors import sgml
+from scrapy.contrib import spiders
+
+from generator import items
+
+
+class SitemapSpider(spiders.CrawlSpider):
+    name = 'sitemap'
+
+    rules = [
+        spiders.Rule(
+            sgml.SgmlLinkExtractor(
+                allow=[
+                    r'.*\.html',
+                    r'.*\.pdf',
+                    r'.*\.xml',
+                    r'.*\.txt',
+                    r'.*/',
+                ]
+            ),
+            follow=True, callback='parse_item'
+        )
+    ]
+
+    def __init__(self, domain='docs.openstack.org', *args, **kwargs):
+        super(SitemapSpider, self).__init__(*args, **kwargs)
+        self.domain = domain
+        self.allowed_domains = [domain]
+        self.start_urls = [
+            'http://%s/index.html' % domain,
+        ]
+
+    def parse_item(self, response):
+        item = items.SitemapItem()
+        item['priority'] = '0.5'
+        item['changefreq'] = 'daily'
+        item['loc'] = response.url
+
+        path = urlparse.urlsplit(response.url).path
+        filename = posixpath.basename(path)
+
+        # Index pages are rated more important than other pages.
+        if filename == 'index.html' or filename == '':
+            item['priority'] = '1.0'
+
+        # Documents for older releases are expected to change less often.
+        weekly = [
+            'icehouse',
+            'havana',
+            'grizzly'
+        ]
+
+        for entry in weekly:
+            if path.startswith("/%s" % entry):
+                item['changefreq'] = 'weekly'
+
+        # Convert the Last-Modified header into the W3C datetime format
+        # used by the sitemap protocol.
+        lastmod = time.strptime(response.headers['Last-Modified'],
+                                "%a, %d %b %Y %H:%M:%S %Z")
+        item['lastmod'] = time.strftime("%Y-%m-%dT%H:%M:%S%z", lastmod)
+        return item
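As a worked example of the rating logic in ```parse_item``` (hypothetical URL): an index page is rated with priority 1.0, and a page under one of the older release paths gets a weekly change frequency:

```
import posixpath
import urlparse  # Python 2, matching the spider above

url = 'http://docs.openstack.org/havana/index.html'  # hypothetical URL
path = urlparse.urlsplit(url).path                   # '/havana/index.html'
filename = posixpath.basename(path)                  # 'index.html'

priority = '1.0' if filename in ('index.html', '') else '0.5'
changefreq = 'weekly' if path.startswith('/havana') else 'daily'
print priority, changefreq                           # 1.0 weekly
```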

sitemap/scrapy.cfg (+5, -0)

@@ -0,0 +1,5 @@
+[settings]
+default = generator.settings
+
+[deploy]
+project = generator
