Script to generate the sitemap.xml for docs.openstack.org

This script crawls all available sites on http://docs.openstack.org and extracts
all URLs. Based on the URLs, the script generates a sitemap for search engines
according to the protocol described at http://www.sitemaps.org/protocol.html.

Change-Id: Id7839d2048989da503d31e436455aea9bb4cdc1d
Christian Berendt 2014-05-27 20:29:58 +02:00
parent 403fc23549
commit 31c25891c1
8 changed files with 262 additions and 0 deletions

39
sitemap/README.md Normal file

@@ -0,0 +1,39 @@
= Sitemap Generator
This script crawls all available sites on http://docs.openstack.org and extracts
all URLs. Based on the URLs, the script generates a sitemap for search engines
according to the protocol described at http://www.sitemaps.org/protocol.html.
== Usage
To generate a new sitemap file, run the spider with the
following command. It will take several minutes to crawl all available sites
on http://docs.openstack.org. The result will be written to the file
```sitemap_docs.openstack.org.xml```.
```
$ scrapy crawl sitemap
```
It is also possible to crawl other sites by passing the spider argument ```domain```.
For example, to crawl http://developer.openstack.org, use the following command.
The result will be written to the file ```sitemap_developer.openstack.org.xml```.
```
$ scrapy crawl sitemap -a domain=developer.openstack.org
```
To write log messages to a file, append the option ```-s LOG_FILE=scrapy.log```.
== Dependencies
* Scrapy (https://pypi.python.org/pypi/Scrapy)
To install the required modules, use pip or the package manager included in
your distribution. Note that package names may differ between distributions.
When using pip, it may be necessary to install some development packages first.
```
$ pip install scrapy
```

0
sitemap/generator/__init__.py Normal file

21
sitemap/generator/items.py Normal file

@@ -0,0 +1,21 @@
# Licensed under the Apache License, Version 2.0 (the "License"); you may
# not use this file except in compliance with the License. You may obtain
# a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
# License for the specific language governing permissions and limitations
# under the License.
import scrapy


class SitemapItem(scrapy.item.Item):
    '''Class to represent an item in the sitemap.'''

    loc = scrapy.item.Field()
    lastmod = scrapy.item.Field()
    priority = scrapy.item.Field()
    changefreq = scrapy.item.Field()
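
For illustration, a minimal sketch (not part of the commit) of how such an item is populated; the URL and date are hypothetical:

```
# Illustrative only: shows how a SitemapItem maps onto one <url>
# element of the generated sitemap.
from generator import items

item = items.SitemapItem()
item['loc'] = 'http://docs.openstack.org/index.html'  # hypothetical URL
item['lastmod'] = '2014-05-27T18:29:58'               # hypothetical date
item['priority'] = '1.0'
item['changefreq'] = 'daily'
print(dict(item))
```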

89
sitemap/generator/pipelines.py Normal file

@@ -0,0 +1,89 @@
# Licensed under the Apache License, Version 2.0 (the "License"); you may
# not use this file except in compliance with the License. You may obtain
# a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
# License for the specific language governing permissions and limitations
# under the License.
import os

import lxml.etree
import scrapy
from scrapy.contrib import exporter


class SitemapItemExporter(exporter.XmlItemExporter):
    '''XmlItemExporter with adjusted attributes for the root element.'''

    def start_exporting(self):
        '''Set namespace / schema attributes for the root element.'''
        self.xg.startDocument()
        self.xg.startElement(self.root_element, {
            "xmlns": "http://www.sitemaps.org/schemas/sitemap/0.9",
            "xmlns:xsi": "http://www.w3.org/2001/XMLSchema-instance",
            "xsi:schemaLocation":
            "http://www.sitemaps.org/schemas/sitemap/0.9 "
            "http://www.sitemaps.org/schemas/sitemap/0.9/sitemap.xsd"
        })


class IgnoreDuplicateUrls(object):
    '''Ignore duplicate URLs.'''

    def __init__(self):
        self.processed = set()

    def process_item(self, item, spider):
        '''Check if a URL was already found.'''
        if item['loc'] in self.processed:
            raise scrapy.exceptions.DropItem("Duplicate URL found: %s."
                                             % item['loc'])
        else:
            self.processed.add(item['loc'])
            return item
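
A minimal usage sketch of this pipeline (not part of the commit; the URL is hypothetical): the first occurrence of a URL passes through, the second raises DropItem.

```
# Illustrative sketch, assuming the IgnoreDuplicateUrls class above.
from scrapy.exceptions import DropItem

pipeline = IgnoreDuplicateUrls()
item = {'loc': 'http://docs.openstack.org/index.html'}  # hypothetical URL

pipeline.process_item(item, spider=None)      # first time: item is returned
try:
    pipeline.process_item(item, spider=None)  # second time: dropped
except DropItem as error:
    print(error)  # Duplicate URL found: http://docs.openstack.org/index.html.
```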


class ExportSitemap(object):
    '''Write found URLs to a sitemap file, based on
    http://doc.scrapy.org/en/latest/topics/exporters.html.
    '''

    def __init__(self):
        self.files = {}
        self.exporter = None

    @classmethod
    def from_crawler(cls, crawler):
        pipeline = cls()
        crawler.signals.connect(pipeline.spider_opened,
                                scrapy.signals.spider_opened)
        crawler.signals.connect(pipeline.spider_closed,
                                scrapy.signals.spider_closed)
        return pipeline

    def spider_opened(self, spider):
        output = open(os.path.join(os.getcwd(), 'sitemap_%s.xml'
                                   % spider.domain), 'w')
        self.files[spider] = output
        self.exporter = SitemapItemExporter(output, item_element='url',
                                            root_element='urlset')
        self.exporter.start_exporting()

    def spider_closed(self, spider):
        self.exporter.finish_exporting()
        output = self.files.pop(spider)
        output.close()
        # Re-read the generated file and write it back pretty-printed.
        tree = lxml.etree.parse(os.path.join(os.getcwd(), "sitemap_%s.xml"
                                             % spider.domain))
        with open(os.path.join(os.getcwd(), "sitemap_%s.xml" % spider.domain),
                  'w') as pretty:
            pretty.write(lxml.etree.tostring(tree, pretty_print=True))

    def process_item(self, item, spider):
        self.exporter.export_item(item)
        return item
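
Taken together, the exporter, the duplicate filter, and the pretty-printing step produce a file of roughly the following shape (illustrative excerpt; URL and date are hypothetical, and the element order may vary):

```
<?xml version="1.0" encoding="utf-8"?>
<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9"
    xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
    xsi:schemaLocation="http://www.sitemaps.org/schemas/sitemap/0.9 http://www.sitemaps.org/schemas/sitemap/0.9/sitemap.xsd">
  <url>
    <loc>http://docs.openstack.org/index.html</loc>
    <lastmod>2014-05-27T18:29:58</lastmod>
    <priority>1.0</priority>
    <changefreq>daily</changefreq>
  </url>
</urlset>
```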

34
sitemap/generator/settings.py Normal file

@@ -0,0 +1,34 @@
# Licensed under the Apache License, Version 2.0 (the "License"); you may
# not use this file except in compliance with the License. You may obtain
# a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
# License for the specific language governing permissions and limitations
# under the License.
# Configuration variables used inside Scrapy to enable modules/pipelines
# and to affect the behavior of several parts.
#
# All available configuration variables are documented at
# http://doc.scrapy.org/en/latest/topics/settings.html.
from scrapy import linkextractor

BOT_NAME = 'sitemap'

SPIDER_MODULES = ['generator.spiders']

# Pipelines with lower values run first, so duplicate URLs are dropped
# before the remaining items reach the exporter.
ITEM_PIPELINES = {
    'generator.pipelines.IgnoreDuplicateUrls': 100,
    'generator.pipelines.ExportSitemap': 500,
}

CONCURRENT_REQUESTS = 32
CONCURRENT_REQUESTS_PER_DOMAIN = 32
CONCURRENT_REQUESTS_PER_IP = 32

LOG_LEVEL = 'INFO'
LOG_ENABLED = True

RANDOMIZE_DOWNLOAD_DELAY = False
TELNETCONSOLE_ENABLED = False

# By default Scrapy's link extractors ignore links to PDF files; remove
# 'pdf' from the list so that PDF documents show up in the sitemap.
linkextractor.IGNORED_EXTENSIONS.remove('pdf')

0
sitemap/generator/spiders/__init__.py Normal file

74
sitemap/generator/spiders/sitemap.py Normal file

@@ -0,0 +1,74 @@
# Licensed under the Apache License, Version 2.0 (the "License"); you may
# not use this file except in compliance with the License. You may obtain
# a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
# License for the specific language governing permissions and limitations
# under the License.
import posixpath
import time
import urlparse

from scrapy.contrib.linkextractors import sgml
from scrapy.contrib import spiders

from generator import items


class SitemapSpider(spiders.CrawlSpider):
    name = 'sitemap'

    rules = [
        spiders.Rule(
            sgml.SgmlLinkExtractor(
                allow=[
                    r'.*\.html',
                    r'.*\.pdf',
                    r'.*\.xml',
                    r'.*\.txt',
                    r'.*/',
                ]
            ),
            follow=True, callback='parse_item'
        )
    ]

    def __init__(self, domain='docs.openstack.org', *args, **kwargs):
        super(SitemapSpider, self).__init__(*args, **kwargs)
        self.domain = domain
        self.allowed_domains = [domain]
        self.start_urls = [
            'http://%s/index.html' % domain,
        ]

    def parse_item(self, response):
        item = items.SitemapItem()
        item['priority'] = '0.5'
        item['changefreq'] = 'daily'
        item['loc'] = response.url

        path = urlparse.urlsplit(response.url).path
        filename = posixpath.basename(path)

        # Index pages are the most important ones.
        if filename == 'index.html' or filename == '':
            item['priority'] = '1.0'

        # Release-specific documentation changes less frequently.
        weekly = [
            'icehouse',
            'havana',
            'grizzly'
        ]

        for entry in weekly:
            if path.startswith("/%s" % entry):
                item['changefreq'] = 'weekly'

        lastmod = time.strptime(response.headers['Last-Modified'],
                                "%a, %d %b %Y %H:%M:%S %Z")
        item['lastmod'] = time.strftime("%Y-%m-%dT%H:%M:%S%z", lastmod)
        return item
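
A worked example of what parse_item computes (all values hypothetical): for a response from http://docs.openstack.org/havana/install-guide.html, the path starts with /havana, so changefreq becomes 'weekly'; the filename is not index.html, so priority stays at '0.5'. The Last-Modified header is converted as follows:

```
# Standalone illustration of the lastmod conversion above
# (the header value is hypothetical).
import time

header = 'Tue, 27 May 2014 18:29:58 GMT'
lastmod = time.strptime(header, '%a, %d %b %Y %H:%M:%S %Z')
print(time.strftime('%Y-%m-%dT%H:%M:%S%z', lastmod))
# -> 2014-05-27T18:29:58 plus a platform-dependent %z offset
```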

5
sitemap/scrapy.cfg Normal file

@@ -0,0 +1,5 @@
[settings]
default = generator.settings

[deploy]
project = generator