From 31c25891c1622a5ac604127c7fc84c8c9028a154 Mon Sep 17 00:00:00 2001
From: Christian Berendt
Date: Tue, 27 May 2014 20:29:58 +0200
Subject: [PATCH] script to generate the sitemap.xml for docs.openstack.org

This script crawls all available pages on http://docs.openstack.org and
extracts all URLs. Based on the URLs, the script generates a sitemap for
search engines according to the protocol described at
http://www.sitemaps.org/protocol.html.

Change-Id: Id7839d2048989da503d31e436455aea9bb4cdc1d
---
 sitemap/README.md                     | 39 ++++++++++++
 sitemap/generator/__init__.py         |  0
 sitemap/generator/items.py            | 21 +++++++
 sitemap/generator/pipelines.py        | 89 +++++++++++++++++++++++++++
 sitemap/generator/settings.py         | 34 ++++++++++
 sitemap/generator/spiders/__init__.py |  0
 sitemap/generator/spiders/sitemap.py  | 74 ++++++++++++++++++++++
 sitemap/scrapy.cfg                    |  5 ++
 8 files changed, 262 insertions(+)
 create mode 100644 sitemap/README.md
 create mode 100644 sitemap/generator/__init__.py
 create mode 100644 sitemap/generator/items.py
 create mode 100644 sitemap/generator/pipelines.py
 create mode 100644 sitemap/generator/settings.py
 create mode 100644 sitemap/generator/spiders/__init__.py
 create mode 100644 sitemap/generator/spiders/sitemap.py
 create mode 100644 sitemap/scrapy.cfg

diff --git a/sitemap/README.md b/sitemap/README.md
new file mode 100644
index 00000000..a95f0bad
--- /dev/null
+++ b/sitemap/README.md
@@ -0,0 +1,39 @@
+= Sitemap Generator
+
+This script crawls all available pages on http://docs.openstack.org and extracts
+all URLs. Based on the URLs, the script generates a sitemap for search engines
+according to the protocol described at http://www.sitemaps.org/protocol.html.
+
+== Usage
+
+To generate a new sitemap file, simply run the spider using the
+following command. It will take several minutes to crawl all available pages
+on http://docs.openstack.org. The result will be written to the file
+```sitemap_docs.openstack.org.xml```.
+
+```
+$ scrapy crawl sitemap
+```
+
+It's also possible to crawl other sites using the spider argument ```domain```.
+
+For example, to crawl http://developer.openstack.org use the following command.
+The result will be written to the file ```sitemap_developer.openstack.org.xml```.
+
+```
+$ scrapy crawl sitemap -a domain=developer.openstack.org
+```
+
+To write log messages into a file, append the parameter ```-s LOG_FILE=scrapy.log```.
+
+== Dependencies
+
+* Scrapy (https://pypi.python.org/pypi/Scrapy)
+
+To install the needed modules you can use pip or the package management system
+included in your distribution (package names may differ between distributions).
+When using pip, you may first need to install some development packages.
+
+```
+$ pip install scrapy
+```
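
For illustration (not part of the patch): a minimal Python sketch of the kind of ```<urlset>``` document the sitemap protocol above describes, built with lxml, which the export pipeline in this patch also uses. All values are hypothetical.

```
import lxml.etree

# Build a one-entry <urlset> in the sitemap namespace used by this patch.
urlset = lxml.etree.Element(
    'urlset', nsmap={None: 'http://www.sitemaps.org/schemas/sitemap/0.9'})
url = lxml.etree.SubElement(urlset, 'url')
lxml.etree.SubElement(url, 'loc').text = 'http://docs.openstack.org/index.html'
lxml.etree.SubElement(url, 'lastmod').text = '2014-05-27T20:29:58'  # hypothetical
lxml.etree.SubElement(url, 'changefreq').text = 'daily'
lxml.etree.SubElement(url, 'priority').text = '1.0'
print(lxml.etree.tostring(urlset, pretty_print=True))
```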
diff --git a/sitemap/generator/__init__.py b/sitemap/generator/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/sitemap/generator/items.py b/sitemap/generator/items.py
new file mode 100644
index 00000000..d504a5f1
--- /dev/null
+++ b/sitemap/generator/items.py
@@ -0,0 +1,21 @@
+# Licensed under the Apache License, Version 2.0 (the "License"); you may
+# not use this file except in compliance with the License. You may obtain
+# a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+# License for the specific language governing permissions and limitations
+# under the License.
+
+import scrapy
+
+
+class SitemapItem(scrapy.item.Item):
+    '''Class to represent an item in the sitemap.'''
+    loc = scrapy.item.Field()
+    lastmod = scrapy.item.Field()
+    priority = scrapy.item.Field()
+    changefreq = scrapy.item.Field()
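
For reference (not part of the patch): a short sketch of how the spider later in this patch fills such an item. Each field maps to the sitemap XML element of the same name; the URL and timestamp shown here are hypothetical.

```
from generator import items  # assumes the generator package is on the path

item = items.SitemapItem()
item['loc'] = 'http://docs.openstack.org/index.html'  # becomes <loc>
item['lastmod'] = '2014-05-27T20:29:58'               # becomes <lastmod>
item['changefreq'] = 'daily'                          # becomes <changefreq>
item['priority'] = '1.0'                              # becomes <priority>
```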
diff --git a/sitemap/generator/pipelines.py b/sitemap/generator/pipelines.py
new file mode 100644
index 00000000..0bf68a6a
--- /dev/null
+++ b/sitemap/generator/pipelines.py
@@ -0,0 +1,89 @@
+# Licensed under the Apache License, Version 2.0 (the "License"); you may
+# not use this file except in compliance with the License. You may obtain
+# a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+# License for the specific language governing permissions and limitations
+# under the License.
+
+import os
+
+import lxml.etree
+import scrapy
+from scrapy.contrib import exporter
+
+
+class SitemapItemExporter(exporter.XmlItemExporter):
+    '''XmlItemExporter with adjusted attributes for the root element.'''
+
+    def start_exporting(self):
+        '''Set namespace / schema attributes for the root element.'''
+        self.xg.startDocument()
+        self.xg.startElement(self.root_element, {
+            "xmlns": "http://www.sitemaps.org/schemas/sitemap/0.9",
+            "xmlns:xsi": "http://www.w3.org/2001/XMLSchema-instance",
+            "xsi:schemaLocation":
+            "http://www.sitemaps.org/schemas/sitemap/0.9 "
+            "http://www.sitemaps.org/schemas/sitemap/0.9/sitemap.xsd"
+        })
+
+
+class IgnoreDuplicateUrls(object):
+    '''Ignore duplicated URLs.'''
+
+    def __init__(self):
+        self.processed = set()
+
+    def process_item(self, item, spider):
+        '''Check if a URL was already found.'''
+        if item['loc'] in self.processed:
+            raise scrapy.exceptions.DropItem("Duplicate URL found: %s."
+                                             % item['loc'])
+        else:
+            self.processed.add(item['loc'])
+            return item
+
+
+class ExportSitemap(object):
+    '''Write found URLs to a sitemap file, based on
+    http://doc.scrapy.org/en/latest/topics/exporters.html.
+    '''
+
+    def __init__(self):
+        self.files = {}
+        self.exporter = None
+
+    @classmethod
+    def from_crawler(cls, crawler):
+        pipeline = cls()
+        crawler.signals.connect(pipeline.spider_opened,
+                                scrapy.signals.spider_opened)
+        crawler.signals.connect(pipeline.spider_closed,
+                                scrapy.signals.spider_closed)
+        return pipeline
+
+    def spider_opened(self, spider):
+        output = open(os.path.join(os.getcwd(), 'sitemap_%s.xml'
+                                   % spider.domain), 'w')
+        self.files[spider] = output
+        self.exporter = SitemapItemExporter(output, item_element='url',
+                                            root_element='urlset')
+        self.exporter.start_exporting()
+
+    def spider_closed(self, spider):
+        self.exporter.finish_exporting()
+        output = self.files.pop(spider)
+        output.close()
+        tree = lxml.etree.parse(os.path.join(os.getcwd(), "sitemap_%s.xml"
+                                             % spider.domain))
+        with open(os.path.join(os.getcwd(), "sitemap_%s.xml" % spider.domain),
+                  'w') as pretty:
+            pretty.write(lxml.etree.tostring(tree, pretty_print=True))
+
+    def process_item(self, item, spider):
+        self.exporter.export_item(item)
+        return item
diff --git a/sitemap/generator/settings.py b/sitemap/generator/settings.py
new file mode 100644
index 00000000..41269628
--- /dev/null
+++ b/sitemap/generator/settings.py
@@ -0,0 +1,34 @@
+# Licensed under the Apache License, Version 2.0 (the "License"); you may
+# not use this file except in compliance with the License. You may obtain
+# a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+# License for the specific language governing permissions and limitations
+# under the License.
+
+# Configuration variables used inside Scrapy to enable modules/pipelines
+# and to affect the behavior of several components.
+#
+# All available configuration variables are documented at
+# http://doc.scrapy.org/en/latest/topics/settings.html.
+
+from scrapy import linkextractor
+
+BOT_NAME = 'sitemap'
+SPIDER_MODULES = ['generator.spiders']
+ITEM_PIPELINES = {
+    'generator.pipelines.IgnoreDuplicateUrls': 100,  # runs first (lower value)
+    'generator.pipelines.ExportSitemap': 500,
+}
+CONCURRENT_REQUESTS = 32
+CONCURRENT_REQUESTS_PER_DOMAIN = 32
+CONCURRENT_REQUESTS_PER_IP = 32
+LOG_LEVEL = 'INFO'
+LOG_ENABLED = True
+RANDOMIZE_DOWNLOAD_DELAY = False
+TELNETCONSOLE_ENABLED = False
+linkextractor.IGNORED_EXTENSIONS.remove('pdf')
diff --git a/sitemap/generator/spiders/__init__.py b/sitemap/generator/spiders/__init__.py
new file mode 100644
index 00000000..e69de29b
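
A note on the ITEM_PIPELINES values above: Scrapy passes each item through the configured pipelines in ascending order of those values, so IgnoreDuplicateUrls needs the lower number for duplicates to be dropped before the exporter writes them out. A minimal sketch of that ordering:

```
ITEM_PIPELINES = {
    'generator.pipelines.IgnoreDuplicateUrls': 100,
    'generator.pipelines.ExportSitemap': 500,
}
# Pipelines run from the lowest value to the highest, so duplicate
# URLs are dropped before the exporter ever sees an item.
print(sorted(ITEM_PIPELINES, key=ITEM_PIPELINES.get))
```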
diff --git a/sitemap/generator/spiders/sitemap.py b/sitemap/generator/spiders/sitemap.py
new file mode 100644
index 00000000..5a8fd83c
--- /dev/null
+++ b/sitemap/generator/spiders/sitemap.py
@@ -0,0 +1,74 @@
+# Licensed under the Apache License, Version 2.0 (the "License"); you may
+# not use this file except in compliance with the License. You may obtain
+# a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+# License for the specific language governing permissions and limitations
+# under the License.
+
+import posixpath
+import time
+import urlparse
+
+from scrapy.contrib.linkextractors import sgml
+from scrapy.contrib import spiders
+
+from generator import items
+
+
+class SitemapSpider(spiders.CrawlSpider):
+    name = 'sitemap'
+
+    rules = [
+        spiders.Rule(
+            sgml.SgmlLinkExtractor(
+                allow=[
+                    r'.*\.html',
+                    r'.*\.pdf',
+                    r'.*\.xml',
+                    r'.*\.txt',
+                    r'.*/',
+                ]
+            ),
+            follow=True, callback='parse_item'
+        )
+    ]
+
+    def __init__(self, domain='docs.openstack.org', *args, **kwargs):
+        super(SitemapSpider, self).__init__(*args, **kwargs)
+        self.domain = domain
+        self.allowed_domains = [domain]
+        self.start_urls = [
+            'http://%s/index.html' % domain,
+        ]
+
+    def parse_item(self, response):
+        item = items.SitemapItem()
+        item['priority'] = '0.5'
+        item['changefreq'] = 'daily'
+        item['loc'] = response.url
+
+        path = urlparse.urlsplit(response.url).path
+        filename = posixpath.basename(path)
+
+        if filename == 'index.html' or filename == '':
+            item['priority'] = '1.0'
+
+        weekly = [
+            'icehouse',
+            'havana',
+            'grizzly'
+        ]
+
+        for entry in weekly:
+            if path.startswith("/%s" % entry):
+                item['changefreq'] = 'weekly'
+
+        lastmod = time.strptime(response.headers['Last-Modified'],
+                                "%a, %d %b %Y %H:%M:%S %Z")
+        item['lastmod'] = time.strftime("%Y-%m-%dT%H:%M:%S%z", lastmod)
+        return item
diff --git a/sitemap/scrapy.cfg b/sitemap/scrapy.cfg
new file mode 100644
index 00000000..167b9288
--- /dev/null
+++ b/sitemap/scrapy.cfg
@@ -0,0 +1,5 @@
+[settings]
+default = generator.settings
+
+[deploy]
+project = generator
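
For reference (not part of the patch): a sketch of the Last-Modified conversion that parse_item() performs, using a hypothetical header value.

```
import time

header = 'Tue, 27 May 2014 20:29:58 GMT'  # hypothetical header value
lastmod = time.strptime(header, '%a, %d %b %Y %H:%M:%S %Z')
# Note: %z can render as an empty string for a bare struct_time on
# some platforms, leaving the timestamp without a UTC offset.
print(time.strftime('%Y-%m-%dT%H:%M:%S%z', lastmod))
```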