Script to generate the sitemap.xml for docs.openstack.org

This script crawls all available sites on http://docs.openstack.org and extracts
all URLs. Based on the URLs, the script generates a sitemap for search engines
according to the protocol described at http://www.sitemaps.org/protocol.html.

Change-Id: Id7839d2048989da503d31e436455aea9bb4cdc1d
Christian Berendt 2014-05-27 20:29:58 +02:00
parent 403fc23549
commit 31c25891c1
8 changed files with 262 additions and 0 deletions

39
sitemap/README.md Normal file

@@ -0,0 +1,39 @@
= Sitemap Generator
This script crawls all available sites on http://docs.openstack.org and extracts
all URLs. Based on the URLs, the script generates a sitemap for search engines
according to the protocol described at http://www.sitemaps.org/protocol.html.
== Usage
To generate a new sitemap file, run the spider with the
following command. It will take several minutes to crawl all available sites
on http://docs.openstack.org. The result will be written to the file
```sitemap_docs.openstack.org.xml```.
```
$ scrapy crawl sitemap
```
It is also possible to crawl other sites by passing the spider argument ```domain```.
For example, to crawl http://developer.openstack.org, use the following command.
The result will be written to the file ```sitemap_developer.openstack.org.xml```.
```
$ scrapy crawl sitemap -a domain=developer.openstack.org
```
To write log messages to a file, append the option ```-s LOG_FILE=scrapy.log```.
== Dependencies
* Scrapy (https://pypi.python.org/pypi/Scrapy)
To install the required modules, use pip or the package manager included in
your distribution. Note that package names may differ between distributions.
When using pip, it may be necessary to install some development packages first.
```
$ pip install scrapy
```

0
sitemap/generator/__init__.py Normal file

21
sitemap/generator/items.py Normal file

@@ -0,0 +1,21 @@
# Licensed under the Apache License, Version 2.0 (the "License"); you may
# not use this file except in compliance with the License. You may obtain
# a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
# License for the specific language governing permissions and limitations
# under the License.
import scrapy


class SitemapItem(scrapy.item.Item):
    '''Class to represent an item in the sitemap.'''

    loc = scrapy.item.Field()
    lastmod = scrapy.item.Field()
    priority = scrapy.item.Field()
    changefreq = scrapy.item.Field()
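
For illustration, a minimal sketch (not part of the commit) of how such an item is populated; the URL and date are hypothetical:

```
# Illustrative only: shows how a SitemapItem maps onto one <url>
# element of the generated sitemap.
from generator import items

item = items.SitemapItem()
item['loc'] = 'http://docs.openstack.org/index.html'  # hypothetical URL
item['lastmod'] = '2014-05-27T18:29:58'               # hypothetical date
item['priority'] = '1.0'
item['changefreq'] = 'daily'
print(dict(item))
```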

89
sitemap/generator/pipelines.py Normal file

@@ -0,0 +1,89 @@
# Licensed under the Apache License, Version 2.0 (the "License"); you may
# not use this file except in compliance with the License. You may obtain
# a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
# License for the specific language governing permissions and limitations
# under the License.
import os

import lxml.etree
import scrapy
from scrapy.contrib import exporter


class SitemapItemExporter(exporter.XmlItemExporter):
    '''XmlItemExporter with adjusted attributes for the root element.'''

    def start_exporting(self):
        '''Set namespace / schema attributes for the root element.'''
        self.xg.startDocument()
        self.xg.startElement(self.root_element, {
            "xmlns": "http://www.sitemaps.org/schemas/sitemap/0.9",
            "xmlns:xsi": "http://www.w3.org/2001/XMLSchema-instance",
            "xsi:schemaLocation":
            "http://www.sitemaps.org/schemas/sitemap/0.9 "
            "http://www.sitemaps.org/schemas/sitemap/0.9/sitemap.xsd"
        })


class IgnoreDuplicateUrls(object):
    '''Ignore duplicate URLs.'''

    def __init__(self):
        self.processed = set()

    def process_item(self, item, spider):
        '''Check if a URL was already found.'''
        if item['loc'] in self.processed:
            raise scrapy.exceptions.DropItem("Duplicate URL found: %s."
                                             % item['loc'])
        else:
            self.processed.add(item['loc'])
            return item
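
A minimal usage sketch of this pipeline (not part of the commit; the URL is hypothetical): the first occurrence of a URL passes through, the second raises DropItem.

```
# Illustrative sketch, assuming the IgnoreDuplicateUrls class above.
from scrapy.exceptions import DropItem

pipeline = IgnoreDuplicateUrls()
item = {'loc': 'http://docs.openstack.org/index.html'}  # hypothetical URL

pipeline.process_item(item, spider=None)      # first time: item is returned
try:
    pipeline.process_item(item, spider=None)  # second time: dropped
except DropItem as error:
    print(error)  # Duplicate URL found: http://docs.openstack.org/index.html.
```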


class ExportSitemap(object):
    '''Write found URLs to a sitemap file, based on
    http://doc.scrapy.org/en/latest/topics/exporters.html.
    '''

    def __init__(self):
        self.files = {}
        self.exporter = None

    @classmethod
    def from_crawler(cls, crawler):
        pipeline = cls()
        crawler.signals.connect(pipeline.spider_opened,
                                scrapy.signals.spider_opened)
        crawler.signals.connect(pipeline.spider_closed,
                                scrapy.signals.spider_closed)
        return pipeline

    def spider_opened(self, spider):
        output = open(os.path.join(os.getcwd(), 'sitemap_%s.xml'
                                   % spider.domain), 'w')
        self.files[spider] = output
        self.exporter = SitemapItemExporter(output, item_element='url',
                                            root_element='urlset')
        self.exporter.start_exporting()

    def spider_closed(self, spider):
        self.exporter.finish_exporting()
        output = self.files.pop(spider)
        output.close()
        # Re-read the generated file and write it back pretty-printed.
        tree = lxml.etree.parse(os.path.join(os.getcwd(), "sitemap_%s.xml"
                                             % spider.domain))
        with open(os.path.join(os.getcwd(), "sitemap_%s.xml" % spider.domain),
                  'w') as pretty:
            pretty.write(lxml.etree.tostring(tree, pretty_print=True))

    def process_item(self, item, spider):
        self.exporter.export_item(item)
        return item
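
Taken together, the exporter, the duplicate filter, and the pretty-printing step produce a file of roughly the following shape (illustrative excerpt; URL and date are hypothetical, and the element order may vary):

```
<?xml version="1.0" encoding="utf-8"?>
<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9"
    xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
    xsi:schemaLocation="http://www.sitemaps.org/schemas/sitemap/0.9 http://www.sitemaps.org/schemas/sitemap/0.9/sitemap.xsd">
  <url>
    <loc>http://docs.openstack.org/index.html</loc>
    <lastmod>2014-05-27T18:29:58</lastmod>
    <priority>1.0</priority>
    <changefreq>daily</changefreq>
  </url>
</urlset>
```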

34
sitemap/generator/settings.py Normal file

@@ -0,0 +1,34 @@
# Licensed under the Apache License, Version 2.0 (the "License"); you may
# not use this file except in compliance with the License. You may obtain
# a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
# License for the specific language governing permissions and limitations
# under the License.
# Configuration variables used inside Scrapy to enable modules/pipelines
# and to affect the behavior of several parts.
#
# All available configuration variables are documented at
# http://doc.scrapy.org/en/latest/topics/settings.html.
from scrapy import linkextractor

BOT_NAME = 'sitemap'

SPIDER_MODULES = ['generator.spiders']

# Pipelines with lower values run first, so duplicate URLs are dropped
# before the remaining items reach the exporter.
ITEM_PIPELINES = {
    'generator.pipelines.IgnoreDuplicateUrls': 100,
    'generator.pipelines.ExportSitemap': 500,
}

CONCURRENT_REQUESTS = 32
CONCURRENT_REQUESTS_PER_DOMAIN = 32
CONCURRENT_REQUESTS_PER_IP = 32

LOG_LEVEL = 'INFO'
LOG_ENABLED = True

RANDOMIZE_DOWNLOAD_DELAY = False
TELNETCONSOLE_ENABLED = False

# By default Scrapy's link extractors ignore links to PDF files; remove
# 'pdf' from the list so that PDF documents show up in the sitemap.
linkextractor.IGNORED_EXTENSIONS.remove('pdf')

0
sitemap/generator/spiders/__init__.py Normal file

74
sitemap/generator/spiders/sitemap.py Normal file

@@ -0,0 +1,74 @@
# Licensed under the Apache License, Version 2.0 (the "License"); you may
# not use this file except in compliance with the License. You may obtain
# a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
# License for the specific language governing permissions and limitations
# under the License.
import posixpath
import time
import urlparse

from scrapy.contrib.linkextractors import sgml
from scrapy.contrib import spiders

from generator import items


class SitemapSpider(spiders.CrawlSpider):
    name = 'sitemap'

    rules = [
        spiders.Rule(
            sgml.SgmlLinkExtractor(
                allow=[
                    r'.*\.html',
                    r'.*\.pdf',
                    r'.*\.xml',
                    r'.*\.txt',
                    r'.*/',
                ]
            ),
            follow=True, callback='parse_item'
        )
    ]

    def __init__(self, domain='docs.openstack.org', *args, **kwargs):
        super(SitemapSpider, self).__init__(*args, **kwargs)
        self.domain = domain
        self.allowed_domains = [domain]
        self.start_urls = [
            'http://%s/index.html' % domain,
        ]

    def parse_item(self, response):
        item = items.SitemapItem()
        item['priority'] = '0.5'
        item['changefreq'] = 'daily'
        item['loc'] = response.url

        path = urlparse.urlsplit(response.url).path
        filename = posixpath.basename(path)

        # Index pages are the most important ones.
        if filename == 'index.html' or filename == '':
            item['priority'] = '1.0'

        # Release-specific documentation changes less frequently.
        weekly = [
            'icehouse',
            'havana',
            'grizzly'
        ]

        for entry in weekly:
            if path.startswith("/%s" % entry):
                item['changefreq'] = 'weekly'

        lastmod = time.strptime(response.headers['Last-Modified'],
                                "%a, %d %b %Y %H:%M:%S %Z")
        item['lastmod'] = time.strftime("%Y-%m-%dT%H:%M:%S%z", lastmod)
        return item
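
A worked example of what parse_item computes (all values hypothetical): for a response from http://docs.openstack.org/havana/install-guide.html, the path starts with /havana, so changefreq becomes 'weekly'; the filename is not index.html, so priority stays at '0.5'. The Last-Modified header is converted as follows:

```
# Standalone illustration of the lastmod conversion above
# (the header value is hypothetical).
import time

header = 'Tue, 27 May 2014 18:29:58 GMT'
lastmod = time.strptime(header, '%a, %d %b %Y %H:%M:%S %Z')
print(time.strftime('%Y-%m-%dT%H:%M:%S%z', lastmod))
# -> 2014-05-27T18:29:58 plus a platform-dependent %z offset
```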

5
sitemap/scrapy.cfg Normal file

@@ -0,0 +1,5 @@
[settings]
default = generator.settings

[deploy]
project = generator