
script to generate the sitemap.xml for docs.openstack.org

This script crawls all available sites on http://docs.openstack.org and extracts
all URLs. Based on the URLs the script generates a sitemap for search engines
according to the protocol described at http://www.sitemaps.org/protocol.html.

Change-Id: Id7839d2048989da503d31e436455aea9bb4cdc1d
tags/0.16
Christian Berendt, 5 years ago
commit 31c25891c1

sitemap/README.md (+39 -0)

@@ -0,0 +1,39 @@
= Sitemap Generator

This script crawls all available sites on http://docs.openstack.org and extracts
all URLs. Based on the URLs the script generates a sitemap for search engines
according to the protocol described at http://www.sitemaps.org/protocol.html.

== Usage

To generate a new sitemap file, simply run the spider using the
following command. It will take several minutes to crawl all available sites
on http://docs.openstack.org. The result will be written to the file
```sitemap_docs.openstack.org.xml```.

```
$ scrapy crawl sitemap
```

It's also possible to crawl other sites by passing the spider argument ```domain```.

For example, to crawl http://developer.openstack.org, use the following command.
The result will be written to the file ```sitemap_developer.openstack.org.xml```.

```
$ scrapy crawl sitemap -a domain=developer.openstack.org
```

To write log messages into a file, append the parameter ```-s LOG_FILE=scrapy.log```, for example:
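
```
$ scrapy crawl sitemap -s LOG_FILE=scrapy.log
```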

== Dependencies

* Scrapy (https://pypi.python.org/pypi/Scrapy)

To install the needed modules, you can use pip or the package management system
included in your distribution. When using the package management system, the
package names may differ. When using pip, it may first be necessary to install
some development packages (see the example after the pip command below).

```
$ pip install scrapy
```
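
If the pip installation fails while building the lxml dependency of Scrapy,
development headers are usually missing. On Debian/Ubuntu-based systems, for
example, roughly the following packages are needed first (the exact package
names are an assumption and vary between distributions):

```
$ apt-get install python-dev libxml2-dev libxslt1-dev
```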

sitemap/generator/__init__.py (+0 -0)


sitemap/generator/items.py (+21 -0)

@@ -0,0 +1,21 @@
# Licensed under the Apache License, Version 2.0 (the "License"); you may
# not use this file except in compliance with the License. You may obtain
# a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
# License for the specific language governing permissions and limitations
# under the License.

import scrapy


class SitemapItem(scrapy.item.Item):
    '''Class to represent an item in the sitemap.

    The field names match the element names defined by the sitemap
    protocol (http://www.sitemaps.org/protocol.html).
    '''
    loc = scrapy.item.Field()         # URL of the page
    lastmod = scrapy.item.Field()     # date of last modification
    priority = scrapy.item.Field()    # priority relative to other pages
    changefreq = scrapy.item.Field()  # how frequently the page changes
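
For illustration, a minimal sketch of how such an item is populated
(hypothetical values; in this project the actual values are assigned by the
spider in sitemap/generator/spiders/sitemap.py):

```
from generator import items

item = items.SitemapItem()
item['loc'] = 'http://docs.openstack.org/index.html'  # page URL
item['lastmod'] = '2014-01-01T00:00:00+0000'          # last modification date
item['priority'] = '1.0'                              # index pages rank highest
item['changefreq'] = 'daily'                          # expected change rate
```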

sitemap/generator/pipelines.py (+89 -0)

@@ -0,0 +1,89 @@
# Licensed under the Apache License, Version 2.0 (the "License"); you may
# not use this file except in compliance with the License. You may obtain
# a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
# License for the specific language governing permissions and limitations
# under the License.

import os

import lxml.etree
import scrapy.exceptions
import scrapy.signals
from scrapy.contrib import exporter


class SitemapItemExporter(exporter.XmlItemExporter):
    '''XmlItemExporter with adjusted attributes for the root element.'''

    def start_exporting(self):
        '''Set namespace / schema attributes for the root element.'''
        self.xg.startDocument()
        self.xg.startElement(self.root_element, {
            "xmlns": "http://www.sitemaps.org/schemas/sitemap/0.9",
            "xmlns:xsi": "http://www.w3.org/2001/XMLSchema-instance",
            "xsi:schemaLocation":
                "http://www.sitemaps.org/schemas/sitemap/0.9 "
                "http://www.sitemaps.org/schemas/sitemap/0.9/sitemap.xsd"
        })


class IgnoreDuplicateUrls(object):
    '''Ignore duplicated URLs.'''

    def __init__(self):
        self.processed = set()

    def process_item(self, item, spider):
        '''Check if a URL was already found.'''
        if item['loc'] in self.processed:
            raise scrapy.exceptions.DropItem("Duplicate URL found: %s."
                                             % item['loc'])
        else:
            self.processed.add(item['loc'])
            return item


class ExportSitemap(object):
    '''Write found URLs to a sitemap file, based on
    http://doc.scrapy.org/en/latest/topics/exporters.html.
    '''

    def __init__(self):
        self.files = {}
        self.exporter = None

    @classmethod
    def from_crawler(cls, crawler):
        pipeline = cls()
        crawler.signals.connect(pipeline.spider_opened,
                                scrapy.signals.spider_opened)
        crawler.signals.connect(pipeline.spider_closed,
                                scrapy.signals.spider_closed)
        return pipeline

    def spider_opened(self, spider):
        output = open(os.path.join(os.getcwd(),
                                   'sitemap_%s.xml' % spider.domain), 'w')
        self.files[spider] = output
        self.exporter = SitemapItemExporter(output, item_element='url',
                                            root_element='urlset')
        self.exporter.start_exporting()

    def spider_closed(self, spider):
        self.exporter.finish_exporting()
        output = self.files.pop(spider)
        output.close()
        # Reparse the generated file to write it out pretty printed.
        tree = lxml.etree.parse(os.path.join(os.getcwd(),
                                             "sitemap_%s.xml" % spider.domain))
        with open(os.path.join(os.getcwd(), "sitemap_%s.xml" % spider.domain),
                  'w') as pretty:
            pretty.write(lxml.etree.tostring(tree, pretty_print=True))

    def process_item(self, item, spider):
        self.exporter.export_item(item)
        return item
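
A minimal sketch of how the duplicate filter behaves (hypothetical URL, run
from the project root, for illustration only):

```
from generator.pipelines import IgnoreDuplicateUrls

pipeline = IgnoreDuplicateUrls()
item = {'loc': 'http://docs.openstack.org/index.html'}
pipeline.process_item(item, spider=None)  # first call: item is returned
pipeline.process_item(item, spider=None)  # second call: raises DropItem
```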

sitemap/generator/settings.py (+34 -0)

@@ -0,0 +1,34 @@
# Licensed under the Apache License, Version 2.0 (the "License"); you may
# not use this file except in compliance with the License. You may obtain
# a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
# License for the specific language governing permissions and limitations
# under the License.

# Configuration variables used inside Scrapy to enable modules/pipelines
# and to affect the behavior of several parts.
#
# All available configuration variables are documented at
# http://doc.scrapy.org/en/latest/topics/settings.html.

from scrapy import linkextractor

BOT_NAME = 'sitemap'
SPIDER_MODULES = ['generator.spiders']
ITEM_PIPELINES = {
    # Lower values run first: drop duplicate URLs before exporting.
    'generator.pipelines.IgnoreDuplicateUrls': 100,
    'generator.pipelines.ExportSitemap': 500,
}
CONCURRENT_REQUESTS = 32
CONCURRENT_REQUESTS_PER_DOMAIN = 32
CONCURRENT_REQUESTS_PER_IP = 32
LOG_LEVEL = 'INFO'
LOG_ENABLED = True
RANDOMIZE_DOWNLOAD_DELAY = False
TELNETCONSOLE_ENABLED = False
linkextractor.IGNORED_EXTENSIONS.remove('pdf')  # also crawl PDF files
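
Any of these settings can be overridden for a single run with Scrapy's
```-s``` option, as the README's ```LOG_FILE``` example already shows. For
instance, to get more verbose output for one crawl:

```
$ scrapy crawl sitemap -s LOG_LEVEL=DEBUG
```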

sitemap/generator/spiders/__init__.py (+0 -0)


sitemap/generator/spiders/sitemap.py (+74 -0)

@@ -0,0 +1,74 @@
# Licensed under the Apache License, Version 2.0 (the "License"); you may
# not use this file except in compliance with the License. You may obtain
# a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
# License for the specific language governing permissions and limitations
# under the License.

import posixpath
import time
import urlparse

from scrapy.contrib.linkextractors import sgml
from scrapy.contrib import spiders

from generator import items


class SitemapSpider(spiders.CrawlSpider):
    name = 'sitemap'

    rules = [
        spiders.Rule(
            sgml.SgmlLinkExtractor(
                allow=[
                    r'.*\.html',
                    r'.*\.pdf',
                    r'.*\.xml',
                    r'.*\.txt',
                    r'.*/',
                ]
            ),
            follow=True, callback='parse_item'
        )
    ]

    def __init__(self, domain='docs.openstack.org', *args, **kwargs):
        super(SitemapSpider, self).__init__(*args, **kwargs)
        self.domain = domain
        self.allowed_domains = [domain]
        self.start_urls = [
            'http://%s/index.html' % domain,
        ]

    def parse_item(self, response):
        item = items.SitemapItem()
        item['priority'] = '0.5'
        item['changefreq'] = 'daily'
        item['loc'] = response.url

        path = urlparse.urlsplit(response.url).path
        filename = posixpath.basename(path)

        # Index pages get the highest priority.
        if filename == 'index.html' or filename == '':
            item['priority'] = '1.0'

        # Documentation for released versions changes less frequently.
        weekly = [
            'icehouse',
            'havana',
            'grizzly'
        ]

        for entry in weekly:
            if path.startswith("/%s" % entry):
                item['changefreq'] = 'weekly'

        # Not every response carries a Last-Modified header, so only set
        # lastmod when it is available.
        last_modified = response.headers.get('Last-Modified')
        if last_modified:
            lastmod = time.strptime(last_modified,
                                    "%a, %d %b %Y %H:%M:%S %Z")
            item['lastmod'] = time.strftime("%Y-%m-%dT%H:%M:%S%z", lastmod)
        return item
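
For illustration, crawling a page such as
http://docs.openstack.org/icehouse/index.html would produce an item along
these lines (the lastmod value is hypothetical): the index.html filename
raises the priority, and the /icehouse path prefix switches the change
frequency to weekly.

```
{
    'loc': 'http://docs.openstack.org/icehouse/index.html',
    'priority': '1.0',
    'changefreq': 'weekly',
    'lastmod': '2014-05-01T10:00:00+0000',
}
```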

sitemap/scrapy.cfg (+5 -0)

@@ -0,0 +1,5 @@
[settings]
default = generator.settings

[deploy]
project = generator
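
Taken together, the files added by this change form a standard Scrapy project
layout:

```
sitemap/
├── scrapy.cfg
└── generator/
    ├── __init__.py
    ├── items.py
    ├── pipelines.py
    ├── settings.py
    └── spiders/
        ├── __init__.py
        └── sitemap.py
```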
