Commit 31c25891c1 by Christian Berendt: script to generate the sitemap.xml for docs.openstack.org
This script crawls all available pages on http://docs.openstack.org and extracts
all URLs. Based on these URLs, the script generates a sitemap for search engines
according to the protocol described at http://www.sitemaps.org/protocol.html.

Change-Id: Id7839d2048989da503d31e436455aea9bb4cdc1d
2014-05-29 01:29:18 +02:00
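
For context, a sitemap under the sitemaps.org protocol referenced above is an XML <urlset> containing one <url> element per page. The following is a minimal sketch of writing such a file with the Python standard library; it is illustrative only, and the write_sitemap helper is hypothetical, not part of the generator's actual code:

import xml.etree.ElementTree as ET

def write_sitemap(urls, path='sitemap.xml'):
    # The namespace is required by http://www.sitemaps.org/protocol.html.
    urlset = ET.Element(
        'urlset', xmlns='http://www.sitemaps.org/schemas/sitemap/0.9')
    for loc in urls:
        # One <url><loc>...</loc></url> entry per crawled page.
        ET.SubElement(ET.SubElement(urlset, 'url'), 'loc').text = loc
    ET.ElementTree(urlset).write(path, encoding='UTF-8',
                                 xml_declaration=True)

write_sitemap(['http://docs.openstack.org/'])

The generator itself delegates crawling and export to Scrapy; the settings file that configures it follows.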


# Licensed under the Apache License, Version 2.0 (the "License"); you may
# not use this file except in compliance with the License. You may obtain
# a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
# License for the specific language governing permissions and limitations
# under the License.

# Configuration variables used inside Scrapy to enable modules/pipelines
# and to affect the behavior of several parts.
#
# All available configuration variables are documented at
# http://doc.scrapy.org/en/latest/topics/settings.html.

from scrapy import linkextractor

BOT_NAME = 'sitemap'
SPIDER_MODULES = ['generator.spiders']
# Item pipelines; Scrapy processes items in ascending order of the value.
ITEM_PIPELINES = {
    # Drop URLs that have already been seen.
    'generator.pipelines.IgnoreDuplicateUrls': 500,
    # Write the collected URLs to the sitemap file.
    'generator.pipelines.ExportSitemap': 100,
}
CONCURRENT_REQUESTS = 32
CONCURRENT_REQUESTS_PER_DOMAIN = 32
CONCURRENT_REQUESTS_PER_IP = 32
LOG_LEVEL = 'INFO'
LOGGING_ENABLED = True
RANDOMIZE_DOWNLOAD_DELAY = False
TELNETCONSOLE_ENABLED = False
# Also extract links to PDF files so that they appear in the sitemap.
linkextractor.IGNORED_EXTENSIONS.remove('pdf')
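
The two pipelines registered in ITEM_PIPELINES live in generator.pipelines, which is not shown here. As a rough sketch of the de-duplication step, such a pipeline could look like the following; the 'loc' item field and the drop message are assumptions, not the generator's actual code:

from scrapy.exceptions import DropItem


class IgnoreDuplicateUrls(object):
    """Sketch of an item pipeline that drops already-seen URLs."""

    def __init__(self):
        self.seen = set()

    def process_item(self, item, spider):
        # Assumes the spider stores the page URL in item['loc'].
        if item['loc'] in self.seen:
            raise DropItem('Duplicate URL found: %s' % item['loc'])
        self.seen.add(item['loc'])
        return item

The crawl itself is started with Scrapy's standard command line interface (scrapy crawl <spider name>); the spider name is defined in generator.spiders and is not visible in this settings file.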