
This script crawls all available sites on http://docs.openstack.org and extracts all URLs. Based on the URLs the script generates a sitemap for search engines according to the protocol described at http://www.sitemaps.org/protocol.html. Change-Id: Id7839d2048989da503d31e436455aea9bb4cdc1d
35 lines
1.2 KiB
Python
35 lines
1.2 KiB
Python
# Licensed under the Apache License, Version 2.0 (the "License"); you may
|
|
# not use this file except in compliance with the License. You may obtain
|
|
# a copy of the License at
|
|
#
|
|
# http://www.apache.org/licenses/LICENSE-2.0
|
|
#
|
|
# Unless required by applicable law or agreed to in writing, software
|
|
# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
|
|
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
|
|
# License for the specific language governing permissions and limitations
|
|
# under the License.
|
|
|
|
# Configuration variables used inside Scrapy to enable modules/pipelines
|
|
# and to affect the behavior of several parts.
|
|
#
|
|
# All available configuration variables are documented at
|
|
# http://doc.scrapy.org/en/latest/topics/settings.html.
|
|
|
|
from scrapy import linkextractor
|
|
|
|
# Name of the bot implemented by this Scrapy project.
BOT_NAME = 'sitemap'

# Packages in which Scrapy looks for spider classes.
SPIDER_MODULES = ['generator.spiders']

# Item pipelines and their order values. Scrapy runs pipelines from the
# lowest value to the highest, so with these numbers ExportSitemap (100)
# receives each item before IgnoreDuplicateUrls (500).
# NOTE(review): if duplicates are meant to be dropped before they are
# exported, the two values need to be swapped -- confirm against the
# pipeline implementations in generator/pipelines.py.
ITEM_PIPELINES = {
    'generator.pipelines.IgnoreDuplicateUrls': 500,
    'generator.pipelines.ExportSitemap': 100,
}

# Upper bounds on simultaneous requests: overall, per domain and per IP.
CONCURRENT_REQUESTS = 32
CONCURRENT_REQUESTS_PER_DOMAIN = 32
CONCURRENT_REQUESTS_PER_IP = 32

# Logging configuration. LOG_ENABLED is the setting name Scrapy actually
# reads; LOGGING_ENABLED is not a Scrapy setting and is kept only as an
# alias in case other project code refers to it.
LOG_LEVEL = 'INFO'
LOG_ENABLED = True
LOGGING_ENABLED = LOG_ENABLED

# Use the configured download delay as-is instead of a randomized one.
RANDOMIZE_DOWNLOAD_DELAY = False

# The telnet console is not needed for this crawler.
TELNETCONSOLE_ENABLED = False

# Take 'pdf' out of Scrapy's list of ignored link extensions so that links
# to PDF documents are extracted as well. The membership check keeps this
# module importable when 'pdf' is already absent from the list.
if 'pdf' in linkextractor.IGNORED_EXTENSIONS:
    linkextractor.IGNORED_EXTENSIONS.remove('pdf')
|