script to generate the sitemap.xml for docs.openstack.org

This script crawls all available sites on http://docs.openstack.org and
extracts all URLs. Based on these URLs, the script generates a sitemap for
search engines according to the protocol described at
http://www.sitemaps.org/protocol.html.

Change-Id: Id7839d2048989da503d31e436455aea9bb4cdc1d
parent 403fc23549
commit 31c25891c1
sitemap/README.md (new file, 39 lines)
@@ -0,0 +1,39 @@
= Sitemap Generator

This script crawls all available pages on http://docs.openstack.org and extracts
all URLs. Based on these URLs, the script generates a sitemap for search engines
according to the protocol described at http://www.sitemaps.org/protocol.html.
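
For reference, a sitemap following that protocol is an XML file with one
```url``` entry per page; a minimal hand-written example (the values are
illustrative, not output of this script):

```
<?xml version="1.0" encoding="UTF-8"?>
<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
  <url>
    <loc>http://docs.openstack.org/index.html</loc>
    <lastmod>2014-06-02T10:15:30</lastmod>
    <changefreq>daily</changefreq>
    <priority>1.0</priority>
  </url>
</urlset>
```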

== Usage

To generate a new sitemap file, simply run the spider using the
following command. It will take several minutes to crawl all available sites
on http://docs.openstack.org. The result will be available in the file
```sitemap_docs.openstack.org.xml```.

```
$ scrapy crawl sitemap
```

It's also possible to crawl other sites by passing the spider argument ```domain```.

For example, to crawl http://developer.openstack.org, use the following command.
The result will be available in the file ```sitemap_developer.openstack.org.xml```.

```
$ scrapy crawl sitemap -a domain=developer.openstack.org
```

To write log messages into a file, append the parameter ```-s LOG_FILE=scrapy.log```.
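
For example, combined with the default crawl (the log file name is just an example):

```
$ scrapy crawl sitemap -s LOG_FILE=scrapy.log
```
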
== Dependencies

* Scrapy (https://pypi.python.org/pypi/Scrapy)

To install the needed modules, you can use pip or the package management system
included in your distribution. When using the package management system, the
package names may differ. When using pip, it may be necessary to install some
development packages first.

```
$ pip install scrapy
```

sitemap/generator/__init__.py (new, empty file)

sitemap/generator/items.py (new file, 21 lines)
@@ -0,0 +1,21 @@
# Licensed under the Apache License, Version 2.0 (the "License"); you may
# not use this file except in compliance with the License. You may obtain
# a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
# License for the specific language governing permissions and limitations
# under the License.

import scrapy


class SitemapItem(scrapy.item.Item):
    '''Class to represent an item in the sitemap.'''
    loc = scrapy.item.Field()         # URL of the page
    lastmod = scrapy.item.Field()     # date of last modification
    priority = scrapy.item.Field()    # priority relative to other pages
    changefreq = scrapy.item.Field()  # expected change frequency
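
Each field maps onto one child element of a ```url``` entry in the sitemap
protocol. A minimal sketch of how such an item gets populated (the values here
are hypothetical; the spider below fills them from real responses):

```
from generator import items

item = items.SitemapItem()
item['loc'] = 'http://docs.openstack.org/index.html'  # hypothetical URL
item['changefreq'] = 'daily'
item['priority'] = '1.0'
item['lastmod'] = '2014-06-02T10:15:30'               # ISO 8601 timestamp
```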

sitemap/generator/pipelines.py (new file, 89 lines)
@@ -0,0 +1,89 @@
# Licensed under the Apache License, Version 2.0 (the "License"); you may
# not use this file except in compliance with the License. You may obtain
# a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
# License for the specific language governing permissions and limitations
# under the License.

import os

import lxml.etree
import scrapy
from scrapy.contrib import exporter


class SitemapItemExporter(exporter.XmlItemExporter):
    '''XmlItemExporter with adjusted attributes for the root element.'''

    def start_exporting(self):
        '''Set namespace / schema attributes for the root element.'''
        self.xg.startDocument()
        self.xg.startElement(self.root_element, {
            "xmlns": "http://www.sitemaps.org/schemas/sitemap/0.9",
            "xmlns:xsi": "http://www.w3.org/2001/XMLSchema-instance",
            "xsi:schemaLocation":
                "http://www.sitemaps.org/schemas/sitemap/0.9 "
                "http://www.sitemaps.org/schemas/sitemap/0.9/sitemap.xsd"
        })


class IgnoreDuplicateUrls(object):
    '''Ignore duplicated URLs.'''

    def __init__(self):
        self.processed = set()

    def process_item(self, item, spider):
        '''Check if a URL was already found.'''
        if item['loc'] in self.processed:
            raise scrapy.exceptions.DropItem("Duplicate URL found: %s."
                                             % item['loc'])
        else:
            self.processed.add(item['loc'])
            return item


class ExportSitemap(object):
    '''Write found URLs to a sitemap file, based on
    http://doc.scrapy.org/en/latest/topics/exporters.html.
    '''

    def __init__(self):
        self.files = {}
        self.exporter = None

    @classmethod
    def from_crawler(cls, crawler):
        pipeline = cls()
        crawler.signals.connect(pipeline.spider_opened,
                                scrapy.signals.spider_opened)
        crawler.signals.connect(pipeline.spider_closed,
                                scrapy.signals.spider_closed)
        return pipeline

    def spider_opened(self, spider):
        output = open(os.path.join(os.getcwd(), 'sitemap_%s.xml'
                                   % spider.domain), 'w')
        self.files[spider] = output
        self.exporter = SitemapItemExporter(output, item_element='url',
                                            root_element='urlset')
        self.exporter.start_exporting()

    def spider_closed(self, spider):
        self.exporter.finish_exporting()
        output = self.files.pop(spider)
        output.close()
        # Re-read the generated file and rewrite it pretty-printed, so the
        # resulting sitemap is human-readable.
        tree = lxml.etree.parse(os.path.join(os.getcwd(), "sitemap_%s.xml"
                                             % spider.domain))
        with open(os.path.join(os.getcwd(), "sitemap_%s.xml" % spider.domain),
                  'w') as pretty:
            pretty.write(lxml.etree.tostring(tree, pretty_print=True))

    def process_item(self, item, spider):
        self.exporter.export_item(item)
        return item
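
For reference, a minimal standalone sketch of how ```SitemapItemExporter``` is
driven (mirroring the calls made in ```spider_opened``` / ```spider_closed```
above; the output file name is illustrative):

```
from generator.items import SitemapItem
from generator.pipelines import SitemapItemExporter

with open('example.xml', 'w') as output:  # illustrative file name
    exporter = SitemapItemExporter(output, item_element='url',
                                   root_element='urlset')
    exporter.start_exporting()
    exporter.export_item(SitemapItem(loc='http://docs.openstack.org/'))
    exporter.finish_exporting()
```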

sitemap/generator/settings.py (new file, 34 lines)
@@ -0,0 +1,34 @@
# Licensed under the Apache License, Version 2.0 (the "License"); you may
# not use this file except in compliance with the License. You may obtain
# a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
# License for the specific language governing permissions and limitations
# under the License.

# Configuration variables used inside Scrapy to enable modules/pipelines
# and to affect the behavior of several parts.
#
# All available configuration variables are documented at
# http://doc.scrapy.org/en/latest/topics/settings.html.

from scrapy import linkextractor

BOT_NAME = 'sitemap'
SPIDER_MODULES = ['generator.spiders']
# Pipelines run in ascending order of the assigned values.
ITEM_PIPELINES = {
    'generator.pipelines.IgnoreDuplicateUrls': 500,
    'generator.pipelines.ExportSitemap': 100,
}
CONCURRENT_REQUESTS = 32
CONCURRENT_REQUESTS_PER_DOMAIN = 32
CONCURRENT_REQUESTS_PER_IP = 32
LOG_LEVEL = 'INFO'
LOGGING_ENABLED = True
RANDOMIZE_DOWNLOAD_DELAY = False
TELNETCONSOLE_ENABLED = False

# Crawl PDF files as well; Scrapy skips them by default.
linkextractor.IGNORED_EXTENSIONS.remove('pdf')

sitemap/generator/spiders/__init__.py (new, empty file)

sitemap/generator/spiders/sitemap.py (new file, 74 lines)
@@ -0,0 +1,74 @@
# Licensed under the Apache License, Version 2.0 (the "License"); you may
# not use this file except in compliance with the License. You may obtain
# a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
# License for the specific language governing permissions and limitations
# under the License.

import posixpath
import time
import urlparse

from scrapy.contrib.linkextractors import sgml
from scrapy.contrib import spiders

from generator import items


class SitemapSpider(spiders.CrawlSpider):
    name = 'sitemap'

    rules = [
        spiders.Rule(
            sgml.SgmlLinkExtractor(
                allow=[
                    r'.*\.html',
                    r'.*\.pdf',
                    r'.*\.xml',
                    r'.*\.txt',
                    r'.*/',
                ]
            ),
            follow=True, callback='parse_item'
        )
    ]

    def __init__(self, domain='docs.openstack.org', *args, **kwargs):
        super(SitemapSpider, self).__init__(*args, **kwargs)
        self.domain = domain
        self.allowed_domains = [domain]
        self.start_urls = [
            'http://%s/index.html' % domain,
        ]

    def parse_item(self, response):
        item = items.SitemapItem()
        item['priority'] = '0.5'
        item['changefreq'] = 'daily'
        item['loc'] = response.url

        path = urlparse.urlsplit(response.url).path
        filename = posixpath.basename(path)

        # Index pages are the most important entry points.
        if filename == 'index.html' or filename == '':
            item['priority'] = '1.0'

        # Documentation for released versions changes less frequently.
        weekly = [
            'icehouse',
            'havana',
            'grizzly'
        ]

        for entry in weekly:
            if path.startswith("/%s" % entry):
                item['changefreq'] = 'weekly'

        lastmod = time.strptime(response.headers['Last-Modified'],
                                "%a, %d %b %Y %H:%M:%S %Z")
        item['lastmod'] = time.strftime("%Y-%m-%dT%H:%M:%S%z", lastmod)
        return item
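
For illustration, a standalone sketch of the ```Last-Modified``` handling in
```parse_item``` (the header value below is hypothetical):

```
import time

# Hypothetical value of an HTTP Last-Modified response header.
header = 'Mon, 02 Jun 2014 10:15:30 GMT'

lastmod = time.strptime(header, "%a, %d %b %Y %H:%M:%S %Z")
print(time.strftime("%Y-%m-%dT%H:%M:%S%z", lastmod))
# e.g. 2014-06-02T10:15:30 (the %z offset may be empty, depending on platform)
```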

sitemap/scrapy.cfg (new file, 5 lines)
@@ -0,0 +1,5 @@
[settings]
default = generator.settings

[deploy]
project = generator
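
Putting it together: ```scrapy.cfg``` marks the project root and points Scrapy
at ```generator.settings```, so the crawl is started from the ```sitemap/```
directory (a usage sketch; the domain value shown is the default anyway):

```
$ cd sitemap
$ scrapy crawl sitemap -a domain=docs.openstack.org
```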