openstack-manuals/tools/www-generator.py
Doug Hellmann 6ea7250c93 be more accurate when checking for guide links
Use URLs with index.html to explicitly look for content being
published, not just an apache directory listing.

Closes-Bug: #1705250
Change-Id: Iae116ee81210489d9de2a1833d341c3842ac95e5
Signed-off-by: Doug Hellmann <doug@doughellmann.com>
2017-07-19 08:29:59 -04:00

366 lines
13 KiB
Python
Executable File

#!/usr/bin/env python
# Licensed under the Apache License, Version 2.0 (the "License"); you may
# not use this file except in compliance with the License. You may obtain
# a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
# License for the specific language governing permissions and limitations
# under the License.
import argparse
import glob
import logging
import os
import os.path
import sys
from bs4 import BeautifulSoup
import jinja2
import jsonschema
import requests
import yaml
def initialize_logging(debug, verbose):
"""Initialize the Logger."""
logger = logging.getLogger()
formatter = logging.Formatter('%(asctime)s %(levelname)-8s %(message)s')
handler = logging.StreamHandler()
handler.setFormatter(formatter)
logger.addHandler(handler)
if verbose:
logger.setLevel(logging.INFO)
if debug:
logger.setLevel(logging.DEBUG)
return logger
def parse_command_line_arguments():
"""Parse the command line arguments."""
parser = argparse.ArgumentParser()
parser.add_argument("--debug", help="Print debugging messages.",
action="store_true", default=False)
parser.add_argument("--verbose", help="Be more verbose.",
action="store_true", default=False)
parser.add_argument("--source-directory", type=str,
default='www', help='Set source directory.')
parser.add_argument("--output-directory", type=str,
default='publish-docs/www',
help='Set output directory.')
parser.add_argument("--check-all-links", action="store_true",
default=False,
help='Check for links with flags set false.')
parser.add_argument("--skip-links", action="store_true",
default=False,
help='Skip link checks')
return parser.parse_args()
def _check_url(url):
"Return True if the URL exists, False otherwise."
try:
resp = requests.get(url)
except requests.exceptions.TooManyRedirects:
return False, 301
return (resp.status_code // 100) == 2, resp.status_code
# NOTE(dhellmann): List of tuple of flag name and URL template. None
# for the flag name means always apply the URL, otherwise look for a
# True value associated with the flag in the project data.
#
# NOTE(dhellmann): We use URLs with explicit index.html to ensure that
# a real page is published to the location, and we are not retrieving
# a file list generated by the web server.
_URLS = [
(None,
'https://docs.openstack.org/{name}/{series}/index.html'),
('has_install_guide',
'https://docs.openstack.org/{name}/{series}/install/index.html'),
('has_admin_guide',
'https://docs.openstack.org/{name}/{series}/admin/index.html'),
('has_config_ref',
'https://docs.openstack.org/{name}/{series}/configuration/index.html'),
('has_in_tree_api_docs',
'https://docs.openstack.org/{name}/{series}/api/index.html'),
('has_user_guide',
'https://docs.openstack.org/{name}/{series}/user/index.html'),
('has_api_ref',
'https://developer.openstack.org/api-ref/{service_type}/index.html'),
('has_api_guide',
'https://developer.openstack.org/api-guide/{service_type}/index.html'),
]
_SERVICE_TYPES_URL = 'http://git.openstack.org/cgit/openstack/service-types-authority/plain/service-types.yaml' # noqa
def _get_service_types():
"Return a map between repo base name and service type"
raw = requests.get(_SERVICE_TYPES_URL) # noqa
data = yaml.safe_load(raw.text)
service_types = {
d['project'].rsplit('/')[-1]: d['service_type']
for d in data['services']
}
return service_types
def load_project_data(source_directory,
check_all_links=False,
skip_links=False):
"Return a dict with project data grouped by series."
logger = logging.getLogger()
project_data = {}
fail = False
service_types = _get_service_types()
# Set up a schema validator so we can quickly check that the input
# data conforms.
project_schema_filename = os.path.join(
source_directory,
'project-data',
'schema.yaml',
)
with open(project_schema_filename, 'r') as f:
project_schema = yaml.safe_load(f.read())
validator = jsonschema.Draft4Validator(project_schema)
# Load the data files, using the file basename as the release
# series name.
for filename in glob.glob(
os.path.join(source_directory, 'project-data', '*.yaml')):
if filename.endswith('schema.yaml'):
continue
series, _ = os.path.splitext(os.path.basename(filename))
logger.info('loading %s project data from %s', series, filename)
with open(filename, 'r') as f:
data = yaml.safe_load(f.read())
for error in validator.iter_errors(data):
logger.error(str(error))
fail = True
for project in data:
# If the project has a service-type set, ensure it matches
# the value in the service-type-authority data.base.
st = project.get('service_type')
if st is not None:
if project['name'] not in service_types:
logger.error(
'did not find %s in %s',
project['name'], _SERVICE_TYPES_URL,
)
fail = True
elif project['service_type'] != service_types[project['name']]:
logger.error(
'expected service_type %r for %s but got %r',
service_types[project['name']], project['name'],
project['service_type'],
)
fail = True
# client projects must have a description
project_type = project.get('type')
if project_type == 'client' and not project.get('description'):
logger.error(
'client project %s has no description',
project['name'],
)
fail = True
# If the project claims to have a separately published guide
# of some sort, look for it before allowing the flag to stand.
if not skip_links:
for flag, url_template in _URLS:
if flag is None:
flag_val = True
else:
flag_val = project.get(flag, False)
try:
url = url_template.format(series=series, **project)
except KeyError:
# The project data does not include a field needed
# to build the URL (typically the
# service_type). Ignore this URL, unless the flag
# is set.
if flag_val:
raise
continue
# Only try to fetch the URL if we're going to do
# something with the result.
if flag_val or check_all_links:
logger.info('%s:%s looking for %s',
series, project['name'], url)
exists, status = _check_url(url)
if flag_val and not exists:
logger.error(
'%s set for %s but %s does not exist (%s)',
flag, project['name'], url, status,
)
fail = True
elif (not flag_val) and check_all_links and exists:
logger.warning(
'%s not set for %s but %s does exist',
flag, project['name'], url,
)
if fail:
raise ValueError('invalid input in %s' % filename)
project_data[series] = data
return project_data
_GOVERNANCE_URL = 'http://git.openstack.org/cgit/openstack/governance/plain/reference/projects.yaml' # noqa
def _get_official_repos():
"""Return a tuple containing lists of all official repos.
The first member is the list of regular project repos. The second
member is the list of infra repos.
"""
raw = requests.get(_GOVERNANCE_URL)
data = yaml.safe_load(raw.text)
regular_repos = []
infra_repos = []
for t_name, team in data.items():
for d_name, d_data in team.get('deliverables', {}).items():
if t_name == 'Infrastructure':
add = infra_repos.append
else:
add = regular_repos.append
for repo in d_data.get('repos', []):
add({'name': repo, 'base': repo.rsplit('/')[-1]})
return (regular_repos, infra_repos)
def render_template(environment, project_data, regular_repos, infra_repos,
template_file, output_directory, extra={}):
logger = logging.getLogger()
logger.info("generating %s", template_file)
# Determine the relative path to a few common directories so
# we don't need to set them in the templates.
topdir = os.path.relpath(
'.', os.path.dirname(template_file),
).rstrip('/') + '/'
scriptdir = os.path.join(topdir, 'common', 'js').rstrip('/') + '/'
cssdir = os.path.join(topdir, 'common', 'css').rstrip('/') + '/'
imagedir = os.path.join(topdir, 'common', 'images').rstrip('/') + '/'
try:
template = environment.get_template(template_file)
except Exception as e:
logger.error("parsing template %s failed: %s" %
(template_file, e))
raise
try:
output = template.render(
PROJECT_DATA=project_data,
TEMPLATE_FILE=template_file,
REGULAR_REPOS=regular_repos,
INFRA_REPOS=infra_repos,
topdir=topdir,
scriptdir=scriptdir,
cssdir=cssdir,
imagedir=imagedir,
**extra
)
if template_file.endswith('.html'):
soup = BeautifulSoup(output, "lxml")
output = soup.prettify()
except Exception as e:
logger.error("rendering template %s failed: %s" %
(template_file, e))
raise
try:
target_directory = os.path.join(output_directory,
os.path.dirname(template_file))
target_file = os.path.join(output_directory, template_file)
if not os.path.isdir(target_directory):
logger.debug("creating target directory %s" %
target_directory)
os.makedirs(target_directory)
logger.debug("writing %s" % target_file)
with open(os.path.join(target_file), 'wb') as fh:
fh.write(output.encode('utf8'))
except (IOError, OSError, UnicodeEncodeError) as e:
logger.error("writing %s failed: %s" % (target_file, e))
def main():
"""Entry point for this script."""
args = parse_command_line_arguments()
logger = initialize_logging(args.debug, args.verbose)
project_data = load_project_data(
args.source_directory,
args.check_all_links,
args.skip_links,
)
regular_repos, infra_repos = _get_official_repos()
# Set up jinja to discover the templates.
try:
logger.info('looking for templates in %s', args.source_directory)
loader = jinja2.FileSystemLoader(args.source_directory)
environment = jinja2.Environment(loader=loader)
except Exception as e:
logger.error("initialising template environment failed: %s" % e)
return 1
# Render the templates.
output_pages = []
page_list_template = None
for template_file in environment.list_templates():
if not (template_file.endswith('.html')
or template_file.endswith('.htaccess')):
logger.info('ignoring %s', template_file)
continue
if template_file.endswith('www-index.html'):
# Process this one at the end, so we have the full list of
# other output files.
page_list_template = template_file
continue
render_template(
environment,
project_data,
regular_repos,
infra_repos,
template_file,
args.output_directory,
)
output_pages.append(template_file)
if page_list_template is not None:
output_pages.sort()
render_template(
environment,
project_data,
regular_repos,
infra_repos,
page_list_template,
args.output_directory,
extra={
'file_list': output_pages,
},
)
return 0
if __name__ == '__main__':
sys.exit(main())