#!/usr/bin/env python
#
# Licensed under the Apache License, Version 2.0 (the "License"); you may
# not use this file except in compliance with the License. You may obtain
# a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
# License for the specific language governing permissions and limitations
# under the License.

import argparse
import collections
import glob
import logging
import multiprocessing
import multiprocessing.pool
import os
import os.path
import re
import sys

from openstack_governance import governance

from bs4 import BeautifulSoup
import jinja2
import jsonschema
import os_service_types
import percache
import requests
import yaml


SeriesInfo = collections.namedtuple('SeriesInfo', 'date status')
# The 'date' should be a string containing the month name and 4 digit year.
#
# The 'status' field should be one of:
#   'obsolete'    -- the release existed, but we have no more artifacts for it
#   'EOL'         -- the release is closed but we have docs for it
#   'maintained'  -- the release still has an open branch
#   'development' -- the current release being developed

SERIES_INFO = {
    'austin': SeriesInfo(date='October 2010', status='obsolete'),
    'bexar': SeriesInfo(date='February 2011', status='obsolete'),
    'cactus': SeriesInfo(date='April 2011', status='obsolete'),
    'diablo': SeriesInfo(date='September 2011', status='obsolete'),
    'essex': SeriesInfo(date='April 2012', status='obsolete'),
    'folsom': SeriesInfo(date='September 2012', status='obsolete'),
    'grizzly': SeriesInfo(date='April 2013', status='obsolete'),
    'havana': SeriesInfo(date='October 2013', status='obsolete'),
    'icehouse': SeriesInfo(date='April 2014', status='EOL'),
    'juno': SeriesInfo(date='October 2014', status='EOL'),
    'kilo': SeriesInfo(date='April 2015', status='EOL'),
    'liberty': SeriesInfo(date='October 2015', status='EOL'),
    'mitaka': SeriesInfo(date='April 2016', status='EOL'),
    'newton': SeriesInfo(date='October 2016', status='EOL'),
    'ocata': SeriesInfo(date='February 2017', status='maintained'),
    'pike': SeriesInfo(date='August 2017', status='maintained'),
    'queens': SeriesInfo(date='March 2018', status='maintained'),
    'rocky': SeriesInfo(date='August 2018', status='maintained'),
    'stein': SeriesInfo(date='April 2019', status='maintained'),
    'train': SeriesInfo(date='September 2019', status='development'),
}

# Build a list of the series that are not the current series being
# developed.
PAST_SERIES = [
    name
    for name, info in sorted(SERIES_INFO.items())
    if info.status != 'development'
]

# Find the currently maintained series.
MAINTAINED_SERIES = [
    name
    for name, info in sorted(SERIES_INFO.items())
    if info.status == 'maintained'
]

# Find the most recently released series.
RELEASED_SERIES = MAINTAINED_SERIES[-1]

# Find the series being developed.
SERIES_IN_DEVELOPMENT = [
    name
    for name, info in sorted(SERIES_INFO.items())
    if info.status == 'development'
][0]

# Do not modify this variable.
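#
# For reference, with the SERIES_INFO table as written above, the
# derived values work out to (illustrative only; the table is the
# source of truth):
#
#   RELEASED_SERIES       -> 'stein'
#   SERIES_IN_DEVELOPMENT -> 'train'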
ALL_SERIES = list(sorted(SERIES_INFO.keys()))

SERIES_PAT = re.compile('^(' + '|'.join(ALL_SERIES) + ')/')

cache = percache.Cache("./OS_GOVERNANCE_DATA_CACHE")


def initialize_logging(debug, verbose):
    """Initialize the Logger."""
    logger = logging.getLogger()

    formatter = logging.Formatter('%(asctime)s %(levelname)-8s %(message)s')

    handler = logging.StreamHandler()
    handler.setFormatter(formatter)
    logger.addHandler(handler)

    if verbose:
        logger.setLevel(logging.INFO)

    if debug:
        logger.setLevel(logging.DEBUG)

    return logger


def parse_command_line_arguments():
    """Parse the command line arguments."""
    parser = argparse.ArgumentParser()
    parser.add_argument("--debug", help="Print debugging messages.",
                        action="store_true", default=False)
    parser.add_argument("--verbose", help="Be more verbose.",
                        action="store_true", default=False)
    parser.add_argument("--source-directory", type=str,
                        default='www', help='Set source directory.')
    parser.add_argument("--output-directory", type=str,
                        default='publish-docs/www',
                        help='Set output directory.')
    parser.add_argument("--check-all-links", action="store_true",
                        default=False,
                        help='Check for links with flags set false.')
    parser.add_argument("--skip-links", action="store_true",
                        default=False,
                        help='Skip link checks')
    parser.add_argument('--series',
                        default=[],
                        action='append',
                        help='series to update/test',
                        )
    parser.add_argument('--skip-render',
                        default=False,
                        action='store_true',
                        help='only test links, do not render templates',
                        )
    parser.add_argument('--strict',
                        default=False,
                        action='store_true',
                        help='treat warnings as errors',
                        )
    parser.add_argument('--project',
                        default=[],
                        action='append',
                        help='project to check (defaults to all)',
                        )
    parser.add_argument('--publish',
                        default=False,
                        action='store_true',
                        help='use absolute paths for publish environment',
                        )
    return parser.parse_args()


def _check_url(args):
    "Check one URL; return its metadata plus an existence flag and status."
    url, project_name, flag, flag_val = args
    try:
        resp = requests.head(url)
    except requests.exceptions.TooManyRedirects:
        # Return the same 6-tuple shape as the success path so the
        # caller can unpack results uniformly; report the URL as
        # missing with a redirect status.
        return (url, project_name, flag, flag_val, False, 301)
    return (url, project_name, flag, flag_val,
            (resp.status_code // 100) == 2,
            resp.status_code)


# NOTE(dhellmann): List of tuple of flag name and URL template. None
# for the flag name means always apply the URL, otherwise look for a
# True value associated with the flag in the project data.
#
# NOTE(dhellmann): We use URLs with explicit index.html to ensure that
# a real page is published to the location, and we are not retrieving
# a file list generated by the web server.
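#
# For example (illustrative values only), the 'has_install_guide'
# template rendered with name='nova' and series='train' expands to:
#
#   https://docs.openstack.org/nova/train/install/index.html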
URLSettings = collections.namedtuple(
    'URLSettings',
    ['flag_name', 'types', 'template', 'default'],
)

_URLS = [
    URLSettings(
        flag_name='has_project_guide',
        types=[],
        template='https://docs.openstack.org/{name}/{series}/index.html',
        default=True,
    ),
    URLSettings(
        flag_name='has_install_guide',
        types=['service'],
        template='https://docs.openstack.org/{name}/{series}/install/index.html',  # noqa
        default=False,
    ),
    URLSettings(
        flag_name='has_admin_guide',
        types=['service'],
        template='https://docs.openstack.org/{name}/{series}/admin/index.html',
        default=False,
    ),
    URLSettings(
        flag_name='has_config_ref',
        types=['service', 'library'],
        template='https://docs.openstack.org/{name}/{series}/configuration/index.html',  # noqa
        default=False,
    ),
    URLSettings(
        flag_name='has_in_tree_api_docs',
        types=['service'],
        template='https://docs.openstack.org/{name}/{series}/api/index.html',
        default=False,
    ),
    URLSettings(
        flag_name='has_user_guide',
        types=['service'],
        template='https://docs.openstack.org/{name}/{series}/user/index.html',
        default=False,
    ),
    URLSettings(
        flag_name='has_api_ref',
        types=['service'],
        template='https://developer.openstack.org/api-ref/{service_type}/index.html',  # noqa
        default=False,
    ),
    URLSettings(
        flag_name='has_api_guide',
        types=['service'],
        template='https://developer.openstack.org/api-guide/{service_type}/index.html',  # noqa
        default=False,
    ),
    URLSettings(
        flag_name='has_deployment_guide',
        types=['deployment'],
        template='https://docs.openstack.org/project-deploy-guide/{name}/{series}/index.html',  # noqa
        default=False,
    ),
]


@cache
def load_project_data(source_directory,
                      check_all_links=False,
                      skip_links=False,
                      series_to_load=None,
                      governed_deliverables=[],
                      strict=False,
                      projects_to_check=[]):
    "Return a dict with project data grouped by series."
    logger = logging.getLogger()
    series_to_load = series_to_load or []
    project_data = {}
    fail = False
    service_types = os_service_types.ServiceTypes(
        session=requests.Session(), only_remote=True)

    # Set up a schema validator so we can quickly check that the input
    # data conforms.
    project_schema_filename = os.path.join(
        source_directory,
        'project-data',
        'schema.yaml',
    )
    with open(project_schema_filename, 'r') as f:
        project_schema = yaml.safe_load(f.read())
        validator = jsonschema.Draft4Validator(project_schema)

    # Load the data files, using the file basename as the release
    # series name.
    for filename in glob.glob(
            os.path.join(source_directory, 'project-data', '*.yaml')):
        if filename.endswith('schema.yaml'):
            continue
        series, _ = os.path.splitext(os.path.basename(filename))
        if series_to_load and series not in series_to_load:
            continue

        logger.info('loading %s project data from %s', series, filename)
        with open(filename, 'r') as f:
            raw_data = yaml.safe_load(f.read())
        for error in validator.iter_errors(raw_data):
            logger.error(str(error))
            fail = True

        links_to_check = []
        data = []
        for project in raw_data:
            deliverable_name = project.get('deliverable-name',
                                           project['name'])

            # Set the defaults for the flags so that the templates can
            # assume the flags with true defaults are defined.
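            # (Illustrative, based on the _URLS table above: a project
            # entry that omits 'has_project_guide' is treated as if it
            # set the flag to True, since that is the only flag whose
            # default is True.)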
            for url_info in _URLS:
                if url_info.flag_name not in project:
                    project[url_info.flag_name] = url_info.default

            if (series == 'latest' and
                    deliverable_name not in governed_deliverables):
                msg = ('{} is no longer part of an official project, '
                       '{} in {}').format(
                           deliverable_name,
                           'error' if strict else 'ignoring',
                           filename)
                logger.warning(msg)
                if strict:
                    logger.info('Known deliverables: %s',
                                sorted(governed_deliverables))
                    raise RuntimeError(msg)
                continue
            logger.info('including %s', deliverable_name)
            data.append(project)

            # If the project has a service-type set, ensure it matches
            # the value in the service-type-authority database.
            st = project.get('service_type')
            if st is not None:
                st_data = service_types.get_service_data_for_project(
                    project['name'])
                if not st_data:
                    # It's possible this is a project listed by its
                    # service-type.
                    st_data = service_types.get_service_data(st)
                if not st_data:
                    logger.error(
                        'did not find %s in Service Types Authority',
                        project['name'],
                    )
                    fail = True
                elif st != st_data['service_type']:
                    logger.error(
                        'expected service_type %r for %s but got %r',
                        st_data['service_type'], project['name'], st,
                    )
                    fail = True

            # Client projects must have a description.
            project_type = project.get('type')
            if (project_type in ['cloud-client', 'service-client'] and
                    not project.get('description')):
                logger.error(
                    'client project %s has no description',
                    project['name'],
                )
                fail = True

            # If the project claims to have a separately published
            # guide of some sort, look for it before allowing the flag
            # to stand.
            check_links_this_project = (
                deliverable_name in projects_to_check or
                not projects_to_check
            )
            if check_links_this_project and not skip_links:
                for url_info in _URLS:
                    flag_val = project.get(url_info.flag_name,
                                           url_info.default)
                    if ((not flag_val) and url_info.types and
                            project_type not in url_info.types):
                        # This type of project isn't expected to have
                        # this type of link, so don't check for it
                        # unless we are explicitly told to.
                        continue

                    try:
                        url = url_info.template.format(
                            series=series, **project)
                    except KeyError:
                        # The project data does not include a field
                        # needed to build the URL (typically the
                        # service_type). Ignore this URL, unless the
                        # flag is set.
                        if flag_val:
                            raise
                        continue

                    # Only try to fetch the URL if we're going to do
                    # something with the result.
                    if flag_val or check_all_links:
                        logger.info('%s:%s looking for %s',
                                    series, project['name'], url)
                        links_to_check.append(
                            (url, project['name'],
                             url_info.flag_name, flag_val)
                        )

        if links_to_check:
            logger.info('checking %s links from %s...',
                        len(links_to_check), filename)
            pool = multiprocessing.pool.ThreadPool()
            results = pool.map(_check_url, links_to_check)

            for url, project_name, flag, flag_val, exists, status in results:
                if flag_val and not exists:
                    logger.error(
                        '%s set for %s but %s does not exist (%s)',
                        flag, project_name, url, status,
                    )
                    fail = True
                elif (not flag_val) and check_all_links and exists:
                    msg = '{} not set for {} but {} does exist'.format(
                        flag, project_name, url)
                    logger.warning(msg)
                    if strict:
                        raise RuntimeError(msg)

        if fail:
            raise ValueError('invalid input in %s' % filename)
        project_data[series] = data
    return project_data


_IGNORED_REPOS = [
    'openstack/releases',
    'openstack-infra/releasestatus',
    'openstack/contributor-guide',
    'openstack/operations-guide',
    'openstack/ha-guide',
    'openstack/arch-design',
    'openstack/project-team-guide',
]

# List of infra repos that publish to the normal location (/REPO/) and
# not to /infra/REPO.
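# (Illustrative, based on the exception list below: docs for
# openstack-infra/subunit2sql are published at /subunit2sql/ rather
# than /infra/subunit2sql/.)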
_INFRA_REPOS_EXCEPTION = [
    'openstack-infra/pynotedb',
    'openstack-infra/subunit2sql',
    'openstack/diskimage-builder',
]


@cache
def _get_official_repos():
    """Return a tuple containing lists of all official repos.

    The first member is the list of regular project repos. The second
    member is the list of infra repos.
    """
    seen_repos = set()
    regular_repos = []
    infra_repos = []
    deliverables = set()
    # NOTE(dhellmann): We could get fancy and support loading
    # governance data from a local repo so we could support zuul's
    # Depends-On feature to link together patches, but that would
    # complicate the build environment needed for an individual
    # developer, so we just always pull from the remote repo for now.
    gov_data = governance.Governance.from_remote_repo()
    for repository in gov_data.get_repositories():
        repo = repository.name
        base = repo.rsplit('/')[-1]
        if repo in seen_repos:
            # Sometimes the governance data ends up with duplicates,
            # but we don't want duplicate rules to be generated.
            continue
        seen_repos.add(repo)
        deliverables.add(repository.deliverable.name)
        if repository.deliverable.team.name == 'Infrastructure':
            add = infra_repos.append
        else:
            add = regular_repos.append
        # Overwrite the infra list for a few repositories.
        if repo in _INFRA_REPOS_EXCEPTION:
            regular_repos.append({'name': repo, 'base': base})
        elif repo not in _IGNORED_REPOS:
            add({'name': repo, 'base': base})
    return (regular_repos, infra_repos, list(sorted(deliverables)))


def render_template(environment, project_data, regular_repos, infra_repos,
                    template_files, template_file, output_directory,
                    is_publish, extra={}):
    logger = logging.getLogger()
    logger.info("generating %s", template_file)

    # Determine the relative path to a few common directories so
    # we don't need to set them in the templates.
    if is_publish:
        topdir = 'https://docs.openstack.org/'
        scriptdir = topdir + 'common/js/'
        cssdir = topdir + 'common/css/'
        imagedir = topdir + 'common/images/'
    else:
        topdir = os.path.relpath(
            '.', os.path.dirname(template_file),
        ).rstrip('/') + '/'
        scriptdir = os.path.join(topdir, 'common', 'js').rstrip('/') + '/'
        cssdir = os.path.join(topdir, 'common', 'css').rstrip('/') + '/'
        imagedir = os.path.join(topdir, 'common', 'images').rstrip('/') + '/'

    series_match = SERIES_PAT.match(template_file)
    if series_match:
        series = series_match.groups()[0]
        series_path_prefix = series
        series_title = series.title()
        series_info = SERIES_INFO[series]
        if series == SERIES_IN_DEVELOPMENT:
            series = 'latest'
    else:
        series = None
        series_path_prefix = None
        series_title = ''
        series_info = SeriesInfo('', '')
    logger.info('series = %s, path prefix = %s, title = %s',
                series, series_path_prefix, series_title)

    try:
        template = environment.get_template(template_file)
    except Exception as e:
        logger.error("parsing template %s failed: %s" %
                     (template_file, e))
        raise

    try:
        output = template.render(
            PROJECT_DATA=project_data,
            TEMPLATE_FILE=template_file,
            TEMPLATE_FILES={f: True for f in template_files},
            REGULAR_REPOS=regular_repos,
            INFRA_REPOS=infra_repos,
            ALL_SERIES=ALL_SERIES,
            PAST_SERIES=PAST_SERIES,
            RELEASED_SERIES=RELEASED_SERIES,
            MAINTAINED_SERIES=MAINTAINED_SERIES,
            SERIES_IN_DEVELOPMENT=SERIES_IN_DEVELOPMENT,
            TOPDIR=topdir,
            SCRIPTDIR=scriptdir,
            CSSDIR=cssdir,
            IMAGEDIR=imagedir,
            SERIES=series,
            SERIES_PATH_PREFIX=series_path_prefix,
            SERIES_TITLE=series_title,
            SERIES_INFO=series_info,
            **extra
        )
        if template_file.endswith('.html'):
            soup = BeautifulSoup(output, "lxml")
            output = soup.prettify()
    except Exception as e:
        logger.error("rendering template %s failed: %s" %
                     (template_file, e))
        raise

    try:
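        # The output file mirrors the template's relative path under
        # the output directory (illustrative: a template named
        # 'train/index.html' is written to
        # '<output_directory>/train/index.html').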
        target_directory = os.path.join(output_directory,
                                        os.path.dirname(template_file))
        target_file = os.path.join(output_directory, template_file)
        if not os.path.isdir(target_directory):
            logger.debug("creating target directory %s" %
                         target_directory)
            os.makedirs(target_directory)
        logger.debug("writing %s" % target_file)
        with open(target_file, 'wb') as fh:
            fh.write(output.encode('utf8'))
    except (IOError, OSError, UnicodeEncodeError) as e:
        logger.error("writing %s failed: %s" % (target_file, e))
        raise


def main():
    """Entry point for this script."""
    args = parse_command_line_arguments()
    logger = initialize_logging(args.debug, args.verbose)

    logger.debug("getting official repos ...")
    regular_repos, infra_repos, deliverables = _get_official_repos()

    logger.debug("loading project data ...")
    project_data = load_project_data(
        source_directory=args.source_directory,
        check_all_links=args.check_all_links,
        skip_links=args.skip_links,
        series_to_load=args.series,
        governed_deliverables=deliverables,
        strict=args.strict,
        projects_to_check=args.project,
    )

    # Set up jinja to discover the templates.
    try:
        logger.info('looking for templates in %s', args.source_directory)
        loader = jinja2.FileSystemLoader(args.source_directory)
        environment = jinja2.Environment(loader=loader)
    except Exception as e:
        logger.error("initialising template environment failed: %s" % e)
        raise

    if args.skip_render:
        return 0

    # Render the templates.
    output_pages = []
    page_list_template = None
    template_files = environment.list_templates()
    for template_file in template_files:
        if (template_file.startswith('static/') or
                template_file.startswith('templates/')):
            logger.info('ignoring %s', template_file)
            continue
        if template_file.endswith('www-index.html'):
            # Process this one at the end, so we have the full list of
            # other output files.
            page_list_template = template_file
            continue
        render_template(
            environment,
            project_data,
            regular_repos,
            infra_repos,
            template_files,
            template_file,
            args.output_directory,
            args.publish,
        )
        output_pages.append(template_file)

    if page_list_template is not None:
        output_pages.sort()
        render_template(
            environment,
            project_data,
            regular_repos,
            infra_repos,
            template_files,
            page_list_template,
            args.output_directory,
            args.publish,
            extra={
                'file_list': output_pages,
            },
        )

    return 0


if __name__ == '__main__':
    sys.exit(main())
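

# Example invocation (illustrative; assumes the script is saved as
# www-generator.py and uses the flags defined in
# parse_command_line_arguments above):
#
#   python www-generator.py --verbose --series train --strict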