run the link checks in parallel

Use a ThreadPool to check the links in parallel so the check runs
faster. Results are still reported in order.

Also only get the HEAD and not the whole body of the page.

Change-Id: Ie4fbb97dd3393ac050b36057aee3b939dea58de2
Signed-off-by: Doug Hellmann <doug@doughellmann.com>
This commit is contained in:
Doug Hellmann 2017-07-24 14:51:47 -04:00
parent 3c0a3d87d1
commit f77fb23c63

View File

@@ -15,6 +15,8 @@
import argparse import argparse
import glob import glob
import logging import logging
import multiprocessing
import multiprocessing.pool
import os import os
import os.path import os.path
import re import re
@@ -98,13 +100,19 @@ def parse_command_line_arguments():
return parser.parse_args() return parser.parse_args()
def _check_url(url): def _check_url(args):
"Return True if the URL exists, False otherwise." "Return True if the URL exists, False otherwise."
url, project_name, flag, flag_val = args
try: try:
resp = requests.get(url) resp = requests.head(url)
except requests.exceptions.TooManyRedirects: except requests.exceptions.TooManyRedirects:
return False, 301 return False, 301
return (resp.status_code // 100) == 2, resp.status_code return (url,
project_name,
flag,
flag_val,
(resp.status_code // 100) == 2,
resp.status_code)
# NOTE(dhellmann): List of tuple of flag name and URL template. None # NOTE(dhellmann): List of tuple of flag name and URL template. None
@@ -180,6 +188,7 @@ def load_project_data(source_directory,
logger.error(str(error)) logger.error(str(error))
fail = True fail = True
links_to_check = []
for project in data: for project in data:
# If the project has a service-type set, ensure it matches # If the project has a service-type set, ensure it matches
# the value in the service-type-authority data.base. # the value in the service-type-authority data.base.
@@ -232,17 +241,26 @@ def load_project_data(source_directory,
if flag_val or check_all_links: if flag_val or check_all_links:
logger.info('%s:%s looking for %s', logger.info('%s:%s looking for %s',
series, project['name'], url) series, project['name'], url)
exists, status = _check_url(url) links_to_check.append(
(url, project['name'], flag, flag_val)
)
logger.info('checking %s links from %s...',
len(links_to_check), filename)
pool = multiprocessing.pool.ThreadPool()
results = pool.map(_check_url, links_to_check)
for url, project_name, flag, flag_val, exists, status in results:
if flag_val and not exists: if flag_val and not exists:
logger.error( logger.error(
'%s set for %s but %s does not exist (%s)', '%s set for %s but %s does not exist (%s)',
flag, project['name'], url, status, flag, project_name, url, status,
) )
fail = True fail = True
elif (not flag_val) and check_all_links and exists: elif (not flag_val) and check_all_links and exists:
logger.warning( logger.warning(
'%s not set for %s but %s does exist', '%s not set for %s but %s does exist',
flag, project['name'], url, flag, project_name, url,
) )
if fail: if fail: