docker-images: better registry error handling

This commit enables better error detection when checking whether an
image/tag exists in a remote registry. The current implementation
sometimes wrongly concludes that a remote tag is missing and attempts to
(re-)push the images, potentially overwriting them.

Examples:
- Registry is not reachable due to a temporary network outage
- With docker.io: we exceed the request rate limit. Original script
  looked for remote tags by enumerating all tags. This resulted in
  dozens of REST calls per image, occasionally exceeding Dockerhub's
  request limit.

Solution: add new script that exits on connectivity errors, rather than
returning false. Script requires an external tool, regctl:
  https://github.com/regclient/regclient

TESTS
====================================
- Test with missing/existing images in Harbor, DockerHub and
  AWS ECR registries, as well as various connectivity errors.
- Run retag-images.sh and make sure it still works

Closes-Bug: 2003898
Change-Id: Id9dd0c30580748c0c4c4bfbbd520d4d38bdd2ec6
Signed-off-by: Davlet Panech <davlet.panech@windriver.com>
This commit is contained in:
Davlet Panech 2023-05-05 16:57:39 -04:00
parent 5c1e7e7b75
commit 775ad108af
2 changed files with 222 additions and 46 deletions

View File

@ -0,0 +1,217 @@
# bash
#
# Copyright (c) 2018-2023 Wind River Systems, Inc.
#
# SPDX-License-Identifier: Apache-2.0
#
#
# Usage: docker_reg_tag_exists [OPTIONS...] REGISTRY/IMAGE:TAG
#
# Check whether the specified image exists in a remote registry.
#
# --max-attempts MAX_ATTEMPTS try to access the tag at most this many times
# upon detecting transient errors.
# Default: 3.
#
# --backoff-delay SECONDS sleep this many seconds between retries
# By default we sleep 5 seconds on the first retry,
# then increment the sleep time by 5 on subsequent
# retries.
#
# --request-timeout SECONDS timeout for the REST API request
# Default: 10.
#
# Returns:
# 0 (true) - if image/tag exists
# 1 (false) - if image/tag doesn't exist, or we have no permissions to access it.
#
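# Exits (terminating the calling shell, not just this function) with a
# status other than 0 or 1 on usage errors, if regctl is not installed,
# or if we can't establish a connection with the registry.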
#
# Module state shared with docker_reg_tag_exists.
#
# "declare -g" is used so that these stay global even when this file is
# sourced from inside a function; a plain "declare" there would create
# function-local variables that disappear before docker_reg_tag_exists
# is ever called.

# Non-empty once regctl has been located, so the lookup happens only once.
declare -g _DRU_REGCTL_FOUND=

# Symbolic status name -> numeric return/exit code.
#   found / not_found are returned to the caller;
#   every err_* value is used as a process exit status.
declare -gA _DRU_STATUS=(
    [found]=0
    [not_found]=1
    [err_unknown]=2
    [err_invalid_ref]=3
    [err_auth]=4
    [err_dns]=5
    [err_bad_gateway]=6
    [err_connrefused]=7
    [err_no_route]=8
    [err_tls]=9
    [err_rate_limit]=10
    # same value the timeout command uses for "timed out"
    [err_timeout]=124
    # same value as the exit status tested for an interrupted request
    [err_interrupt]=130
)
# Check whether REGISTRY/IMAGE:TAG exists in a remote registry by asking
# regctl for the image manifest.
#
# Globals:
#   _DRU_STATUS        (read)    status name -> numeric code table
#   _DRU_REGCTL_FOUND  (written) set to 1 once regctl has been located
# Arguments:
#   [--max-attempts N] [--backoff-delay SECONDS] [--request-timeout SECONDS]
#   REGISTRY/IMAGE:TAG
# Outputs:
#   diagnostics on stderr only
# Returns:
#   0 if the manifest could be fetched;
#   1 for any failure that is not recognized below as a connectivity or
#     configuration problem (e.g. missing tag, insufficient permissions).
# Exits (the whole shell this function runs in) with one of the err_* codes
# of _DRU_STATUS on usage errors, missing regctl, or connectivity errors.
function docker_reg_tag_exists {
    local image
    local max_attempts=3
    local backoff_delay=5
    local backoff_delay_increment=5
    local req_timeout=10
    local error_code=${_DRU_STATUS[err_unknown]}
    local error_msg=
    local usage="\
Usage: ${FUNCNAME[0]} OPTIONS REGISTRY/IMAGE:TAG
--max-attempts MAX_ATTEMPTS
--backoff-delay SECONDS
--request-timeout SECONDS
"

    # process command line
    # -n sets the name getopt uses in its own diagnostics; -o '' states that
    # there are no short options. Without -o, getopt takes the first
    # parameter after "--" as the short option string.
    local opts
    if ! opts=$(getopt -n "${FUNCNAME[0]}" -o '' \
            -l max-attempts:,backoff-delay:,request-timeout: -- "$@") ; then
        echo "$usage" >&2
        exit "${_DRU_STATUS[err_unknown]}"
    fi
    # getopt emits a shell-quoted argument list; eval is the documented way
    # to load it back into the positional parameters
    eval set -- "${opts}"
    while [[ "$#" -gt 0 ]] ; do
        case "$1" in
            --max-attempts)
                max_attempts="$2"
                shift 2
                ;;
            --backoff-delay)
                # an explicit delay is used as-is for every retry
                backoff_delay="$2"
                backoff_delay_increment=0
                shift 2
                ;;
            --request-timeout)
                req_timeout="$2"
                shift 2
                ;;
            --)
                shift
                break
                ;;
            *)
                echo "$usage" >&2
                exit "${_DRU_STATUS[err_unknown]}"
                ;;
        esac
    done
    if [[ "$#" -ne 1 ]] ; then
        echo "$usage" >&2
        exit "${_DRU_STATUS[err_unknown]}"
    fi
    image="$1"

    # max_attempts is used in a numeric comparison below; reject anything
    # that is not a plain non-negative integer instead of silently
    # disabling retries
    if [[ ! "$max_attempts" =~ ^[0-9]+$ ]] ; then
        echo "${FUNCNAME[0]}: invalid --max-attempts value: $max_attempts" >&2
        echo "$usage" >&2
        exit "${_DRU_STATUS[err_unknown]}"
    fi
    # force base 10 so that values such as "08" are not parsed as octal
    max_attempts=$(( 10#$max_attempts ))

    # make sure regctl exists
    if [[ ! "${_DRU_REGCTL_FOUND:-}" ]] ; then
        if ! command -v regctl >/dev/null 2>&1 ; then
            echo >&2
            echo "The regctl command was not found in your \$PATH" >&2
            echo "Please install it from here:" >&2
            echo " https://github.com/regclient/regclient/releases" >&2
            echo >&2
            exit "${_DRU_STATUS[err_unknown]}"
        fi
        _DRU_REGCTL_FOUND=1
    fi

    # The command line does not change between attempts, build it once.
    # Debug verbosity is requested because the error classification below
    # works by searching regctl's stderr for known phrases.
    local -a regctl=(timeout --foreground "${req_timeout}s"
                     regctl -v debug manifest get "$image")
    local attempt=1
    local stderr exit_status retry
    while true ; do
        # keep stderr, discard stdout; "|| exit_status=$?" records the
        # failure without tripping a caller's "set -e"
        exit_status=0
        stderr="$("${regctl[@]}" 2>&1 1>/dev/null)" || exit_status=$?
        if [[ $exit_status -eq 0 ]] ; then
            return "${_DRU_STATUS[found]}"
        fi

        # Classify the failure. Here-strings are used instead of
        # "echo | grep -q" so that an early grep exit cannot turn into a
        # pipeline failure when the caller has "pipefail" enabled.
        retry=0
        # interrupt
        if [[ $exit_status -eq 130 ]] ; then
            error_code=${_DRU_STATUS[err_interrupt]}
            error_msg=
            retry=0
        # invalid "registry/image:tag" format
        elif grep -qi "invalid reference" <<<"$stderr" ; then
            error_code=${_DRU_STATUS[err_invalid_ref]}
            error_msg="invalid image reference format"
            retry=0
        # amazon returns this when the auto-generated username/password in
        # ~/.docker/config.json is valid, but expired recently
        elif grep -qi 'authorization token has expired' <<<"$stderr" ; then
            error_code=${_DRU_STATUS[err_auth]}
            error_msg="authorization token has expired"
            retry=0
        # HTTP proxy error
        elif grep -qi "bad gateway" <<<"$stderr" ; then
            error_code=${_DRU_STATUS[err_bad_gateway]}
            error_msg="registry server returned <bad gateway>"
            retry=1
        # registry host name unresolvable
        elif grep -qi 'lookup .* no such host' <<<"$stderr" ; then
            error_msg="DNS lookup error"
            error_code=${_DRU_STATUS[err_dns]}
            retry=1
        # TCP connection refused
        elif grep -qi 'connection refused' <<<"$stderr" ; then
            error_msg="connection refused"
            error_code=${_DRU_STATUS[err_connrefused]}
            retry=1
        # IP routing error
        elif grep -qi 'no route to host' <<<"$stderr" ; then
            error_msg="no route to host"
            error_code=${_DRU_STATUS[err_no_route]}
            retry=1
        # SSL: untrusted signer
        elif grep -qi 'certificate signed by unknown authority' \
                <<<"$stderr" ; then
            error_msg="invalid SSL certificate"
            error_code=${_DRU_STATUS[err_tls]}
            retry=0
        # SSL: expired cert
        elif grep -qi 'certificate has expired' <<<"$stderr" ; then
            error_msg="expired SSL certificate"
            error_code=${_DRU_STATUS[err_tls]}
            retry=0
        # docker hub rate limit
        elif grep -qi 'rate limit exceeded' <<<"$stderr" ; then
            error_msg="request rate limit exceeded"
            error_code=${_DRU_STATUS[err_rate_limit]}
            retry=1
        # the timeout command returns 124 if we timed out
        elif [[ $exit_status -eq 124 ]] ; then
            error_msg='operation timed out'
            error_code=${_DRU_STATUS[err_timeout]}
            retry=1
        # the timeout command returns 125 if it failed itself (e.g. bad
        # duration), 126 if the command could not be invoked and 127 if it
        # could not be found. None of these say anything about the tag, so
        # they must not be reported as "not found".
        elif [[ $exit_status -ge 125 && $exit_status -le 127 ]] ; then
            error_msg="failed to execute regctl"
            error_code=${_DRU_STATUS[err_unknown]}
            retry=0
        # Some other error, such as http "404 Not Found" or "403 Forbidden".
        # There's no way to distinguish non-existent namespaces from
        # insufficient permissions (both return "permission denied"-type
        # errors). These errors likely mean "docker push" would fail as well.
        # Return false in all of these cases.
        else
            return "${_DRU_STATUS[not_found]}"
        fi

        # retry on intermittent errors
        if [[ $retry -eq 1 && $attempt -lt $max_attempts ]] ; then
            attempt=$(( attempt + 1 ))
            echo "$image: connection error," \
                "sleeping $backoff_delay second(s)" >&2
            sleep "$backoff_delay" || exit "${_DRU_STATUS[err_unknown]}"
            # Only the default delay grows between retries. Skipping the
            # arithmetic for a user-supplied delay also keeps values that
            # only sleep understands (e.g. "0.5") out of shell arithmetic.
            if [[ $backoff_delay_increment -ne 0 ]] ; then
                backoff_delay=$(( backoff_delay + backoff_delay_increment ))
            fi
            echo "$image: retrying, attempt $attempt/$max_attempts" >&2
            continue
        fi

        # out of attempts, or the error is not worth retrying:
        # show the failed command and its (indented) stderr
        echo "error: command failed: ${regctl[*]}" >&2
        sed -r 's/^/ /' <<<"$stderr" >&2
        break
    done

    if [[ "$error_msg" ]] ; then
        echo "error: $image: $error_msg" >&2
    fi
    exit "$error_code"
}

View File

@ -20,6 +20,8 @@
declare RUNCMD=
declare -a REPUSH
source $(dirname "${BASH_SOURCE[0]}")/../docker_reg_utils.sh || exit 1
function usage {
cat >&2 <<EOF
Usage:
@ -62,27 +64,6 @@ for fname in sys.argv[1:]:
' ${@}
}
# Print the tags listed at a Docker Hub tags URL, one "tag=NAME" line per
# tag, following the "next" links of paginated responses.
#
# Arguments: $1 - URL to fetch
# Outputs:   "tag=NAME" lines on stdout
#
# Each page costs one HTTP request, and the function calls itself once for
# every "next" link it receives.
function get_tags_from_docker_hub {
local url=$1
# curl: -k skips TLS certificate verification, -sS hides progress but keeps
# error messages, -L follows redirects.
# The inline Python reads the JSON response from stdin and prints
# "next=URL" when the response has a "next" entry, plus one "tag=NAME" line
# for every entry of "results" that has a "name".
curl -k -sSL -X GET ${url} | python -c '
import sys, json
y=json.loads(sys.stdin.read())
if y and y.get("next"):
print("next=%s" % y.get("next"))
if y and y.get("results"):
for res in y.get("results"):
if res.get("name"):
print("tag=%s" % res.get("name"))
' | while IFS='=' read key value; do
# split each line at the first "=": key is the part before it, value is
# the rest of the line
if [ "${key}" = "next" ]; then
# another page is available: fetch it and print its tags as well
get_tags_from_docker_hub ${value}
else
# pass every other line (i.e. "tag=NAME") through unchanged
echo "${key}=${value}"
fi
done
}
function retag_and_push_image {
local name=$1
local src_tag=$2
@ -96,32 +77,10 @@ function retag_and_push_image {
if is_in $(basename $label) ${REPUSH[@]}; then
echo "Skipping existence check for ${name}"
else
if [ "${docker_registry}" = "docker.io" ]; then
get_tags_from_docker_hub https://registry.hub.docker.com/v2/repositories/${image}/tags \
| grep -q "^tag=${new_tag}$"
if [ $? -eq 0 ]; then
# Already exists
elif docker_reg_tag_exists "$name:$new_tag" ; then
echo "Image tag exists: ${name}:${new_tag}"
return 0
fi
else
curl -k -sSL -X GET https://${docker_registry}/v2/${image}/tags/list \
| python -c '
import sys, json, re
y=json.loads(sys.stdin.read())
RC=1
if y and sys.argv[1] in [img for img in y.get("tags")]:
RC=0
sys.exit(RC)
' ${new_tag}
if [ $? -eq 0 ]; then
# Already exists
echo "Image tag exists: ${name}:${new_tag}"
return 0
fi
fi
fi
${RUNCMD} docker image pull ${name}:${src_tag}
if [ $? -ne 0 ]; then