Merge "Updated resiliency gate"

This commit is contained in:
Zuul 2020-07-22 17:34:33 +00:00 committed by Gerrit Code Review
commit 5b50146f02
11 changed files with 91 additions and 20 deletions

View File

@ -43,6 +43,25 @@ debug it, e.g.:
./tools/g2/bin/ssh.sh n0
Running Resiliency Tests Behind a Corporate Proxy
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
If your development environment is behind a corporate proxy, you will need to
update the following files with your environment's proxy information, DNS
servers, and possibly your internal NTP servers in order to deploy Airship
(see the example sketch after this list):
* `charts/coredns/values.yaml`: Update the upstream coredns nameserver IPs
to your internal DNS addresses.
* `examples/basic/KubernetesNetwork.yaml`: Since the resiliency manifest uses
the examples/basic environment configuration, you will need to update the
Kubernetes network configuration in this folder. Update the upstream
nameserver IPs to your internal DNS addresses, and add the http(s) proxy URL
and the additional_no_proxy list. If your environment requires it, also update
the NTP server list to your internal NTP server addresses for more reliable
time sync.
* `tools/g2/templates/network-config.sub`: Update the upstream nameserver
IPs to your internal DNS addresses.
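For example, the DNS portion of these edits could be scripted roughly as
follows. This is only an illustration, not part of the change; the 8.8.8.8 and
8.8.4.4 defaults and the 10.0.0.53 internal resolver are assumptions, so check
the actual values in your checkout. The proxy URL, additional_no_proxy list,
and NTP servers still need to be edited by hand.

# Illustrative sketch only: swap assumed public upstream resolvers for an
# assumed internal resolver in the three files listed above.
INTERNAL_DNS="10.0.0.53"
for f in charts/coredns/values.yaml \
         examples/basic/KubernetesNetwork.yaml \
         tools/g2/templates/network-config.sub; do
  sed -i "s/8\.8\.8\.8/${INTERNAL_DNS}/g; s/8\.8\.4\.4/${INTERNAL_DNS}/g" "$f"
done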
Bootstrapping
-------------

View File

@ -74,6 +74,14 @@ export http_proxy={{ config['KubernetesNetwork:proxy.url'] | default('', true) }
export https_proxy={{ config['KubernetesNetwork:proxy.url'] | default('', true) }}
export no_proxy={{ config.get(kind='KubernetesNetwork') | fill_no_proxy }}
# Configure apt proxy
if [[ -n "${http_proxy}" ]]; then
log "Configuring Apt Proxy"
cat << EOF | sudo tee /etc/apt/apt.conf.d/50proxyconf
Acquire::https::proxy "${https_proxy}";
Acquire::http::proxy "${http_proxy}";
EOF
fi
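# Optional sanity check (an assumption, not part of this change): the apt
# proxy written above can be confirmed on the node with, e.g.:
#   apt-config dump | grep -i proxy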
# Install system packages
#
@ -139,5 +147,13 @@ fi
if systemctl -q is-enabled containerd > /dev/null 2>&1; then
systemctl restart containerd || true
fi
# Pull the hyperkube image prior to restarting kubelet. This is needed for a
# more reliable image pull in environments with slow network connectivity, and
# avoids image pull timeouts and retries by kubelet.
# The || true lets the deployment continue even if $IMAGE_HYPERKUBE is not
# defined in the environment and the image pull does not happen.
docker image pull "${IMAGE_HYPERKUBE}" || true
systemctl enable kubelet
systemctl restart kubelet
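If the pre-pull needs to be verified, a check along these lines could be added
before the kubelet restart; it is only a sketch, assuming the script's
existing log helper is in scope, and is not part of the actual change:

# Assumed extra check: warn if the hyperkube image is still missing locally
# after the pull attempt above.
if [[ -n "${IMAGE_HYPERKUBE:-}" ]]; then
  docker image inspect "${IMAGE_HYPERKUBE}" > /dev/null 2>&1 \
    || log "Warning: ${IMAGE_HYPERKUBE} not present after pull attempt"
fi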

View File

@ -222,7 +222,7 @@ function validate_kubectl_logs {
NAMESPACE=default
POD_NAME=log-test-${NODE}-$(date +%s)
cat <<EOPOD | kubectl --namespace $NAMESPACE apply -f -
cat <<EOPOD | kubectl --namespace $NAMESPACE --timeout 100s apply -f -
---
apiVersion: v1
kind: Pod
@ -244,6 +244,7 @@ EOPOD
wait_for_node_ready $NODE 300
wait_for_pod_termination $NAMESPACE $POD_NAME
sleep 5
ACTUAL_LOGS=$(kubectl --namespace $NAMESPACE logs $POD_NAME)
if [ "x$ACTUAL_LOGS" != "xEXPECTED RESULT" ]; then
log Got unexpected logs:

View File

@ -1,6 +1,6 @@
export TEMP_DIR=${TEMP_DIR:-$(mktemp -d)}
export BASE_IMAGE_SIZE=${BASE_IMAGE_SIZE:-68719476736}
export BASE_IMAGE_URL=${BASE_IMAGE_URL:-https://cloud-images.ubuntu.com/releases/16.04/release/ubuntu-16.04-server-cloudimg-amd64-disk1.img}
export BASE_IMAGE_SIZE=${BASE_IMAGE_SIZE:-344784896}
export BASE_IMAGE_URL=${BASE_IMAGE_URL:-https://cloud-images.ubuntu.com/releases/bionic/release/ubuntu-18.04-server-cloudimg-amd64.img}
export IMAGE_PROMENADE=${IMAGE_PROMENADE:-quay.io/airshipit/promenade:master}
export IMAGE_PROMENADE_DISTRO=${IMAGE_PROMENADE_DISTRO:-ubuntu_bionic}
export IMAGE_HYPERKUBE=${IMAGE_HYPERKUBE:-gcr.io/google_containers/hyperkube-amd64:v1.17.3}
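Purely as a side note (not something the gate runs), the pinned
BASE_IMAGE_SIZE can be sanity-checked against the published image by comparing
it with the Content-Length the mirror reports, assuming the exports above have
already been sourced into the current shell:

# Compare the pinned size with the size reported by the image mirror.
remote_size=$(curl -sIL "${BASE_IMAGE_URL}" \
  | awk 'tolower($1) == "content-length:" { size = $2 } END { print size + 0 }')
echo "pinned=${BASE_IMAGE_SIZE} remote=${remote_size}"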

View File

@ -14,3 +14,17 @@ etcdctl_member_list() {
etcdctl_cmd "${CLUSTER}" "${VM}" member list -w json | jq -r '.members[].name' | sort
}
etcdctl_member_remove() {
CLUSTER=${1}
VM=${2}
NODE=${3}
shift 3
MEMBER_ID=$(etcdctl_cmd "${CLUSTER}" "${VM}" member list | awk -F', ' "/${NODE}/ "'{ print $1}')
if [[ -n $MEMBER_ID ]] ; then
etcdctl_cmd "${CLUSTER}" "${VM}" member remove "$MEMBER_ID"
else
log No members found in cluster "$CLUSTER" for node "$NODE"
fi
}
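For context, the teardown stage later in this change calls the helper with the
cluster name, the VM to run etcdctl through, and the node whose member should
be removed, for example:

# Drop node n0's member from the kubernetes etcd cluster, running etcdctl via
# VM n1 (mirrors the loop added to teardown-nodes.sh below).
etcdctl_member_remove kubernetes n1 n0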

View File

@ -13,13 +13,24 @@ validate_etcd_membership() {
EXPECTED_MEMBERS="${*}"
# NOTE(mark-burnett): Wait a moment for disks in test environment to settle.
sleep 10
sleep 60
log Validating "${CLUSTER}" etcd membership via "${VM}" for members: "${EXPECTED_MEMBERS[@]}"
FOUND_MEMBERS=$(etcdctl_member_list "${CLUSTER}" "${VM}" | tr '\n' ' ' | sed 's/ $//')
if [[ "x${EXPECTED_MEMBERS}" != "x${FOUND_MEMBERS}" ]]; then
log Etcd membership check failed for cluster "${CLUSTER}"
local retries=25
for ((n=0;n<=$retries;n++)); do
FOUND_MEMBERS=$(etcdctl_member_list "${CLUSTER}" "${VM}" | tr '\n' ' ' | sed 's/ $//')
log "Found \"${FOUND_MEMBERS}\", expected \"${EXPECTED_MEMBERS}\""
exit 1
fi
if [[ "x${EXPECTED_MEMBERS}" != "x${FOUND_MEMBERS}" ]]; then
log Etcd membership check failed for cluster "${CLUSTER}" on attempt "$n".
if [[ "$n" == "$retries" ]]; then
log Etcd membership check failed for cluster "${CLUSTER}" after "$n" retries. Exiting.
exit 1
fi
sleep 30
else
log Etcd membership check succeeded for cluster "${CLUSTER}" on attempt "${n}"
break
fi
done
}
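The move-labels stage below exercises this retry loop by passing the cluster,
the VM to query through, and the expected member names, for example:

# Expect members n0 n1 n2 n3 in the kubernetes etcd cluster, queried via n1.
# With the change above, the check retries up to 25 times, sleeping 30s
# between failed attempts instead of exiting immediately.
validate_etcd_membership kubernetes n1 n0 n1 n2 n3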

View File

@ -120,6 +120,8 @@
"name": "Teardown Genesis",
"script": "teardown-nodes.sh",
"arguments": [
"-e", "kubernetes",
"-e", "calico",
"-v", "n1",
"-n", "n0",
"-r"
@ -160,7 +162,7 @@
}
],
"vm": {
"memory": 3072,
"memory": 4096,
"names": [
"n0",
"n1",

View File

@ -7,7 +7,9 @@ source "${GATE_UTILS}"
rsync_cmd "${TEMP_DIR}/scripts"/*genesis* "${GENESIS_NAME}:/root/promenade/"
set -o pipefail
ssh_cmd "${GENESIS_NAME}" env "PROMENADE_ENCRYPTION_KEY=${PROMENADE_ENCRYPTION_KEY}" /root/promenade/genesis.sh 2>&1 | tee -a "${LOG_FILE}"
ssh_cmd "${GENESIS_NAME}" env "IMAGE_HYPERKUBE=${IMAGE_HYPERKUBE}" \
env "PROMENADE_ENCRYPTION_KEY=${PROMENADE_ENCRYPTION_KEY}" \
/root/promenade/genesis.sh 2>&1 | tee -a "${LOG_FILE}"
ssh_cmd "${GENESIS_NAME}" /root/promenade/validate-genesis.sh 2>&1 | tee -a "${LOG_FILE}"
set +o pipefail

View File

@ -52,7 +52,7 @@ mkdir -p "${SCRIPT_DIR}"
for NAME in "${NODES[@]}"; do
log Building join script for node "${NAME}"
CURL_ARGS=("--fail" "--max-time" "300" "--retry" "16" "--retry-delay" "15")
CURL_ARGS=("-v" "--max-time" "600" "--retry" "20" "--retry-delay" "15" "--connect-timeout" "30" "--progress-bar")
if [[ $GET_KEYSTONE_TOKEN == 1 ]]; then
TOKEN="$(os_ks_get_token "${VIA}")"
if [[ -z $TOKEN ]]; then
@ -67,7 +67,7 @@ for NAME in "${NODES[@]}"; do
promenade_health_check "${VIA}"
log "Validating documents"
ssh_cmd "${VIA}" curl -v "${CURL_ARGS[@]}" -X POST -H "Content-Type: application/json" -d "$(promenade_render_validate_body "${USE_DECKHAND}" "${DECKHAND_REVISION}")" "$(promenade_render_validate_url)"
ssh_cmd "${VIA}" curl "${CURL_ARGS[@]}" -X POST -H "Content-Type: application/json" -d "$(promenade_render_validate_body "${USE_DECKHAND}" "${DECKHAND_REVISION}")" "$(promenade_render_validate_url)"
JOIN_CURL_URL="$(promenade_render_curl_url "${NAME}" "${USE_DECKHAND}" "${DECKHAND_REVISION}" "${LABELS[@]}")"
log "Fetching join script via: ${JOIN_CURL_URL}"

View File

@ -6,15 +6,15 @@ source "${GATE_UTILS}"
VIA="n1"
CURL_ARGS=("--fail" "--max-time" "300" "--retry" "16" "--retry-delay" "15")
CURL_ARGS=("-v" "--max-time" "600" "--retry" "20" "--retry-delay" "15" "--connect-timeout" "30" "--progress-bar")
log Adding labels to node n0
log "Adding labels to node n0"
JSON="{\"calico-etcd\": \"enabled\", \"coredns\": \"enabled\", \"kubernetes-apiserver\": \"enabled\", \"kubernetes-controller-manager\": \"enabled\", \"kubernetes-etcd\": \"enabled\", \"kubernetes-scheduler\": \"enabled\", \"ucp-control-plane\": \"enabled\"}"
ssh_cmd "${VIA}" curl -v "${CURL_ARGS[@]}" -X PUT -H "Content-Type: application/json" -d "${JSON}" "$(promenade_put_labels_url n0)"
ssh_cmd "${VIA}" curl "${CURL_ARGS[@]}" -X PUT -H "Content-Type: application/json" -d "${JSON}" "$(promenade_put_labels_url n0)"
# Need to wait
sleep 60
sleep 120
validate_etcd_membership kubernetes n1 n0 n1 n2 n3
validate_etcd_membership calico n1 n0 n1 n2 n3
@ -22,10 +22,10 @@ validate_etcd_membership calico n1 n0 n1 n2 n3
log Removing labels from node n2
JSON="{\"coredns\": \"enabled\", \"ucp-control-plane\": \"enabled\"}"
ssh_cmd "${VIA}" curl -v "${CURL_ARGS[@]}" -X PUT -H "Content-Type: application/json" -d "${JSON}" "$(promenade_put_labels_url n2)"
ssh_cmd "${VIA}" curl "${CURL_ARGS[@]}" -X PUT -H "Content-Type: application/json" -d "${JSON}" "$(promenade_put_labels_url n2)"
# Need to wait
sleep 60
sleep 120
validate_cluster n1

View File

@ -8,8 +8,11 @@ declare -a NODES
RECREATE=0
while getopts "n:rv:" opt; do
while getopts "e:n:rv:" opt; do
case "${opt}" in
e)
ETCD_CLUSTERS+=("${OPTARG}")
;;
n)
NODES+=("${OPTARG}")
;;
@ -35,6 +38,9 @@ fi
for NAME in "${NODES[@]}"; do
log Tearing down node "${NAME}"
promenade_teardown_node "${NAME}" "${VIA}"
for ETCD_CLUSTER in "${ETCD_CLUSTERS[@]}"; do
etcdctl_member_remove "${ETCD_CLUSTER}" "${VIA}" "${NAME}"
done
vm_clean "${NAME}"
if [[ ${RECREATE} == "1" ]]; then
vm_create "${NAME}"
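Putting the new flag together with the gate manifest above, the "Teardown
Genesis" stage now effectively runs the script as follows (invocation shown
from the script's own directory):

# Remove n0's members from the kubernetes and calico etcd clusters (via n1),
# then tear down and recreate the n0 VM.
./teardown-nodes.sh -e kubernetes -e calico -v n1 -n n0 -r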