Improve tolerance for failed nodes

Raise SSH connection attempts to 20 (ConnectionAttempts=20)
Disable any_errors_fatal

Depends on https://github.com/kubernetes-incubator/kargo/pull/1201/

Change-Id: I0d20c9e0e80f87561c50957fdff1e576ec89646c
This commit is contained in:
Matthew Mosesohn 2017-04-04 13:21:30 +03:00
parent 1d5dd1127e
commit 05f11e6348
2 changed files with 5 additions and 1 deletions

View File

@ -142,7 +142,8 @@ function with_ansible {
until admin_node_command \
ANSIBLE_CONFIG=$ADMIN_WORKSPACE/utils/kargo/ansible.cfg \
ansible-playbook \
--ssh-extra-args "-A\ -o\ StrictHostKeyChecking=no" -u ${ADMIN_USER} -b \
--ssh-extra-args "-A\ -o\ StrictHostKeyChecking=no\ -o\ ConnectionAttempts=20" \
-u ${ADMIN_USER} -b \
--become-user=root -i $ADMIN_WORKSPACE/inventory/inventory.cfg \
--forks=$ANSIBLE_FORKS --timeout $ANSIBLE_TIMEOUT $DEFAULT_OPTS \
-e ansible_ssh_user=${ADMIN_USER} \

View File

@ -50,6 +50,9 @@ upstream_dns_servers:
# has some bugs when DHCP is enabled.
resolvconf_mode: host_resolvconf
# Continue deploying other hosts even if one failed
any_errors_fatal: false
# Tweak kubelet monitoring parameters to avoid node/endpoint flapping
kubelet_status_update_frequency: "20s"
kube_controller_node_monitor_grace_period: "2m"