diff --git a/ceph-client/templates/bin/_helm-tests.sh.tpl b/ceph-client/templates/bin/_helm-tests.sh.tpl
index ba4528dcc..c5cf47932 100755
--- a/ceph-client/templates/bin/_helm-tests.sh.tpl
+++ b/ceph-client/templates/bin/_helm-tests.sh.tpl
@@ -47,34 +47,91 @@ function check_osd_count() {
   fi
 }
 
+function mgr_validation() {
+  echo "#### Start: MGR validation ####"
+  mgr_dump=$(ceph mgr dump -f json-pretty)
+  echo "Checking for ${MGR_COUNT} MGRs"
+
+  mgr_avl=$(echo ${mgr_dump} | jq -r '.["available"]')
+
+  if [ "x${mgr_avl}" == "xtrue" ]; then
+    mgr_active=$(echo ${mgr_dump} | jq -r '.["active_name"]')
+    echo "Out of ${MGR_COUNT} MGRs, 1 (${mgr_active}) is active"
+
+    # Now let's check for standby managers
+    mgr_stdby_count=$(echo ${mgr_dump} | jq -r '.["standbys"]' | jq length)
+
+    # Total MGR count - 1 active = expected standby MGRs
+    expected_standbys=$(( MGR_COUNT - 1 ))
+
+    if [ $mgr_stdby_count -eq $expected_standbys ]
+    then
+      echo "Cluster has 1 active MGR and $mgr_stdby_count standby MGRs"
+    else
+      echo "Cluster standby MGRs: expected=$expected_standbys, available=$mgr_stdby_count"
+      retcode=1
+    fi
+
+  else
+    echo "No active MGR found; expected 1 of ${MGR_COUNT} MGRs to be active"
+    retcode=1
+  fi
+
+  if [ "x${retcode}" == "x1" ]
+  then
+    exit 1
+  fi
+}
+
 function pool_validation() {
+  echo "#### Start: Checking Ceph pools ####"
-  pool_dump=$(ceph osd pool ls detail -f json-pretty)
-  osd_crush_rule_dump=$(ceph osd crush rule dump -f json-pretty)
+
+  echo "From env variables, RBD pool replication count is: ${RBD}"
+
+  # Assuming all pools have the same replication count as RBD
+  # If the RBD replication count is greater than 1, EXPECTED_POOLMINSIZE should be 1 less than the replication count
+  # If the RBD replication count is not greater than 1, EXPECTED_POOLMINSIZE should be 1
+
+  if [ ${RBD} -gt 1 ]; then
+    EXPECTED_POOLMINSIZE=$[${RBD}-1]
+  else
+    EXPECTED_POOLMINSIZE=1
+  fi
+
+  echo "EXPECTED_POOLMINSIZE: ${EXPECTED_POOLMINSIZE}"
 
   expectedCrushRuleId=""
-  nrules=$(echo ${osd_crush_rule_dump} | jq length)
+  nrules=$(echo ${OSD_CRUSH_RULE_DUMP} | jq length)
   c=$[nrules-1]
   for n in $(seq 0 ${c})
   do
-    name=$(echo ${osd_crush_rule_dump} | jq -r .[${n}].rule_name)
+    osd_crush_rule_obj=$(echo ${OSD_CRUSH_RULE_DUMP} | jq -r .[${n}])
+
+    name=$(echo ${osd_crush_rule_obj} | jq -r .rule_name)
+    echo "Expected crush rule: ${EXPECTED_CRUSHRULE}, found crush rule: ${name}"
+
     if [ "x${EXPECTED_CRUSHRULE}" == "x${name}" ]; then
-      expectedCrushRuleId=$(echo ${osd_crush_rule_dump} | jq .[${n}].rule_id)
+      expectedCrushRuleId=$(echo ${osd_crush_rule_obj} | jq .rule_id)
       echo "Checking against rule: id: ${expectedCrushRuleId}, name:${name}"
+    else
+      echo "Crush rule ${name} does not match"
     fi
   done
   echo "Checking cluster for size:${RBD}, min_size:${EXPECTED_POOLMINSIZE}, crush_rule:${EXPECTED_CRUSHRULE}, crush_rule_id:${expectedCrushRuleId}"
 
-  npools=$(echo ${pool_dump} | jq length)
+  npools=$(echo ${OSD_POOLS_DETAILS} | jq length)
   i=$[npools - 1]
   for n in $(seq 0 ${i})
   do
-    size=$(echo ${pool_dump} | jq -r ".[${n}][\"size\"]")
-    min_size=$(echo ${pool_dump} | jq -r ".[${n}][\"min_size\"]")
-    pg_num=$(echo ${pool_dump} | jq -r ".[${n}][\"pg_num\"]")
-    pg_placement_num=$(echo ${pool_dump} | jq -r ".[${n}][\"pg_placement_num\"]")
-    crush_rule=$(echo ${pool_dump} | jq -r ".[${n}][\"crush_rule\"]")
-    name=$(echo ${pool_dump} | jq -r ".[${n}][\"pool_name\"]")
+    pool_obj=$(echo ${OSD_POOLS_DETAILS} | jq -r ".[${n}]")
+
+    size=$(echo ${pool_obj} | jq -r .size)
+    min_size=$(echo ${pool_obj} | jq -r .min_size)
+    pg_num=$(echo ${pool_obj} | jq -r .pg_num)
+    pg_placement_num=$(echo ${pool_obj} | jq -r .pg_placement_num)
+    crush_rule=$(echo ${pool_obj} | jq -r .crush_rule)
+    name=$(echo ${pool_obj} | jq -r .pool_name)
     if [ "x${size}" != "x${RBD}" ] || [ "x${min_size}" != "x${EXPECTED_POOLMINSIZE}" ] \
     || [ "x${pg_num}" != "x${pg_placement_num}" ] || [ "x${crush_rule}" != "x${expectedCrushRuleId}" ]; then
@@ -88,30 +145,33 @@ function pool_validation() {
 
 function pool_failuredomain_validation() {
   echo "#### Start: Checking Pools are configured with specific failure domain ####"
-  osd_pool_ls_details=$(ceph osd pool ls detail -f json-pretty)
-  osd_crush_rule_dump=$(ceph osd crush rule dump -f json-pretty)
   expectedCrushRuleId=""
-  nrules=$(echo ${osd_crush_rule_dump} | jq length)
+  nrules=$(echo ${OSD_CRUSH_RULE_DUMP} | jq length)
   c=$[nrules-1]
   for n in $(seq 0 ${c})
   do
-    name=$(echo ${osd_crush_rule_dump} | jq -r .[${n}].rule_name)
+    osd_crush_rule_obj=$(echo ${OSD_CRUSH_RULE_DUMP} | jq -r .[${n}])
+
+    name=$(echo ${osd_crush_rule_obj} | jq -r .rule_name)
     if [ "x${EXPECTED_CRUSHRULE}" == "x${name}" ]; then
-      expectedCrushRuleId=$(echo ${osd_crush_rule_dump} | jq .[${n}].rule_id)
+      expectedCrushRuleId=$(echo ${osd_crush_rule_obj} | jq .rule_id)
       echo "Checking against rule: id: ${expectedCrushRuleId}, name:${name}"
     fi
   done
   echo "Checking OSD pools are configured with Crush rule name:${EXPECTED_CRUSHRULE}, id:${expectedCrushRuleId}"
 
-  npools=$(echo ${osd_pool_ls_details} | jq length)
+  npools=$(echo ${OSD_POOLS_DETAILS} | jq length)
   i=$[npools-1]
   for p in $(seq 0 ${i})
   do
-    pool_crush_rule_id=$(echo $osd_pool_ls_details | jq -r ".[${p}][\"crush_rule\"]")
-    pool_name=$(echo $osd_pool_ls_details | jq -r ".[${p}][\"pool_name\"]")
+    pool_obj=$(echo ${OSD_POOLS_DETAILS} | jq -r ".[${p}]")
+
+    pool_crush_rule_id=$(echo $pool_obj | jq -r .crush_rule)
+    pool_name=$(echo $pool_obj | jq -r .pool_name)
+
     if [ "x${pool_crush_rule_id}" == "x${expectedCrushRuleId}" ]; then
       echo "--> Info: Pool ${pool_name} is configured with the correct rule ${pool_crush_rule_id}"
     else
@@ -123,59 +183,37 @@ function pool_failuredomain_validation() {
 
 function pg_validation() {
   echo "#### Start: Checking placement groups active+clean ####"
-  osd_pool_ls_details=$(ceph pg stat -f json-pretty)
-  num_pgs=$(echo ${osd_pool_ls_details} | jq -r .num_pgs)
-  npoolls=$(echo ${osd_pool_ls_details} | jq -r .num_pg_by_state | jq length)
-  i=${npoolls-1}
+
+  num_pgs=$(echo ${PG_STAT} | jq -r .num_pgs)
+  npoolls=$(echo ${PG_STAT} | jq -r .num_pg_by_state | jq length)
+  i=$[npoolls-1]
   for n in $(seq 0 ${i})
   do
-    pg_state=$(echo ${osd_pool_ls_details} | jq -r .num_pg_by_state[${n}].name)
+    pg_state=$(echo ${PG_STAT} | jq -r .num_pg_by_state[${n}].name)
     if [ "xactive+clean" == "x${pg_state}" ]; then
-      active_clean_pg_num=$(echo ${osd_pool_ls_details} | jq -r .num_pg_by_state[${n}].num)
+      active_clean_pg_num=$(echo ${PG_STAT} | jq -r .num_pg_by_state[${n}].num)
       if [ $num_pgs -eq $active_clean_pg_num ]; then
        echo "Success: All PGs configured (${num_pgs}) are in active+clean status"
      else
        echo "Error: All PGs configured (${num_pgs}) are NOT in active+clean status"
        exit 1
      fi
+    else
+      echo "Error: PG state not in active+clean status"
+      exit 1
     fi
   done
 }
 
-function mgr_validation() {
-  echo "#### Start: MGR validation ####"
-  mgr_dump=$(ceph mgr dump -f json-pretty)
-  echo "Checking for ${MGR_COUNT} MGRs"
-
-  mgr_avl=$(echo ${mgr_dump} | jq -r '.["available"]')
-
-  if [ "x${mgr_avl}" == "xtrue" ]; then
-    mgr_active=$(echo ${mgr_dump} | jq -r '.["active_name"]')
-
-    # Now test to check is we have at least one valid standby
-    mgr_stdby_count=$(echo ${mgr_dump} | jq -r '.["standbys"]' | jq length)
-    if [ $mgr_stdby_count -ge 1 ]
-    then
-      echo "Active manager ${mgr_active} is up and running. ${mgr_stdby_count} standby managers available"
-    else
-      echo "No standby Manager available"
-      retcode=1
-    fi
-  else
-    echo "Manager is not active"
-    retcode=1
-  fi
-
-  if [ "x${retcode}" == "x1" ]
-  then
-    exit 1
-  fi
-}
-
 check_cluster_status
 check_osd_count
 mgr_validation
+
+OSD_POOLS_DETAILS=$(ceph osd pool ls detail -f json-pretty)
+OSD_CRUSH_RULE_DUMP=$(ceph osd crush rule dump -f json-pretty)
+PG_STAT=$(ceph pg stat -f json-pretty)
+
 pg_validation
 pool_validation
 pool_failuredomain_validation
diff --git a/ceph-client/templates/pod-helm-tests.yaml b/ceph-client/templates/pod-helm-tests.yaml
index ead219086..05999bd5a 100644
--- a/ceph-client/templates/pod-helm-tests.yaml
+++ b/ceph-client/templates/pod-helm-tests.yaml
@@ -14,7 +14,7 @@
 See the License for the specific language governing permissions and
 limitations under the License.
 */}}
-{{- if .Values.manifests.pod_test }}
+{{- if .Values.manifests.helm_tests }}
 {{- $envAll := . }}
 {{- $serviceAccountName := printf "%s-%s" $envAll.Release.Name "test" }}
 {{ tuple $envAll "tests" $serviceAccountName | include "helm-toolkit.snippets.kubernetes_pod_rbac_serviceaccount" }}
@@ -45,12 +45,8 @@ spec:
           value: {{ .Values.conf.pool.target.osd | quote }}
         - name: EXPECTED_CRUSHRULE
           value: {{ .Values.conf.pool.default.crush_rule | default "replicated_rule" | quote }}
-        - name: EXPECTED_POOLMINSIZE
-          value: "2"
         - name: MGR_COUNT
           value: {{ .Values.pod.replicas.mgr | default "1" | quote }}
-        - name: SPECS
-          value: {{ include "helm-toolkit.utils.joinListWithComma" .Values.conf.pool.spec }}
 {{- range $pool := .Values.conf.pool.spec -}}
 {{- with $pool }}
         - name: {{ .name | upper | replace "." "_" }}
"_" }} diff --git a/ceph-client/values.yaml b/ceph-client/values.yaml index 932a2a57c..e5c39b65b 100644 --- a/ceph-client/values.yaml +++ b/ceph-client/values.yaml @@ -410,6 +410,8 @@ dependencies: services: - endpoint: internal service: ceph_mon + - endpoint: internal + service: ceph_mgr bootstrap: enabled: false @@ -511,5 +513,5 @@ manifests: job_image_repo_sync: true job_rbd_pool: true service_mgr: true - pod_test: false + helm_tests: true cronjob_checkPGs: true diff --git a/tools/deployment/multinode/115-radosgw-osh-infra.sh b/tools/deployment/multinode/115-radosgw-osh-infra.sh index 441128364..4f863950b 100755 --- a/tools/deployment/multinode/115-radosgw-osh-infra.sh +++ b/tools/deployment/multinode/115-radosgw-osh-infra.sh @@ -71,4 +71,4 @@ helm upgrade --install radosgw-osh-infra ./ceph-rgw \ #NOTE: Validate Deployment info helm status radosgw-osh-infra -helm test radosgw-osh-infra +helm test radosgw-osh-infra --timeout 900 diff --git a/tools/deployment/openstack-support/100-ceph-radosgateway.sh b/tools/deployment/openstack-support/100-ceph-radosgateway.sh index 0606fc424..91e5d86b3 100755 --- a/tools/deployment/openstack-support/100-ceph-radosgateway.sh +++ b/tools/deployment/openstack-support/100-ceph-radosgateway.sh @@ -62,4 +62,4 @@ sleep 60 #NOTE(portdirect): Wait for ingress controller to update rules and rest openstack service list openstack endpoint list -helm test radosgw-openstack +helm test radosgw-openstack --timeout 900 diff --git a/tools/deployment/osh-infra-logging/020-ceph.sh b/tools/deployment/osh-infra-logging/020-ceph.sh index 91533cf6d..5ef73e30d 100755 --- a/tools/deployment/osh-infra-logging/020-ceph.sh +++ b/tools/deployment/osh-infra-logging/020-ceph.sh @@ -207,3 +207,5 @@ for CHART in ceph-mon ceph-osd ceph-client ceph-provisioners; do --no-headers | awk '{ print $1; exit }') kubectl exec -n ceph ${MON_POD} -- ceph -s done +helm test ceph-osd --timeout 900 +helm test ceph-client --timeout 900 diff --git a/tools/deployment/tenant-ceph/030-ceph.sh b/tools/deployment/tenant-ceph/030-ceph.sh index 8a99bd0de..81a76105f 100755 --- a/tools/deployment/tenant-ceph/030-ceph.sh +++ b/tools/deployment/tenant-ceph/030-ceph.sh @@ -131,3 +131,5 @@ for CHART in ceph-mon ceph-osd ceph-client ceph-provisioners; do --no-headers | awk '{ print $1; exit }') kubectl exec -n ceph ${MON_POD} -- ceph -s done +helm test ceph-osd --timeout 900 +helm test ceph-client --timeout 900 diff --git a/tools/deployment/tenant-ceph/040-tenant-ceph.sh b/tools/deployment/tenant-ceph/040-tenant-ceph.sh index e485b2464..5a95408a6 100755 --- a/tools/deployment/tenant-ceph/040-tenant-ceph.sh +++ b/tools/deployment/tenant-ceph/040-tenant-ceph.sh @@ -158,3 +158,6 @@ for CHART in ceph-mon ceph-osd ceph-client; do --no-headers | awk '{ print $1; exit }') kubectl exec -n tenant-ceph ${MON_POD} -- ceph -s done + +helm test tenant-ceph-osd --timeout 900 +helm test ceph-client --timeout 900 diff --git a/tools/deployment/tenant-ceph/060-radosgw-openstack.sh b/tools/deployment/tenant-ceph/060-radosgw-openstack.sh index 3ca169261..89e769fd4 100755 --- a/tools/deployment/tenant-ceph/060-radosgw-openstack.sh +++ b/tools/deployment/tenant-ceph/060-radosgw-openstack.sh @@ -71,4 +71,4 @@ helm upgrade --install radosgw-openstack ./ceph-rgw \ #NOTE: Validate Deployment info helm status radosgw-openstack -helm test radosgw-openstack +helm test radosgw-openstack --timeout 900