From d2afe9b8e7279cc70cc8b74308c45699adfe14b8 Mon Sep 17 00:00:00 2001 From: Connor Chamberlain Date: Fri, 25 Feb 2022 08:33:10 -0700 Subject: [PATCH] Backport pg-repair action (pacific) This action automatically repairs inconsistent placement groups which are caused by read errors. PGs are repaired using `ceph pg repair `. Action is only taken if on of a PG's shards has a "read_error", and no action will be taken if any additional errors are found. No action will be taken if multiple "read_errors" are found. This action is intended to be safe to run in all contexts. Closes-Bug: #1923218 (cherry-picked from commit a1cffc669322a2fe1c709e09a717ff9f78ab5680) Change-Id: I367f5e9569ed688ba712a4ff8cacd0b05208366a --- actions.yaml | 2 + actions/pg-repair | 1 + actions/pg_repair.py | 202 ++++++++++++++++++++ bindep.txt | 2 + tox.ini | 2 +- unit_tests/test_action_pg_repair.py | 280 ++++++++++++++++++++++++++++ 6 files changed, 488 insertions(+), 1 deletion(-) create mode 120000 actions/pg-repair create mode 100755 actions/pg_repair.py create mode 100644 unit_tests/test_action_pg_repair.py diff --git a/actions.yaml b/actions.yaml index c1ca254c..fcea02dd 100644 --- a/actions.yaml +++ b/actions.yaml @@ -405,3 +405,5 @@ get-quorum-status: - text - json description: Specify output format (text|json). +pg-repair: + description: "Repair inconsistent placement groups, if safe to do so." diff --git a/actions/pg-repair b/actions/pg-repair new file mode 120000 index 00000000..e60c9660 --- /dev/null +++ b/actions/pg-repair @@ -0,0 +1 @@ +pg_repair.py \ No newline at end of file diff --git a/actions/pg_repair.py b/actions/pg_repair.py new file mode 100755 index 00000000..6dd17ecc --- /dev/null +++ b/actions/pg_repair.py @@ -0,0 +1,202 @@ +#!/usr/bin/env python3 +# +# Copyright 2022 Canonical Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import json +import os +import sys +from subprocess import check_output, CalledProcessError + +_path = os.path.dirname(os.path.realpath(__file__)) +_hooks = os.path.abspath(os.path.join(_path, "../hooks")) +_lib = os.path.abspath(os.path.join(_path, "../lib")) + + +def _add_path(path): + if path not in sys.path: + sys.path.insert(1, path) + + +_add_path(_hooks) +_add_path(_lib) + + +from charmhelpers.core.hookenv import ( + log, + function_fail, + function_set, +) +from charms_ceph.utils import list_pools + + +def get_rados_inconsistent_objs(pg): + """Get all inconsistent objects for a given placement group. + + :param pg: Name of a placement group + :type pg: str + :return: list of inconsistent objects + :rtype: list[str] + """ + return json.loads( + check_output( + ["rados", "list-inconsistent-obj", pg, "--format=json-pretty"] + ).decode("UTF-8") + ) + + +def get_rados_inconsistent_pgs(pool): + """Get all inconsistent placement groups for a given pool. + + :param pool: Name of a Ceph pool + :type pool: str + :returns: list of inconsistent placement group IDs + :rtype: list[str] + """ + return json.loads( + check_output(["rados", "list-inconsistent-pg", pool]).decode("UTF-8") + ) + + +def get_inconsistent_pgs(ceph_pools): + """Get all inconsistent placement groups for a list of pools. + + :param ceph_pools: List of names of Ceph pools + :type ceph_pools: list[str] + :returns: list of inconsistent placement group IDs as a set + :rtype: set[str] + """ + inconsistent_pgs = set() + for pool in ceph_pools: + inconsistent_pgs.update(get_rados_inconsistent_pgs(pool)) + return inconsistent_pgs + + +def get_safe_pg_repairs(inconsistent_pgs): + """Filters inconsistent placement groups for ones that are safe to repair. + + :param inconsistent_pgs: List of inconsistent placement groups + :type inconsistent_pgs: list[str] + :returns: list of safely repairable placement groups as a set + :rtype: set[str] + """ + return {pg for pg in inconsistent_pgs if is_pg_safe_to_repair(pg)} + + +def is_pg_safe_to_repair(pg): + """Determines if a placement group is safe to repair. + + :param pg: Name of an inconsistent placement group + :type pg: str + :returns: placement group is safe to repair + :rtype: bool + """ + # Additional tests for known safe cases can be added here. + return has_read_error_only(pg) + + +def has_read_error_only(pg): + """Determines if an inconsistent placement group is caused by a read error. + Returns False if no read errors are found, or if any errors other than read + errors are found. + + :param pg: ID of an inconsistent placement group + :type pg: str + :returns: placement group is safe to repair + :rtype: bool + """ + rados_inconsistent_objs = get_rados_inconsistent_objs(pg) + read_error_found = False + for inconsistent in rados_inconsistent_objs.get("inconsistents", []): + for shard in inconsistent.get("shards", []): + errors = shard.get("errors", []) + if errors == ["read_error"]: + if read_error_found: + return False + read_error_found = True + continue + elif errors: + # Error other than "read_error" detected + return False + return read_error_found + + +def perform_pg_repairs(pgs): + """Runs `ceph pg repair` on a group of placement groups. + All placement groups provided should be confirmed as safe prior to using + this method. + + :param pgs: List of safe-to-repair placement groups + :type pg: list[str] + """ + for pg in pgs: + log("Repairing ceph placement group {}".format(pg)) + check_output(["ceph", "pg", "repair", pg]) + + +def pg_repair(): + """Repair all inconsistent placement groups caused by read errors.""" + ceph_pools = list_pools() + if not ceph_pools: + msg = "No Ceph pools found." + log(msg) + function_set(msg) + return + + # Get inconsistent placement groups + inconsistent_pgs = get_inconsistent_pgs(ceph_pools) + if not inconsistent_pgs: + msg = "No inconsistent placement groups found." + log(msg) + function_set(msg) + return + + # Filter for known safe cases + safe_pg_repairs = get_safe_pg_repairs(inconsistent_pgs) + unsafe_pg_repairs = inconsistent_pgs.difference(safe_pg_repairs) + + # Perform safe placement group repairs + if unsafe_pg_repairs: + log( + "Ignoring unsafe placement group repairs: {}".format( + unsafe_pg_repairs + ) + ) + if safe_pg_repairs: + log("Safe placement group repairs found: {}".format(safe_pg_repairs)) + perform_pg_repairs(safe_pg_repairs) + function_set( + { + "message": "placement groups repaired: {}".format( + sorted(safe_pg_repairs) + ) + } + ) + else: + msg = "No safe placement group repairs found." + log(msg) + function_set(msg) + + +def main(): + try: + pg_repair() + except CalledProcessError as e: + log(e) + function_fail( + "Safe placement group repair failed with error: {}".format(str(e)) + ) + + +if __name__ == "__main__": + main() diff --git a/bindep.txt b/bindep.txt index ba2ccb4b..2666ca6e 100644 --- a/bindep.txt +++ b/bindep.txt @@ -2,3 +2,5 @@ libxml2-dev [platform:dpkg test] libxslt1-dev [platform:dpkg test] build-essential [platform:dpkg test] zlib1g-dev [platform:dpkg test] +libffi-dev [platform:dpkg test] + diff --git a/tox.ini b/tox.ini index 81fd2492..606c928f 100644 --- a/tox.ini +++ b/tox.ini @@ -84,7 +84,7 @@ deps = -r{toxinidir}/requirements.txt [testenv:pep8] basepython = python3 deps = flake8==3.9.2 - charm-tools==2.8.3 + charm-tools==2.8.4 commands = flake8 {posargs} hooks unit_tests tests actions lib files charm-proof diff --git a/unit_tests/test_action_pg_repair.py b/unit_tests/test_action_pg_repair.py new file mode 100644 index 00000000..258c6103 --- /dev/null +++ b/unit_tests/test_action_pg_repair.py @@ -0,0 +1,280 @@ +# Copyright 2022 Canonical Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Tests for the pg_repair action.""" + +from actions import pg_repair as action +import unittest.mock as mock +from test_utils import CharmTestCase +import json + + +class PlacementGroupRepairTestCase(CharmTestCase): + """Run tests for the action.""" + + def setUp(self): + """Init mocks for test cases.""" + super(PlacementGroupRepairTestCase, self).setUp( + action, + [ + "function_fail", + "function_set", + "get_rados_inconsistent_objs", + "get_rados_inconsistent_pgs", + ], + ) + + @mock.patch("actions.pg_repair.get_rados_inconsistent_pgs") + def test_get_inconsistent_pgs(self, _rados_inc_pgs): + """Test collection of all inconsistent placement groups.""" + _rados_inc_pgs.side_effect = (["1.a", "2.b"], ["2.b", "3.c"], []) + ceph_pools = ["testPool0", "testPool1", "testPool2"] + result = action.get_inconsistent_pgs(ceph_pools) + self.assertEqual(result, {"1.a", "2.b", "3.c"}) + + @mock.patch("actions.pg_repair.get_rados_inconsistent_objs") + def test_safe_case_detection(self, _rados_inc_objs): + """Test that safe case is detected.""" + _rados_inc_objs.return_value = rados_inc_obj_output_safe() + result = action.is_pg_safe_to_repair("") + self.assertTrue(result) + + @mock.patch("actions.pg_repair.get_rados_inconsistent_objs") + def test_unsafe_case_detection_extra_erros(self, _rados_inc_objs): + """Test that the unsafe case of extra errors is detected.""" + _rados_inc_objs.return_value = rados_inc_obj_output_extra_errors() + result = action.is_pg_safe_to_repair("") + self.assertFalse(result) + + @mock.patch("actions.pg_repair.get_rados_inconsistent_objs") + def test_unsafe_case_detection_multiple_read_errors(self, _rados_inc_objs): + """Test that the unsafe case of multiple read errors is detected.""" + _rados_inc_objs.return_value = ( + rados_inc_obj_output_multiple_read_errors() + ) + result = action.is_pg_safe_to_repair("") + self.assertFalse(result) + + @mock.patch("actions.pg_repair.get_rados_inconsistent_objs") + def test_get_safe_pg_repair(self, _rados_inc_objs): + _rados_inc_objs.side_effect = ( + rados_inc_obj_output_safe(), + rados_inc_obj_output_extra_errors(), + rados_inc_obj_output_multiple_read_errors(), + ) + inconsistent_pgs = ("3.1f2", "12.ab3", "16.222") + result = action.get_safe_pg_repairs(inconsistent_pgs) + self.assertEqual(result, {"3.1f2"}) + + @mock.patch("actions.pg_repair.list_pools") + def test_pg_repair_no_ceph_pools(self, _list_pools): + """Test action fails when no Ceph pools found.""" + _list_pools.return_value = [] + action.pg_repair() + msg = "No Ceph pools found." + self.function_set.assert_called_once_with(msg) + + @mock.patch("actions.pg_repair.get_inconsistent_pgs") + @mock.patch("actions.pg_repair.list_pools") + def test_pg_repair_no_inconsistent_pgs(self, _list_pools, _get_inc_pgs): + _list_pools.return_value = ["testPool"] + _get_inc_pgs.return_value = [] + action.pg_repair() + msg = "No inconsistent placement groups found." + self.function_set.assert_called_once_with(msg) + + @mock.patch("actions.pg_repair.check_output") + @mock.patch("actions.pg_repair.get_rados_inconsistent_objs") + @mock.patch("actions.pg_repair.get_rados_inconsistent_pgs") + @mock.patch("actions.pg_repair.list_pools") + def test_pg_repair_safe_case( + self, _list_pools, _rados_inc_pgs, _rados_inc_objs, _check_output + ): + """Test action succeeds with one read error.""" + _list_pools.return_value = ["testPool"] + _rados_inc_pgs.return_value = {"16.abf", "12.bd4"} + _rados_inc_objs.return_value = rados_inc_obj_output_safe() + _check_output.return_value = b"" + action.pg_repair() + self.function_set.assert_called_once_with( + {"message": "placement groups repaired: ['12.bd4', '16.abf']"} + ) + + @mock.patch("actions.pg_repair.get_rados_inconsistent_objs") + @mock.patch("actions.pg_repair.get_rados_inconsistent_pgs") + @mock.patch("actions.pg_repair.list_pools") + def test_pg_repair_extra_errors( + self, _list_pools, _rados_inc_pgs, _rados_inc_objs + ): + """Test action fails with errors other than read errors.""" + _list_pools.return_value = ["testPool"] + _rados_inc_pgs.return_value = {"16.abf", "12.bd4"} + _rados_inc_objs.return_value = rados_inc_obj_output_extra_errors() + action.pg_repair() + self.function_set.assert_called_once() + + @mock.patch("actions.pg_repair.get_rados_inconsistent_objs") + @mock.patch("actions.pg_repair.get_rados_inconsistent_pgs") + @mock.patch("actions.pg_repair.list_pools") + def test_pg_repair_multiple_read_errors( + self, _list_pools, _rados_inc_pgs, _rados_inc_objs + ): + """Test action fails with multiple read errors.""" + _list_pools.return_value = ["testPool"] + _rados_inc_pgs.return_value = {"16.abf", "12.bd4"} + _rados_inc_objs.return_value = ( + rados_inc_obj_output_multiple_read_errors() + ) + action.pg_repair() + self.function_set.assert_called_once() + + +def rados_inc_obj_output_safe(): + return json.loads("""{ + "epoch": 873, + "inconsistents": [ + { + "object": { + "data": "nothing to see here" + }, + "errors": [], + "union_shard_errors": [ + "read_error" + ], + "selected_object_info": { + "data": "nothing to see here" + }, + "shards": [ + { + "osd": 53, + "primary": true, + "errors": [ + "read_error" + ], + "size": 4046848 + }, + { + "osd": 56, + "primary": false, + "errors": [], + "size": 4046848, + "omap_digest": "0xffffffff", + "data_digest": "0xb86056e7" + }, + { + "osd": 128, + "primary": false, + "errors": [], + "size": 4046848, + "omap_digest": "0xffffffff", + "data_digest": "0xb86056e7" + } + ] + } + ] + }""") + + +def rados_inc_obj_output_extra_errors(): + return json.loads("""{ + "epoch": 873, + "inconsistents": [ + { + "object": { + "data": "nothing to see here" + }, + "errors": [], + "union_shard_errors": [ + "read_error" + ], + "selected_object_info": { + "data": "nothing to see here" + }, + "shards": [ + { + "osd": 53, + "primary": true, + "errors": [ + "read_error", + "some_other_error" + ], + "size": 4046848 + }, + { + "osd": 56, + "primary": false, + "errors": [], + "size": 4046848, + "omap_digest": "0xffffffff", + "data_digest": "0xb86056e7" + }, + { + "osd": 128, + "primary": false, + "errors": [], + "size": 4046848, + "omap_digest": "0xffffffff", + "data_digest": "0xb86056e7" + } + ] + } + ] + }""") + + +def rados_inc_obj_output_multiple_read_errors(): + return json.loads("""{ + "epoch": 873, + "inconsistents": [ + { + "object": { + "data": "nothing to see here" + }, + "errors": [], + "union_shard_errors": [ + "read_error" + ], + "selected_object_info": { + "data": "nothing to see here" + }, + "shards": [ + { + "osd": 53, + "primary": true, + "errors": [ + "read_error" + ], + "size": 4046848 + }, + { + "osd": 56, + "primary": false, + "errors": [ + "read_error" + ], + "size": 4046848, + "omap_digest": "0xffffffff", + "data_digest": "0xb86056e7" + }, + { + "osd": 128, + "primary": false, + "errors": [], + "size": 4046848, + "omap_digest": "0xffffffff", + "data_digest": "0xb86056e7" + } + ] + } + ] + }""")