Added safe-pg-repair action

This action automatically repairs inconsistent placement groups
which are caused by read errors.

PGs are repaired using `ceph pg repair <pgid>`.

Action is only taken if on of a PG's shards has a "read_error",
and no action will be taken if any additional errors are found.
No action will be taken if multiple "read_errors" are found.

This action is intended to be safe to run in all contexts.

Closes-Bug: #1923218
Change-Id: I903dfe02aa3b7c67414e3d0d9b57f4042d301830
This commit is contained in:
Connor Chamberlain 2022-02-25 08:33:10 -07:00 committed by jneo8
parent d3b2494ee8
commit a1cffc6693
4 changed files with 486 additions and 1 deletions

View File

@ -439,3 +439,5 @@ delete-user:
type: string type: string
description: "User ID to delete." description: "User ID to delete."
required: [username] required: [username]
pg-repair:
description: "Repair inconsistent placement groups, if safe to do so."

1
actions/pg-repair Symbolic link
View File

@ -0,0 +1 @@
pg_repair.py

202
actions/pg_repair.py Executable file
View File

@ -0,0 +1,202 @@
#!/usr/bin/env python3
#
# Copyright 2022 Canonical Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import json
import os
import sys
from subprocess import check_output, CalledProcessError
_path = os.path.dirname(os.path.realpath(__file__))
_hooks = os.path.abspath(os.path.join(_path, "../hooks"))
_lib = os.path.abspath(os.path.join(_path, "../lib"))
def _add_path(path):
if path not in sys.path:
sys.path.insert(1, path)
_add_path(_hooks)
_add_path(_lib)
from charmhelpers.core.hookenv import (
log,
function_fail,
function_set,
)
from charms_ceph.utils import list_pools
def get_rados_inconsistent_objs(pg):
"""Get all inconsistent objects for a given placement group.
:param pg: Name of a placement group
:type pg: str
:return: list of inconsistent objects
:rtype: list[str]
"""
return json.loads(
check_output(
["rados", "list-inconsistent-obj", pg, "--format=json-pretty"]
).decode("UTF-8")
)
def get_rados_inconsistent_pgs(pool):
"""Get all inconsistent placement groups for a given pool.
:param pool: Name of a Ceph pool
:type pool: str
:returns: list of inconsistent placement group IDs
:rtype: list[str]
"""
return json.loads(
check_output(["rados", "list-inconsistent-pg", pool]).decode("UTF-8")
)
def get_inconsistent_pgs(ceph_pools):
"""Get all inconsistent placement groups for a list of pools.
:param ceph_pools: List of names of Ceph pools
:type ceph_pools: list[str]
:returns: list of inconsistent placement group IDs as a set
:rtype: set[str]
"""
inconsistent_pgs = set()
for pool in ceph_pools:
inconsistent_pgs.update(get_rados_inconsistent_pgs(pool))
return inconsistent_pgs
def get_safe_pg_repairs(inconsistent_pgs):
"""Filters inconsistent placement groups for ones that are safe to repair.
:param inconsistent_pgs: List of inconsistent placement groups
:type inconsistent_pgs: list[str]
:returns: list of safely repairable placement groups as a set
:rtype: set[str]
"""
return {pg for pg in inconsistent_pgs if is_pg_safe_to_repair(pg)}
def is_pg_safe_to_repair(pg):
"""Determines if a placement group is safe to repair.
:param pg: Name of an inconsistent placement group
:type pg: str
:returns: placement group is safe to repair
:rtype: bool
"""
# Additional tests for known safe cases can be added here.
return has_read_error_only(pg)
def has_read_error_only(pg):
"""Determines if an inconsistent placement group is caused by a read error.
Returns False if no read errors are found, or if any errors other than read
errors are found.
:param pg: ID of an inconsistent placement group
:type pg: str
:returns: placement group is safe to repair
:rtype: bool
"""
rados_inconsistent_objs = get_rados_inconsistent_objs(pg)
read_error_found = False
for inconsistent in rados_inconsistent_objs.get("inconsistents", []):
for shard in inconsistent.get("shards", []):
errors = shard.get("errors", [])
if errors == ["read_error"]:
if read_error_found:
return False
read_error_found = True
continue
elif errors:
# Error other than "read_error" detected
return False
return read_error_found
def perform_pg_repairs(pgs):
"""Runs `ceph pg repair` on a group of placement groups.
All placement groups provided should be confirmed as safe prior to using
this method.
:param pgs: List of safe-to-repair placement groups
:type pg: list[str]
"""
for pg in pgs:
log("Repairing ceph placement group {}".format(pg))
check_output(["ceph", "pg", "repair", pg])
def pg_repair():
"""Repair all inconsistent placement groups caused by read errors."""
ceph_pools = list_pools()
if not ceph_pools:
msg = "No Ceph pools found."
log(msg)
function_set(msg)
return
# Get inconsistent placement groups
inconsistent_pgs = get_inconsistent_pgs(ceph_pools)
if not inconsistent_pgs:
msg = "No inconsistent placement groups found."
log(msg)
function_set(msg)
return
# Filter for known safe cases
safe_pg_repairs = get_safe_pg_repairs(inconsistent_pgs)
unsafe_pg_repairs = inconsistent_pgs.difference(safe_pg_repairs)
# Perform safe placement group repairs
if unsafe_pg_repairs:
log(
"Ignoring unsafe placement group repairs: {}".format(
unsafe_pg_repairs
)
)
if safe_pg_repairs:
log("Safe placement group repairs found: {}".format(safe_pg_repairs))
perform_pg_repairs(safe_pg_repairs)
function_set(
{
"message": "placement groups repaired: {}".format(
sorted(safe_pg_repairs)
)
}
)
else:
msg = "No safe placement group repairs found."
log(msg)
function_set(msg)
def main():
try:
pg_repair()
except CalledProcessError as e:
log(e)
function_fail(
"Safe placement group repair failed with error: {}".format(str(e))
)
if __name__ == "__main__":
main()

View File

@ -0,0 +1,280 @@
# Copyright 2022 Canonical Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Tests for the pg_repair action."""
from actions import pg_repair as action
import unittest.mock as mock
from test_utils import CharmTestCase
import json
class PlacementGroupRepairTestCase(CharmTestCase):
"""Run tests for the action."""
def setUp(self):
"""Init mocks for test cases."""
super(PlacementGroupRepairTestCase, self).setUp(
action,
[
"function_fail",
"function_set",
"get_rados_inconsistent_objs",
"get_rados_inconsistent_pgs",
],
)
@mock.patch("actions.pg_repair.get_rados_inconsistent_pgs")
def test_get_inconsistent_pgs(self, _rados_inc_pgs):
"""Test collection of all inconsistent placement groups."""
_rados_inc_pgs.side_effect = (["1.a", "2.b"], ["2.b", "3.c"], [])
ceph_pools = ["testPool0", "testPool1", "testPool2"]
result = action.get_inconsistent_pgs(ceph_pools)
self.assertEqual(result, {"1.a", "2.b", "3.c"})
@mock.patch("actions.pg_repair.get_rados_inconsistent_objs")
def test_safe_case_detection(self, _rados_inc_objs):
"""Test that safe case is detected."""
_rados_inc_objs.return_value = rados_inc_obj_output_safe()
result = action.is_pg_safe_to_repair("")
self.assertTrue(result)
@mock.patch("actions.pg_repair.get_rados_inconsistent_objs")
def test_unsafe_case_detection_extra_erros(self, _rados_inc_objs):
"""Test that the unsafe case of extra errors is detected."""
_rados_inc_objs.return_value = rados_inc_obj_output_extra_errors()
result = action.is_pg_safe_to_repair("")
self.assertFalse(result)
@mock.patch("actions.pg_repair.get_rados_inconsistent_objs")
def test_unsafe_case_detection_multiple_read_errors(self, _rados_inc_objs):
"""Test that the unsafe case of multiple read errors is detected."""
_rados_inc_objs.return_value = (
rados_inc_obj_output_multiple_read_errors()
)
result = action.is_pg_safe_to_repair("")
self.assertFalse(result)
@mock.patch("actions.pg_repair.get_rados_inconsistent_objs")
def test_get_safe_pg_repair(self, _rados_inc_objs):
_rados_inc_objs.side_effect = (
rados_inc_obj_output_safe(),
rados_inc_obj_output_extra_errors(),
rados_inc_obj_output_multiple_read_errors(),
)
inconsistent_pgs = ("3.1f2", "12.ab3", "16.222")
result = action.get_safe_pg_repairs(inconsistent_pgs)
self.assertEqual(result, {"3.1f2"})
@mock.patch("actions.pg_repair.list_pools")
def test_pg_repair_no_ceph_pools(self, _list_pools):
"""Test action fails when no Ceph pools found."""
_list_pools.return_value = []
action.pg_repair()
msg = "No Ceph pools found."
self.function_set.assert_called_once_with(msg)
@mock.patch("actions.pg_repair.get_inconsistent_pgs")
@mock.patch("actions.pg_repair.list_pools")
def test_pg_repair_no_inconsistent_pgs(self, _list_pools, _get_inc_pgs):
_list_pools.return_value = ["testPool"]
_get_inc_pgs.return_value = []
action.pg_repair()
msg = "No inconsistent placement groups found."
self.function_set.assert_called_once_with(msg)
@mock.patch("actions.pg_repair.check_output")
@mock.patch("actions.pg_repair.get_rados_inconsistent_objs")
@mock.patch("actions.pg_repair.get_rados_inconsistent_pgs")
@mock.patch("actions.pg_repair.list_pools")
def test_pg_repair_safe_case(
self, _list_pools, _rados_inc_pgs, _rados_inc_objs, _check_output
):
"""Test action succeeds with one read error."""
_list_pools.return_value = ["testPool"]
_rados_inc_pgs.return_value = {"16.abf", "12.bd4"}
_rados_inc_objs.return_value = rados_inc_obj_output_safe()
_check_output.return_value = b""
action.pg_repair()
self.function_set.assert_called_once_with(
{"message": "placement groups repaired: ['12.bd4', '16.abf']"}
)
@mock.patch("actions.pg_repair.get_rados_inconsistent_objs")
@mock.patch("actions.pg_repair.get_rados_inconsistent_pgs")
@mock.patch("actions.pg_repair.list_pools")
def test_pg_repair_extra_errors(
self, _list_pools, _rados_inc_pgs, _rados_inc_objs
):
"""Test action fails with errors other than read errors."""
_list_pools.return_value = ["testPool"]
_rados_inc_pgs.return_value = {"16.abf", "12.bd4"}
_rados_inc_objs.return_value = rados_inc_obj_output_extra_errors()
action.pg_repair()
self.function_set.assert_called_once()
@mock.patch("actions.pg_repair.get_rados_inconsistent_objs")
@mock.patch("actions.pg_repair.get_rados_inconsistent_pgs")
@mock.patch("actions.pg_repair.list_pools")
def test_pg_repair_multiple_read_errors(
self, _list_pools, _rados_inc_pgs, _rados_inc_objs
):
"""Test action fails with multiple read errors."""
_list_pools.return_value = ["testPool"]
_rados_inc_pgs.return_value = {"16.abf", "12.bd4"}
_rados_inc_objs.return_value = (
rados_inc_obj_output_multiple_read_errors()
)
action.pg_repair()
self.function_set.assert_called_once()
def rados_inc_obj_output_safe():
return json.loads("""{
"epoch": 873,
"inconsistents": [
{
"object": {
"data": "nothing to see here"
},
"errors": [],
"union_shard_errors": [
"read_error"
],
"selected_object_info": {
"data": "nothing to see here"
},
"shards": [
{
"osd": 53,
"primary": true,
"errors": [
"read_error"
],
"size": 4046848
},
{
"osd": 56,
"primary": false,
"errors": [],
"size": 4046848,
"omap_digest": "0xffffffff",
"data_digest": "0xb86056e7"
},
{
"osd": 128,
"primary": false,
"errors": [],
"size": 4046848,
"omap_digest": "0xffffffff",
"data_digest": "0xb86056e7"
}
]
}
]
}""")
def rados_inc_obj_output_extra_errors():
return json.loads("""{
"epoch": 873,
"inconsistents": [
{
"object": {
"data": "nothing to see here"
},
"errors": [],
"union_shard_errors": [
"read_error"
],
"selected_object_info": {
"data": "nothing to see here"
},
"shards": [
{
"osd": 53,
"primary": true,
"errors": [
"read_error",
"some_other_error"
],
"size": 4046848
},
{
"osd": 56,
"primary": false,
"errors": [],
"size": 4046848,
"omap_digest": "0xffffffff",
"data_digest": "0xb86056e7"
},
{
"osd": 128,
"primary": false,
"errors": [],
"size": 4046848,
"omap_digest": "0xffffffff",
"data_digest": "0xb86056e7"
}
]
}
]
}""")
def rados_inc_obj_output_multiple_read_errors():
return json.loads("""{
"epoch": 873,
"inconsistents": [
{
"object": {
"data": "nothing to see here"
},
"errors": [],
"union_shard_errors": [
"read_error"
],
"selected_object_info": {
"data": "nothing to see here"
},
"shards": [
{
"osd": 53,
"primary": true,
"errors": [
"read_error"
],
"size": 4046848
},
{
"osd": 56,
"primary": false,
"errors": [
"read_error"
],
"size": 4046848,
"omap_digest": "0xffffffff",
"data_digest": "0xb86056e7"
},
{
"osd": 128,
"primary": false,
"errors": [],
"size": 4046848,
"omap_digest": "0xffffffff",
"data_digest": "0xb86056e7"
}
]
}
]
}""")