Added safe-pg-repair action
This action automatically repairs inconsistent placement groups that are caused by read errors. PGs are repaired using `ceph pg repair <pgid>`. Action is only taken if one of a PG's shards has a "read_error", and no action will be taken if any additional errors are found. No action will be taken if multiple "read_errors" are found. This action is intended to be safe to run in all contexts. Closes-Bug: #1923218 Change-Id: I903dfe02aa3b7c67414e3d0d9b57f4042d301830
This commit is contained in:
parent
d3b2494ee8
commit
a1cffc6693
@ -439,3 +439,5 @@ delete-user:
|
||||
type: string
|
||||
description: "User ID to delete."
|
||||
required: [username]
|
||||
pg-repair:
|
||||
description: "Repair inconsistent placement groups, if safe to do so."
|
||||
|
1
actions/pg-repair
Symbolic link
1
actions/pg-repair
Symbolic link
@ -0,0 +1 @@
|
||||
pg_repair.py
|
202
actions/pg_repair.py
Executable file
202
actions/pg_repair.py
Executable file
@ -0,0 +1,202 @@
|
||||
#!/usr/bin/env python3
|
||||
#
|
||||
# Copyright 2022 Canonical Ltd
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
import json
|
||||
import os
|
||||
import sys
|
||||
from subprocess import check_output, CalledProcessError
|
||||
|
||||
# Resolve the sibling "hooks" and "lib" directories from this file's real
# location (realpath follows symlinks, and the action is invoked via one).
_path = os.path.dirname(os.path.realpath(__file__))
_hooks = os.path.abspath(os.path.join(_path, "..", "hooks"))
_lib = os.path.abspath(os.path.join(_path, "..", "lib"))


def _add_path(path):
    """Insert ``path`` at index 1 of ``sys.path`` unless already listed.

    :param path: Directory to make importable
    :type path: str
    """
    if path in sys.path:
        return
    sys.path.insert(1, path)


# Order matters: the directory added last ends up ahead of the earlier one.
_add_path(_hooks)
_add_path(_lib)
|
||||
|
||||
|
||||
from charmhelpers.core.hookenv import (
|
||||
log,
|
||||
function_fail,
|
||||
function_set,
|
||||
)
|
||||
from charms_ceph.utils import list_pools
|
||||
|
||||
|
||||
def get_rados_inconsistent_objs(pg):
    """Query rados for the inconsistent objects of one placement group.

    Runs ``rados list-inconsistent-obj <pg> --format=json-pretty`` and parses
    what the command prints as JSON.

    :param pg: Name of a placement group
    :type pg: str
    :returns: decoded JSON document printed by the command
    :raises subprocess.CalledProcessError: if the command exits non-zero
    """
    command = ["rados", "list-inconsistent-obj", pg, "--format=json-pretty"]
    raw_output = check_output(command)
    return json.loads(raw_output.decode("UTF-8"))
|
||||
|
||||
|
||||
def get_rados_inconsistent_pgs(pool):
    """Query rados for the inconsistent placement groups of one pool.

    Runs ``rados list-inconsistent-pg <pool>`` and parses what the command
    prints as JSON.

    :param pool: Name of a Ceph pool
    :type pool: str
    :returns: decoded JSON output; callers treat it as an iterable of
        placement group IDs
    :raises subprocess.CalledProcessError: if the command exits non-zero
    """
    command = ["rados", "list-inconsistent-pg", pool]
    raw_output = check_output(command)
    return json.loads(raw_output.decode("UTF-8"))
|
||||
|
||||
|
||||
def get_inconsistent_pgs(ceph_pools):
    """Collect the inconsistent placement groups of several pools.

    Each pool is queried through ``get_rados_inconsistent_pgs`` and the
    results are merged, so a placement group reported more than once
    appears only once.

    :param ceph_pools: List of names of Ceph pools
    :type ceph_pools: list[str]
    :returns: IDs of all inconsistent placement groups found
    :rtype: set[str]
    """
    per_pool_pgs = (get_rados_inconsistent_pgs(pool) for pool in ceph_pools)
    return set().union(*per_pool_pgs)
|
||||
|
||||
|
||||
def get_safe_pg_repairs(inconsistent_pgs):
    """Select the placement groups that are safe to repair.

    Every candidate is checked with ``is_pg_safe_to_repair``; only those
    for which the check is truthy are kept.

    :param inconsistent_pgs: Inconsistent placement group IDs to examine
    :type inconsistent_pgs: list[str]
    :returns: the safely repairable placement groups
    :rtype: set[str]
    """
    repairable = set()
    for candidate in inconsistent_pgs:
        if is_pg_safe_to_repair(candidate):
            repairable.add(candidate)
    return repairable
|
||||
|
||||
|
||||
def is_pg_safe_to_repair(pg):
    """Decide whether an inconsistent placement group may be repaired.

    :param pg: Name of an inconsistent placement group
    :type pg: str
    :returns: whether the placement group is safe to repair
    :rtype: bool
    """
    # The read-error-only case is the single known-safe case right now;
    # checks for further known-safe cases can be added here.
    caused_by_read_error_only = has_read_error_only(pg)
    return caused_by_read_error_only
|
||||
|
||||
|
||||
def has_read_error_only(pg):
    """Check that a placement group's only problem is a single read error.

    The report from ``get_rados_inconsistent_objs`` is walked object by
    object and shard by shard. Only each shard's own ``errors`` list is
    inspected. The result is True when exactly one shard in the whole
    report has the error list ``["read_error"]`` and every other shard has
    no errors. A second read-error shard, or any other non-empty error
    list, makes the result False, as does finding no read error at all.

    :param pg: ID of an inconsistent placement group
    :type pg: str
    :returns: placement group is safe to repair
    :rtype: bool
    """
    report = get_rados_inconsistent_objs(pg)
    read_error_shards = 0
    for obj in report.get("inconsistents", []):
        for shard in obj.get("shards", []):
            shard_errors = shard.get("errors", [])
            if not shard_errors:
                # Healthy shard, nothing to judge.
                continue
            if shard_errors != ["read_error"]:
                # Error other than a lone "read_error" detected
                return False
            read_error_shards += 1
            if read_error_shards > 1:
                # More than one shard failed to read
                return False
    return read_error_shards == 1
|
||||
|
||||
|
||||
def perform_pg_repairs(pgs):
    """Run ``ceph pg repair`` for each of the given placement groups.

    No safety check happens here; callers must pass only placement groups
    that were already confirmed as safe to repair.

    :param pgs: Safe-to-repair placement groups
    :type pgs: list[str]
    :raises subprocess.CalledProcessError: if a repair command exits non-zero
    """
    for pgid in pgs:
        log("Repairing ceph placement group {}".format(pgid))
        repair_command = ["ceph", "pg", "repair", pgid]
        check_output(repair_command)
|
||||
|
||||
|
||||
def pg_repair():
    """Repair all inconsistent placement groups caused by read errors.

    Lists the Ceph pools, gathers their inconsistent placement groups,
    keeps the ones judged safe to repair and repairs those. Each outcome
    is logged and passed to ``function_set``.
    """

    def _report(message):
        # NOTE(review): function_set receives a bare string here but a dict
        # on the success path below; charmhelpers' action_set appears to
        # iterate ``values.items()`` -- confirm that a str is accepted.
        log(message)
        function_set(message)

    pools = list_pools()
    if not pools:
        _report("No Ceph pools found.")
        return

    # Get inconsistent placement groups
    inconsistent = get_inconsistent_pgs(pools)
    if not inconsistent:
        _report("No inconsistent placement groups found.")
        return

    # Split into known safe cases and everything else
    repairable = get_safe_pg_repairs(inconsistent)
    skipped = inconsistent.difference(repairable)
    if skipped:
        log("Ignoring unsafe placement group repairs: {}".format(skipped))

    if not repairable:
        _report("No safe placement group repairs found.")
        return

    # Perform safe placement group repairs
    log("Safe placement group repairs found: {}".format(repairable))
    perform_pg_repairs(repairable)
    summary = "placement groups repaired: {}".format(sorted(repairable))
    function_set({"message": summary})
|
||||
|
||||
|
||||
def main():
    """Run the pg-repair action and report a failed command as a failure.

    Only ``CalledProcessError`` is handled: it is logged and turned into
    an action failure through ``function_fail``.
    """
    try:
        pg_repair()
    except CalledProcessError as err:
        log(err)
        failure = "Safe placement group repair failed with error: {}".format(
            str(err)
        )
        function_fail(failure)


if __name__ == "__main__":
    main()
|
280
unit_tests/test_action_pg_repair.py
Normal file
280
unit_tests/test_action_pg_repair.py
Normal file
@ -0,0 +1,280 @@
|
||||
# Copyright 2022 Canonical Ltd
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
"""Tests for the pg_repair action."""
|
||||
|
||||
from actions import pg_repair as action
|
||||
import unittest.mock as mock
|
||||
from test_utils import CharmTestCase
|
||||
import json
|
||||
|
||||
|
||||
class PlacementGroupRepairTestCase(CharmTestCase):
    """Run tests for the pg-repair action."""

    def setUp(self):
        """Init mocks for test cases."""
        super().setUp(
            action,
            [
                "function_fail",
                "function_set",
                "get_rados_inconsistent_objs",
                "get_rados_inconsistent_pgs",
            ],
        )

    @mock.patch("actions.pg_repair.get_rados_inconsistent_pgs")
    def test_get_inconsistent_pgs(self, _rados_inc_pgs):
        """Test collection of all inconsistent placement groups."""
        # "2.b" is reported for two pools and must be collected only once.
        _rados_inc_pgs.side_effect = (["1.a", "2.b"], ["2.b", "3.c"], [])
        ceph_pools = ["testPool0", "testPool1", "testPool2"]
        result = action.get_inconsistent_pgs(ceph_pools)
        self.assertEqual(result, {"1.a", "2.b", "3.c"})

    @mock.patch("actions.pg_repair.get_rados_inconsistent_objs")
    def test_safe_case_detection(self, _rados_inc_objs):
        """Test that safe case is detected."""
        _rados_inc_objs.return_value = rados_inc_obj_output_safe()
        result = action.is_pg_safe_to_repair("")
        self.assertTrue(result)

    @mock.patch("actions.pg_repair.get_rados_inconsistent_objs")
    def test_unsafe_case_detection_extra_errors(self, _rados_inc_objs):
        """Test that the unsafe case of extra errors is detected."""
        _rados_inc_objs.return_value = rados_inc_obj_output_extra_errors()
        result = action.is_pg_safe_to_repair("")
        self.assertFalse(result)

    @mock.patch("actions.pg_repair.get_rados_inconsistent_objs")
    def test_unsafe_case_detection_multiple_read_errors(self, _rados_inc_objs):
        """Test that the unsafe case of multiple read errors is detected."""
        _rados_inc_objs.return_value = (
            rados_inc_obj_output_multiple_read_errors()
        )
        result = action.is_pg_safe_to_repair("")
        self.assertFalse(result)

    @mock.patch("actions.pg_repair.get_rados_inconsistent_objs")
    def test_get_safe_pg_repair(self, _rados_inc_objs):
        """Test that only the safe placement group survives filtering."""
        # One report per placement group, consumed in iteration order.
        _rados_inc_objs.side_effect = (
            rados_inc_obj_output_safe(),
            rados_inc_obj_output_extra_errors(),
            rados_inc_obj_output_multiple_read_errors(),
        )
        inconsistent_pgs = ("3.1f2", "12.ab3", "16.222")
        result = action.get_safe_pg_repairs(inconsistent_pgs)
        self.assertEqual(result, {"3.1f2"})

    @mock.patch("actions.pg_repair.list_pools")
    def test_pg_repair_no_ceph_pools(self, _list_pools):
        """Test action reports a message when no Ceph pools are found."""
        _list_pools.return_value = []
        action.pg_repair()
        msg = "No Ceph pools found."
        self.function_set.assert_called_once_with(msg)

    @mock.patch("actions.pg_repair.get_inconsistent_pgs")
    @mock.patch("actions.pg_repair.list_pools")
    def test_pg_repair_no_inconsistent_pgs(self, _list_pools, _get_inc_pgs):
        """Test action reports a message when nothing is inconsistent."""
        _list_pools.return_value = ["testPool"]
        # get_inconsistent_pgs returns a set, so mimic that here.
        _get_inc_pgs.return_value = set()
        action.pg_repair()
        msg = "No inconsistent placement groups found."
        self.function_set.assert_called_once_with(msg)

    @mock.patch("actions.pg_repair.check_output")
    @mock.patch("actions.pg_repair.get_rados_inconsistent_objs")
    @mock.patch("actions.pg_repair.get_rados_inconsistent_pgs")
    @mock.patch("actions.pg_repair.list_pools")
    def test_pg_repair_safe_case(
        self, _list_pools, _rados_inc_pgs, _rados_inc_objs, _check_output
    ):
        """Test action succeeds with one read error."""
        _list_pools.return_value = ["testPool"]
        _rados_inc_pgs.return_value = {"16.abf", "12.bd4"}
        _rados_inc_objs.return_value = rados_inc_obj_output_safe()
        _check_output.return_value = b""
        action.pg_repair()
        self.function_set.assert_called_once_with(
            {"message": "placement groups repaired: ['12.bd4', '16.abf']"}
        )
        # The rados queries are mocked, so check_output is only reached by
        # the repair commands themselves: one per placement group.
        self.assertEqual(_check_output.call_count, 2)
        _check_output.assert_has_calls(
            [
                mock.call(["ceph", "pg", "repair", "12.bd4"]),
                mock.call(["ceph", "pg", "repair", "16.abf"]),
            ],
            any_order=True,
        )

    @mock.patch("actions.pg_repair.get_rados_inconsistent_objs")
    @mock.patch("actions.pg_repair.get_rados_inconsistent_pgs")
    @mock.patch("actions.pg_repair.list_pools")
    def test_pg_repair_extra_errors(
        self, _list_pools, _rados_inc_pgs, _rados_inc_objs
    ):
        """Test nothing is repaired with errors other than read errors."""
        _list_pools.return_value = ["testPool"]
        _rados_inc_pgs.return_value = {"16.abf", "12.bd4"}
        _rados_inc_objs.return_value = rados_inc_obj_output_extra_errors()
        action.pg_repair()
        self.function_set.assert_called_once_with(
            "No safe placement group repairs found."
        )

    @mock.patch("actions.pg_repair.get_rados_inconsistent_objs")
    @mock.patch("actions.pg_repair.get_rados_inconsistent_pgs")
    @mock.patch("actions.pg_repair.list_pools")
    def test_pg_repair_multiple_read_errors(
        self, _list_pools, _rados_inc_pgs, _rados_inc_objs
    ):
        """Test nothing is repaired with multiple read errors."""
        _list_pools.return_value = ["testPool"]
        _rados_inc_pgs.return_value = {"16.abf", "12.bd4"}
        _rados_inc_objs.return_value = (
            rados_inc_obj_output_multiple_read_errors()
        )
        action.pg_repair()
        self.function_set.assert_called_once_with(
            "No safe placement group repairs found."
        )
|
||||
|
||||
|
||||
def _rados_inc_obj_output(errors_by_osd):
    """Build a parsed ``rados list-inconsistent-obj`` style report.

    The report holds one inconsistent object with three shards (OSDs 53,
    56 and 128). Every shard starts without errors; ``errors_by_osd`` then
    supplies the error list for the shards that should carry errors.

    :param errors_by_osd: error names to set, keyed by OSD number
    :type errors_by_osd: dict[int, list[str]]
    :returns: freshly parsed report, safe for the caller to mutate
    :rtype: dict
    """
    report = json.loads("""{
        "epoch": 873,
        "inconsistents": [
            {
                "object": {
                    "data": "nothing to see here"
                },
                "errors": [],
                "union_shard_errors": [
                    "read_error"
                ],
                "selected_object_info": {
                    "data": "nothing to see here"
                },
                "shards": [
                    {
                        "osd": 53,
                        "primary": true,
                        "errors": [],
                        "size": 4046848
                    },
                    {
                        "osd": 56,
                        "primary": false,
                        "errors": [],
                        "size": 4046848,
                        "omap_digest": "0xffffffff",
                        "data_digest": "0xb86056e7"
                    },
                    {
                        "osd": 128,
                        "primary": false,
                        "errors": [],
                        "size": 4046848,
                        "omap_digest": "0xffffffff",
                        "data_digest": "0xb86056e7"
                    }
                ]
            }
        ]
    }""")
    for shard in report["inconsistents"][0]["shards"]:
        shard["errors"] = list(errors_by_osd.get(shard["osd"], []))
    return report


def rados_inc_obj_output_safe():
    """Report with a single read error, on the primary shard only."""
    return _rados_inc_obj_output({53: ["read_error"]})


def rados_inc_obj_output_extra_errors():
    """Report whose primary shard has a read error plus another error."""
    return _rados_inc_obj_output({53: ["read_error", "some_other_error"]})


def rados_inc_obj_output_multiple_read_errors():
    """Report in which two different shards each have a read error."""
    return _rados_inc_obj_output({53: ["read_error"], 56: ["read_error"]})
|
Loading…
Reference in New Issue
Block a user