Added safe-pg-repair action
This action automatically repairs inconsistent placement groups which are caused by read errors. PGs are repaired using `ceph pg repair <pgid>`. Action is only taken if on of a PG's shards has a "read_error", and no action will be taken if any additional errors are found. No action will be taken if multiple "read_errors" are found. This action is intended to be safe to run in all contexts. Closes-Bug: #1923218 Change-Id: I903dfe02aa3b7c67414e3d0d9b57f4042d301830
This commit is contained in:
parent
d3b2494ee8
commit
a1cffc6693
@ -439,3 +439,5 @@ delete-user:
|
|||||||
type: string
|
type: string
|
||||||
description: "User ID to delete."
|
description: "User ID to delete."
|
||||||
required: [username]
|
required: [username]
|
||||||
|
pg-repair:
|
||||||
|
description: "Repair inconsistent placement groups, if safe to do so."
|
||||||
|
1
actions/pg-repair
Symbolic link
1
actions/pg-repair
Symbolic link
@ -0,0 +1 @@
|
|||||||
|
pg_repair.py
|
202
actions/pg_repair.py
Executable file
202
actions/pg_repair.py
Executable file
@ -0,0 +1,202 @@
|
|||||||
|
#!/usr/bin/env python3
|
||||||
|
#
|
||||||
|
# Copyright 2022 Canonical Ltd
|
||||||
|
#
|
||||||
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
# you may not use this file except in compliance with the License.
|
||||||
|
# You may obtain a copy of the License at
|
||||||
|
#
|
||||||
|
# http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
#
|
||||||
|
# Unless required by applicable law or agreed to in writing, software
|
||||||
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
# See the License for the specific language governing permissions and
|
||||||
|
# limitations under the License.
|
||||||
|
import json
|
||||||
|
import os
|
||||||
|
import sys
|
||||||
|
from subprocess import check_output, CalledProcessError
|
||||||
|
|
||||||
|
_path = os.path.dirname(os.path.realpath(__file__))
|
||||||
|
_hooks = os.path.abspath(os.path.join(_path, "../hooks"))
|
||||||
|
_lib = os.path.abspath(os.path.join(_path, "../lib"))
|
||||||
|
|
||||||
|
|
||||||
|
def _add_path(path):
|
||||||
|
if path not in sys.path:
|
||||||
|
sys.path.insert(1, path)
|
||||||
|
|
||||||
|
|
||||||
|
_add_path(_hooks)
|
||||||
|
_add_path(_lib)
|
||||||
|
|
||||||
|
|
||||||
|
from charmhelpers.core.hookenv import (
|
||||||
|
log,
|
||||||
|
function_fail,
|
||||||
|
function_set,
|
||||||
|
)
|
||||||
|
from charms_ceph.utils import list_pools
|
||||||
|
|
||||||
|
|
||||||
|
def get_rados_inconsistent_objs(pg):
|
||||||
|
"""Get all inconsistent objects for a given placement group.
|
||||||
|
|
||||||
|
:param pg: Name of a placement group
|
||||||
|
:type pg: str
|
||||||
|
:return: list of inconsistent objects
|
||||||
|
:rtype: list[str]
|
||||||
|
"""
|
||||||
|
return json.loads(
|
||||||
|
check_output(
|
||||||
|
["rados", "list-inconsistent-obj", pg, "--format=json-pretty"]
|
||||||
|
).decode("UTF-8")
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
def get_rados_inconsistent_pgs(pool):
|
||||||
|
"""Get all inconsistent placement groups for a given pool.
|
||||||
|
|
||||||
|
:param pool: Name of a Ceph pool
|
||||||
|
:type pool: str
|
||||||
|
:returns: list of inconsistent placement group IDs
|
||||||
|
:rtype: list[str]
|
||||||
|
"""
|
||||||
|
return json.loads(
|
||||||
|
check_output(["rados", "list-inconsistent-pg", pool]).decode("UTF-8")
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
def get_inconsistent_pgs(ceph_pools):
|
||||||
|
"""Get all inconsistent placement groups for a list of pools.
|
||||||
|
|
||||||
|
:param ceph_pools: List of names of Ceph pools
|
||||||
|
:type ceph_pools: list[str]
|
||||||
|
:returns: list of inconsistent placement group IDs as a set
|
||||||
|
:rtype: set[str]
|
||||||
|
"""
|
||||||
|
inconsistent_pgs = set()
|
||||||
|
for pool in ceph_pools:
|
||||||
|
inconsistent_pgs.update(get_rados_inconsistent_pgs(pool))
|
||||||
|
return inconsistent_pgs
|
||||||
|
|
||||||
|
|
||||||
|
def get_safe_pg_repairs(inconsistent_pgs):
|
||||||
|
"""Filters inconsistent placement groups for ones that are safe to repair.
|
||||||
|
|
||||||
|
:param inconsistent_pgs: List of inconsistent placement groups
|
||||||
|
:type inconsistent_pgs: list[str]
|
||||||
|
:returns: list of safely repairable placement groups as a set
|
||||||
|
:rtype: set[str]
|
||||||
|
"""
|
||||||
|
return {pg for pg in inconsistent_pgs if is_pg_safe_to_repair(pg)}
|
||||||
|
|
||||||
|
|
||||||
|
def is_pg_safe_to_repair(pg):
|
||||||
|
"""Determines if a placement group is safe to repair.
|
||||||
|
|
||||||
|
:param pg: Name of an inconsistent placement group
|
||||||
|
:type pg: str
|
||||||
|
:returns: placement group is safe to repair
|
||||||
|
:rtype: bool
|
||||||
|
"""
|
||||||
|
# Additional tests for known safe cases can be added here.
|
||||||
|
return has_read_error_only(pg)
|
||||||
|
|
||||||
|
|
||||||
|
def has_read_error_only(pg):
|
||||||
|
"""Determines if an inconsistent placement group is caused by a read error.
|
||||||
|
Returns False if no read errors are found, or if any errors other than read
|
||||||
|
errors are found.
|
||||||
|
|
||||||
|
:param pg: ID of an inconsistent placement group
|
||||||
|
:type pg: str
|
||||||
|
:returns: placement group is safe to repair
|
||||||
|
:rtype: bool
|
||||||
|
"""
|
||||||
|
rados_inconsistent_objs = get_rados_inconsistent_objs(pg)
|
||||||
|
read_error_found = False
|
||||||
|
for inconsistent in rados_inconsistent_objs.get("inconsistents", []):
|
||||||
|
for shard in inconsistent.get("shards", []):
|
||||||
|
errors = shard.get("errors", [])
|
||||||
|
if errors == ["read_error"]:
|
||||||
|
if read_error_found:
|
||||||
|
return False
|
||||||
|
read_error_found = True
|
||||||
|
continue
|
||||||
|
elif errors:
|
||||||
|
# Error other than "read_error" detected
|
||||||
|
return False
|
||||||
|
return read_error_found
|
||||||
|
|
||||||
|
|
||||||
|
def perform_pg_repairs(pgs):
|
||||||
|
"""Runs `ceph pg repair` on a group of placement groups.
|
||||||
|
All placement groups provided should be confirmed as safe prior to using
|
||||||
|
this method.
|
||||||
|
|
||||||
|
:param pgs: List of safe-to-repair placement groups
|
||||||
|
:type pg: list[str]
|
||||||
|
"""
|
||||||
|
for pg in pgs:
|
||||||
|
log("Repairing ceph placement group {}".format(pg))
|
||||||
|
check_output(["ceph", "pg", "repair", pg])
|
||||||
|
|
||||||
|
|
||||||
|
def pg_repair():
|
||||||
|
"""Repair all inconsistent placement groups caused by read errors."""
|
||||||
|
ceph_pools = list_pools()
|
||||||
|
if not ceph_pools:
|
||||||
|
msg = "No Ceph pools found."
|
||||||
|
log(msg)
|
||||||
|
function_set(msg)
|
||||||
|
return
|
||||||
|
|
||||||
|
# Get inconsistent placement groups
|
||||||
|
inconsistent_pgs = get_inconsistent_pgs(ceph_pools)
|
||||||
|
if not inconsistent_pgs:
|
||||||
|
msg = "No inconsistent placement groups found."
|
||||||
|
log(msg)
|
||||||
|
function_set(msg)
|
||||||
|
return
|
||||||
|
|
||||||
|
# Filter for known safe cases
|
||||||
|
safe_pg_repairs = get_safe_pg_repairs(inconsistent_pgs)
|
||||||
|
unsafe_pg_repairs = inconsistent_pgs.difference(safe_pg_repairs)
|
||||||
|
|
||||||
|
# Perform safe placement group repairs
|
||||||
|
if unsafe_pg_repairs:
|
||||||
|
log(
|
||||||
|
"Ignoring unsafe placement group repairs: {}".format(
|
||||||
|
unsafe_pg_repairs
|
||||||
|
)
|
||||||
|
)
|
||||||
|
if safe_pg_repairs:
|
||||||
|
log("Safe placement group repairs found: {}".format(safe_pg_repairs))
|
||||||
|
perform_pg_repairs(safe_pg_repairs)
|
||||||
|
function_set(
|
||||||
|
{
|
||||||
|
"message": "placement groups repaired: {}".format(
|
||||||
|
sorted(safe_pg_repairs)
|
||||||
|
)
|
||||||
|
}
|
||||||
|
)
|
||||||
|
else:
|
||||||
|
msg = "No safe placement group repairs found."
|
||||||
|
log(msg)
|
||||||
|
function_set(msg)
|
||||||
|
|
||||||
|
|
||||||
|
def main():
|
||||||
|
try:
|
||||||
|
pg_repair()
|
||||||
|
except CalledProcessError as e:
|
||||||
|
log(e)
|
||||||
|
function_fail(
|
||||||
|
"Safe placement group repair failed with error: {}".format(str(e))
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
main()
|
280
unit_tests/test_action_pg_repair.py
Normal file
280
unit_tests/test_action_pg_repair.py
Normal file
@ -0,0 +1,280 @@
|
|||||||
|
# Copyright 2022 Canonical Ltd
|
||||||
|
#
|
||||||
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
# you may not use this file except in compliance with the License.
|
||||||
|
# You may obtain a copy of the License at
|
||||||
|
#
|
||||||
|
# http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
# Unless required by applicable law or agreed to in writing, software
|
||||||
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
# See the License for the specific language governing permissions and
|
||||||
|
# limitations under the License.
|
||||||
|
|
||||||
|
"""Tests for the pg_repair action."""
|
||||||
|
|
||||||
|
from actions import pg_repair as action
|
||||||
|
import unittest.mock as mock
|
||||||
|
from test_utils import CharmTestCase
|
||||||
|
import json
|
||||||
|
|
||||||
|
|
||||||
|
class PlacementGroupRepairTestCase(CharmTestCase):
|
||||||
|
"""Run tests for the action."""
|
||||||
|
|
||||||
|
def setUp(self):
|
||||||
|
"""Init mocks for test cases."""
|
||||||
|
super(PlacementGroupRepairTestCase, self).setUp(
|
||||||
|
action,
|
||||||
|
[
|
||||||
|
"function_fail",
|
||||||
|
"function_set",
|
||||||
|
"get_rados_inconsistent_objs",
|
||||||
|
"get_rados_inconsistent_pgs",
|
||||||
|
],
|
||||||
|
)
|
||||||
|
|
||||||
|
@mock.patch("actions.pg_repair.get_rados_inconsistent_pgs")
|
||||||
|
def test_get_inconsistent_pgs(self, _rados_inc_pgs):
|
||||||
|
"""Test collection of all inconsistent placement groups."""
|
||||||
|
_rados_inc_pgs.side_effect = (["1.a", "2.b"], ["2.b", "3.c"], [])
|
||||||
|
ceph_pools = ["testPool0", "testPool1", "testPool2"]
|
||||||
|
result = action.get_inconsistent_pgs(ceph_pools)
|
||||||
|
self.assertEqual(result, {"1.a", "2.b", "3.c"})
|
||||||
|
|
||||||
|
@mock.patch("actions.pg_repair.get_rados_inconsistent_objs")
|
||||||
|
def test_safe_case_detection(self, _rados_inc_objs):
|
||||||
|
"""Test that safe case is detected."""
|
||||||
|
_rados_inc_objs.return_value = rados_inc_obj_output_safe()
|
||||||
|
result = action.is_pg_safe_to_repair("")
|
||||||
|
self.assertTrue(result)
|
||||||
|
|
||||||
|
@mock.patch("actions.pg_repair.get_rados_inconsistent_objs")
|
||||||
|
def test_unsafe_case_detection_extra_erros(self, _rados_inc_objs):
|
||||||
|
"""Test that the unsafe case of extra errors is detected."""
|
||||||
|
_rados_inc_objs.return_value = rados_inc_obj_output_extra_errors()
|
||||||
|
result = action.is_pg_safe_to_repair("")
|
||||||
|
self.assertFalse(result)
|
||||||
|
|
||||||
|
@mock.patch("actions.pg_repair.get_rados_inconsistent_objs")
|
||||||
|
def test_unsafe_case_detection_multiple_read_errors(self, _rados_inc_objs):
|
||||||
|
"""Test that the unsafe case of multiple read errors is detected."""
|
||||||
|
_rados_inc_objs.return_value = (
|
||||||
|
rados_inc_obj_output_multiple_read_errors()
|
||||||
|
)
|
||||||
|
result = action.is_pg_safe_to_repair("")
|
||||||
|
self.assertFalse(result)
|
||||||
|
|
||||||
|
@mock.patch("actions.pg_repair.get_rados_inconsistent_objs")
|
||||||
|
def test_get_safe_pg_repair(self, _rados_inc_objs):
|
||||||
|
_rados_inc_objs.side_effect = (
|
||||||
|
rados_inc_obj_output_safe(),
|
||||||
|
rados_inc_obj_output_extra_errors(),
|
||||||
|
rados_inc_obj_output_multiple_read_errors(),
|
||||||
|
)
|
||||||
|
inconsistent_pgs = ("3.1f2", "12.ab3", "16.222")
|
||||||
|
result = action.get_safe_pg_repairs(inconsistent_pgs)
|
||||||
|
self.assertEqual(result, {"3.1f2"})
|
||||||
|
|
||||||
|
@mock.patch("actions.pg_repair.list_pools")
|
||||||
|
def test_pg_repair_no_ceph_pools(self, _list_pools):
|
||||||
|
"""Test action fails when no Ceph pools found."""
|
||||||
|
_list_pools.return_value = []
|
||||||
|
action.pg_repair()
|
||||||
|
msg = "No Ceph pools found."
|
||||||
|
self.function_set.assert_called_once_with(msg)
|
||||||
|
|
||||||
|
@mock.patch("actions.pg_repair.get_inconsistent_pgs")
|
||||||
|
@mock.patch("actions.pg_repair.list_pools")
|
||||||
|
def test_pg_repair_no_inconsistent_pgs(self, _list_pools, _get_inc_pgs):
|
||||||
|
_list_pools.return_value = ["testPool"]
|
||||||
|
_get_inc_pgs.return_value = []
|
||||||
|
action.pg_repair()
|
||||||
|
msg = "No inconsistent placement groups found."
|
||||||
|
self.function_set.assert_called_once_with(msg)
|
||||||
|
|
||||||
|
@mock.patch("actions.pg_repair.check_output")
|
||||||
|
@mock.patch("actions.pg_repair.get_rados_inconsistent_objs")
|
||||||
|
@mock.patch("actions.pg_repair.get_rados_inconsistent_pgs")
|
||||||
|
@mock.patch("actions.pg_repair.list_pools")
|
||||||
|
def test_pg_repair_safe_case(
|
||||||
|
self, _list_pools, _rados_inc_pgs, _rados_inc_objs, _check_output
|
||||||
|
):
|
||||||
|
"""Test action succeeds with one read error."""
|
||||||
|
_list_pools.return_value = ["testPool"]
|
||||||
|
_rados_inc_pgs.return_value = {"16.abf", "12.bd4"}
|
||||||
|
_rados_inc_objs.return_value = rados_inc_obj_output_safe()
|
||||||
|
_check_output.return_value = b""
|
||||||
|
action.pg_repair()
|
||||||
|
self.function_set.assert_called_once_with(
|
||||||
|
{"message": "placement groups repaired: ['12.bd4', '16.abf']"}
|
||||||
|
)
|
||||||
|
|
||||||
|
@mock.patch("actions.pg_repair.get_rados_inconsistent_objs")
|
||||||
|
@mock.patch("actions.pg_repair.get_rados_inconsistent_pgs")
|
||||||
|
@mock.patch("actions.pg_repair.list_pools")
|
||||||
|
def test_pg_repair_extra_errors(
|
||||||
|
self, _list_pools, _rados_inc_pgs, _rados_inc_objs
|
||||||
|
):
|
||||||
|
"""Test action fails with errors other than read errors."""
|
||||||
|
_list_pools.return_value = ["testPool"]
|
||||||
|
_rados_inc_pgs.return_value = {"16.abf", "12.bd4"}
|
||||||
|
_rados_inc_objs.return_value = rados_inc_obj_output_extra_errors()
|
||||||
|
action.pg_repair()
|
||||||
|
self.function_set.assert_called_once()
|
||||||
|
|
||||||
|
@mock.patch("actions.pg_repair.get_rados_inconsistent_objs")
|
||||||
|
@mock.patch("actions.pg_repair.get_rados_inconsistent_pgs")
|
||||||
|
@mock.patch("actions.pg_repair.list_pools")
|
||||||
|
def test_pg_repair_multiple_read_errors(
|
||||||
|
self, _list_pools, _rados_inc_pgs, _rados_inc_objs
|
||||||
|
):
|
||||||
|
"""Test action fails with multiple read errors."""
|
||||||
|
_list_pools.return_value = ["testPool"]
|
||||||
|
_rados_inc_pgs.return_value = {"16.abf", "12.bd4"}
|
||||||
|
_rados_inc_objs.return_value = (
|
||||||
|
rados_inc_obj_output_multiple_read_errors()
|
||||||
|
)
|
||||||
|
action.pg_repair()
|
||||||
|
self.function_set.assert_called_once()
|
||||||
|
|
||||||
|
|
||||||
|
def rados_inc_obj_output_safe():
|
||||||
|
return json.loads("""{
|
||||||
|
"epoch": 873,
|
||||||
|
"inconsistents": [
|
||||||
|
{
|
||||||
|
"object": {
|
||||||
|
"data": "nothing to see here"
|
||||||
|
},
|
||||||
|
"errors": [],
|
||||||
|
"union_shard_errors": [
|
||||||
|
"read_error"
|
||||||
|
],
|
||||||
|
"selected_object_info": {
|
||||||
|
"data": "nothing to see here"
|
||||||
|
},
|
||||||
|
"shards": [
|
||||||
|
{
|
||||||
|
"osd": 53,
|
||||||
|
"primary": true,
|
||||||
|
"errors": [
|
||||||
|
"read_error"
|
||||||
|
],
|
||||||
|
"size": 4046848
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"osd": 56,
|
||||||
|
"primary": false,
|
||||||
|
"errors": [],
|
||||||
|
"size": 4046848,
|
||||||
|
"omap_digest": "0xffffffff",
|
||||||
|
"data_digest": "0xb86056e7"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"osd": 128,
|
||||||
|
"primary": false,
|
||||||
|
"errors": [],
|
||||||
|
"size": 4046848,
|
||||||
|
"omap_digest": "0xffffffff",
|
||||||
|
"data_digest": "0xb86056e7"
|
||||||
|
}
|
||||||
|
]
|
||||||
|
}
|
||||||
|
]
|
||||||
|
}""")
|
||||||
|
|
||||||
|
|
||||||
|
def rados_inc_obj_output_extra_errors():
|
||||||
|
return json.loads("""{
|
||||||
|
"epoch": 873,
|
||||||
|
"inconsistents": [
|
||||||
|
{
|
||||||
|
"object": {
|
||||||
|
"data": "nothing to see here"
|
||||||
|
},
|
||||||
|
"errors": [],
|
||||||
|
"union_shard_errors": [
|
||||||
|
"read_error"
|
||||||
|
],
|
||||||
|
"selected_object_info": {
|
||||||
|
"data": "nothing to see here"
|
||||||
|
},
|
||||||
|
"shards": [
|
||||||
|
{
|
||||||
|
"osd": 53,
|
||||||
|
"primary": true,
|
||||||
|
"errors": [
|
||||||
|
"read_error",
|
||||||
|
"some_other_error"
|
||||||
|
],
|
||||||
|
"size": 4046848
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"osd": 56,
|
||||||
|
"primary": false,
|
||||||
|
"errors": [],
|
||||||
|
"size": 4046848,
|
||||||
|
"omap_digest": "0xffffffff",
|
||||||
|
"data_digest": "0xb86056e7"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"osd": 128,
|
||||||
|
"primary": false,
|
||||||
|
"errors": [],
|
||||||
|
"size": 4046848,
|
||||||
|
"omap_digest": "0xffffffff",
|
||||||
|
"data_digest": "0xb86056e7"
|
||||||
|
}
|
||||||
|
]
|
||||||
|
}
|
||||||
|
]
|
||||||
|
}""")
|
||||||
|
|
||||||
|
|
||||||
|
def rados_inc_obj_output_multiple_read_errors():
|
||||||
|
return json.loads("""{
|
||||||
|
"epoch": 873,
|
||||||
|
"inconsistents": [
|
||||||
|
{
|
||||||
|
"object": {
|
||||||
|
"data": "nothing to see here"
|
||||||
|
},
|
||||||
|
"errors": [],
|
||||||
|
"union_shard_errors": [
|
||||||
|
"read_error"
|
||||||
|
],
|
||||||
|
"selected_object_info": {
|
||||||
|
"data": "nothing to see here"
|
||||||
|
},
|
||||||
|
"shards": [
|
||||||
|
{
|
||||||
|
"osd": 53,
|
||||||
|
"primary": true,
|
||||||
|
"errors": [
|
||||||
|
"read_error"
|
||||||
|
],
|
||||||
|
"size": 4046848
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"osd": 56,
|
||||||
|
"primary": false,
|
||||||
|
"errors": [
|
||||||
|
"read_error"
|
||||||
|
],
|
||||||
|
"size": 4046848,
|
||||||
|
"omap_digest": "0xffffffff",
|
||||||
|
"data_digest": "0xb86056e7"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"osd": 128,
|
||||||
|
"primary": false,
|
||||||
|
"errors": [],
|
||||||
|
"size": 4046848,
|
||||||
|
"omap_digest": "0xffffffff",
|
||||||
|
"data_digest": "0xb86056e7"
|
||||||
|
}
|
||||||
|
]
|
||||||
|
}
|
||||||
|
]
|
||||||
|
}""")
|
Loading…
Reference in New Issue
Block a user