Fixed race condition in the Restore workflow

When an instance is restored from a backup, the root password is reset by
a temporary mysqld process. Previously that process could be killed before
it had finished starting up. We now check that the mysqld daemon is up
before we kill the process. This ensures that the root password is always
reset successfully.

Fixes bug 1187958

Change-Id: If5a5c0293c0dc51c3f118abbcc52c3fb0d1d3cfa
This commit is contained in:
Nikhil Manchanda 2013-06-05 19:07:03 -07:00
parent c387692d50
commit 44c0dba4c6
3 changed files with 41 additions and 6 deletions

View File

@ -6,6 +6,7 @@ from reddwarf.guestagent import volume
from reddwarf.guestagent.manager.mysql_service import MySqlAppStatus from reddwarf.guestagent.manager.mysql_service import MySqlAppStatus
from reddwarf.guestagent.manager.mysql_service import MySqlAdmin from reddwarf.guestagent.manager.mysql_service import MySqlAdmin
from reddwarf.guestagent.manager.mysql_service import MySqlApp from reddwarf.guestagent.manager.mysql_service import MySqlApp
from reddwarf.instance import models as rd_models
from reddwarf.openstack.common import log as logging from reddwarf.openstack.common import log as logging
from reddwarf.openstack.common.gettextutils import _ from reddwarf.openstack.common.gettextutils import _
from reddwarf.openstack.common import periodic_task from reddwarf.openstack.common import periodic_task
@ -65,10 +66,16 @@ class Manager(periodic_task.PeriodicTasks):
def is_root_enabled(self, context): def is_root_enabled(self, context):
return MySqlAdmin().is_root_enabled() return MySqlAdmin().is_root_enabled()
def _perform_restore(self, backup_id, context, restore_location): def _perform_restore(self, backup_id, context, restore_location, app):
LOG.info(_("Restoring database from backup %s" % backup_id)) LOG.info(_("Restoring database from backup %s" % backup_id))
backup.restore(context, backup_id, restore_location) try:
LOG.info(_("Restored database")) backup.restore(context, backup_id, restore_location)
except Exception as e:
LOG.error(e)
LOG.error("Error performing restore from backup %s", backup_id)
app.status.set_status(rd_models.ServiceStatuses.FAILED)
raise
LOG.info(_("Restored database successfully"))
def prepare(self, context, databases, memory_mb, users, device_path=None, def prepare(self, context, databases, memory_mb, users, device_path=None,
mount_point=None, backup_id=None): mount_point=None, backup_id=None):
@ -96,7 +103,7 @@ class Manager(periodic_task.PeriodicTasks):
app.start_mysql() app.start_mysql()
app.install_if_needed() app.install_if_needed()
if backup_id: if backup_id:
self._perform_restore(backup_id, context, CONF.mount_point) self._perform_restore(backup_id, context, CONF.mount_point, app)
LOG.info(_("Securing mysql now.")) LOG.info(_("Securing mysql now."))
app.secure(memory_mb) app.secure(memory_mb)
if backup_id and MySqlAdmin().is_root_enabled(): if backup_id and MySqlAdmin().is_root_enabled():

View File

@ -14,7 +14,9 @@
# under the License. # under the License.
# #
from reddwarf.guestagent.strategy import Strategy from reddwarf.guestagent.strategy import Strategy
from reddwarf.common import cfg, utils from reddwarf.common import cfg
from reddwarf.common import exception
from reddwarf.common import utils
from reddwarf.openstack.common import log as logging from reddwarf.openstack.common import log as logging
from eventlet.green import subprocess from eventlet.green import subprocess
import tempfile import tempfile
@ -25,12 +27,26 @@ import glob
LOG = logging.getLogger(__name__) LOG = logging.getLogger(__name__)
CONF = cfg.CONF CONF = cfg.CONF
CHUNK_SIZE = CONF.backup_chunk_size CHUNK_SIZE = CONF.backup_chunk_size
RESET_ROOT_RETRY_TIMEOUT = 100
RESET_ROOT_SLEEP_INTERVAL = 10
RESET_ROOT_MYSQL_COMMAND = """ RESET_ROOT_MYSQL_COMMAND = """
UPDATE mysql.user SET Password=PASSWORD('') WHERE User='root'; UPDATE mysql.user SET Password=PASSWORD('') WHERE User='root';
FLUSH PRIVILEGES; FLUSH PRIVILEGES;
""" """
def mysql_is_running():
    """Report whether the mysqld daemon answers a ping.

    Runs ``/usr/bin/mysqladmin ping`` through sudo. A
    ProcessExecutionError raised by that command is treated as
    "not up yet" rather than as a failure. The function takes no
    arguments, so it can be handed directly to utils.poll_until as the
    polling predicate.

    :returns: True if the ping command completed without raising
              ProcessExecutionError, False otherwise.
    """
    try:
        # Only success/failure of the command matters; its output is unused.
        utils.execute_with_timeout(
            "/usr/bin/mysqladmin",
            "ping", run_as_root=True, root_helper="sudo")
    except exception.ProcessExecutionError:
        LOG.info("Waiting for mysqld daemon to start")
        return False
    else:
        LOG.info("The mysqld daemon is up and running.")
        return True
class RestoreError(Exception): class RestoreError(Exception):
"""Error running the Backup Command.""" """Error running the Backup Command."""
@ -115,10 +131,21 @@ class RestoreRunner(Strategy):
try: try:
i = child.expect(['Starting mysqld daemon']) i = child.expect(['Starting mysqld daemon'])
if i == 0: if i == 0:
LOG.info("Root password reset successfully!") LOG.info("Starting mysqld daemon")
except pexpect.TIMEOUT as e: except pexpect.TIMEOUT as e:
LOG.error("wait_and_close_proc failed: %s" % e) LOG.error("wait_and_close_proc failed: %s" % e)
finally: finally:
try:
# There is a race condition here where we kill mysqld before
# the init file has been executed. We need to ensure mysqld is up.
utils.poll_until(mysql_is_running,
sleep_time=RESET_ROOT_SLEEP_INTERVAL,
time_out=RESET_ROOT_RETRY_TIMEOUT)
except exception.PollTimeOut:
raise RestoreError("Reset root password failed: "
"mysqld did not start!")
LOG.info("Root password reset successfully!")
LOG.info("Cleaning up the temp mysqld process...") LOG.info("Cleaning up the temp mysqld process...")
child.delayafterclose = 1 child.delayafterclose = 1
child.delayafterterminate = 1 child.delayafterterminate = 1

View File

@ -43,6 +43,7 @@ class InnoBackupEx(base.RestoreRunner):
is_zipped = True is_zipped = True
restore_cmd = 'sudo xbstream -x -C %(restore_location)s' restore_cmd = 'sudo xbstream -x -C %(restore_location)s'
prepare_cmd = ('sudo innobackupex --apply-log %(restore_location)s ' prepare_cmd = ('sudo innobackupex --apply-log %(restore_location)s '
'--defaults-file=%(restore_location)s/backup-my.cnf '
'--ibbackup xtrabackup 2>/tmp/innoprepare.log') '--ibbackup xtrabackup 2>/tmp/innoprepare.log')
def _pre_restore(self): def _pre_restore(self):