#!/usr/bin/env python
"""Liveness/readiness probe helper for a Galera (Percona) cluster node.

The script compares the local MySQL ``wsrep`` status with the cluster state
and uuid stored in etcd and exposes the result over HTTP.
"""

import functools
import json
import logging
import os
import os.path
import socket
import time

try:
    import BaseHTTPServer
except ImportError:  # Python 3 renamed the module to http.server
    import http.server as BaseHTTPServer

import etcd
import pymysql.cursors

# Galera states
JOINING_STATE = 1
DONOR_DESYNCED_STATE = 2
JOINED_STATE = 3
SYNCED_STATE = 4

# Probe memory shared between requests (a new GaleraChecker is created per
# request, so this cannot live on the instance).
WAS_JOINED = False
OLD_STATE = 0

LOG_DATEFMT = "%Y-%m-%d %H:%M:%S"
LOG_FORMAT = "%(asctime)s.%(msecs)03d - %(levelname)s - %(message)s"
logging.basicConfig(format=LOG_FORMAT, datefmt=LOG_DATEFMT)
LOG = logging.getLogger(__name__)
LOG.setLevel(logging.DEBUG)

GLOBALS_PATH = "/etc/ccp/globals/globals.json"
DATADIR = "/var/lib/mysql"
SST_FLAG = os.path.join(DATADIR, "sst_in_progress")
PID_FILE = os.path.join(DATADIR, "mysqld.pid")
HOSTNAME = socket.getfqdn()
IPADDR = socket.gethostbyname(HOSTNAME)

# Filled in by set_globals() from the globals file.
MONITOR_PASSWORD = None
CLUSTER_NAME = None
ETCD_PATH = None
ETCD_HOST = None
ETCD_PORT = None


def retry(f):
    """Decorate ``f`` so that etcd/MySQL errors are retried.

    ``f`` is called at most three times.  When one of the first two calls
    raises ``etcd.EtcdException`` or ``pymysql.OperationalError`` a warning
    is logged and the wrapper sleeps one second before trying again.  The
    third call is not guarded, so its exception propagates to the caller.
    """
    @functools.wraps(f)
    def wrap(*args, **kwargs):
        attempts = 3
        delay = 1
        while attempts > 1:
            try:
                return f(*args, **kwargs)
            except etcd.EtcdException as e:
                LOG.warning('Etcd is not ready: %s', str(e))
            except pymysql.OperationalError as e:
                LOG.warning('Mysql is not ready: %s', str(e))
            LOG.warning('Retrying in %d seconds...', delay)
            time.sleep(delay)
            attempts -= 1
        # Last attempt: let any error reach the caller.
        return f(*args, **kwargs)
    return wrap


def get_etcd_client():
    """Return an etcd client for ETCD_HOST:ETCD_PORT."""
    etcd_client = etcd.Client(host=ETCD_HOST,
                              port=ETCD_PORT,
                              allow_reconnect=True,
                              read_timeout=2)
    return etcd_client


@retry
def get_mysql_client():
    """Open a connection to the local MySQL as the ``monitor`` user.

    Rows are returned as dictionaries (``DictCursor``).  The caller owns the
    connection and is responsible for closing it.
    """
    mysql_client = pymysql.connect(host='127.0.0.1',
                                   port=33306,
                                   user='monitor',
                                   password=MONITOR_PASSWORD,
                                   connect_timeout=1,
                                   read_timeout=1,
                                   cursorclass=pymysql.cursors.DictCursor)
    return mysql_client


class GaleraChecker(object):
    """Runs the liveness and readiness checks for the local Galera node."""

    def __init__(self):
        self.etcd_client = get_etcd_client()
        # Liveness check runs every 10 seconds with 5 seconds timeout (default)
        self.ttl = 20

    @retry
    def fetch_wsrep_data(self):
        """Return ``SHOW STATUS LIKE 'wsrep%'`` as a name -> value dict."""
        data = {}
        mysql_client = get_mysql_client()
        try:
            with mysql_client.cursor() as cursor:
                sql = "SHOW STATUS LIKE 'wsrep%'"
                cursor.execute(sql)
                for row in cursor.fetchall():
                    data[row['Variable_name']] = row['Value']
        finally:
            # A connection is opened for every probe; close it explicitly so
            # that connections do not pile up on the server.
            try:
                mysql_client.close()
            except pymysql.Error:
                # Already closed (e.g. after a failed query); nothing to
                # release, and the original error must not be masked.
                pass
        return data

    def check_if_sst_running(self):
        """Return True when the SST flag file exists in the data directory."""
        return os.path.isfile(SST_FLAG)

    def check_if_pidfile_created(self):
        """Return True when the mysqld pid file exists."""
        return os.path.isfile(PID_FILE)

    def check_if_galera_ready(self):
        """Readiness check.

        Returns True only when the cluster state stored in etcd is 'STEADY',
        the local wsrep status reports a synced, operational, connected and
        ready node, and the local cluster uuid equals the one stored in etcd.
        """
        state = self.fetch_cluster_state()
        if state != 'STEADY':
            LOG.error("Cluster state is not STEADY")
            return False

        wsrep_data = self.fetch_wsrep_data()
        uuid = self.etcd_get_cluster_uuid()

        # Checked in order; the first mismatch fails the probe.
        expected_values = (
            ("wsrep_local_state_comment", "Synced"),
            ("wsrep_evs_state", "OPERATIONAL"),
            ("wsrep_connected", "ON"),
            ("wsrep_ready", "ON"),
        )
        for name, expected in expected_values:
            actual = wsrep_data[name]
            if actual != expected:
                LOG.error("%s != '%s' - '%s'", name, expected, actual)
                return False

        if wsrep_data["wsrep_cluster_state_uuid"] != uuid:
            LOG.error("wsrep_cluster_state_uuid != '%s' - '%s'", uuid,
                      wsrep_data["wsrep_cluster_state_uuid"])
            return False

        LOG.info("Galera node is ready")
        return True

    def check_if_galera_alive(self):
        """Liveness check.

        The check is skipped (returns True) while the cluster is not
        'STEADY', while an SST is in progress, or before the pid file exists.
        Otherwise it returns False on a cluster uuid mismatch or when the
        node is in the JOINED state after having been seen joined before;
        in every other state the node key is refreshed in etcd and True is
        returned.
        """
        global WAS_JOINED
        global OLD_STATE

        # If cluster is not STEADY, nodes could be in strange positions,
        # like SST sync. We should postpone liveness checks 'till bootstrap is
        # done
        if not self.etcd_check_if_cluster_ready():
            LOG.info("Galera cluster status is not 'STEADY', skiping check")
            return True

        # During SST sync mysql can't accept any requests
        if self.check_if_sst_running():
            LOG.info("SST sync in progress, skiping check")
            return True

        if not self.check_if_pidfile_created():
            LOG.info("Mysql pid file is not yet created, skiping check")
            return True

        wsrep_data = self.fetch_wsrep_data()

        # If local uuid is different - we have a split brain.
        cluster_uuid = self.etcd_get_cluster_uuid()
        mysql_uuid = wsrep_data['wsrep_cluster_state_uuid']
        if cluster_uuid != mysql_uuid:
            LOG.error("Cluster uuid is differs from local one.")
            LOG.debug("Cluster uuid: %s Local uuid: %s",
                      cluster_uuid, mysql_uuid)
            return False

        # Node states check.
        state = int(wsrep_data['wsrep_local_state'])
        state_comment = wsrep_data['wsrep_local_state_comment']

        # Remember the state for the next probe before any branch returns.
        # (Previously the assignment sat after the if/elif/else, where every
        # branch had already returned, so OLD_STATE was never updated and the
        # "stuck in JOINED" detection below could not trigger.)
        previous_state = OLD_STATE
        OLD_STATE = state

        if state == SYNCED_STATE or state == DONOR_DESYNCED_STATE:
            WAS_JOINED = True
            LOG.info("State OK: %s", state_comment)
            self.etcd_register_in_path('nodes')
            return True

        if state == JOINED_STATE and WAS_JOINED:
            # Node was in the JOINED_STATE in the previous check too. It
            # seems it can't start syncing.
            if previous_state == JOINED_STATE:
                LOG.error("State BAD: %s", state_comment)
                LOG.error("Joined, but not syncing")
                self._etcd_delete()
                return False
            LOG.info("State OK: %s", state_comment)
            LOG.info("Probably will sync soon")
            self.etcd_register_in_path('nodes')
            # NOTE(review): the log says "State OK" but the probe fails;
            # kept as in the original - confirm whether True was intended.
            return False

        # Any other state is treated as a freshly joined node.
        LOG.info("State OK: %s", state_comment)
        LOG.info("Just joined")
        WAS_JOINED = True
        self.etcd_register_in_path('nodes')
        return True

    @retry
    def _etcd_delete(self):
        """Remove this node's key (ETCD_PATH/nodes/IPADDR) from etcd."""
        key = os.path.join(ETCD_PATH, 'nodes', IPADDR)
        self.etcd_client.delete(key, recursive=True, dir=True)
        LOG.warning("Deleted node's key '%s'", key)

    @retry
    def _etcd_set(self, data):
        """Write ``data`` - a (key, value) pair - to etcd with ``self.ttl``."""
        self.etcd_client.set(data[0], data[1], self.ttl)
        LOG.info("Set %s with value '%s'", data[0], data[1])

    @retry
    def _etcd_read(self, path):
        """Return the value stored under ETCD_PATH/``path``."""
        key = os.path.join(ETCD_PATH, path)
        return self.etcd_client.read(key).value

    def etcd_register_in_path(self, path):
        """Store the current timestamp under ETCD_PATH/``path``/IPADDR."""
        key = os.path.join(ETCD_PATH, path, IPADDR)
        self._etcd_set((key, time.time()))

    def etcd_check_if_cluster_ready(self):
        """Return True when the 'state' key in etcd equals 'STEADY'.

        A missing key counts as "not ready".
        """
        try:
            state = self._etcd_read('state')
        except etcd.EtcdKeyNotFound:
            return False
        return state == 'STEADY'

    def etcd_get_cluster_uuid(self):
        """Return the cluster uuid stored in etcd."""
        return self._etcd_read('uuid')

    def fetch_cluster_state(self):
        """Return the cluster state stored in etcd."""
        return self._etcd_read('state')
class GaleraHttpHandler(BaseHTTPServer.BaseHTTPRequestHandler):
    """Answers '/liveness' and '/readiness' GET requests with 200 or 503."""

    def do_GET(self):
        """Run the check selected by the request path.

        Responds 200 when the check succeeds and 503 when it fails, when the
        path is not supported, or when the check raises.
        """
        uri = self.path
        LOG.debug("Started processing GET '%s' request", uri)
        try:
            # Built inside the try block so that a failure while creating
            # the checker still produces a 503 response instead of an
            # unanswered request.
            checker = GaleraChecker()
            if uri == "/liveness":
                success = checker.check_if_galera_alive()
            elif uri == "/readiness":
                success = checker.check_if_galera_ready()
            else:
                LOG.error("Only '/liveness' and '/readiness' uri are"
                          " supported")
                success = False
            response = 200 if success else 503
            self.send_response(response)
            self.end_headers()
        except Exception as err:
            # Top-level boundary of the probe: log and report "unavailable".
            LOG.exception(err)
            self.send_response(503)
            self.end_headers()
        finally:
            LOG.debug("Finished processing GET request")


def run_server(port=8080):
    """Serve probe requests on all interfaces on ``port`` until stopped."""
    server_class = BaseHTTPServer.HTTPServer
    handler_class = GaleraHttpHandler
    server_address = ('', port)
    httpd = server_class(server_address, handler_class)
    LOG.info('Starting http server...')
    httpd.serve_forever()


def get_config():
    """Load the sections this script needs from the globals JSON file.

    Returns a dict with the 'percona', 'etcd', 'namespace' and
    'cluster_domain' entries of GLOBALS_PATH.  Raises KeyError when one of
    them is missing.
    """
    LOG.info("Getting global variables from %s", GLOBALS_PATH)
    with open(GLOBALS_PATH) as f:
        global_conf = json.load(f)
    wanted = ('percona', 'etcd', 'namespace', 'cluster_domain')
    variables = dict((key, global_conf[key]) for key in wanted)
    # Only the section names are logged: the 'percona' section carries the
    # monitor password, which must not end up in the log.
    LOG.debug("Loaded config sections: %s", sorted(variables))
    return variables


def set_globals():
    """Populate the module-level connection settings from the config."""
    global MONITOR_PASSWORD, CLUSTER_NAME
    global ETCD_PATH, ETCD_HOST, ETCD_PORT

    config = get_config()
    percona = config['percona']

    CLUSTER_NAME = percona['cluster_name']
    MONITOR_PASSWORD = percona['monitor_password']
    ETCD_PATH = "/galera/%s" % CLUSTER_NAME
    ETCD_HOST = "etcd.%s.svc.%s" % (config['namespace'],
                                    config['cluster_domain'])
    ETCD_PORT = int(config['etcd']['client_port']['cont'])


if __name__ == "__main__":
    # set_globals() loads the config itself; a separate get_config() call
    # here would only read and parse the file a second time.
    set_globals()
    run_server()