Browse Source

HA for etcd based on Statefulsets

- no external monitor, so after a certain number of partial outages etcd
  might lose quorum;
- allows specifying any etcd command-line arguments in config;

Change-Id: Ib2b2b8bd9da2db4fb16914b6bb014fb38834c8e8
changes/90/436490/8
Aleksandr Mogylchenko 2 years ago
parent
commit
94b4940afa
4 changed files with 219 additions and 15 deletions
  1. 4
    3
      service/etcd.yaml
  2. 4
    0
      service/files/defaults.yaml
  3. 211
    0
      service/files/entrypoint.py
  4. 0
    12
      service/files/entrypoint.sh.j2

+ 4
- 3
service/etcd.yaml View File

@@ -1,6 +1,7 @@
1 1
 dsl_version: 0.4.0
2 2
 service:
3 3
   name: etcd
4
+  kind: StatefulSet
4 5
   ports:
5 6
     - {{ etcd.client_port }}
6 7
     - {{ etcd.server_port }}
@@ -12,7 +13,7 @@ service:
12 13
     - name: etcd
13 14
       image: etcd
14 15
       daemon:
15
-        command: /opt/ccp/bin/entrypoint.sh
16
+        command: /opt/ccp/bin/entrypoint.py
16 17
         files:
17 18
           - entrypoint
18 19
       # {% if etcd.tls.enabled %}
@@ -22,8 +23,8 @@ service:
22 23
 
23 24
 files:
24 25
   entrypoint:
25
-    path: /opt/ccp/bin/entrypoint.sh
26
-    content: entrypoint.sh.j2
26
+    path: /opt/ccp/bin/entrypoint.py
27
+    content: entrypoint.py
27 28
     perm: "0755"
28 29
 # {% if etcd.tls.enabled %}
29 30
   server_certificate:

+ 4
- 0
service/files/defaults.yaml View File

@@ -8,6 +8,10 @@ configs:
8 8
       cont: 2380
9 9
     tls:
10 10
       enabled: true
11
+    token: cluster
12
+    additional_arguments:
13
+      election-timeout: 5000
14
+      heartbeat-interval: 250
11 15
 
12 16
 versions:
13 17
   etcd_version: v3.0.12

+ 211
- 0
service/files/entrypoint.py View File

@@ -0,0 +1,211 @@
1
+#!/usr/bin/env python
2
+
3
+import functools
4
+import json
5
+import logging
6
+import requests
7
+import socket
8
+import subprocess
9
+import time
10
+import urlparse
11
+
12
+from requests.exceptions import RequestException, ConnectionError
13
+LOG_DATEFMT = "%Y-%m-%d %H:%M:%S"
14
+LOG_FORMAT = "%(asctime)s.%(msecs)03d - %(levelname)s - %(message)s"
15
+logging.basicConfig(format=LOG_FORMAT,
16
+                    datefmt=LOG_DATEFMT,
17
+                    level=logging.DEBUG)
18
+LOG = logging.getLogger(__name__)
19
+
20
+GLOBALS_PATH = '/etc/ccp/globals/globals.json'
21
+
22
+
23
def retry(f):
    """Decorate *f* so that failed HTTP requests are retried.

    The number of attempts and the pause between them are read from the
    module-level ``config`` object (``connection_attempts`` and
    ``connection_delay``) each time the wrapped function is called.
    Every attempt but the last swallows ``requests`` exceptions, logs a
    warning and sleeps; the last attempt lets any exception propagate.
    """
    @functools.wraps(f)
    def wrap(*args, **kwargs):
        total_attempts = config.connection_attempts
        pause = config.connection_delay
        # All attempts except the final one are guarded.
        for _ in range(max(total_attempts - 1, 0)):
            try:
                return f(*args, **kwargs)
            except (RequestException, ConnectionError) as err:
                LOG.warning('Retrying in %d seconds because of %s', pause, err)
                time.sleep(pause)
        # Final attempt: errors are the caller's problem now.
        return f(*args, **kwargs)
    return wrap
37
+
38
+
39
class Configuration(object):
    """Settings for this etcd member, derived from the CCP globals file.

    The constructor reads the ``etcd``, ``namespace``, ``security`` and
    ``cluster_domain`` sections of the JSON file (a missing section raises
    ``KeyError``), resolves the local hostname and IP address, and derives
    the URLs and names used to start etcd and to talk to the members API.
    """

    def __init__(self, config_file):
        """Load *config_file* (JSON) and compute all derived attributes."""
        LOG.info("Getting global variables from %s", config_file)
        with open(config_file) as f:
            global_conf = json.load(f)
        # Every listed section must exist, even those not used below.
        values = {}
        for key in ['etcd', 'namespace', 'security', 'cluster_domain']:
            values[key] = global_conf[key]
        etcd_conf = values['etcd']
        hostname = socket.gethostname()
        ipaddr = socket.gethostbyname(hostname)
        self.etcd_binary = '/usr/local/bin/etcd'
        # Retry policy consumed by the ``retry`` decorator.
        self.connection_delay = 2
        self.connection_attempts = 5
        self.client_port = int(etcd_conf['client_port']['cont'])
        self.server_port = int(etcd_conf['server_port']['cont'])
        self.tls = etcd_conf['tls']['enabled']
        self.token = etcd_conf['token']
        self.namespace = values['namespace']
        self.cluster_domain = values['cluster_domain']
        self.api_version = 'v2'
        if self.tls:
            self.host_template = 'https://%s:%d'
            self.cert_file = '/opt/ccp/etc/tls/etcd_server_certificate.pem'
            self.key_file = '/opt/ccp/etc/tls/etcd_server_key.pem'
            self.ca_file = '/opt/ccp/etc/tls/ca.pem'
            # Passed as ``verify=`` to requests: CA bundle path when TLS is on.
            self.verify_connectivity = self.ca_file
        else:
            self.host_template = 'http://%s:%d'
            self.verify_connectivity = False
        fqdn_template = "%s.%s.svc.%s"
        svc = fqdn_template % ('etcd', self.namespace, self.cluster_domain)
        # Represents fqdn service endpoint for etcd
        self.service = self.host_template % (svc, self.client_port)
        members_endpoint = '%s/members/' % self.api_version
        # URL to query when accessing etcd members api
        self.members_api = urlparse.urljoin(self.service, members_endpoint)
        # When joining etcd cluster, members list is special:
        # <name>=<peerURL>,<name2>=<peerURL2>,...
        self.name = "%s.%s" % (hostname, svc)
        self.peer_url = self.host_template % (ipaddr, self.server_port)
        self.member_name = "%s=%s" % (self.name, self.peer_url)
        # Optional mapping of extra etcd flags (flag name -> value), or None.
        self.arguments = etcd_conf.get('additional_arguments', None)
81
+
82
+
83
def start_etcd(config, bootstrap=False, initial_members=None):
    """Build the etcd command line from *config* and run it.

    :param config: object exposing ``name``, ``client_port``,
        ``server_port``, ``host_template``, ``tls``, ``token``,
        ``etcd_binary``, ``arguments`` (and ``cert_file``/``key_file`` when
        ``tls`` is true).
    :param bootstrap: when true, pass an ``--initial-cluster`` that lists
        only this member.
    :param initial_members: ``name=peerURL,...`` string; when non-empty the
        member is started with ``--initial-cluster-state=existing``.

    The call goes through ``subprocess.check_call``, so it waits for the
    etcd process to finish and raises ``CalledProcessError`` if the exit
    status is non-zero.
    """
    member = config.name
    client_url = config.host_template % (member, config.client_port)
    peer_url = config.host_template % (member, config.server_port)

    listen_client_urls = client_url
    if config.tls:
        # We add insecure listener for checks
        listen_client_urls += ",http://%s:%s" % ('127.0.0.1',
                                                 config.client_port)

    cmd = [
        config.etcd_binary,
        '--name=%s' % member,
        '--listen-peer-urls=%s' % peer_url,
        '--listen-client-urls=%s' % listen_client_urls,
        '--advertise-client-urls=%s' % client_url,
        '--initial-advertise-peer-urls=%s' % peer_url,
        '--initial-cluster-token=%s' % config.token,
    ]
    if config.tls:
        cmd.extend(['--peer-auto-tls',
                    '--cert-file=%s' % config.cert_file,
                    '--key-file=%s' % config.key_file])
    if bootstrap:
        cmd.append("--initial-cluster=%s=%s" % (member, peer_url))
    if initial_members:
        cmd.extend(["--initial-cluster-state=existing",
                    "--initial-cluster=%s" % initial_members])
    if config.arguments:
        LOG.debug("Additional arguments are %s" % config.arguments)
        for option, value in config.arguments.iteritems():
            cmd.append("--%s=%s" % (option, value))
    LOG.info("Launching etcd with %s" % cmd)
    subprocess.check_call(cmd, shell=False)
116
+
117
+
118
@retry
def _add_etcd_member(members_api, peer_url):
    """Register *peer_url* as a new member through the etcd members API.

    :param members_api: URL of the members collection endpoint.
    :param peer_url: peer URL of the member being added.
    :returns: *peer_url* when the cluster answers 201.
    :raises requests.HTTPError: for any other status. Together with
        connection errors this is retried by the ``retry`` decorator.
    """
    headers = {'content-type': 'application/json'}
    data = {'peerURLs': [peer_url]}
    verify = config.verify_connectivity
    r = requests.post(members_api, json=data, headers=headers, verify=verify)
    # https://coreos.com/etcd/docs/latest/v2/members_api.html
    if r.status_code == 201:
        return peer_url
    if r.status_code == 500:
        # Request failed, but might be processed later, not sure how to handle
        LOG.debug('Etcd cluster returned 500, might be busy...')
    r.raise_for_status()
    # raise_for_status() only raises for 4xx/5xx. Any other non-201 answer
    # must not be reported as a successful add by returning None.
    raise requests.HTTPError(
        'Unexpected status %d while adding member via %s'
        % (r.status_code, members_api), response=r)
133
+
134
+
135
@retry
def _delete_etcd_member(members_api, name):
    """Remove the member called *name* from the etcd cluster.

    HTTP API needs id of the member to delete it, so the member list is
    fetched first and the id looked up by name - 2 calls total.

    :param members_api: URL of the members collection endpoint.
    :param name: etcd member name to remove.
    :returns: the fetched member list without the entry named *name*.
    :raises requests.HTTPError: when the delete does not answer 204.
    """
    peers = _get_etcd_members(members_api)
    _id = _get_etcd_member_id(peers, name)
    if _id is None:
        # Nothing to delete (e.g. a previous attempt already removed it).
        # Joining a None id onto the URL would not address any member.
        LOG.debug("Member %s not found in etcd cluster, nothing to delete",
                  name)
        return peers
    LOG.debug("Deleting %s with id %s from etcd cluster...", name, _id)
    url = urlparse.urljoin(members_api, _id)
    verify = config.verify_connectivity
    r = requests.delete(url, verify=verify)
    if r.status_code != 204:
        LOG.debug("Delete failed with error %i", r.status_code)
        r.raise_for_status()
        # raise_for_status() only raises for 4xx/5xx; do not fall through
        # and hand None back to a caller that expects a list.
        raise requests.HTTPError(
            'Unexpected status %d while deleting member %s'
            % (r.status_code, name), response=r)
    return [p for p in peers if p['name'] != name]
150
+
151
+
152
@retry
def _get_etcd_members(members_api):
    """Fetch the member list from the etcd members API.

    :param members_api: URL of the members collection endpoint.
    :returns: the ``members`` value of the JSON answer on status 200;
        ``None`` for any other status that is not an HTTP error.
    :raises requests.HTTPError: when the answer is a 4xx/5xx status.
    """
    response = requests.get(members_api, verify=config.verify_connectivity)
    if response.status_code != 200:
        response.raise_for_status()
        return None
    return response.json()['members']
161
+
162
+
163
+def _etcd_members_as_string(peers):
164
+    # <name>=<peerURL>,<name2>=<peerURL2>,...
165
+    l = []
166
+    for m in peers:
167
+        if m['name']:
168
+            l.append("%s=%s" % (m['name'], m['peerURLs'][0]))
169
+    return ",".join(l)
170
+
171
+
172
+def _get_etcd_member_id(peers, name):
173
+    # Get member id from peers list
174
+    members = [p['id'] for p in peers if p['name'] == name]
175
+    if members:
176
+        return members[0]
177
+    else:
178
+        return None
179
+
180
+
181
if __name__ == "__main__":
    # ``config`` has to stay a module-level name: the ``retry`` decorator and
    # the member helpers read it as a global.
    config = Configuration(GLOBALS_PATH)
    etcd_members_api = config.members_api
    try:
        # The only reliable way to determine if etcd cluster exists is to query
        # service.
        peers = _get_etcd_members(etcd_members_api)
        members = _etcd_members_as_string(peers)
    except ConnectionError:
        LOG.debug("No one seems to be alive...")
        peers = []
        members = ""
    if not members:
        # TODO(amnk): add recovery from complete disaster (e.g. restore data
        # from data-dir if it is available)
        LOG.debug("I'm a leader, starting...")
        start_etcd(config, bootstrap=True)
    else:
        # Compare member names exactly. A substring test against the joined
        # "name=url,..." string could match a different member.
        if any(p['name'] == config.name for p in peers):
            # If we find our hostname in existing members, we are recovering
            # from some failure. Since we cannot guarantee having all needed
            # data on new node, we need to delete ourselves before joining.
            LOG.debug("Found myself in members...")
            new_peers = _delete_etcd_member(etcd_members_api, config.name)
            new_members = _etcd_members_as_string(new_peers)
        else:
            new_members = members
        LOG.debug("Adding myself to cluster %s...", etcd_members_api)
        _add_etcd_member(etcd_members_api, config.peer_url)
        # Drop an empty remainder so the list never starts with a comma.
        all_members = ",".join(
            part for part in (new_members, config.member_name) if part)
        LOG.debug("Joining %s", all_members)
        start_etcd(config, initial_members=all_members)

+ 0
- 12
service/files/entrypoint.sh.j2 View File

@@ -1,12 +0,0 @@
1
-#!/usr/bin/env bash
2
-
3
-{% if etcd.tls.enabled %}
4
-etcd --listen-client-urls=https://{{ network_topology["private"]["address"] }}:{{ etcd.client_port.cont }},http://127.0.0.1:{{ etcd.client_port.cont }}\
5
-     --advertise-client-urls=https://{{ address("etcd", etcd.client_port, with_scheme=False) }}\
6
-     --peer-auto-tls\
7
-     --cert-file=/opt/ccp/etc/tls/etcd_server_certificate.pem\
8
-     --key-file=/opt/ccp/etc/tls/etcd_server_key.pem\
9
-{% else %}
10
-etcd --listen-client-urls http://0.0.0.0:{{ etcd.client_port.cont }}\
11
-     --advertise-client-urls {{ address("etcd", etcd.client_port, with_scheme=True) }}
12
-{% endif %}

Loading…
Cancel
Save