* Add a connection-string based workflow to MicroStack;
  * the `microstack add-compute` command can be run at the Control node in order to generate a connection string (an ASCII blob for the user);
  * the connection string contains:
    * an address of the control node;
    * a sha256 fingerprint of the TLS certificate used by the clustering service at the control node (which is used during verification similar to the Certificate Pinning approach);
    * an application credential id;
    * an application credential secret (short expiration time, reader role on the service project, restricted to listing the service catalog);
  * a MicroStack admin is expected to have ssh access to all nodes that will participate in a cluster - establishing prior trust between the nodes is left to the admin, which is normal since they provision the nodes;
  * a MicroStack admin is expected to securely copy a connection string to a compute node via ssh. Since it is short-lived and does not carry service secrets, there is no risk of a replay at a later time;
  * if the compute role is specified during microstack.init, a connection string is requested and used to perform a request to the clustering service and validate the certificate fingerprint. The credential ID and secret are POSTed for verification to the clustering service, which responds with the necessary config data for the compute node upon successful authorization.
* Set up TLS termination for the clustering service;
  * run the flask app as a UWSGI daemon behind nginx;
  * configure nginx to use a TLS certificate;
  * generate a self-signed TLS certificate.

This setup does not require PKI to be present for its own purposes of joining compute nodes to the cluster. However, this does not mean that PKI will not be used for TLS termination of the OpenStack endpoints.
Control node init workflow (non-interactive):

    sudo microstack init --auto --control
    microstack add-compute
    <the connection string to be used at the compute node>

Compute node init workflow (non-interactive):

    sudo microstack init --auto --compute --join <connection-string>

Change-Id: I9596fe1e6e5c1a325cc71fd3bf0c78b660b9a83e
changes/58/757658/4
@ -0,0 +1,11 @@ | |||
; uWSGI configuration template for the clustering service: loads the `app`
; object from the `cluster.daemon` module and listens on a unix socket (the
; same socket path the clustering nginx site passes requests to).
[uwsgi]
module = cluster.daemon:app
uwsgi-socket = {{ snap_common }}/run/cluster-api.sock
buffer-size = 65535
master = true
enable-threads = true
processes = 2
thunder-lock = true
lazy-apps = true
home = {{ snap }}/usr
pyargv = {{ pyargv }}
@ -0,0 +1,20 @@ | |||
# nginx site template for the clustering service: accepts connections on
# port 10002 and passes every request to the clustering API uWSGI socket.
server {
    # NOTE(review): "ssl" is requested on the listen directive
    # unconditionally, while the certificate and key directives below are
    # only rendered when is_clustered is set - confirm this template is
    # never rendered with is_clustered unset.
    listen 10002 ssl;
    error_log syslog:server=unix:/dev/log;
    access_log syslog:server=unix:/dev/log;
    {% if is_clustered %}
    ssl_session_timeout 1d;
    ssl_session_tickets off;
    ssl_protocols TLSv1.2 TLSv1.3;
    ssl_ciphers ECDHE-ECDSA-AES128-GCM-SHA256:ECDHE-RSA-AES128-GCM-SHA256:ECDHE-ECDSA-AES256-GCM-SHA384:ECDHE-RSA-AES256-GCM-SHA384:ECDHE-ECDSA-CHACHA20-POLY1305:ECDHE-RSA-CHACHA20-POLY1305:DHE-RSA-AES128-GCM-SHA256:DHE-RSA-AES256-GCM-SHA384;
    # Certificate and key paths are supplied by the template context.
    ssl_certificate {{ cluster_tls_cert_path }};
    ssl_certificate_key {{ cluster_tls_key_path }};
    {% endif %}
    location / {
        include {{ snap }}/usr/conf/uwsgi_params;
        uwsgi_param SCRIPT_NAME '';
        uwsgi_pass unix://{{ snap_common }}/run/cluster-api.sock;
    }
}
@ -1,10 +0,0 @@ | |||
server { | |||
listen 8011; | |||
error_log syslog:server=unix:/dev/log; | |||
access_log syslog:server=unix:/dev/log; | |||
location / { | |||
include {{ snap }}/usr/conf/uwsgi_params; | |||
uwsgi_param SCRIPT_NAME ''; | |||
uwsgi_pass unix://{{ snap_common }}/run/keystone-api.sock; | |||
} | |||
} |
@ -0,0 +1,122 @@ | |||
#!/usr/bin/env python3 | |||
import uuid | |||
import secrets | |||
import argparse | |||
from datetime import datetime | |||
from dateutil.relativedelta import relativedelta | |||
from oslo_serialization import ( | |||
base64, | |||
msgpackutils | |||
) | |||
from cluster.shell import config_get | |||
from keystoneauth1.identity import v3 | |||
from keystoneauth1 import session | |||
from keystoneclient.v3 import client | |||
# How long a generated application credential (and therefore the connection
# string that carries it) stays valid after it is created.
VALIDITY_PERIOD = relativedelta(minutes=20)
def _create_credential():
    """Create a short-lived, restricted Keystone application credential.

    Authenticates against the local Keystone endpoint as the ``nova`` user
    of the ``service`` project and creates an application credential that:

    * carries only the ``reader`` role;
    * is limited by an access rule to ``GET /v3/auth/catalog``;
    * is not allowed to create further application credentials;
    * expires ``VALIDITY_PERIOD`` after it is created.

    :returns: the application credential object returned by keystoneclient
        (the caller reads its ``id`` and ``secret`` attributes).
    """
    project_name = 'service'
    domain_name = 'default'
    # TODO: add support for TLS-terminated Keystone once this is supported.
    auth = v3.password.Password(
        auth_url="http://localhost:5000/v3",
        username='nova',
        password=config_get('config.credentials.nova-password'),
        user_domain_name=domain_name,
        project_domain_name=domain_name,
        project_name=project_name,
    )
    sess = session.Session(auth=auth)
    keystone_client = client.Client(session=sess)

    # Only allow this credential to list the Keystone catalog. After it
    # expires, Keystone will return Unauthorized for requests made with tokens
    # issued from that credential.
    access_rules = [{
        'method': 'GET',
        'path': '/v3/auth/catalog',
        'service': 'identity',
    }]

    # The expiration time is expressed in UTC: a naive datetime is sent to
    # Keystone as a UTC timestamp, so using local time here would shift the
    # validity window by the UTC offset of the host (and could produce a
    # credential that is already expired).
    # TODO: make the expiration time customizable since this may be used by
    # automation or during live demonstrations where the lag between issuance
    # and usage may be more than the expiration time.
    expires_at = datetime.utcnow() + VALIDITY_PERIOD

    # Role objects themselves are not tied to a specific domain by default
    # - this does not affect role assignments themselves which are scoped.
    reader_role = keystone_client.roles.find(name='reader', domain_id=None)

    return keystone_client.application_credentials.create(
        name=f'cluster-join-{uuid.uuid4().hex}',
        expires_at=expires_at,
        access_rules=access_rules,
        # Do not allow this app credential to create new app credentials.
        unrestricted=False,
        roles=[reader_role.id],
        # Make the secret shorter than the default but secure enough.
        secret=secrets.token_urlsafe(32)[:32],
    )
def add_compute():
    """Print a connection string for adding a compute node to the cluster.

    Steps:

    * refuse to run unless ``config.cluster.role`` is ``control``;
    * create a restricted, short-lived application credential via
      ``_create_credential``;
    * collect the control node address and the TLS certificate fingerprint
      from the snap config;
    * serialize these values with msgpack, base64-encode the result and
      print it together with an expiration notice.

    :raises Exception: if the node is not a control node.
    """
    if config_get('config.cluster.role') != 'control':
        raise Exception('Running add-compute is only supported on a'
                        ' control node.')

    credential = _create_credential()

    # TODO: we do not use hostname verification, however, using
    # an FQDN might be useful here since the host may be behind NAT
    # with a split-horizon DNS implemented where a hostname would point
    # us to a different IP.
    control_address = config_get('config.network.control-ip')
    # Raw bytes are stored since they are shorter than the hex form.
    fingerprint_bytes = bytes.fromhex(
        config_get('config.cluster.fingerprint'))

    payload = dict(
        hostname=control_address,
        fingerprint=fingerprint_bytes,
        id=credential.id,
        secret=credential.secret,
    )
    encoded = base64.encode_as_text(msgpackutils.dumps(payload))

    # Tell the user how long the printed connection string can be used.
    notice = ('Use the following connection string to add a new compute node'
              f' to the cluster (valid for {VALIDITY_PERIOD.minutes} minutes'
              ' from this moment):')
    print(f'{notice}\n{encoded}')
def main():
    """Command-line entry point of ``add-compute``.

    The command takes no arguments of its own: the parser only provides
    ``--help`` and rejects unexpected arguments before the connection string
    is generated.
    """
    usage_text = (
        'add-compute\n'
        'This command does not have subcommands - just run it to get a'
        ' connection string\n'
        'to be used when joining a node to the cluster.\n'
    )
    cli = argparse.ArgumentParser(description='add-compute',
                                  usage=usage_text)
    cli.parse_args()
    add_compute()


if __name__ == '__main__':
    main()
@ -1,37 +1,100 @@ | |||
#!/usr/bin/env python3 | |||
import urllib3 | |||
import json | |||
import requests | |||
from cluster import shell | |||
from cluster.shell import check_output | |||
# TCP port of the clustering service on the control node that join() connects
# to over HTTPS.
CLUSTER_SERVICE_PORT = 10002
class UnauthorizedRequestError(Exception):
    """Raised when the clustering service answers a join request with 401."""
def join():
    """Join an existing cluster as a compute node.

    Reads the control node address, the expected TLS certificate fingerprint
    and the application credential from the ``config.cluster`` snap config,
    POSTs the credential to the ``/join`` endpoint of the clustering service
    and stores the credentials and the control node IP found in the response
    in the local snap config.

    :raises UnauthorizedRequestError: if the clustering service responds
        with a 401 status.
    :raises Exception: on a TLS failure, a connection failure, an unexpected
        response status, or an empty / non-UTF-8 response body.
    """
    cluster_config = shell.config_get('config.cluster')
    control_hostname = cluster_config['hostname']
    fingerprint = cluster_config['fingerprint']
    credential_id = cluster_config['credential-id']
    credential_secret = cluster_config['credential-secret']

    request_body = json.dumps({
        'credential-id': credential_id,
        'credential-secret': credential_secret
    })

    # Create a connection pool and override the TLS certificate
    # verification method to use the certificate fingerprint instead
    # of hostname validation + validation via CA cert and expiration time.
    # This avoids relying on any kind of PKI and DNS assumptions in the
    # installation environment.
    # If the fingerprint does not match, MaxRetryError will be raised
    # with SSLError as a cause even with the rest of the checks disabled.
    conn_pool = urllib3.HTTPSConnectionPool(
        control_hostname, CLUSTER_SERVICE_PORT,
        assert_fingerprint=fingerprint, assert_hostname=False,
        cert_reqs='CERT_NONE',
    )

    try:
        resp = conn_pool.urlopen(
            'POST', '/join', retries=0, preload_content=True,
            headers={
                'API-VERSION': '1.0.0',
                'Content-Type': 'application/json',
            }, body=request_body)
    except urllib3.exceptions.MaxRetryError as e:
        if isinstance(e.reason, urllib3.exceptions.SSLError):
            raise Exception(
                'The actual clustering service certificate fingerprint'
                ' did not match the expected one, please make sure that:'
                ' (1) a correct token was specified during initialization;'
                ' (2) MITM attacks are not performed against HTTPS requests'
                ' (including transparent proxies).'
            ) from e.reason
        raise Exception('Could not retrieve a response from the clustering'
                        ' service.') from e

    if resp.status == 401:
        print('An authorization failure has occurred while joining'
              ' the cluster: please make sure the connection string'
              ' was entered as returned by the "add-compute" command'
              ' and that it was used before its expiration time.')
        # The body may not have the expected format (e.g. when a proxy
        # server produced the response), so a failure to extract the message
        # must not hide the authorization error itself.
        message = None
        try:
            message = json.loads(resp.data.decode('utf-8'))['message']
        except (ValueError, KeyError, TypeError):
            # Deliberately ignored: fall back to an error without details.
            pass
        if message is not None:
            raise UnauthorizedRequestError(message)
        raise UnauthorizedRequestError()

    if resp.status != 200:
        raise Exception('Unexpected response status received from the'
                        f' clustering service: {resp.status}')

    try:
        response_data = resp.data.decode('utf-8')
    except UnicodeDecodeError as e:
        raise Exception('The response from the clustering service contains'
                        ' bytes invalid for UTF-8') from e
    if not response_data:
        raise Exception('The response from the clustering service is empty'
                        ' which is unexpected: please check its status'
                        ' and file an issue if the problem persists')

    # Load the response assuming it has the correct format. API versioning
    # should rule out inconsistencies, otherwise we will get an error here.
    response_dict = json.loads(response_data)
    credentials = response_dict['config']['credentials']
    control_creds = {f'config.credentials.{k}': v
                     for k, v in credentials.items()}
    shell.config_set(**control_creds)

    # TODO: use the hostname from the connection string instead to
    # resolve an IP address (requires a valid DNS setup).
    control_ip = response_dict['config']['network']['control-ip']
    shell.config_set(**{'config.network.control-ip': control_ip})
if __name__ == '__main__': | |||
@ -1,51 +1,287 @@ | |||
import logging | |||
import json | |||
from flask import Flask, request | |||
import semantic_version | |||
import keystoneclient.exceptions as kc_exceptions | |||
from flask import Flask, request, jsonify | |||
from werkzeug.exceptions import BadRequest | |||
from cluster.shell import check_output | |||
from keystoneauth1.identity import v3 | |||
from keystoneauth1 import session | |||
from keystoneclient.v3 import client as v3client | |||
# Module-level logger.
logger = logging.getLogger(__name__)
# Flask application object; the routes and error handlers of this module are
# registered on it.
app = Flask(__name__)
# API version served by this module; the /join handler compares the major
# component of the client's API-Version header against it.
API_VERSION = semantic_version.Version('1.0.0')
class Unauthorized(Exception): | |||
pass | |||
def join_info(password, ip_address): | |||
our_password = check_output('snapctl', 'get', 'config.cluster.password') | |||
class APIException(Exception):
    """Base class for errors reported to clients of the clustering API.

    Subclasses define ``status_code`` (used as the HTTP status of the error
    response) and ``message`` (returned to the client in the JSON body).

    :param message: optional per-instance replacement for the class-level
        message.
    """

    status_code = None
    message = ''

    def __init__(self, message=None):
        if message is not None:
            self.message = message
        # Pass the message to Exception so that str() of the error and log
        # records that include it are not empty.
        super().__init__(self.message)

    def to_dict(self):
        """Return the JSON-serializable body of the error response."""
        return {'message': self.message}


class APIVersionMissing(APIException):
    """The request carries no API version header."""

    status_code = 400
    message = 'An API version was not specified in the request.'


class APIVersionInvalid(APIException):
    """The API version in the request could not be parsed."""

    status_code = 400
    message = 'Invalid API version was specified in the request.'


class APIVersionDropped(APIException):
    """The requested API version is older than the one that is served."""

    status_code = 410
    message = 'The requested join API version is no longer supported.'


class APIVersionNotImplemented(APIException):
    """The requested API version is newer than the one that is served."""

    status_code = 501
    message = 'The requested join API version is not yet implemented.'


class InvalidJSONInRequest(APIException):
    """The request body is not valid JSON."""

    status_code = 400
    message = 'The request includes invalid JSON.'


class IncorrectContentType(APIException):
    """The request does not declare the application/json content type."""

    status_code = 400
    message = ('The request does not have a Content-Type header set to '
               'application/json.')


class MissingAuthDataInRequest(APIException):
    """The credential id or the credential secret is absent or empty."""

    status_code = 400
    message = 'The request does not have the required authentication data.'


class InvalidAuthDataFormatInRequest(APIException):
    """The authentication data is present but has an unexpected format."""

    status_code = 400
    message = 'The authentication data in the request has invalid format.'


class InvalidAuthDataInRequest(APIException):
    """An auth object could not be built from the authentication data."""

    status_code = 400
    message = 'The authentication data in the request is invalid.'


class AuthorizationFailed(APIException):
    """Keystone rejected the application credential from the request."""

    status_code = 401
    message = ('Failed to pass authorization using the data provided in the'
               ' request')


class UnexpectedError(APIException):
    """A server-side failure that is not caused by the client."""

    status_code = 500
    message = ('The clustering server has encountered an unexpected'
               ' error while handling the request.')
def _handle_api_version_exception(error):
    """Turn an API exception into a JSON error response.

    :param error: an exception providing ``to_dict()`` and ``status_code``.
    :returns: the response object created by ``jsonify`` from
        ``error.to_dict()`` with its status code taken from the exception.
    """
    body = error.to_dict()
    api_response = jsonify(body)
    api_response.status_code = error.status_code
    return api_response
# Error handlers: each API exception type is converted into a JSON response
# by the shared helper. Every APIException subclass that a view may raise
# needs a handler here, otherwise raising it does not produce the status and
# JSON body defined on the exception class.
@app.errorhandler(APIVersionMissing)
def handle_api_version_missing(error):
    """Return the JSON error response for APIVersionMissing."""
    return _handle_api_version_exception(error)


@app.errorhandler(APIVersionInvalid)
def handle_api_version_invalid(error):
    """Return the JSON error response for APIVersionInvalid."""
    return _handle_api_version_exception(error)


@app.errorhandler(APIVersionDropped)
def handle_api_version_dropped(error):
    """Return the JSON error response for APIVersionDropped."""
    return _handle_api_version_exception(error)


@app.errorhandler(APIVersionNotImplemented)
def handle_api_version_not_implemented(error):
    """Return the JSON error response for APIVersionNotImplemented."""
    return _handle_api_version_exception(error)


@app.errorhandler(IncorrectContentType)
def handle_incorrect_content_type(error):
    """Return the JSON error response for IncorrectContentType."""
    return _handle_api_version_exception(error)


@app.errorhandler(InvalidJSONInRequest)
def handle_invalid_json_in_request(error):
    """Return the JSON error response for InvalidJSONInRequest."""
    return _handle_api_version_exception(error)


@app.errorhandler(MissingAuthDataInRequest)
def handle_missing_auth_data_in_request(error):
    """Return the JSON error response for MissingAuthDataInRequest."""
    return _handle_api_version_exception(error)


@app.errorhandler(InvalidAuthDataInRequest)
def handle_invalid_auth_data_in_request(error):
    """Return the JSON error response for InvalidAuthDataInRequest."""
    return _handle_api_version_exception(error)


@app.errorhandler(InvalidAuthDataFormatInRequest)
def handle_invalid_auth_data_format_in_request(error):
    """Return the JSON error response for InvalidAuthDataFormatInRequest."""
    return _handle_api_version_exception(error)


@app.errorhandler(AuthorizationFailed)
def handle_authorization_failed(error):
    """Return the JSON error response for AuthorizationFailed."""
    return _handle_api_version_exception(error)


@app.errorhandler(UnexpectedError)
def handle_unexpected_error(error):
    """Return the JSON error response for UnexpectedError."""
    return _handle_api_version_exception(error)
def join_info():
    """Build the configuration payload that is returned to a client.

    :returns: a dict with a single ``config`` key holding the parsed JSON
        output of ``snapctl get config``.
    """
    # TODO: be selective about what we return. For now, we just get everything.
    raw_config = check_output('snapctl', 'get', 'config')
    return {'config': json.loads(raw_config)}
@app.route('/join', methods=['POST'])
def join():
    """Authorize a client node and return relevant config.

    The request must carry an ``API-Version`` header whose major version
    matches ``API_VERSION`` and a JSON object with the ``credential-id`` and
    ``credential-secret`` of an application credential. The credential is
    verified by requesting the service catalog from Keystone with it.

    :returns: the JSON-serialized result of ``join_info()`` on success, or a
        400 JSON response when the authentication data is missing.
    :raises APIException: one of the subclasses defined in this module for
        any other failure (converted to a response by the error handlers).
    """
    # Retrieve an API version from the request - it is a mandatory
    # header for this API.
    request_version = request.headers.get('API-Version')
    if request_version is None:
        logger.debug('The client has not specified the API-version header.')
        raise APIVersionMissing()

    try:
        api_version = semantic_version.Version(request_version)
    except ValueError:
        logger.debug('The client has specified an invalid API version: %s',
                     request_version)
        raise APIVersionInvalid()

    # Compare the API version used by the clustering service with the
    # one specified in the request and return an appropriate response.
    if api_version.major > API_VERSION.major:
        logger.debug('The client requested a version that is not'
                     ' supported yet: %s.', api_version)
        raise APIVersionNotImplemented()
    if api_version.major < API_VERSION.major:
        logger.debug('The client request version is no longer supported'
                     ': %s.', api_version)
        raise APIVersionDropped()

    # Check the content type explicitly instead of relying on what
    # request.json does for a non-JSON content type.
    if not request.is_json:
        logger.debug('The client has not specified the application/json'
                     ' content type in the request.')
        raise IncorrectContentType()

    # Flask raises a BadRequest if the JSON content is invalid.
    try:
        req_json = request.json
    except BadRequest:
        logger.debug('The client has POSTed an invalid JSON'
                     ' in the request.')
        raise InvalidJSONInRequest()

    # Valid JSON is not necessarily an object (it may be a list, a number,
    # null, ...), in which case the credentials cannot be looked up.
    if not isinstance(req_json, dict):
        logger.debug('The client has POSTed JSON that is not an object.')
        raise InvalidAuthDataFormatInRequest()

    # So far we don't have any minor versions with backwards-compatible
    # changes so just assume that all data will be present or error out.
    credential_id = req_json.get('credential-id')
    credential_secret = req_json.get('credential-secret')
    if not credential_id or not credential_secret:
        logger.debug('The client has not specified the required'
                     ' authentication data in the request.')
        # Build the 400 response directly so that it does not depend on an
        # error handler being registered for this exception type. Returning
        # the exception instance itself is not a valid view result.
        return _handle_api_version_exception(MissingAuthDataInRequest())
    if not (isinstance(credential_id, str)
            and isinstance(credential_secret, str)):
        logger.debug('The authentication data in the request is not made'
                     ' of strings.')
        raise InvalidAuthDataFormatInRequest()

    # TODO: handle https here when TLS termination support is added.
    keystone_base_url = 'http://localhost:5000/v3'

    # In an unlikely event of failing to construct an auth object
    # treat it as if invalid data got passed in terms of responding
    # to the client.
    try:
        auth = v3.ApplicationCredential(
            auth_url=keystone_base_url,
            application_credential_id=credential_id,
            application_credential_secret=credential_secret
        )
    except Exception:
        logger.exception('An exception has occurred while trying to build'
                         ' an auth object for an application credential'
                         ' passed from the clustering client.')
        raise InvalidAuthDataInRequest()

    try:
        # Use the auth object with the app credential to create a session
        # which the Keystone client will use.
        sess = session.Session(auth=auth)
    except Exception:
        logger.exception('An exception has occurred while trying to build'
                         ' a Session object with auth data'
                         ' passed from the clustering client.')
        raise UnexpectedError()

    try:
        keystone_client = v3client.Client(session=sess)
    except Exception:
        logger.exception('An exception has occurred while trying to build'
                         ' a Keystone Client object with auth data'
                         ' passed from the clustering client.')
        raise UnexpectedError()

    try:
        # The add-compute command creates application credentials that
        # allow access to /v3/auth/catalog with an expiration time.
        # Authorization failures occur after an app credential expires
        # in which case an error is returned to the client.
        keystone_client.get(f'{keystone_base_url}/auth/catalog')
    except (kc_exceptions.AuthorizationFailure,
            kc_exceptions.Unauthorized):
        logger.exception('Failed to get a Keystone token'
                         ' with the application credentials'
                         ' passed from the clustering client.')
        raise AuthorizationFailed()
    except ValueError:
        logger.exception('Insufficient amount of parameters were'
                         ' used in the request to Keystone.')
        raise UnexpectedError()
    except kc_exceptions.ConnectionError:
        logger.exception('Failed to connect to Keystone')
        raise UnexpectedError()
    except kc_exceptions.SSLError:
        logger.exception('A TLS-related error has occurred while'
                         ' connecting to Keystone')
        raise UnexpectedError()

    # We were able to authenticate against Keystone using the
    # application credential and verify that it has not expired
    # so the information for a compute node to join the cluster can
    # now be returned.
    return json.dumps(join_info())
@app.route('/')
def home():
    """Report that the clustering daemon is running.

    :returns: a JSON string with the ``status`` and ``info`` fields.
    """
    return json.dumps(dict(
        status='running',
        info='MicroStack clustering daemon.',
    ))
@app.route('/join', methods=['POST']) | |||
def join(): | |||
req = request.json # TODO: better error messages on failed parse. | |||
password = req.get('password') | |||
ip_address = req.get('ip_address') | |||
if not password: | |||
return 'No password specified', 500 | |||
try: | |||
return json.dumps(join_info(password, ip_address)) | |||
except Unauthorized: | |||
return (json.dumps({'error': 'Incorrect password.'}), 500) |
@ -0,0 +1,75 @@ | |||
#!/usr/bin/env python3 | |||
from pathlib import Path | |||
from datetime import datetime | |||
from dateutil.relativedelta import relativedelta | |||
from cryptography.hazmat.primitives import hashes | |||
from cryptography.hazmat.backends import default_backend | |||
from cryptography.hazmat.primitives import serialization | |||
from cryptography.hazmat.primitives.asymmetric import rsa | |||
from cryptography import x509 | |||
from cryptography.x509.oid import NameOID | |||
from init import shell | |||
def generate_selfsigned():
    """Generate a self-signed certificate with an associated private key.

    The certificate will have a dummy CN and subjAltName since
    the expectation is that this certificate will only be used by
    clients that know its fingerprint and will not use a validation
    via a CA certificate and hostname. This approach is similar to
    Certificate Pinning, however, here a certificate is not embedded
    into the application but is generated on initialization at one
    node and its fingerprint is copied in a token to another node
    via a secure channel.
    https://owasp.org/www-community/controls/Certificate_and_Public_Key_Pinning

    The certificate and key locations are read from the
    ``config.cluster.tls-cert-path`` and ``config.cluster.tls-key-path``
    config keys; the SHA-256 fingerprint of the new certificate is stored
    under ``config.cluster.fingerprint``. Nothing is done if both files
    already exist.
    """
    cert_path, key_path = (
        Path(shell.config_get('config.cluster.tls-cert-path')),
        Path(shell.config_get('config.cluster.tls-key-path')),
    )
    # Do not generate a new certificate and key if there is already an existing
    # pair. TODO: improve this check and allow renewal.
    if cert_path.exists() and key_path.exists():
        return

    dummy_cn = 'microstack.run'
    key = rsa.generate_private_key(
        public_exponent=65537,
        key_size=2048,
        backend=default_backend(),
    )
    # The certificate is self-signed: the same name is used as the subject
    # and as the issuer.
    common_name = x509.Name([
        x509.NameAttribute(NameOID.COMMON_NAME, dummy_cn)
    ])
    san = x509.SubjectAlternativeName([x509.DNSName(dummy_cn)])
    basic_constraints = x509.BasicConstraints(ca=True, path_length=0)
    now = datetime.utcnow()
    cert = (
        x509.CertificateBuilder()
        .subject_name(common_name)
        .issuer_name(common_name)
        .public_key(key.public_key())
        .serial_number(x509.random_serial_number())
        .not_valid_before(now)
        .not_valid_after(now + relativedelta(years=10))
        .add_extension(basic_constraints, False)
        .add_extension(san, False)
        .sign(key, hashes.SHA256(), default_backend())
    )

    cert_fprint = cert.fingerprint(hashes.SHA256()).hex()
    shell.config_set(**{'config.cluster.fingerprint': cert_fprint})

    serialized_cert = cert.public_bytes(encoding=serialization.Encoding.PEM)
    serialized_key = key.private_bytes(
        encoding=serialization.Encoding.PEM,
        format=serialization.PrivateFormat.PKCS8,
        encryption_algorithm=serialization.NoEncryption(),
    )
    cert_path.write_bytes(serialized_cert)

    # The private key is stored unencrypted, so make sure that the file is
    # only accessible by its owner before any key material is written to it.
    # chmod covers a pre-existing file created with wider permissions.
    key_path.touch(mode=0o600)
    key_path.chmod(0o600)
    key_path.write_bytes(serialized_key)