system-config/launch/src/opendev_launch/launch_node.py

478 lines
18 KiB
Python
Executable File

#!/usr/bin/env python3
# Launch a new OpenStack project infrastructure node.
# Copyright (C) 2011-2012 OpenStack LLC.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
# implied.
#
# See the License for the specific language governing permissions and
# limitations under the License.
import argparse
import os
import shutil
import socket
import subprocess
import sys
import threading
import tempfile
import time
import traceback
from . import dns
from . import rax_rdns
from . import utils
from .ssh_knownhosts import generate_known_hosts
import openstack
import paramiko
from .sshclient import SSHException
SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__))
try:
    # This unactionable warning does not need to be printed over and over.
    import requests.packages.urllib3
    requests.packages.urllib3.disable_warnings()
except Exception:
    # Best effort only: silencing the warning is cosmetic, so any failure
    # to import or call into requests is ignored. Unlike a bare
    # ``except:``, this still lets KeyboardInterrupt and SystemExit
    # propagate.
    pass
class JobDir:
    """Temporary working directory for one ansible run.

    Creating an instance makes a fresh temporary directory containing an
    ``inventory`` sub-directory and records the paths of the files that
    live in it. Used as a context manager, the whole tree is removed on
    exit unless ``keep`` is true.
    """

    def __init__(self, keep=False):
        root = tempfile.mkdtemp()
        inventory_root = os.path.join(root, 'inventory')
        os.makedirs(inventory_root)

        self.keep = keep
        self.root = root
        self.inventory_root = inventory_root
        self.hosts = os.path.join(inventory_root, 'hosts')
        self.groups = os.path.join(inventory_root, 'groups')
        self.key = os.path.join(root, 'id_rsa')
        self.ansible_log = os.path.join(root, 'ansible_log.txt')
        # XXX if we need more, we might like to setup an ansible.cfg
        # file and use that rather than env vars.  See
        # zuul/launcher/ansiblelaunchserver.py as an example
        # A private copy of the process environment with the ansible log
        # redirected into this job directory.
        self.env = dict(os.environ, ANSIBLE_LOG_PATH=self.ansible_log)

    def __enter__(self):
        return self

    def __exit__(self, etype, value, tb):
        if self.keep:
            return
        shutil.rmtree(self.root)
def run(cmd, **args):
    """Run *cmd*, echoing its output live, and fail loudly on error.

    stdout and stderr of the child are merged into a single pipe, copied
    line by line to this process's stdout as they arrive, and collected so
    they can be attached to the exception on failure.

    Args:
        cmd: command passed straight to ``subprocess.Popen``.
        **args: extra ``Popen`` keyword arguments; ``stdout`` and
            ``stderr`` are always overridden.

    Returns:
        The exit status, which is always 0 when this function returns.

    Raises:
        subprocess.CalledProcessError: on a non-zero exit status; the
            collected output is attached as ``output``.
    """
    args['stdout'] = subprocess.PIPE
    args['stderr'] = subprocess.STDOUT
    print("Running: %s" % (cmd,))
    collected = []
    # The context manager closes the pipe and reaps the child even if
    # echoing the output raises.
    with subprocess.Popen(cmd, **args) as proc:
        for raw in iter(proc.stdout.readline, b''):
            # Child output is not guaranteed to be valid UTF-8; substitute
            # undecodable bytes rather than aborting the whole run.
            line = raw.decode('utf-8', errors='replace')
            sys.stdout.write(line)
            sys.stdout.flush()
            collected.append(line)
        ret = proc.wait()
    print("Return code: %s" % (ret,))
    if ret != 0:
        raise subprocess.CalledProcessError(ret, cmd, ''.join(collected))
    return ret
def stream_syslog(ssh_client):
    """Follow the remote syslog over *ssh_client* until the command ends.

    Any exception from the SSH call is swallowed and reported with a
    one-line notice, so a dropped connection never propagates to the
    caller.
    """
    tail_command = 'tail -f /var/log/syslog'
    try:
        ssh_client.ssh(tail_command)
    except Exception:
        print("Syslog stream terminated")
def bootstrap_server(server, key, name, volume_device, keep,
                     mount_path, fs_label, environment, timeout, ignore_ipv6,
                     playbooks):
    """Log in to a freshly created server and run the base playbooks on it.

    Steps, in order: find a login user that accepts ``key``, restore direct
    root login, check that the CPU advertises sse4_2, optionally wait for
    and test IPv6, set up swap and (optionally) mount a volume, clear the
    local ansible inventory cache, run the base playbooks limited to this
    host, then reboot and confirm that SSH comes back.

    Args:
        server: server record; ``public_v4``, ``public_v6`` and
            ``interface_ip`` are read from it.
        key: key object used for SSH login (passed as ``pkey``) and written
            into the job directory via ``write_private_key``.
        name: inventory host name; also the ansible ``-l`` limit and the
            ``target`` extra var.
        volume_device: device handed to ``mount_volume.sh``; a falsy value
            skips the volume step.
        keep: forwarded to ``JobDir``; true keeps the job directory.
        mount_path: mount point handed to ``mount_volume.sh``.
        fs_label: filesystem label handed to ``mount_volume.sh``.
        environment: accepted for interface compatibility; not used here.
        timeout: forwarded to ``utils.ssh_connect`` for the initial logins.
        ignore_ipv6: when true, skip the IPv6 setup and checks and put the
            IPv4 address into the generated inventory.
        playbooks: directory containing the playbooks to run.

    Raises:
        Exception: if no SSH login succeeds, the CPU lacks sse4_2, or the
            host cannot be reached after the reboot.
        subprocess.CalledProcessError: propagated from ``run`` when a
            playbook exits non-zero.
    """
    ip = server.public_v4
    ssh_kwargs = dict(pkey=key)

    print("--- Running initial configuration on host %s ---" % ip)
    ssh_client = None
    for username in ['root', 'ubuntu', 'centos', 'admin']:
        try:
            ssh_client = utils.ssh_connect(ip, username, ssh_kwargs,
                                           timeout=timeout)
        except SSHException:
            print("Username: " + username + " failed to ssh. "
                  "Trying next option.")
        if ssh_client:
            break

    if not ssh_client:
        raise Exception("Unable to log in via SSH")

    # cloud-init puts the "please log in as user foo" message and
    # subsequent exit() in root's authorized_keys -- overwrite it with
    # a normal version to get root login working again.
    if username != 'root':
        ssh_client.ssh("sudo cp ~/.ssh/authorized_keys"
                       " ~root/.ssh/authorized_keys")
        ssh_client.ssh("sudo chmod 644 ~root/.ssh/authorized_keys")
        ssh_client.ssh("sudo chown root.root ~root/.ssh/authorized_keys")

    # The probing connection has served its purpose; release it instead of
    # leaking it when the name is rebound to the root session below.
    ssh_client.close()
    ssh_client = utils.ssh_connect(ip, 'root', ssh_kwargs, timeout=timeout)
    if not ssh_client:
        # Same falsy-result check as the post-reboot reconnect below.
        raise Exception("Unable to log in via SSH as root")

    if "sse4_2" not in ssh_client.ssh('cat /proc/cpuinfo', quiet=True)[1]:
        raise Exception("CPU does not support x86-64-v2 (sse4_2)")

    if not ignore_ipv6:
        # Something up with RAX images that they have the ipv6 interface in
        # /etc/network/interfaces but eth0 hasn't noticed yet; reload it
        ssh_client.ssh('(ifdown eth0 && ifup eth0) || true')

        if server.public_v6:
            # The server may be waiting on Router Advertisements to configure
            # this address. Wait for that to complete before pinging.
            # Note the space after "[": without it the shell looks for a
            # command named "[0" and the loop ends without ever waiting.
            ssh_client.ssh("bash -c 'count=0 ; "
                           "while ! ip addr | grep -q %s && "
                           "[ \"$count\" -le 60 ] ; do "
                           " count=$((count + 1)) ; "
                           " echo \"Waiting for IPv6 address to configure\" ; "
                           " sleep 1 ; "
                           "done'" % server.public_v6)
            ssh_client.ssh('ping6 -c5 -Q 0x10 review.openstack.org '
                           '|| ping6 -c5 -Q 0x10 wiki.openstack.org')

    ssh_client.scp(os.path.join(SCRIPT_DIR, 'make_swap.sh'),
                   'make_swap.sh')
    ssh_client.ssh('bash -x make_swap.sh')

    if volume_device:
        ssh_client.scp(os.path.join(SCRIPT_DIR, 'mount_volume.sh'),
                       'mount_volume.sh')
        ssh_client.ssh('bash -x mount_volume.sh %s %s %s' %
                       (volume_device, mount_path, fs_label))

    # Zero the ansible inventory cache so that next run finds the new server
    inventory_cache_dir = '/var/cache/ansible/inventory'
    try:
        for inventory_cache in os.listdir(inventory_cache_dir):
            os.unlink(os.path.join(inventory_cache_dir, inventory_cache))
    except FileNotFoundError:
        pass

    with JobDir(keep) as jobdir:
        # NOTE(review): ansible runs with jobdir.env, i.e. the full process
        # environment including any OS_* variables.

        # Write out the private SSH key we generated
        with open(jobdir.key, 'w') as key_file:
            key.write_private_key(key_file)
        os.chmod(jobdir.key, 0o600)

        host_ip = server.public_v4 if ignore_ipv6 else server.interface_ip

        # Write out inventory
        with open(jobdir.hosts, 'w') as inventory_file:
            inventory_file.write(
                "{host} ansible_host={ip} ansible_user=root {python}".format(
                    host=name, ip=host_ip,
                    python='ansible_python_interpreter=/usr/bin/python3'))

        # Echo the remote syslog while the playbooks run; daemon thread so
        # it never keeps the process alive.
        t = threading.Thread(target=stream_syslog, args=(ssh_client,))
        t.daemon = True
        t.start()

        inventory_list = (
            '/home/zuul/src/opendev.org/opendev/system-config/inventory/base/hosts.yaml',
            '/home/zuul/src/opendev.org/opendev/system-config/inventory/service/groups.yaml',
            '/etc/ansible/hosts/emergency.yaml',
            jobdir.inventory_root,
        )

        # Expand every inventory source into its own "-i <path>" pair.
        inventory_cmds = [v for e in inventory_list for v in ('-i', e)]

        ansible_cmd = ['ansible-playbook', '--flush-cache'] + \
            inventory_cmds + \
            ['-l', name,
             '--private-key={key}'.format(key=jobdir.key),
             "--ssh-common-args='-o StrictHostKeyChecking=no'",
             '-e', 'target={name}'.format(name=name)]

        # Run the base playbook limited to just this server we just created
        for playbook in [
                'set-hostnames.yaml',
                'base.yaml',
                'apply-package-updates.yaml',
        ]:
            run(ansible_cmd + [os.path.join(playbooks, playbook)],
                env=jobdir.env)

    try:
        ssh_client.ssh("reboot")
        ssh_client.close()
    except Exception as e:
        # Some init system kill the connection too fast after reboot.
        # Deal with it by ignoring ssh errors when rebooting. Exceptions
        # that carry no ``rc`` attribute at all are re-raised as
        # themselves rather than being masked by an AttributeError.
        if getattr(e, 'rc', None) != -1:
            raise

    # Wait a bit and make sure we can ssh back in
    print("Waiting 30 seconds for reboot")
    time.sleep(30)
    ssh_client = utils.ssh_connect(ip, 'root', ssh_kwargs, timeout=90)
    if not ssh_client:
        raise Exception("Failed to log into host")
    ssh_client.close()
    print("Host alive")
def _save_private_key(key, path):
    """Write *key* to *path* so it is never readable by other users."""
    # Create the file with mode 0600 up front; opening with the default
    # umask and chmod'ing afterwards leaves a window in which the private
    # key can be world-readable.
    fd = os.open(path, os.O_WRONLY | os.O_CREAT | os.O_TRUNC, 0o600)
    with os.fdopen(fd, 'w') as key_file:
        key.write_private_key(key_file)
    # The mode given to os.open only applies when the file is created;
    # enforce it for a pre-existing file too.
    os.chmod(path, 0o600)


def build_server(cloud, name, image, flavor,
                 volume, keep, network, boot_from_volume, config_drive,
                 mount_path, fs_label, availability_zone, environment,
                 volume_size, timeout, ignore_ipv6, playbooks):
    """Create a server with a throw-away keypair and bootstrap it.

    A fresh RSA key is generated and registered as a temporary keypair,
    the server is created with it, the keypair is deleted again, and
    ``bootstrap_server`` is run against the new server. If anything fails
    after the server exists, the server (and its boot volumes, when
    ``boot_from_volume`` is set) is deleted, unless ``keep`` is true, in
    which case the private key is saved under ``/tmp`` and the manual
    delete commands are printed.

    Args:
        cloud: cloud connection used for all keypair, server and volume
            calls.
        name: server name.
        image, flavor, network, boot_from_volume, config_drive,
            volume_size, timeout: forwarded to ``cloud.create_server``.
        volume: volume to attach at creation time; falsy for none.
        keep: keep the server and key around on failure instead of
            cleaning up.
        mount_path, fs_label, environment, ignore_ipv6, playbooks:
            forwarded to ``bootstrap_server`` (``timeout`` is forwarded
            there as well).
        availability_zone: forwarded to ``cloud.create_server`` when truthy.

    Returns:
        The server record as returned by ``cloud.get_openstack_vars``.

    Raises:
        Exception: whatever server creation or bootstrapping raised;
            failures during the cleanup itself are printed, not raised.
    """
    key = None
    server = None

    create_kwargs = dict(image=image, flavor=flavor, name=name,
                         reuse_ips=False, wait=True,
                         boot_from_volume=boot_from_volume,
                         volume_size=volume_size,
                         network=network,
                         config_drive=config_drive,
                         timeout=timeout)

    if availability_zone:
        create_kwargs['availability_zone'] = availability_zone

    if volume:
        create_kwargs['volumes'] = [volume]

    key_name = 'launch-%i' % (time.time())
    key = paramiko.RSAKey.generate(2048)
    public_key = key.get_name() + ' ' + key.get_base64()
    cloud.create_keypair(key_name, public_key)
    create_kwargs['key_name'] = key_name

    try:
        server = cloud.create_server(**create_kwargs)
    except Exception:
        # No server exists yet, so only the temporary keypair needs
        # cleaning up before the failure is reported.
        try:
            cloud.delete_keypair(key_name)
        except Exception:
            print("Exception encountered deleting keypair:")
            traceback.print_exc()
        raise

    try:
        cloud.delete_keypair(key_name)

        server = cloud.get_openstack_vars(server)
        if volume:
            volume = cloud.get_volume(volume)
            volume_device = cloud.get_volume_attach_device(volume,
                                                           server['id'])
        else:
            volume_device = None
        bootstrap_server(server, key, name, volume_device, keep,
                         mount_path, fs_label, environment, timeout,
                         ignore_ipv6, playbooks)
        print('UUID=%s\nIPV4=%s\nIPV6=%s\n' % (
            server.id, server.public_v4, server.public_v6))
    except Exception:
        print("****")
        print("Server %s failed to build!" % (server.id))
        try:
            volumes = []
            if boot_from_volume:
                volumes = cloud.get_volumes(server)
            if keep:
                print("Keeping as requested")
                # Write out the private SSH key we generated, as we
                # may not have got far enough for ansible to run
                key_path = '/tmp/%s.id_rsa' % server.id
                _save_private_key(key, key_path)
                print("Private key saved in %s" % key_path)
                print("Run to delete:")
                print("  openstack server delete %s" % (server.id))
                for attachment in volumes:
                    print("  openstack volume delete %s" % (attachment.id))
            else:
                cloud.delete_server(server.id, delete_ips=True)
                for attachment in volumes:
                    cloud.delete_volume(attachment.id)
        except Exception:
            print("Exception encountered deleting server:")
            traceback.print_exc()
        print("The original exception follows:")
        print("****")
        # Raise the important exception that started this
        raise

    return server
def print_inventory_yaml(server):
    """Print a ready-to-paste inventory entry for *server*.

    Reads ``name``, ``public_v4``, ``public_v6`` and ``location`` (keys
    ``cloud`` and ``region_name``) from *server*, and looks up the host
    keys with ``generate_known_hosts`` on the IPv4 address. The
    ``public_v6`` line is left out when the server has no IPv6 address.

    Each nesting level is indented two spaces further than its parent so
    the printed fragment is a valid YAML mapping; printing every line at
    the same depth would lose the structure.
    """
    ip4 = server.public_v4
    ip6 = server.public_v6
    cloud = server.location['cloud']
    region = server.location['region_name']
    known_hosts = generate_known_hosts(ip4)

    # NOTE(review): the four-space base offset assumes host entries sit
    # two levels deep in hosts.yaml -- confirm against that file.
    print("Put the following into system-config:inventory/base/hosts.yaml")
    print()
    print(f"    {server.name}:")
    print(f"      ansible_host: {ip4}")
    print("      location:")
    print(f"        cloud: {cloud}")
    print(f"        region_name: {region}")
    print(f"      public_v4: {ip4}")
    if ip6:
        print(f"      public_v6: {ip6}")
    print("      host_keys:")
    for (key, fingerprint) in known_hosts:
        print(f"        - {key} {fingerprint}")
def main():
    """Command-line entry point: launch and bootstrap one server.

    Parses the command line, connects to the requested cloud, resolves the
    flavor and image (listing the available ones and exiting with status 1
    when either lookup fails), builds the server, sets reverse DNS when
    the cloud config name contains ``rax``, and finally prints the DNS
    records, the inventory entry and the spam-list URLs the operator
    should follow up on.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("name", help="server name")
    parser.add_argument("--cloud", dest="cloud", required=True,
                        help="cloud name")
    parser.add_argument("--region", dest="region",
                        help="cloud region")
    parser.add_argument("--flavor", dest="flavor", default='1GB',
                        help="name (or substring) of flavor")
    parser.add_argument("--image", dest="image",
                        default="Ubuntu 22.04 LTS (Jammy Jellyfish) (Cloud)",
                        help="image name")
    parser.add_argument("--environment", dest="environment",
                        help="Puppet environment to use",
                        default=None)
    parser.add_argument("--volume", dest="volume",
                        help="UUID of volume to attach to the new server.",
                        default=None)
    parser.add_argument("--mount-path", dest="mount_path",
                        help="Path to mount cinder volume at.",
                        default=None)
    parser.add_argument("--fs-label", dest="fs_label",
                        help="FS label to use when mounting cinder volume.",
                        default=None)
    parser.add_argument("--boot-from-volume", dest="boot_from_volume",
                        help="Create a boot volume for the server and use it.",
                        action='store_true',
                        default=False)
    parser.add_argument("--volume-size", dest="volume_size",
                        help="Size of volume (GB) for --boot-from-volume",
                        default="50")
    parser.add_argument("--keep", dest="keep",
                        help="Don't clean up or delete the server on error.",
                        action='store_true',
                        default=False)
    parser.add_argument("--verbose", dest="verbose", default=False,
                        action='store_true',
                        help="Be verbose about logging cloud actions")
    parser.add_argument("--network", dest="network", default=None,
                        help="network label to attach instance to")
    parser.add_argument("--config-drive", dest="config_drive",
                        help="Boot with config_drive attached.",
                        action='store_true',
                        default=False)
    parser.add_argument("--timeout", dest="timeout",
                        help="Increase timeouts (default 600s)",
                        type=int, default=600)
    parser.add_argument("--ignore_ipv6", dest="ignore_ipv6",
                        help="Ignore IPv6 addresses from API",
                        action='store_true', default=False)
    parser.add_argument("--az", dest="availability_zone", default=None,
                        help="AZ to boot in.")
    parser.add_argument("--playbooks", dest="playbooks",
                        default="/home/zuul/src/opendev.org/opendev/"
                                "system-config/playbooks",
                        help="alternative playbook directory")
    options = parser.parse_args()

    openstack.enable_logging(debug=options.verbose)

    cloud_kwargs = {}
    if options.region:
        cloud_kwargs['region_name'] = options.region
    cloud = openstack.connect(cloud=options.cloud, **cloud_kwargs)

    flavor = cloud.get_flavor(options.flavor)
    if flavor:
        print("Found flavor", flavor.name)
    else:
        print("Unable to find matching flavor; flavor list:")
        for i in cloud.list_flavors():
            print(i.name)
        sys.exit(1)

    image = cloud.get_image_exclude(options.image, 'deprecated')
    if image:
        print("Found image", image.name)
    else:
        print("Unable to find matching image; image list:")
        for i in cloud.list_images():
            print(i.name)
        sys.exit(1)

    # NOTE(ianw): 2019-06-26 for whatever reason OVH assigns an IPv6
    # address that is seen in API queries, but it is not in the host
    # metadata so the interface isn't autoconfigured.  Auto skip it to
    # avoid this bombing out.
    if "-ovh" in options.cloud:
        print("Ignoring IPv6 for OVH cloud instances")
        options.ignore_ipv6 = True

    server = build_server(cloud, options.name, image, flavor,
                          options.volume, options.keep,
                          options.network, options.boot_from_volume,
                          options.config_drive,
                          options.mount_path, options.fs_label,
                          options.availability_zone,
                          options.environment, options.volume_size,
                          options.timeout, options.ignore_ipv6,
                          options.playbooks)

    if 'rax' in cloud.config.name:
        print("Setting reverse DNS for RAX")
        rax_rdns.set_rax_reverse_dns(cloud, server,
                                     server.public_v4, server.public_v6)

    print()
    print("-------- CONFIGURATION --------\n")
    dns.print_dns(server)
    print()
    print_inventory_yaml(server)
    print()
    print("-------------------------------")
    print()
    print("If this is a server that is expected to send email (ask, review,")
    print("lists, etc) double check that the server's IPs are not listed on")
    print("the spamhaus pbl.\n")
    print("URLs to check:")
    print("https://www.spamhaus.org/query/ip/%s" % server.public_v4)
    if server.public_v6:
        # Without an IPv6 address this would print a bogus ".../ip/None"
        # URL for the operator to check.
        print("https://www.spamhaus.org/query/ip/%s" % server.public_v6)
    print()
    print("When requesting an exception you can use the")
    print("infra-root@openstack.org email address to verify the responsible")
    print("party.")