Launcher: improve nodescan testing

This improves the nodescan testing by:

* Removing the fake handling from the production code
* Adds explicit host-key-checking: false settings to most nodepool
  test fixtures.
* Adding explicit success/failure tests for nodescan with the full
  node request lifecycle.
* Moves the test-only fakes into their own file so they can be shared
  with the new launcher nodescan tests as well as the dedicated
  nodescan unit tests.
* Allows for customizable timeouts in the requestNodes helper method
  since the failing test takes a while.

It also allows the boot-timeout to be correctly set in the section
or provider object in addition to the label, for convenience.

Change-Id: If6361cb437ad28472f154f3af9dde190d43c0d1e
This commit is contained in:
James E. Blair 2025-02-06 10:45:07 -08:00
parent 9114224348
commit cb68047920
18 changed files with 367 additions and 106 deletions

View File

@ -3877,7 +3877,8 @@ class ZuulTestCase(BaseTestCase):
path = os.path.join(self.test_root, "changes.data")
self.test_config.changes.load(path)
def requestNodes(self, labels, tenant="tenant-one", pipeline="check"):
def requestNodes(self, labels, tenant="tenant-one", pipeline="check",
timeout=10):
result_queue = PipelineResultEventQueue(
self.zk_client, tenant, pipeline)
@ -3899,7 +3900,7 @@ class ZuulTestCase(BaseTestCase):
span_info=None,
)
for _ in iterate_timeout(
10, "nodeset request to be fulfilled"):
timeout, "nodeset request to be fulfilled"):
result_events = list(result_queue)
if result_events:
for event in result_events:

87
tests/fake_nodescan.py Normal file
View File

@ -0,0 +1,87 @@
# Copyright (C) 2023 Acme Gating, LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
# implied.
# See the License for the specific language governing permissions and
# limitations under the License.
class FakeSocket:
def __init__(self):
self.blocking = True
self.fd = 1
def setblocking(self, b):
self.blocking = b
def getsockopt(self, level, optname):
return None
def connect(self, addr):
if not self.blocking:
raise BlockingIOError()
raise Exception("blocking connect attempted")
def fileno(self):
return self.fd
class FakePoll:
def __init__(self, _fail=False):
self.fds = []
self._fail = _fail
def register(self, fd, bitmap):
self.fds.append(fd)
def unregister(self, fd):
if fd in self.fds:
self.fds.remove(fd)
def poll(self, timeout=None):
if self._fail:
return []
fds = self.fds[:]
self.fds = [f for f in fds if not isinstance(f, FakeSocket)]
fds = [f.fileno() if hasattr(f, 'fileno') else f for f in fds]
return [(f, 0) for f in fds]
class Dummy:
pass
class FakeKey:
def get_name(self):
return 'fake key'
def get_base64(self):
return 'fake base64'
class FakeTransport:
def __init__(self, _fail=False, active=True):
self.active = active
self._fail = _fail
def start_client(self, event=None, timeout=None):
if not self._fail:
event.set()
def get_security_options(self):
ret = Dummy()
ret.key_types = ['rsa']
return ret
def get_remote_server_key(self):
return FakeKey()
def get_exception(self):
return Exception("Fake ssh error")

View File

@ -176,16 +176,16 @@ class FakeOpenstackConnection:
if self.cloud._fake_needs_floating_ip:
addresses = dict(
public=[],
private=[dict(version=4, addr='fake')]
private=[dict(version=4, addr='198.51.100.1')]
)
interface_ip = 'fake'
interface_ip = '198.51.100.1'
else:
addresses = dict(
public=[dict(version=4, addr='fake'),
dict(version=6, addr='fake_v6')],
private=[dict(version=4, addr='fake')]
public=[dict(version=4, addr='198.51.100.1'),
dict(version=6, addr='2001:db8::1')],
private=[dict(version=4, addr='198.51.100.1')]
)
interface_ip = 'fake'
interface_ip = '198.51.100.1'
args = dict(
id=uuid.uuid4().hex,

View File

@ -101,6 +101,7 @@
name: aws-base
abstract: true
connection: aws
host-key-checking: false
boot-timeout: 120
launch-timeout: 600
launch-attempts: 2

View File

@ -66,6 +66,7 @@
name: aws-base
abstract: true
connection: aws
host-key-checking: false
boot-timeout: 120
launch-timeout: 600
launch-attempts: 2

View File

@ -47,6 +47,7 @@
name: aws-base
abstract: true
connection: aws
host-key-checking: false
boot-timeout: 120
launch-timeout: 600
object-storage:

View File

@ -64,6 +64,7 @@
name: aws-base
abstract: true
connection: aws
host-key-checking: false
boot-timeout: 120
launch-timeout: 600
object-storage:

View File

@ -151,6 +151,7 @@
name: aws-base
abstract: true
connection: aws
host-key-checking: false
boot-timeout: 120
launch-timeout: 600
object-storage:

View File

@ -124,6 +124,7 @@
name: aws-base
abstract: true
connection: aws-missing
host-key-checking: false
boot-timeout: 120
launch-timeout: 600
launch-attempts: 2

View File

@ -0,0 +1,174 @@
- pipeline:
name: check
manager: independent
trigger:
gerrit:
- event: patchset-created
success:
gerrit:
Verified: 1
failure:
gerrit:
Verified: -1
- pipeline:
name: gate
manager: dependent
success-message: Build succeeded (gate).
trigger:
gerrit:
- event: comment-added
approval:
- Approved: 1
success:
gerrit:
Verified: 2
submit: true
failure:
gerrit:
Verified: -2
start:
gerrit:
Verified: 0
precedence: high
- pipeline:
name: post
manager: independent
trigger:
gerrit:
- event: ref-updated
ref: ^(?!refs/).*$
- pipeline:
name: tag
manager: independent
trigger:
gerrit:
- event: ref-updated
ref: ^refs/tags/.*$
- job:
name: base
parent: null
run: playbooks/base.yaml
nodeset:
nodes:
- label: debian-normal
name: controller
- job:
name: check-job
run: playbooks/check.yaml
- job:
name: post-job
run: playbooks/post.yaml
- job:
name: tag-job
run: playbooks/tag.yaml
- project:
name: org/project
check:
jobs:
- check-job
gate:
jobs:
- check-job
post:
jobs:
- post-job
tag:
jobs:
- tag-job
- image:
name: debian
type: cloud
- flavor:
name: normal
- flavor:
name: large
- flavor:
name: dedicated
- flavor:
name: invalid
- label:
name: debian-normal
image: debian
flavor: normal
- label:
name: debian-large
image: debian
flavor: large
- label:
name: debian-dedicated
image: debian
flavor: dedicated
- label:
name: debian-invalid
image: debian
flavor: invalid
- section:
name: aws-base
abstract: true
connection: aws
host-key-checking: true
boot-timeout: 1
launch-timeout: 600
launch-attempts: 2
object-storage:
bucket-name: zuul
# TODO
# key-name: zuul
flavors:
- name: normal
instance-type: t3.medium
volume-type: gp3
volume-size: 40
iops: 500
throughput: 200
- name: large
instance-type: t3.large
- name: dedicated
instance-type: t3.large
dedicated-host: True
- name: invalid
instance-type: invalid
images:
- name: debian
image-id: ami-1e749f67
- section:
name: aws-us-east-1
parent: aws-base
region: us-east-1
- provider:
name: aws-us-east-1-main
section: aws-us-east-1
labels:
- name: debian-normal
key-name: zuul
volume-type: gp3
volume-size: 40
# Change iops and omit throughput to test label overriding the
# flavor
iops: 1000
- name: debian-large
key-name: zuul
- name: debian-dedicated
key-name: zuul
- name: debian-invalid
key-name: zuul

View File

@ -124,7 +124,8 @@
name: aws-base
abstract: true
connection: aws
boot-timeout: 120
host-key-checking: false
boot-timeout: 300
launch-timeout: 600
launch-attempts: 2
object-storage:

View File

@ -57,6 +57,7 @@
name: openstack-base
abstract: true
connection: openstack
host-key-checking: false
boot-timeout: 120
launch-timeout: 600
launch-attempts: 2

View File

@ -100,6 +100,7 @@
name: openstack-base
abstract: true
connection: openstack
host-key-checking: false
boot-timeout: 120
launch-timeout: 600
launch-attempts: 2

View File

@ -21,7 +21,6 @@ from unittest import mock
from zuul import model
from zuul.launcher.client import LauncherClient
import fixtures
import responses
import testtools
from kazoo.exceptions import NoNodeError
@ -36,6 +35,11 @@ from tests.base import (
return_data,
ResponsesFixture,
)
from tests.fake_nodescan import (
FakeSocket,
FakePoll,
FakeTransport,
)
class ImageMocksFixture(ResponsesFixture):
@ -137,8 +141,6 @@ class LauncherBaseTestCase(ZuulTestCase):
self.mock_aws.start()
# Must start responses after mock_aws
self.useFixture(ImageMocksFixture())
self.useFixture(fixtures.MonkeyPatch(
'zuul.launcher.server.NodescanRequest.FAKE', True))
self.s3 = boto3.resource('s3', region_name='us-west-2')
self.s3.create_bucket(
Bucket='zuul',
@ -444,7 +446,7 @@ class TestLauncher(LauncherBaseTestCase):
self.waitUntilSettled()
nodes = self.launcher.api.nodes_cache.getItems()
self.assertNotEqual(nodes[0].host_keys, [])
self.assertEqual(nodes[0].host_keys, [])
self.executor_server.hold_jobs_in_build = False
self.executor_server.release()
@ -763,6 +765,64 @@ class TestLauncher(LauncherBaseTestCase):
except NoNodeError:
break
@simple_layout('layouts/nodepool-nodescan.yaml', enable_nodepool=True)
@okay_tracebacks('_checkNodescanRequest')
@mock.patch('paramiko.transport.Transport')
@mock.patch('socket.socket')
@mock.patch('select.epoll')
def test_nodescan_failure(self, mock_epoll, mock_socket, mock_transport):
# Test a nodescan failure
fake_socket = FakeSocket()
mock_socket.return_value = fake_socket
mock_epoll.return_value = FakePoll()
mock_transport.return_value = FakeTransport(_fail=True)
ctx = self.createZKContext(None)
request = self.requestNodes(["debian-normal"], timeout=30)
self.assertEqual(request.state, model.NodesetRequest.State.FAILED)
provider_nodes = request.provider_nodes[0]
self.assertEqual(len(provider_nodes), 2)
self.assertEqual(len(request.nodes), 1)
self.assertEqual(provider_nodes[-1], request.nodes[-1])
provider_nodes = []
for node_id in request.nodes:
provider_nodes.append(model.ProviderNode.fromZK(
ctx, path=model.ProviderNode._getPath(node_id)))
request.delete(ctx)
self.waitUntilSettled()
for pnode in provider_nodes:
for _ in iterate_timeout(60, "node to be deleted"):
try:
pnode.refresh(ctx)
except NoNodeError:
break
@simple_layout('layouts/nodepool-nodescan.yaml', enable_nodepool=True)
@okay_tracebacks('_checkNodescanRequest')
@mock.patch('paramiko.transport.Transport')
@mock.patch('socket.socket')
@mock.patch('select.epoll')
def test_nodescan_success(self, mock_epoll, mock_socket, mock_transport):
# Test a normal launch with a nodescan
fake_socket = FakeSocket()
mock_socket.return_value = fake_socket
mock_epoll.return_value = FakePoll()
mock_transport.return_value = FakeTransport()
ctx = self.createZKContext(None)
request = self.requestNodes(["debian-normal"])
self.assertEqual(request.state, model.NodesetRequest.State.FULFILLED)
provider_nodes = request.provider_nodes[0]
self.assertEqual(len(provider_nodes), 1)
self.assertEqual(len(request.nodes), 1)
node = model.ProviderNode.fromZK(
ctx, path=model.ProviderNode._getPath(request.nodes[0]))
self.assertEqual(['fake key fake base64'], node.host_keys)
@simple_layout('layouts/nodepool-image.yaml', enable_nodepool=True)
@return_data(
'build-debian-local-image',
@ -876,7 +936,7 @@ class TestMinReadyLauncher(LauncherBaseTestCase):
if len(in_use_nodes) == 2:
break
self.assertNotEqual(nodes[0].host_keys, [])
self.assertEqual(nodes[0].host_keys, [])
self.executor_server.hold_jobs_in_build = False
self.executor_server.release()

View File

@ -27,80 +27,11 @@ from tests.base import (
iterate_timeout,
okay_tracebacks,
)
class FakeSocket:
def __init__(self):
self.blocking = True
self.fd = 1
def setblocking(self, b):
self.blocking = b
def getsockopt(self, level, optname):
return None
def connect(self, addr):
if not self.blocking:
raise BlockingIOError()
raise Exception("blocking connect attempted")
def fileno(self):
return self.fd
class FakePoll:
def __init__(self, _fail=False):
self.fds = []
self._fail = _fail
def register(self, fd, bitmap):
self.fds.append(fd)
def unregister(self, fd):
if fd in self.fds:
self.fds.remove(fd)
def poll(self, timeout=None):
if self._fail:
return []
fds = self.fds[:]
self.fds = [f for f in fds if not isinstance(f, FakeSocket)]
fds = [f.fileno() if hasattr(f, 'fileno') else f for f in fds]
return [(f, 0) for f in fds]
class Dummy:
pass
class FakeKey:
def get_name(self):
return 'fake key'
def get_base64(self):
return 'fake base64'
class FakeTransport:
def __init__(self, _fail=False, active=True):
self.active = active
self._fail = _fail
def start_client(self, event=None, timeout=None):
if not self._fail:
event.set()
def get_security_options(self):
ret = Dummy()
ret.key_types = ['rsa']
return ret
def get_remote_server_key(self):
return FakeKey()
def get_exception(self):
return Exception("Fake ssh error")
from tests.fake_nodescan import (
FakeSocket,
FakePoll,
FakeTransport,
)
class DummyProviderNode(ProviderNode, subclass_id="dummy-nodescan"):

View File

@ -264,8 +264,6 @@ class NodescanRequest:
CONNECTING_KEY = 'connecting key'
NEGOTIATING_KEY = 'negotiating key'
COMPLETE = 'complete'
# For unit testing
FAKE = False
def __init__(self, node, log):
self.state = self.START
@ -282,10 +280,9 @@ class NodescanRequest:
self.gather_hostkeys = False
self.ip = node.interface_ip
self.port = node.connection_port
if 'fake' not in self.ip and not self.FAKE:
addrinfo = socket.getaddrinfo(self.ip, self.port)[0]
self.family = addrinfo[0]
self.sockaddr = addrinfo[4]
addrinfo = socket.getaddrinfo(self.ip, self.port)[0]
self.family = addrinfo[0]
self.sockaddr = addrinfo[4]
self.sock = None
self.transport = None
self.event = None
@ -412,14 +409,9 @@ class NodescanRequest:
if not self.host_key_checking:
self.state = self.COMPLETE
else:
if 'fake' in self.ip or self.FAKE:
if self.gather_hostkeys:
self.keys = ['ssh-rsa FAKEKEY']
self.state = self.COMPLETE
else:
self.init_connection_attempts += 1
self._connect()
self.state = self.CONNECTING_INIT
self.init_connection_attempts += 1
self._connect()
self.state = self.CONNECTING_INIT
if self.state == self.CONNECTING_INIT:
if not socket_ready:
@ -1024,7 +1016,6 @@ class Launcher:
with node.activeContext(ctx):
node.setState(state)
self.wake_event.set()
# Deallocate ready node w/o a request for re-use
if (node.request_id and not request
and node.state == node.State.READY):

View File

@ -74,8 +74,11 @@ class BaseProviderFlavor(CNameMixin, metaclass=abc.ABCMeta):
class BaseProviderLabel(CNameMixin, metaclass=abc.ABCMeta):
inheritable_schema = assemble()
inheritable_schema = assemble(
provider_schema.common_label,
)
schema = assemble(
provider_schema.common_label,
provider_schema.base_label,
)
image_flavor_inheritable_schema = assemble()
@ -185,7 +188,6 @@ class BaseProviderSchema(metaclass=abc.ABCMeta):
Optional('abstract', default=False): Nullable(bool),
Optional('parent'): Nullable(str),
Required('connection'): str,
Optional('boot-timeout'): Nullable(int),
Optional('launch-timeout'): Nullable(int),
Optional('launch-attempts', default=3): int,
})

View File

@ -22,6 +22,13 @@ from zuul.lib.voluputil import Required, Optional, Nullable
# Labels
# The label attributes which can appear either in the main body of the
# section stanza, or in a section/provider label, or in a standalone
# label.
common_label = vs.Schema({
Optional('boot-timeout', default=300): int,
})
# The label attributes that can appear in a section/provider label or
# a standalone label (but not in the section body).
base_label = vs.Schema({
@ -34,7 +41,6 @@ base_label = vs.Schema({
Optional('tags', default=dict): {str: str},
Optional('min-ready', default=0): int,
Optional('max-ready-age', default=0): int,
Optional('boot-timeout', default=300): int,
})
# Label attributes that are common to any kind of ssh-based driver.