from __future__ import annotations import asyncio import copy import functools import gc import inspect import io import logging import logging.config import multiprocessing import os import re import shutil import signal import socket import subprocess import sys import tempfile import threading import uuid import weakref from collections import defaultdict from collections.abc import Callable from contextlib import contextmanager, nullcontext, suppress from glob import glob from itertools import count from time import sleep from typing import Any, Literal from distributed.compatibility import MACOS from distributed.scheduler import Scheduler try: import ssl except ImportError: ssl = None # type: ignore import pytest import yaml from tlz import assoc, memoize, merge from tornado import gen from tornado.ioloop import IOLoop import dask from distributed.comm.tcp import TCP from . import system from . import versions as version_module from .client import Client, _global_clients, default_client from .comm import Comm from .comm.tcp import BaseTCPConnector from .compatibility import WINDOWS from .config import initialize_logging from .core import CommClosedError, ConnectionPool, Status, connect, rpc from .deploy import SpecCluster from .diagnostics.plugin import WorkerPlugin from .metrics import time from .nanny import Nanny from .node import ServerNode from .proctitle import enable_proctitle_on_children from .security import Security from .utils import ( DequeHandler, TimeoutError, _offload_executor, get_ip, get_ipv6, iscoroutinefunction, log_errors, mp_context, reset_logger_locks, sync, ) from .worker import Worker try: import dask.array # register config except ImportError: pass logger = logging.getLogger(__name__) logging_levels = { name: logger.level for name, logger in logging.root.manager.loggerDict.items() if isinstance(logger, logging.Logger) } _TEST_TIMEOUT = 30 _offload_executor.submit(lambda: None).result() # create thread during import @pytest.fixture(scope="session") def valid_python_script(tmpdir_factory): local_file = tmpdir_factory.mktemp("data").join("file.py") local_file.write("print('hello world!')") return local_file @pytest.fixture(scope="session") def client_contract_script(tmpdir_factory): local_file = tmpdir_factory.mktemp("data").join("distributed_script.py") lines = ( "from distributed import Client", "e = Client('127.0.0.1:8989')", "print(e)", ) local_file.write("\n".join(lines)) return local_file @pytest.fixture(scope="session") def invalid_python_script(tmpdir_factory): local_file = tmpdir_factory.mktemp("data").join("file.py") local_file.write("a+1") return local_file async def cleanup_global_workers(): for worker in Worker._instances: await worker.close(report=False, executor_wait=False) @pytest.fixture def loop(): with check_instances(): with pristine_loop() as loop: # Monkey-patch IOLoop.start to wait for loop stop orig_start = loop.start is_stopped = threading.Event() is_stopped.set() def start(): is_stopped.clear() try: orig_start() finally: is_stopped.set() loop.start = start yield loop # Stop the loop in case it's still running try: sync(loop, cleanup_global_workers, callback_timeout=0.500) loop.add_callback(loop.stop) except RuntimeError as e: if not re.match("IOLoop is clos(ed|ing)", str(e)): raise except TimeoutError: pass else: is_stopped.wait() @pytest.fixture def loop_in_thread(): with pristine_loop() as loop: thread = threading.Thread(target=loop.start, name="test IOLoop") thread.daemon = True thread.start() loop_started = threading.Event() 
loop.add_callback(loop_started.set) loop_started.wait() yield loop loop.add_callback(loop.stop) thread.join(timeout=5) @pytest.fixture def zmq_ctx(): import zmq ctx = zmq.Context.instance() yield ctx ctx.destroy(linger=0) @contextmanager def pristine_loop(): IOLoop.clear_instance() IOLoop.clear_current() loop = IOLoop() loop.make_current() assert IOLoop.current() is loop try: yield loop finally: try: loop.close(all_fds=True) except (KeyError, ValueError): pass IOLoop.clear_instance() IOLoop.clear_current() @contextmanager def mock_ipython(): from unittest import mock from distributed._ipython_utils import remote_magic ip = mock.Mock() ip.user_ns = {} ip.kernel = None def get_ip(): return ip with mock.patch("IPython.get_ipython", get_ip), mock.patch( "distributed._ipython_utils.get_ipython", get_ip ): yield ip # cleanup remote_magic client cache for kc in remote_magic._clients.values(): kc.stop_channels() remote_magic._clients.clear() original_config = copy.deepcopy(dask.config.config) def reset_config(): dask.config.config.clear() dask.config.config.update(copy.deepcopy(original_config)) def nodebug(func): """ A decorator to disable debug facilities during timing-sensitive tests. Warning: this doesn't affect already created IOLoops. """ @functools.wraps(func) def wrapped(*args, **kwargs): old_asyncio_debug = os.environ.get("PYTHONASYNCIODEBUG") if old_asyncio_debug is not None: del os.environ["PYTHONASYNCIODEBUG"] try: return func(*args, **kwargs) finally: if old_asyncio_debug is not None: os.environ["PYTHONASYNCIODEBUG"] = old_asyncio_debug return wrapped def nodebug_setup_module(module): """ A setup_module() that you can install in a test module to disable debug facilities. """ module._old_asyncio_debug = os.environ.get("PYTHONASYNCIODEBUG") if module._old_asyncio_debug is not None: del os.environ["PYTHONASYNCIODEBUG"] def nodebug_teardown_module(module): """ A teardown_module() that you can install in a test module to reenable debug facilities. """ if module._old_asyncio_debug is not None: os.environ["PYTHONASYNCIODEBUG"] = module._old_asyncio_debug def inc(x): return x + 1 def dec(x): return x - 1 def mul(x, y): return x * y def div(x, y): return x / y def deep(n): if n > 0: return deep(n - 1) else: return True def throws(x): raise RuntimeError("hello!") def double(x): return x * 2 def slowinc(x, delay=0.02): sleep(delay) return x + 1 def slowdec(x, delay=0.02): sleep(delay) return x - 1 def slowdouble(x, delay=0.02): sleep(delay) return 2 * x def randominc(x, scale=1): from random import random sleep(random() * scale) return x + 1 def slowadd(x, y, delay=0.02): sleep(delay) return x + y def slowsum(seq, delay=0.02): sleep(delay) return sum(seq) def slowidentity(*args, **kwargs): delay = kwargs.get("delay", 0.02) sleep(delay) if len(args) == 1: return args[0] else: return args class _UnhashableCallable: # FIXME https://github.com/python/mypy/issues/4266 __hash__ = None # type: ignore def __call__(self, x): return x + 1 def run_for(duration, timer=time): """ Burn CPU for *duration* seconds. """ deadline = timer() + duration while timer() <= deadline: pass # This dict grows at every varying() invocation _varying_dict: defaultdict[str, int] = defaultdict(int) _varying_key_gen = count() class _ModuleSlot: def __init__(self, modname, slotname): self.modname = modname self.slotname = slotname def get(self): return getattr(sys.modules[self.modname], self.slotname) def varying(items): """ Return a function that returns a result (or raises an exception) from *items* at each call. 
""" # cloudpickle would serialize the *values* of all globals # used by *func* below, so we can't use `global `. # Instead look up the module by name to get the original namespace # and not a copy. slot = _ModuleSlot(__name__, "_varying_dict") key = next(_varying_key_gen) def func(): dct = slot.get() i = dct[key] if i == len(items): raise IndexError else: x = items[i] dct[key] = i + 1 if isinstance(x, Exception): raise x else: return x return func def map_varying(itemslists): """ Like *varying*, but return the full specification for a map() call on multiple items lists. """ def apply(func, *args, **kwargs): return func(*args, **kwargs) return apply, list(map(varying, itemslists)) async def geninc(x, delay=0.02): await asyncio.sleep(delay) return x + 1 async def asyncinc(x, delay=0.02): await asyncio.sleep(delay) return x + 1 _readone_queues: dict[Any, asyncio.Queue] = {} async def readone(comm): """ Read one message at a time from a comm that reads lists of messages. """ try: q = _readone_queues[comm] except KeyError: q = _readone_queues[comm] = asyncio.Queue() async def background_read(): while True: try: messages = await comm.read() except CommClosedError: break for msg in messages: q.put_nowait(msg) q.put_nowait(None) del _readone_queues[comm] background_read() msg = await q.get() if msg is None: raise CommClosedError else: return msg def run_scheduler(q, nputs, config, port=0, **kwargs): with dask.config.set(config): from distributed import Scheduler # On Python 2.7 and Unix, fork() is used to spawn child processes, # so avoid inheriting the parent's IO loop. with pristine_loop() as loop: async def _(): try: scheduler = await Scheduler( validate=True, host="127.0.0.1", port=port, **kwargs ) except Exception as exc: for i in range(nputs): q.put(exc) else: for i in range(nputs): q.put(scheduler.address) await scheduler.finished() try: loop.run_sync(_) finally: loop.close(all_fds=True) def run_worker(q, scheduler_q, config, **kwargs): with dask.config.set(config): from distributed import Worker reset_logger_locks() with log_errors(): with pristine_loop() as loop: scheduler_addr = scheduler_q.get() async def _(): try: worker = await Worker(scheduler_addr, validate=True, **kwargs) except Exception as exc: q.put(exc) else: q.put(worker.address) await worker.finished() # Scheduler might've failed if isinstance(scheduler_addr, str): try: loop.run_sync(_) finally: loop.close(all_fds=True) def run_nanny(q, scheduler_q, config, **kwargs): with dask.config.set(config): with log_errors(): with pristine_loop() as loop: scheduler_addr = scheduler_q.get() async def _(): try: worker = await Nanny(scheduler_addr, validate=True, **kwargs) except Exception as exc: q.put(exc) else: q.put(worker.address) await worker.finished() # Scheduler might've failed if isinstance(scheduler_addr, str): try: loop.run_sync(_) finally: loop.close(all_fds=True) @contextmanager def check_active_rpc(loop, active_rpc_timeout=1): active_before = set(rpc.active) yield # Some streams can take a bit of time to notice their peer # has closed, and keep a coroutine (*) waiting for a CommClosedError # before calling close_rpc() after a CommClosedError. # This would happen especially if a non-localhost address is used, # as Nanny does. 
# (*) (example: gather_from_workers()) def fail(): pytest.fail( "some RPCs left active by test: %s" % (set(rpc.active) - active_before) ) async def wait(): await async_wait_for( lambda: len(set(rpc.active) - active_before) == 0, timeout=active_rpc_timeout, fail_func=fail, ) loop.run_sync(wait) @pytest.fixture def cluster_fixture(loop): with cluster() as (scheduler, workers): yield (scheduler, workers) @pytest.fixture def s(cluster_fixture): scheduler, workers = cluster_fixture return scheduler @pytest.fixture def a(cluster_fixture): scheduler, workers = cluster_fixture return workers[0] @pytest.fixture def b(cluster_fixture): scheduler, workers = cluster_fixture return workers[1] @pytest.fixture def client(loop, cluster_fixture): scheduler, workers = cluster_fixture with Client(scheduler["address"], loop=loop) as client: yield client # Compatibility. A lot of tests simply use `c` as fixture name c = client @pytest.fixture def client_secondary(loop, cluster_fixture): scheduler, workers = cluster_fixture with Client(scheduler["address"], loop=loop) as client: yield client @contextmanager def tls_cluster_context( worker_kwargs=None, scheduler_kwargs=None, security=None, **kwargs ): security = security or tls_only_security() worker_kwargs = assoc(worker_kwargs or {}, "security", security) scheduler_kwargs = assoc(scheduler_kwargs or {}, "security", security) with cluster( worker_kwargs=worker_kwargs, scheduler_kwargs=scheduler_kwargs, **kwargs ) as (s, workers): yield s, workers @pytest.fixture def tls_cluster(loop, security): with tls_cluster_context(security=security) as (scheduler, workers): yield (scheduler, workers) @pytest.fixture def tls_client(tls_cluster, loop, security): s, workers = tls_cluster with Client(s["address"], security=security, loop=loop) as client: yield client @pytest.fixture def security(): return tls_only_security() @contextmanager def cluster( nworkers=2, nanny=False, worker_kwargs={}, active_rpc_timeout=10, disconnect_timeout=20, scheduler_kwargs={}, config={}, ): ws = weakref.WeakSet() enable_proctitle_on_children() with clean(timeout=active_rpc_timeout, threads=False) as loop: if nanny: _run_worker = run_nanny else: _run_worker = run_worker # The scheduler queue will receive the scheduler's address scheduler_q = mp_context.Queue() # Launch scheduler scheduler = mp_context.Process( name="Dask cluster test: Scheduler", target=run_scheduler, args=(scheduler_q, nworkers + 1, config), kwargs=scheduler_kwargs, ) ws.add(scheduler) scheduler.daemon = True scheduler.start() # Launch workers workers = [] for i in range(nworkers): q = mp_context.Queue() fn = "_test_worker-%s" % uuid.uuid4() kwargs = merge( { "nthreads": 1, "local_directory": fn, "memory_limit": system.MEMORY_LIMIT, }, worker_kwargs, ) proc = mp_context.Process( name="Dask cluster test: Worker", target=_run_worker, args=(q, scheduler_q, config), kwargs=kwargs, ) ws.add(proc) workers.append({"proc": proc, "queue": q, "dir": fn}) for worker in workers: worker["proc"].start() saddr_or_exception = scheduler_q.get() if isinstance(saddr_or_exception, Exception): raise saddr_or_exception saddr = saddr_or_exception for worker in workers: addr_or_exception = worker["queue"].get() if isinstance(addr_or_exception, Exception): raise addr_or_exception worker["address"] = addr_or_exception start = time() try: try: security = scheduler_kwargs["security"] rpc_kwargs = {"connection_args": security.get_connection_args("client")} except KeyError: rpc_kwargs = {} with rpc(saddr, **rpc_kwargs) as s: while True: nthreads = 
loop.run_sync(s.ncores) if len(nthreads) == nworkers: break if time() - start > 5: raise Exception("Timeout on cluster creation") # avoid sending processes down to function yield {"address": saddr}, [ {"address": w["address"], "proc": weakref.ref(w["proc"])} for w in workers ] finally: logger.debug("Closing out test cluster") loop.run_sync( lambda: disconnect_all( [w["address"] for w in workers], timeout=disconnect_timeout, rpc_kwargs=rpc_kwargs, ) ) loop.run_sync( lambda: disconnect( saddr, timeout=disconnect_timeout, rpc_kwargs=rpc_kwargs ) ) scheduler.terminate() scheduler_q.close() scheduler_q._reader.close() scheduler_q._writer.close() for w in workers: w["proc"].terminate() w["queue"].close() w["queue"]._reader.close() w["queue"]._writer.close() scheduler.join(2) del scheduler for proc in [w["proc"] for w in workers]: proc.join(timeout=30) with suppress(UnboundLocalError): del worker, w, proc del workers[:] for fn in glob("_test_worker-*"): with suppress(OSError): shutil.rmtree(fn) try: client = default_client() except ValueError: pass else: client.close() start = time() while any(proc.is_alive() for proc in ws): text = str(list(ws)) sleep(0.2) assert time() < start + 5, ("Workers still around after five seconds", text) async def disconnect(addr, timeout=3, rpc_kwargs=None): rpc_kwargs = rpc_kwargs or {} async def do_disconnect(): with rpc(addr, **rpc_kwargs) as w: # If the worker was killed hard (e.g. sigterm) during test runtime, # we do not know at this point and may not be able to connect with suppress(EnvironmentError, CommClosedError): # Do not request a reply since comms will be closed by the # worker before a reply can be made and we will always trigger # the timeout await w.terminate(reply=False) await asyncio.wait_for(do_disconnect(), timeout=timeout) async def disconnect_all(addresses, timeout=3, rpc_kwargs=None): await asyncio.gather(*(disconnect(addr, timeout, rpc_kwargs) for addr in addresses)) def gen_test(timeout: float = _TEST_TIMEOUT) -> Callable[[Callable], Callable]: """Coroutine test @pytest.mark.parametrize("param", [1, 2, 3]) @gen_test(timeout=5) async def test_foo(param) await ... # use tornado coroutines @gen_test(timeout=5) async def test_foo(): await ... 
# use tornado coroutines """ assert timeout, ( "timeout should always be set and it should be smaller than the global one from" "pytest-timeout" ) def _(func): def test_func(*args, **kwargs): with clean() as loop: injected_func = functools.partial(func, *args, **kwargs) if iscoroutinefunction(func): cor = injected_func else: cor = gen.coroutine(injected_func) loop.run_sync(cor, timeout=timeout) # Patch the signature so pytest can inject fixtures test_func.__signature__ = inspect.signature(func) return test_func return _ async def start_cluster( nthreads: list[tuple[str, int] | tuple[str, int, dict]], scheduler_addr: str, loop: IOLoop, security: Security | dict[str, Any] | None = None, Worker: type[ServerNode] = Worker, scheduler_kwargs: dict[str, Any] = {}, worker_kwargs: dict[str, Any] = {}, ) -> tuple[Scheduler, list[ServerNode]]: s = await Scheduler( loop=loop, validate=True, security=security, port=0, host=scheduler_addr, **scheduler_kwargs, ) workers = [ Worker( s.address, nthreads=ncore[1], name=i, security=security, loop=loop, validate=True, host=ncore[0], **( merge(worker_kwargs, ncore[2]) # type: ignore if len(ncore) > 2 else worker_kwargs ), ) for i, ncore in enumerate(nthreads) ] await asyncio.gather(*workers) start = time() while ( len(s.workers) < len(nthreads) or any(ws.status != Status.running for ws in s.workers.values()) or any(comm.comm is None for comm in s.stream_comms.values()) ): await asyncio.sleep(0.01) if time() > start + 30: await asyncio.gather(*(w.close(timeout=1) for w in workers)) await s.close(fast=True) raise TimeoutError("Cluster creation timeout") return s, workers async def end_cluster(s, workers): logger.debug("Closing out test cluster") async def end_worker(w): with suppress(TimeoutError, CommClosedError, EnvironmentError): await w.close(report=False) await asyncio.gather(*(end_worker(w) for w in workers)) await s.close() # wait until scheduler stops completely s.stop() def gen_cluster( nthreads: list[tuple[str, int] | tuple[str, int, dict]] = [ ("127.0.0.1", 1), ("127.0.0.1", 2), ], scheduler="127.0.0.1", timeout: float = _TEST_TIMEOUT, security: Security | dict[str, Any] | None = None, Worker: type[ServerNode] = Worker, client: bool = False, scheduler_kwargs: dict[str, Any] = {}, worker_kwargs: dict[str, Any] = {}, client_kwargs: dict[str, Any] = {}, active_rpc_timeout: float = 1, config: dict[str, Any] = {}, clean_kwargs: dict[str, Any] = {}, allow_unclosed: bool = False, cluster_dump_directory: str | Literal[False] = "test_cluster_dump", ) -> Callable[[Callable], Callable]: from distributed import Client """ Coroutine test with small cluster @gen_cluster() async def test_foo(scheduler, worker1, worker2): await ... # use tornado coroutines @pytest.mark.parametrize("param", [1, 2, 3]) @gen_cluster() async def test_foo(scheduler, worker1, worker2, param): await ... # use tornado coroutines @gen_cluster() async def test_foo(scheduler, worker1, worker2, pytest_fixture_a, pytest_fixture_b): await ... 
# use tornado coroutines See also: start end """ assert timeout, ( "timeout should always be set and it should be smaller than the global one from" "pytest-timeout" ) scheduler_kwargs = merge( {"dashboard": False, "dashboard_address": ":0"}, scheduler_kwargs ) worker_kwargs = merge( {"memory_limit": system.MEMORY_LIMIT, "death_timeout": 15}, worker_kwargs ) def _(func): if not iscoroutinefunction(func): raise RuntimeError("gen_cluster only works for coroutine functions.") @functools.wraps(func) def test_func(*outer_args, **kwargs): result = None workers = [] with clean(timeout=active_rpc_timeout, **clean_kwargs) as loop: async def coro(): with dask.config.set(config): s = False for _ in range(60): try: s, ws = await start_cluster( nthreads, scheduler, loop, security=security, Worker=Worker, scheduler_kwargs=scheduler_kwargs, worker_kwargs=worker_kwargs, ) except Exception as e: logger.error( "Failed to start gen_cluster: " f"{e.__class__.__name__}: {e}; retrying", exc_info=True, ) await asyncio.sleep(1) else: workers[:] = ws args = [s] + workers break if s is False: raise Exception("Could not start cluster") if client: c = await Client( s.address, loop=loop, security=security, asynchronous=True, **client_kwargs, ) args = [c] + args try: coro = func(*args, *outer_args, **kwargs) task = asyncio.create_task(coro) coro2 = asyncio.wait_for(asyncio.shield(task), timeout) result = await coro2 if s.validate: s.validate_state() except asyncio.TimeoutError: assert task buffer = io.StringIO() # This stack indicates where the coro/test is suspended task.print_stack(file=buffer) if cluster_dump_directory: await dump_cluster_state( s, ws, output_dir=cluster_dump_directory, func_name=func.__name__, ) task.cancel() while not task.cancelled(): await asyncio.sleep(0.01) # Remove as much of the traceback as possible; it's # uninteresting boilerplate from utils_test and asyncio and # not from the code being tested. 
raise TimeoutError( f"Test timeout after {timeout}s.\n" "========== Test stack trace starts here ==========\n" f"{buffer.getvalue()}" ) from None except pytest.xfail.Exception: raise except Exception: if cluster_dump_directory and not has_pytestmark( test_func, "xfail" ): await dump_cluster_state( s, ws, output_dir=cluster_dump_directory, func_name=func.__name__, ) raise finally: if client and c.status not in ("closing", "closed"): await c._close(fast=s.status == Status.closed) await end_cluster(s, workers) await asyncio.wait_for(cleanup_global_workers(), 1) try: c = await default_client() except ValueError: pass else: await c._close(fast=True) def get_unclosed(): return [c for c in Comm._instances if not c.closed()] + [ c for c in _global_clients.values() if c.status != "closed" ] try: start = time() while time() < start + 60: gc.collect() if not get_unclosed(): break await asyncio.sleep(0.05) else: if allow_unclosed: print(f"Unclosed Comms: {get_unclosed()}") else: raise RuntimeError("Unclosed Comms", get_unclosed()) finally: Comm._instances.clear() _global_clients.clear() return result result = loop.run_sync( coro, timeout=timeout * 2 if timeout else timeout ) for w in workers: if getattr(w, "data", None): try: w.data.clear() except OSError: # zict backends can fail if their storage directory # was already removed pass del w.data return result # Patch the signature so pytest can inject fixtures orig_sig = inspect.signature(func) args = [None] * (1 + len(nthreads)) # scheduler, *workers if client: args.insert(0, None) bound = orig_sig.bind_partial(*args) test_func.__signature__ = orig_sig.replace( parameters=[ p for name, p in orig_sig.parameters.items() if name not in bound.arguments ] ) return test_func return _ async def dump_cluster_state( s: Scheduler, ws: list[ServerNode], output_dir: str, func_name: str ) -> None: """A variant of Client.dump_cluster_state, which does not rely on any of the below to work: - Having a client at all - Client->Scheduler comms - Scheduler->Worker comms (unless using Nannies) """ scheduler_info = s._to_dict() workers_info: dict[str, Any] versions_info = version_module.get_versions() if not ws or isinstance(ws[0], Worker): workers_info = {w.address: w._to_dict() for w in ws} else: workers_info = await s.broadcast(msg={"op": "dump_state"}, on_error="return") workers_info = { k: repr(v) if isinstance(v, Exception) else v for k, v in workers_info.items() } state = { "scheduler": scheduler_info, "workers": workers_info, "versions": versions_info, } os.makedirs(output_dir, exist_ok=True) fname = os.path.join(output_dir, func_name) + ".yaml" with open(fname, "w") as fh: yaml.safe_dump(state, fh) # Automatically convert tuples to lists print(f"Dumped cluster state to {fname}") def raises(func, exc=Exception): try: func() return False except exc: return True def _terminate_process(proc): if proc.poll() is None: if sys.platform.startswith("win"): proc.send_signal(signal.CTRL_BREAK_EVENT) else: proc.send_signal(signal.SIGINT) try: proc.wait(30) finally: # Make sure we don't leave the process lingering around with suppress(OSError): proc.kill() @contextmanager def popen(args, **kwargs): kwargs["stdout"] = subprocess.PIPE kwargs["stderr"] = subprocess.PIPE if sys.platform.startswith("win"): # Allow using CTRL_C_EVENT / CTRL_BREAK_EVENT kwargs["creationflags"] = subprocess.CREATE_NEW_PROCESS_GROUP dump_stdout = False args = list(args) if sys.platform.startswith("win"): args[0] = os.path.join(sys.prefix, "Scripts", args[0]) else: args[0] = os.path.join( 
            os.environ.get("DESTDIR", "") + sys.prefix, "bin", args[0]
        )
    proc = subprocess.Popen(args, **kwargs)
    try:
        yield proc
    except Exception:
        dump_stdout = True
        raise
    finally:
        try:
            _terminate_process(proc)
        finally:
            # XXX Also dump stdout if return code != 0 ?
            out, err = proc.communicate()
            if dump_stdout:
                print("\n\nPrint from stderr\n %s\n=================\n" % args[0])
                print(err.decode())
                print("\n\nPrint from stdout\n=================\n")
                print(out.decode())


def wait_for(predicate, timeout, fail_func=None, period=0.001):
    deadline = time() + timeout
    while not predicate():
        sleep(period)
        if time() > deadline:
            if fail_func is not None:
                fail_func()
            pytest.fail(f"condition not reached within {timeout} seconds")


async def async_wait_for(predicate, timeout, fail_func=None, period=0.001):
    deadline = time() + timeout
    while not predicate():
        await asyncio.sleep(period)
        if time() > deadline:
            if fail_func is not None:
                fail_func()
            pytest.fail(f"condition not reached within {timeout} seconds")


@memoize
def has_ipv6():
    """
    Return whether IPv6 is locally functional.  This doesn't guarantee IPv6
    is properly configured outside of localhost.
    """
    if os.getenv("DISABLE_IPV6") == "1":
        return False

    serv = cli = None
    try:
        serv = socket.socket(socket.AF_INET6, socket.SOCK_STREAM)
        serv.bind(("::", 0))
        serv.listen(5)
        cli = socket.create_connection(serv.getsockname()[:2])
        return True
    except OSError:
        return False
    finally:
        if cli is not None:
            cli.close()
        if serv is not None:
            serv.close()


if has_ipv6():

    def requires_ipv6(test_func):
        return test_func

else:
    requires_ipv6 = pytest.mark.skip("ipv6 required")


async def assert_can_connect(addr, timeout=0.5, **kwargs):
    """
    Check that it is possible to connect to the distributed *addr*
    within the given *timeout*.
    """
    comm = await connect(addr, timeout=timeout, **kwargs)
    comm.abort()


async def assert_cannot_connect(
    addr, timeout=0.5, exception_class=EnvironmentError, **kwargs
):
    """
    Check that it is impossible to connect to the distributed *addr*
    within the given *timeout*.
    """
    with pytest.raises(exception_class):
        comm = await connect(addr, timeout=timeout, **kwargs)
        comm.abort()


async def assert_can_connect_from_everywhere_4_6(port, protocol="tcp", **kwargs):
    """
    Check that the local *port* is reachable from all IPv4 and IPv6 addresses.
    """
    futures = [
        assert_can_connect("%s://127.0.0.1:%d" % (protocol, port), **kwargs),
        assert_can_connect("%s://%s:%d" % (protocol, get_ip(), port), **kwargs),
    ]
    if has_ipv6():
        futures += [
            assert_can_connect("%s://[::1]:%d" % (protocol, port), **kwargs),
            assert_can_connect("%s://[%s]:%d" % (protocol, get_ipv6(), port), **kwargs),
        ]
    await asyncio.gather(*futures)


async def assert_can_connect_from_everywhere_4(port, protocol="tcp", **kwargs):
    """
    Check that the local *port* is reachable from all IPv4 addresses.
    """
    futures = [
        assert_can_connect("%s://127.0.0.1:%d" % (protocol, port), **kwargs),
        assert_can_connect("%s://%s:%d" % (protocol, get_ip(), port), **kwargs),
    ]
    if has_ipv6():
        futures += [
            assert_cannot_connect("%s://[::1]:%d" % (protocol, port), **kwargs),
            assert_cannot_connect(
                "%s://[%s]:%d" % (protocol, get_ipv6(), port), **kwargs
            ),
        ]
    await asyncio.gather(*futures)


async def assert_can_connect_locally_4(port, **kwargs):
    """
    Check that the local *port* is only reachable from local IPv4 addresses.
    """
    futures = [assert_can_connect("tcp://127.0.0.1:%d" % port, **kwargs)]
    if get_ip() != "127.0.0.1":  # No outside IPv4 connectivity?
futures += [assert_cannot_connect("tcp://%s:%d" % (get_ip(), port), **kwargs)] if has_ipv6(): futures += [ assert_cannot_connect("tcp://[::1]:%d" % port, **kwargs), assert_cannot_connect("tcp://[%s]:%d" % (get_ipv6(), port), **kwargs), ] await asyncio.gather(*futures) async def assert_can_connect_from_everywhere_6(port, **kwargs): """ Check that the local *port* is reachable from all IPv6 addresses. """ assert has_ipv6() futures = [ assert_cannot_connect("tcp://127.0.0.1:%d" % port, **kwargs), assert_cannot_connect("tcp://%s:%d" % (get_ip(), port), **kwargs), assert_can_connect("tcp://[::1]:%d" % port, **kwargs), assert_can_connect("tcp://[%s]:%d" % (get_ipv6(), port), **kwargs), ] await asyncio.gather(*futures) async def assert_can_connect_locally_6(port, **kwargs): """ Check that the local *port* is only reachable from local IPv6 addresses. """ assert has_ipv6() futures = [ assert_cannot_connect("tcp://127.0.0.1:%d" % port, **kwargs), assert_cannot_connect("tcp://%s:%d" % (get_ip(), port), **kwargs), assert_can_connect("tcp://[::1]:%d" % port, **kwargs), ] if get_ipv6() != "::1": # No outside IPv6 connectivity? futures += [ assert_cannot_connect("tcp://[%s]:%d" % (get_ipv6(), port), **kwargs) ] await asyncio.gather(*futures) @contextmanager def captured_logger(logger, level=logging.INFO, propagate=None): """Capture output from the given Logger.""" if isinstance(logger, str): logger = logging.getLogger(logger) orig_level = logger.level orig_handlers = logger.handlers[:] if propagate is not None: orig_propagate = logger.propagate logger.propagate = propagate sio = io.StringIO() logger.handlers[:] = [logging.StreamHandler(sio)] logger.setLevel(level) try: yield sio finally: logger.handlers[:] = orig_handlers logger.setLevel(orig_level) if propagate is not None: logger.propagate = orig_propagate @contextmanager def captured_handler(handler): """Capture output from the given logging.StreamHandler.""" assert isinstance(handler, logging.StreamHandler) orig_stream = handler.stream handler.stream = io.StringIO() try: yield handler.stream finally: handler.stream = orig_stream @contextmanager def new_config(new_config): """ Temporarily change configuration dictionary. """ from .config import defaults config = dask.config.config orig_config = copy.deepcopy(config) try: config.clear() config.update(copy.deepcopy(defaults)) dask.config.update(config, new_config) initialize_logging(config) yield finally: config.clear() config.update(orig_config) initialize_logging(config) @contextmanager def new_environment(changes): saved_environ = os.environ.copy() os.environ.update(changes) try: yield finally: os.environ.clear() os.environ.update(saved_environ) @contextmanager def new_config_file(c): """ Temporarily change configuration file to match dictionary *c*. """ import yaml old_file = os.environ.get("DASK_CONFIG") fd, path = tempfile.mkstemp(prefix="dask-config") try: with os.fdopen(fd, "w") as f: f.write(yaml.dump(c)) os.environ["DASK_CONFIG"] = path try: yield finally: if old_file: os.environ["DASK_CONFIG"] = old_file else: del os.environ["DASK_CONFIG"] finally: os.remove(path) certs_dir = os.path.abspath(os.path.join(os.path.dirname(__file__), "tests")) def get_cert(filename): """ Get the path to one of the test TLS certificates. """ path = os.path.join(certs_dir, filename) assert os.path.exists(path), path return path def tls_config(): """ A functional TLS configuration with our test certs. 
""" ca_file = get_cert("tls-ca-cert.pem") keycert = get_cert("tls-key-cert.pem") return { "distributed": { "comm": { "tls": { "ca-file": ca_file, "client": {"cert": keycert}, "scheduler": {"cert": keycert}, "worker": {"cert": keycert}, } } } } def tls_only_config(): """ A functional TLS configuration with our test certs, disallowing plain TCP communications. """ c = tls_config() c["distributed"]["comm"]["require-encryption"] = True return c def tls_security(): """ A Security object with proper TLS configuration. """ with new_config(tls_config()): sec = Security() return sec def tls_only_security(): """ A Security object with proper TLS configuration and disallowing plain TCP communications. """ with new_config(tls_only_config()): sec = Security() assert sec.require_encryption return sec def get_server_ssl_context( certfile="tls-cert.pem", keyfile="tls-key.pem", ca_file="tls-ca-cert.pem" ): ctx = ssl.create_default_context(ssl.Purpose.CLIENT_AUTH, cafile=get_cert(ca_file)) ctx.check_hostname = False ctx.verify_mode = ssl.CERT_REQUIRED ctx.load_cert_chain(get_cert(certfile), get_cert(keyfile)) return ctx def get_client_ssl_context( certfile="tls-cert.pem", keyfile="tls-key.pem", ca_file="tls-ca-cert.pem" ): ctx = ssl.create_default_context(ssl.Purpose.SERVER_AUTH, cafile=get_cert(ca_file)) ctx.check_hostname = False ctx.verify_mode = ssl.CERT_REQUIRED ctx.load_cert_chain(get_cert(certfile), get_cert(keyfile)) return ctx def bump_rlimit(limit, desired): resource = pytest.importorskip("resource") try: soft, hard = resource.getrlimit(limit) if soft < desired: resource.setrlimit(limit, (desired, max(hard, desired))) except Exception as e: pytest.skip(f"rlimit too low ({soft}) and can't be increased: {e}") def gen_tls_cluster(**kwargs): kwargs.setdefault("nthreads", [("tls://127.0.0.1", 1), ("tls://127.0.0.1", 2)]) return gen_cluster( scheduler="tls://127.0.0.1", security=tls_only_security(), **kwargs ) @contextmanager def save_sys_modules(): old_modules = sys.modules old_path = sys.path try: yield finally: for i, elem in enumerate(sys.path): if elem not in old_path: del sys.path[i] for elem in sys.modules.keys(): if elem not in old_modules: del sys.modules[elem] @contextmanager def check_thread_leak(): """Context manager to ensure we haven't leaked any threads""" # "TCP-Executor" threads are never stopped once they are started BaseTCPConnector.warmup() active_threads_start = threading.enumerate() yield start = time() while True: bad_threads = [ thread for thread in threading.enumerate() if thread not in active_threads_start # FIXME this looks like a genuine leak that needs fixing and "watch message queue" not in thread.name ] if not bad_threads: break else: sleep(0.01) if time() > start + 5: # Raise an error with information about leaked threads from distributed import profile bad_thread = bad_threads[0] call_stacks = profile.call_stack(sys._current_frames()[bad_thread.ident]) assert False, (bad_thread, call_stacks) def wait_active_children(timeout: float) -> list[multiprocessing.Process]: """Wait until timeout for mp_context.active_children() to terminate. Return list of active subprocesses after the timeout expired. """ t0 = time() while True: # Do not sample the subprocesses once at the beginning with # `for proc in mp_context.active_children: ...`, assume instead that new # children processes may be spawned before the timeout expires. 
children = mp_context.active_children() if not children: return [] join_timeout = timeout - time() + t0 if join_timeout <= 0: return children children[0].join(timeout=join_timeout) def term_or_kill_active_children(timeout: float) -> None: """Send SIGTERM to mp_context.active_children(), wait up to 3 seconds for processes to die, then send SIGKILL to the survivors """ children = mp_context.active_children() for proc in children: proc.terminate() children = wait_active_children(timeout=timeout) for proc in children: proc.kill() children = wait_active_children(timeout=30) if children: # pragma: nocover logger.warning("Leaked unkillable children processes: %s", children) # It should be impossible to ignore SIGKILL on Linux/MacOSX assert WINDOWS @contextmanager def check_process_leak( check: bool = True, check_timeout: float = 40, term_timeout: float = 3 ): """Terminate any currently-running subprocesses at both the beginning and end of this context Parameters ---------- check : bool, optional If True, raise AssertionError if any processes survive at the exit check_timeout: float, optional Wait up to these many seconds for subprocesses to terminate before failing term_timeout: float, optional After sending SIGTERM to a subprocess, wait up to these many seconds before sending SIGKILL """ term_or_kill_active_children(timeout=term_timeout) try: yield if check: children = wait_active_children(timeout=check_timeout) assert not children, f"Test leaked subprocesses: {children}" finally: term_or_kill_active_children(timeout=term_timeout) @contextmanager def check_instances(): Client._instances.clear() Worker._instances.clear() Scheduler._instances.clear() SpecCluster._instances.clear() Worker._initialized_clients.clear() # assert all(n.status == "closed" for n in Nanny._instances), { # n: n.status for n in Nanny._instances # } Nanny._instances.clear() _global_clients.clear() Comm._instances.clear() yield start = time() while set(_global_clients): sleep(0.1) assert time() < start + 10 _global_clients.clear() for w in Worker._instances: with suppress(RuntimeError): # closed IOLoop w.loop.add_callback(w.close, report=False, executor_wait=False) if w.status in Status.ANY_RUNNING: w.loop.add_callback(w.close) Worker._instances.clear() start = time() while any(c.status != "closed" for c in Worker._initialized_clients): sleep(0.1) assert time() < start + 10 Worker._initialized_clients.clear() for i in range(5): if all(c.closed() for c in Comm._instances): break else: sleep(0.1) else: L = [c for c in Comm._instances if not c.closed()] Comm._instances.clear() raise ValueError("Unclosed Comms", L) assert all( n.status == Status.closed or n.status == Status.init for n in Nanny._instances ), {n: n.status for n in Nanny._instances} # assert not list(SpecCluster._instances) # TODO assert all(c.status == Status.closed for c in SpecCluster._instances), list( SpecCluster._instances ) SpecCluster._instances.clear() Nanny._instances.clear() DequeHandler.clear_all_instances() @contextmanager def clean(threads=not WINDOWS, instances=True, timeout=1, processes=True): with check_thread_leak() if threads else nullcontext(): with pristine_loop() as loop: with check_process_leak(check=processes): with check_instances() if instances else nullcontext(): with check_active_rpc(loop, timeout): reset_config() dask.config.set({"distributed.comm.timeouts.connect": "5s"}) # Restore default logging levels # XXX use pytest hooks/fixtures instead? 
                        for name, level in logging_levels.items():
                            logging.getLogger(name).setLevel(level)

                        yield loop


@pytest.fixture
def cleanup():
    with clean():
        yield


class TaskStateMetadataPlugin(WorkerPlugin):
    """WorkerPlugin to populate TaskState.metadata"""

    def setup(self, worker):
        self.worker = worker

    def transition(self, key, start, finish, **kwargs):
        ts = self.worker.tasks[key]

        if start == "ready" and finish == "executing":
            ts.metadata["start_time"] = time()
        elif start == "executing" and finish == "memory":
            ts.metadata["stop_time"] = time()


class LockedComm(TCP):
    def __init__(self, comm, read_event, read_queue, write_event, write_queue):
        self.write_event = write_event
        self.write_queue = write_queue
        self.read_event = read_event
        self.read_queue = read_queue
        self.comm = comm
        assert isinstance(comm, TCP)

    def __getattr__(self, name):
        return getattr(self.comm, name)

    async def write(self, msg, serializers=None, on_error="message"):
        if self.write_queue:
            await self.write_queue.put((self.comm.peer_address, msg))
        if self.write_event:
            await self.write_event.wait()
        return await self.comm.write(msg, serializers=serializers, on_error=on_error)

    async def read(self, deserializers=None):
        msg = await self.comm.read(deserializers=deserializers)
        if self.read_queue:
            await self.read_queue.put((self.comm.peer_address, msg))
        if self.read_event:
            await self.read_event.wait()
        return msg

    async def close(self):
        await self.comm.close()


class _LockedCommPool(ConnectionPool):
    """A ConnectionPool wrapper to intercept network traffic between servers

    This wrapper can be attached to a running server to intercept outgoing read or
    write requests in test environments.

    Examples
    --------
    >>> w = await Worker(...)
    >>> read_event = asyncio.Event()
    >>> read_queue = asyncio.Queue()
    >>> w.rpc = _LockedCommPool(
    ...     w.rpc,
    ...     read_event=read_event,
    ...     read_queue=read_queue,
    ... )
    # It might be necessary to remove all existing comms
    # if the wrapped pool has been used before
    >>> w.remove(remote_address)

    >>> async def ping_pong():
    ...     return await w.rpc(remote_address).ping()
    >>> with pytest.raises(asyncio.TimeoutError):
    ...     await asyncio.wait_for(ping_pong(), 0.01)
    >>> read_event.set()
    >>> await ping_pong()
    """

    def __init__(
        self, pool, read_event=None, read_queue=None, write_event=None, write_queue=None
    ):
        self.write_event = write_event
        self.write_queue = write_queue
        self.read_event = read_event
        self.read_queue = read_queue
        self.pool = pool

    def __getattr__(self, name):
        return getattr(self.pool, name)

    async def connect(self, *args, **kwargs):
        comm = await self.pool.connect(*args, **kwargs)
        return LockedComm(
            comm, self.read_event, self.read_queue, self.write_event, self.write_queue
        )

    async def close(self):
        await self.pool.close()


def xfail_ssl_issue5601():
    """Work around https://github.com/dask/distributed/issues/5601 where any test that
    inits Security.temporary() crashes on MacOS GitHub Actions CI
    """
    pytest.importorskip("cryptography")
    try:
        Security.temporary()
    except ImportError:
        if MACOS:
            pytest.xfail(reason="distributed#5601")
        raise


def assert_worker_story(
    story: list[tuple], expect: list[tuple], *, strict: bool = False
) -> None:
    """Test the output of ``Worker.story``

    Parameters
    ==========
    story: list[tuple]
        Output of Worker.story
    expect: list[tuple]
        Expected events. Each expected event must contain exactly two fewer fields
        than the story event (the last two fields are always the stimulus_id and the
        timestamp).
        Elements of the expect tuples can be

        - callables, which accept a single element of the event tuple as argument and
          return True for match and False for no match;
        - arbitrary objects, which are compared with a == b

        e.g.

        .. code-block:: python

            expect=[
                ("x", "missing", "fetch", "fetch", {}),
                ("gather-dependencies", worker_addr, lambda set_: "x" in set_),
            ]

    strict: bool, optional
        If True, the story must contain exactly as many events as expect.
        If False (the default), the story may contain more events than expect;
        extra events are ignored.
    """
    now = time()
    prev_ts = 0.0
    for ev in story:
        try:
            assert len(ev) > 2
            assert isinstance(ev, tuple)
            assert isinstance(ev[-2], str) and ev[-2]  # stimulus_id
            assert isinstance(ev[-1], float)  # timestamp
            assert prev_ts <= ev[-1]  # Timestamps are monotonic ascending
            # Timestamps are within the last hour. It's been observed that a
            # timestamp generated in a Nanny process can be a few milliseconds
            # in the future.
            assert now - 3600 < ev[-1] <= now + 1
            prev_ts = ev[-1]
        except AssertionError:
            raise AssertionError(
                f"Malformed story event: {ev}\nin story:\n{_format_story(story)}"
            )

    try:
        if strict and len(story) != len(expect):
            raise StopIteration()
        story_it = iter(story)
        for ev_expect in expect:
            while True:
                event = next(story_it)
                # Ignore (stimulus_id, timestamp)
                event = event[:-2]
                if len(event) == len(ev_expect) and all(
                    ex(ev) if callable(ex) else ev == ex
                    for ev, ex in zip(event, ev_expect)
                ):
                    break
    except StopIteration:
        raise AssertionError(
            f"assert_worker_story({strict=}) failed\n"
            f"story:\n{_format_story(story)}\n"
            f"expect:\n{_format_story(expect)}"
        ) from None


def _format_story(story: list[tuple]) -> str:
    if not story:
        return "(empty story)"
    return "- " + "\n- ".join(str(ev) for ev in story)


class BrokenComm(Comm):
    peer_address = ""
    local_address = ""

    def close(self):
        pass

    def closed(self):
        return True

    def abort(self):
        pass

    def read(self, deserializers=None):
        raise OSError()

    def write(self, msg, serializers=None, on_error=None):
        raise OSError()


def has_pytestmark(test_func: Callable, name: str) -> bool:
    """Return True if the test function is marked by the given @pytest.mark.<name>;
    False otherwise.

    FIXME doesn't work with individually marked parameters inside
          @pytest.mark.parametrize
    """
    marks = getattr(test_func, "pytestmark", [])
    return any(mark.name == name for mark in marks)
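

# ---------------------------------------------------------------------------
# Illustrative usage sketch.  This is not part of the public API and is not
# used by the library; it only shows how the helpers above typically fit
# together in a test module (real tests live under ``distributed/tests/``).
# The function below is a hypothetical example: ``gen_cluster(client=True)``
# starts an in-process scheduler, two workers and an asynchronous client and
# injects them into the coroutine, while ``assert_worker_story`` matches a
# list of expected events (here against a hand-built, synthetic story).


@gen_cluster(client=True)
async def _example_gen_cluster_usage(c, s, a, b):
    # ``c`` is an asynchronous Client, ``s`` the Scheduler, ``a``/``b`` Workers
    future = c.submit(inc, 1)
    assert await future == 2

    # Each expected tuple omits the trailing (stimulus_id, timestamp) fields;
    # elements may be plain values or callables.  The story below is synthetic,
    # purely to illustrate the matching rules.
    synthetic_story = [
        ("x", "ready", "executing", "stim-1", time()),
        ("x", "executing", "memory", "stim-2", time()),
    ]
    assert_worker_story(
        synthetic_story,
        [("x", "executing", lambda finish: finish == "memory")],
        strict=False,
    )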