"""This module provides services to automatically restart threads and processes
that die.
"""
import enum
import logging
import multiprocessing
import pickle
import random
import threading
import time
from typing import Any, Callable, Dict, Generator, List, NamedTuple, Optional, Tuple
import pandas
from setproctitle import setproctitle
from factorytx import const
from factorytx.exceptions import Failure
from factorytx import logs
from factorytx import markers
log = logging.getLogger(__name__)
class DummyProcess(threading.Thread):
    """Thread subclass that stands in for multiprocessing.Process when
    simulating process-based execution (see RunMethod.DUMMY_PROCESSES).
    """
@enum.unique
class RunMethod(enum.Enum):
    """Maps each supervision strategy to the concurrency primitive used to
    run targets; each member's value is the class that gets instantiated.
    """
    PROCESSES: Any = multiprocessing.Process
    DUMMY_PROCESSES: Any = DummyProcess
    THREADS: Any = threading.Thread
class Target(NamedTuple):
    """A callable to supervise, along with the log context used to report its
    failures and the arguments to call it with.
    """
    context: logs.LogContext
    callable: Callable
    args: Tuple[Any, ...]
    kwargs: Dict[str, Any]
CHECK_INTERVAL_SECONDS = 1.0 # How long to wait between process checks.
MIN_BACKOFF_SECONDS = 1.0 # Min duration to back off before restarting a process.
MAX_BACKOFF_SECONDS = 60.0 # Max duration to back off before restarting a process.
HEALTHY_UPTIME_SECONDS = 600.0 # Duration a process must be up before it is considered healthy.
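# With the defaults above, a repeatedly-crashing target waits 1, 2, 4, ...
# seconds between restarts, capped at 60; once it stays up for 10 minutes the
# backoff resets to MIN_BACKOFF_SECONDS.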
def _run_process(run_method: RunMethod, target: Target) -> None:
"""Wraps the execution of `target`. This is used to set up the thread /
process context and catch any unexpected errors.
"""
if run_method == RunMethod.PROCESSES:
# TODO: Factor this out?
logs.initialize()
markers.setup_db(const.MARKER_PATH)
markers.initialize(const.MARKER_PATH)
# Don't just warn, or else bugs go unnoticed in multi-stage transform pipelines.
pandas.set_option('mode.chained_assignment', 'raise')
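        # Rename the OS-level process so `ps` output shows which component
        # this worker is running.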
setproctitle(f'factorytx [{logs.format_context(target.context)}]')
elif run_method == RunMethod.DUMMY_PROCESSES:
# Check that targets can be pickled and unpickled successfully to
# simulate spawning them under multiprocessing. We MUST do this in
# the target thread since some targets count on deserialization to
# reinitialize data structures that are tied to the current thread, eg.
# sqlite connections.
target = pickle.loads(pickle.dumps(target))
logs.set_context(context=target.context)
try:
target.callable(*target.args, **target.kwargs)
except Failure:
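        # Assumption: a Failure has already been reported by the code that
        # raised it, so exit quietly and let the supervisor restart us.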
pass
except Exception as e:
# Pass the context to markers.error so that it matches the context that
# the supervisor will pass to markers.clear(). Otherwise the supervisor
# might not clean up the marker when the process stabilizes.
markers.error('ftx.supervisor.error', f'Unexpected error: {e!s}',
context=target.context, log_exception=True)
# The Supervisor is implemented as a class so that in the future we can add
# methods to request that supervised processes be stopped or restarted.
class Supervisor:
"""A Supervisor supervises a collection of callables, running each callable
in its own thread or process. Each callable will be restarted if it terminates;
callables which terminate too frequently will wait longer to restart to
avoid overloading the system.
"""
    def __init__(self, run_method: RunMethod, targets: List[Target]) -> None:
"""Instantiates a supervisor object.
:param run_method: how to run targets. Please refer to options below.
:param targets: list of callables, each of which will be run in its own
thread / process. Each callable has an associated log context which
is used to log messages if the callable terminates unexpectedly.
Options for running a method:
- `RunMethod.DUMMY_PROCESSES`: run threads in the current process
but simulate running in another process;
- `RunMethod.THREADS`: run threads in the current process; or
- `RunMethod.PROCESSES`: run each target in a separate process.
"""
self.run_method = run_method
self.targets = targets
if self.run_method in (RunMethod.DUMMY_PROCESSES, RunMethod.THREADS):
self.running_type = 'thread'
elif self.run_method == RunMethod.PROCESSES:
self.running_type = 'process'
else:
            assert False, f'Unknown run method: {self.run_method!r}'
def _supervise(self, target: Target) -> Generator[None, None, None]:
"""Coroutine which supervises a specific callable."""
last_start_time = 0.0
        backoff_seconds = MIN_BACKOFF_SECONDS
process: Optional[Any] = None
while True:
try:
now = time.time()
if process is None:
last_start_time = now
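                    # The enum member's value is the Thread / Process class,
                    # so calling it constructs a fresh worker for this target.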
process = self.run_method.value(
target=_run_process,
args=(self.run_method, target),
daemon=True
)
process.start()
if process.is_alive():
if now > last_start_time + HEALTHY_UPTIME_SECONDS:
backoff_seconds = MIN_BACKOFF_SECONDS
markers.clear('ftx.supervisor.', context=target.context)
yield # Wait for the next polling period.
else:
process.join()
                    # Only set an error marker if we're running a process and
                    # it exited with a non-zero exit code. Otherwise we've
                    # already logged the error once in _run_process, so there's
                    # no reason to log it again.
if getattr(process, 'exitcode', 0) != 0:
msg = f'{self.running_type.title()} stopped unexpectedly.'
markers.error('ftx.supervisor.restart', msg, context=target.context)
                    next_start_time = now + backoff_seconds
                    log.info('Backing off %s seconds before restarting ...',
                             backoff_seconds)
                    backoff_seconds = min(2 * backoff_seconds, MAX_BACKOFF_SECONDS)
                    process = None
while time.time() < next_start_time:
yield # Wait for the next polling period.
except Exception as e:
markers.error('ftx.supervisor.unknown', f'An unknown error occurred: {e!s}',
context=target.context, log_exception=True)
yield # Wait for the next polling period.
    def run(self) -> None:
"""Runs forever, restarting any targets that terminate."""
coroutines = [self._supervise(target) for target in self.targets]
while True:
time.sleep(CHECK_INTERVAL_SECONDS)
for coroutine in coroutines:
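                # send(None) primes a fresh generator and resumes a suspended
                # one, advancing the supervision coroutine by one step.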
coroutine.send(None)
# `python -m factorytx.supervisor` will start a manual test of the supervisor.
# Task A should die and back off repeatedly until eventually its backoff resets;
# Task B should die less frequently, and should always be restarted with the
# minimum backoff.
def _test_task_a() -> None:
print("A")
time.sleep(random.randint(0, 15))
def _test_task_b() -> None:
print("B")
time.sleep(20)
if __name__ == '__main__':
HEALTHY_UPTIME_SECONDS = 10
MAX_BACKOFF_SECONDS = 14
logs.initialize()
context_a = logs.LogContext(component='A', component_type='Receiver', asset='', stream_type='')
context_b = logs.LogContext(component='B', component_type='Receiver', asset='foo', stream_type='bar')
targets = [
Target(context=context_a, callable=_test_task_a, args=(), kwargs={}),
Target(context=context_b, callable=_test_task_b, args=(), kwargs={}),
]
supervisor = Supervisor(RunMethod.THREADS, targets)
supervisor.run()