hyperactor_mesh/
mesh_controller.rs

1/*
2 * Copyright (c) Meta Platforms, Inc. and affiliates.
3 * All rights reserved.
4 *
5 * This source code is licensed under the BSD-style license found in the
6 * LICENSE file in the root directory of this source tree.
7 */
8
9use std::collections::HashMap;
10use std::collections::HashSet;
11use std::fmt::Debug;
12use std::time::SystemTime;
13
14use async_trait::async_trait;
15use hyperactor::Actor;
16use hyperactor::Context;
17use hyperactor::Endpoint as _;
18use hyperactor::Handler;
19use hyperactor::Instance;
20use hyperactor::RemoteEndpoint as _;
21use hyperactor::actor::ActorError;
22use hyperactor::actor::ActorErrorKind;
23use hyperactor::actor::ActorStatus;
24use hyperactor::actor::Referable;
25use hyperactor::actor::handle_undeliverable_message;
26use hyperactor::context;
27use hyperactor::kv_pairs;
28use hyperactor::mailbox::MessageEnvelope;
29use hyperactor::mailbox::RemoteMessage;
30use hyperactor::mailbox::Undeliverable;
31use hyperactor::mailbox::UndeliverableReason;
32use hyperactor::supervision::ActorSupervisionEvent;
33use hyperactor_config::CONFIG;
34use hyperactor_config::ConfigAttr;
35use hyperactor_config::Flattrs;
36use hyperactor_config::attrs::declare_attrs;
37use hyperactor_telemetry::declare_static_counter;
38use ndslice::ViewExt;
39use ndslice::view::CollectMeshExt;
40use ndslice::view::Point;
41use ndslice::view::Ranked;
42use opentelemetry::metrics::Counter;
43use serde::Deserialize;
44use serde::Serialize;
45use tokio::time::Duration;
46use typeuri::Named;
47
48use crate::ValueMesh;
49use crate::actor_mesh::ActorMeshRef;
50use crate::bootstrap::ProcStatus;
51use crate::casting::CAST_ACTOR_MESH_ID;
52use crate::casting::update_undeliverable_envelope_for_casting;
53use crate::mesh_id::ResourceId;
54use crate::proc_agent::ActorState;
55use crate::proc_agent::MESH_ORPHAN_TIMEOUT;
56use crate::proc_mesh::ProcMeshRef;
57use crate::resource;
58use crate::supervision::MeshFailure;
59use crate::supervision::Unhealthy;
60
/// Actor name for `ActorMeshController` when spawned as a named child.
///
/// NOTE(review): not referenced in this part of the file; presumably consumed
/// by whichever code spawns the controller — confirm at the spawn site.
pub const ACTOR_MESH_CONTROLLER_NAME: &str = "actor_mesh_controller";
63
declare_attrs! {
    /// Interval between supervision polls of the controlled mesh. Despite the
    /// name, this is a period (a `Duration`), not a rate: it is the delay used
    /// when scheduling the next `CheckState` self-message.
    ///
    /// The longer this is, the longer it takes to detect a failure and report
    /// it to all subscribers; shorter intervals send more frequent messages
    /// and heartbeats just to confirm that everything is still running. The
    /// default is chosen to balance these two objectives.
    ///
    /// This also controls how frequently the healthy heartbeat is sent out to
    /// subscribers when no failures are encountered.
    @meta(CONFIG = ConfigAttr::new(
        Some("HYPERACTOR_MESH_SUPERVISION_POLL_FREQUENCY".to_string()),
        None,
    ))
    pub attr SUPERVISION_POLL_FREQUENCY: Duration = Duration::from_secs(10);
}
78
// Counter for supervision-loop stalls on actor mesh controllers.
// NOTE(review): presumably the counter returned by `Controlled::stall_counter`
// for the actor-mesh implementation — confirm against that impl.
declare_static_counter!(
    ACTOR_MESH_CONTROLLER_SUPERVISION_STALLS,
    "actor.actor_mesh_controller.num_stalls"
);

// Counter for supervision-loop stalls on proc mesh controllers.
// NOTE(review): presumably the counter returned by `Controlled::stall_counter`
// for the proc-mesh implementation — confirm against that impl.
declare_static_counter!(
    PROC_MESH_CONTROLLER_SUPERVISION_STALLS,
    "actor.proc_mesh_controller.num_stalls"
);
88
/// Aggregated health and subscriber bookkeeping for a single
/// `ResourceController`. Tracks the most recently observed status of every
/// rank in the controlled mesh, the latched unhealthy event (if any), the
/// owner port (notified on failures), and the set of streaming subscribers
/// (notified on both stop and failure events). The generation counter on
/// each status entry provides last-writer-wins ordering between streamed
/// and polled updates.
#[derive(Debug)]
pub struct HealthState {
    /// The status of each rank in the controlled mesh, paired with the
    /// generation counter from the most recent update. The generation is
    /// used for last-writer-wins ordering between streamed and polled
    /// updates; entries created by `HealthState::new` start at generation 0.
    statuses: HashMap<Point, (resource::Status, u64)>,
    /// The latched unhealthy event for the mesh, if any. Once set, this is
    /// surfaced to new subscribers on subscribe and to `GetState` callers.
    /// `send_state_change` overwrites it with the most recent event each time
    /// it runs, so it reflects the latest reported event, not the first.
    unhealthy_event: Option<Unhealthy>,
    /// Per-rank supervision events, keyed by rank. Used to build
    /// region-scoped failure reports. Note that `send_state_change` records
    /// every event it reports here, including non-error "stopped" events,
    /// not only crashes.
    crashed_ranks: HashMap<usize, ActorSupervisionEvent>,
    /// The single owner of the controlled mesh, notified on failure events
    /// (but not on clean stops). `None` when the mesh has no owner to notify.
    owner: Option<hyperactor::PortRef<MeshFailure>>,
    /// Streaming subscribers, notified on both stop and failure events as
    /// well as periodic heartbeats (a `None` message). Stored as a set, so
    /// subscribing the same port twice results in a single entry.
    subscribers: HashSet<hyperactor::PortRef<Option<MeshFailure>>>,
}
115
116impl HealthState {
117    fn new(
118        statuses: HashMap<Point, resource::Status>,
119        owner: Option<hyperactor::PortRef<MeshFailure>>,
120    ) -> Self {
121        Self {
122            statuses: statuses
123                .into_iter()
124                .map(|(point, status)| (point, (status, 0)))
125                .collect(),
126            unhealthy_event: None,
127            crashed_ranks: HashMap::new(),
128            owner,
129            subscribers: HashSet::new(),
130        }
131    }
132
133    /// Try to update the status at `point`. Returns `true` if the status
134    /// was newly inserted or changed; `false` if dominated by a higher
135    /// generation or unchanged.
136    fn maybe_update(&mut self, point: Point, status: resource::Status, generation: u64) -> bool {
137        use std::collections::hash_map::Entry;
138        match self.statuses.entry(point) {
139            Entry::Occupied(mut entry) => {
140                let (old_status, old_gen) = entry.get();
141                // Once a resource enters a terminating state (including Stopping),
142                // its status is frozen — later updates are ignored.
143                if old_status.is_terminating() || *old_gen > generation {
144                    return false;
145                }
146                let changed = *old_status != status;
147                *entry.get_mut() = (status, generation);
148                changed
149            }
150            Entry::Vacant(entry) => {
151                entry.insert((status, generation));
152                true
153            }
154        }
155    }
156
157    /// True when every tracked rank has reached a terminating status.
158    fn all_terminating(&self) -> bool {
159        self.statuses.values().all(|(s, _)| s.is_terminating())
160    }
161
162    /// True when at least one tracked rank has reached a terminating status.
163    fn any_terminating(&self) -> bool {
164        self.statuses.values().any(|(s, _)| s.is_terminating())
165    }
166
167    /// The lowest rank that has not yet consumed one terminal state slot.
168    fn first_non_terminating_rank(&self) -> Option<usize> {
169        self.statuses
170            .iter()
171            .filter(|(_, (status, _))| !status.is_terminating())
172            .map(|(point, _)| point.rank())
173            .min()
174    }
175
176    /// Mark `rank` with a terminal status. Returns `true` only if this rank
177    /// was not already terminal.
178    fn mark_rank_terminating(&mut self, rank: usize, status: resource::Status) -> bool {
179        assert!(status.is_terminating(), "rank status must be terminating");
180        let point = self
181            .statuses
182            .keys()
183            .find(|point| point.rank() == rank)
184            .cloned()
185            .unwrap_or_else(|| panic!("rank {rank} is not tracked by health state"));
186        self.maybe_update(point, status, u64::MAX)
187    }
188
189    /// Apply status updates from polled resource states and invoke `on_change`
190    /// for each rank whose status actually changed. The point passed to
191    /// `on_change` is the created rank, *not* the rank of the possibly sliced
192    /// input mesh. Returns `true` if `on_change` reported at least one
193    /// notification (used to decide whether a heartbeat is needed).
194    pub(crate) fn apply_updates_and_notify<S: Clone + 'static>(
195        &mut self,
196        states: &ValueMesh<resource::State<S>>,
197        mut on_change: impl FnMut(resource::State<S>, &mut HealthState) -> bool,
198    ) -> bool {
199        let mut did_notify = false;
200        for (point, state) in states.iter() {
201            let status = state.status.clone();
202            let generation = state.generation;
203            if self.maybe_update(point, status, generation) && on_change(state, self) {
204                did_notify = true;
205            }
206        }
207        did_notify
208    }
209}
210
/// Outcome of the mesh-specific polling phase inside `CheckState`.
///
/// Returned by `Controlled::poll_states` and interpreted by the controller's
/// `CheckState` handling to decide whether to reschedule, send a heartbeat,
/// or stop monitoring.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum PollResult {
    /// An error or early condition was handled internally; just reschedule.
    Reschedule,
    /// A terminal polling failure was reported; stop periodic polling.
    StopMonitoring,
    /// States were polled and processed. `did_notify` is true if at least
    /// one subscriber/owner notification was sent.
    Processed { did_notify: bool },
}
221
222/// Compute the keepalive expiry from `MESH_ORPHAN_TIMEOUT`, or `None` if
223/// the timeout is disabled.
224fn compute_keepalive() -> Option<SystemTime> {
225    hyperactor_config::global::get(MESH_ORPHAN_TIMEOUT).map(|d| SystemTime::now() + d)
226}
227
/// Subscribe the given port to updates about a mesh. Subscribing the same
/// port more than once has no additional effect: each event is still
/// delivered to it only once.
///
/// While the mesh has no failures, the controller periodically sends `None`
/// as a heartbeat, which assures the listener that the controller is still
/// alive. Listeners should filter these `None` messages out, as they carry no
/// event.
#[derive(Debug, Clone, PartialEq, Serialize, Deserialize, Named)]
pub struct Subscribe(pub hyperactor::PortRef<Option<MeshFailure>>);
wirevalue::register_type!(Subscribe);
236
/// Unsubscribe the given port from future updates about a mesh. This should
/// be the same port that was passed in the `Subscribe` message.
#[derive(Debug, Clone, PartialEq, Serialize, Deserialize, Named)]
pub struct Unsubscribe(pub hyperactor::PortRef<Option<MeshFailure>>);
wirevalue::register_type!(Unsubscribe);
242
/// Query the number of active supervision subscribers on this controller.
/// The count is sent to the enclosed reply port.
#[derive(Debug, Clone, PartialEq, Serialize, Deserialize, Named)]
pub struct GetSubscriberCount(pub hyperactor::PortRef<usize>);
wirevalue::register_type!(GetSubscriberCount);
247
/// Check the state of the resources in the mesh. The controller sends this
/// to itself to drive its periodic supervision check.
///
/// The payload is the wall-clock time at which the handler is expected to
/// start running; comparing it with the actual start time detects stalls in
/// message handling.
#[derive(Debug, Clone, PartialEq, Serialize, Deserialize, Named)]
pub struct CheckState(pub SystemTime);
wirevalue::register_type!(CheckState);
255
declare_attrs! {
    /// If present in a message header, the message was sent by a mesh
    /// controller to a subscriber and can be safely dropped if it is returned
    /// as undeliverable. Set on both failure notifications and heartbeats.
    pub attr ACTOR_MESH_SUBSCRIBER_MESSAGE: bool;
}
261
262fn send_subscriber_message(
263    cx: &impl context::Actor,
264    subscriber: &hyperactor::PortRef<Option<MeshFailure>>,
265    message: MeshFailure,
266) {
267    let mut headers = Flattrs::new();
268    headers.set(ACTOR_MESH_SUBSCRIBER_MESSAGE, true);
269    subscriber.post_with_headers(cx, headers, Some(message.clone()));
270    tracing::info!(event = %message, "sent supervision failure message to subscriber {}", subscriber.port_addr());
271}
272
273/// Like send_state_change, but when there was no state change that occurred.
274/// Will send a None message to subscribers, and there is no state to change.
275/// Is not sent to the owner, because the owner is only watching for failures.
276/// Should be called once every so often so subscribers can discern the difference
277/// between "no messages because no errors" and "no messages because controller died".
278/// Without sending these hearbeats, subscribers will assume the mesh is dead.
279fn send_heartbeat(cx: &impl context::Actor, health_state: &HealthState) {
280    tracing::debug!(
281        num_subscribers = health_state.subscribers.len(),
282        "sending heartbeat to subscribers",
283    );
284
285    for subscriber in health_state.subscribers.iter() {
286        let mut headers = Flattrs::new();
287        headers.set(ACTOR_MESH_SUBSCRIBER_MESSAGE, true);
288        subscriber.post_with_headers(cx, headers, None);
289    }
290}
291
292/// Sends a MeshFailure to the owner and subscribers of this mesh,
293/// and changes the health state stored unhealthy_event.
294/// Owners are sent a message only for Failure events, not for Stopped events.
295/// Subscribers are sent both Stopped and Failure events.
296fn send_state_change(
297    cx: &impl context::Actor,
298    rank: usize,
299    event: ActorSupervisionEvent,
300    mesh_name: &ResourceId,
301    is_proc_stopped: bool,
302    health_state: &mut HealthState,
303) {
304    // This does not include the Stopped status, which is a state that occurs when the
305    // user calls stop() on a proc or actor mesh.
306    let is_failed = event.is_error();
307    if is_failed {
308        tracing::warn!(
309            name = "SupervisionEvent",
310            actor_mesh = %mesh_name,
311            %event,
312            "detected supervision error on monitored mesh: name={mesh_name}",
313        );
314    } else {
315        tracing::debug!(
316            name = "SupervisionEvent",
317            actor_mesh = %mesh_name,
318            %event,
319            "detected non-error supervision event on monitored mesh: name={mesh_name}",
320        );
321    }
322
323    let failure_message = MeshFailure {
324        actor_mesh_name: Some(mesh_name.to_string()),
325        event: event.clone(),
326        crashed_ranks: vec![rank],
327    };
328    health_state.crashed_ranks.insert(rank, event.clone());
329    health_state.unhealthy_event = Some(if is_proc_stopped {
330        Unhealthy::StreamClosed(failure_message.clone())
331    } else {
332        Unhealthy::Crashed(failure_message.clone())
333    });
334    // Send a notification to the owning actor of this mesh, if there is one.
335    // Don't send a message to the owner for non-failure events such as "stopped".
336    // Those events are always initiated by the owner, who don't need to be
337    // told that they were stopped.
338    if is_failed && let Some(owner) = &health_state.owner {
339        owner.post(cx, failure_message.clone());
340        tracing::info!(actor_mesh = %mesh_name, %event, "sent supervision failure message to owner {}", owner.port_addr());
341    }
342    // Subscribers get all messages, even for non-failures like Stopped, because
343    // they need to know if the owner stopped the mesh.
344    for subscriber in health_state.subscribers.iter() {
345        send_subscriber_message(cx, subscriber, failure_message.clone());
346    }
347}
348
349fn send_poll_failure(
350    cx: &impl context::Actor,
351    event: ActorSupervisionEvent,
352    mesh_name: &ResourceId,
353    health_state: &mut HealthState,
354) -> PollResult {
355    let Some(rank) = health_state.first_non_terminating_rank() else {
356        return PollResult::StopMonitoring;
357    };
358    health_state.mark_rank_terminating(rank, resource::Status::Failed(event.to_string()));
359    send_state_change(cx, rank, event, mesh_name, false, health_state);
360    PollResult::StopMonitoring
361}
362
363fn actor_state_to_supervision_events(
364    state: resource::State<ActorState>,
365) -> (usize, Vec<ActorSupervisionEvent>) {
366    let (rank, actor_id, events) = match state.state {
367        Some(inner) => (
368            inner.create_rank,
369            Some(inner.actor_id),
370            inner.supervision_events.clone(),
371        ),
372        None => (0, None, vec![]),
373    };
374    let events = match state.status {
375        // If the actor was killed, it might not have a Failed status
376        // or supervision events, and it can't tell us which rank
377        resource::Status::NotExist | resource::Status::Stopped | resource::Status::Timeout(_) => {
378            // it was.
379            if !events.is_empty() {
380                events
381            } else {
382                vec![ActorSupervisionEvent::new(
383                    actor_id.expect("actor_id is None"),
384                    None,
385                    ActorStatus::Stopped(
386                        format!(
387                            "actor status is {}; actor may have been killed",
388                            state.status
389                        )
390                        .to_string(),
391                    ),
392                    None,
393                )]
394            }
395        }
396        resource::Status::Failed(_) => events,
397        // All other states are successful.
398        _ => vec![],
399    };
400    (rank, events)
401}
402
403/// Map a process-level [`ProcStatus`] to an actor-level [`ActorStatus`].
404///
405/// When the supervision poll discovers that a process is terminating, this
406/// function decides whether to treat it as a clean stop or a failure.
407/// Notably, [`ProcStatus::Stopping`] (SIGTERM sent, process not yet exited)
408/// is mapped to [`ActorStatus::Stopped`] rather than [`ActorStatus::Failed`]
409/// so that a graceful shutdown in progress does not trigger unhandled
410/// supervision errors.
411fn proc_status_to_actor_status(proc_status: Option<ProcStatus>) -> ActorStatus {
412    match proc_status {
413        Some(ProcStatus::Stopped { exit_code: 0, .. }) => {
414            ActorStatus::Stopped("process exited cleanly".to_string())
415        }
416        Some(ProcStatus::Stopped { exit_code, .. }) => {
417            ActorStatus::Failed(ActorErrorKind::Generic(format!(
418                "the process this actor was running on exited with non-zero code {}",
419                exit_code
420            )))
421        }
422        // Stopping is a transient state during graceful shutdown. Treat it the
423        // same as a clean stop rather than a failure.
424        Some(ProcStatus::Stopping { .. }) => {
425            ActorStatus::Stopped("process is stopping".to_string())
426        }
427        // Conservatively treat lack of status as stopped
428        None => ActorStatus::Stopped("no status received from process".to_string()),
429        Some(status) => ActorStatus::Failed(ActorErrorKind::Generic(format!(
430            "the process this actor was running on failed: {}",
431            status
432        ))),
433    }
434}
435
436/// Log a warning and bump `counter` if the supervision loop is running late.
437///
438/// "Late" means the current wall-clock time exceeds `expected_time` by more
439/// than one full poll interval, i.e. 2x the expected period.
440fn check_stall(expected_time: SystemTime, actor_id: &hyperactor::ActorId, counter: &Counter<u64>) {
441    let now = SystemTime::now();
442    let poll_frequency = hyperactor_config::global::get(SUPERVISION_POLL_FREQUENCY);
443    let Ok(mut stalled_by) = now.duration_since(expected_time + poll_frequency) else {
444        return;
445    };
446    // Add back before using, as the addition is only used to avoid logging unless it's
447    // stalled by at least one cycle.
448    stalled_by += poll_frequency;
449    counter.add(
450        1,
451        // hyperactor_telemetry::Value only supports i64, not u64.
452        kv_pairs!("actor_id" => actor_id.to_string(), "stalled_by_seconds" => stalled_by.as_secs() as i64),
453    );
454    tracing::warn!(
455        %actor_id,
456        "Handler<CheckState> is stalled by {}",
457        humantime::format_duration(stalled_by),
458    );
459}
460
/// Mesh-specific behavior required by the generic `ResourceController`.
///
/// Each variant of resource mesh (actor, proc) implements this trait to
/// provide the details that cannot be shared by the generic controller:
/// the state type carried in `resource::State<_>`, how to query or stream
/// that state from the underlying agents, how to stop the resources, and
/// how to notify observers when the state changes.
#[async_trait]
pub trait Controlled: Clone + Debug + Send + Sync + 'static {
    /// Inner payload carried in `resource::State<Self::StateInner>`.
    type StateInner: RemoteMessage + Clone + Debug + 'static;

    /// Counter bumped when the supervision loop detects a stall. The
    /// controller passes it to `check_stall` at the start of each
    /// `CheckState` run.
    fn stall_counter() -> &'static Counter<u64>;

    /// The mesh's resource identifier. Also used as the fallback supervision
    /// display name when the controller was not given one.
    fn id(&self) -> &ResourceId;

    /// The region of ranks in this mesh.
    fn region(&self) -> &ndslice::Region;

    /// Subscribe the given port to `StreamState<StateInner>` updates from
    /// the underlying agents. Called once from the controller's `init`,
    /// before the periodic monitor is started.
    fn subscribe_to_stream(
        &self,
        cx: &impl context::Actor,
        subscriber: hyperactor::PortRef<resource::State<Self::StateInner>>,
    ) -> anyhow::Result<()>;

    /// Forward a `WaitRankStatus` message to the underlying agents.
    fn forward_wait_rank_status(
        &self,
        cx: &impl context::Actor,
        msg: resource::WaitRankStatus,
    ) -> anyhow::Result<()>;

    /// Mesh-specific polling step for the supervision loop. Implementations
    /// may do pre-checks (such as the actor mesh's proc-aliveness check)
    /// before querying rank states; updates to `health_state` happen
    /// in-place. `supervision_display_name` is used for synthesised
    /// supervision events (e.g., when a proc dies).
    ///
    /// The returned `PollResult` tells the controller whether to reschedule
    /// the next check, stop monitoring, or consider a heartbeat.
    async fn poll_states(
        &self,
        cx: &impl context::Actor,
        supervision_display_name: &str,
        health_state: &mut HealthState,
    ) -> PollResult;

    /// Process a single streamed or polled state. Updates the health state
    /// and notifies owner/subscribers as appropriate. Returns `true` if a
    /// notification was emitted (used to suppress heartbeats).
    fn process_state(
        &self,
        cx: &impl context::Actor,
        state: resource::State<Self::StateInner>,
        health_state: &mut HealthState,
    ) -> bool;

    /// Perform the mesh-specific stop: issue stop messages to the underlying
    /// agents and, where appropriate, update `health_state` and notify
    /// subscribers. The caller has already taken the monitor and logged.
    async fn handle_stop_request(
        &self,
        cx: &impl context::Actor,
        supervision_display_name: &str,
        reason: String,
        health_state: &mut HealthState,
    ) -> anyhow::Result<()>;

    /// Stop this mesh on controller cleanup (when `Stop` was not received
    /// but the actor is shutting down).
    async fn cleanup_stop(&self, cx: &impl context::Actor, reason: String) -> anyhow::Result<()>;
}
534
/// Generic controller for a mesh of resources. Actor meshes instantiate this
/// as `ActorMeshController<A> = ResourceController<ActorMeshControlPlane<A>>`.
/// All shared behavior lives here; mesh-specific behavior is delegated through
/// the `Controlled` trait.
///
/// `resource::mesh::Spec<()>` and `resource::mesh::State<()>` (instead of
/// `Spec<T::Spec>` / `State<T::StateInner>`) are used because the
/// controller participates in the mesh `resource` protocol only at the
/// outer layer: callers of `GetState` on the controller want the
/// per-rank statuses and the mesh-wide status that `resource::mesh::State`
/// already carries, not the inner `T::StateInner` payload (which is
/// available rank-by-rank via the `resource::State<T::StateInner>` stream).
/// The unit type is the explicit "no extra payload" choice.
#[hyperactor::export(
    handlers=[
        Subscribe,
        Unsubscribe,
        GetSubscriberCount,
        CheckState,
        resource::WaitRankStatus,
        resource::CreateOrUpdate<resource::mesh::Spec<()>>,
        resource::GetState<resource::mesh::State<()>>,
        resource::Stop,
        resource::State<T::StateInner>,
    ]
)]
pub struct ResourceController<T: Controlled> {
    /// The controlled mesh, providing all mesh-specific behavior.
    mesh: T,
    /// Supervision display name used in telemetry and fake supervision
    /// events. If `None`, falls back to `mesh.id()`.
    supervision_display_name: Option<String>,
    /// Shared health state for the monitor and responding to queries.
    health_state: HealthState,
    /// Marker for whether periodic monitoring is active. Set to `Some(())`
    /// in `init`; when `None`, the controller has stopped monitoring: no
    /// further `CheckState` messages are scheduled, and the mesh-level
    /// status is reported as stopped unless an unhealthy event is latched.
    monitor: Option<()>,
}
572
/// Controller-side state for an actor mesh.
///
/// `ActorMeshRef` is moving toward being a self-contained, serializable mesh
/// descriptor. The controller is the one place that needs both the actor mesh
/// and its backing proc mesh: it streams actor state through proc agents and
/// stops actors through the proc mesh control plane.
pub struct ActorMeshControlPlane<A: Referable> {
    /// The actor mesh being controlled.
    actor_mesh: ActorMeshRef<A>,
    /// The proc mesh backing `actor_mesh`.
    proc_mesh: ProcMeshRef,
}
583
impl<A: Referable> ActorMeshControlPlane<A> {
    /// Pair an actor mesh reference with the reference to its backing proc
    /// mesh.
    pub(crate) fn new(actor_mesh: ActorMeshRef<A>, proc_mesh: ProcMeshRef) -> Self {
        Self {
            actor_mesh,
            proc_mesh,
        }
    }
}
592
593impl<A: Referable> Clone for ActorMeshControlPlane<A> {
594    fn clone(&self) -> Self {
595        Self {
596            actor_mesh: self.actor_mesh.clone(),
597            proc_mesh: self.proc_mesh.clone(),
598        }
599    }
600}
601
602impl<A: Referable> Debug for ActorMeshControlPlane<A> {
603    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
604        f.debug_struct("ActorMeshControlPlane")
605            .field("actor_mesh", &self.actor_mesh)
606            .field("proc_mesh", &self.proc_mesh)
607            .finish()
608    }
609}
610
impl<A: Referable> Named for ActorMeshControlPlane<A> {
    /// The type name for this instantiation of the control plane.
    ///
    /// NOTE(review): presumably `intern_typename!` substitutes the type name
    /// of `A` for `{}`, so each instantiation gets a distinct name — confirm
    /// against the `wirevalue` macro documentation.
    fn typename() -> &'static str {
        wirevalue::intern_typename!(Self, "hyperactor_mesh::ActorMeshControlPlane<{}>", A)
    }
}
616
/// Controller for an actor mesh: the generic `ResourceController`
/// instantiated with the actor-mesh control plane for actor type `A`.
pub type ActorMeshController<A> = ResourceController<ActorMeshControlPlane<A>>;
619
620impl<T: Controlled> ResourceController<T> {
621    /// Create a new controller over the given mesh.
622    pub(crate) fn new(
623        mesh: T,
624        supervision_display_name: Option<String>,
625        owner: Option<hyperactor::PortRef<MeshFailure>>,
626        initial_statuses: ValueMesh<resource::Status>,
627    ) -> Self {
628        Self {
629            mesh,
630            supervision_display_name,
631            health_state: HealthState::new(initial_statuses.iter().collect(), owner),
632            monitor: None,
633        }
634    }
635
636    /// The display name to use for supervision events and telemetry.
637    pub(crate) fn supervision_display_name(&self) -> String {
638        self.supervision_display_name
639            .clone()
640            .unwrap_or_else(|| self.mesh.id().to_string())
641    }
642
643    /// Schedule the next `CheckState` self-message if the monitor is active.
644    ///
645    /// `send_fn` bridges the type gap: the caller passes a closure that
646    /// captures the typed `Instance`/`Context` and calls
647    /// `post_after`.
648    fn schedule_next_check(&self, send_fn: impl FnOnce(CheckState, Duration)) {
649        if self.monitor.is_some() {
650            let delay = hyperactor_config::global::get(SUPERVISION_POLL_FREQUENCY);
651            send_fn(CheckState(SystemTime::now() + delay), delay);
652        }
653    }
654
655    /// Derive the mesh-level status from health state and monitor presence.
656    fn mesh_status(&self) -> resource::Status {
657        if let Some(Unhealthy::Crashed(e)) = &self.health_state.unhealthy_event {
658            resource::Status::Failed(e.to_string())
659        } else if let Some(Unhealthy::StreamClosed(_)) = &self.health_state.unhealthy_event {
660            resource::Status::Stopped
661        } else if self.monitor.is_none() {
662            resource::Status::Stopped
663        } else {
664            resource::Status::Running
665        }
666    }
667
668    /// Build and send the `GetState<resource::mesh::State<()>>` response.
669    fn handle_get_state_msg(
670        &self,
671        cx: &impl context::Actor,
672        message: resource::GetState<resource::mesh::State<()>>,
673    ) -> anyhow::Result<()> {
674        let status = self.mesh_status();
675        let mut statuses = self
676            .health_state
677            .statuses
678            .iter()
679            .map(|(p, (s, _))| (p.clone(), s.clone()))
680            .collect::<Vec<_>>();
681        statuses.sort_by_key(|(p, _)| p.rank());
682        let statuses: ValueMesh<resource::Status> =
683            statuses
684                .into_iter()
685                .map(|(_, s)| s)
686                .collect_mesh::<ValueMesh<_>>(self.mesh.region().clone())?;
687        let state = resource::mesh::State {
688            statuses,
689            state: (),
690        };
691        message.reply.post(
692            cx,
693            resource::State {
694                id: message.id,
695                status,
696                state: Some(state),
697                generation: 0,
698                timestamp: SystemTime::now(),
699            },
700        );
701        Ok(())
702    }
703
704    /// Drop the monitor if every tracked rank has reached a terminal status.
705    fn stop_if_all_terminating(&mut self) {
706        if self.health_state.all_terminating() {
707            self.monitor.take();
708        }
709    }
710
711    async fn handle_check_state(
712        &mut self,
713        cx: &Context<'_, Self>,
714        expected_time: SystemTime,
715    ) -> anyhow::Result<()>
716    where
717        resource::State<T::StateInner>: RemoteMessage,
718    {
719        if self.monitor.is_none() {
720            return Ok(());
721        }
722        check_stall(expected_time, cx.self_addr().id(), T::stall_counter());
723
724        let display = self.supervision_display_name();
725        let result = self
726            .mesh
727            .poll_states(cx, &display, &mut self.health_state)
728            .await;
729
730        match result {
731            PollResult::Reschedule => {
732                self.schedule_next_check(|msg, delay| cx.post_after(cx, msg, delay));
733            }
734            PollResult::StopMonitoring => {
735                self.monitor.take();
736            }
737            PollResult::Processed { did_notify } => {
738                // Suppress heartbeats once any rank is terminating: the mesh is on
739                // its way down and subscribers will get a real state-change message
740                // for the terminal transition.
741                if !did_notify && !self.health_state.any_terminating() {
742                    send_heartbeat(cx, &self.health_state);
743                }
744                if !self.health_state.all_terminating() {
745                    self.schedule_next_check(|msg, delay| cx.post_after(cx, msg, delay));
746                } else {
747                    self.monitor.take();
748                }
749            }
750        }
751        Ok(())
752    }
753}
754
755impl<T: Controlled> Debug for ResourceController<T> {
756    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
757        f.debug_struct("ResourceController")
758            .field("mesh", &self.mesh)
759            .field("health_state", &self.health_state)
760            .field("monitor", &self.monitor)
761            .finish()
762    }
763}
764
765impl<T: Controlled> resource::mesh::Mesh for ResourceController<T> {
766    type Spec = ();
767    type State = ();
768}
769
770#[async_trait]
771impl<T: Controlled> Actor for ResourceController<T>
772where
773    resource::State<T::StateInner>: RemoteMessage,
774{
775    async fn init(&mut self, this: &Instance<Self>) -> Result<(), anyhow::Error> {
776        this.set_system();
777
778        // Subscribe to streaming state updates from the underlying agents so
779        // the controller receives state changes in real time, complementing
780        // the existing polling loop. Must happen before starting the monitor
781        // so that the first CheckState does not race the initial StreamState
782        // cast.
783        //
784        // Bind the handler port before handing the `PortRef` to the agent.
785        // The later external bind races against `init`, but both calls bind
786        // the same well-known handler port and are idempotent. No `.unsplit()`:
787        // the subscriber port is split by the cast tree so streamed states
788        // reduce up it (see the `StreamState` cast).
789        self.mesh.subscribe_to_stream(this, this.port().bind())?;
790
791        // Start the monitor task.
792        self.monitor = Some(());
793        self.schedule_next_check(|msg, delay| this.post_after(this, msg, delay));
794
795        let owner = if let Some(owner) = &self.health_state.owner {
796            owner.to_string()
797        } else {
798            String::from("None")
799        };
800        tracing::info!(
801            actor_id = %this.self_addr(),
802            %owner,
803            "started resource controller for {}",
804            self.mesh.id()
805        );
806        Ok(())
807    }
808
809    async fn cleanup(
810        &mut self,
811        this: &Instance<Self>,
812        _err: Option<&ActorError>,
813    ) -> Result<(), anyhow::Error> {
814        if self.monitor.take().is_some() {
815            tracing::info!(
816                actor_id = %this.self_addr(),
817                mesh = %self.mesh.id(),
818                "starting cleanup for ResourceController, stopping mesh",
819            );
820            self.mesh
821                .cleanup_stop(this, "resource controller cleanup".to_string())
822                .await?;
823        }
824        Ok(())
825    }
826
827    async fn handle_undeliverable_message(
828        &mut self,
829        cx: &Instance<Self>,
830        reason: UndeliverableReason,
831        mut envelope: Undeliverable<MessageEnvelope>,
832    ) -> Result<(), anyhow::Error> {
833        envelope = update_undeliverable_envelope_for_casting(envelope);
834        let Some(returned) = envelope.as_message() else {
835            return handle_undeliverable_message(cx, reason, envelope);
836        };
837        if let Some(true) = returned.headers().get(ACTOR_MESH_SUBSCRIBER_MESSAGE) {
838            // Remove from the subscriber list (if it existed) so we don't
839            // send to this subscriber again.
840            // NOTE: The only part of the port that is used for equality checks is
841            // the port id, so create a new one just for the comparison.
842            let dest_port_id = returned.dest().clone();
843            let port = hyperactor::PortRef::<Option<MeshFailure>>::attest(dest_port_id);
844            let did_exist = self.health_state.subscribers.remove(&port);
845            if did_exist {
846                tracing::debug!(
847                    actor_id = %cx.self_addr(),
848                    num_subscribers = self.health_state.subscribers.len(),
849                    "ResourceController: removed subscriber {} from mesh controller",
850                    port.port_addr()
851                );
852            }
853            Ok(())
854        } else if returned.headers().get(CAST_ACTOR_MESH_ID).is_some() {
855            // A cast message we sent (e.g. StreamState or KeepaliveGetState)
856            // was returned by the CommActor because it could not be forwarded.
857            // This is expected when the network session is broken. Log and
858            // continue — the supervision polling loop will detect the failure.
859            tracing::warn!(
860                actor_id = %cx.self_addr(),
861                dest = %returned.dest(),
862                "ResourceController: ignoring undeliverable cast message",
863            );
864            Ok(())
865        } else {
866            handle_undeliverable_message(cx, reason, envelope)
867        }
868    }
869
870    async fn handle_invalid_reference(
871        &mut self,
872        cx: &Instance<Self>,
873        invalid: hyperactor::mailbox::InvalidReference,
874        envelope: Undeliverable<MessageEnvelope>,
875    ) -> Result<(), anyhow::Error> {
876        let envelope = update_undeliverable_envelope_for_casting(envelope);
877        let Some(returned) = envelope.as_message() else {
878            return hyperactor::actor::handle_invalid_reference(cx, invalid, envelope);
879        };
880        if let Some(true) = returned.headers().get(ACTOR_MESH_SUBSCRIBER_MESSAGE) {
881            let dest_port_id = returned.dest().clone();
882            let port = hyperactor::PortRef::<Option<MeshFailure>>::attest(dest_port_id);
883            let did_exist = self.health_state.subscribers.remove(&port);
884            if did_exist {
885                tracing::debug!(
886                    actor_id = %cx.self_addr(),
887                    num_subscribers = self.health_state.subscribers.len(),
888                    "ResourceController: removed subscriber {} from mesh controller",
889                    port.port_addr()
890                );
891            }
892            Ok(())
893        } else if returned.headers().get(CAST_ACTOR_MESH_ID).is_some() {
894            tracing::warn!(
895                actor_id = %cx.self_addr(),
896                dest = %returned.dest(),
897                "ResourceController: ignoring undeliverable cast message",
898            );
899            Ok(())
900        } else {
901            hyperactor::actor::handle_invalid_reference(cx, invalid, envelope)
902        }
903    }
904}
905
906#[async_trait]
907impl<T: Controlled> Handler<Subscribe> for ResourceController<T>
908where
909    resource::State<T::StateInner>: RemoteMessage,
910{
911    async fn handle(&mut self, cx: &Context<Self>, message: Subscribe) -> anyhow::Result<()> {
912        // If there are any crashed ranks, replay a failure event so the new
913        // subscriber learns about the current health state. We send a single
914        // message with all crashed ranks so the subscriber's filter can check
915        // overlap with its slice region. This avoids the watch-channel
916        // coalescing problem (sending per-rank messages would lose all but
917        // the last one).
918        if let Some(unhealthy) = &self.health_state.unhealthy_event {
919            let msg = match unhealthy {
920                Unhealthy::StreamClosed(msg) | Unhealthy::Crashed(msg) => msg,
921            };
922            let mut replay_msg = msg.clone();
923            replay_msg.crashed_ranks = self.health_state.crashed_ranks.keys().copied().collect();
924            send_subscriber_message(cx, &message.0, replay_msg);
925        }
926        let port_id = message.0.port_addr().clone();
927        if self.health_state.subscribers.insert(message.0) {
928            tracing::debug!(
929                actor_id = %cx.self_addr(),
930                num_subscribers = self.health_state.subscribers.len(),
931                "added subscriber {} to mesh controller",
932                port_id
933            );
934        }
935        Ok(())
936    }
937}
938
939#[async_trait]
940impl<T: Controlled> Handler<Unsubscribe> for ResourceController<T>
941where
942    resource::State<T::StateInner>: RemoteMessage,
943{
944    async fn handle(&mut self, cx: &Context<Self>, message: Unsubscribe) -> anyhow::Result<()> {
945        if self.health_state.subscribers.remove(&message.0) {
946            tracing::debug!(
947                actor_id = %cx.self_addr(),
948                num_subscribers = self.health_state.subscribers.len(),
949                "removed subscriber {} from mesh controller",
950                message.0.port_addr()
951            );
952        }
953        Ok(())
954    }
955}
956
957#[async_trait]
958impl<T: Controlled> Handler<GetSubscriberCount> for ResourceController<T>
959where
960    resource::State<T::StateInner>: RemoteMessage,
961{
962    async fn handle(
963        &mut self,
964        cx: &Context<Self>,
965        message: GetSubscriberCount,
966    ) -> anyhow::Result<()> {
967        message.0.post(cx, self.health_state.subscribers.len());
968        Ok(())
969    }
970}
971
972#[async_trait]
973impl<T: Controlled> Handler<resource::CreateOrUpdate<resource::mesh::Spec<()>>>
974    for ResourceController<T>
975where
976    resource::State<T::StateInner>: RemoteMessage,
977{
978    /// Currently a no-op as there's nothing to create or update, but allows
979    /// `ResourceController` to implement the resource mesh behavior.
980    async fn handle(
981        &mut self,
982        _cx: &Context<Self>,
983        _message: resource::CreateOrUpdate<resource::mesh::Spec<()>>,
984    ) -> anyhow::Result<()> {
985        Ok(())
986    }
987}
988
989#[async_trait]
990impl<T: Controlled> Handler<resource::GetState<resource::mesh::State<()>>> for ResourceController<T>
991where
992    resource::State<T::StateInner>: RemoteMessage,
993{
994    async fn handle(
995        &mut self,
996        cx: &Context<Self>,
997        message: resource::GetState<resource::mesh::State<()>>,
998    ) -> anyhow::Result<()> {
999        self.handle_get_state_msg(cx, message)
1000    }
1001}
1002
1003#[async_trait]
1004impl<T: Controlled> Handler<resource::Stop> for ResourceController<T>
1005where
1006    resource::State<T::StateInner>: RemoteMessage,
1007{
1008    async fn handle(&mut self, cx: &Context<Self>, message: resource::Stop) -> anyhow::Result<()> {
1009        let mesh_name = self.mesh.id().clone();
1010        tracing::info!(
1011            name = "ResourceControllerStatus",
1012            %mesh_name,
1013            reason = %message.reason,
1014            "stopping mesh"
1015        );
1016        if self.monitor.take().is_none() {
1017            tracing::debug!(
1018                actor_id = %cx.self_addr(),
1019                %mesh_name,
1020                "duplicate stop request, mesh is already stopped",
1021            );
1022            return Ok(());
1023        }
1024        let display = self.supervision_display_name();
1025        self.mesh
1026            .handle_stop_request(cx, &display, message.reason, &mut self.health_state)
1027            .await
1028    }
1029}
1030
1031#[async_trait]
1032impl<T: Controlled> Handler<resource::WaitRankStatus> for ResourceController<T>
1033where
1034    resource::State<T::StateInner>: RemoteMessage,
1035{
1036    /// Forward WaitRankStatus to the underlying agents. Each agent replies
1037    /// directly to the caller's accumulator port when its resource reaches
1038    /// the requested status.
1039    async fn handle(
1040        &mut self,
1041        cx: &Context<Self>,
1042        msg: resource::WaitRankStatus,
1043    ) -> anyhow::Result<()> {
1044        self.mesh.forward_wait_rank_status(cx, msg)
1045    }
1046}
1047
1048#[async_trait]
1049impl<T: Controlled> Handler<CheckState> for ResourceController<T>
1050where
1051    resource::State<T::StateInner>: RemoteMessage,
1052{
1053    async fn handle(
1054        &mut self,
1055        cx: &Context<Self>,
1056        CheckState(expected_time): CheckState,
1057    ) -> Result<(), anyhow::Error> {
1058        self.handle_check_state(cx, expected_time).await
1059    }
1060}
1061
1062#[async_trait]
1063impl<T: Controlled> Handler<resource::State<T::StateInner>> for ResourceController<T>
1064where
1065    resource::State<T::StateInner>: RemoteMessage,
1066{
1067    async fn handle(
1068        &mut self,
1069        cx: &Context<Self>,
1070        state: resource::State<T::StateInner>,
1071    ) -> anyhow::Result<()> {
1072        self.mesh.process_state(cx, state, &mut self.health_state);
1073        self.stop_if_all_terminating();
1074        Ok(())
1075    }
1076}
1077
1078/// `Controlled` implementation for an actor mesh.
1079#[async_trait]
1080impl<A: Referable> Controlled for ActorMeshControlPlane<A> {
1081    type StateInner = ActorState;
1082
1083    fn stall_counter() -> &'static Counter<u64> {
1084        &ACTOR_MESH_CONTROLLER_SUPERVISION_STALLS
1085    }
1086
1087    fn id(&self) -> &ResourceId {
1088        self.actor_mesh.id().resource_id()
1089    }
1090
1091    fn region(&self) -> &ndslice::Region {
1092        ndslice::view::Ranked::region(&self.actor_mesh)
1093    }
1094
1095    fn subscribe_to_stream(
1096        &self,
1097        cx: &impl context::Actor,
1098        subscriber: hyperactor::PortRef<resource::State<ActorState>>,
1099    ) -> anyhow::Result<()> {
1100        self.proc_mesh.agent_mesh().cast(
1101            cx,
1102            resource::StreamState::<ActorState> {
1103                id: self.actor_mesh.id().resource_id().clone(),
1104                subscriber,
1105            },
1106        )?;
1107        Ok(())
1108    }
1109
1110    fn forward_wait_rank_status(
1111        &self,
1112        cx: &impl context::Actor,
1113        msg: resource::WaitRankStatus,
1114    ) -> anyhow::Result<()> {
1115        self.proc_mesh.agent_mesh().cast(cx, msg)?;
1116        Ok(())
1117    }
1118
1119    async fn poll_states(
1120        &self,
1121        cx: &impl context::Actor,
1122        supervision_display_name: &str,
1123        health_state: &mut HealthState,
1124    ) -> PollResult {
1125        let mesh_name = Controlled::id(self);
1126
1127        // Actor-specific: first check if the proc mesh is dead before
1128        // trying to query their agents.
1129        let proc_states = self.proc_mesh.states(cx, None).await;
1130        if let Err(e) = proc_states {
1131            return send_poll_failure(
1132                cx,
1133                ActorSupervisionEvent::new(
1134                    cx.instance().self_addr().clone(),
1135                    None,
1136                    ActorStatus::generic_failure(format!(
1137                        "unable to query for proc states: {:?}",
1138                        e
1139                    )),
1140                    None,
1141                ),
1142                mesh_name,
1143                health_state,
1144            );
1145        }
1146        if let Some(proc_states) = proc_states.unwrap() {
1147            // Check if the proc mesh is still alive.
1148            if let Some((point, state)) = proc_states
1149                .iter()
1150                .find(|(_rank, state)| state.status.is_terminating())
1151            {
1152                // TODO: allow "actor supervision event" to be general, and
1153                // make the proc failure the cause. It is a hack to try to determine
1154                // the correct status based on process exit status.
1155                let actor_status =
1156                    proc_status_to_actor_status(state.state.and_then(|s| s.proc_status));
1157                let stop_monitoring = actor_status.is_failed();
1158                let display = crate::actor_display_name(supervision_display_name, &point);
1159                let event = ActorSupervisionEvent::new(
1160                    // Attribute this to the monitored actor, even if the underlying
1161                    // cause is a proc_failure. We propagate the cause explicitly.
1162                    self.actor_mesh
1163                        .get(point.rank())
1164                        .unwrap()
1165                        .actor_addr()
1166                        .clone(),
1167                    Some(display),
1168                    actor_status,
1169                    None,
1170                );
1171                if stop_monitoring {
1172                    if health_state.mark_rank_terminating(
1173                        point.rank(),
1174                        resource::Status::Failed(event.to_string()),
1175                    ) {
1176                        send_state_change(cx, point.rank(), event, mesh_name, true, health_state);
1177                    }
1178                    return PollResult::StopMonitoring;
1179                } else {
1180                    send_state_change(cx, point.rank(), event, mesh_name, true, health_state);
1181                    return PollResult::Reschedule;
1182                }
1183            }
1184        }
1185
1186        // Query resource states with keepalive.
1187        let actor_states = self
1188            .proc_mesh
1189            .actor_states_with_keepalive(cx, self.actor_mesh.id().clone(), compute_keepalive())
1190            .await;
1191        match actor_states {
1192            Err(e) => send_poll_failure(
1193                cx,
1194                ActorSupervisionEvent::new(
1195                    cx.instance().self_addr().clone(),
1196                    Some(supervision_display_name.to_string()),
1197                    ActorStatus::generic_failure(format!(
1198                        "unable to query for actor states: {:?}",
1199                        e
1200                    )),
1201                    None,
1202                ),
1203                mesh_name,
1204                health_state,
1205            ),
1206            Ok(states) => {
1207                let did_notify =
1208                    health_state.apply_updates_and_notify(&states, |state, health_state| {
1209                        let (rank, events) = actor_state_to_supervision_events(state);
1210                        if events.is_empty() {
1211                            return false;
1212                        }
1213                        send_state_change(
1214                            cx,
1215                            rank,
1216                            events[0].clone(),
1217                            mesh_name,
1218                            false,
1219                            health_state,
1220                        );
1221                        true
1222                    });
1223                PollResult::Processed { did_notify }
1224            }
1225        }
1226    }
1227
1228    fn process_state(
1229        &self,
1230        cx: &impl context::Actor,
1231        state: resource::State<ActorState>,
1232        health_state: &mut HealthState,
1233    ) -> bool {
1234        let (rank, events) = actor_state_to_supervision_events(state.clone());
1235        let Ok(point) = Controlled::region(self).extent().point_of_rank(rank) else {
1236            return false;
1237        };
1238
1239        let changed = health_state.maybe_update(point, state.status, state.generation);
1240
1241        if changed && !events.is_empty() {
1242            send_state_change(
1243                cx,
1244                rank,
1245                events[0].clone(),
1246                Controlled::id(self),
1247                false,
1248                health_state,
1249            );
1250            true
1251        } else {
1252            false
1253        }
1254    }
1255
1256    async fn handle_stop_request(
1257        &self,
1258        cx: &impl context::Actor,
1259        _supervision_display_name: &str,
1260        reason: String,
1261        health_state: &mut HealthState,
1262    ) -> anyhow::Result<()> {
1263        let mesh_name = Controlled::id(self);
1264        tracing::info!(
1265            actor_id = %cx.instance().self_addr(),
1266            actor_mesh = %mesh_name,
1267            "forwarding stop request from ActorMeshController to proc mesh"
1268        );
1269
1270        // Let the client know that the controller has stopped. Since the monitor
1271        // is cancelled, it will not alert the owner or the subscribers.
1272        // We use a placeholder rank to get an actor id, but really there should
1273        // be a stop event for every rank in the mesh. Since every rank has the
1274        // same owner, we assume the rank doesn't matter, and the owner can just
1275        // assume the stop happened on all actors.
1276        let rank = 0usize;
1277        let event = ActorSupervisionEvent::new(
1278            self.actor_mesh
1279                .get(rank)
1280                .expect("mesh must have at least one rank")
1281                .actor_addr()
1282                .clone(),
1283            None,
1284            ActorStatus::Stopped("ActorMeshController received explicit stop request".to_string()),
1285            None,
1286        );
1287        let failure_message = MeshFailure {
1288            actor_mesh_name: Some(mesh_name.to_string()),
1289            event,
1290            crashed_ranks: vec![],
1291        };
1292        health_state.unhealthy_event = Some(Unhealthy::StreamClosed(failure_message.clone()));
1293        // We don't send a message to the owner on stops, because only the owner
1294        // can request a stop. We just send to subscribers instead, as they did
1295        // not request the stop themselves.
1296        for subscriber in health_state.subscribers.iter() {
1297            send_subscriber_message(cx, subscriber, failure_message.clone());
1298        }
1299
1300        // max_rank and extent are only needed for the deprecated RankedValues.
1301        // TODO: add cmp::Ord to Point for a max() impl.
1302        let max_rank = health_state.statuses.keys().map(|p| p.rank()).max();
1303        let extent = health_state
1304            .statuses
1305            .keys()
1306            .next()
1307            .map(|p| p.extent().clone());
1308
1309        // Cannot use "ActorMesh::stop" as it tries to message the controller.
1310        let result = self
1311            .proc_mesh
1312            .stop_actor_by_id(cx, self.actor_mesh.id().clone(), reason)
1313            .await;
1314
1315        match result {
1316            Ok(statuses) => {
1317                // All stops successful, set actor status on health state.
1318                for (rank, status) in statuses.iter() {
1319                    health_state
1320                        .statuses
1321                        .entry(rank)
1322                        .and_modify(move |s| *s = (status, u64::MAX));
1323                }
1324            }
1325            Err(crate::Error::ActorStopError { statuses }) => {
1326                if let Some(max_rank) = max_rank {
1327                    let extent = extent.expect("no actors in mesh");
1328                    for (rank, status) in statuses.materialized_iter(max_rank).enumerate() {
1329                        *health_state
1330                            .statuses
1331                            .get_mut(&extent.point_of_rank(rank).expect("illegal rank"))
1332                            .unwrap() = (status.clone(), u64::MAX);
1333                    }
1334                }
1335            }
1336            Err(e) => {
1337                return Err(e.into());
1338            }
1339        }
1340
1341        tracing::info!(
1342            actor_id = %cx.instance().self_addr(),
1343            actor_mesh = %mesh_name,
1344            "stopped mesh"
1345        );
1346        Ok(())
1347    }
1348
1349    async fn cleanup_stop(&self, cx: &impl context::Actor, reason: String) -> anyhow::Result<()> {
1350        self.proc_mesh
1351            .stop_actor_by_id(cx, self.actor_mesh.id().clone(), reason)
1352            .await?;
1353        Ok(())
1354    }
1355}
1356
/// Controller for a proc mesh: a `ResourceController` specialized to
/// `ProcMeshRef`, whose `Controlled` implementation supplies the
/// proc-mesh-specific polling, streaming and stop behavior.
pub(crate) type ProcMeshController = ResourceController<ProcMeshRef>;
1359
1360/// `Controlled` implementation for a proc mesh.
1361#[async_trait]
1362impl Controlled for ProcMeshRef {
1363    type StateInner = crate::host_mesh::host_agent::ProcState;
1364
1365    fn stall_counter() -> &'static Counter<u64> {
1366        &PROC_MESH_CONTROLLER_SUPERVISION_STALLS
1367    }
1368
1369    fn id(&self) -> &ResourceId {
1370        ProcMeshRef::id(self).resource_id()
1371    }
1372
1373    fn region(&self) -> &ndslice::Region {
1374        ndslice::view::Ranked::region(self)
1375    }
1376
1377    fn subscribe_to_stream(
1378        &self,
1379        cx: &impl context::Actor,
1380        subscriber: hyperactor::PortRef<resource::State<Self::StateInner>>,
1381    ) -> anyhow::Result<()> {
1382        // A `ProcMeshController` is only ever created for host-backed proc
1383        // meshes (host_mesh.rs spawn path), so a host mesh should always be
1384        // present. Surface a violation as an error rather than panicking — this
1385        // runs from `init`, so a panic would abort the actor. Cast a single
1386        // StreamState to the host-agent mesh so each host streams its procs'
1387        // state back through the cast tree (fanning in at cast actor 0) instead
1388        // of every host dialing the subscriber directly.
1389        let host_mesh = self.hosts().ok_or_else(|| {
1390            anyhow::anyhow!(
1391                "ProcMeshController has no host mesh; it must run on a host-backed proc mesh"
1392            )
1393        })?;
1394
1395        host_mesh.cast_stream_state(cx, ProcMeshRef::id(self).resource_id().clone(), subscriber)?;
1396        Ok(())
1397    }
1398
1399    fn forward_wait_rank_status(
1400        &self,
1401        cx: &impl context::Actor,
1402        msg: resource::WaitRankStatus,
1403    ) -> anyhow::Result<()> {
1404        for proc_id in self.proc_ids() {
1405            crate::host_mesh::host_agent_ref(proc_id.addr().clone()).post(cx, msg.clone());
1406        }
1407        Ok(())
1408    }
1409
1410    async fn poll_states(
1411        &self,
1412        cx: &impl context::Actor,
1413        supervision_display_name: &str,
1414        health_state: &mut HealthState,
1415    ) -> PollResult {
1416        let mesh_name = Controlled::id(self);
1417
1418        match self.states(cx, compute_keepalive()).await {
1419            Err(e) => send_poll_failure(
1420                cx,
1421                ActorSupervisionEvent::new(
1422                    cx.instance().self_addr().clone(),
1423                    Some(supervision_display_name.to_string()),
1424                    ActorStatus::generic_failure(format!(
1425                        "unable to query for proc states: {:?}",
1426                        e
1427                    )),
1428                    None,
1429                ),
1430                mesh_name,
1431                health_state,
1432            ),
1433            Ok(None) => PollResult::Processed { did_notify: false },
1434            Ok(Some(states)) => {
1435                let did_notify =
1436                    health_state.apply_updates_and_notify(&states, |state, health_state| {
1437                        self.notify_proc_state_change(
1438                            cx,
1439                            supervision_display_name,
1440                            state,
1441                            health_state,
1442                        )
1443                    });
1444                PollResult::Processed { did_notify }
1445            }
1446        }
1447    }
1448
1449    fn process_state(
1450        &self,
1451        cx: &impl context::Actor,
1452        state: resource::State<Self::StateInner>,
1453        health_state: &mut HealthState,
1454    ) -> bool {
1455        let Ok(point) = Controlled::region(self).extent().point_of_rank(
1456            state
1457                .state
1458                .as_ref()
1459                .map(|s| s.create_rank)
1460                .unwrap_or(usize::MAX),
1461        ) else {
1462            return false;
1463        };
1464        let changed = health_state.maybe_update(point, state.status.clone(), state.generation);
1465        if !changed {
1466            return false;
1467        }
1468        let display = Controlled::id(self).to_string();
1469        self.notify_proc_state_change(cx, &display, state, health_state)
1470    }
1471
1472    async fn handle_stop_request(
1473        &self,
1474        cx: &impl context::Actor,
1475        _supervision_display_name: &str,
1476        reason: String,
1477        health_state: &mut HealthState,
1478    ) -> anyhow::Result<()> {
1479        let mesh_name = Controlled::id(self);
1480        tracing::info!(
1481            actor_id = %cx.instance().self_addr(),
1482            proc_mesh = %mesh_name,
1483            "ProcMeshController stopping proc mesh"
1484        );
1485        // Marker so subscribers know the mesh is being torn down on request.
1486        let event = ActorSupervisionEvent::new(
1487            cx.instance().self_addr().clone(),
1488            None,
1489            ActorStatus::Stopped("ProcMeshController received explicit stop request".to_string()),
1490            None,
1491        );
1492        let failure_message = MeshFailure {
1493            actor_mesh_name: Some(mesh_name.to_string()),
1494            event,
1495            crashed_ranks: vec![],
1496        };
1497        health_state.unhealthy_event = Some(Unhealthy::StreamClosed(failure_message.clone()));
1498        for subscriber in health_state.subscribers.iter() {
1499            send_subscriber_message(cx, subscriber, failure_message.clone());
1500        }
1501
1502        let names = self.proc_ids().collect::<Vec<hyperactor::ProcAddr>>();
1503        let region = Ranked::region(self).clone();
1504        let Some(hosts) = self.hosts() else {
1505            return Ok(());
1506        };
1507        // stop_proc_mesh waits for every rank to reach a terminating state
1508        // before returning Ok, so we can apply its returned StatusMesh
1509        // verbatim. On error we still got per-rank statuses for whatever
1510        // ranks the host agents reported on; apply those too so health
1511        // state stays as accurate as we can make it.
1512        let max_rank = health_state.statuses.keys().map(|p| p.rank()).max();
1513        let extent = health_state
1514            .statuses
1515            .keys()
1516            .next()
1517            .map(|p| p.extent().clone());
1518        match hosts
1519            .stop_proc_mesh(cx, self.id(), names, region, reason)
1520            .await
1521        {
1522            Ok(statuses) => {
1523                for (rank, status) in statuses.iter() {
1524                    health_state
1525                        .statuses
1526                        .entry(rank)
1527                        .and_modify(move |s| *s = (status, u64::MAX));
1528                }
1529                Ok(())
1530            }
1531            Err(crate::Error::ProcMeshStopError { statuses }) => {
1532                if let (Some(max_rank), Some(extent)) = (max_rank, extent) {
1533                    for (rank, status) in statuses.materialized_iter(max_rank).enumerate() {
1534                        if let Ok(point) = extent.point_of_rank(rank) {
1535                            health_state
1536                                .statuses
1537                                .entry(point)
1538                                .and_modify(|s| *s = (status.clone(), u64::MAX));
1539                        }
1540                    }
1541                }
1542                Err(crate::Error::ProcMeshStopError { statuses }.into())
1543            }
1544            Err(e) => Err(e.into()),
1545        }
1546    }
1547
1548    async fn cleanup_stop(&self, cx: &impl context::Actor, reason: String) -> anyhow::Result<()> {
1549        let names = self.proc_ids().collect::<Vec<hyperactor::ProcAddr>>();
1550        let region = Ranked::region(self).clone();
1551        if let Some(hosts) = self.hosts() {
1552            hosts
1553                .stop_proc_mesh(cx, self.id(), names, region, reason)
1554                .await?;
1555        }
1556        Ok(())
1557    }
1558}
1559
1560impl ProcMeshRef {
1561    /// Translate a polled or streamed `State<ProcState>` into a supervision
1562    /// event on this proc-mesh controller. Returns `true` if a notification
1563    /// was sent (which suppresses the heartbeat path).
1564    fn notify_proc_state_change(
1565        &self,
1566        cx: &impl context::Actor,
1567        supervision_display_name: &str,
1568        state: resource::State<crate::host_mesh::host_agent::ProcState>,
1569        health_state: &mut HealthState,
1570    ) -> bool {
1571        let create_rank = state.state.as_ref().map(|s| s.create_rank);
1572        let actor_status = proc_status_to_actor_status(state.state.and_then(|s| s.proc_status));
1573        let event = ActorSupervisionEvent::new(
1574            cx.instance().self_addr().clone(),
1575            Some(supervision_display_name.to_string()),
1576            actor_status,
1577            None,
1578        );
1579        let rank = create_rank
1580            .and_then(|r| {
1581                ndslice::view::Ranked::region(self)
1582                    .extent()
1583                    .point_of_rank(r)
1584                    .ok()
1585            })
1586            .map(|p| p.rank())
1587            .unwrap_or(0);
1588        send_state_change(cx, rank, event, Controlled::id(self), true, health_state);
1589        true
1590    }
1591}
1592
1593#[cfg(test)]
1594mod tests {
1595    use std::ops::Deref;
1596    use std::time::Duration;
1597
1598    use hyperactor::actor::ActorErrorKind;
1599    use hyperactor::actor::ActorStatus;
1600    use hyperactor::channel::ChannelAddr;
1601    use hyperactor::id::Label;
1602    use hyperactor::supervision::ActorSupervisionEvent;
1603    use ndslice::Extent;
1604    use ndslice::ViewExt;
1605
1606    use super::HealthState;
1607    use super::PollResult;
1608    #[cfg(fbcode_build)]
1609    use super::SUPERVISION_POLL_FREQUENCY;
1610    use super::proc_status_to_actor_status;
1611    use super::send_poll_failure;
1612    use super::send_state_change;
1613    use crate::ActorMesh;
1614    use crate::bootstrap::ProcStatus;
1615    #[cfg(fbcode_build)]
1616    use crate::host_mesh::PROC_SPAWN_MAX_IDLE;
1617    use crate::mesh_id::ActorMeshId;
1618    #[cfg(fbcode_build)]
1619    use crate::mesh_id::HostMeshId;
1620    use crate::mesh_id::ResourceId;
1621    use crate::proc_agent::MESH_ORPHAN_TIMEOUT;
1622    use crate::resource;
1623    use crate::supervision::MeshFailure;
1624    use crate::test_utils::local_host_mesh;
1625    use crate::testactor;
1626    use crate::testing;
1627
1628    #[tokio::test]
1629    async fn poll_failure_consumes_one_terminal_rank_for_owner_notification_bound() {
1630        let instance = testing::instance();
1631        let (owner_port, mut owner_rx) = instance.open_port::<MeshFailure>();
1632        let mesh_name = ResourceId::instance(Label::new("workers").unwrap());
1633        let region: ndslice::Region = ndslice::extent!(gpus = 3).into();
1634        let statuses = (0..3)
1635            .map(|rank| {
1636                (
1637                    region.extent().point_of_rank(rank).unwrap(),
1638                    resource::Status::Running,
1639                )
1640            })
1641            .collect();
1642        let mut health_state = HealthState::new(statuses, Some(owner_port.bind()));
1643
1644        let rank0_event = failed_event(0, "rank 0 failed");
1645        assert!(
1646            health_state
1647                .mark_rank_terminating(0, resource::Status::Failed("rank 0 failed".to_string()))
1648        );
1649        send_state_change(
1650            &instance,
1651            0,
1652            rank0_event.clone(),
1653            &mesh_name,
1654            false,
1655            &mut health_state,
1656        );
1657        let rank0_failure = owner_rx.recv().await.unwrap();
1658        assert_eq!(rank0_failure.crashed_ranks, vec![0]);
1659        assert_eq!(rank0_failure.event, rank0_event);
1660
1661        let poll_event = failed_event(99, "unable to query for actor states");
1662        assert!(matches!(
1663            send_poll_failure(&instance, poll_event.clone(), &mesh_name, &mut health_state),
1664            PollResult::StopMonitoring
1665        ));
1666        let poll_failure = owner_rx.recv().await.unwrap();
1667        assert_eq!(poll_failure.crashed_ranks, vec![1]);
1668        assert_eq!(poll_failure.event, poll_event);
1669
1670        let late_rank1_event = failed_event(1, "late rank 1 failure");
1671        if health_state.mark_rank_terminating(
1672            1,
1673            resource::Status::Failed("late rank 1 failure".to_string()),
1674        ) {
1675            send_state_change(
1676                &instance,
1677                1,
1678                late_rank1_event,
1679                &mesh_name,
1680                false,
1681                &mut health_state,
1682            );
1683        }
1684        assert_eq!(owner_rx.try_recv().unwrap(), None);
1685
1686        let rank2_event = failed_event(2, "rank 2 failed");
1687        assert!(
1688            health_state
1689                .mark_rank_terminating(2, resource::Status::Failed("rank 2 failed".to_string()))
1690        );
1691        send_state_change(
1692            &instance,
1693            2,
1694            rank2_event.clone(),
1695            &mesh_name,
1696            false,
1697            &mut health_state,
1698        );
1699        let rank2_failure = owner_rx.recv().await.unwrap();
1700        assert_eq!(rank2_failure.crashed_ranks, vec![2]);
1701        assert_eq!(rank2_failure.event, rank2_event);
1702
1703        assert_eq!(health_state.first_non_terminating_rank(), None);
1704        assert_eq!(owner_rx.try_recv().unwrap(), None);
1705    }
1706
1707    fn failed_event(rank: usize, message: &str) -> ActorSupervisionEvent {
1708        ActorSupervisionEvent::new(
1709            ResourceId::proc_addr_from_name(ChannelAddr::Local(0), "test_proc")
1710                .actor_addr(format!("worker_{rank}")),
1711            None,
1712            ActorStatus::Failed(ActorErrorKind::Generic(message.to_string())),
1713            None,
1714        )
1715    }
1716
1717    /// Wraps a host mesh's shutdown guard and the spawned host child
1718    /// processes so tests can simulate an unclean host crash by killing
1719    /// the children directly rather than asking an in-mesh actor to
1720    /// `process::exit`, which can also tear down the test binary.
1721    #[cfg(fbcode_build)]
1722    struct TestHostMesh {
1723        guard: crate::host_mesh::HostMeshShutdownGuard,
1724        children: Vec<tokio::process::Child>,
1725    }
1726
1727    #[cfg(fbcode_build)]
1728    impl TestHostMesh {
1729        async fn kill_hosts(&mut self) {
1730            for child in &mut self.children {
1731                let _ = child.start_kill();
1732                let _ = child.wait().await;
1733            }
1734            self.children.clear();
1735        }
1736    }
1737
1738    #[cfg(fbcode_build)]
1739    impl std::ops::Deref for TestHostMesh {
1740        type Target = crate::host_mesh::HostMeshShutdownGuard;
1741
1742        fn deref(&self) -> &Self::Target {
1743            &self.guard
1744        }
1745    }
1746
1747    #[cfg(fbcode_build)]
1748    impl std::ops::DerefMut for TestHostMesh {
1749        fn deref_mut(&mut self) -> &mut Self::Target {
1750            &mut self.guard
1751        }
1752    }
1753
1754    /// Verify that actors spawned without a controller are cleaned up
1755    /// when their keepalive expiry lapses. We:
1756    ///   1. Enable the orphan timeout on the `ProcMeshAgent`.
1757    ///   2. Spawn actors as *system actors* (no `ActorMeshController`).
1758    ///   3. Send a single keepalive with a short expiry time.
1759    ///   4. Wait for the expiry to pass and `SelfCheck` to fire.
1760    ///   5. Assert that the actors are now stopped.
1761    #[tokio::test]
1762    async fn test_orphaned_actors_are_cleaned_up() {
1763        let config = hyperactor_config::global::lock();
1764        // Short orphan timeout so SelfCheck fires frequently.
1765        let _orphan = config.override_key(MESH_ORPHAN_TIMEOUT, Some(Duration::from_secs(1)));
1766
1767        let instance = testing::instance();
1768        let host_mesh = local_host_mesh(2).await;
1769        let proc_mesh = host_mesh
1770            .spawn(instance, "test", Extent::unity(), None, None)
1771            .await
1772            .unwrap();
1773
1774        let actor_name = ActorMeshId::instance(Label::new("orphan_test").unwrap());
1775        // Spawn as a system actor so no controller is created. This lets us
1776        // control keepalive messages directly without the controller
1777        // interfering.
1778        let actor_mesh: ActorMesh<testactor::TestActor> = proc_mesh
1779            .spawn_with_name(instance, actor_name.clone(), &(), None, true)
1780            .await
1781            .unwrap();
1782        assert!(
1783            actor_mesh.deref().extent().num_ranks() > 0,
1784            "should have spawned at least one actor"
1785        );
1786
1787        // Send a keepalive with a short expiry. This is what the
1788        // ActorMeshController would normally do on each supervision poll.
1789        let states = proc_mesh
1790            .actor_states_with_keepalive(
1791                instance,
1792                actor_name.clone(),
1793                Some(std::time::SystemTime::now() + Duration::from_secs(2)),
1794            )
1795            .await
1796            .unwrap();
1797        // All actors should be running right now.
1798        for state in states.values() {
1799            assert_eq!(
1800                state.status,
1801                resource::Status::Running,
1802                "actor should be running before expiry"
1803            );
1804        }
1805
1806        // Poll until all actors are stopped, rather than sleeping a
1807        // fixed duration. The expiry is 2s and SelfCheck fires every 1s,
1808        // so this should converge quickly, but we allow a generous timeout
1809        // for slow CI environments.
1810        let deadline = tokio::time::Instant::now() + Duration::from_secs(30);
1811        loop {
1812            let states = proc_mesh
1813                .actor_states(instance, actor_name.clone())
1814                .await
1815                .unwrap();
1816            if states
1817                .values()
1818                .all(|s| s.status == resource::Status::Stopped)
1819            {
1820                break;
1821            }
1822            assert!(
1823                tokio::time::Instant::now() < deadline,
1824                "timed out waiting for actors to be stopped after keepalive expiry"
1825            );
1826            tokio::time::sleep(Duration::from_millis(200)).await;
1827        }
1828    }
1829
1830    /// Create a multi-process host mesh that propagates the current
1831    /// process's config overrides to child processes via Bootstrap.
1832    #[cfg(fbcode_build)]
1833    async fn host_mesh_with_config(n: usize) -> TestHostMesh {
1834        use hyperactor::channel::ChannelTransport;
1835        use tokio::process::Command;
1836
1837        let program = crate::testresource::get("monarch/hyperactor_mesh/bootstrap");
1838        let mut host_addrs = vec![];
1839        let mut children = Vec::new();
1840        for _ in 0..n {
1841            host_addrs.push(ChannelTransport::Unix.any());
1842        }
1843
1844        for host in host_addrs.iter() {
1845            let mut cmd = Command::new(program.clone());
1846            let boot = crate::Bootstrap::Host {
1847                addr: host.clone(),
1848                command: None,
1849                config: Some(hyperactor_config::global::attrs()),
1850                exit_on_shutdown: false,
1851            };
1852            boot.to_env(&mut cmd);
1853            cmd.kill_on_drop(false);
1854            // SAFETY: pre_exec sets PR_SET_PDEATHSIG so the child is
1855            // cleaned up if the parent (test) process dies.
1856            unsafe {
1857                cmd.pre_exec(crate::bootstrap::install_pdeathsig_kill);
1858            }
1859            children.push(cmd.spawn().unwrap());
1860        }
1861
1862        let host_mesh = crate::HostMeshRef::from_hosts(
1863            HostMeshId::instance(Label::new("test").unwrap()),
1864            host_addrs,
1865        );
1866        TestHostMesh {
1867            guard: crate::host_mesh::HostMesh::take(host_mesh).shutdown_guard(),
1868            children,
1869        }
1870    }
1871
1872    /// Verify that actors are cleaned up via the orphan timeout when the
1873    /// `ActorMeshController`'s host process crashes. Unlike the system-actor
1874    /// test above, this spawns actors through a real controller (via
1875    /// `WrapperActor`) and then kills the controller's host process
1876    /// uncleanly. The agents on the surviving proc mesh detect the expired
1877    /// keepalive and stop the actors.
1878    #[tokio::test]
1879    #[cfg(fbcode_build)]
1880    async fn test_orphaned_actors_cleaned_up_on_controller_crash() {
1881        let config = hyperactor_config::global::lock();
1882        let _orphan = config.override_key(MESH_ORPHAN_TIMEOUT, Some(Duration::from_secs(2)));
1883        let _poll = config.override_key(SUPERVISION_POLL_FREQUENCY, Duration::from_secs(1));
1884        let _proc_spawn = config.override_key(PROC_SPAWN_MAX_IDLE, Duration::from_secs(60));
1885        let _host_spawn = config.override_key(
1886            hyperactor::config::HOST_SPAWN_READY_TIMEOUT,
1887            Duration::from_secs(60),
1888        );
1889
1890        let instance = testing::instance();
1891        let num_replicas = 1;
1892
1893        // Host mesh for the test actors (these survive the crash).
1894        // host_mesh_with_config propagates config overrides to child
1895        // processes via Bootstrap, so agents boot with
1896        // MESH_ORPHAN_TIMEOUT=2s and start the SelfCheck loop.
1897        let mut actor_hm = host_mesh_with_config(num_replicas).await;
1898        let actor_proc_mesh = actor_hm
1899            .spawn(instance, "actors", Extent::unity(), None, None)
1900            .await
1901            .unwrap();
1902
1903        // Host mesh for the wrapper + controller (will be killed).
1904        let mut controller_hm = host_mesh_with_config(1).await;
1905        let controller_proc_mesh = controller_hm
1906            .spawn(instance, "controller", Extent::unity(), None, None)
1907            .await
1908            .unwrap();
1909
1910        let child_name = ActorMeshId::instance(Label::new("orphan_child").unwrap());
1911
1912        // Supervision port required by WrapperActor params.
1913        let (supervision_port, _supervision_receiver) = instance.open_port::<MeshFailure>();
1914        let supervisor = supervision_port.bind();
1915
1916        // Spawn WrapperActor on controller_proc_mesh. Its init() spawns
1917        // ActorMesh<TestActor> on actor_proc_mesh with a real
1918        // ActorMeshController co-located on the controller's process.
1919        let _wrapper_mesh: ActorMesh<testactor::WrapperActor> = controller_proc_mesh
1920            .spawn(
1921                instance,
1922                "wrapper",
1923                &(
1924                    actor_proc_mesh.deref().clone(),
1925                    supervisor,
1926                    child_name.clone(),
1927                ),
1928            )
1929            .await
1930            .unwrap();
1931
1932        // Give the controller time to run at least one CheckState cycle
1933        // (polling every 1s). This is what sends `KeepaliveGetState` to
1934        // each agent, and is what arms the agent's `expiry_time` so the
1935        // agent's `SelfCheck` reaper can cull the actors after the
1936        // controller dies. Polling for `Running` is not enough: actors
1937        // reach `Running` at spawn time before the controller's first
1938        // poll, and if we kill the controller in that window the agents
1939        // never received a keepalive and the orphan timeout never trips.
1940        tokio::time::sleep(Duration::from_secs(3)).await;
1941        let states = actor_proc_mesh
1942            .actor_states(instance, child_name.clone())
1943            .await
1944            .unwrap();
1945        for state in states.values() {
1946            assert_eq!(
1947                state.status,
1948                resource::Status::Running,
1949                "actor should be running before controller crash"
1950            );
1951        }
1952
1953        // Kill the controller's host process uncleanly. The TestActors on
1954        // actor_proc_mesh survive. Killing the host (rather than asking the
1955        // wrapper actor to `process::exit`) is critical: the wrapper runs
1956        // in this same test binary's address space when the host mesh is
1957        // co-located, so an in-process exit would also tear down the test
1958        // runner.
1959        controller_hm.kill_hosts().await;
1960
1961        // Poll until all actors are stopped via the orphan timeout. The
1962        // configured timeout is 2s and `SelfCheck` fires every 2s, so this
1963        // converges quickly; allow generous slack for slow CI environments.
1964        let deadline = tokio::time::Instant::now() + Duration::from_secs(30);
1965        loop {
1966            let states = actor_proc_mesh
1967                .actor_states(instance, child_name.clone())
1968                .await
1969                .unwrap();
1970            if states
1971                .values()
1972                .all(|s| s.status == resource::Status::Stopped)
1973            {
1974                break;
1975            }
1976            assert!(
1977                tokio::time::Instant::now() < deadline,
1978                "timed out waiting for actors to be stopped after controller crash and orphan timeout"
1979            );
1980            tokio::time::sleep(Duration::from_millis(200)).await;
1981        }
1982
1983        let _ = actor_hm.shutdown(instance).await;
1984    }
1985
1986    #[test]
1987    fn test_proc_status_to_actor_status_stopped_cleanly() {
1988        let status = proc_status_to_actor_status(Some(ProcStatus::Stopped {
1989            exit_code: 0,
1990            stderr_tail: vec![],
1991        }));
1992        assert!(
1993            matches!(status, ActorStatus::Stopped(ref msg) if msg.contains("cleanly")),
1994            "expected Stopped, got {:?}",
1995            status
1996        );
1997    }
1998
1999    #[test]
2000    fn test_proc_status_to_actor_status_nonzero_exit() {
2001        let status = proc_status_to_actor_status(Some(ProcStatus::Stopped {
2002            exit_code: 1,
2003            stderr_tail: vec![],
2004        }));
2005        assert!(
2006            matches!(status, ActorStatus::Failed(_)),
2007            "expected Failed, got {:?}",
2008            status
2009        );
2010    }
2011
2012    #[test]
2013    fn test_proc_status_to_actor_status_stopping_is_not_a_failure() {
2014        let status = proc_status_to_actor_status(Some(ProcStatus::Stopping {
2015            started_at: std::time::SystemTime::now(),
2016        }));
2017        assert!(
2018            matches!(status, ActorStatus::Stopped(ref msg) if msg.contains("stopping")),
2019            "expected Stopped, got {:?}",
2020            status
2021        );
2022    }
2023
2024    #[test]
2025    fn test_proc_status_to_actor_status_none() {
2026        let status = proc_status_to_actor_status(None);
2027        assert!(
2028            matches!(status, ActorStatus::Stopped(_)),
2029            "expected Stopped, got {:?}",
2030            status
2031        );
2032    }
2033
2034    #[test]
2035    fn test_proc_status_to_actor_status_killed() {
2036        let status = proc_status_to_actor_status(Some(ProcStatus::Killed {
2037            signal: 9,
2038            core_dumped: false,
2039        }));
2040        assert!(
2041            matches!(status, ActorStatus::Failed(_)),
2042            "expected Failed, got {:?}",
2043            status
2044        );
2045    }
2046
2047    #[test]
2048    fn test_proc_status_to_actor_status_failed() {
2049        let status = proc_status_to_actor_status(Some(ProcStatus::Failed {
2050            reason: "oom".to_string(),
2051        }));
2052        assert!(
2053            matches!(status, ActorStatus::Failed(_)),
2054            "expected Failed, got {:?}",
2055            status
2056        );
2057    }
2058
2059    /// Force the supervision loop to look stalled by handing `check_stall` an
2060    /// `expected_time` several poll intervals in the past, then assert it both
2061    /// logs the warning and records the stall duration. This is the cheapest
2062    /// way to eyeball the warning the controller emits in production without
2063    /// standing up a real mesh.
2064    #[tracing_test::traced_test]
2065    #[test]
2066    fn test_check_stall_logs_when_late() {
2067        use std::time::SystemTime;
2068
2069        use hyperactor::id::ActorId;
2070        use hyperactor::id::ProcId;
2071
2072        let poll = hyperactor_config::global::get(super::SUPERVISION_POLL_FREQUENCY);
2073        // Pretend the CheckState handler was due five poll intervals ago.
2074        let expected_time = SystemTime::now() - poll * 5;
2075
2076        let proc = ProcId::instance(Label::new("stall_demo").unwrap());
2077        let actor_id = ActorId::singleton(Label::new("controller").unwrap(), proc);
2078
2079        super::check_stall(
2080            expected_time,
2081            &actor_id,
2082            &super::ACTOR_MESH_CONTROLLER_SUPERVISION_STALLS,
2083        );
2084
2085        // The reported lateness is `now - expected_time` (~5 poll intervals),
2086        // which humantime renders at full precision (e.g. "50s 343us 134ns").
2087        // Assert only on the stable prefix so the test stays deterministic.
2088        assert!(
2089            logs_contain("Handler<CheckState> is stalled by"),
2090            "expected a stall warning to be logged"
2091        );
2092    }
2093}
hyperactor_mesh/mesh_controller.rs

hyperactor_mesh/
mesh_controller.rs