Skip to main content

hyperactor_mesh/
supervision.rs

1/*
2 * Copyright (c) Meta Platforms, Inc. and affiliates.
3 * All rights reserved.
4 *
5 * This source code is licensed under the BSD-style license found in the
6 * LICENSE file in the root directory of this source tree.
7 */
8
9//! Messages used in supervision of actor meshes.
10//!
11//! ## Mesh-name propagation
12//!
13//! When a `MeshFailure` is constructed for a supervision event
14//! whose constructing site has the mesh name locally in scope, the
15//! mesh name is carried on `MeshFailure.actor_mesh_name`. The
16//! constructing site does not perform a lookup to obtain the mesh
17//! name; if the mesh name is not locally available at the site,
18//! `None` is correct. `MeshFailure::Display` surfaces the mesh
19//! name as an `on mesh "{name}"` segment when `actor_mesh_name`
20//! is populated; stable identifiers continue to appear in detail
21//! segments where the renderer already includes them.
22//! Python-binding-specific plumbing for this carrier — how a
23//! Python-spawned actor ends up with a mesh base-name string to
24//! supply — lives in `monarch_hyperactor/src/actor.rs`
25//! (`PythonActorParams.mesh_base_name`).
26
27use hyperactor::Bind;
28use hyperactor::Unbind;
29use hyperactor::actor::ActorErrorKind;
30use hyperactor::actor::ActorStatus;
31use hyperactor::context;
32use hyperactor::supervision::ActorSupervisionEvent;
33use serde::Deserialize;
34use serde::Serialize;
35use typeuri::Named;
36
/// A supervision failure reported for a mesh of actors, as opposed to a
/// failure reported for one standalone actor.
///
/// The value wraps the underlying [`ActorSupervisionEvent`] and records
/// which ranks of the mesh it applies to; see
/// [`MeshFailure::contains_rank`] for how an empty rank list is
/// interpreted. Its `Display` impl renders as
/// `failure[ on mesh "<name>"][ at ranks <ranks>] with event: <event>`.
#[derive(Clone, Debug, Serialize, Deserialize, Named, PartialEq, Bind, Unbind)]
pub struct MeshFailure {
    /// Mesh name carried by the `MeshFailure` construction site,
    /// when locally available. On the direct actor-handled path
    /// this is the observing PythonActor's mesh base name. On
    /// controller-owned paths this is the monitored mesh name
    /// supplied by the controller path.
    ///
    /// `None` means the constructing site did not have the name in
    /// scope; no lookup is performed to fill it in.
    pub actor_mesh_name: Option<String>,
    /// The supervision event on an actor located at mesh + rank.
    pub event: ActorSupervisionEvent,
    /// The set of crashed ranks in the mesh. Empty means the event
    /// applies to the whole mesh (e.g. mesh stop, controller timeout).
    pub crashed_ranks: Vec<usize>,
}
// NOTE(review): registers `MeshFailure` with `wirevalue`; presumably this
// is what lets the type travel as a wire message — confirm against the
// `wirevalue` documentation.
wirevalue::register_type!(MeshFailure);
54
55impl MeshFailure {
56    /// Returns true if the given rank is part of this failure.
57    /// A whole-mesh event (empty crashed_ranks) contains every rank.
58    pub fn contains_rank(&self, rank: usize) -> bool {
59        self.crashed_ranks.is_empty() || self.crashed_ranks.contains(&rank)
60    }
61
62    /// Helper function to handle a message to an actor that just wants to forward
63    /// it to the next owner.
64    pub fn default_handler(&self, cx: &impl context::Actor) -> Result<(), anyhow::Error> {
65        // If an actor spawned by this one fails, we can't handle it. We fail
66        // ourselves with a chained error and bubble up to the next owner.
67        let err = ActorErrorKind::UnhandledSupervisionEvent(Box::new(ActorSupervisionEvent::new(
68            cx.instance().self_addr().clone(),
69            None,
70            ActorStatus::Failed(ActorErrorKind::UnhandledSupervisionEvent(Box::new(
71                self.event.clone(),
72            ))),
73            None,
74        )));
75        Err(anyhow::Error::new(err))
76    }
77}
78
79impl std::fmt::Display for MeshFailure {
80    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
81        let actor_mesh_name = self
82            .actor_mesh_name
83            .as_ref()
84            .map(|m| format!(" on mesh \"{}\"", m))
85            .unwrap_or("".to_string());
86        let ranks = if self.crashed_ranks.is_empty() {
87            String::new()
88        } else {
89            format!(" at ranks {:?}", self.crashed_ranks)
90        };
91        write!(
92            f,
93            "failure{}{} with event: {}",
94            actor_mesh_name, ranks, self.event
95        )
96    }
97}
98
/// Unhealthy states, shared between mesh types. Each variant carries the
/// associated [`MeshFailure`].
#[derive(Debug, Clone)]
pub(crate) enum Unhealthy {
    /// The event stream closed.
    StreamClosed(MeshFailure),
    /// A bad health event was received.
    Crashed(MeshFailure),
}
105
#[cfg(test)]
mod tests {
    //! Tests that pin `MeshFailure::Display` rendering and the rank
    //! semantics of `MeshFailure::contains_rank`. The `proof_*` tests
    //! capture exact rendered strings for three supervision-path
    //! shapes, paired for each path as
    //! `MeshFailure { actor_mesh_name: None, ... }` vs.
    //! `MeshFailure { actor_mesh_name: Some(...), ... }`, so the
    //! "with mesh name" and "without mesh name" rendered output is
    //! locked down and any regression surfaces here.
    //!
    //! The `assert_eq!` literals capture the `ActorAddr::Display`
    //! output of the checkout the tests were generated against. If
    //! the identifier encoding changes (e.g. a reference-stack
    //! refactor lands in the same tree), the literals need to be
    //! regenerated on the new baseline — the mesh-name-rendering
    //! behavior this module tests is independent of the id format.

    use hyperactor::actor::ActorErrorKind;
    use hyperactor::actor::ActorStatus;
    use hyperactor::channel::ChannelAddr;

    use super::*;
    use crate::mesh_id::ResourceId;

    // Builds a failed supervision event for actor `name` on the local
    // `test_proc` proc, with a generic "boom" error and the given
    // optional display name.
    fn test_event(name: &str, display_name: Option<String>) -> ActorSupervisionEvent {
        let proc_id = ResourceId::proc_addr_from_name(ChannelAddr::Local(0), "test_proc");
        ActorSupervisionEvent::new(
            proc_id.actor_addr(name),
            display_name,
            ActorStatus::Failed(ActorErrorKind::Generic("boom".to_string())),
            None,
        )
    }

    // `MeshFailure::Display` renders the mesh name in its prose
    // when `actor_mesh_name` is Some, producing the "on mesh \"{name}\""
    // segment alongside the stable id-bearing event.
    #[test]
    fn mesh_failure_display_renders_mesh_name_when_populated() {
        let failure = MeshFailure {
            actor_mesh_name: Some("training".to_string()),
            event: test_event("actor_a", None),
            crashed_ranks: vec![],
        };
        let rendered = format!("{}", failure);
        assert!(
            rendered.contains("on mesh \"training\""),
            "expected rendered output to contain `on mesh \"training\"`; got: {rendered}"
        );
    }

    // When `actor_mesh_name` is None, the formatter omits the mesh
    // segment entirely — the absence degrades gracefully without
    // changing surrounding prose.
    #[test]
    fn mesh_failure_display_omits_mesh_segment_when_none() {
        let failure = MeshFailure {
            actor_mesh_name: None,
            event: test_event("actor_a", None),
            crashed_ranks: vec![],
        };
        let rendered = format!("{}", failure);
        assert!(
            !rendered.contains("on mesh"),
            "expected no `on mesh` segment when actor_mesh_name is None; got: {rendered}"
        );
    }

    // When both mesh name and the event's Python-class display_name
    // are populated, the rendered prose includes both, producing a
    // user-readable description alongside the stable identifier
    // carried in the event.
    #[test]
    fn mesh_failure_display_renders_mesh_and_python_class() {
        let failure = MeshFailure {
            actor_mesh_name: Some("training".to_string()),
            event: test_event(
                "actor_a",
                Some("instance0.<my_module.Philosopher training>".to_string()),
            ),
            crashed_ranks: vec![],
        };
        let rendered = format!("{}", failure);
        assert!(
            rendered.contains("on mesh \"training\""),
            "expected mesh name segment; got: {rendered}"
        );
        assert!(
            rendered.contains("my_module.Philosopher"),
            "expected Python-class segment from display_name; got: {rendered}"
        );
    }

    // A non-empty `crashed_ranks` list is rendered as an
    // ` at ranks [..]` segment (the `Debug` form of the `Vec`), placed
    // after the optional mesh-name segment and before the event.
    #[test]
    fn mesh_failure_display_renders_crashed_ranks() {
        let with_mesh_name = MeshFailure {
            actor_mesh_name: Some("training".to_string()),
            event: test_event("actor_a", None),
            crashed_ranks: vec![1, 3],
        };
        let rendered = format!("{}", with_mesh_name);
        assert!(
            rendered.starts_with("failure on mesh \"training\" at ranks [1, 3] with event: "),
            "expected mesh name followed by ranks segment; got: {rendered}"
        );

        let without_mesh_name = MeshFailure {
            actor_mesh_name: None,
            event: test_event("actor_a", None),
            crashed_ranks: vec![0],
        };
        let rendered = format!("{}", without_mesh_name);
        assert!(
            rendered.starts_with("failure at ranks [0] with event: "),
            "expected ranks segment directly after `failure`; got: {rendered}"
        );
    }

    // An empty `crashed_ranks` list (whole-mesh event) produces no
    // ranks segment at all.
    #[test]
    fn mesh_failure_display_omits_ranks_segment_when_empty() {
        let failure = MeshFailure {
            actor_mesh_name: None,
            event: test_event("actor_a", None),
            crashed_ranks: vec![],
        };
        let rendered = format!("{}", failure);
        assert!(
            rendered.starts_with("failure with event: "),
            "expected no ranks segment when crashed_ranks is empty; got: {rendered}"
        );
    }

    // `contains_rank` treats an empty `crashed_ranks` as a whole-mesh
    // event covering every rank, and otherwise reports list membership.
    #[test]
    fn contains_rank_whole_mesh_and_specific_ranks() {
        let whole_mesh = MeshFailure {
            actor_mesh_name: None,
            event: test_event("actor_a", None),
            crashed_ranks: vec![],
        };
        assert!(whole_mesh.contains_rank(0));
        assert!(whole_mesh.contains_rank(usize::MAX));

        let partial = MeshFailure {
            actor_mesh_name: None,
            event: test_event("actor_a", None),
            crashed_ranks: vec![1, 3],
        };
        assert!(partial.contains_rank(1));
        assert!(partial.contains_rank(3));
        assert!(!partial.contains_rank(0));
        assert!(!partial.contains_rank(2));
    }

    // Shared fixture for the proofs: the exact synthesized event shape
    // that `GlobalClientActor::handle_undeliverable_message` produces
    // (in `hyperactor_mesh/src/global_context.rs`): display_name =
    // None, actor_status = generic_failure("message not delivered: ...").
    fn undeliverable_synthesized_event() -> ActorSupervisionEvent {
        let proc_id = ResourceId::proc_addr_from_name(ChannelAddr::Local(0), "worker_proc");
        ActorSupervisionEvent::new(
            proc_id.actor_addr("dead_actor"),
            None, // synthesized site has no PythonActor context; display_name stays None
            ActorStatus::generic_failure(
                "message not delivered: undeliverable message error: ... \
                 error: broken link: message returned to global root client"
                    .to_string(),
            ),
            None,
        )
    }

    // Root-client undeliverable path.
    //
    // Transport bounces an undeliverable back to the root client;
    // `GlobalClientActor::handle_undeliverable_message` synthesizes
    // an `ActorSupervisionEvent` with `display_name = None` and
    // `"message not delivered: ..."` status. That event propagates
    // to a `PythonActor::handle_supervision_event`, which wraps it
    // in a `MeshFailure`. At the wrap site the observing
    // `PythonActor`'s `mesh_base_name` is the mesh name locally
    // available; this test pins what `MeshFailure::Display` renders
    // when `actor_mesh_name` is `None` vs. `Some("training")` for
    // that exact synthesized inner event shape.
    #[test]
    fn proof_motivating_incident_root_client_undeliverable() {
        let without_mesh_name = MeshFailure {
            actor_mesh_name: None,
            event: undeliverable_synthesized_event(),
            crashed_ranks: vec![],
        };
        let with_mesh_name = MeshFailure {
            actor_mesh_name: Some("training".to_string()),
            event: undeliverable_synthesized_event(),
            crashed_ranks: vec![],
        };
        let expected_without = "failure with event: Supervision event: \
                                actor worker_proc@inproc://0,dead_actor failed:\n  \
                                message not delivered: undeliverable message error: \
                                ... error: broken link: message returned to global \
                                root client";
        let expected_with = "failure on mesh \"training\" with event: \
                             Supervision event: actor \
                             worker_proc@inproc://0,dead_actor failed:\n  \
                             message not delivered: undeliverable message error: \
                             ... error: broken link: message returned to global \
                             root client";
        assert_eq!(format!("{}", without_mesh_name), expected_without);
        assert_eq!(format!("{}", with_mesh_name), expected_with);

        // Note: the inner event here has `display_name = None` (the
        // synthesis site at `global_context.rs` has no PythonActor
        // context to populate it), so the inner actor mention
        // renders via raw `ActorAddr` text. That is a separate concern
        // from mesh-name plumbing.
    }

    // Direct actor-handled panic path.
    //
    // A `PythonActor` panics in a handler. `Proc::stop_actor`
    // constructs the `ActorSupervisionEvent` using
    // `actor.display_name()`, which on a `PythonActor` is the
    // Python-class-bearing `str(PyInstance)`. The event reaches a
    // supervising `PythonActor` through the propagation chain,
    // which wraps it in a `MeshFailure` in
    // `monarch_hyperactor/src/actor.rs`. At that wrap site the
    // observing `PythonActor`'s `mesh_base_name` is the mesh name
    // locally available; this test pins what
    // `MeshFailure::Display` renders when `actor_mesh_name` is
    // `None` vs. `Some("training")` for a panicked-event inner
    // shape that already carries a Python-class `display_name`.
    #[test]
    fn proof_direct_actor_handled_panic() {
        let panicked_event = {
            let proc_id = ResourceId::proc_addr_from_name(ChannelAddr::Local(0), "worker_proc");
            ActorSupervisionEvent::new(
                proc_id.actor_addr("philosopher_1"),
                // `Proc::stop_actor` populates this via
                // `actor.display_name()` on a PythonActor — which
                // returns the Python-class-bearing `str(PyInstance)`.
                Some("instance0.<monarch_examples.dining.Philosopher training>".to_string()),
                ActorStatus::Failed(ActorErrorKind::Generic(
                    "IndexError: list index out of range".to_string(),
                )),
                None,
            )
        };
        let without_mesh_name = MeshFailure {
            actor_mesh_name: None,
            event: panicked_event.clone(),
            crashed_ranks: vec![],
        };
        let with_mesh_name = MeshFailure {
            actor_mesh_name: Some("training".to_string()),
            event: panicked_event,
            crashed_ranks: vec![],
        };
        let expected_without = "failure with event: Supervision event: actor \
                                instance0.<monarch_examples.dining.Philosopher \
                                training> failed:\n  \
                                IndexError: list index out of range";
        let expected_with = "failure on mesh \"training\" with event: \
                             Supervision event: actor \
                             instance0.<monarch_examples.dining.Philosopher \
                             training> failed:\n  \
                             IndexError: list index out of range";
        assert_eq!(format!("{}", without_mesh_name), expected_without);
        assert_eq!(format!("{}", with_mesh_name), expected_with);
    }

    // Controller-unreachable path.
    //
    // When the controller for a mesh becomes unreachable, code in
    // `actor_mesh.rs` synthesizes a `MeshFailure` with
    // `actor_mesh_name: Some(self.id().to_string())` — the slot is
    // already populated on this path, and the inner event's
    // `display_name` is `None` because the construction site has
    // no `PythonActor` context. This test pins the rendered string
    // for that exact shape.
    #[test]
    fn proof_controller_unreachable() {
        let controller_timeout_event = {
            let proc_id = ResourceId::proc_addr_from_name(ChannelAddr::Local(0), "controller_proc");
            ActorSupervisionEvent::new(
                proc_id.actor_addr("training_controller"),
                None,
                ActorStatus::generic_failure(
                    "timed out reaching controller ... Assuming controller's proc is dead"
                        .to_string(),
                ),
                None,
            )
        };
        let failure = MeshFailure {
            actor_mesh_name: Some("training".to_string()),
            event: controller_timeout_event,
            crashed_ranks: vec![],
        };
        let expected = "failure on mesh \"training\" with event: \
                        Supervision event: actor \
                        controller_proc@inproc://0,training_controller \
                        failed:\n  \
                        timed out reaching controller ... Assuming \
                        controller's proc is dead";
        assert_eq!(format!("{}", failure), expected);
    }
}
351}