hyperactor_mesh/supervision.rs
1/*
2 * Copyright (c) Meta Platforms, Inc. and affiliates.
3 * All rights reserved.
4 *
5 * This source code is licensed under the BSD-style license found in the
6 * LICENSE file in the root directory of this source tree.
7 */
8
9//! Messages used in supervision of actor meshes.
10//!
11//! ## Mesh-name propagation
12//!
13//! When a `MeshFailure` is constructed for a supervision event
14//! whose constructing site has the mesh name locally in scope, the
15//! mesh name is carried on `MeshFailure.actor_mesh_name`. The
16//! constructing site does not perform a lookup to obtain the mesh
17//! name; if the mesh name is not locally available at the site,
18//! `None` is correct. `MeshFailure::Display` surfaces the mesh
19//! name as an `on mesh "{name}"` segment when `actor_mesh_name`
20//! is populated; stable identifiers continue to appear in detail
21//! segments where the renderer already includes them.
22//! Python-binding-specific plumbing for this carrier — how a
23//! Python-spawned actor ends up with a mesh base-name string to
24//! supply — lives in `monarch_hyperactor/src/actor.rs`
25//! (`PythonActorParams.mesh_base_name`).
26
27use hyperactor::Bind;
28use hyperactor::Unbind;
29use hyperactor::actor::ActorErrorKind;
30use hyperactor::actor::ActorStatus;
31use hyperactor::context;
32use hyperactor::supervision::ActorSupervisionEvent;
33use serde::Deserialize;
34use serde::Serialize;
35use typeuri::Named;
36
/// Message about a supervision failure on a mesh of actors instead of a single
/// actor.
///
/// Bundles the underlying [`ActorSupervisionEvent`] with mesh-level context:
/// an optional mesh name and the set of affected ranks. The `Display` impl
/// renders all three pieces into a single human-readable line.
#[derive(Clone, Debug, Serialize, Deserialize, Named, PartialEq, Bind, Unbind)]
pub struct MeshFailure {
    /// Mesh name carried by the `MeshFailure` construction site,
    /// when locally available. On the direct actor-handled path
    /// this is the observing PythonActor's mesh base name. On
    /// controller-owned paths this is the monitored mesh name
    /// supplied by the controller path.
    ///
    /// `None` means the constructing site had no mesh name in scope;
    /// `Display` then omits the `on mesh "<name>"` segment entirely.
    pub actor_mesh_name: Option<String>,
    /// The supervision event on an actor located at mesh + rank.
    pub event: ActorSupervisionEvent,
    /// The set of crashed ranks in the mesh. Empty means the event
    /// applies to the whole mesh (e.g. mesh stop, controller timeout).
    /// See `contains_rank` for the membership rule built on this convention.
    pub crashed_ranks: Vec<usize>,
}
// Registers `MeshFailure` with `wirevalue`.
// NOTE(review): presumably required for the type to travel as a typed wire
// value — confirm against the `wirevalue` crate documentation.
wirevalue::register_type!(MeshFailure);
54
55impl MeshFailure {
56 /// Returns true if the given rank is part of this failure.
57 /// A whole-mesh event (empty crashed_ranks) contains every rank.
58 pub fn contains_rank(&self, rank: usize) -> bool {
59 self.crashed_ranks.is_empty() || self.crashed_ranks.contains(&rank)
60 }
61
62 /// Helper function to handle a message to an actor that just wants to forward
63 /// it to the next owner.
64 pub fn default_handler(&self, cx: &impl context::Actor) -> Result<(), anyhow::Error> {
65 // If an actor spawned by this one fails, we can't handle it. We fail
66 // ourselves with a chained error and bubble up to the next owner.
67 let err = ActorErrorKind::UnhandledSupervisionEvent(Box::new(ActorSupervisionEvent::new(
68 cx.instance().self_addr().clone(),
69 None,
70 ActorStatus::Failed(ActorErrorKind::UnhandledSupervisionEvent(Box::new(
71 self.event.clone(),
72 ))),
73 None,
74 )));
75 Err(anyhow::Error::new(err))
76 }
77}
78
79impl std::fmt::Display for MeshFailure {
80 fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
81 let actor_mesh_name = self
82 .actor_mesh_name
83 .as_ref()
84 .map(|m| format!(" on mesh \"{}\"", m))
85 .unwrap_or("".to_string());
86 let ranks = if self.crashed_ranks.is_empty() {
87 String::new()
88 } else {
89 format!(" at ranks {:?}", self.crashed_ranks)
90 };
91 write!(
92 f,
93 "failure{}{} with event: {}",
94 actor_mesh_name, ranks, self.event
95 )
96 }
97}
98
/// Unhealthy state of a mesh. Shared between mesh types.
///
/// Each variant carries a [`MeshFailure`].
#[derive(Debug, Clone)]
pub(crate) enum Unhealthy {
    /// Event stream closed.
    StreamClosed(MeshFailure),
    /// Bad health event received.
    Crashed(MeshFailure),
}
105
106#[cfg(test)]
107mod tests {
    //! Tests that pin `MeshFailure::Display` rendering. The
    //! `proof_*` tests capture exact rendered strings for three
    //! supervision-path shapes. The root-client-undeliverable and
    //! direct-actor-handled-panic paths are each rendered twice, as
    //! `MeshFailure { actor_mesh_name: None, ... }` vs.
    //! `MeshFailure { actor_mesh_name: Some(...), ... }`; the
    //! controller-unreachable path is rendered only with the name
    //! populated, because that construction site always supplies it.
    //! Together they lock down the "with mesh name" and "without
    //! mesh name" rendered output so any regression surfaces here.
115 //!
116 //! The `assert_eq!` literals capture the `ActorAddr::Display`
117 //! output of the checkout the tests were generated against. If
118 //! the identifier encoding changes (e.g. a reference-stack
119 //! refactor lands in the same tree), the literals need to be
120 //! regenerated on the new baseline — the mesh-name-rendering
121 //! behavior this module tests is independent of the id format.
122
123 use hyperactor::actor::ActorErrorKind;
124 use hyperactor::actor::ActorStatus;
125 use hyperactor::channel::ChannelAddr;
126
127 use super::*;
128 use crate::mesh_id::ResourceId;
129
130 fn test_event(name: &str, display_name: Option<String>) -> ActorSupervisionEvent {
131 let proc_id = ResourceId::proc_addr_from_name(ChannelAddr::Local(0), "test_proc");
132 ActorSupervisionEvent::new(
133 proc_id.actor_addr(name),
134 display_name,
135 ActorStatus::Failed(ActorErrorKind::Generic("boom".to_string())),
136 None,
137 )
138 }
139
140 // `MeshFailure::Display` renders the mesh name in its prose
141 // when `actor_mesh_name` is Some, producing the "on mesh \"{name}\""
142 // segment alongside the stable id-bearing event.
143 #[test]
144 fn mesh_failure_display_renders_mesh_name_when_populated() {
145 let failure = MeshFailure {
146 actor_mesh_name: Some("training".to_string()),
147 event: test_event("actor_a", None),
148 crashed_ranks: vec![],
149 };
150 let rendered = format!("{}", failure);
151 assert!(
152 rendered.contains("on mesh \"training\""),
153 "expected rendered output to contain `on mesh \"training\"`; got: {rendered}"
154 );
155 }
156
157 // When `actor_mesh_name` is None, the formatter omits the mesh
158 // segment entirely — the absence degrades gracefully without
159 // changing surrounding prose.
160 #[test]
161 fn mesh_failure_display_omits_mesh_segment_when_none() {
162 let failure = MeshFailure {
163 actor_mesh_name: None,
164 event: test_event("actor_a", None),
165 crashed_ranks: vec![],
166 };
167 let rendered = format!("{}", failure);
168 assert!(
169 !rendered.contains("on mesh"),
170 "expected no `on mesh` segment when actor_mesh_name is None; got: {rendered}"
171 );
172 }
173
174 // When both mesh name and the event's Python-class display_name
175 // are populated, the rendered prose includes both, producing a
176 // user-readable description alongside the stable identifier
177 // carried in the event.
178 #[test]
179 fn mesh_failure_display_renders_mesh_and_python_class() {
180 let failure = MeshFailure {
181 actor_mesh_name: Some("training".to_string()),
182 event: test_event(
183 "actor_a",
184 Some("instance0.<my_module.Philosopher training>".to_string()),
185 ),
186 crashed_ranks: vec![],
187 };
188 let rendered = format!("{}", failure);
189 assert!(
190 rendered.contains("on mesh \"training\""),
191 "expected mesh name segment; got: {rendered}"
192 );
193 assert!(
194 rendered.contains("my_module.Philosopher"),
195 "expected Python-class segment from display_name; got: {rendered}"
196 );
197 }
198
199 // Shared fixture for the proofs: the exact synthesized event shape
200 // that `GlobalClientActor::handle_undeliverable_message` produces
201 // (`hyperactor_mesh/src/global_context.rs:278`): display_name =
202 // None, actor_status = generic_failure("message not delivered: ...").
203 fn undeliverable_synthesized_event() -> ActorSupervisionEvent {
204 let proc_id = ResourceId::proc_addr_from_name(ChannelAddr::Local(0), "worker_proc");
205 ActorSupervisionEvent::new(
206 proc_id.actor_addr("dead_actor"),
207 None, // synthesized site has no PythonActor context; display_name stays None
208 ActorStatus::generic_failure(
209 "message not delivered: undeliverable message error: ... \
210 error: broken link: message returned to global root client"
211 .to_string(),
212 ),
213 None,
214 )
215 }
216
217 // Root-client undeliverable path.
218 //
219 // Transport bounces an undeliverable back to the root client;
220 // `GlobalClientActor::handle_undeliverable_message` synthesizes
221 // an `ActorSupervisionEvent` with `display_name = None` and
222 // `"message not delivered: ..."` status. That event propagates
223 // to a `PythonActor::handle_supervision_event`, which wraps it
224 // in a `MeshFailure`. At the wrap site the observing
225 // `PythonActor`'s `mesh_base_name` is the mesh name locally
226 // available; this test pins what `MeshFailure::Display` renders
227 // when `actor_mesh_name` is `None` vs. `Some("training")` for
228 // that exact synthesized inner event shape.
229 #[test]
230 fn proof_motivating_incident_root_client_undeliverable() {
231 let without_mesh_name = MeshFailure {
232 actor_mesh_name: None,
233 event: undeliverable_synthesized_event(),
234 crashed_ranks: vec![],
235 };
236 let with_mesh_name = MeshFailure {
237 actor_mesh_name: Some("training".to_string()),
238 event: undeliverable_synthesized_event(),
239 crashed_ranks: vec![],
240 };
241 let expected_without = "failure with event: Supervision event: \
242 actor worker_proc@inproc://0,dead_actor failed:\n \
243 message not delivered: undeliverable message error: \
244 ... error: broken link: message returned to global \
245 root client";
246 let expected_with = "failure on mesh \"training\" with event: \
247 Supervision event: actor \
248 worker_proc@inproc://0,dead_actor failed:\n \
249 message not delivered: undeliverable message error: \
250 ... error: broken link: message returned to global \
251 root client";
252 assert_eq!(format!("{}", without_mesh_name), expected_without);
253 assert_eq!(format!("{}", with_mesh_name), expected_with);
254
255 // Note: the inner event here has `display_name = None` (the
256 // synthesis site at `global_context.rs` has no PythonActor
257 // context to populate it), so the inner actor mention
258 // renders via raw `ActorAddr` text. That is a separate concern
259 // from mesh-name plumbing.
260 }
261
262 // Direct actor-handled panic path.
263 //
264 // A `PythonActor` panics in a handler. `Proc::stop_actor`
265 // constructs the `ActorSupervisionEvent` using
266 // `actor.display_name()`, which on a `PythonActor` is the
267 // Python-class-bearing `str(PyInstance)`. The event reaches a
268 // supervising `PythonActor` through the propagation chain,
269 // which wraps it in a `MeshFailure` at
270 // `monarch_hyperactor/src/actor.rs:1072`. At that wrap site the
271 // observing `PythonActor`'s `mesh_base_name` is the mesh name
272 // locally available; this test pins what
273 // `MeshFailure::Display` renders when `actor_mesh_name` is
274 // `None` vs. `Some("training")` for a panicked-event inner
275 // shape that already carries a Python-class `display_name`.
276 #[test]
277 fn proof_direct_actor_handled_panic() {
278 let panicked_event = {
279 let proc_id = ResourceId::proc_addr_from_name(ChannelAddr::Local(0), "worker_proc");
280 ActorSupervisionEvent::new(
281 proc_id.actor_addr("philosopher_1"),
282 // `Proc::stop_actor` populates this via
283 // `actor.display_name()` on a PythonActor — which
284 // returns the Python-class-bearing `str(PyInstance)`.
285 Some("instance0.<monarch_examples.dining.Philosopher training>".to_string()),
286 ActorStatus::Failed(ActorErrorKind::Generic(
287 "IndexError: list index out of range".to_string(),
288 )),
289 None,
290 )
291 };
292 let without_mesh_name = MeshFailure {
293 actor_mesh_name: None,
294 event: panicked_event.clone(),
295 crashed_ranks: vec![],
296 };
297 let with_mesh_name = MeshFailure {
298 actor_mesh_name: Some("training".to_string()),
299 event: panicked_event,
300 crashed_ranks: vec![],
301 };
302 let expected_without = "failure with event: Supervision event: actor \
303 instance0.<monarch_examples.dining.Philosopher \
304 training> failed:\n \
305 IndexError: list index out of range";
306 let expected_with = "failure on mesh \"training\" with event: \
307 Supervision event: actor \
308 instance0.<monarch_examples.dining.Philosopher \
309 training> failed:\n \
310 IndexError: list index out of range";
311 assert_eq!(format!("{}", without_mesh_name), expected_without);
312 assert_eq!(format!("{}", with_mesh_name), expected_with);
313 }
314
315 // Controller-unreachable path.
316 //
317 // When the controller for a mesh becomes unreachable, code in
318 // `actor_mesh.rs` synthesizes a `MeshFailure` with
319 // `actor_mesh_name: Some(self.id().to_string())` — the slot is
320 // already populated on this path, and the inner event's
321 // `display_name` is `None` because the construction site has
322 // no `PythonActor` context. This test pins the rendered string
323 // for that exact shape.
324 #[test]
325 fn proof_controller_unreachable() {
326 let controller_timeout_event = {
327 let proc_id = ResourceId::proc_addr_from_name(ChannelAddr::Local(0), "controller_proc");
328 ActorSupervisionEvent::new(
329 proc_id.actor_addr("training_controller"),
330 None,
331 ActorStatus::generic_failure(
332 "timed out reaching controller ... Assuming controller's proc is dead"
333 .to_string(),
334 ),
335 None,
336 )
337 };
338 let failure = MeshFailure {
339 actor_mesh_name: Some("training".to_string()),
340 event: controller_timeout_event,
341 crashed_ranks: vec![],
342 };
343 let expected = "failure on mesh \"training\" with event: \
344 Supervision event: actor \
345 controller_proc@inproc://0,training_controller \
346 failed:\n \
347 timed out reaching controller ... Assuming \
348 controller's proc is dead";
349 assert_eq!(format!("{}", failure), expected);
350 }
351}