hyperactor/
introspect.rs

1/*
2 * Copyright (c) Meta Platforms, Inc. and affiliates.
3 * All rights reserved.
4 *
5 * This source code is licensed under the BSD-style license found in the
6 * LICENSE file in the root directory of this source tree.
7 */
8
9//! Introspection protocol for hyperactor actors.
10//!
11//! Every actor has a dedicated introspect task that handles
12//! [`IntrospectMessage`] by reading [`InstanceCell`] state directly,
13//! without going through the actor's message loop. This means:
14//!
15//! - Stuck actors can be introspected (the task runs independently).
16//! - Introspection does not perturb observed state (no Heisenberg).
17//! - Live status is reported accurately.
18//!
19//! Infrastructure actors publish domain-specific metadata via
20//! `publish_attrs()`, which the introspect task reads for Entity-view
21//! queries. Non-addressable children (e.g., system procs) are
22//! resolved via a callback registered on [`InstanceCell`].
23//!
24//! Callers navigate topology by fetching an [`IntrospectResult`] and
25//! following its `children` references.
26//!
27//! # Design Invariants
28//!
29//! The introspection subsystem maintains twelve invariants (S1--S12).
30//! Each is documented at the code site that enforces it.
31//!
32//! - **S1.** Introspection must not depend on actor responsiveness --
33//!   a wedged actor can still be introspected (runtime task, not
34//!   actor loop).
35//! - **S2.** Introspection must not perturb observed state -- reading
36//!   `InstanceCell` never sets `last_message_handler` to
37//!   `IntrospectMessage`.
38//! - **S3.** Sender routing is unchanged -- senders target the same
39//!   `PortId` (`IntrospectMessage::port()`) across processes.
40//! - **S4.** `IntrospectMessage` never produces a `WorkCell` --
41//!   pre-registration via `open_message_port` gives the introspect
42//!   port its own channel, independent of the actor's work queue.
43//! - **S5.** Replies never use `PanickingMailboxSender` -- the
44//!   introspect task replies via `Mailbox::serialize_and_send_once`.
45//! - **S6.** View semantics are stable -- Actor view uses live
46//!   structural state + supervision children; Entity view uses
47//!   published properties + domain children.
48//! - **S7.** `QueryChild` must work without actor handlers -- system
49//!   procs are resolved via a per-actor callback on `InstanceCell`.
50//! - **S8.** Published properties are constrained -- actors cannot
51//!   publish `Root` or `Error` payloads (only `Host` and `Proc`
52//!   variants).
53//! - **S9.** Port binding is single source of truth -- the introspect
54//!   port is bound exactly once via `bind_actor_port()` in
55//!   `Instance::new()`.
56//! - **S10.** Introspect receiver lifecycle -- created in
57//!   `Instance::new()`, spawned in `start()`, dropped in
58//!   `child_instance()`.
59//! - **S11.** Terminated snapshots do not keep actors resolvable --
60//!   `store_terminated_snapshot` writes to the proc's snapshot map,
61//!   not the instances map. `resolve_actor_ref` checks terminal
62//!   status independently and is unaffected by snapshot storage.
63//! - **S12.** Introspection must not impair actor liveness --
64//!   introspection queries (including DashMap reads for actor
65//!   enumeration) must not cause convoy starvation or scheduling
66//!   delays that stall concurrent actor spawn/stop operations.
67//!
68//! ## Introspection key invariants (IK-*)
69//!
70//! - **IK-1 (metadata completeness):** Every actor-runtime
71//!   introspection key must carry `@meta(INTROSPECT = ...)` with
72//!   non-empty `name` and `desc`.
73//! - **IK-2 (short-name uniqueness):** No two introspection keys may
74//!   share the same `IntrospectAttr.name`. Duplicates would break the
75//!   FQ-to-short HTTP remap and schema output.
76//!
77//! ## Failure introspection invariants (FI-*)
78//!
79//! The FailureInfo presentation type lives in
80//! `hyperactor_mesh::introspect`; these invariants are documented
81//! here because the enforcement sites are in hyperactor (`proc.rs`
82//! `serve()`, `live_actor_payload`).
83//!
84//! - **FI-1 (event-before-status):** All `InstanceCell` state that
85//!   `live_actor_payload` reads must be written BEFORE
86//!   `change_status()` transitions to terminal.
87//! - **FI-2 (write-once):** `InstanceCellState::supervision_event` is
88//!   written at most once per actor lifetime.
89//! - **FI-3 (failure attrs <-> status):** Failure attrs are present
90//!   iff status is `"failed"`.
91//! - **FI-4 (is_propagated <-> root_cause_actor):**
92//!   `failure_is_propagated == true` iff `failure_root_cause_actor !=
93//!   this_actor_id`.
94//! - **FI-5 (is_poisoned <-> failed_actor_count):** `is_poisoned ==
95//!   true` iff `failed_actor_count > 0`.
96//! - **FI-6 (clean stop = no artifacts):** When an actor stops
97//!   cleanly, `supervision_event` is `None`, failure attrs are
98//!   absent, and the actor does not contribute to
99//!   `failed_actor_count`.
100//! - **FI-7 (propagated-stopped-root-cause):** When a failed actor's
101//!   supervision chain bottoms out in a `Stopped` child event,
102//!   structured failure metadata must still name the stopped child as
103//!   `failure_root_cause_actor`.
104//! - **FI-8 (propagation-classification):** `failure_is_propagated`
105//!   is derived from root-cause actor identity; a parent that failed
106//!   due to a child's event must report `failure_is_propagated ==
107//!   true`.
108//!
109//! ## Attrs view invariants (AV-*)
110//!
111//! These govern the typed view layer (`ActorAttrsView`). The full
112//! AV-* / DP-* family is documented in `hyperactor_mesh::introspect`;
113//! the subset relevant to this crate:
114//!
115//! - **AV-1 (view-roundtrip):** For each view V,
116//!   `V::from_attrs(&v.to_attrs()) == Ok(v)`.
117//! - **AV-2 (required-key-strictness):** `from_attrs` fails iff
118//!   required keys for that view are missing.
119//! - **AV-3 (unknown-key-tolerance):** Unknown attrs keys must not
120//!   affect successful decode outcome.
121
122use std::fmt;
123use std::str::FromStr;
124use std::time::SystemTime;
125
126use hyperactor_config::Attrs;
127use hyperactor_config::INTROSPECT;
128use hyperactor_config::IntrospectAttr;
129use hyperactor_config::declare_attrs;
130use serde::Deserialize;
131use serde::Serialize;
132use typeuri::Named;
133
134use crate::InstanceCell;
135use crate::reference;
136
/// Typed reference to an introspectable entity.
///
/// This is the generic hyperactor layer — it knows about procs and
/// actors, not mesh-specific concepts like root or host.
///
/// Port references are intentionally excluded — introspection
/// does not address individual ports. The [`FromStr`] impl enforces
/// this: a string that parses as a port reference is rejected with
/// `ReferenceParsingError::WrongType`.
///
/// `Display` delegates to the wrapped id, and `From` impls exist for
/// both `reference::ProcId` and `reference::ActorId`.
#[derive(Debug, Clone, PartialEq, Eq, Hash, Serialize, Deserialize, Named)]
pub enum IntrospectRef {
    /// A proc reference.
    Proc(reference::ProcId),
    /// An actor reference.
    Actor(reference::ActorId),
}
// `IntrospectRef` is the element type of the `CHILDREN` attr
// (`Vec<IntrospectRef>`). Presumably this macro provides the
// attr-value impl that makes that possible — TODO confirm against
// `hyperactor_config::impl_attrvalue!`.
hyperactor_config::impl_attrvalue!(IntrospectRef);
152
153impl fmt::Display for IntrospectRef {
154    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
155        match self {
156            Self::Proc(id) => fmt::Display::fmt(id, f),
157            Self::Actor(id) => fmt::Display::fmt(id, f),
158        }
159    }
160}
161
162impl FromStr for IntrospectRef {
163    type Err = reference::ReferenceParsingError;
164
165    fn from_str(s: &str) -> Result<Self, Self::Err> {
166        let r: reference::Reference = s.parse()?;
167        match r {
168            reference::Reference::Proc(id) => Ok(Self::Proc(id)),
169            reference::Reference::Actor(id) => Ok(Self::Actor(id)),
170            reference::Reference::Port(_) => Err(reference::ReferenceParsingError::WrongType(
171                "port references are not valid introspection references".to_string(),
172            )),
173        }
174    }
175}
176
177impl From<reference::ProcId> for IntrospectRef {
178    fn from(id: reference::ProcId) -> Self {
179        Self::Proc(id)
180    }
181}
182
183impl From<reference::ActorId> for IntrospectRef {
184    fn from(id: reference::ActorId) -> Self {
185        Self::Actor(id)
186    }
187}
188
189// Introspection attr keys — actor-runtime concepts.
190//
191// These keys are populated by the introspect handler from
192// InstanceCell data. Mesh-topology keys (node_type, addr, num_procs,
193// etc.) are declared in hyperactor_mesh::introspect.
194//
195// Naming convention:
196//
197// - Attr names are node-type-agnostic. The `node_type` attr (from the
198//   mesh layer) identifies what kind of node it is; individual attr
199//   names don't repeat that. So `status`, not `actor_status`.
200// - Related attrs share a prefix to form a group. The `failure_*`
201//   keys decompose failure info into flat attrs — the `failure_`
202//   prefix groups them semantically.
203// - `actor_type` is an exception: the `actor_` prefix disambiguates
204//   it from `node_type` (mesh-layer concept). `actor_type` is the
205//   Rust actor type name; `node_type` is the topology role.
206// - Use real types where possible (e.g. SystemTime for timestamps),
207//   not String. Serialization format is a presentation concern.
208// - Internal key names are fully-qualified by `declare_attrs!`
209//   (module_path + attr constant), e.g.
210//   `hyperactor::introspect::status`.
211// - HTTP/schema public key names come from `@meta(INTROSPECT =
212//   IntrospectAttr { name, desc })`. Keep `name` explicit so API
213//   stability is decoupled from internal refactors.
214//
215// See IK-1 (metadata completeness) and IK-2 (short-name uniqueness)
216// in module doc.
declare_attrs! {
    /// Actor lifecycle status: "running", "stopped", "failed".
    ///
    /// Together with `STATUS_REASON`, these two attrs replace the
    /// former `actor_status` prefix protocol (`"stopped:reason"`,
    /// `"failed:reason"`) with structured fields, eliminating string
    /// prefix parsing in consumers.
    ///
    /// No default is declared; `ActorAttrsView::from_attrs` reports
    /// a missing value as `MissingKey`.
    @meta(INTROSPECT = IntrospectAttr {
        name: "status".into(),
        desc: "Actor lifecycle status: running, stopped, failed".into(),
    })
    pub attr STATUS: String;

    /// Reason for stop/failure (absent when running).
    ///
    /// `ActorAttrsView::from_attrs` rejects this attr (IA-3) when
    /// `STATUS` is neither "stopped" nor "failed". A terminal status
    /// without a reason is accepted.
    @meta(INTROSPECT = IntrospectAttr {
        name: "status_reason".into(),
        desc: "Reason for stop/failure (absent when running)".into(),
    })
    pub attr STATUS_REASON: String;

    /// Fully-qualified actor type name.
    ///
    /// No default is declared; `ActorAttrsView::from_attrs` reports
    /// a missing value as `MissingKey`.
    @meta(INTROSPECT = IntrospectAttr {
        name: "actor_type".into(),
        desc: "Fully-qualified actor type name".into(),
    })
    pub attr ACTOR_TYPE: String;

    /// Number of messages processed by this actor.
    ///
    /// Declared with a default of `0`.
    @meta(INTROSPECT = IntrospectAttr {
        name: "messages_processed".into(),
        desc: "Number of messages processed by this actor".into(),
    })
    pub attr MESSAGES_PROCESSED: u64 = 0;

    /// Timestamp when this actor was created.
    ///
    /// Optional in `ActorAttrsView` (decoded as `Option<SystemTime>`).
    @meta(INTROSPECT = IntrospectAttr {
        name: "created_at".into(),
        desc: "Timestamp when this actor was created".into(),
    })
    pub attr CREATED_AT: SystemTime;

    /// Name of the last message handler invoked.
    ///
    /// Optional in `ActorAttrsView` (decoded as `Option<String>`).
    @meta(INTROSPECT = IntrospectAttr {
        name: "last_handler".into(),
        desc: "Name of the last message handler invoked".into(),
    })
    pub attr LAST_HANDLER: String;

    /// Total CPU time in message handlers (microseconds).
    ///
    /// Declared with a default of `0`.
    @meta(INTROSPECT = IntrospectAttr {
        name: "total_processing_time_us".into(),
        desc: "Total CPU time in message handlers (microseconds)".into(),
    })
    pub attr TOTAL_PROCESSING_TIME_US: u64 = 0;

    /// Flight recorder JSON (recent trace events).
    ///
    /// The value is a JSON string; `RecordedEvent` is the
    /// deserialization target for its entries.
    @meta(INTROSPECT = IntrospectAttr {
        name: "flight_recorder".into(),
        desc: "Flight recorder JSON (recent trace events)".into(),
    })
    pub attr FLIGHT_RECORDER: String;

    /// Whether this actor is infrastructure/system.
    ///
    /// Declared with a default of `false`.
    @meta(INTROSPECT = IntrospectAttr {
        name: "is_system".into(),
        desc: "Whether this actor is infrastructure/system".into(),
    })
    pub attr IS_SYSTEM: bool = false;

    /// Child references for tree navigation. Published by
    /// infrastructure actors (HostMeshAgent, ProcAgent) so the
    /// Entity view can return children without parsing mesh-layer keys.
    @meta(INTROSPECT = IntrospectAttr {
        name: "children".into(),
        desc: "Child references for tree navigation".into(),
    })
    pub attr CHILDREN: Vec<IntrospectRef>;

    /// Machine-readable error code for error nodes.
    @meta(INTROSPECT = IntrospectAttr {
        name: "error_code".into(),
        desc: "Machine-readable error code (e.g. not_found)".into(),
    })
    pub attr ERROR_CODE: String;

    /// Human-readable error message for error nodes.
    @meta(INTROSPECT = IntrospectAttr {
        name: "error_message".into(),
        desc: "Human-readable error message".into(),
    })
    pub attr ERROR_MESSAGE: String;

    // Failure attrs — decomposition of FailureInfo into flat attrs.
    //
    // - **FI-A1 (presence):** failure_* attrs are present iff
    //   status == "failed"; absent otherwise. (Attr-level restatement
    //   of FI-3.)
    // - **FI-A2 (propagation):** failure_is_propagated == true iff
    //   failure_root_cause_actor != this actor's id. (Attr-level
    //   restatement of FI-4.)
    // FI-1, FI-2 (write ordering) are enforced in proc.rs serve()
    // and are unaffected by the representation change.
    // FI-5, FI-6 are proc/mesh-level and unaffected.
    //
    // Decoding contract (see `ActorAttrsView::from_attrs`):
    // FAILURE_ERROR_MESSAGE, FAILURE_ROOT_CAUSE_ACTOR and
    // FAILURE_OCCURRED_AT are required together — if any one is
    // present, all three must be. FAILURE_ROOT_CAUSE_NAME is
    // optional and FAILURE_IS_PROPAGATED falls back to its default.

    /// Failure error message.
    ///
    /// One of the three failure keys that must appear together.
    @meta(INTROSPECT = IntrospectAttr {
        name: "failure_error_message".into(),
        desc: "Failure error message".into(),
    })
    pub attr FAILURE_ERROR_MESSAGE: String;

    /// Actor that caused the failure (root cause).
    ///
    /// One of the three failure keys that must appear together.
    @meta(INTROSPECT = IntrospectAttr {
        name: "failure_root_cause_actor".into(),
        desc: "Actor that caused the failure (root cause)".into(),
    })
    pub attr FAILURE_ROOT_CAUSE_ACTOR: reference::ActorId;

    /// Name of root cause actor.
    ///
    /// Optional even when the other failure attrs are present.
    @meta(INTROSPECT = IntrospectAttr {
        name: "failure_root_cause_name".into(),
        desc: "Name of root cause actor".into(),
    })
    pub attr FAILURE_ROOT_CAUSE_NAME: String;

    /// Timestamp when failure occurred.
    ///
    /// One of the three failure keys that must appear together.
    @meta(INTROSPECT = IntrospectAttr {
        name: "failure_occurred_at".into(),
        desc: "Timestamp when failure occurred".into(),
    })
    pub attr FAILURE_OCCURRED_AT: SystemTime;

    /// Whether the failure was propagated from a child.
    ///
    /// Declared with a default of `false`, i.e. the failure
    /// originated at this actor.
    @meta(INTROSPECT = IntrospectAttr {
        name: "failure_is_propagated".into(),
        desc: "Whether the failure was propagated from a child".into(),
    })
    pub attr FAILURE_IS_PROPAGATED: bool = false;
}
356
357// See FI-1 through FI-8 in module doc.
358
/// Failure modes when turning a raw `Attrs` bag into a typed view.
#[derive(Debug, Clone, PartialEq)]
pub enum AttrsViewError {
    /// A key the view requires was not present, and there is no
    /// default to fall back on.
    MissingKey {
        /// Name of the absent attr key.
        key: &'static str,
    },
    /// The attrs decoded individually but contradict each other.
    InvariantViolation {
        /// Label of the violated invariant (e.g. "IA-4").
        label: &'static str,
        /// Human-readable explanation of the inconsistency.
        detail: String,
    },
}

impl fmt::Display for AttrsViewError {
    /// Writes a one-line, human-readable description of the error.
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        match self {
            AttrsViewError::InvariantViolation { label, detail } => {
                write!(f, "invariant {label} violated: {detail}")
            }
            AttrsViewError::MissingKey { key } => write!(f, "missing required key: {key}"),
        }
    }
}

// No source error to chain: the default `Error` methods suffice.
impl std::error::Error for AttrsViewError {}

impl AttrsViewError {
    /// Shorthand for [`AttrsViewError::MissingKey`] naming `key`.
    pub fn missing(key: &'static str) -> Self {
        AttrsViewError::MissingKey { key }
    }

    /// Shorthand for [`AttrsViewError::InvariantViolation`] with the
    /// given invariant `label` and free-form `detail`.
    pub fn invariant(label: &'static str, detail: String) -> Self {
        AttrsViewError::InvariantViolation { label, detail }
    }
}
400
/// Structured failure fields decoded from `FAILURE_*` attrs.
///
/// Produced by [`ActorAttrsView::from_attrs`] when the failure attrs
/// are present, and written back field-by-field by
/// [`ActorAttrsView::to_attrs`].
#[derive(Debug, Clone, PartialEq)]
pub struct FailureAttrs {
    /// Error message describing the failure
    /// (`FAILURE_ERROR_MESSAGE`, required).
    pub error_message: String,
    /// Actor that caused the failure (root cause)
    /// (`FAILURE_ROOT_CAUSE_ACTOR`, required).
    pub root_cause_actor: reference::ActorId,
    /// Display name of the root-cause actor, if available
    /// (`FAILURE_ROOT_CAUSE_NAME`, optional).
    pub root_cause_name: Option<String>,
    /// When the failure occurred (`FAILURE_OCCURRED_AT`, required).
    pub occurred_at: SystemTime,
    /// Whether this failure was propagated from a child
    /// (`FAILURE_IS_PROPAGATED`; `false` when the attr is unset).
    pub is_propagated: bool,
}
415
/// Typed view over attrs for an actor node.
///
/// Decode with [`ActorAttrsView::from_attrs`] and encode with
/// [`ActorAttrsView::to_attrs`]. Only `status` and `actor_type` are
/// required when decoding; the remaining fields are optional or fall
/// back to a default.
#[derive(Debug, Clone, PartialEq)]
pub struct ActorAttrsView {
    /// Lifecycle status: "running", "stopped", "failed".
    pub status: String,
    /// Reason for stop/failure, if any. Decoding rejects a reason
    /// when `status` is neither "stopped" nor "failed" (IA-3).
    pub status_reason: Option<String>,
    /// Fully-qualified actor type name.
    pub actor_type: String,
    /// Number of messages processed (`0` when the attr is unset).
    pub messages_processed: u64,
    /// When this actor was created, if recorded.
    pub created_at: Option<SystemTime>,
    /// Name of the last message handler invoked.
    pub last_handler: Option<String>,
    /// Total CPU time in message handlers (microseconds); `0` when
    /// the attr is unset.
    pub total_processing_time_us: u64,
    /// Flight recorder JSON, if available.
    pub flight_recorder: Option<String>,
    /// Whether this is a system/infrastructure actor (`false` when
    /// the attr is unset).
    pub is_system: bool,
    /// Failure details, present iff status == "failed" (IA-4,
    /// checked during decoding).
    pub failure: Option<FailureAttrs>,
}
440
441impl ActorAttrsView {
442    /// Decode from an `Attrs` bag (AV-2, AV-3). Requires `STATUS`
443    /// and `ACTOR_TYPE`. Enforces IA-3 (status_reason must not be
444    /// present for non-terminal status), IA-4 (failure attrs iff
445    /// failed), and failure completeness (if any required failure
446    /// key is present, all three required keys must be).
447    pub fn from_attrs(attrs: &Attrs) -> Result<Self, AttrsViewError> {
448        let status = attrs
449            .get(STATUS)
450            .ok_or_else(|| AttrsViewError::missing("status"))?
451            .clone();
452        let status_reason = attrs.get(STATUS_REASON).cloned();
453        let actor_type = attrs
454            .get(ACTOR_TYPE)
455            .ok_or_else(|| AttrsViewError::missing("actor_type"))?
456            .clone();
457        let messages_processed = *attrs.get(MESSAGES_PROCESSED).unwrap_or(&0);
458        let created_at = attrs.get(CREATED_AT).copied();
459        let last_handler = attrs.get(LAST_HANDLER).cloned();
460        let total_processing_time_us = *attrs.get(TOTAL_PROCESSING_TIME_US).unwrap_or(&0);
461        let flight_recorder = attrs.get(FLIGHT_RECORDER).cloned();
462        let is_system = *attrs.get(IS_SYSTEM).unwrap_or(&false);
463
464        // IA-3 (one-sided): status_reason must not be present for
465        // non-terminal status. The converse is not enforced —
466        // terminal status without a reason is valid (clean shutdown).
467        let is_terminal = status == "stopped" || status == "failed";
468        if status_reason.is_some() && !is_terminal {
469            return Err(AttrsViewError::invariant(
470                "IA-3",
471                format!(
472                    "status_reason present but status is '{status}' (expected stopped or failed)"
473                ),
474            ));
475        }
476
477        // Decode failure attrs. If any of the three required
478        // failure keys is present, require all three.
479        // FAILURE_IS_PROPAGATED has a declare_attrs! default of
480        // false, so it always resolves via attrs.get() and needs
481        // no explicit presence check. FAILURE_ROOT_CAUSE_NAME is
482        // genuinely optional.
483        let has_any_failure = attrs.get(FAILURE_ERROR_MESSAGE).is_some()
484            || attrs.get(FAILURE_ROOT_CAUSE_ACTOR).is_some()
485            || attrs.get(FAILURE_OCCURRED_AT).is_some();
486
487        let failure = if has_any_failure {
488            let error_message = attrs
489                .get(FAILURE_ERROR_MESSAGE)
490                .ok_or_else(|| AttrsViewError::missing("failure_error_message"))?
491                .clone();
492            let root_cause_actor = attrs
493                .get(FAILURE_ROOT_CAUSE_ACTOR)
494                .ok_or_else(|| AttrsViewError::missing("failure_root_cause_actor"))?
495                .clone();
496            let root_cause_name = attrs.get(FAILURE_ROOT_CAUSE_NAME).cloned();
497            let occurred_at = *attrs
498                .get(FAILURE_OCCURRED_AT)
499                .ok_or_else(|| AttrsViewError::missing("failure_occurred_at"))?;
500            // Default false: failure originated at this actor.
501            let is_propagated = *attrs.get(FAILURE_IS_PROPAGATED).unwrap_or(&false);
502            Some(FailureAttrs {
503                error_message,
504                root_cause_actor,
505                root_cause_name,
506                occurred_at,
507                is_propagated,
508            })
509        } else {
510            None
511        };
512
513        // IA-4: failure attrs present iff status == "failed".
514        if status == "failed" && failure.is_none() {
515            return Err(AttrsViewError::invariant(
516                "IA-4",
517                "status is 'failed' but no failure_* attrs present".to_string(),
518            ));
519        }
520        if status != "failed" && failure.is_some() {
521            return Err(AttrsViewError::invariant(
522                "IA-4",
523                format!("status is '{status}' but failure_* attrs are present"),
524            ));
525        }
526
527        Ok(Self {
528            status,
529            status_reason,
530            actor_type,
531            messages_processed,
532            created_at,
533            last_handler,
534            total_processing_time_us,
535            flight_recorder,
536            is_system,
537            failure,
538        })
539    }
540
541    /// Encode into an `Attrs` bag (AV-1 round-trip producer).
542    pub fn to_attrs(&self) -> Attrs {
543        let mut attrs = Attrs::new();
544        attrs.set(STATUS, self.status.clone());
545        if let Some(reason) = &self.status_reason {
546            attrs.set(STATUS_REASON, reason.clone());
547        }
548        attrs.set(ACTOR_TYPE, self.actor_type.clone());
549        attrs.set(MESSAGES_PROCESSED, self.messages_processed);
550        if let Some(t) = self.created_at {
551            attrs.set(CREATED_AT, t);
552        }
553        if let Some(handler) = &self.last_handler {
554            attrs.set(LAST_HANDLER, handler.clone());
555        }
556        attrs.set(TOTAL_PROCESSING_TIME_US, self.total_processing_time_us);
557        if let Some(fr) = &self.flight_recorder {
558            attrs.set(FLIGHT_RECORDER, fr.clone());
559        }
560        attrs.set(IS_SYSTEM, self.is_system);
561        if let Some(fi) = &self.failure {
562            attrs.set(FAILURE_ERROR_MESSAGE, fi.error_message.clone());
563            attrs.set(FAILURE_ROOT_CAUSE_ACTOR, fi.root_cause_actor.clone());
564            if let Some(name) = &fi.root_cause_name {
565                attrs.set(FAILURE_ROOT_CAUSE_NAME, name.clone());
566            }
567            attrs.set(FAILURE_OCCURRED_AT, fi.occurred_at);
568            attrs.set(FAILURE_IS_PROPAGATED, fi.is_propagated);
569        }
570        attrs
571    }
572}
573
/// Internal introspection result. Carries attrs as a JSON string.
/// The mesh layer constructs the API-facing `NodePayload` (with
/// `properties`) from this via `derive_properties`.
///
/// This is the internal wire type — it travels over actor ports
/// via `IntrospectMessage`. The presentation-layer `NodePayload`
/// (with `NodeProperties`) lives in `hyperactor_mesh::introspect`.
///
/// It is the payload type of the reply ports carried by both
/// `IntrospectMessage::Query` and `IntrospectMessage::QueryChild`.
#[derive(Debug, Clone, PartialEq, Serialize, Deserialize, Named)]
pub struct IntrospectResult {
    /// Reference identifying this node.
    pub identity: IntrospectRef,
    /// JSON-serialized `Attrs` bag containing introspection attributes.
    /// Kept as an opaque string at this layer; nothing in this type
    /// validates its contents.
    pub attrs: String,
    /// Child references the client can follow to descend the tree.
    pub children: Vec<IntrospectRef>,
    /// Parent reference for upward navigation (`None` when the node
    /// has no parent to report).
    pub parent: Option<IntrospectRef>,
    /// When this data was captured.
    pub as_of: SystemTime,
}
// NOTE(review): presumably registers the type so it can be carried
// as a wire value — confirm against `wirevalue::register_type!`.
wirevalue::register_type!(IntrospectResult);
595
/// Context for introspection query - what aspect of the actor to
/// describe.
///
/// Infrastructure actors (e.g., ProcAgent, HostAgent)
/// have dual nature: they manage entities (Proc, Host) while also
/// being actors themselves. IntrospectView allows callers to
/// specify which aspect to query.
///
/// The view is chosen per request via `IntrospectMessage::Query`.
// TODO(monarch-introspection): IntrospectView currently uses
// Entity/Actor naming. Consider renaming to runtime-neutral query
// modes (e.g. Published/Runtime) to avoid mesh-domain wording in
// hyperactor while preserving behavior and wire compatibility.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize, Named)]
pub enum IntrospectView {
    /// Return managed-entity properties (Proc, Host, etc.) for
    /// infrastructure actors. Backed by the attrs the actor has
    /// published (see S6 in the module docs).
    Entity,
    /// Return standard actor properties (status, messages_processed,
    /// flight_recorder). Backed by live actor-runtime state; published
    /// attrs are not merged in.
    Actor,
}
// NOTE(review): presumably registers the type so it can be carried
// as a wire value — confirm against `wirevalue::register_type!`.
wirevalue::register_type!(IntrospectView);
617
/// Introspection query sent to any actor.
///
/// `Query` asks the actor to describe itself. `QueryChild` asks the
/// actor to describe one of its non-addressable children — an entity
/// that appears in the navigation tree but has no mailbox of its own
/// (e.g. a system proc owned by a host). The parent actor answers on
/// the child's behalf.
///
/// Both variants carry a one-shot reply port; the answer is a single
/// [`IntrospectResult`].
#[derive(Debug, Clone, PartialEq, Serialize, Deserialize, Named)]
pub enum IntrospectMessage {
    /// "Describe yourself."
    Query {
        /// View context - Entity or Actor.
        view: IntrospectView,
        /// Reply port receiving the actor's self-description.
        reply: reference::OncePortRef<IntrospectResult>,
    },
    /// "Describe one of your children."
    QueryChild {
        /// Reference identifying the child to describe. Typed as the
        /// general `reference::Reference`, not [`IntrospectRef`].
        child_ref: reference::Reference,
        /// Reply port receiving the child's description.
        reply: reference::OncePortRef<IntrospectResult>,
    },
}
// NOTE(review): presumably registers the type so it can be carried
// as a wire value — confirm against `wirevalue::register_type!`.
wirevalue::register_type!(IntrospectMessage);
643
/// Structured tracing event from the actor-local flight recorder.
///
/// Deserialization target for the `FLIGHT_RECORDER` attrs JSON string.
/// `live_actor_payload` builds these from the cell's recording and
/// serializes the list to JSON.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct RecordedEvent {
    /// ISO 8601 timestamp of the event (rendered with
    /// [`format_timestamp`] when produced by `live_actor_payload`).
    pub timestamp: String,
    /// Monotonic sequence number for ordering. Falls back to `0`
    /// when the field is missing from the input.
    #[serde(default)]
    pub seq: usize,
    /// Event level (INFO, DEBUG, etc.).
    pub level: String,
    /// Event target (module path). Falls back to the empty string
    /// when the field is missing from the input.
    #[serde(default)]
    pub target: String,
    /// Event name.
    pub name: String,
    /// Event fields as JSON.
    pub fields: serde_json::Value,
}
664
665/// Format a [`SystemTime`] as an ISO 8601 timestamp with millisecond
666/// precision.
667pub fn format_timestamp(time: SystemTime) -> String {
668    humantime::format_rfc3339_millis(time).to_string()
669}
670
671/// Build a JSON-serialized `Attrs` string from values already
672/// computed by `live_actor_payload`. Reuses the same data — no
673/// redundant reads from `InstanceCell`.
674///
675/// Populates actor-runtime keys (STATUS, ACTOR_TYPE, etc.),
676/// decomposes the status prefix protocol into STATUS + STATUS_REASON,
677/// and decomposes failure fields into individual FAILURE_* attrs.
678///
679/// Starts from a fresh `Attrs` bag — published attrs (node_type,
680/// addr, etc.) are NOT included. This ensures the Actor view
681/// produces actor-only data; the Entity view handles published
682/// attrs separately.
683/// Failure fields extracted from a supervision event.
684struct FailureSnapshot {
685    error_message: String,
686    root_cause_actor: reference::ActorId,
687    root_cause_name: Option<String>,
688    occurred_at: SystemTime,
689    is_propagated: bool,
690}
691
692/// Pre-computed actor state for building the attrs JSON string.
693/// Avoids redundant InstanceCell reads — `live_actor_payload`
694/// computes these once and passes them in.
695struct ActorSnapshot {
696    status_str: String,
697    is_system: bool,
698    last_handler: Option<String>,
699    flight_recorder: Option<String>,
700    failure: Option<FailureSnapshot>,
701}
702
703fn build_actor_attrs(cell: &crate::InstanceCell, snap: &ActorSnapshot) -> String {
704    // Actor view builds a clean attrs bag with only actor-runtime
705    // keys. Published attrs (node_type, addr, etc.) belong to the
706    // Entity view — they are NOT merged here. This ensures that
707    // e.g. a HostMeshAgent resolved via Actor view produces Actor
708    // properties, not Host properties.
709    let mut attrs = hyperactor_config::Attrs::new();
710
711    // IA-3: status_reason present iff status carries a reason.
712    if let Some(reason) = snap.status_str.strip_prefix("stopped:") {
713        attrs.set(STATUS, "stopped".to_string());
714        attrs.set(STATUS_REASON, reason.trim().to_string());
715    } else if let Some(reason) = snap.status_str.strip_prefix("failed:") {
716        attrs.set(STATUS, "failed".to_string());
717        attrs.set(STATUS_REASON, reason.trim().to_string());
718    } else {
719        attrs.set(STATUS, snap.status_str.clone());
720        // IA-3: no status_reason for non-terminal states —
721        // guaranteed by fresh Attrs bag.
722    }
723
724    attrs.set(ACTOR_TYPE, cell.actor_type_name().to_string());
725    attrs.set(MESSAGES_PROCESSED, cell.num_processed_messages());
726    attrs.set(CREATED_AT, cell.created_at());
727    attrs.set(TOTAL_PROCESSING_TIME_US, cell.total_processing_time_us());
728    attrs.set(IS_SYSTEM, snap.is_system);
729
730    if let Some(handler) = &snap.last_handler {
731        attrs.set(LAST_HANDLER, handler.clone());
732    }
733    if let Some(fr) = &snap.flight_recorder {
734        attrs.set(FLIGHT_RECORDER, fr.clone());
735    }
736
737    // IA-4 / FI-A1: failure attrs present iff status == "failed".
738    if let Some(fi) = &snap.failure {
739        attrs.set(FAILURE_ERROR_MESSAGE, fi.error_message.clone());
740        attrs.set(FAILURE_ROOT_CAUSE_ACTOR, fi.root_cause_actor.clone());
741        if let Some(name) = &fi.root_cause_name {
742            attrs.set(FAILURE_ROOT_CAUSE_NAME, name.clone());
743        }
744        attrs.set(FAILURE_OCCURRED_AT, fi.occurred_at);
745        attrs.set(FAILURE_IS_PROPAGATED, fi.is_propagated);
746    }
747    // IA-4: failure attrs absent when not failed — guaranteed by
748    // starting from a fresh Attrs bag (no stale keys possible).
749
750    serde_json::to_string(&attrs).unwrap_or_else(|_| "{}".to_string())
751}
752
753/// Build an [`IntrospectResult`] from live [`InstanceCell`] state.
754///
755/// Reads the current live status and last handler directly from
756/// the cell. Used by the introspect task (which runs outside
757/// the actor's message loop) and by `Instance::introspect_payload`.
758pub fn live_actor_payload(cell: &InstanceCell) -> IntrospectResult {
759    let actor_id = cell.actor_id();
760    let status = cell.status().borrow().clone();
761    let last_handler = cell.last_message_handler();
762
763    let children: Vec<IntrospectRef> = cell
764        .child_actor_ids()
765        .into_iter()
766        .map(IntrospectRef::Actor)
767        .collect();
768
769    let events = cell.recording().tail();
770    let flight_recorder_events: Vec<RecordedEvent> = events
771        .into_iter()
772        .map(|event| RecordedEvent {
773            timestamp: format_timestamp(event.time),
774            seq: event.seq,
775            level: event.metadata.level().to_string(),
776            target: event.metadata.target().to_string(),
777            name: event.metadata.name().to_string(),
778            fields: event.json_value(),
779        })
780        .collect();
781
782    let flight_recorder = if flight_recorder_events.is_empty() {
783        None
784    } else {
785        serde_json::to_string(&flight_recorder_events).ok()
786    };
787
788    let supervisor = cell
789        .parent()
790        .map(|p| IntrospectRef::Actor(p.actor_id().clone()));
791
792    // FI-3: failure_info is computed from the same status value as
793    // actor_status, ensuring they agree on whether the actor failed.
794    let failure = if status.is_failed() {
795        cell.supervision_event().and_then(|event| {
796            let root = event.actually_failing_actor()?;
797            Some(FailureSnapshot {
798                error_message: event.actor_status.to_string(),
799                root_cause_actor: root.actor_id.clone(),
800                root_cause_name: root.display_name.clone(),
801                occurred_at: event.occurred_at,
802                is_propagated: root.actor_id != *actor_id,
803            })
804        })
805    } else {
806        None
807    };
808
809    let snap = ActorSnapshot {
810        status_str: status.to_string(),
811        is_system: cell.is_system(),
812        last_handler: last_handler.map(|info| info.to_string()),
813        flight_recorder,
814        failure,
815    };
816
817    let attrs = build_actor_attrs(cell, &snap);
818
819    IntrospectResult {
820        identity: IntrospectRef::Actor(actor_id.clone()),
821        attrs,
822        children,
823        parent: supervisor,
824        as_of: SystemTime::now(),
825    }
826}
827
828/// Introspect task: runs on a dedicated tokio task per actor,
829/// handling [`IntrospectMessage`] by reading [`InstanceCell`]
830/// directly and replying via the actor's [`Mailbox`].
831///
832/// The actor's message loop never sees these messages.
833///
834/// # Invariants exercised
835///
836/// Exercises S1, S2, S4, S5, S6, S11 (see module doc).
837pub(crate) async fn serve_introspect(
838    cell: InstanceCell,
839    mailbox: crate::mailbox::Mailbox,
840    mut receiver: crate::mailbox::PortReceiver<IntrospectMessage>,
841) {
842    use crate::actor::ActorStatus;
843    use crate::mailbox::PortSender as _;
844
845    // Watch for terminal status so we can break the reference cycle:
846    // InstanceCellState → Ports → introspect sender → keeps receiver
847    // open → this task holds InstanceCell → InstanceCellState.
848    // Without this, a stopped actor's InstanceCellState is never
849    // dropped and the actor lingers in the proc's instances map.
850    let mut status = cell.status().clone();
851
852    loop {
853        let msg = tokio::select! {
854            msg = receiver.recv() => {
855                match msg {
856                    Ok(msg) => msg,
857                    Err(_) => {
858                        // Channel closed. If the actor reached a
859                        // terminal state, snapshot it before exiting
860                        // so it remains queryable post-mortem.
861                        if cell.status().borrow().is_terminal() {
862                            let snapshot = live_actor_payload(&cell);
863                            cell.store_terminated_snapshot(snapshot);
864                        }
865                        break;
866                    }
867                }
868            }
869            _ = status.wait_for(ActorStatus::is_terminal) => {
870                // Snapshot for post-mortem introspection before
871                // dropping our InstanceCell reference.
872                let snapshot = live_actor_payload(&cell);
873                cell.store_terminated_snapshot(snapshot);
874                break;
875            }
876        };
877
878        let result = match msg {
879            IntrospectMessage::Query { view, reply } => {
880                let payload = match view {
881                    IntrospectView::Entity => match cell.published_attrs() {
882                        Some(published) => {
883                            let attrs_json =
884                                serde_json::to_string(&published).unwrap_or_else(|_| "{}".into());
885                            let children: Vec<IntrospectRef> =
886                                published.get(CHILDREN).cloned().unwrap_or_default();
887                            IntrospectResult {
888                                identity: IntrospectRef::Actor(cell.actor_id().clone()),
889                                attrs: attrs_json,
890                                children,
891                                parent: cell
892                                    .parent()
893                                    .map(|p| IntrospectRef::Actor(p.actor_id().clone())),
894                                as_of: SystemTime::now(),
895                            }
896                        }
897                        None => live_actor_payload(&cell),
898                    },
899                    IntrospectView::Actor => live_actor_payload(&cell),
900                };
901                mailbox.serialize_and_send_once(
902                    reply,
903                    payload,
904                    crate::mailbox::monitored_return_handle(),
905                )
906            }
907            IntrospectMessage::QueryChild { child_ref, reply } => {
908                let payload = cell.query_child(&child_ref).unwrap_or_else(|| {
909                    let mut error_attrs = hyperactor_config::Attrs::new();
910                    error_attrs.set(ERROR_CODE, "not_found".to_string());
911                    error_attrs.set(
912                        ERROR_MESSAGE,
913                        format!("child {} not found (no callback registered)", child_ref),
914                    );
915                    // Use the queried child_ref as identity for the error node.
916                    let identity = match &child_ref {
917                        reference::Reference::Proc(id) => IntrospectRef::Proc(id.clone()),
918                        reference::Reference::Actor(id) => IntrospectRef::Actor(id.clone()),
919                        reference::Reference::Port(id) => {
920                            IntrospectRef::Actor(id.actor_id().clone())
921                        }
922                    };
923                    IntrospectResult {
924                        identity,
925                        attrs: serde_json::to_string(&error_attrs)
926                            .unwrap_or_else(|_| "{}".to_string()),
927                        children: Vec::new(),
928                        parent: None,
929                        as_of: SystemTime::now(),
930                    }
931                });
932                mailbox.serialize_and_send_once(
933                    reply,
934                    payload,
935                    crate::mailbox::monitored_return_handle(),
936                )
937            }
938        };
939        if let Err(e) = result {
940            tracing::debug!("introspect reply failed: {e}");
941        }
942    }
943    tracing::debug!(
944        actor_id = %cell.actor_id(),
945        "introspect task exiting"
946    );
947}
948
949#[cfg(test)]
950mod tests {
951    use super::*;
952    use crate::actor::ActorErrorKind;
953    use crate::actor::ActorStatus;
954    use crate::channel::ChannelAddr;
955    use crate::reference::ProcId;
956    use crate::supervision::ActorSupervisionEvent;
957
958    /// Exercises IK-1 (see module doc).
959    #[test]
960    fn test_introspect_keys_are_tagged() {
961        let cases = vec![
962            ("status", STATUS.attrs()),
963            ("status_reason", STATUS_REASON.attrs()),
964            ("actor_type", ACTOR_TYPE.attrs()),
965            ("messages_processed", MESSAGES_PROCESSED.attrs()),
966            ("created_at", CREATED_AT.attrs()),
967            ("last_handler", LAST_HANDLER.attrs()),
968            ("total_processing_time_us", TOTAL_PROCESSING_TIME_US.attrs()),
969            ("flight_recorder", FLIGHT_RECORDER.attrs()),
970            ("is_system", IS_SYSTEM.attrs()),
971            ("children", CHILDREN.attrs()),
972            ("error_code", ERROR_CODE.attrs()),
973            ("error_message", ERROR_MESSAGE.attrs()),
974            ("failure_error_message", FAILURE_ERROR_MESSAGE.attrs()),
975            ("failure_root_cause_actor", FAILURE_ROOT_CAUSE_ACTOR.attrs()),
976            ("failure_root_cause_name", FAILURE_ROOT_CAUSE_NAME.attrs()),
977            ("failure_occurred_at", FAILURE_OCCURRED_AT.attrs()),
978            ("failure_is_propagated", FAILURE_IS_PROPAGATED.attrs()),
979        ];
980
981        for (expected_name, meta) in &cases {
982            // IK-1: see module doc.
983            let introspect = meta
984                .get(INTROSPECT)
985                .unwrap_or_else(|| panic!("{expected_name}: missing INTROSPECT meta-attr"));
986            assert_eq!(
987                introspect.name, *expected_name,
988                "short name mismatch for {expected_name}"
989            );
990            assert!(
991                !introspect.desc.is_empty(),
992                "{expected_name}: desc should not be empty"
993            );
994        }
995
996        // Exhaustiveness: verify cases covers all INTROSPECT-tagged
997        // keys declared in this module.
998        use hyperactor_config::attrs::AttrKeyInfo;
999        let registry_count = inventory::iter::<AttrKeyInfo>()
1000            .filter(|info| {
1001                info.name.starts_with("hyperactor::introspect::")
1002                    && info.meta.get(INTROSPECT).is_some()
1003            })
1004            .count();
1005        assert_eq!(
1006            cases.len(),
1007            registry_count,
1008            "test must cover all INTROSPECT-tagged keys in this module"
1009        );
1010    }
1011
1012    /// Exercises IK-2 (see module doc).
1013    #[test]
1014    fn test_introspect_short_names_are_globally_unique() {
1015        use hyperactor_config::attrs::AttrKeyInfo;
1016
1017        let mut seen = std::collections::HashMap::new();
1018        for info in inventory::iter::<AttrKeyInfo>() {
1019            let Some(introspect) = info.meta.get(INTROSPECT) else {
1020                continue;
1021            };
1022            // Metadata quality: every tagged key must have
1023            // non-empty name and desc.
1024            assert!(
1025                !introspect.name.is_empty(),
1026                "INTROSPECT key {:?} has empty name",
1027                info.name
1028            );
1029            assert!(
1030                !introspect.desc.is_empty(),
1031                "INTROSPECT key {:?} has empty desc",
1032                info.name
1033            );
1034            if let Some(prev_fq) = seen.insert(introspect.name.clone(), info.name) {
1035                panic!(
1036                    "IK-2 violation: duplicate short name {:?} declared by both {:?} and {:?}",
1037                    introspect.name, prev_fq, info.name
1038                );
1039            }
1040        }
1041    }
1042
1043    // IA-1 tests require spawning actors and live in actor.rs
1044    // where #[hyperactor::export] and test infrastructure are
1045    // available. IA-3 and IA-4 are tested below at the view level.
1046
1047    fn running_actor_attrs() -> Attrs {
1048        let mut attrs = Attrs::new();
1049        attrs.set(STATUS, "running".to_string());
1050        attrs.set(ACTOR_TYPE, "MyActor".to_string());
1051        attrs.set(MESSAGES_PROCESSED, 42u64);
1052        attrs.set(CREATED_AT, SystemTime::UNIX_EPOCH);
1053        attrs.set(IS_SYSTEM, false);
1054        attrs
1055    }
1056
1057    fn test_actor_id(proc_name: &str, actor_name: &str, pid: usize) -> crate::reference::ActorId {
1058        ProcId::with_name(ChannelAddr::Local(0), proc_name).actor_id(actor_name, pid)
1059    }
1060
1061    fn failed_actor_attrs() -> Attrs {
1062        let mut attrs = running_actor_attrs();
1063        attrs.set(STATUS, "failed".to_string());
1064        attrs.set(STATUS_REASON, "something broke".to_string());
1065        attrs.set(FAILURE_ERROR_MESSAGE, "boom".to_string());
1066        attrs.set(FAILURE_ROOT_CAUSE_ACTOR, test_actor_id("proc", "other", 0));
1067        attrs.set(FAILURE_ROOT_CAUSE_NAME, "OtherActor".to_string());
1068        attrs.set(FAILURE_OCCURRED_AT, SystemTime::UNIX_EPOCH);
1069        attrs.set(FAILURE_IS_PROPAGATED, true);
1070        attrs
1071    }
1072
1073    /// AV-1: from_attrs(to_attrs(v)) == v.
1074    #[test]
1075    fn test_actor_view_round_trip_running() {
1076        let view = ActorAttrsView::from_attrs(&running_actor_attrs()).unwrap();
1077        assert_eq!(view.status, "running");
1078        assert_eq!(view.actor_type, "MyActor");
1079        assert_eq!(view.messages_processed, 42);
1080        assert!(view.failure.is_none());
1081
1082        let round_tripped = ActorAttrsView::from_attrs(&view.to_attrs()).unwrap();
1083        assert_eq!(round_tripped, view);
1084    }
1085
1086    /// AV-1.
1087    #[test]
1088    fn test_actor_view_round_trip_failed() {
1089        let view = ActorAttrsView::from_attrs(&failed_actor_attrs()).unwrap();
1090        assert_eq!(view.status, "failed");
1091        let fi = view.failure.as_ref().unwrap();
1092        assert_eq!(fi.error_message, "boom");
1093        assert!(fi.is_propagated);
1094
1095        let round_tripped = ActorAttrsView::from_attrs(&view.to_attrs()).unwrap();
1096        assert_eq!(round_tripped, view);
1097    }
1098
1099    /// AV-2: missing required key rejected.
1100    #[test]
1101    fn test_actor_view_missing_status() {
1102        let mut attrs = Attrs::new();
1103        attrs.set(ACTOR_TYPE, "X".to_string());
1104        let err = ActorAttrsView::from_attrs(&attrs).unwrap_err();
1105        assert_eq!(err, AttrsViewError::MissingKey { key: "status" });
1106    }
1107
1108    /// AV-2.
1109    #[test]
1110    fn test_actor_view_missing_actor_type() {
1111        let mut attrs = Attrs::new();
1112        attrs.set(STATUS, "running".to_string());
1113        let err = ActorAttrsView::from_attrs(&attrs).unwrap_err();
1114        assert_eq!(err, AttrsViewError::MissingKey { key: "actor_type" });
1115    }
1116
1117    #[test]
1118    fn test_actor_view_ia3_rejects_reason_on_running() {
1119        let mut attrs = running_actor_attrs();
1120        attrs.set(STATUS_REASON, "should not be here".to_string());
1121        let err = ActorAttrsView::from_attrs(&attrs).unwrap_err();
1122        assert!(matches!(
1123            err,
1124            AttrsViewError::InvariantViolation { label: "IA-3", .. }
1125        ));
1126    }
1127
1128    #[test]
1129    fn test_actor_view_ia3_allows_terminal_without_reason() {
1130        let mut attrs = running_actor_attrs();
1131        attrs.set(STATUS, "stopped".to_string());
1132        // No status_reason — should be fine.
1133        let view = ActorAttrsView::from_attrs(&attrs).unwrap();
1134        assert_eq!(view.status, "stopped");
1135        assert!(view.status_reason.is_none());
1136    }
1137
1138    #[test]
1139    fn test_actor_view_ia4_rejects_failed_without_failure_attrs() {
1140        let mut attrs = running_actor_attrs();
1141        attrs.set(STATUS, "failed".to_string());
1142        // No failure_* keys.
1143        let err = ActorAttrsView::from_attrs(&attrs).unwrap_err();
1144        assert!(matches!(
1145            err,
1146            AttrsViewError::InvariantViolation { label: "IA-4", .. }
1147        ));
1148    }
1149
1150    #[test]
1151    fn test_actor_view_ia4_rejects_failure_attrs_on_running() {
1152        let mut attrs = running_actor_attrs();
1153        attrs.set(FAILURE_ERROR_MESSAGE, "boom".to_string());
1154        attrs.set(FAILURE_ROOT_CAUSE_ACTOR, test_actor_id("proc", "x", 0));
1155        attrs.set(FAILURE_OCCURRED_AT, SystemTime::UNIX_EPOCH);
1156        let err = ActorAttrsView::from_attrs(&attrs).unwrap_err();
1157        assert!(matches!(
1158            err,
1159            AttrsViewError::InvariantViolation { label: "IA-4", .. }
1160        ));
1161    }
1162
1163    /// AV-2: partial failure set → missing key.
1164    #[test]
1165    fn test_actor_view_partial_failure_attrs_rejected() {
1166        let mut attrs = running_actor_attrs();
1167        attrs.set(STATUS, "failed".to_string());
1168        // Only one of the three required failure keys.
1169        attrs.set(FAILURE_ERROR_MESSAGE, "boom".to_string());
1170        let err = ActorAttrsView::from_attrs(&attrs).unwrap_err();
1171        assert_eq!(
1172            err,
1173            AttrsViewError::MissingKey {
1174                key: "failure_root_cause_actor"
1175            }
1176        );
1177    }
1178
1179    /// Exercises FI-7 and FI-8 (see module doc): when a parent fails
1180    /// due to an unhandled Stopped child event, structured failure
1181    /// attrs must name the stopped child as
1182    /// `failure_root_cause_actor` (FI-7) and report
1183    /// `failure_is_propagated == true` (FI-8).
1184    ///
1185    /// Partially white-box: re-creates `FailureSnapshot` construction
1186    /// from `live_actor_payload` because that function requires an
1187    /// `InstanceCell`. This test will fail if
1188    /// `actually_failing_actor()` regresses, because that helper is
1189    /// the shared decision point for root-cause attribution. See
1190    /// `test_propagated_failure_info` in `proc.rs` for end-to-end
1191    /// integration coverage.
1192    #[test]
1193    fn test_fi7_fi8_propagated_stopped_child() {
1194        let proc_id = ProcId::with_name(ChannelAddr::Local(0), "test_proc");
1195        let child_id = proc_id.actor_id("proc_agent", 0);
1196        let parent_id = proc_id.actor_id("mesh_actor", 0);
1197
1198        let child_event = ActorSupervisionEvent::new(
1199            child_id.clone(),
1200            Some("proc_agent".into()),
1201            ActorStatus::Stopped("host died".into()),
1202            None,
1203        );
1204        let parent_event = ActorSupervisionEvent::new(
1205            parent_id.clone(),
1206            Some("mesh_actor".into()),
1207            ActorStatus::Failed(ActorErrorKind::UnhandledSupervisionEvent(Box::new(
1208                child_event,
1209            ))),
1210            None,
1211        );
1212
1213        // -- reproduce FailureSnapshot construction (same logic as
1214        // live_actor_payload) --
1215        let root = parent_event
1216            .actually_failing_actor()
1217            .expect("parent_event is a failure");
1218        let snap = FailureSnapshot {
1219            error_message: parent_event.actor_status.to_string(),
1220            root_cause_actor: root.actor_id.clone(),
1221            root_cause_name: root.display_name.clone(),
1222            occurred_at: parent_event.occurred_at,
1223            is_propagated: root.actor_id != parent_id,
1224        };
1225
1226        // FI-7: failure_root_cause_actor is the stopped child.
1227        assert_eq!(snap.root_cause_actor, child_id);
1228        // FI-8: failure_is_propagated is true.
1229        assert!(snap.is_propagated);
1230        // root_cause_name pinned before round-trip.
1231        assert_eq!(snap.root_cause_name.as_deref(), Some("proc_agent"));
1232
1233        // -- attrs round-trip through ActorAttrsView --
1234        let mut attrs = failed_actor_attrs();
1235        attrs.set(FAILURE_ERROR_MESSAGE, snap.error_message);
1236        attrs.set(FAILURE_ROOT_CAUSE_ACTOR, snap.root_cause_actor.clone());
1237        if let Some(name) = &snap.root_cause_name {
1238            attrs.set(FAILURE_ROOT_CAUSE_NAME, name.clone());
1239        }
1240        attrs.set(FAILURE_OCCURRED_AT, snap.occurred_at);
1241        attrs.set(FAILURE_IS_PROPAGATED, snap.is_propagated);
1242
1243        let view = ActorAttrsView::from_attrs(&attrs).unwrap();
1244        assert_eq!(view.status, "failed");
1245        let fi = view.failure.as_ref().expect("failure_info must be present");
1246        // FI-7: failure_root_cause_actor survives attrs round-trip.
1247        assert_eq!(fi.root_cause_actor, child_id);
1248        // FI-8: failure_is_propagated survives attrs round-trip.
1249        assert!(fi.is_propagated);
1250        // root_cause_name also survives.
1251        assert_eq!(fi.root_cause_name.as_deref(), Some("proc_agent"));
1252    }
1253}