hyperactor_mesh/
actor_mesh.rs

1/*
2 * Copyright (c) Meta Platforms, Inc. and affiliates.
3 * All rights reserved.
4 *
5 * This source code is licensed under the BSD-style license found in the
6 * LICENSE file in the root directory of this source tree.
7 */
8
9//! ## Actor mesh invariants (AM-*)
10//!
11//! - **AM-1 (rank-space):** `ActorMeshRef` uses its `CastDomainRef` as
12//!   the source of truth for actor addresses. The cast domain stores
13//!   members in the same dense rank order as the mesh `Region`, so a
14//!   rank can be materialized by indexing the cast-domain member map
15//!   directly; the reference does not need to retain the `ProcMeshRef`
16//!   that created it.
17//! - **AM-2 (slice materialization):** `RankedSliceable::sliced` has no
18//!   caller context, so it carries only a raw cast-domain descriptor. The first
19//!   cast through that ref materializes the descriptor with the caller context
20//!   before sending the cast message, sequencing setup and delivery on the same
21//!   sender stream.
22
23use std::collections::HashMap;
24use std::fmt;
25use std::hash::Hash;
26use std::hash::Hasher;
27use std::ops::Deref;
28use std::sync::Arc;
29use std::sync::OnceLock as OnceCell;
30use std::time::Duration;
31
32use hyperactor::ActorAddr;
33use hyperactor::ActorLocal;
34use hyperactor::ActorRef;
35use hyperactor::Endpoint as _;
36use hyperactor::OncePortRefRepr;
37use hyperactor::PortRef;
38use hyperactor::PortRefRepr;
39use hyperactor::RemoteEndpoint as _;
40use hyperactor::RemoteHandles;
41use hyperactor::RemoteMessage;
42use hyperactor::accum::ReducerMode;
43use hyperactor::actor::ActorStatus;
44use hyperactor::actor::Referable;
45use hyperactor::context;
46use hyperactor::mailbox::PortReceiver;
47use hyperactor::port::Port;
48use hyperactor::supervision::ActorSupervisionEvent;
49use hyperactor_cast::TilingPolicy;
50use hyperactor_cast::cast_actor::CastDomainId;
51use hyperactor_cast::cast_actor::CastDomainRef;
52use hyperactor_config::CONFIG;
53use hyperactor_config::ConfigAttr;
54use hyperactor_config::Flattrs;
55use hyperactor_config::attrs::declare_attrs;
56use ndslice::ViewExt as _;
57use ndslice::view;
58use ndslice::view::Region;
59use ndslice::view::View;
60use serde::Deserialize;
61use serde::Deserializer;
62use serde::Serialize;
63use serde::Serializer;
64use tokio::sync::watch;
65
66use crate::Error;
67use crate::ProcMeshRef;
68use crate::ValueMesh;
69use crate::comm::multicast;
70use crate::config::MAX_CAST_FANOUT;
71use crate::host_mesh::GET_PROC_STATE_MAX_IDLE;
72use crate::host_mesh::mesh_to_rankedvalues_with_default;
73use crate::mesh_controller::ActorMeshController;
74use crate::mesh_controller::SUPERVISION_POLL_FREQUENCY;
75use crate::mesh_controller::Subscribe;
76use crate::mesh_controller::Unsubscribe;
77use crate::mesh_id::ActorMeshId;
78use crate::mesh_id::ProcMeshId;
79use crate::proc_mesh::GET_ACTOR_STATE_MAX_IDLE;
80use crate::proc_mesh::telemetry_actor_mesh_id;
81use crate::resource;
82use crate::supervision::MeshFailure;
83use crate::supervision::Unhealthy;
84
declare_attrs! {
    /// Liveness watchdog for the supervision stream.
    ///
    /// If no supervision message (healthy or unhealthy) is observed within
    /// this duration, the controller is assumed to be unreachable and
    /// the mesh is treated as unhealthy. This timeout is about
    /// detecting silence, not slow messages.
    ///
    /// This value must be > poll frequency + get actor state timeout + get proc state timeout
    /// or else it is possible to declare the controller dead before it could
    /// feasibly have received a healthy reply. `into_watch` logs a warning
    /// when the configured value is smaller than that sum.
    @meta(CONFIG = ConfigAttr::new(
        Some("HYPERACTOR_MESH_SUPERVISION_WATCHDOG_TIMEOUT".to_string()),
        Some("supervision_watchdog_timeout".to_string()),
    ))
    pub attr SUPERVISION_WATCHDOG_TIMEOUT: Duration = Duration::from_mins(2);
}
100
/// An ActorMesh is a collection of ranked A-typed actors.
///
/// Bound note: `A: Referable` because the mesh stores/returns
/// `ActorRef<A>`, which is only defined for `A: Referable`.
#[derive(Debug)]
pub struct ActorMesh<A: Referable> {
    /// The proc mesh this actor mesh was constructed with. Its id and region
    /// are used to build `current_ref` in `new`.
    proc_mesh: ProcMeshRef,
    /// Identifier of this actor mesh.
    id: ActorMeshId,
    /// The reference view of this mesh; `ActorMesh` derefs to it.
    current_ref: ActorMeshRef<A>,
    /// If present, this is the controller for the mesh. The controller ensures
    /// the mesh is stopped when the actor owning it is stopped, and can provide
    /// supervision events via subscribing.
    /// It may not be present for some types of actors, typically system actors
    /// such as ProcAgent or CommActor.
    controller: Option<ActorRef<ActorMeshController<A>>>,
}
117
118// `A: Referable` for the same reason as the struct: the mesh holds `ActorRef<A>`.
119impl<A: Referable> ActorMesh<A> {
120    pub(crate) fn new(
121        proc_mesh: ProcMeshRef,
122        id: ActorMeshId,
123        controller: Option<ActorRef<ActorMeshController<A>>>,
124        members: Arc<ValueMesh<ActorAddr>>,
125    ) -> Self {
126        let current_ref = ActorMeshRef::new(
127            id.clone(),
128            Some(proc_mesh.id().clone()),
129            proc_mesh.region().clone(),
130            controller.clone(),
131            members,
132        );
133
134        Self {
135            proc_mesh,
136            id,
137            current_ref,
138            controller,
139        }
140    }
141
142    pub fn id(&self) -> &ActorMeshId {
143        &self.id
144    }
145
146    pub(crate) fn set_controller(&mut self, controller: Option<ActorRef<ActorMeshController<A>>>) {
147        self.controller = controller.clone();
148        self.current_ref.set_controller(controller);
149    }
150
151    /// Stop actors on this mesh across all procs.
152    pub async fn stop(&mut self, cx: &impl context::Actor, reason: String) -> crate::Result<()> {
153        // Remove the controller as an optimization so all future meshes
154        // created from this one (such as slices) know they are already stopped.
155        // Refs and slices on other machines will still be able to query the
156        // controller and will be sent a notification about this stop by the controller
157        // itself.
158        if let Some(controller) = self.controller.take() {
159            // Run the Stop/GetState exchange. We wrap it so that, no matter
160            // how it ends, we can record a single unhealthy event
161            // afterwards. Taking the controller is one-way: once it is gone,
162            // no future call through this handle can retry the stop, so a
163            // silently-still-healthy mesh with a vanished controller would
164            // hide the fact that the stop never reached (or never confirmed)
165            // the actors.
166            let id = self.id.resource_id().clone();
167            let num_ranks = self.current_ref.region().num_ranks();
168            let result: crate::Result<()> = async {
169                controller.post(
170                    cx,
171                    resource::Stop {
172                        id: id.clone(),
173                        reason,
174                    },
175                );
176                // The controller processes messages serially, and its `Stop`
177                // handler already awaits the underlying ProcAgent wait, which
178                // sends its own `WaitRankStatus` to the ProcAgents and
179                // blocks up to `ACTOR_SPAWN_MAX_IDLE` for the actors to
180                // reach `Stopped`. By the time the controller gets to this
181                // `GetState`, its `health_state.statuses` already reflects
182                // the outcome (Stopping, Stopped, Failed, or Timeout on
183                // abort-budget exhaustion). We just need to serialize
184                // behind the Stop handler and read the result.
185                let (port, mut rx) = cx.mailbox().open_port();
186                controller.post(
187                    cx,
188                    resource::GetState::<resource::mesh::State<()>> {
189                        id: id.clone(),
190                        reply: port.bind(),
191                    },
192                );
193                let statuses = rx.recv().await?;
194                let Some(state) = &statuses.state else {
195                    return Err(Error::Other(anyhow::anyhow!(
196                        "non-existent state in GetState reply from controller: {}",
197                        controller.actor_addr()
198                    )));
199                };
200                // `is_terminating` accepts Stopping, Stopped, Failed, and
201                // Timeout. The controller's Stop handler has already
202                // awaited (or timed out) the underlying ProcAgent wait, so
203                // any rank still in Running here means the controller
204                // never processed the stop for that rank — a genuine
205                // error.
206                let all_terminating = state.statuses.values().all(|s| s.is_terminating());
207                if !all_terminating {
208                    let legacy = mesh_to_rankedvalues_with_default(
209                        &state.statuses,
210                        resource::Status::NotExist,
211                        resource::Status::is_not_exist,
212                        num_ranks,
213                    );
214                    return Err(Error::ActorStopError { statuses: legacy });
215                }
216                Ok(())
217            }
218            .await;
219
220            // Record the unhealthy event regardless of outcome. On success
221            // the mesh is stopped; on failure the controller is gone and
222            // the actors may still be running, but callers need to see the
223            // mesh as unhealthy either way so they stop treating it as
224            // live.
225            let status = match &result {
226                Ok(()) => ActorStatus::Stopped("mesh stopped".to_string()),
227                Err(e) => ActorStatus::Stopped(format!("mesh stop failed: {e}")),
228            };
229            let mut entry = self.health_state.entry(cx).or_default();
230            let health_state = entry.get_mut();
231            health_state.unhealthy_event = Some(Unhealthy::StreamClosed(MeshFailure {
232                actor_mesh_name: Some(self.id().to_string()),
233                event: ActorSupervisionEvent::new(
234                    // Use an actor id from the mesh.
235                    ndslice::view::Ranked::get(&self.current_ref, 0)
236                        .unwrap()
237                        .actor_addr()
238                        .clone(),
239                    None,
240                    status,
241                    None,
242                ),
243                crashed_ranks: vec![],
244            }));
245
246            result?;
247        }
248        // Also take the controller from the ref, since that is used for
249        // some operations.
250        self.current_ref.controller.take();
251        Ok(())
252    }
253}
254
255impl<A: Referable> fmt::Display for ActorMesh<A> {
256    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
257        write!(f, "{}", self.current_ref)
258    }
259}
260
261impl<A: Referable> Deref for ActorMesh<A> {
262    type Target = ActorMeshRef<A>;
263
264    fn deref(&self) -> &Self::Target {
265        &self.current_ref
266    }
267}
268
269/// Manual implementation of Clone because `A` doesn't need to implement Clone
270/// but we still want to be able to clone the ActorMesh.
271impl<A: Referable> Clone for ActorMesh<A> {
272    fn clone(&self) -> Self {
273        Self {
274            proc_mesh: self.proc_mesh.clone(),
275            id: self.id.clone(),
276            current_ref: self.current_ref.clone(),
277            controller: self.controller.clone(),
278        }
279    }
280}
281
282impl<A: Referable> Drop for ActorMesh<A> {
283    fn drop(&mut self) {
284        tracing::info!(
285            name = "ActorMeshStatus",
286            actor_name = %self.id,
287            status = "Dropped",
288        );
289    }
290}
291
292/// Influences paging behavior for the lazy cache. Smaller pages
293/// reduce over-allocation for sparse access; larger pages reduce the
294/// number of heap allocations for contiguous scans.
295const DEFAULT_PAGE: usize = 1024;
296
297/// A lazily materialized page of ActorRefs.
298struct Page<A: Referable> {
299    slots: Box<[OnceCell<ActorRef<A>>]>,
300}
301
302impl<A: Referable> Page<A> {
303    fn new(len: usize) -> Self {
304        let mut v = Vec::with_capacity(len);
305        for _ in 0..len {
306            v.push(OnceCell::new());
307        }
308        Self {
309            slots: v.into_boxed_slice(),
310        }
311    }
312}
313
314#[derive(Default)]
315struct HealthState {
316    unhealthy_event: Option<Unhealthy>,
317    crashed_ranks: HashMap<usize, ActorSupervisionEvent>,
318}
319
320impl HealthState {
321    fn failure_for_region(&self, region: &Region) -> Option<MeshFailure> {
322        let unhealthy = self.unhealthy_event.as_ref()?;
323        let mut failure = match unhealthy {
324            Unhealthy::StreamClosed(failure) | Unhealthy::Crashed(failure) => failure.clone(),
325        };
326        if failure.crashed_ranks.is_empty() {
327            return Some(failure);
328        }
329        let mut crashed_ranks = self
330            .crashed_ranks
331            .keys()
332            .copied()
333            .filter(|rank| region.slice().contains(*rank))
334            .collect::<Vec<_>>();
335        crashed_ranks.sort_unstable();
336        if crashed_ranks.is_empty() {
337            return None;
338        }
339        failure.crashed_ranks = crashed_ranks;
340        Some(failure)
341    }
342}
343
344impl std::fmt::Debug for HealthState {
345    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
346        f.debug_struct("HealthState")
347            .field("unhealthy_event", &self.unhealthy_event)
348            .field("crashed_ranks", &self.crashed_ranks)
349            .finish()
350    }
351}
352
/// Value published on the supervision watch channel: either a message from
/// the underlying port, or a terminal failure/timeout marker.
#[derive(Clone)]
enum MessageOrFailure<M: Send + Sync + Clone + Default + 'static> {
    /// A message received from the port.
    Message(M),
    // anyhow::Error and MailboxError are not clone-able, which we need to move
    // out of a tokio watch Ref.
    /// A receive error, carried as its string rendering.
    Failure(String),
    /// No message arrived within the watchdog timeout.
    Timeout,
}

/// The default is a `Message` wrapping `M`'s own default.
impl<M> Default for MessageOrFailure<M>
where
    M: Send + Sync + Clone + Default + 'static,
{
    fn default() -> Self {
        let initial = M::default();
        MessageOrFailure::Message(initial)
    }
}
367
368fn default_cast_tiling_policy() -> TilingPolicy {
369    TilingPolicy::BoundedFanout {
370        fanout: hyperactor_config::global::get(MAX_CAST_FANOUT).into(),
371    }
372}
373
374#[derive(Clone)]
375struct ActorMeshCastDomain {
376    id: CastDomainId,
377    members: Arc<ValueMesh<ActorAddr>>,
378    region: Region,
379    tiling_policy: TilingPolicy,
380    cast_domain: ActorLocal<CastDomainRef>,
381}
382
383impl std::fmt::Debug for ActorMeshCastDomain {
384    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
385        f.debug_struct("ActorMeshCastDomain")
386            .field("id", &self.id)
387            .field("members", &self.members)
388            .field("region", &self.region)
389            .field("tiling_policy", &self.tiling_policy)
390            .finish_non_exhaustive()
391    }
392}
393
394impl ActorMeshCastDomain {
395    fn new(members: Arc<ValueMesh<ActorAddr>>, region: Region) -> Self {
396        Self {
397            id: CastDomainId::new(),
398            members,
399            region,
400            tiling_policy: default_cast_tiling_policy(),
401            cast_domain: ActorLocal::new(),
402        }
403    }
404
405    fn ensure_materialized(
406        &self,
407        cx: &impl context::Actor,
408        headers: &Flattrs,
409    ) -> anyhow::Result<CastDomainRef> {
410        if let hyperactor::actor_local::Entry::Occupied(cast_domain) = self.cast_domain.entry(cx) {
411            return Ok(cast_domain.get().clone());
412        }
413
414        let members =
415            self.region
416                .slice()
417                .iter()
418                .map(|rank| {
419                    let member = self.members.get_by_base_rank(rank).ok_or_else(|| {
420                        anyhow::anyhow!("missing cast-domain member for rank {rank}")
421                    })?;
422                    Ok((rank, member.clone()))
423                })
424                .collect::<anyhow::Result<HashMap<_, _>>>()?;
425
426        let cast_domain = self.id.clone().materialize(
427            cx,
428            members,
429            self.region.clone(),
430            self.tiling_policy,
431            headers.clone(),
432        )?;
433
434        self.cast_domain.entry(cx).or_insert(cast_domain.clone());
435
436        Ok(cast_domain)
437    }
438
439    fn members(&self) -> &ValueMesh<ActorAddr> {
440        &self.members
441    }
442
443    fn region(&self) -> &Region {
444        &self.region
445    }
446}
447
448impl Serialize for ActorMeshCastDomain {
449    fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
450    where
451        S: Serializer,
452    {
453        (&self.id, &self.members, &self.region, self.tiling_policy).serialize(serializer)
454    }
455}
456
457impl<'de> Deserialize<'de> for ActorMeshCastDomain {
458    fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
459    where
460        D: Deserializer<'de>,
461    {
462        let (id, members, region, tiling_policy) = <(
463            CastDomainId,
464            Arc<ValueMesh<ActorAddr>>,
465            Region,
466            TilingPolicy,
467        )>::deserialize(deserializer)?;
468        Ok(Self {
469            id,
470            members,
471            region,
472            tiling_policy,
473            cast_domain: ActorLocal::new(),
474        })
475    }
476}
477
478/// Turn the single-owner PortReceiver into a watch receiver, which can be
479/// cloned and subscribed to. Requires a default message to pre-populate with.
480/// Option can be used as M to provide a default of None.
481fn into_watch<M: Send + Sync + Clone + Default + 'static>(
482    mut rx: PortReceiver<M>,
483) -> watch::Receiver<MessageOrFailure<M>> {
484    let (sender, receiver) = watch::channel(MessageOrFailure::<M>::default());
485    // Apply a watchdog timeout to the supervision stream. If no
486    // supervision message (healthy or unhealthy) is observed within
487    // this window, we assume the controller is unreachable and
488    // surface a terminal failure on the watch channel. This is a
489    // watchdog against indefinite silence, not a message-delivery
490    // guarantee, and may conservatively treat a quiet but healthy
491    // controller as failed.
492    let timeout = hyperactor_config::global::get(SUPERVISION_WATCHDOG_TIMEOUT);
493    let poll_frequency = hyperactor_config::global::get(SUPERVISION_POLL_FREQUENCY);
494    let get_actor_state_max_idle = hyperactor_config::global::get(GET_ACTOR_STATE_MAX_IDLE);
495    let get_proc_state_max_idle = hyperactor_config::global::get(GET_PROC_STATE_MAX_IDLE);
496    let total_time = poll_frequency + get_actor_state_max_idle + get_proc_state_max_idle;
497    if timeout < total_time {
498        tracing::warn!(
499            "HYPERACTOR_MESH_SUPERVISION_WATCHDOG_TIMEOUT={} is too short. It should be >= {} (SUPERVISION_POLL_FREQUENCY={} + GET_ACTOR_STATE_MAX_IDLE={} + GET_PROC_STATE_MAX_IDLE={})",
500            humantime::format_duration(timeout),
501            humantime::format_duration(total_time),
502            humantime::format_duration(poll_frequency),
503            humantime::format_duration(get_actor_state_max_idle),
504            humantime::format_duration(get_proc_state_max_idle),
505        );
506    }
507    tokio::spawn(async move {
508        loop {
509            let message = match tokio::time::timeout(timeout, rx.recv()).await {
510                Ok(Ok(msg)) => MessageOrFailure::Message(msg),
511                Ok(Err(e)) => MessageOrFailure::Failure(e.to_string()),
512                Err(_) => MessageOrFailure::Timeout,
513            };
514            let is_failure = matches!(
515                message,
516                MessageOrFailure::Failure(_) | MessageOrFailure::Timeout
517            );
518            if sender.send(message).is_err() {
519                // After a sending error, exit the task.
520                break;
521            }
522            if is_failure {
523                // No need to keep polling if we've received an error or timeout.
524                break;
525            }
526        }
527    });
528    receiver
529}
530
/// A reference to a stable snapshot of an [`ActorMesh`].
#[derive(typeuri::Named)]
pub struct ActorMeshRef<A: Referable> {
    /// Identifier of the actor mesh this ref points at.
    id: ActorMeshId,
    /// Id of the proc mesh backing this actor mesh, if any. Retained as a
    /// lightweight id (not the `ProcMeshRef`) so telemetry can derive the same
    /// mesh id as creation time without holding proc-mesh state. `None` for
    /// meshes not backed by a user proc mesh (e.g. the host-agent mesh).
    proc_mesh_id: Option<ProcMeshId>,
    /// Reference to a remote controller actor living on the proc that spawned
    /// the actors in this ref. If None, the actor mesh was already stopped, or
    /// this is a mesh ref to a "system actor" which has no controller and should
    /// not be stopped. If Some, the actor mesh may still be stopped, and the
    /// next_supervision_event function can be used to alert that the mesh has
    /// stopped.
    controller: Option<ActorRef<ActorMeshController<A>>>,

    /// Cast-domain handle for this mesh view.
    ///
    /// A cast domain is the `hyperactor_cast` routing state used to deliver a
    /// mesh-wide cast without iterating over every destination actor. The
    /// descriptor can be carried by pure slices; routing setup is fenced by the
    /// first cast from the caller that uses this ref.
    cast_domain: ActorMeshCastDomain,
    /// Recorded health issues with the mesh, to quickly consult before sending
    /// out any casted messages. This is a locally updated copy of the authoritative
    /// state stored on the ActorMeshController.
    health_state: ActorLocal<HealthState>,
    /// Shared cloneable receiver for supervision events, used by next_supervision_event.
    /// Needs tokio mutex because it is held across an await point.
    /// Should not be shared across actors because each actor context needs its
    /// own subscriber.
    receiver: ActorLocal<
        Arc<
            tokio::sync::Mutex<(
                PortRef<Option<MeshFailure>>,
                watch::Receiver<MessageOrFailure<Option<MeshFailure>>>,
            )>,
        >,
    >,
    /// Lazily allocated collection of pages:
    /// - The outer `OnceCell` defers creating the vector until first
    ///   use.
    /// - The `Vec` holds slots for multiple pages.
    /// - Each slot is itself a `OnceCell<Box<Page<A>>>`, so that each
    ///   page can be initialized on demand.
    /// - A `Page<A>` is a boxed slice of `OnceCell<ActorRef<A>>`,
    ///   i.e. the actual storage for actor references within that
    ///   page.
    pages: OnceCell<Vec<OnceCell<Box<Page<A>>>>>,
    /// Page size knob (not serialized; reset to the default after
    /// deserialization).
    page_size: usize,
}
584
585impl<A: Referable> ActorMeshRef<A> {
586    fn cached_failure(&self, cx: &impl context::Actor) -> Option<MeshFailure> {
587        let health_state = self.health_state.entry(cx).or_default();
588        health_state
589            .get()
590            .failure_for_region(ndslice::view::Ranked::region(self))
591    }
592
593    /// Cast a message to all the actors in this mesh
594    #[allow(clippy::result_large_err)]
595    pub fn cast<M>(&self, cx: &impl context::Actor, message: M) -> crate::Result<()>
596    where
597        A: RemoteHandles<M>,
598        M: RemoteMessage + Clone, // Clone is required until we are fully onto comm actor
599    {
600        self.cast_with_headers(cx, &Flattrs::new(), message)
601    }
602
603    /// Cast a message to all the actors in this mesh, merging
604    /// caller-supplied `caller_headers` into the per-rank envelope
605    /// headers before send. Used to propagate caller-known context
606    /// (e.g. operation-context keys marked with `OPERATION_CONTEXT_HEADER`)
607    /// onto the outgoing request so receivers can project it back
608    /// onto replies.
609    #[allow(clippy::result_large_err)]
610    pub fn cast_with_headers<M>(
611        &self,
612        cx: &impl context::Actor,
613        caller_headers: &Flattrs,
614        message: M,
615    ) -> crate::Result<()>
616    where
617        A: RemoteHandles<M>,
618        M: RemoteMessage + Clone,
619    {
620        self.check_cached_failure(cx)?;
621        self.emit_sent_message_telemetry(cx, view::Ranked::region(self));
622
623        let mut headers = caller_headers.clone();
624        headers.set(
625            multicast::CAST_ORIGINATING_SENDER,
626            cx.instance().self_addr().clone(),
627        );
628        headers.set(crate::casting::CAST_ACTOR_MESH_ID, self.id.clone());
629
630        let threshold =
631            hyperactor_config::global::get(crate::config::V1_CAST_POINT_TO_POINT_THRESHOLD);
632
633        let num_ranks = self.len();
634
635        match num_ranks {
636            0 => Ok(()),
637            1 if threshold >= 1 => {
638                // Avoid paying tax of conversion to IndexedErasedUnbound and port splitting
639                // when the threshold enables direct singleton sends.
640                let point = self
641                    .cast_domain
642                    .region()
643                    .extent()
644                    .point_of_rank(0)
645                    .map_err(|err| Error::CastingError(self.id.clone(), err.into()))?;
646
647                let actor = self.materialize(0).ok_or_else(|| {
648                    Error::CastingError(
649                        self.id.clone(),
650                        anyhow::anyhow!("missing actor for rank 0"),
651                    )
652                })?;
653
654                self.post_cast_direct(cx, point, actor, message, &headers)
655            }
656            n if threshold > 0 && n < threshold => {
657                // Point-to-point: send directly to each destination actor,
658                // bypassing the comm actor tree for lower latency when fanout
659                // is small.
660                let sender = cx.instance().self_addr().clone();
661                let dest_port = M::port();
662                let mut data =
663                    wirevalue::Any::<wirevalue::encoding::Multipart>::serialize(&message)
664                        .expect("cast message serialization should not fail");
665
666                // Split ports for N destinations, matching the comm tree's
667                // split_ports behavior.
668                data.visit_multipart_parts_mut::<PortRefRepr, anyhow::Error>(|port| {
669                    if port.unsplit() {
670                        return Ok(());
671                    }
672                    let split = port.port_addr().split(
673                        cx,
674                        port.reducer_spec().clone(),
675                        ReducerMode::Streaming(port.streaming_opts().clone()),
676                        port.get_return_undeliverable(),
677                    )?;
678                    port.update_port_addr(split);
679                    Ok(())
680                })
681                .map_err(|e| Error::CastingError(self.id.clone(), e))?;
682
683                data.visit_multipart_parts_mut::<OncePortRefRepr, anyhow::Error>(|port| {
684                    if port.unsplit() || port.reducer_spec().is_none() {
685                        // Once ports without reducers pass through. If used more
686                        // than once, only one destination can reply.
687                        return Ok(());
688                    }
689                    let split = port.port_addr().split(
690                        cx,
691                        port.reducer_spec().clone(),
692                        ReducerMode::Once(n),
693                        port.get_return_undeliverable(),
694                    )?;
695                    port.update_port_addr(split);
696                    Ok(())
697                })
698                .map_err(|e| Error::CastingError(self.id.clone(), e))?;
699
700                for rank in 0..n {
701                    let point = self
702                        .cast_domain
703                        .region()
704                        .extent()
705                        .point_of_rank(rank)
706                        .map_err(|err| Error::CastingError(self.id.clone(), err.into()))?;
707
708                    let actor = self.materialize(rank).ok_or_else(|| {
709                        Error::CastingError(
710                            self.id.clone(),
711                            anyhow::anyhow!("missing actor for rank {rank}"),
712                        )
713                    })?;
714
715                    let mut rank_data = data.clone();
716
717                    rank_data
718                        .visit_multipart_parts_mut::<resource::RankRepr, anyhow::Error>(
719                            |resource::RankRepr(rank)| {
720                                *rank = Some(point.rank());
721                                Ok(())
722                            },
723                        )
724                        .map_err(|e| Error::CastingError(self.id.clone(), e))?;
725
726                    let mut rank_headers = headers.clone();
727
728                    multicast::set_cast_info_on_headers(&mut rank_headers, point, sender.clone());
729
730                    cx.instance().post(
731                        actor
732                            .actor_addr()
733                            .port_addr(Port::handler_id(dest_port, None)),
734                        rank_headers,
735                        rank_data.erase_encoding(),
736                    );
737                }
738
739                Ok(())
740            }
741            _ => self
742                .cast_domain
743                .ensure_materialized(cx, &headers)
744                .map_err(|e| Error::CastingError(self.id.clone(), e))?
745                .cast(cx, headers, message)
746                .map_err(|e| Error::CastingError(self.id.clone(), e)),
747        }
748    }
749
750    /// Cast a message to one randomly chosen actor in this mesh, merging
751    /// caller-supplied `caller_headers` into the outgoing envelope.
752    #[allow(clippy::result_large_err)]
753    pub fn cast_choose_with_headers<M>(
754        &self,
755        cx: &impl context::Actor,
756        caller_headers: &Flattrs,
757        message: M,
758    ) -> crate::Result<()>
759    where
760        A: RemoteHandles<M>,
761        M: RemoteMessage + Clone,
762    {
763        self.check_cached_failure(cx)?;
764        self.emit_sent_message_telemetry(
765            cx,
766            &Region::new(
767                Vec::new(),
768                ndslice::Slice::new(0, Vec::new(), Vec::new())
769                    .expect("zero-dimensional slice is valid"),
770            ),
771        );
772
773        let num_ranks = self.cast_domain.region().num_ranks();
774
775        if num_ranks == 0 {
776            return Ok(());
777        }
778
779        let rank_index = rand::random::<u64>() as usize % num_ranks;
780
781        let point = self
782            .cast_domain
783            .region()
784            .extent()
785            .point_of_rank(rank_index)
786            .map_err(|err| Error::CastingError(self.id.clone(), err.into()))?;
787
788        let actor = self.materialize(rank_index).ok_or_else(|| {
789            Error::CastingError(
790                self.id.clone(),
791                anyhow::anyhow!("missing actor for chosen rank {rank_index}"),
792            )
793        })?;
794
795        self.post_cast_direct(cx, point, actor, message, caller_headers)
796    }
797
798    #[allow(clippy::result_large_err)]
799    fn check_cached_failure(&self, cx: &impl context::Actor) -> crate::Result<()> {
800        // First check if the mesh is already dead before sending out any messages
801        // to a possibly undeliverable actor.
802        if let Some(failure) = self.cached_failure(cx) {
803            tracing::debug!(
804                actor_mesh = %self.id,
805                crashed_ranks = ?failure.crashed_ranks,
806                "rejecting cast due to cached supervision failure"
807            );
808            return Err(crate::Error::Supervision(Box::new(failure)));
809        }
810
811        Ok(())
812    }
813
814    fn emit_sent_message_telemetry(&self, cx: &impl context::Actor, region: &Region) {
815        hyperactor_telemetry::notify_sent_message(hyperactor_telemetry::SentMessageEvent {
816            timestamp: std::time::SystemTime::now(),
817            sender_actor_id: hyperactor_telemetry::hash_to_u64(cx.mailbox().actor_addr().id()),
818            actor_mesh_id: match &self.proc_mesh_id {
819                Some(proc_mesh_id) => telemetry_actor_mesh_id(proc_mesh_id, &self.id),
820                // No backing proc mesh (e.g. the host-agent mesh): key telemetry
821                // on the actor mesh id alone.
822                None => hyperactor_telemetry::hash_to_u64(&self.id),
823            },
824            view_json: serde_json::to_string(region).unwrap_or_default(),
825            shape_json: {
826                let shape: ndslice::Shape = region.into();
827                serde_json::to_string(&shape).unwrap_or_default()
828            },
829        });
830    }
831
832    #[allow(clippy::result_large_err)]
833    fn post_cast_direct<M>(
834        &self,
835        cx: &impl context::Actor,
836        point: ndslice::Point,
837        actor: &ActorRef<A>,
838        message: M,
839        caller_headers: &Flattrs,
840    ) -> crate::Result<()>
841    where
842        A: RemoteHandles<M>,
843        M: RemoteMessage,
844    {
845        let create_rank = point.rank();
846        let mut headers = caller_headers.clone();
847        multicast::set_cast_info_on_headers(&mut headers, point, cx.instance().self_addr().clone());
848
849        // Make sure that we rewrite ranks, as these may be used for
850        // bootstrapping comm actors.
851        let mut data = wirevalue::Any::<wirevalue::encoding::Multipart>::serialize(&message)
852            .map_err(|e| Error::CastingError(self.id.clone(), e.into()))?;
853        data.visit_multipart_parts_mut::<resource::RankRepr, anyhow::Error>(
854            |resource::RankRepr(rank)| {
855                *rank = Some(create_rank);
856                Ok(())
857            },
858        )
859        .map_err(|e| Error::CastingError(self.id.clone(), e))?;
860        let rebound_message = data
861            .deserialized_unchecked()
862            .map_err(|e| Error::CastingError(self.id.clone(), e.into()))?;
863        actor.post_with_headers(cx, headers, rebound_message);
864        Ok(())
865    }
866
867    pub(crate) fn new(
868        id: ActorMeshId,
869        proc_mesh_id: Option<ProcMeshId>,
870        region: Region,
871        controller: Option<ActorRef<ActorMeshController<A>>>,
872        members: Arc<ValueMesh<ActorAddr>>,
873    ) -> Self {
874        Self::with_page_size(id, proc_mesh_id, region, DEFAULT_PAGE, controller, members)
875    }
876
877    pub fn id(&self) -> &ActorMeshId {
878        &self.id
879    }
880
881    pub(crate) fn with_page_size(
882        id: ActorMeshId,
883        proc_mesh_id: Option<ProcMeshId>,
884        region: Region,
885        page_size: usize,
886        controller: Option<ActorRef<ActorMeshController<A>>>,
887        members: Arc<ValueMesh<ActorAddr>>,
888    ) -> Self {
889        Self::with_cast_domain(
890            id,
891            proc_mesh_id,
892            controller,
893            ActorMeshCastDomain::new(members, region),
894            page_size,
895        )
896    }
897
898    fn with_cast_domain(
899        id: ActorMeshId,
900        proc_mesh_id: Option<ProcMeshId>,
901        controller: Option<ActorRef<ActorMeshController<A>>>,
902        cast_domain: ActorMeshCastDomain,
903        page_size: usize,
904    ) -> Self {
905        Self {
906            id,
907            proc_mesh_id,
908            controller,
909            cast_domain,
910            health_state: ActorLocal::new(),
911            receiver: ActorLocal::new(),
912            pages: OnceCell::new(),
913            page_size: page_size.max(1),
914        }
915    }
916
917    #[inline]
918    fn len(&self) -> usize {
919        self.cast_domain.region().num_ranks()
920    }
921
922    pub fn controller(&self) -> &Option<ActorRef<ActorMeshController<A>>> {
923        &self.controller
924    }
925
926    fn set_controller(&mut self, controller: Option<ActorRef<ActorMeshController<A>>>) {
927        self.controller = controller;
928    }
929
930    fn ensure_pages(&self) -> &Vec<OnceCell<Box<Page<A>>>> {
931        let n = self.len().div_ceil(self.page_size); // ⌈len / page_size⌉
932        self.pages
933            .get_or_init(|| (0..n).map(|_| OnceCell::new()).collect())
934    }
935
936    fn materialize(&self, rank: usize) -> Option<&ActorRef<A>> {
937        let len = self.len();
938        if rank >= len {
939            return None;
940        }
941        let cast_domain = &self.cast_domain;
942        let p = self.page_size;
943        let page_ix = rank / p;
944        let local_ix = rank % p;
945
946        let pages = self.ensure_pages();
947        let page = pages[page_ix].get_or_init(|| {
948            // Last page may be partial.
949            let base = page_ix * p;
950            let remaining = len - base;
951            let page_len = remaining.min(p);
952            Box::new(Page::<A>::new(page_len))
953        });
954
955        Some(page.slots[local_ix].get_or_init(|| {
956            // AM-1: see module doc. The cast domain member map is in the
957            // same dense local-rank order as this mesh ref's region.
958            debug_assert!(rank < self.len(), "rank must be within [0, len)");
959            ActorRef::attest(
960                view::Ranked::get(cast_domain.members(), rank)
961                    .expect("rank must be present in cast-domain member map")
962                    .clone(),
963            )
964        }))
965    }
966
967    fn init_supervision_receiver(
968        controller: &ActorRef<ActorMeshController<A>>,
969        cx: &impl context::Actor,
970    ) -> (
971        PortRef<Option<MeshFailure>>,
972        watch::Receiver<MessageOrFailure<Option<MeshFailure>>>,
973    ) {
974        let (tx, rx) = cx.mailbox().open_port();
975        let tx = tx.bind();
976        controller.post(cx, Subscribe(tx.clone()));
977        (tx, into_watch(rx))
978    }
979
    /// Returns the next supervision event occurring on this mesh. Await this
    /// simultaneously with the return result of a message (such as awaiting a reply after a cast)
    /// to get back a message that indicates the actor that failed, instead of
    /// waiting forever for a reply.
    /// If there are multiple simultaneous awaits of next_supervision_event,
    /// all of them will receive the same event.
    ///
    /// A failure already cached for `cx` is returned immediately without
    /// contacting the controller. Otherwise the event that is returned is
    /// also recorded in this reference's health state for `cx`.
    ///
    /// # Errors
    ///
    /// Returns an error if there is no cached failure and this reference has
    /// no controller, if waiting on the supervision receiver fails, or if the
    /// receiver yields `MessageOrFailure::Failure`. A
    /// `MessageOrFailure::Timeout` is *not* an error: it is returned as a
    /// synthesized `MeshFailure` with no crashed ranks.
    pub async fn next_supervision_event(
        &self,
        cx: &impl context::Actor,
    ) -> Result<MeshFailure, anyhow::Error> {
        // Fast path: a failure has already been observed.
        if let Some(failure) = self.cached_failure(cx) {
            tracing::debug!(
                actor_mesh = %self.id,
                crashed_ranks = ?failure.crashed_ranks,
                "returning cached supervision failure"
            );
            return Ok(failure);
        }
        // Without a controller there is nobody to subscribe to.
        let controller = if let Some(c) = self.controller() {
            c
        } else {
            return Err(anyhow::anyhow!(
                "unexpected healthy state while controller is gone"
            ));
        };
        let rx = {
            // Make sure to create only one PortReceiver per context.
            let entry = self.receiver.entry(cx).or_insert_with(|| {
                Arc::new(tokio::sync::Mutex::new(Self::init_supervision_receiver(
                    controller, cx,
                )))
            });
            // Need to clone so the lifetime is disconnected from entry, which
            // isn't Send so can't be held across an await point.
            Arc::clone(entry.get())
        };
        let message = {
            let mut rx = rx.lock().await;
            // Kept so we can unsubscribe this exact port on failure below.
            let subscriber_port = rx.0.clone();
            let message =
                rx.1.wait_for(|message| {
                    // Filter out messages that do not apply to these ranks. This
                    // is relevant for slices since we get messages back for the
                    // whole mesh.
                    if let MessageOrFailure::Message(message) = message {
                        if let Some(message) = &message {
                            let region = ndslice::view::Ranked::region(self).slice();
                            if message.crashed_ranks.is_empty() {
                                // Whole-mesh event (e.g. mesh stop).
                                true
                            } else {
                                // Accept if any crashed rank overlaps with
                                // this slice's region.
                                message.crashed_ranks.iter().any(|r| region.contains(*r))
                            }
                        } else {
                            // Filter out messages that are not failures. These are used
                            // to ensure the controller is still reachable, but are not
                            // otherwise interesting.
                            false
                        }
                    } else {
                        // either failure case is interesting
                        true
                    }
                })
                .await?;
            // Clone the accepted value out of the watch borrow.
            let message = message.clone();
            let is_failure = matches!(
                message,
                MessageOrFailure::Failure(_) | MessageOrFailure::Timeout
            );
            if is_failure {
                // In failure cases, the receiver is dropped, so we can unsubscribe
                // from the controller. The controller can detect this
                // on its own, but an explicit unsubscribe prevents error logs
                // about this receiver being unreachable.
                let mut port = controller.port();
                // We don't care if the controller is unreachable for an unsubscribe.
                port.return_undeliverable(false);
                let _ = port.post(cx, Unsubscribe(subscriber_port));
            }
            // If we successfully got a message back, we can't unsubscribe because
            // the receiver might be shared with other calls to next_supervision_event,
            // or with clones of this ActorMeshRef.
            match message {
                MessageOrFailure::Message(message) => Ok::<MeshFailure, anyhow::Error>(
                    message.expect("filter excludes any None messages"),
                ),
                MessageOrFailure::Failure(failure) => Err(anyhow::anyhow!("{}", failure)),
                MessageOrFailure::Timeout => {
                    // Treat timeout from controller as a supervision failure,
                    // the controller is unreachable.
                    Ok(MeshFailure {
                        actor_mesh_name: Some(self.id().to_string()),
                        event: ActorSupervisionEvent::new(
                            controller.actor_addr().clone(),
                            None,
                            ActorStatus::generic_failure(format!(
                                "timed out reaching controller {} for mesh {}. Assuming controller's proc is dead",
                                controller.actor_addr(),
                                self.id()
                            )),
                            None,
                        ),
                        crashed_ranks: vec![],
                    })
                }
            }?
        };
        // Update the health state now that we have received a message.
        let event = &message.event;
        // Make sure not to hold this lock across an await point.
        let mut entry = self.health_state.entry(cx).or_default();
        let health_state = entry.get_mut();
        // Per-rank crash records are only written for `Failed` events.
        if let ActorStatus::Failed(_) = event.actor_status {
            for &rank in &message.crashed_ranks {
                health_state.crashed_ranks.insert(rank, event.clone());
            }
        }
        // The mesh-level unhealthy marker is overwritten on every event:
        // `Failed` -> `Crashed`, `Stopped` -> `StreamClosed`, anything else
        // resets it to `None`.
        health_state.unhealthy_event = match &event.actor_status {
            ActorStatus::Failed(_) => Some(Unhealthy::Crashed(message.clone())),
            ActorStatus::Stopped(_) => Some(Unhealthy::StreamClosed(message.clone())),
            _ => None,
        };
        Ok(message)
    }
1107
1108    /// Same as Clone, but includes a shared supervision receiver. This copy will
1109    /// share the same health state and get the same supervision events.
1110    /// Will have a separate cache.
1111    pub fn clone_with_supervision_receiver(&self) -> Self {
1112        Self {
1113            id: self.id.clone(),
1114            proc_mesh_id: self.proc_mesh_id.clone(),
1115            controller: self.controller.clone(),
1116            cast_domain: self.cast_domain.clone(),
1117            health_state: self.health_state.clone(),
1118            receiver: self.receiver.clone(),
1119            // Cache does not support Clone at this time.
1120            pages: OnceCell::new(),
1121            page_size: self.page_size,
1122        }
1123    }
1124}
1125
1126impl<A: Referable> Clone for ActorMeshRef<A> {
1127    fn clone(&self) -> Self {
1128        Self {
1129            id: self.id.clone(),
1130            proc_mesh_id: self.proc_mesh_id.clone(),
1131            controller: self.controller.clone(),
1132            cast_domain: self.cast_domain.clone(),
1133            // Cloning should not use the same health state or receiver, because
1134            // it should make a new subscriber.
1135            health_state: ActorLocal::new(),
1136            receiver: ActorLocal::new(),
1137            pages: OnceCell::new(), // No clone cache.
1138            page_size: self.page_size,
1139        }
1140    }
1141}
1142
1143impl<A: Referable> fmt::Display for ActorMeshRef<A> {
1144    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
1145        write!(
1146            f,
1147            "{}:{}@{}",
1148            self.id,
1149            A::typename(),
1150            self.cast_domain.region()
1151        )
1152    }
1153}
1154
1155impl<A: Referable> PartialEq for ActorMeshRef<A> {
1156    fn eq(&self, other: &Self) -> bool {
1157        // Value identity: the same mesh (`id`) over the same `region`. `id` is
1158        // cloned through `sliced()` and preserved across serialization, and the
1159        // members are a function of `(id, region)`, so this captures "same
1160        // actors." The cast domain's `domain_id` is a freshly-minted per-slice
1161        // routing token (`Uid::anonymous()`), so it is deliberately excluded —
1162        // two independent slices to the same region denote the same actors and
1163        // must compare equal.
1164        self.id == other.id && self.cast_domain.region() == other.cast_domain.region()
1165    }
1166}
1167impl<A: Referable> Eq for ActorMeshRef<A> {}
1168
1169impl<A: Referable> Hash for ActorMeshRef<A> {
1170    fn hash<H: Hasher>(&self, state: &mut H) {
1171        self.id.hash(state);
1172        self.cast_domain.region().hash(state);
1173    }
1174}
1175
1176impl<A: Referable> fmt::Debug for ActorMeshRef<A> {
1177    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
1178        f.debug_struct("ActorMeshRef")
1179            .field("region", self.cast_domain.region())
1180            .field("id", &self.id)
1181            .field("page_size", &self.page_size)
1182            .finish_non_exhaustive() // No print cache.
1183    }
1184}
1185
1186// Implement Serialize manually, without requiring A: Serialize
1187impl<A: Referable> Serialize for ActorMeshRef<A> {
1188    fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
1189    where
1190        S: Serializer,
1191    {
1192        // Serialize only the fields that don't depend on A.
1193        (
1194            &self.id,
1195            &self.proc_mesh_id,
1196            &self.controller,
1197            &self.cast_domain,
1198        )
1199            .serialize(serializer)
1200    }
1201}
1202
1203// Implement Deserialize manually, without requiring A: Deserialize
1204impl<'de, A: Referable> Deserialize<'de> for ActorMeshRef<A> {
1205    fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
1206    where
1207        D: Deserializer<'de>,
1208    {
1209        let (id, proc_mesh_id, controller, cast_domain) = <(
1210            ActorMeshId,
1211            Option<ProcMeshId>,
1212            Option<ActorRef<ActorMeshController<A>>>,
1213            ActorMeshCastDomain,
1214        )>::deserialize(deserializer)?;
1215        Ok(Self::with_cast_domain(
1216            id,
1217            proc_mesh_id,
1218            controller,
1219            cast_domain,
1220            DEFAULT_PAGE,
1221        ))
1222    }
1223}
1224
1225impl<A: Referable> view::Ranked for ActorMeshRef<A> {
1226    type Item = ActorRef<A>;
1227
1228    #[inline]
1229    fn region(&self) -> &Region {
1230        self.cast_domain.region()
1231    }
1232
1233    #[inline]
1234    fn get(&self, rank: usize) -> Option<&Self::Item> {
1235        self.materialize(rank)
1236    }
1237}
1238
1239impl<A: Referable> view::RankedSliceable for ActorMeshRef<A> {
1240    /// Return a pure slice of this actor mesh.
1241    ///
1242    /// This method cannot install routing state because the trait has no caller
1243    /// context. Instead it carries a lazy cast-domain descriptor. The first cast
1244    /// through the returned ref posts setup from that caller before sending the
1245    /// cast message, preserving normal sender-side stream ordering.
1246    fn sliced(&self, region: Region) -> Self {
1247        // Slices inherit cached failures that were already observed on the parent
1248        // mesh ref so new sub-slices do not race the controller replay path.
1249        // The supervision receiver stays independent because each slice applies
1250        // its own region filter to future updates.
1251        debug_assert!(region.is_subset(view::Ranked::region(self)));
1252        Self {
1253            id: self.id.clone(),
1254            proc_mesh_id: self.proc_mesh_id.clone(),
1255            controller: self.controller.clone(),
1256            cast_domain: ActorMeshCastDomain::new(
1257                Arc::new(self.cast_domain.members().sliced(region.clone())),
1258                region.clone(),
1259            ),
1260            health_state: self.health_state.clone(),
1261            receiver: ActorLocal::new(),
1262            pages: OnceCell::new(),
1263            page_size: self.page_size,
1264        }
1265    }
1266}
1267
1268#[cfg(all(test, fbcode_build))]
1269mod tests {
1270
1271    use std::collections::HashMap;
1272    use std::collections::HashSet;
1273    use std::ops::Deref;
1274    use std::sync::Arc;
1275
1276    use hyperactor::Endpoint as _;
1277    use hyperactor::actor::ActorErrorKind;
1278    use hyperactor::actor::ActorStatus;
1279    use hyperactor::context::Mailbox as _;
1280    use hyperactor::id::Label;
1281    use hyperactor::mailbox;
1282    use ndslice::Extent;
1283    use ndslice::Region;
1284    use ndslice::Slice;
1285    use ndslice::ViewExt;
1286    use ndslice::extent;
1287    use ndslice::view::Ranked;
1288    use ndslice::view::RankedSliceable;
1289    use timed_test::assert_no_process_leak;
1290    use timed_test::async_timed_test;
1291    use tokio::time::Duration;
1292
1293    use super::ActorMesh;
1294    use crate::ActorMeshRef;
1295    use crate::ProcMesh;
1296    use crate::host_mesh::GET_PROC_STATE_MAX_IDLE;
1297    use crate::host_mesh::PROC_SPAWN_MAX_IDLE;
1298    use crate::mesh_controller::SUPERVISION_POLL_FREQUENCY;
1299    use crate::mesh_id::ActorMeshId;
1300    use crate::proc_mesh::ACTOR_SPAWN_MAX_IDLE;
1301    use crate::proc_mesh::GET_ACTOR_STATE_MAX_IDLE;
1302    use crate::supervision::MeshFailure;
1303    use crate::testactor;
1304    use crate::testing;
1305
1306    #[test]
1307    fn test_actor_mesh_ref_is_send_and_sync() {
1308        fn assert_send_sync<T: Send + Sync>() {}
1309        assert_send_sync::<ActorMeshRef<()>>();
1310    }
1311
    /// Exercises the paged, lazily populated `ActorRef` cache of
    /// `ActorMeshRef`: per-rank pointer stability, page boundaries, cache
    /// reset on clone and slice, `Hash`/`Eq` independence from cache state,
    /// and that the materialized refs are usable for posting messages.
    #[tokio::test]
    async fn test_actor_mesh_ref_lazy_materialization() {
        // 1) Bring up procs and spawn actors.
        let instance = testing::instance();
        // Small mesh so the test runs fast, but > page_size so we
        // cross a boundary
        let mut hm = testing::host_mesh(2).await;
        let pm: ProcMesh = hm
            .spawn(instance, "test", extent!(gpus = 2), None, None)
            .await
            .unwrap();
        let am: ActorMesh<testactor::TestActor> = pm.spawn(instance, "test", &()).await.unwrap();

        // 2) Build our ActorMeshRef with a tiny page size (2) to
        // force multiple pages over the 4 ranks:
        // page 0: ranks [0,1], page 1: ranks [2,3]
        let page_size = 2;
        let amr: ActorMeshRef<testactor::TestActor> = ActorMeshRef::with_page_size(
            am.id.clone(),
            am.deref().proc_mesh_id.clone(),
            am.region().clone(),
            page_size,
            None,
            Arc::clone(&am.deref().cast_domain.members),
        );
        assert_eq!(amr.extent(), extent!(hosts = 2, gpus = 2));
        assert_eq!(amr.region().num_ranks(), 4);

        // 3) Within-rank pointer stability (OnceLock caches &ActorRef)
        let p0_a = amr.get(0).expect("rank 0 exists") as *const _;
        let p0_b = amr.get(0).expect("rank 0 exists") as *const _;
        assert_eq!(p0_a, p0_b, "same rank should return same cached pointer");

        // 4) Same page, different rank (both materialize fine)
        let p1_a = amr.get(1).expect("rank 1 exists") as *const _;
        let p1_b = amr.get(1).expect("rank 1 exists") as *const _;
        assert_eq!(p1_a, p1_b, "same rank should return same cached pointer");
        // They're different ranks, so the pointers are different
        // (distinct OnceLocks in the page)
        assert_ne!(p0_a, p1_a, "different ranks have different cache slots");

        // 5) Cross a page boundary (rank 2 is in a different page than rank 0/1)
        let p2_a = amr.get(2).expect("rank 2 exists") as *const _;
        let p2_b = amr.get(2).expect("rank 2 exists") as *const _;
        assert_eq!(p2_a, p2_b, "same rank should return same cached pointer");
        assert_ne!(p0_a, p2_a, "different pages have different cache slots");

        // 6) Clone should drop the cache but keep identity (actor_id)
        let amr_clone = amr.clone();
        let orig_id_0 = amr.get(0).unwrap().actor_addr().clone();
        let clone_id_0 = amr_clone.get(0).unwrap().actor_addr().clone();
        assert_eq!(orig_id_0, clone_id_0, "clone preserves identity");
        let p0_clone = amr_clone.get(0).unwrap() as *const _;
        assert_ne!(
            p0_a, p0_clone,
            "cloned ActorMeshRef has a fresh cache (different pointer)"
        );

        // 7) Slicing preserves page_size and clears cache
        // (RankedSliceable::sliced)
        let sliced = amr.range("hosts", 0..2).expect("slice should be valid"); // leaves 4 ranks
        assert_eq!(sliced.region().num_ranks(), 4);
        assert!(
            sliced.get(0).is_some(),
            "RankedSliceable::sliced preserves a lazy cast-domain descriptor"
        );
        // First access materializes a new cache for the sliced view.
        let sp0_a = sliced.get(0).unwrap() as *const _;
        let sp0_b = sliced.get(0).unwrap() as *const _;
        assert_eq!(sp0_a, sp0_b, "sliced view has its own cache slot per rank");
        // Cross-page inside the slice too (page_size = 2 => pages are
        // [0..2), [2..4)).
        let sp2 = sliced.get(2).unwrap() as *const _;
        assert_ne!(sp0_a, sp2, "sliced view crosses its own page boundary");

        // 8) Hash/Eq ignore cache state; identical identity collapses
        // to one set entry.
        let mut set = HashSet::new();
        set.insert(amr.clone());
        set.insert(amr.clone());
        assert_eq!(set.len(), 1, "cache state must not affect Hash/Eq");

        // 9) As a sanity check, cast to ensure the refs are indeed
        // usable/live.
        let (port, mut rx) = mailbox::open_port(instance);
        // Send to rank 0 and rank 3, the first and last rank of the
        // 2x2 extent (4 ranks in total).
        amr.get(0)
            .expect("rank 0 exists")
            .post(instance, testactor::GetActorId(port.bind()));
        amr.get(3)
            .expect("rank 3 exists")
            .post(instance, testactor::GetActorId(port.bind()));
        let id_a = tokio::time::timeout(Duration::from_secs(3), rx.recv())
            .await
            .expect("timed out waiting for first reply")
            .expect("channel closed before first reply");
        let id_b = tokio::time::timeout(Duration::from_secs(3), rx.recv())
            .await
            .expect("timed out waiting for second reply")
            .expect("channel closed before second reply");
        assert_ne!(id_a, id_b, "two different ranks responded");

        let _ = hm.shutdown(instance).await;
    }
1417
1418    #[async_timed_test(timeout_secs = 300)]
1419    #[cfg(fbcode_build)]
1420    async fn test_actor_mesh_slice_casts_only_to_slice_members() {
1421        let instance = testing::instance();
1422        let mut hm = testing::host_mesh(2).await;
1423        let pm: ProcMesh = hm
1424            .spawn(instance, "test", extent!(gpus = 2), None, None)
1425            .await
1426            .unwrap();
1427        let actor_mesh: ActorMesh<testactor::TestActor> =
1428            pm.spawn(instance, "test", &()).await.unwrap();
1429
1430        {
1431            let host1_region = actor_mesh
1432                .region()
1433                .range("hosts", 1..2)
1434                .expect("host slice should exist");
1435            let host1 = actor_mesh.sliced(host1_region);
1436            // This waits for one reply from every actor in the sliced mesh and
1437            // then asserts that no extra actors replied. Passing `None` does not
1438            // pin exact sequence numbers, but the destination handler still unwraps
1439            // `SEQ_INFO`, so the delivered cast must carry ordering metadata.
1440            testactor::assert_casting_correctness(&host1, instance, None).await;
1441
1442            {
1443                let host1_gpu1_region = host1
1444                    .region()
1445                    .range("gpus", 1..2)
1446                    .expect("nested slice should exist");
1447                let host1_gpu1 = host1.sliced(host1_gpu1_region);
1448                testactor::assert_casting_correctness(&host1_gpu1, instance, None).await;
1449            }
1450        }
1451
1452        let _ = hm.shutdown(instance).await;
1453    }
1454
1455    async fn assert_slice_cast_points(
1456        actor_mesh: &ActorMeshRef<testactor::TestActor>,
1457        instance: &impl hyperactor::context::Actor,
1458    ) {
1459        let (port, mut rx) = mailbox::open_port(instance);
1460        actor_mesh
1461            .cast(
1462                instance,
1463                testactor::GetCastInfo {
1464                    cast_info: port.bind(),
1465                },
1466            )
1467            .unwrap();
1468
1469        let mut expected: HashMap<_, _> = actor_mesh
1470            .values()
1471            .enumerate()
1472            .map(|(rank, actor_ref)| {
1473                (
1474                    actor_ref.actor_addr().clone(),
1475                    actor_mesh
1476                        .extent()
1477                        .point_of_rank(rank)
1478                        .expect("rank must be in-bounds for slice extent"),
1479                )
1480            })
1481            .collect();
1482
1483        while !expected.is_empty() {
1484            let (point, actor_ref, _sender) =
1485                tokio::time::timeout(Duration::from_secs(3), rx.recv())
1486                    .await
1487                    .expect("timed out waiting for cast info")
1488                    .expect("channel closed before receiving cast info");
1489            let expected_point = expected
1490                .remove(actor_ref.actor_addr())
1491                .expect("received cast info from unexpected actor");
1492            assert_eq!(
1493                point, expected_point,
1494                "cast point should be computed from slice-local rank and shape"
1495            );
1496        }
1497
1498        tokio::time::sleep(Duration::from_secs(1)).await;
1499        let result = rx.try_recv();
1500        assert!(result.as_ref().unwrap().is_none(), "got {result:?}");
1501    }
1502
1503    #[async_timed_test(timeout_secs = 60)]
1504    #[cfg(fbcode_build)]
1505    async fn test_actor_mesh_slice_cast_uses_slice_local_points() {
1506        let instance = testing::instance();
1507        let mut hm = testing::host_mesh(2).await;
1508        let pm: ProcMesh = hm
1509            .spawn(instance, "test", extent!(gpus = 2), None, None)
1510            .await
1511            .unwrap();
1512        let actor_mesh: ActorMesh<testactor::TestActor> =
1513            pm.spawn(instance, "test", &()).await.unwrap();
1514
1515        let host1_region = actor_mesh
1516            .region()
1517            .range("hosts", 1..2)
1518            .expect("host slice should exist");
1519        let host1 = actor_mesh.sliced(host1_region);
1520        assert_slice_cast_points(&host1, instance).await;
1521
1522        let host1_gpu1_region = host1
1523            .region()
1524            .range("gpus", 1..2)
1525            .expect("nested slice should exist");
1526        let host1_gpu1 = host1.sliced(host1_gpu1_region);
1527        assert_slice_cast_points(&host1_gpu1, instance).await;
1528
1529        let _ = hm.shutdown(instance).await;
1530    }
1531
1532    #[async_timed_test(timeout_secs = 300)]
1533    async fn test_actor_states_with_panic() {
1534        hyperactor_telemetry::initialize_logging_for_test();
1535
1536        let instance = testing::instance();
1537        let config = hyperactor_config::global::lock();
1538        let _proc_spawn = config.override_key(PROC_SPAWN_MAX_IDLE, Duration::from_secs(120));
1539        let _actor_spawn = config.override_key(ACTOR_SPAWN_MAX_IDLE, Duration::from_secs(120));
1540        let _host_spawn = config.override_key(
1541            hyperactor::config::HOST_SPAWN_READY_TIMEOUT,
1542            Duration::from_secs(120),
1543        );
1544
1545        // Listen for supervision events sent to the parent instance.
1546        let (supervision_port, mut supervision_receiver) = instance.open_port::<MeshFailure>();
1547        let supervisor = supervision_port.bind();
1548        let num_replicas = 1;
1549        let mut hm = testing::host_mesh(num_replicas).await;
1550        let proc_mesh = hm
1551            .spawn(instance, "test", Extent::unity(), None, None)
1552            .await
1553            .unwrap();
1554        let child_name = ActorMeshId::instance(Label::new("child").unwrap());
1555
1556        // Need to use a wrapper as there's no way to customize the handler for MeshFailure
1557        // on the client instance. The client would just panic with the message.
1558        let actor_mesh: ActorMesh<testactor::WrapperActor> = proc_mesh
1559            .spawn(
1560                instance,
1561                "wrapper",
1562                &(proc_mesh.deref().clone(), supervisor, child_name.clone()),
1563            )
1564            .await
1565            .unwrap();
1566
1567        // Trigger the supervision error.
1568        actor_mesh
1569            .cast(
1570                instance,
1571                testactor::CauseSupervisionEvent {
1572                    kind: testactor::SupervisionEventType::Panic,
1573                    send_to_children: true,
1574                },
1575            )
1576            .unwrap();
1577
1578        // The error will come back on two different pathways:
1579        // * on the ActorMeshRef stored in WrapperActor
1580        //   as an observable supervision event as a subscriber.
1581        // * on the owning actor (WrapperActor here) to be handled.
1582        // We test to ensure both have occurred.
1583
1584        // First test the ActorMeshRef got the event.
1585        // Use a NextSupervisionFailure message to get the event from the wrapper
1586        // actor.
1587        let (failure_port, mut failure_receiver) = instance.open_port::<Option<MeshFailure>>();
1588        actor_mesh
1589            .cast(
1590                instance,
1591                testactor::NextSupervisionFailure(failure_port.bind()),
1592            )
1593            .unwrap();
1594        let failure = failure_receiver
1595            .recv()
1596            .await
1597            .unwrap()
1598            .expect("no supervision event found on ref from wrapper actor");
1599        let check_failure = move |failure: MeshFailure| {
1600            assert_eq!(failure.actor_mesh_name, Some(child_name.to_string()));
1601            assert!(
1602                failure
1603                    .event
1604                    .actor_id
1605                    .label()
1606                    .unwrap()
1607                    .as_str()
1608                    .starts_with(child_name.label().unwrap().as_str())
1609            );
1610            if let ActorStatus::Failed(ActorErrorKind::Generic(msg)) = &failure.event.actor_status {
1611                assert!(msg.contains("panic"), "{}", msg);
1612                assert!(msg.contains("for testing"), "{}", msg);
1613            } else {
1614                panic!("actor status is not failed: {}", failure.event.actor_status);
1615            }
1616        };
1617        check_failure(failure);
1618
1619        // The wrapper actor should *not* have an event.
1620
1621        // Wait for a supervision event to reach the wrapper actor.
1622        for _ in 0..num_replicas {
1623            let failure =
1624                tokio::time::timeout(Duration::from_secs(20), supervision_receiver.recv())
1625                    .await
1626                    .expect("timeout")
1627                    .unwrap();
1628            check_failure(failure);
1629        }
1630
1631        let _ = hm.shutdown(instance).await;
1632    }
1633
1634    #[assert_no_process_leak]
1635    #[async_timed_test(timeout_secs = 300)]
1636    async fn test_actor_states_with_process_exit() {
1637        hyperactor_telemetry::initialize_logging_for_test();
1638
1639        let config = hyperactor_config::global::lock();
1640        let _poll = config.override_key(SUPERVISION_POLL_FREQUENCY, Duration::from_secs(1));
1641        let _guard = config.override_key(GET_ACTOR_STATE_MAX_IDLE, Duration::from_secs(1));
1642        let _proc_guard = config.override_key(GET_PROC_STATE_MAX_IDLE, Duration::from_secs(1));
1643        let _proc_spawn = config.override_key(PROC_SPAWN_MAX_IDLE, Duration::from_secs(120));
1644        let _host_spawn = config.override_key(
1645            hyperactor::config::HOST_SPAWN_READY_TIMEOUT,
1646            Duration::from_secs(120),
1647        );
1648
1649        let instance = testing::instance();
1650        // Listen for supervision events sent to the parent instance.
1651        let (supervision_port, mut supervision_receiver) = instance.open_port::<MeshFailure>();
1652        let supervisor = supervision_port.bind();
1653        let num_replicas = 1;
1654        let mut hm = testing::host_mesh(num_replicas).await;
1655        let proc_mesh = hm
1656            .spawn(instance, "test", Extent::unity(), None, None)
1657            .await
1658            .unwrap();
1659        let mut second_hm = testing::host_mesh(num_replicas).await;
1660        let second_proc_mesh = second_hm
1661            .spawn(instance, "test2", Extent::unity(), None, None)
1662            .await
1663            .unwrap();
1664        let child_name = ActorMeshId::instance(Label::new("child").unwrap());
1665
1666        // Need to use a wrapper as there's no way to customize the handler for MeshFailure
1667        // on the client instance. The client would just panic with the message.
1668        let actor_mesh: ActorMesh<testactor::WrapperActor> = proc_mesh
1669            .spawn(
1670                instance,
1671                "wrapper",
1672                &(
1673                    // Need a second set of proc meshes for the inner test actor, so the
1674                    // WrapperActor is still alive and gets the message.
1675                    second_proc_mesh.deref().clone(),
1676                    supervisor,
1677                    child_name.clone(),
1678                ),
1679            )
1680            .await
1681            .unwrap();
1682
1683        actor_mesh
1684            .cast(
1685                instance,
1686                testactor::CauseSupervisionEvent {
1687                    kind: testactor::SupervisionEventType::ProcessExit(1),
1688                    send_to_children: true,
1689                },
1690            )
1691            .unwrap();
1692
1693        // Same drill as for panic, except this one is for process exit.
1694        let (failure_port, mut failure_receiver) = instance.open_port::<Option<MeshFailure>>();
1695        actor_mesh
1696            .cast(
1697                instance,
1698                testactor::NextSupervisionFailure(failure_port.bind()),
1699            )
1700            .unwrap();
1701        let failure = failure_receiver
1702            .recv()
1703            .await
1704            .unwrap()
1705            .expect("no supervision event found on ref from wrapper actor");
1706
1707        let check_failure = move |failure: MeshFailure| {
1708            assert_eq!(failure.actor_mesh_name, Some(child_name.to_string()));
1709            assert!(
1710                failure
1711                    .event
1712                    .actor_id
1713                    .label()
1714                    .unwrap()
1715                    .as_str()
1716                    .starts_with(child_name.label().unwrap().as_str())
1717            );
1718            if let ActorStatus::Failed(ActorErrorKind::Generic(msg)) = &failure.event.actor_status {
1719                assert!(msg.contains("exited with non-zero code 1"), "{}", msg);
1720            } else {
1721                panic!("actor status is not failed: {}", failure.event.actor_status);
1722            }
1723        };
1724        check_failure(failure);
1725
1726        // Wait for a supervision event to occur on these actors.
1727        for _ in 0..num_replicas {
1728            let failure =
1729                tokio::time::timeout(Duration::from_secs(20), supervision_receiver.recv())
1730                    .await
1731                    .expect("timeout")
1732                    .unwrap();
1733            check_failure(failure);
1734        }
1735
1736        let _ = second_hm.shutdown(instance).await;
1737        let _ = hm.shutdown(instance).await;
1738    }
1739
1740    #[async_timed_test(timeout_secs = 300)]
1741    async fn test_actor_states_on_sliced_mesh() {
1742        hyperactor_telemetry::initialize_logging_for_test();
1743
1744        let instance = testing::instance();
1745        // Listen for supervision events sent to the parent instance.
1746        let (supervision_port, mut supervision_receiver) = instance.open_port::<MeshFailure>();
1747        let supervisor = supervision_port.bind();
1748        let (mut hm, _actor_mesh, sliced, sliced_replicas, child_name) = {
1749            let config = hyperactor_config::global::lock();
1750            let _proc_spawn = config.override_key(PROC_SPAWN_MAX_IDLE, Duration::from_secs(120));
1751            let _actor_spawn = config.override_key(ACTOR_SPAWN_MAX_IDLE, Duration::from_secs(120));
1752            let _host_spawn = config.override_key(
1753                hyperactor::config::HOST_SPAWN_READY_TIMEOUT,
1754                Duration::from_secs(120),
1755            );
1756            let num_replicas = 2;
1757            let hm = testing::host_mesh(num_replicas).await;
1758            let proc_mesh = hm
1759                .spawn(instance, "test", Extent::unity(), None, None)
1760                .await
1761                .unwrap();
1762            let child_name = ActorMeshId::instance(Label::new("child").unwrap());
1763
1764            // Need to use a wrapper as there's no way to customize the handler for MeshFailure
1765            // on the client instance. The client would just panic with the message.
1766            let actor_mesh: ActorMesh<testactor::WrapperActor> = proc_mesh
1767                .spawn(
1768                    instance,
1769                    "wrapper",
1770                    &(proc_mesh.deref().clone(), supervisor, child_name.clone()),
1771                )
1772                .await
1773                .unwrap();
1774            let sliced = actor_mesh
1775                .range("hosts", 1..2)
1776                .expect("slice should be valid");
1777            let sliced_replicas = sliced.len();
1778            (hm, actor_mesh, sliced, sliced_replicas, child_name)
1779        };
1780
1781        // TODO: check that independent slice refs don't get the supervision event.
1782        sliced
1783            .cast(
1784                instance,
1785                testactor::CauseSupervisionEvent {
1786                    kind: testactor::SupervisionEventType::Panic,
1787                    send_to_children: true,
1788                },
1789            )
1790            .unwrap();
1791
1792        for _ in 0..sliced_replicas {
1793            let supervision_message =
1794                tokio::time::timeout(Duration::from_secs(20), supervision_receiver.recv())
1795                    .await
1796                    .expect("timeout")
1797                    .unwrap();
1798            let event = supervision_message.event;
1799            assert!(
1800                event
1801                    .actor_id
1802                    .label()
1803                    .unwrap()
1804                    .as_str()
1805                    .starts_with(child_name.label().unwrap().as_str())
1806            );
1807            if let ActorStatus::Failed(ActorErrorKind::Generic(msg)) = &event.actor_status {
1808                assert!(msg.contains("panic"));
1809                assert!(msg.contains("for testing"));
1810            } else {
1811                panic!("actor status is not failed: {}", event.actor_status);
1812            }
1813        }
1814
1815        let _ = hm.shutdown(instance).await;
1816    }
1817
1818    async fn execute_cast(config: &hyperactor_config::global::ConfigLock) {
1819        let _guard = config.override_key(crate::bootstrap::MESH_BOOTSTRAP_ENABLE_PDEATHSIG, false);
1820        let _proc_spawn = config.override_key(PROC_SPAWN_MAX_IDLE, Duration::from_secs(60));
1821        let _host_spawn = config.override_key(
1822            hyperactor::config::HOST_SPAWN_READY_TIMEOUT,
1823            Duration::from_secs(60),
1824        );
1825
1826        let instance = testing::instance();
1827        let mut host_mesh = testing::host_mesh(2).await;
1828        let proc_mesh = host_mesh
1829            .spawn(instance, "test", Extent::unity(), None, None)
1830            .await
1831            .unwrap();
1832        let actor_mesh: ActorMesh<testactor::TestActor> =
1833            proc_mesh.spawn(instance, "test", &()).await.unwrap();
1834
1835        let (cast_info, mut cast_info_rx) = instance.mailbox().open_port();
1836        actor_mesh
1837            .cast(
1838                instance,
1839                testactor::GetCastInfo {
1840                    cast_info: cast_info.bind(),
1841                },
1842            )
1843            .unwrap();
1844
1845        let mut point_to_actor: HashSet<_> = actor_mesh.iter().collect();
1846        while !point_to_actor.is_empty() {
1847            let (point, origin_actor_ref, sender_actor_id) = cast_info_rx.recv().await.unwrap();
1848            let key = (point, origin_actor_ref);
1849            assert!(
1850                point_to_actor.remove(&key),
1851                "key {:?} not present or removed twice",
1852                key
1853            );
1854            assert_eq!(&sender_actor_id, instance.self_addr());
1855        }
1856
1857        let _ = host_mesh.shutdown(instance).await;
1858    }
1859
1860    #[async_timed_test(timeout_secs = 60)]
1861    async fn test_sliced_actor_mesh_cast_v1_reaches_slice_members() {
1862        use hyperactor::config::ENABLE_DEST_ACTOR_REORDERING_BUFFER;
1863
1864        let config = hyperactor_config::global::lock();
1865        let _guard = config.override_key(crate::bootstrap::MESH_BOOTSTRAP_ENABLE_PDEATHSIG, false);
1866        let _v1 = config.override_key(crate::comm::ENABLE_NATIVE_V1_CASTING, true);
1867        let _reorder = config.override_key(ENABLE_DEST_ACTOR_REORDERING_BUFFER, true);
1868        let _proc_spawn = config.override_key(PROC_SPAWN_MAX_IDLE, Duration::from_secs(60));
1869        let _host_spawn = config.override_key(
1870            hyperactor::config::HOST_SPAWN_READY_TIMEOUT,
1871            Duration::from_secs(60),
1872        );
1873
1874        let instance = testing::instance();
1875        let mut host_mesh = testing::host_mesh(2).await;
1876        let proc_mesh = host_mesh
1877            .spawn(instance, "test", Extent::unity(), None, None)
1878            .await
1879            .unwrap();
1880        let root_actor_mesh: ActorMesh<testactor::TestActor> =
1881            proc_mesh.spawn(instance, "test", &()).await.unwrap();
1882
1883        // Cast through a sliced mesh — `cast` still means all, but all is
1884        // scoped to the immutable sliced rank space.
1885        let actor_mesh = root_actor_mesh.sliced(Region::new(
1886            vec!["rank".to_string()],
1887            Slice::new(0, vec![1], vec![1]).unwrap(),
1888        ));
1889        let (cast_info, mut cast_info_rx) = instance.mailbox().open_port();
1890        actor_mesh
1891            .cast(
1892                instance,
1893                testactor::GetCastInfo {
1894                    cast_info: cast_info.bind(),
1895                },
1896            )
1897            .unwrap();
1898
1899        let (point, _actor_ref, _sender) = cast_info_rx.recv().await.unwrap();
1900        let received_ranks = HashSet::from([point.rank()]);
1901        assert_eq!(received_ranks, HashSet::from([0]));
1902
1903        // Also cast the root mesh — all ranks should be reached via V1.
1904        let (cast_info2, mut cast_info_rx2) = instance.mailbox().open_port();
1905        root_actor_mesh
1906            .cast(
1907                instance,
1908                testactor::GetCastInfo {
1909                    cast_info: cast_info2.bind(),
1910                },
1911            )
1912            .unwrap();
1913
1914        let mut all_ranks: HashSet<usize> = HashSet::new();
1915        for _ in 0..2 {
1916            let (point, _actor_ref, _sender) = cast_info_rx2.recv().await.unwrap();
1917            all_ranks.insert(point.rank());
1918        }
1919        assert_eq!(all_ranks, HashSet::from([0, 1]));
1920
1921        let _ = host_mesh.shutdown(instance).await;
1922    }
1923
1924    #[async_timed_test(timeout_secs = 60)]
1925    async fn test_cast_domain_stamps_resource_rank_binding() {
1926        let client_proc = hyperactor::proc::Proc::direct(
1927            hyperactor::channel::ChannelTransport::Unix.any(),
1928            "client_proc".into(),
1929        )
1930        .unwrap();
1931
1932        let client = client_proc.client("client");
1933
1934        let mut procs = Vec::new();
1935
1936        let members = (0..2)
1937            .map(|rank| {
1938                let proc = hyperactor::proc::Proc::direct(
1939                    hyperactor::channel::ChannelTransport::Unix.any(),
1940                    format!("proc_{rank}"),
1941                )
1942                .unwrap();
1943
1944                let cast_handle = proc
1945                    .spawn_with_uid(
1946                        hyperactor::Uid::singleton(Label::strip("cast")),
1947                        hyperactor_cast::cast_actor::CastActor::default(),
1948                    )
1949                    .unwrap();
1950
1951                let _: hyperactor::ActorRef<hyperactor_cast::cast_actor::CastActor> =
1952                    cast_handle.bind();
1953
1954                let receiver_handle = proc
1955                    .spawn_with_uid(
1956                        hyperactor::Uid::singleton(Label::strip("receiver")),
1957                        testactor::TestActor,
1958                    )
1959                    .unwrap();
1960
1961                let _: hyperactor::ActorRef<testactor::TestActor> = receiver_handle.bind();
1962
1963                let actor_addr =
1964                    hyperactor::ActorAddr::root(proc.proc_addr().clone(), Label::strip("receiver"));
1965
1966                procs.push(proc);
1967
1968                (rank, actor_addr)
1969            })
1970            .collect::<HashMap<_, _>>();
1971
1972        let cast_domain = hyperactor_cast::cast_actor::CastDomainId::new()
1973            .materialize(
1974                &client,
1975                members,
1976                Region::from(ndslice::shape!(rank = 2)),
1977                hyperactor_cast::cast_actor::TilingPolicy::BlockPartitioning,
1978                hyperactor_config::Flattrs::new(),
1979            )
1980            .unwrap();
1981
1982        let (rank_port, mut rank_rx) = client.mailbox().open_port();
1983
1984        cast_domain
1985            .cast(
1986                &client,
1987                hyperactor_config::Flattrs::new(),
1988                testactor::GetResourceRank {
1989                    rank: crate::resource::Rank::default(),
1990                    reply: rank_port.bind(),
1991                },
1992            )
1993            .unwrap();
1994
1995        let mut received_ranks = HashSet::new();
1996
1997        for _ in 0..2 {
1998            let (_point, rank) = rank_rx.recv().await.unwrap();
1999
2000            received_ranks.insert(rank);
2001        }
2002
2003        assert_eq!(received_ranks, HashSet::from([Some(0), Some(1)]));
2004    }
2005
2006    #[async_timed_test(timeout_secs = 30)]
2007    async fn test_cast() {
2008        let config = hyperactor_config::global::lock();
2009        execute_cast(&config).await;
2010    }
2011
2012    #[async_timed_test(timeout_secs = 30)]
2013    async fn test_cast_p2p() {
2014        let config = hyperactor_config::global::lock();
2015        let _guard = config.override_key(crate::comm::ENABLE_NATIVE_V1_CASTING, true);
2016        let _guard2 = config.override_key(
2017            hyperactor::config::ENABLE_DEST_ACTOR_REORDERING_BUFFER,
2018            true,
2019        );
2020        let _guard3 = config.override_key(crate::config::V1_CAST_POINT_TO_POINT_THRESHOLD, 1024);
2021        execute_cast(&config).await;
2022    }
2023    /// Test that undeliverable messages are properly returned to the
2024    /// sender when communication to a proc is broken.
2025    ///
2026    /// This is the V1 version of the test from
2027    /// hyperactor_multiprocess/src/proc_actor.rs::test_undeliverable_message_return.
2028    #[assert_no_process_leak]
2029    #[async_timed_test(timeout_secs = 60)]
2030    async fn test_undeliverable_message_return() {
2031        use hyperactor::mailbox::MessageEnvelope;
2032        use hyperactor::mailbox::Undeliverable;
2033        use hyperactor::testing::pingpong::PingPongActor;
2034        use hyperactor::testing::pingpong::PingPongMessage;
2035
2036        hyperactor_telemetry::initialize_logging_for_test();
2037
2038        let instance = testing::instance();
2039
2040        // Create a proc mesh with 2 hosts.
2041        let (mut hm, proc_mesh) = {
2042            let config = hyperactor_config::global::lock();
2043            let _proc_spawn_guard =
2044                config.override_key(PROC_SPAWN_MAX_IDLE, Duration::from_secs(60));
2045            let _host_spawn_guard = config.override_key(
2046                hyperactor::config::HOST_SPAWN_READY_TIMEOUT,
2047                Duration::from_secs(60),
2048            );
2049            let hm = testing::host_mesh(2).await;
2050            let proc_mesh = hm
2051                .spawn(instance, "test", Extent::unity(), None, None)
2052                .await
2053                .unwrap();
2054            (hm, proc_mesh)
2055        };
2056
2057        // Set up undeliverable message port for collecting undeliverables
2058        let (undeliverable_port, mut undeliverable_rx) =
2059            instance.open_port::<Undeliverable<MessageEnvelope>>();
2060
2061        // Spawn actors individually on each host by spawning separate actor meshes
2062        // with specific proc selections.
2063        let ping_proc_mesh = proc_mesh.range("hosts", 0..1).unwrap();
2064        let pong_proc_mesh = proc_mesh.range("hosts", 1..2).unwrap();
2065
2066        let ping_mesh: ActorMesh<PingPongActor> = ping_proc_mesh
2067            .spawn(
2068                instance,
2069                "ping",
2070                &(Some(undeliverable_port.bind()), None, None),
2071            )
2072            .await
2073            .unwrap();
2074
2075        let mut pong_mesh: ActorMesh<PingPongActor> = pong_proc_mesh
2076            .spawn(instance, "pong", &(None, None, None))
2077            .await
2078            .unwrap();
2079
2080        // Get individual actor refs
2081        let ping_handle = ping_mesh.values().next().unwrap();
2082        let pong_handle = pong_mesh.values().next().unwrap();
2083
2084        // Verify ping-pong works initially
2085        let (done_tx, done_rx) = instance.open_once_port();
2086        ping_handle.post(
2087            instance,
2088            PingPongMessage(2, pong_handle.clone(), done_tx.bind()),
2089        );
2090        assert!(
2091            done_rx.recv().await.unwrap(),
2092            "Initial ping-pong should work"
2093        );
2094
2095        // Now stop the pong actor mesh to break communication
2096        pong_mesh
2097            .stop(instance, "test stop".to_string())
2098            .await
2099            .unwrap();
2100
2101        // Give it a moment to fully stop
2102        tokio::time::sleep(std::time::Duration::from_millis(200)).await;
2103
2104        // Set message delivery timeout for faster test
2105        let config = hyperactor_config::global::lock();
2106        let _guard = config.override_key(
2107            hyperactor::config::MESSAGE_DELIVERY_TIMEOUT,
2108            std::time::Duration::from_secs(5),
2109        );
2110
2111        // Send multiple messages that will all fail to be delivered
2112        let n = 100usize;
2113        for i in 1..=n {
2114            let ttl = 66 + i as u64; // Avoid ttl = 66 (which would cause other test behavior)
2115            let (once_tx, _once_rx) = instance.open_once_port();
2116            ping_handle.post(
2117                instance,
2118                PingPongMessage(ttl, pong_handle.clone(), once_tx.bind()),
2119            );
2120        }
2121
2122        // Collect all undeliverable messages.
2123        // The fact that we successfully collect them proves the ping actor
2124        // is still running and handling undeliverables correctly (not crashing).
2125        let mut count = 0;
2126        let deadline = tokio::time::Instant::now() + std::time::Duration::from_secs(10);
2127        while count < n && tokio::time::Instant::now() < deadline {
2128            match tokio::time::timeout(std::time::Duration::from_secs(1), undeliverable_rx.recv())
2129                .await
2130            {
2131                Ok(Ok(Undeliverable::Returned(envelope))) => {
2132                    let _: PingPongMessage = envelope.deserialized().unwrap();
2133                    count += 1;
2134                }
2135                Ok(Ok(Undeliverable::Report(_))) => break,
2136                Ok(Err(_)) => break, // Channel closed
2137                Err(_) => break,     // Timeout
2138            }
2139        }
2140
2141        assert_eq!(
2142            count, n,
2143            "Expected {} undeliverable messages, got {}",
2144            n, count
2145        );
2146
2147        let _ = hm.shutdown(instance).await;
2148    }
2149
2150    /// Test that `stop()` returns bounded by `ACTOR_SPAWN_MAX_IDLE` even
2151    /// when actors are stuck inside a handler and never observe the
2152    /// `DrainAndStop` signal. The controller's `Stop` handler awaits
2153    /// the underlying ProcAgent wait, which waits up to `ACTOR_SPAWN_MAX_IDLE`
2154    /// for ProcAgents to report `Stopped`; when that idle window elapses it
2155    /// stamps `Status::Timeout` into the controller's health state, and the
2156    /// subsequent `GetState` reads that back. The actors' tokio tasks
2157    /// continue running in the background: no code path in the mesh layer
2158    /// forcibly aborts them via `JoinHandle::abort()`.
2159    #[async_timed_test(timeout_secs = 30)]
2160    async fn test_actor_mesh_stop_timeout() {
2161        hyperactor_telemetry::initialize_logging_for_test();
2162
2163        // `ACTOR_SPAWN_MAX_IDLE` bounds how long the controller's Stop
2164        // handler waits for ProcAgents to report `Stopped`. Shorten it
2165        // from 30s to 1s so the test finishes quickly.
2166        let config = hyperactor_config::global::lock();
2167        let _proc_spawn = config.override_key(PROC_SPAWN_MAX_IDLE, Duration::from_secs(60));
2168        let _host_spawn = config.override_key(
2169            hyperactor::config::HOST_SPAWN_READY_TIMEOUT,
2170            Duration::from_secs(60),
2171        );
2172
2173        let instance = testing::instance();
2174
2175        // Create proc mesh with 2 procs
2176        let mut hm = testing::host_mesh(2).await;
2177        let proc_mesh = hm
2178            .spawn(instance, "test", Extent::unity(), None, None)
2179            .await
2180            .unwrap();
2181
2182        // Spawn SleepActors across the mesh that will block longer
2183        // than timeout
2184        let mut sleep_mesh: ActorMesh<testactor::SleepActor> =
2185            proc_mesh.spawn(instance, "sleepers", &()).await.unwrap();
2186        let _guard = config.override_key(ACTOR_SPAWN_MAX_IDLE, std::time::Duration::from_secs(1));
2187
2188        // Send each actor a message to sleep for 5 seconds. `Instance::run`
2189        // only polls the signal receiver at message boundaries, so
2190        // `DrainAndStop` will sit queued in the signal mailbox until this
2191        // handler completes. Nothing forcibly aborts it.
2192        for actor_ref in sleep_mesh.values() {
2193            actor_ref.post(instance, std::time::Duration::from_secs(5));
2194        }
2195
2196        // Give actors time to start sleeping
2197        tokio::time::sleep(std::time::Duration::from_millis(200)).await;
2198
2199        // Count how many actors we spawned (for verification later)
2200        let expected_actors = sleep_mesh.values().count();
2201
2202        // Now stop the mesh. The controller's Stop handler will give up on
2203        // waiting for `Stopped` after ACTOR_SPAWN_MAX_IDLE and mark the
2204        // ranks as `Status::Timeout`. Time this operation to confirm we
2205        // return on that budget rather than waiting the full 5s sleep.
2206        let stop_start = tokio::time::Instant::now();
2207        let result = sleep_mesh.stop(instance, "test stop".to_string()).await;
2208        let stop_duration = tokio::time::Instant::now().duration_since(stop_start);
2209
2210        // `stop()` returns `Ok(())` because `is_terminating()` accepts
2211        // `Status::Timeout`. We still check the duration below to confirm
2212        // the timeout path (not a natural graceful stop) produced this.
2213        match result {
2214            Ok(_) => {
2215                tracing::info!(
2216                    "stop returned Ok for {} actors; their tokio tasks \
2217                     may still be running until their handler yields",
2218                    expected_actors
2219                );
2220            }
2221            Err(ref e) => {
2222                let err_str = format!("{:?}", e);
2223                assert!(
2224                    err_str.contains("Timeout"),
2225                    "Expected Timeout error, got: {:?}",
2226                    e
2227                );
2228            }
2229        }
2230
2231        // Verify that stop returned on the ACTOR_SPAWN_MAX_IDLE budget
2232        // (~1s) rather than the full 5s sleep. This confirms we hit the
2233        // controller's idle timeout while querying for `Stopped` — not
2234        // that the actors were actually aborted; they weren't.
2235        assert!(
2236            stop_duration < std::time::Duration::from_millis(4500),
2237            "Stop took {:?}, expected < 4.5s (controller should have given up waiting for Stopped)",
2238            stop_duration
2239        );
2240        assert!(
2241            stop_duration >= std::time::Duration::from_millis(900),
2242            "Stop took {:?}, expected >= 900ms (should have waited for the 1s idle timeout)",
2243            stop_duration
2244        );
2245
2246        let _ = hm.shutdown(instance).await;
2247    }
2248
2249    /// Test that actors stop gracefully when they respond to stop
2250    /// signals within the timeout. Complementary to
2251    /// test_actor_mesh_stop_timeout which tests abort behavior. V1
2252    /// equivalent of
2253    /// hyperactor_multiprocess/src/proc_actor.rs::test_stop
2254    #[async_timed_test(timeout_secs = 60)]
2255    async fn test_actor_mesh_stop_graceful() {
2256        hyperactor_telemetry::initialize_logging_for_test();
2257
2258        let config = hyperactor_config::global::lock();
2259        let _proc_spawn = config.override_key(PROC_SPAWN_MAX_IDLE, Duration::from_secs(60));
2260        let _host_spawn = config.override_key(
2261            hyperactor::config::HOST_SPAWN_READY_TIMEOUT,
2262            Duration::from_secs(60),
2263        );
2264
2265        let instance = testing::instance();
2266
2267        // Create proc mesh with 2 procs
2268        let mut hm = testing::host_mesh(2).await;
2269        let proc_mesh = hm
2270            .spawn(instance, "test", Extent::unity(), None, None)
2271            .await
2272            .unwrap();
2273
2274        // Spawn TestActors - these stop cleanly (no blocking
2275        // operations)
2276        let mut actor_mesh: ActorMesh<testactor::TestActor> =
2277            proc_mesh.spawn(instance, "test_actors", &()).await.unwrap();
2278
2279        // Cloned mesh will still have its controller, even if the owned mesh
2280        // causes a stop.
2281        let mesh_ref = actor_mesh.deref().clone();
2282
2283        let expected_actors = actor_mesh.values().count();
2284        assert!(expected_actors > 0, "Should have spawned some actors");
2285
2286        // Time the stop operation
2287        let stop_start = tokio::time::Instant::now();
2288        let result = actor_mesh.stop(instance, "test stop".to_string()).await;
2289        let stop_duration = tokio::time::Instant::now().duration_since(stop_start);
2290
2291        // Graceful stop should succeed (return Ok)
2292        assert!(
2293            result.is_ok(),
2294            "Stop should succeed for responsive actors, got: {:?}",
2295            result.err()
2296        );
2297
2298        // Verify stop completed quickly (< 2 seconds). Responsive
2299        // actors should stop almost immediately, not wait for
2300        // timeout.
2301        assert!(
2302            stop_duration < std::time::Duration::from_secs(5),
2303            "Graceful stop took {:?}, expected < 5s (actors should stop quickly)",
2304            stop_duration
2305        );
2306
2307        tracing::info!(
2308            "Successfully stopped {} actors in {:?}",
2309            expected_actors,
2310            stop_duration
2311        );
2312
2313        // Check that the next returned supervision event is a Stopped event.
2314        // Note that Ref meshes get Stopped events, and Owned meshes do not,
2315        // because only the owner can stop them anyway.
2316        // Each owned mesh has an implicit ref mesh though, so that is what we
2317        // test here.
2318        let next_event = actor_mesh.next_supervision_event(instance).await.unwrap();
2319        assert_eq!(next_event.actor_mesh_name, Some(mesh_ref.id().to_string()));
2320        assert!(matches!(
2321            next_event.event.actor_status,
2322            ActorStatus::Stopped(_)
2323        ));
2324        // Check that a cloned Ref from earlier gets the same event. Every clone
2325        // should get the same event, even if it's not a subscriber.
2326        let next_event = mesh_ref.next_supervision_event(instance).await.unwrap();
2327        assert_eq!(next_event.actor_mesh_name, Some(mesh_ref.id().to_string()));
2328        assert!(matches!(
2329            next_event.event.actor_status,
2330            ActorStatus::Stopped(_)
2331        ));
2332
2333        let _ = hm.shutdown(instance).await;
2334    }
2335}
hyperactor_mesh/actor_mesh.rs

hyperactor_mesh/
actor_mesh.rs