hyperactor_mesh/
config.rs

1/*
2 * Copyright (c) Meta Platforms, Inc. and affiliates.
3 * All rights reserved.
4 *
5 * This source code is licensed under the BSD-style license found in the
6 * LICENSE file in the root directory of this source tree.
7 */
8
9//! Configuration for Hyperactor Mesh.
10//!
11//! This module provides hyperactor_mesh-specific configuration attributes that extend
12//! the base hyperactor configuration system.
13
14use std::net::SocketAddr;
15use std::time::Duration;
16
17use hyperactor_config::AttrValue;
18use hyperactor_config::CONFIG;
19use hyperactor_config::ConfigAttr;
20use hyperactor_config::attrs::declare_attrs;
21use serde::Deserialize;
22use serde::Serialize;
23use typeuri::Named;
24
/// A socket address string usable as a `declare_attrs!` default.
///
/// Follows the [`hyperactor::config::Pem`] pattern: the `Static`
/// variant holds a `&'static str` so it can appear in a `static`
/// item, while `Value` holds a runtime `String` from environment
/// variables or Python `configure()`.
///
/// Both variants expose the same text through `AsRef<str>` and
/// `Display`. Deserialization accepts either variant name but always
/// produces `Value`.
///
/// Constructing a variant directly, via `From<String>`, or through
/// deserialization performs no validation; only `AttrValue::parse`
/// rejects text that is not a valid `SocketAddr`. Use
/// `parse_socket_addr` to obtain a checked `SocketAddr`.
#[derive(Clone, Debug, Serialize, Named)]
#[named("hyperactor_mesh::config::SocketAddrStr")]
pub enum SocketAddrStr {
    /// Compile-time default (const-constructible).
    Static(&'static str),
    /// Runtime value from env / config.
    Value(String),
}
39
40impl<'de> Deserialize<'de> for SocketAddrStr {
41    fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
42    where
43        D: serde::Deserializer<'de>,
44    {
45        #[derive(Deserialize)]
46        enum Helper {
47            Static(String),
48            Value(String),
49        }
50        match Helper::deserialize(deserializer)? {
51            Helper::Static(s) | Helper::Value(s) => Ok(SocketAddrStr::Value(s)),
52        }
53    }
54}
55
56impl From<String> for SocketAddrStr {
57    fn from(s: String) -> Self {
58        SocketAddrStr::Value(s)
59    }
60}
61
62impl From<SocketAddrStr> for String {
63    fn from(s: SocketAddrStr) -> Self {
64        s.as_ref().to_owned()
65    }
66}
67
68impl AsRef<str> for SocketAddrStr {
69    fn as_ref(&self) -> &str {
70        match self {
71            SocketAddrStr::Static(s) => s,
72            SocketAddrStr::Value(s) => s,
73        }
74    }
75}
76
77impl std::fmt::Display for SocketAddrStr {
78    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
79        f.write_str(self.as_ref())
80    }
81}
82
83impl AttrValue for SocketAddrStr {
84    fn display(&self) -> String {
85        self.as_ref().to_owned()
86    }
87
88    fn parse(value: &str) -> Result<Self, anyhow::Error> {
89        value.parse::<SocketAddr>()?;
90        Ok(SocketAddrStr::Value(value.to_string()))
91    }
92}
93
94impl SocketAddrStr {
95    /// Parse the contained string as a `SocketAddr`.
96    pub fn parse_socket_addr(&self) -> Result<SocketAddr, std::net::AddrParseError> {
97        self.as_ref().parse()
98    }
99}
100
// Declare hyperactor_mesh-specific configuration keys.
//
// Each `ConfigAttr::new(..)` below is given an upper-case name (the
// environment variable, per the `PYSPY_BIN` note at the end of this
// block) and a snake_case name (presumably the Python `configure()`
// keyword — confirm against `hyperactor_config`).
declare_attrs! {
    /// The maximum for a dimension size allowed for a folded shape
    /// when reshaping during casting to limit fanout.
    /// usize::MAX means no reshaping as any shape will always be below
    /// the limit so no dimension needs to be folded.
    @meta(CONFIG = ConfigAttr::new(
        Some("HYPERACTOR_MESH_MAX_CAST_DIMENSION_SIZE".to_string()),
        Some("max_cast_dimension_size".to_string()),
    ))
    pub attr MAX_CAST_DIMENSION_SIZE: usize = 16;

    /// Which builtin process launcher backend to use.
    /// Accepted values: "native" (default), "systemd".
    /// Trimmed and lowercased before matching.
    ///
    /// **Precedence:** Python spawner (via SetProcSpawner) overrides this.
    ///
    /// NOTE(review): the declared default is the empty string, not
    /// "native"; presumably the consumer maps an empty value to the
    /// native launcher — confirm at the use site.
    @meta(CONFIG = ConfigAttr::new(
        Some("HYPERACTOR_MESH_PROC_LAUNCHER_KIND".to_string()),
        Some("proc_launcher_kind".to_string()),
    ))
    pub attr MESH_PROC_LAUNCHER_KIND: String = String::new();

    /// Default socket address for the mesh admin HTTP server.
    ///
    /// Parsed as a `SocketAddr` (e.g. `[::]:1729`, `0.0.0.0:8080`).
    /// Used as the bind address when no explicit address is provided
    /// to `MeshAdminAgent`.
    @meta(CONFIG = ConfigAttr::new(
        Some("HYPERACTOR_MESH_ADMIN_ADDR".to_string()),
        Some("mesh_admin_addr".to_string()),
    ))
    pub attr MESH_ADMIN_ADDR: SocketAddrStr = SocketAddrStr::Static("[::]:1729");

    /// Timeout for fallback queries to actors/procs that may have been
    /// recently destroyed. The second-chance paths in `resolve_proc_node`
    /// and `resolve_actor_node` fire after the fast QueryChild lookup
    /// fails. A short budget here prevents dead actors from blocking the
    /// single-threaded MeshAdminAgent message loop.
    @meta(CONFIG = ConfigAttr::new(
        Some("HYPERACTOR_MESH_ADMIN_RESOLVE_ACTOR_TIMEOUT".to_string()),
        Some("mesh_admin_resolve_actor_timeout".to_string()),
    ))
    pub attr MESH_ADMIN_RESOLVE_ACTOR_TIMEOUT: Duration = Duration::from_millis(200);

    /// Maximum number of concurrent resolve requests the HTTP bridge
    /// forwards to the MeshAdminAgent. Excess requests receive 503
    /// immediately. Protects the shared tokio runtime from query floods
    /// (e.g. multiple TUI clients, rapid polling). Increase if the admin
    /// server serves many concurrent clients that need low-latency
    /// responses; decrease if introspection queries interfere with the
    /// actor workload under churn.
    @meta(CONFIG = ConfigAttr::new(
        Some("HYPERACTOR_MESH_ADMIN_MAX_CONCURRENT_RESOLVES".to_string()),
        Some("mesh_admin_max_concurrent_resolves".to_string()),
    ))
    pub attr MESH_ADMIN_MAX_CONCURRENT_RESOLVES: usize = 2;

    /// Timeout for the config-push barrier during `HostMesh::attach()`.
    ///
    /// When attaching to pre-existing workers (simple bootstrap), the
    /// client pushes its propagatable config to each host agent and
    /// waits for confirmation. If the barrier does not complete within
    /// this duration, a warning is logged and attach continues without
    /// blocking — config push is best-effort.
    @meta(CONFIG = ConfigAttr::new(
        Some("HYPERACTOR_MESH_ATTACH_CONFIG_TIMEOUT".to_string()),
        Some("mesh_attach_config_timeout".to_string()),
    ))
    pub attr MESH_ATTACH_CONFIG_TIMEOUT: Duration = Duration::from_secs(10);

    /// Timeout for targeted introspection queries that hit a single,
    /// specific host. Kept short so a slow or dying actor cannot block
    /// the single-threaded MeshAdminAgent message loop.
    @meta(CONFIG = ConfigAttr::new(
        Some("HYPERACTOR_MESH_ADMIN_SINGLE_HOST_TIMEOUT".to_string()),
        Some("mesh_admin_single_host_timeout".to_string()),
    ))
    pub attr MESH_ADMIN_SINGLE_HOST_TIMEOUT: Duration = Duration::from_secs(3);

    /// Timeout for QueryChild snapshot lookups in resolve_actor_node.
    /// QueryChild is handled by a synchronous callback — it either
    /// returns immediately or returns Error. A short budget ensures
    /// the total time for resolve_actor_node stays well under
    /// `MESH_ADMIN_SINGLE_HOST_TIMEOUT`.
    @meta(CONFIG = ConfigAttr::new(
        Some("HYPERACTOR_MESH_ADMIN_QUERY_CHILD_TIMEOUT".to_string()),
        Some("mesh_admin_query_child_timeout".to_string()),
    ))
    pub attr MESH_ADMIN_QUERY_CHILD_TIMEOUT: Duration = Duration::from_millis(100);

    /// Timeout for the end-to-end `/v1/config/{proc}` bridge reply.
    /// The config-dump path forwards a `ConfigDump` message through
    /// the HostAgent bridge and waits for `ConfigDumpResult`. This is
    /// inter-process actor messaging — fundamentally slower than local
    /// `QueryChild` snapshot lookups (which use
    /// `MESH_ADMIN_QUERY_CHILD_TIMEOUT`). During startup, the
    /// HostAgent message loop may be busy processing actor
    /// registrations, so bridge latency can exceed several seconds.
    @meta(CONFIG = ConfigAttr::new(
        Some("HYPERACTOR_MESH_ADMIN_CONFIG_DUMP_BRIDGE_TIMEOUT".to_string()),
        Some("mesh_admin_config_dump_bridge_timeout".to_string()),
    ))
    pub attr MESH_ADMIN_CONFIG_DUMP_BRIDGE_TIMEOUT: Duration = Duration::from_secs(5);

    /// Timeout for py-spy dump requests. See PS-5 in `introspect`
    /// module doc. With `--native --native-all`, py-spy unwinds native
    /// stacks via libunwind which is significantly slower than
    /// Python-only capture (~100ms). 10s accommodates native unwinding
    /// on heavily loaded hosts. Independent of
    /// `MESH_ADMIN_SINGLE_HOST_TIMEOUT` because py-spy does real I/O
    /// (subprocess + ptrace) rather than actor messaging.
    @meta(CONFIG = ConfigAttr::new(
        Some("HYPERACTOR_MESH_ADMIN_PYSPY_TIMEOUT".to_string()),
        Some("mesh_admin_pyspy_timeout".to_string()),
    ))
    pub attr MESH_ADMIN_PYSPY_TIMEOUT: Duration = Duration::from_secs(10);

    /// Timeout for the `/v1/tree` fan-out. Kept generous because the
    /// tree dump walks every host and proc in the mesh.
    @meta(CONFIG = ConfigAttr::new(
        Some("HYPERACTOR_MESH_ADMIN_TREE_TIMEOUT".to_string()),
        Some("mesh_admin_tree_timeout".to_string()),
    ))
    pub attr MESH_ADMIN_TREE_TIMEOUT: Duration = Duration::from_secs(10);

    /// Bridge-side timeout for py-spy dump requests. Must exceed
    /// `MESH_ADMIN_PYSPY_TIMEOUT` to allow the subprocess kill/reap
    /// and reply delivery to arrive before declaring `gateway_timeout`.
    /// See PS-6 in `introspect` module doc.
    @meta(CONFIG = ConfigAttr::new(
        Some("HYPERACTOR_MESH_ADMIN_PYSPY_BRIDGE_TIMEOUT".to_string()),
        Some("mesh_admin_pyspy_bridge_timeout".to_string()),
    ))
    pub attr MESH_ADMIN_PYSPY_BRIDGE_TIMEOUT: Duration = Duration::from_secs(13);

    /// Client-side timeout for py-spy requests. Must exceed
    /// `MESH_ADMIN_PYSPY_BRIDGE_TIMEOUT` so the server can return a
    /// structured `PySpyResult` even when the subprocess uses the
    /// full budget. See PS-6 in `introspect` module doc.
    @meta(CONFIG = ConfigAttr::new(
        Some("HYPERACTOR_MESH_ADMIN_PYSPY_CLIENT_TIMEOUT".to_string()),
        Some("mesh_admin_pyspy_client_timeout".to_string()),
    ))
    pub attr MESH_ADMIN_PYSPY_CLIENT_TIMEOUT: Duration = Duration::from_secs(20);

    /// Path to the py-spy binary. When non-empty, tried before
    /// the fallback `"py-spy"` PATH lookup. See PS-3 in
    /// `introspect` module doc.
    ///
    /// Note: env var is `PYSPY_BIN` (not `HYPERACTOR_MESH_PYSPY_BIN`)
    /// to preserve backward compatibility with existing deployments
    /// that already set `PYSPY_BIN`.
    @meta(CONFIG = ConfigAttr::new(
        Some("PYSPY_BIN".to_string()),
        Some("pyspy_bin".to_string()),
    ))
    pub attr PYSPY_BIN: String = String::new();
}