hyperactor_mesh/config.rs
1/*
2 * Copyright (c) Meta Platforms, Inc. and affiliates.
3 * All rights reserved.
4 *
5 * This source code is licensed under the BSD-style license found in the
6 * LICENSE file in the root directory of this source tree.
7 */
8
9//! Configuration for Hyperactor Mesh.
10//!
11//! This module provides hyperactor_mesh-specific configuration attributes that extend
12//! the base hyperactor configuration system.
13
14use std::net::SocketAddr;
15use std::time::Duration;
16
17use hyperactor_config::AttrValue;
18use hyperactor_config::CONFIG;
19use hyperactor_config::ConfigAttr;
20use hyperactor_config::attrs::declare_attrs;
21use serde::Deserialize;
22use serde::Serialize;
23use typeuri::Named;
24
25/// A socket address string usable as a `declare_attrs!` default.
26///
27/// Follows the [`hyperactor::config::Pem`] pattern: the `Static`
28/// variant holds a `&'static str` so it can appear in a `static`
29/// item, while `Value` holds a runtime `String` from environment
30/// variables or Python `configure()`.
31#[derive(Clone, Debug, Serialize, Named)]
32#[named("hyperactor_mesh::config::SocketAddrStr")]
33pub enum SocketAddrStr {
34 /// Compile-time default (const-constructible).
35 Static(&'static str),
36 /// Runtime value from env / config.
37 Value(String),
38}
39
40impl<'de> Deserialize<'de> for SocketAddrStr {
41 fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
42 where
43 D: serde::Deserializer<'de>,
44 {
45 #[derive(Deserialize)]
46 enum Helper {
47 Static(String),
48 Value(String),
49 }
50 match Helper::deserialize(deserializer)? {
51 Helper::Static(s) | Helper::Value(s) => Ok(SocketAddrStr::Value(s)),
52 }
53 }
54}
55
56impl From<String> for SocketAddrStr {
57 fn from(s: String) -> Self {
58 SocketAddrStr::Value(s)
59 }
60}
61
62impl From<SocketAddrStr> for String {
63 fn from(s: SocketAddrStr) -> Self {
64 s.as_ref().to_owned()
65 }
66}
67
68impl AsRef<str> for SocketAddrStr {
69 fn as_ref(&self) -> &str {
70 match self {
71 SocketAddrStr::Static(s) => s,
72 SocketAddrStr::Value(s) => s,
73 }
74 }
75}
76
77impl std::fmt::Display for SocketAddrStr {
78 fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
79 f.write_str(self.as_ref())
80 }
81}
82
83impl AttrValue for SocketAddrStr {
84 fn display(&self) -> String {
85 self.as_ref().to_owned()
86 }
87
88 fn parse(value: &str) -> Result<Self, anyhow::Error> {
89 value.parse::<SocketAddr>()?;
90 Ok(SocketAddrStr::Value(value.to_string()))
91 }
92}
93
94impl SocketAddrStr {
95 /// Parse the contained string as a `SocketAddr`.
96 pub fn parse_socket_addr(&self) -> Result<SocketAddr, std::net::AddrParseError> {
97 self.as_ref().parse()
98 }
99}
100
101// Declare hyperactor_mesh-specific configuration keys
102declare_attrs! {
103 /// The maximium for a dimension size allowed for a folded shape
104 /// when reshaping during casting to limit fanout.
105 /// usize::MAX means no reshaping as any shape will always be below
106 /// the limit so no dimension needs to be folded.
107 @meta(CONFIG = ConfigAttr::new(
108 Some("HYPERACTOR_MESH_MAX_CAST_DIMENSION_SIZE".to_string()),
109 Some("max_cast_dimension_size".to_string()),
110 ))
111 pub attr MAX_CAST_DIMENSION_SIZE: usize = 16;
112
113 /// Which builtin process launcher backend to use.
114 /// Accepted values: "native" (default), "systemd".
115 /// Trimmed and lowercased before matching.
116 ///
117 /// **Precedence:** Python spawner (via SetProcSpawner) overrides this.
118 @meta(CONFIG = ConfigAttr::new(
119 Some("HYPERACTOR_MESH_PROC_LAUNCHER_KIND".to_string()),
120 Some("proc_launcher_kind".to_string()),
121 ))
122 pub attr MESH_PROC_LAUNCHER_KIND: String = String::new();
123
124 /// Default socket address for the mesh admin HTTP server.
125 ///
126 /// Parsed as a `SocketAddr` (e.g. `[::]:1729`, `0.0.0.0:8080`).
127 /// Used as the bind address when no explicit address is provided
128 /// to `MeshAdminAgent`.
129 @meta(CONFIG = ConfigAttr::new(
130 Some("HYPERACTOR_MESH_ADMIN_ADDR".to_string()),
131 Some("mesh_admin_addr".to_string()),
132 ))
133 pub attr MESH_ADMIN_ADDR: SocketAddrStr = SocketAddrStr::Static("[::]:1729");
134
135 /// Timeout for fallback queries to actors/procs that may have been
136 /// recently destroyed. The second-chance paths in `resolve_proc_node`
137 /// and `resolve_actor_node` fire after the fast QueryChild lookup
138 /// fails. A short budget here prevents dead actors from blocking the
139 /// single-threaded MeshAdminAgent message loop.
140 @meta(CONFIG = ConfigAttr::new(
141 Some("HYPERACTOR_MESH_ADMIN_RESOLVE_ACTOR_TIMEOUT".to_string()),
142 Some("mesh_admin_resolve_actor_timeout".to_string()),
143 ))
144 pub attr MESH_ADMIN_RESOLVE_ACTOR_TIMEOUT: Duration = Duration::from_millis(200);
145
146 /// Maximum number of concurrent resolve requests the HTTP bridge
147 /// forwards to the MeshAdminAgent. Excess requests receive 503
148 /// immediately. Protects the shared tokio runtime from query floods
149 /// (e.g. multiple TUI clients, rapid polling). Increase if the admin
150 /// server serves many concurrent clients that need low-latency
151 /// responses; decrease if introspection queries interfere with the
152 /// actor workload under churn.
153 @meta(CONFIG = ConfigAttr::new(
154 Some("HYPERACTOR_MESH_ADMIN_MAX_CONCURRENT_RESOLVES".to_string()),
155 Some("mesh_admin_max_concurrent_resolves".to_string()),
156 ))
157 pub attr MESH_ADMIN_MAX_CONCURRENT_RESOLVES: usize = 2;
158
159 /// Timeout for the config-push barrier during `HostMesh::attach()`.
160 ///
161 /// When attaching to pre-existing workers (simple bootstrap), the
162 /// client pushes its propagatable config to each host agent and
163 /// waits for confirmation. If the barrier does not complete within
164 /// this duration, a warning is logged and attach continues without
165 /// blocking — config push is best-effort.
166 @meta(CONFIG = ConfigAttr::new(
167 Some("HYPERACTOR_MESH_ATTACH_CONFIG_TIMEOUT".to_string()),
168 Some("mesh_attach_config_timeout".to_string()),
169 ))
170 pub attr MESH_ATTACH_CONFIG_TIMEOUT: Duration = Duration::from_secs(10);
171
172 /// Timeout for targeted introspection queries that hit a single,
173 /// specific host. Kept short so a slow or dying actor cannot block
174 /// the single-threaded MeshAdminAgent message loop.
175 @meta(CONFIG = ConfigAttr::new(
176 Some("HYPERACTOR_MESH_ADMIN_SINGLE_HOST_TIMEOUT".to_string()),
177 Some("mesh_admin_single_host_timeout".to_string()),
178 ))
179 pub attr MESH_ADMIN_SINGLE_HOST_TIMEOUT: Duration = Duration::from_secs(3);
180
181 /// Timeout for QueryChild snapshot lookups in resolve_actor_node.
182 /// QueryChild is handled by a synchronous callback — it either
183 /// returns immediately or returns Error. A short budget ensures
184 /// the total time for resolve_actor_node stays well under
185 /// `MESH_ADMIN_SINGLE_HOST_TIMEOUT`.
186 @meta(CONFIG = ConfigAttr::new(
187 Some("HYPERACTOR_MESH_ADMIN_QUERY_CHILD_TIMEOUT".to_string()),
188 Some("mesh_admin_query_child_timeout".to_string()),
189 ))
190 pub attr MESH_ADMIN_QUERY_CHILD_TIMEOUT: Duration = Duration::from_millis(100);
191
192 /// Timeout for the end-to-end `/v1/config/{proc}` bridge reply.
193 /// The config-dump path forwards a `ConfigDump` message through
194 /// the HostAgent bridge and waits for `ConfigDumpResult`. This is
195 /// inter-process actor messaging — fundamentally slower than local
196 /// `QueryChild` snapshot lookups (which use
197 /// `MESH_ADMIN_QUERY_CHILD_TIMEOUT`). During startup, the
198 /// HostAgent message loop may be busy processing actor
199 /// registrations, so bridge latency can exceed several seconds.
200 @meta(CONFIG = ConfigAttr::new(
201 Some("HYPERACTOR_MESH_ADMIN_CONFIG_DUMP_BRIDGE_TIMEOUT".to_string()),
202 Some("mesh_admin_config_dump_bridge_timeout".to_string()),
203 ))
204 pub attr MESH_ADMIN_CONFIG_DUMP_BRIDGE_TIMEOUT: Duration = Duration::from_secs(5);
205
206 /// Timeout for py-spy dump requests. See PS-5 in `introspect`
207 /// module doc. With `--native --native-all`, py-spy unwinds native
208 /// stacks via libunwind which is significantly slower than
209 /// Python-only capture (~100ms). 10s accommodates native unwinding
210 /// on heavily loaded hosts. Independent of
211 /// `MESH_ADMIN_SINGLE_HOST_TIMEOUT` because py-spy does real I/O
212 /// (subprocess + ptrace) rather than actor messaging.
213 @meta(CONFIG = ConfigAttr::new(
214 Some("HYPERACTOR_MESH_ADMIN_PYSPY_TIMEOUT".to_string()),
215 Some("mesh_admin_pyspy_timeout".to_string()),
216 ))
217 pub attr MESH_ADMIN_PYSPY_TIMEOUT: Duration = Duration::from_secs(10);
218
219 /// Timeout for the `/v1/tree` fan-out. Kept generous because the
220 /// tree dump walks every host and proc in the mesh.
221 @meta(CONFIG = ConfigAttr::new(
222 Some("HYPERACTOR_MESH_ADMIN_TREE_TIMEOUT".to_string()),
223 Some("mesh_admin_tree_timeout".to_string()),
224 ))
225 pub attr MESH_ADMIN_TREE_TIMEOUT: Duration = Duration::from_secs(10);
226
227 /// Bridge-side timeout for py-spy dump requests. Must exceed
228 /// `MESH_ADMIN_PYSPY_TIMEOUT` to allow the subprocess kill/reap
229 /// and reply delivery to arrive before declaring `gateway_timeout`.
230 /// See PS-6 in `introspect` module doc.
231 @meta(CONFIG = ConfigAttr::new(
232 Some("HYPERACTOR_MESH_ADMIN_PYSPY_BRIDGE_TIMEOUT".to_string()),
233 Some("mesh_admin_pyspy_bridge_timeout".to_string()),
234 ))
235 pub attr MESH_ADMIN_PYSPY_BRIDGE_TIMEOUT: Duration = Duration::from_secs(13);
236
237 /// Client-side timeout for py-spy requests. Must exceed
238 /// `MESH_ADMIN_PYSPY_BRIDGE_TIMEOUT` so the server can return a
239 /// structured `PySpyResult` even when the subprocess uses the
240 /// full budget. See PS-6 in `introspect` module doc.
241 @meta(CONFIG = ConfigAttr::new(
242 Some("HYPERACTOR_MESH_ADMIN_PYSPY_CLIENT_TIMEOUT".to_string()),
243 Some("mesh_admin_pyspy_client_timeout".to_string()),
244 ))
245 pub attr MESH_ADMIN_PYSPY_CLIENT_TIMEOUT: Duration = Duration::from_secs(20);
246
247 /// Path to the py-spy binary. When non-empty, tried before
248 /// the fallback `"py-spy"` PATH lookup. See PS-3 in
249 /// `introspect` module doc.
250 ///
251 /// Note: env var is `PYSPY_BIN` (not `HYPERACTOR_MESH_PYSPY_BIN`)
252 /// to preserve backward compatibility with existing deployments
253 /// that already set `PYSPY_BIN`.
254 @meta(CONFIG = ConfigAttr::new(
255 Some("PYSPY_BIN".to_string()),
256 Some("pyspy_bin".to_string()),
257 ))
258 pub attr PYSPY_BIN: String = String::new();
259}