monarch_rdma/config.rs
/*
 * Copyright (c) Meta Platforms, Inc. and affiliates.
 * All rights reserved.
 *
 * This source code is licensed under the BSD-style license found in the
 * LICENSE file in the root directory of this source tree.
 */

//! RDMA configuration attributes.
10
11use std::time::Duration;
12
13use hyperactor_config::CONFIG;
14use hyperactor_config::ConfigAttr;
15use hyperactor_config::attrs::declare_attrs;
16
17declare_attrs! {
18 /// Maximum chunk size in MiB for TCP-based RDMA transfers.
19 @meta(CONFIG = ConfigAttr::new(
20 Some("MONARCH_RDMA_MAX_CHUNK_SIZE_MB".to_string()),
21 Some("rdma_max_chunk_size_mb".to_string()),
22 ))
23 pub attr RDMA_MAX_CHUNK_SIZE_MB: usize = 64;
24
25 /// Allow TCP fallback when ibverbs hardware is unavailable.
26 ///
27 /// When true (the default), RDMA operations fall back to chunked
28 /// hyperactor messaging over the default channel transport. When
29 /// false, operations fail if no ibverbs backend is available.
30 @meta(CONFIG = ConfigAttr::new(
31 Some("MONARCH_RDMA_ALLOW_TCP_FALLBACK".to_string()),
32 Some("rdma_allow_tcp_fallback".to_string()),
33 ))
34 pub attr RDMA_ALLOW_TCP_FALLBACK: bool = true;
35
36 /// Disable ibverbs even when hardware is present.
37 ///
38 /// When true, `RdmaManagerActor` skips ibverbs initialization and
39 /// relies on the TCP fallback (if enabled). Useful for testing the
40 /// TCP transport on machines that have RDMA hardware.
41 @meta(CONFIG = ConfigAttr::new(
42 Some("MONARCH_RDMA_DISABLE_IBVERBS".to_string()),
43 Some("rdma_disable_ibverbs".to_string()),
44 ))
45 pub attr RDMA_DISABLE_IBVERBS: bool = false;
46
47 /// Number of parallel channels for TCP fallback transfers.
48 ///
49 /// When greater than 1, each [`TcpManagerActor`] serves this many
50 /// direct `hyperactor::channel` connections for bulk data transfer,
51 /// bypassing the single-socket actor mailbox. Default is 1
52 /// (sequential, existing behavior).
53 @meta(CONFIG = ConfigAttr::new(
54 Some("MONARCH_RDMA_TCP_FALLBACK_PARALLELISM".to_string()),
55 Some("rdma_tcp_fallback_parallelism".to_string()),
56 ))
57 pub attr RDMA_TCP_FALLBACK_PARALLELISM: usize = 1;
58
59 /// Cooperative-yield window for the ibverbs CQ poll loop. While
60 /// the policy is within this window it calls
61 /// `tokio::task::yield_now` between polls; past it, polls fall
62 /// into an exponential backoff sleep (1ms initial, x2, capped at
63 /// 10ms). `None` (the default) disables the cutoff entirely:
64 /// the loop only ever yields, never sleeps.
65 @meta(CONFIG = ConfigAttr::new(
66 Some("MONARCH_RDMA_CQ_BUSY_POLL_WINDOW".to_string()),
67 Some("rdma_cq_busy_poll_window".to_string()),
68 ))
69 pub attr RDMA_CQ_BUSY_POLL_WINDOW: Option<Duration> = None;
70
71 /// Capacity of the per-processor LRU cache that memoizes
72 /// `IbvMemoryRegionView`s by `(virtual_addr, size)`. Hits skip
73 /// the manager round-trip; misses ask the manager to register
74 /// the region and insert the result. A value of `0` is clamped
75 /// to `1` (the LRU is effectively disabled at that size, but
76 /// the processor still functions).
77 @meta(CONFIG = ConfigAttr::new(
78 Some("MONARCH_RDMA_MR_LRU_CACHE_SIZE".to_string()),
79 Some("rdma_mr_lru_cache_size".to_string()),
80 ))
81 pub attr RDMA_MR_LRU_CACHE_SIZE: usize = 1024;
82
83 /// Per-side budget for the `QueuePairInitializer` handshake. The
84 /// timer arms once when we send `EnsureQueuePair` and is rearmed
85 /// after we hit RTS while still waiting for the peer's
86 /// `NotifyRts`. If it fires the entry is tombstoned with a
87 /// `qp_initializer_failed` so further `RequestQueuePair` calls
88 /// for the same key surface the same error rather than hanging.
89 @meta(CONFIG = ConfigAttr::new(
90 Some("MONARCH_RDMA_QP_INIT_TIMEOUT".to_string()),
91 Some("rdma_qp_init_timeout".to_string()),
92 ))
93 pub attr RDMA_QP_INIT_TIMEOUT: Duration = Duration::from_secs(30);
94}