Skip to main content

monarch_rdma/
config.rs

/*
 * Copyright (c) Meta Platforms, Inc. and affiliates.
 * All rights reserved.
 *
 * This source code is licensed under the BSD-style license found in the
 * LICENSE file in the root directory of this source tree.
 */

//! RDMA configuration attributes.

use std::time::Duration;

use hyperactor_config::CONFIG;
use hyperactor_config::ConfigAttr;
use hyperactor_config::attrs::declare_attrs;

17declare_attrs! {
18    /// Maximum chunk size in MiB for TCP-based RDMA transfers.
19    @meta(CONFIG = ConfigAttr::new(
20        Some("MONARCH_RDMA_MAX_CHUNK_SIZE_MB".to_string()),
21        Some("rdma_max_chunk_size_mb".to_string()),
22    ))
23    pub attr RDMA_MAX_CHUNK_SIZE_MB: usize = 64;
24
25    /// Allow TCP fallback when ibverbs hardware is unavailable.
26    ///
27    /// When true (the default), RDMA operations fall back to chunked
28    /// hyperactor messaging over the default channel transport. When
29    /// false, operations fail if no ibverbs backend is available.
30    @meta(CONFIG = ConfigAttr::new(
31        Some("MONARCH_RDMA_ALLOW_TCP_FALLBACK".to_string()),
32        Some("rdma_allow_tcp_fallback".to_string()),
33    ))
34    pub attr RDMA_ALLOW_TCP_FALLBACK: bool = true;
35
36    /// Disable ibverbs even when hardware is present.
37    ///
38    /// When true, `RdmaManagerActor` skips ibverbs initialization and
39    /// relies on the TCP fallback (if enabled). Useful for testing the
40    /// TCP transport on machines that have RDMA hardware.
41    @meta(CONFIG = ConfigAttr::new(
42        Some("MONARCH_RDMA_DISABLE_IBVERBS".to_string()),
43        Some("rdma_disable_ibverbs".to_string()),
44    ))
45    pub attr RDMA_DISABLE_IBVERBS: bool = false;
46
47    /// Number of parallel channels for TCP fallback transfers.
48    ///
49    /// When greater than 1, each [`TcpManagerActor`] serves this many
50    /// direct `hyperactor::channel` connections for bulk data transfer,
51    /// bypassing the single-socket actor mailbox. Default is 1
52    /// (sequential, existing behavior).
53    @meta(CONFIG = ConfigAttr::new(
54        Some("MONARCH_RDMA_TCP_FALLBACK_PARALLELISM".to_string()),
55        Some("rdma_tcp_fallback_parallelism".to_string()),
56    ))
57    pub attr RDMA_TCP_FALLBACK_PARALLELISM: usize = 1;
58
59    /// Cooperative-yield window for the ibverbs CQ poll loop. While
60    /// the policy is within this window it calls
61    /// `tokio::task::yield_now` between polls; past it, polls fall
62    /// into an exponential backoff sleep (1ms initial, x2, capped at
63    /// 10ms). `None` (the default) disables the cutoff entirely:
64    /// the loop only ever yields, never sleeps.
65    @meta(CONFIG = ConfigAttr::new(
66        Some("MONARCH_RDMA_CQ_BUSY_POLL_WINDOW".to_string()),
67        Some("rdma_cq_busy_poll_window".to_string()),
68    ))
69    pub attr RDMA_CQ_BUSY_POLL_WINDOW: Option<Duration> = None;
70
71    /// Capacity of the per-processor LRU cache that memoizes
72    /// `IbvMemoryRegionView`s by `(virtual_addr, size)`. Hits skip
73    /// the manager round-trip; misses ask the manager to register
74    /// the region and insert the result. A value of `0` is clamped
75    /// to `1` (the LRU is effectively disabled at that size, but
76    /// the processor still functions).
77    @meta(CONFIG = ConfigAttr::new(
78        Some("MONARCH_RDMA_MR_LRU_CACHE_SIZE".to_string()),
79        Some("rdma_mr_lru_cache_size".to_string()),
80    ))
81    pub attr RDMA_MR_LRU_CACHE_SIZE: usize = 1024;
82
83    /// Per-side budget for the `QueuePairInitializer` handshake. The
84    /// timer arms once when we send `EnsureQueuePair` and is rearmed
85    /// after we hit RTS while still waiting for the peer's
86    /// `NotifyRts`. If it fires the entry is tombstoned with a
87    /// `qp_initializer_failed` so further `RequestQueuePair` calls
88    /// for the same key surface the same error rather than hanging.
89    @meta(CONFIG = ConfigAttr::new(
90        Some("MONARCH_RDMA_QP_INIT_TIMEOUT".to_string()),
91        Some("rdma_qp_init_timeout".to_string()),
92    ))
93    pub attr RDMA_QP_INIT_TIMEOUT: Duration = Duration::from_secs(30);
94}