monarch_rdma/
lib.rs

/*
 * Copyright (c) Meta Platforms, Inc. and affiliates.
 * All rights reserved.
 *
 * This source code is licensed under the BSD-style license found in the
 * LICENSE file in the root directory of this source tree.
 */
8
// RDMA requires frequent unsafe code blocks
10#![allow(clippy::undocumented_unsafe_blocks)]
11
12use std::sync::Arc;
13
14use local_memory::RdmaLocalMemory;
15use serde::Deserialize;
16use serde::Serialize;
17
18#[macro_use]
19mod macros;
20
21pub mod backend;
22pub mod config;
23pub mod device_selection;
24pub mod efa;
25pub mod local_memory;
26mod rdma_components;
27mod rdma_manager_actor;
28
29pub use backend::ibverbs::primitives::*;
30
31/// Whether any RDMA backend is available on this system.
32///
33/// Returns true if ibverbs hardware is present, or if TCP fallback
34/// is enabled via [`config::RDMA_ALLOW_TCP_FALLBACK`].
35pub fn rdma_supported() -> bool {
36    ibverbs_supported() || hyperactor_config::global::get(config::RDMA_ALLOW_TCP_FALLBACK)
37}
38pub use rdma_components::RdmaRemoteBuffer;
39pub use rdma_components::SegmentScannerFn;
// Re-export segment scanner types for extension crate
41pub use rdma_components::register_segment_scanner;
42pub use rdma_components::*;
43pub use rdma_manager_actor::*;
// Re-export rdmaxcel_sys for extension crate to access types
45pub use rdmaxcel_sys;
46pub use test_utils::is_cuda_available;
47
48/// Type of RDMA operation.
49#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
50pub enum RdmaOpType {
51    ReadIntoLocal,
52    WriteFromLocal,
53}
54
55/// A single RDMA operation to be submitted to a backend.
56#[derive(Debug)]
57pub struct RdmaOp {
58    pub op_type: RdmaOpType,
59    pub local: Arc<dyn RdmaLocalMemory>,
60    pub remote: RdmaRemoteBuffer,
61}
62
/// Transport level, ordered slowest to fastest.
///
/// The `Ord` implementation reflects this ordering, enabling transport
/// selection via comparison (e.g., "at least NIC speed").
///
/// `PartialOrd`/`Ord` are derived, so the comparison order is the declaration
/// order of the variants below. Keep them listed slowest to fastest; reordering
/// or inserting a variant changes every comparison.
#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash)]
pub enum RdmaTransportLevel {
    /// TCP/IP sockets (fallback transport).
    Tcp,
    /// RDMA NIC (RoCE, InfiniBand, EFA).
    Nic,
    /// Direct memory access (NVLink, shared memory).
    Memory,
}
76
77/// Print comprehensive RDMA device information for debugging.
78/// Controlled by MONARCH_DEBUG_RDMA environment variable.
79pub fn print_device_info_if_debug_enabled(context: *mut rdmaxcel_sys::ibv_context) {
80    if std::env::var("MONARCH_DEBUG_RDMA").is_ok() {
81        unsafe {
82            rdmaxcel_sys::rdmaxcel_print_device_info(context);
83        }
84    }
85}
86
87/// Print comprehensive RDMA device information for debugging (always prints).
88pub fn print_device_info(context: *mut rdmaxcel_sys::ibv_context) {
89    unsafe {
90        rdmaxcel_sys::rdmaxcel_print_device_info(context);
91    }
92}
93
94mod test_utils;