// monarch_rdma/backend.rs
/*
 * Copyright (c) Meta Platforms, Inc. and affiliates.
 * All rights reserved.
 *
 * This source code is licensed under the BSD-style license found in the
 * LICENSE file in the root directory of this source tree.
 */

//! RDMA backend implementations.

11#[cfg(any(test, feature = "test-utils"))]
12pub mod cuda_test_utils;
13pub mod ibverbs;
14pub mod tcp;
15
16use std::fmt::Debug;
17use std::time::Duration;
18
19use anyhow::Result;
20use async_trait::async_trait;
21use hyperactor::ActorRef;
22use serde::Deserialize;
23use serde::Serialize;
24
25use crate::RdmaOp;
26use crate::RdmaTransportLevel;
27
28/// Backend-specific context for a remote buffer.
29///
30/// Each variant holds the information needed to perform RDMA operations
31/// using that backend on a particular buffer.
32#[derive(Debug, Clone, Serialize, Deserialize)]
33pub enum RdmaRemoteBackendContext {
34 Ibverbs(
35 ActorRef<ibverbs::manager_actor::IbvManagerActor>,
36 ibverbs::IbvBuffer,
37 ),
38 Tcp(ActorRef<tcp::manager_actor::TcpManagerActor>),
39}
40
41/// Backend for executing RDMA operations over a specific transport.
42///
43/// Each backend manages the transport-specific details of connection
44/// management and data movement. The backend decides internally how to
45/// batch and schedule submitted operations.
46///
47/// Current implementations:
48/// - [`ibverbs::IbvManagerActor`] -- ibverbs NIC transport
49/// - [`tcp::TcpManagerActor`] -- TCP fallback transport
50#[async_trait]
51pub trait RdmaBackend: Send + Debug {
52 /// Backend-specific transport details (e.g., a cffi struct with raw
53 /// ibverbs handles for GPU-initiated RDMA).
54 type TransportInfo;
55
56 /// Submit a batch of RDMA operations.
57 ///
58 /// The backend decides internally how to batch, schedule, and execute
59 /// the operations (e.g., managing QPs and connections as needed).
60 async fn submit(
61 &mut self,
62 cx: &(impl hyperactor::context::Actor + Send + Sync),
63 ops: Vec<RdmaOp>,
64 timeout: Duration,
65 ) -> Result<()>;
66
67 /// The transport level provided by this backend.
68 fn transport_level(&self) -> RdmaTransportLevel;
69
70 /// Low-level backend-specific transport details for direct control
71 /// over RDMA operations (e.g., from a GPU kernel).
72 fn transport_info(&self) -> Option<Self::TransportInfo>;
73}