Skip to main content

monarch_rdma/
backend.rs

1/*
2 * Copyright (c) Meta Platforms, Inc. and affiliates.
3 * All rights reserved.
4 *
5 * This source code is licensed under the BSD-style license found in the
6 * LICENSE file in the root directory of this source tree.
7 */
8
9//! RDMA backend implementations.
10
11#[cfg(any(test, feature = "test-utils"))]
12pub mod cuda_test_utils;
13pub mod ibverbs;
14pub mod tcp;
15
16use std::fmt::Debug;
17use std::time::Duration;
18
19use anyhow::Result;
20use async_trait::async_trait;
21use hyperactor::ActorRef;
22use serde::Deserialize;
23use serde::Serialize;
24
25use crate::RdmaOp;
26use crate::RdmaTransportLevel;
27
28/// Backend-specific context for a remote buffer.
29///
30/// Each variant holds the information needed to perform RDMA operations
31/// using that backend on a particular buffer.
32#[derive(Debug, Clone, Serialize, Deserialize)]
33pub enum RdmaRemoteBackendContext {
34    Ibverbs(
35        ActorRef<ibverbs::manager_actor::IbvManagerActor>,
36        ibverbs::IbvBuffer,
37    ),
38    Tcp(ActorRef<tcp::manager_actor::TcpManagerActor>),
39}
40
41/// Backend for executing RDMA operations over a specific transport.
42///
43/// Each backend manages the transport-specific details of connection
44/// management and data movement. The backend decides internally how to
45/// batch and schedule submitted operations.
46///
47/// Current implementations:
48/// - [`ibverbs::IbvManagerActor`] -- ibverbs NIC transport
49/// - [`tcp::TcpManagerActor`] -- TCP fallback transport
50#[async_trait]
51pub trait RdmaBackend: Send + Debug {
52    /// Backend-specific transport details (e.g., a cffi struct with raw
53    /// ibverbs handles for GPU-initiated RDMA).
54    type TransportInfo;
55
56    /// Submit a batch of RDMA operations.
57    ///
58    /// The backend decides internally how to batch, schedule, and execute
59    /// the operations (e.g., managing QPs and connections as needed).
60    async fn submit(
61        &mut self,
62        cx: &(impl hyperactor::context::Actor + Send + Sync),
63        ops: Vec<RdmaOp>,
64        timeout: Duration,
65    ) -> Result<()>;
66
67    /// The transport level provided by this backend.
68    fn transport_level(&self) -> RdmaTransportLevel;
69
70    /// Low-level backend-specific transport details for direct control
71    /// over RDMA operations (e.g., from a GPU kernel).
72    fn transport_info(&self) -> Option<Self::TransportInfo>;
73}