// monarch_rdma/backend/ibverbs.rs

/*
 * Copyright (c) Meta Platforms, Inc. and affiliates.
 * All rights reserved.
 *
 * This source code is licensed under the BSD-style license found in the
 * LICENSE file in the root directory of this source tree.
 */

//! ibverbs backend implementation for RDMA operations.

11use std::sync::Arc;
12
13use hyperactor::ActorRef;
14use hyperactor::actor::Referable;
15use serde::Deserialize;
16use serde::Serialize;
17use typeuri::Named;
18
19pub mod device_selection;
20pub(crate) mod domain;
21pub mod manager_actor;
22pub mod primitives;
23mod processor_actor;
24pub mod queue_pair;
25
26use manager_actor::IbvManagerActor;
27pub use queue_pair::IbvQueuePair;
28pub use queue_pair::PollTarget;
29
30#[cfg(test)]
31mod ibv_manager_actor_tests;
32#[cfg(test)]
33mod mlx5dv_tests;
34#[cfg(test)]
35mod test_utils;
36
37use crate::RdmaOpType;
38use crate::local_memory::KeepaliveLocalMemory;
39
40/// Lazily-initialized ibverbs transport details for a registered memory
41/// region. Retrieved on demand from the [`IbvManagerActor`] via
42/// [`IbvManagerMessage::RequestBuffer`].
43#[derive(Debug, Clone, Serialize, Deserialize, Named)]
44pub struct IbvBuffer {
45    pub mr_id: usize,
46    pub lkey: u32,
47    pub rkey: u32,
48    /// RDMA address (may differ from virtual address for CUDA memory).
49    pub addr: usize,
50    pub size: usize,
51    /// Name of the RDMA device this buffer is associated with (e.g., "mlx5_0").
52    pub device_name: String,
53}
54
55/// A single RDMA op for the [`IbvBackend`](manager_actor::IbvBackend).
56///
57/// Generic over the manager actor type so unit tests can swap in a
58/// mock; production code uses the default `IbvOp<IbvManagerActor>`.
59#[derive(Debug, Named)]
60pub struct IbvOp<M: Referable = IbvManagerActor> {
61    pub op_type: RdmaOpType,
62    pub local_memory: Arc<KeepaliveLocalMemory>,
63    pub remote_buffer: IbvBuffer,
64    pub remote_manager: ActorRef<M>,
65}
66
67// Hand-rolled `Clone` to avoid the `M: Clone` bound the derive macro
68// would add (`ActorRef<M>` is `Clone` for all `M: Referable`).
69impl<M: Referable> Clone for IbvOp<M> {
70    fn clone(&self) -> Self {
71        Self {
72            op_type: self.op_type,
73            local_memory: Arc::clone(&self.local_memory),
74            remote_buffer: self.remote_buffer.clone(),
75            remote_manager: self.remote_manager.clone(),
76        }
77    }
78}