monarch_rdma/ibverbs_primitives.rs
1/*
2 * Portions Copyright (c) Meta Platforms, Inc. and affiliates.
3 * All rights reserved.
4 *
5 * This source code is licensed under the BSD-style license found in the
6 * LICENSE file in the root directory of this source tree.
7 */
8
9/*
10 * Sections of code adapted from
11 * Copyright (c) 2016 Jon Gjengset under MIT License (MIT)
12*/
13
14//! This file contains primitive data structures for interacting with ibverbs.
15//!
16//! Primitives:
17//! - `IbverbsConfig`: Represents ibverbs specific configurations, holding parameters required to establish and
18//! manage an RDMA connection, including settings for the RDMA device, queue pair attributes, and other
19//! connection-specific parameters.
20//! - `RdmaDevice`: Represents an RDMA device, i.e. 'mlx5_0'. Contains information about the device, such as:
21//! its name, vendor ID, vendor part ID, hardware version, firmware version, node GUID, and capabilities.
22//! - `RdmaPort`: Represents information about the port of an RDMA device, including state, physical state,
23//! LID (Local Identifier), and GID (Global Identifier) information.
24//! - `RdmaMemoryRegionView`: Represents a memory region that can be registered with an RDMA device for direct
25//! memory access operations.
26//! - `RdmaOperation`: Represents the type of RDMA operation to perform (Read or Write).
27//! - `RdmaQpInfo`: Contains connection information needed to establish an RDMA connection with a remote endpoint.
28//! - `IbvWc`: Wrapper around ibverbs work completion structure, used to track the status of RDMA operations.
29use std::ffi::CStr;
30use std::fmt;
31
32use hyperactor::Named;
33use serde::Deserialize;
34use serde::Serialize;
35
/// A 16-byte GID (Global Identifier) stored as raw bytes.
///
/// The type is `#[repr(transparent)]` over `[u8; 16]`, so it has exactly the
/// size and alignment of the byte array it wraps.
#[derive(
    Default,
    Copy,
    Clone,
    Debug,
    Eq,
    PartialEq,
    Hash,
    serde::Serialize,
    serde::Deserialize
)]
#[repr(transparent)]
pub struct Gid {
    /// The 16 raw GID bytes; the first 8 are read as the subnet prefix and the
    /// last 8 as the interface id (both big-endian) by the accessors on `Gid`.
    raw: [u8; 16],
}
51
52impl Gid {
53 #[allow(dead_code)]
54 fn subnet_prefix(&self) -> u64 {
55 u64::from_be_bytes(self.raw[..8].try_into().unwrap())
56 }
57
58 #[allow(dead_code)]
59 fn interface_id(&self) -> u64 {
60 u64::from_be_bytes(self.raw[8..].try_into().unwrap())
61 }
62}
63impl From<rdmaxcel_sys::ibv_gid> for Gid {
64 fn from(gid: rdmaxcel_sys::ibv_gid) -> Self {
65 Self {
66 raw: unsafe { gid.raw },
67 }
68 }
69}
70
71impl From<Gid> for rdmaxcel_sys::ibv_gid {
72 fn from(mut gid: Gid) -> Self {
73 *gid.as_mut()
74 }
75}
76
77impl AsRef<rdmaxcel_sys::ibv_gid> for Gid {
78 fn as_ref(&self) -> &rdmaxcel_sys::ibv_gid {
79 unsafe { &*self.raw.as_ptr().cast::<rdmaxcel_sys::ibv_gid>() }
80 }
81}
82
83impl AsMut<rdmaxcel_sys::ibv_gid> for Gid {
84 fn as_mut(&mut self) -> &mut rdmaxcel_sys::ibv_gid {
85 unsafe { &mut *self.raw.as_mut_ptr().cast::<rdmaxcel_sys::ibv_gid>() }
86 }
87}
88
/// Represents ibverbs specific configurations.
///
/// This struct holds various parameters required to establish and manage an RDMA connection.
/// It includes settings for the RDMA device, queue pair attributes, and other connection-specific
/// parameters.
#[derive(Debug, Named, Clone, Serialize, Deserialize)]
pub struct IbverbsConfig {
    /// `device` - The RDMA device to use for the connection.
    pub device: RdmaDevice,
    /// `cq_entries` - The number of completion queue entries.
    pub cq_entries: i32,
    /// `port_num` - The physical port number on the device.
    pub port_num: u8,
    /// `gid_index` - The GID index for the RDMA device.
    pub gid_index: u8,
    /// `max_send_wr` - The maximum number of outstanding send work requests.
    pub max_send_wr: u32,
    /// `max_recv_wr` - The maximum number of outstanding receive work requests.
    pub max_recv_wr: u32,
    /// `max_send_sge` - The maximum number of scatter/gather elements in a send work request.
    pub max_send_sge: u32,
    /// `max_recv_sge` - The maximum number of scatter/gather elements in a receive work request.
    pub max_recv_sge: u32,
    /// `path_mtu` - The path MTU (Maximum Transmission Unit) for the connection.
    pub path_mtu: u32,
    /// `retry_cnt` - The number of retry attempts for a connection request.
    pub retry_cnt: u8,
    /// `rnr_retry` - The number of retry attempts for a receiver not ready (RNR) condition.
    pub rnr_retry: u8,
    /// `qp_timeout` - The timeout for a queue pair operation.
    pub qp_timeout: u8,
    /// `min_rnr_timer` - The minimum RNR timer value.
    pub min_rnr_timer: u8,
    /// `max_dest_rd_atomic` - The maximum number of outstanding RDMA read operations at the destination.
    pub max_dest_rd_atomic: u8,
    /// `max_rd_atomic` - The maximum number of outstanding RDMA read operations at the initiator.
    pub max_rd_atomic: u8,
    /// `pkey_index` - The partition key index.
    pub pkey_index: u16,
    /// `psn` - The packet sequence number.
    pub psn: u32,
    /// `use_gpu_direct` - Whether to enable GPU Direct RDMA support on init.
    pub use_gpu_direct: bool,
}
133
134/// Default RDMA parameters below are based on common values from rdma-core examples
135/// (e.g. rc_pingpong). For high-performance or production use, consider tuning
136/// based on ibv_query_device() results and workload characteristics.
137impl Default for IbverbsConfig {
138 fn default() -> Self {
139 Self {
140 device: RdmaDevice::default(),
141 cq_entries: 10,
142 port_num: 1,
143 gid_index: 3,
144 max_send_wr: 1,
145 max_recv_wr: 1,
146 max_send_sge: 4, // min value 4, may need to be muliple of 2.
147 max_recv_sge: 4, // min value 4, may need to be muliple of 2.
148 path_mtu: rdmaxcel_sys::IBV_MTU_1024,
149 retry_cnt: 7,
150 rnr_retry: 7,
151 qp_timeout: 14, // 4.096 μs * 2^14 = ~67 ms
152 min_rnr_timer: 12,
153 max_dest_rd_atomic: 1,
154 max_rd_atomic: 1,
155 pkey_index: 0,
156 psn: rand::random::<u32>() & 0xffffff,
157 use_gpu_direct: true,
158 }
159 }
160}
161
162impl std::fmt::Display for IbverbsConfig {
163 fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
164 write!(
165 f,
166 "IbverbsConfig {{ device: {}, port_num: {}, gid_index: {}, max_send_wr: {}, max_recv_wr: {}, max_send_sge: {}, max_recv_sge: {}, path_mtu: {:?}, retry_cnt: {}, rnr_retry: {}, qp_timeout: {}, min_rnr_timer: {}, max_dest_rd_atomic: {}, max_rd_atomic: {}, pkey_index: {}, psn: 0x{:x} }}",
167 self.device.name(),
168 self.port_num,
169 self.gid_index,
170 self.max_send_wr,
171 self.max_recv_wr,
172 self.max_send_sge,
173 self.max_recv_sge,
174 self.path_mtu,
175 self.retry_cnt,
176 self.rnr_retry,
177 self.qp_timeout,
178 self.min_rnr_timer,
179 self.max_dest_rd_atomic,
180 self.max_rd_atomic,
181 self.pkey_index,
182 self.psn,
183 )
184 }
185}
186
187/// Represents an RDMA device in the system.
188///
189/// This struct encapsulates information about an RDMA device, including its hardware
190/// characteristics, capabilities, and port information. It provides access to device
191/// attributes such as vendor information, firmware version, and supported features.
192///
193/// # Examples
194///
195/// ```
196/// use monarch_rdma::get_all_devices;
197///
198/// let devices = get_all_devices();
199/// if let Some(device) = devices.first() {
200/// // Access device name and firmware version
201/// let device_name = device.name();
202/// let firmware_version = device.fw_ver();
203/// }
204/// ```
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct RdmaDevice {
    /// `name` - The name of the RDMA device (e.g., "mlx5_0").
    ///
    /// This is the only public field; every other attribute is read through
    /// the accessor methods on `RdmaDevice`.
    pub name: String,
    /// `vendor_id` - The vendor ID of the device.
    vendor_id: u32,
    /// `vendor_part_id` - The vendor part ID of the device.
    vendor_part_id: u32,
    /// `hw_ver` - Hardware version of the device.
    hw_ver: u32,
    /// `fw_ver` - Firmware version of the device.
    fw_ver: String,
    /// `node_guid` - Node GUID (Globally Unique Identifier) of the device.
    node_guid: u64,
    /// `ports` - Vector of ports available on this device.
    ports: Vec<RdmaPort>,
    /// `max_qp` - Maximum number of queue pairs supported.
    max_qp: i32,
    /// `max_cq` - Maximum number of completion queues supported.
    max_cq: i32,
    /// `max_mr` - Maximum number of memory regions supported.
    max_mr: i32,
    /// `max_pd` - Maximum number of protection domains supported.
    max_pd: i32,
    /// `max_qp_wr` - Maximum number of work requests per queue pair.
    max_qp_wr: i32,
    /// `max_sge` - Maximum number of scatter/gather elements per work request.
    max_sge: i32,
}
234
235impl RdmaDevice {
236 /// Returns the name of the RDMA device.
237 pub fn name(&self) -> &String {
238 &self.name
239 }
240
241 /// Returns the first available RDMA device, if any.
242 pub fn first_available() -> Option<RdmaDevice> {
243 let devices = get_all_devices();
244 if devices.is_empty() {
245 None
246 } else {
247 Some(devices.into_iter().next().unwrap())
248 }
249 }
250
251 /// Returns the vendor ID of the RDMA device.
252 pub fn vendor_id(&self) -> u32 {
253 self.vendor_id
254 }
255
256 /// Returns the vendor part ID of the RDMA device.
257 pub fn vendor_part_id(&self) -> u32 {
258 self.vendor_part_id
259 }
260
261 /// Returns the hardware version of the RDMA device.
262 pub fn hw_ver(&self) -> u32 {
263 self.hw_ver
264 }
265
266 /// Returns the firmware version of the RDMA device.
267 pub fn fw_ver(&self) -> &String {
268 &self.fw_ver
269 }
270
271 /// Returns the node GUID of the RDMA device.
272 pub fn node_guid(&self) -> u64 {
273 self.node_guid
274 }
275
276 /// Returns a reference to the vector of ports available on the RDMA device.
277 pub fn ports(&self) -> &Vec<RdmaPort> {
278 &self.ports
279 }
280
281 /// Returns the maximum number of queue pairs supported by the RDMA device.
282 pub fn max_qp(&self) -> i32 {
283 self.max_qp
284 }
285
286 /// Returns the maximum number of completion queues supported by the RDMA device.
287 pub fn max_cq(&self) -> i32 {
288 self.max_cq
289 }
290
291 /// Returns the maximum number of memory regions supported by the RDMA device.
292 pub fn max_mr(&self) -> i32 {
293 self.max_mr
294 }
295
296 /// Returns the maximum number of protection domains supported by the RDMA device.
297 pub fn max_pd(&self) -> i32 {
298 self.max_pd
299 }
300
301 /// Returns the maximum number of work requests per queue pair supported by the RDMA device.
302 pub fn max_qp_wr(&self) -> i32 {
303 self.max_qp_wr
304 }
305
306 /// Returns the maximum number of scatter/gather elements per work request supported by the RDMA device.
307 pub fn max_sge(&self) -> i32 {
308 self.max_sge
309 }
310}
311
312impl Default for RdmaDevice {
313 fn default() -> Self {
314 get_all_devices()
315 .into_iter()
316 .next()
317 .unwrap_or_else(|| panic!("No RDMA devices found"))
318 }
319}
320
/// Describes a single port of an RDMA device.
///
/// State, physical state, link layer and GID are stored as already-formatted
/// strings; the remaining attributes are kept as numeric values.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct RdmaPort {
    /// `port_num` - The physical port number on the device.
    port_num: u8,
    /// `state` - The current state of the port.
    state: String,
    /// `physical_state` - The physical state of the port.
    physical_state: String,
    /// `base_lid` - Base Local Identifier for the port.
    base_lid: u16,
    /// `lmc` - LID Mask Control.
    lmc: u8,
    /// `sm_lid` - Subnet Manager Local Identifier.
    sm_lid: u16,
    /// `capability_mask` - Capability mask of the port.
    capability_mask: u32,
    /// `link_layer` - The link layer type (e.g., InfiniBand, Ethernet).
    link_layer: String,
    /// `gid` - Global Identifier for the port.
    gid: String,
    /// `gid_tbl_len` - Length of the GID table.
    gid_tbl_len: i32,
}
344
345impl fmt::Display for RdmaDevice {
346 fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
347 writeln!(f, "{}", self.name)?;
348 writeln!(f, "\tNumber of ports: {}", self.ports.len())?;
349 writeln!(f, "\tFirmware version: {}", self.fw_ver)?;
350 writeln!(f, "\tHardware version: {}", self.hw_ver)?;
351 writeln!(f, "\tNode GUID: 0x{:016x}", self.node_guid)?;
352 writeln!(f, "\tVendor ID: 0x{:x}", self.vendor_id)?;
353 writeln!(f, "\tVendor part ID: {}", self.vendor_part_id)?;
354 writeln!(f, "\tMax QPs: {}", self.max_qp)?;
355 writeln!(f, "\tMax CQs: {}", self.max_cq)?;
356 writeln!(f, "\tMax MRs: {}", self.max_mr)?;
357 writeln!(f, "\tMax PDs: {}", self.max_pd)?;
358 writeln!(f, "\tMax QP WRs: {}", self.max_qp_wr)?;
359 writeln!(f, "\tMax SGE: {}", self.max_sge)?;
360
361 for port in &self.ports {
362 write!(f, "{}", port)?;
363 }
364
365 Ok(())
366 }
367}
368
369impl fmt::Display for RdmaPort {
370 fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
371 writeln!(f, "\tPort {}:", self.port_num)?;
372 writeln!(f, "\t\tState: {}", self.state)?;
373 writeln!(f, "\t\tPhysical state: {}", self.physical_state)?;
374 writeln!(f, "\t\tBase lid: {}", self.base_lid)?;
375 writeln!(f, "\t\tLMC: {}", self.lmc)?;
376 writeln!(f, "\t\tSM lid: {}", self.sm_lid)?;
377 writeln!(f, "\t\tCapability mask: 0x{:08x}", self.capability_mask)?;
378 writeln!(f, "\t\tLink layer: {}", self.link_layer)?;
379 writeln!(f, "\t\tGID: {}", self.gid)?;
380 writeln!(f, "\t\tGID table length: {}", self.gid_tbl_len)?;
381 Ok(())
382 }
383}
384
385/// Converts the given port state to a human-readable string.
386///
387/// # Arguments
388///
389/// * `state` - The port state as defined by `ffi::ibv_port_state::Type`.
390///
391/// # Returns
392///
393/// A string representation of the port state.
394pub fn get_port_state_str(state: rdmaxcel_sys::ibv_port_state::Type) -> String {
395 // SAFETY: We are calling a C function that returns a C string.
396 unsafe {
397 let c_str = rdmaxcel_sys::ibv_port_state_str(state);
398 if c_str.is_null() {
399 return "Unknown".to_string();
400 }
401 CStr::from_ptr(c_str).to_string_lossy().into_owned()
402 }
403}
404
/// Converts the given physical state to a human-readable string.
///
/// # Arguments
///
/// * `phys_state` - The physical state as a `u8`.
///
/// # Returns
///
/// A string representation of the physical state. Values `1` through `7` have
/// dedicated names; every other value (including `0`) yields
/// `"No state change"`.
pub fn get_port_phy_state_str(phys_state: u8) -> String {
    let label = match phys_state {
        1 => "Sleep",
        2 => "Polling",
        3 => "Disabled",
        4 => "PortConfigurationTraining",
        5 => "LinkUp",
        6 => "LinkErrorRecovery",
        7 => "PhyTest",
        _ => "No state change",
    };
    label.to_string()
}
426
/// Converts the given link layer type to a human-readable string.
///
/// # Arguments
///
/// * `link_layer` - The link layer type as a `u8`.
///
/// # Returns
///
/// `"InfiniBand"` for `1`, `"Ethernet"` for `2`, and `"Unknown"` for any
/// other value.
pub fn get_link_layer_str(link_layer: u8) -> String {
    let label = match link_layer {
        1 => "InfiniBand",
        2 => "Ethernet",
        _ => "Unknown",
    };
    String::from(label)
}
443
/// Formats a GID (Global Identifier) into a human-readable string.
///
/// # Arguments
///
/// * `gid` - A reference to a 16-byte array representing the GID.
///
/// # Returns
///
/// Eight colon-separated groups, each holding two consecutive bytes rendered
/// as four lowercase hexadecimal digits
/// (e.g. `1234:5678:9abc:def0:1122:3344:5566:7788`).
pub fn format_gid(gid: &[u8; 16]) -> String {
    gid.chunks_exact(2)
        .map(|pair| format!("{:02x}{:02x}", pair[0], pair[1]))
        .collect::<Vec<_>>()
        .join(":")
}
474
475/// Retrieves information about all available RDMA devices in the system.
476///
477/// This function queries the system for all available RDMA devices and returns
478/// detailed information about each device, including its capabilities, ports,
479/// and attributes.
480///
481/// # Returns
482///
483/// A vector of `RdmaDevice` structures, each representing an RDMA device in the system.
484/// Returns an empty vector if no devices are found or if there was an error querying
485/// the devices.
486pub fn get_all_devices() -> Vec<RdmaDevice> {
487 let mut devices = Vec::new();
488
489 // SAFETY: We are calling several C functions from libibverbs.
490 unsafe {
491 let mut num_devices = 0;
492 let device_list = rdmaxcel_sys::ibv_get_device_list(&mut num_devices);
493 if device_list.is_null() || num_devices == 0 {
494 return devices;
495 }
496
497 for i in 0..num_devices {
498 let device = *device_list.add(i as usize);
499 if device.is_null() {
500 continue;
501 }
502
503 let context = rdmaxcel_sys::ibv_open_device(device);
504 if context.is_null() {
505 continue;
506 }
507
508 let device_name = CStr::from_ptr(rdmaxcel_sys::ibv_get_device_name(device))
509 .to_string_lossy()
510 .into_owned();
511
512 let mut device_attr = rdmaxcel_sys::ibv_device_attr::default();
513 if rdmaxcel_sys::ibv_query_device(context, &mut device_attr) != 0 {
514 rdmaxcel_sys::ibv_close_device(context);
515 continue;
516 }
517
518 let fw_ver = CStr::from_ptr(device_attr.fw_ver.as_ptr())
519 .to_string_lossy()
520 .into_owned();
521
522 let mut rdma_device = RdmaDevice {
523 name: device_name,
524 vendor_id: device_attr.vendor_id,
525 vendor_part_id: device_attr.vendor_part_id,
526 hw_ver: device_attr.hw_ver,
527 fw_ver,
528 node_guid: device_attr.node_guid,
529 ports: Vec::new(),
530 max_qp: device_attr.max_qp,
531 max_cq: device_attr.max_cq,
532 max_mr: device_attr.max_mr,
533 max_pd: device_attr.max_pd,
534 max_qp_wr: device_attr.max_qp_wr,
535 max_sge: device_attr.max_sge,
536 };
537
538 for port_num in 1..=device_attr.phys_port_cnt {
539 let mut port_attr = rdmaxcel_sys::ibv_port_attr::default();
540 if rdmaxcel_sys::ibv_query_port(
541 context,
542 port_num,
543 &mut port_attr as *mut rdmaxcel_sys::ibv_port_attr as *mut _,
544 ) != 0
545 {
546 continue;
547 }
548 let state = get_port_state_str(port_attr.state);
549 let physical_state = get_port_phy_state_str(port_attr.phys_state);
550
551 let link_layer = get_link_layer_str(port_attr.link_layer);
552
553 let mut gid = rdmaxcel_sys::ibv_gid::default();
554 let gid_str = if rdmaxcel_sys::ibv_query_gid(context, port_num, 0, &mut gid) == 0 {
555 format_gid(&gid.raw)
556 } else {
557 "N/A".to_string()
558 };
559
560 let rdma_port = RdmaPort {
561 port_num,
562 state,
563 physical_state,
564 base_lid: port_attr.lid,
565 lmc: port_attr.lmc,
566 sm_lid: port_attr.sm_lid,
567 capability_mask: port_attr.port_cap_flags,
568 link_layer,
569 gid: gid_str,
570 gid_tbl_len: port_attr.gid_tbl_len,
571 };
572
573 rdma_device.ports.push(rdma_port);
574 }
575
576 devices.push(rdma_device);
577 rdmaxcel_sys::ibv_close_device(context);
578 }
579
580 rdmaxcel_sys::ibv_free_device_list(device_list);
581 }
582
583 devices
584}
585
586/// Checks if ibverbs devices can be retrieved successfully.
587///
588/// This function attempts to retrieve the list of RDMA devices using the
589/// `ibv_get_device_list` function from the ibverbs library. It returns `true`
590/// if devices are found, and `false` otherwise.
591///
592/// # Returns
593///
594/// `true` if devices are successfully retrieved, `false` otherwise.
595pub fn ibverbs_supported() -> bool {
596 // SAFETY: We are calling a C function from libibverbs.
597 unsafe {
598 let mut num_devices = 0;
599 let device_list = rdmaxcel_sys::ibv_get_device_list(&mut num_devices);
600 if !device_list.is_null() {
601 rdmaxcel_sys::ibv_free_device_list(device_list);
602 }
603 num_devices > 0
604 }
605}
606
/// Represents a view of a memory region that can be registered with an RDMA device.
///
/// An `RdmaMemoryRegionView` records the address of a memory buffer and its size,
/// together with an identifier and a pair of keys. It stores the address as a plain
/// `usize`, not as a pointer or reference, so it does not own or borrow the memory.
///
/// # Safety
///
/// The memory starting at `addr` must remain valid for the lifetime of the `RdmaMemoryRegionView`.
/// The caller is responsible for ensuring that the memory is not freed, moved or overwritten while
/// RDMA operations are in progress.
#[derive(Debug, PartialEq, Eq, std::hash::Hash, Serialize, Deserialize, Clone)]
pub struct RdmaMemoryRegionView {
    /// `id` - Identifier of this view (assigned by the caller of `new`).
    pub id: u32,
    /// `addr` - Start address of the memory region, stored as an integer.
    pub addr: usize,
    /// `size` - Size of the memory region (presumably in bytes — confirm with callers).
    pub size: usize,
    /// `lkey` - Presumably the local access key of the registered region — confirm with callers.
    pub lkey: u32,
    /// `rkey` - Presumably the remote access key of the registered region — confirm with callers.
    pub rkey: u32,
}
626
// SAFETY: RdmaMemoryRegionView can be safely sent between threads because it only
// contains address and size information without any thread-local state (all of
// its fields are plain integers). However,
// this DOES NOT provide any protection against data races in the underlying memory.
// If one thread initiates an RDMA operation while another thread modifies the same
// memory region, undefined behavior will occur. The caller is responsible for proper
// synchronization of access to the underlying memory.
unsafe impl Send for RdmaMemoryRegionView {}

// SAFETY: RdmaMemoryRegionView is safe for concurrent access by multiple threads
// as it only provides a view into memory without modifying its own state. However,
// it provides NO PROTECTION against concurrent access to the underlying memory region.
// The caller must ensure proper synchronization when:
// 1. Initiating RDMA operations while local code reads/writes the same memory
// 2. Performing multiple overlapping RDMA operations on the same memory region
// 3. Freeing or reallocating memory that has in-flight RDMA operations
unsafe impl Sync for RdmaMemoryRegionView {}
643
644impl RdmaMemoryRegionView {
645 /// Creates a new `RdmaMemoryRegionView` with the given address and size.
646 pub fn new(id: u32, addr: usize, size: usize, lkey: u32, rkey: u32) -> Self {
647 Self {
648 id,
649 addr,
650 size,
651 lkey,
652 rkey,
653 }
654 }
655}
656
/// Enum representing the common RDMA operations.
///
/// This provides a more ergonomic interface to the underlying ibv_wr_opcode types.
/// RDMA operations allow for direct memory access between two machines without
/// involving the CPU of the target machine.
///
/// # Variants
///
/// * `Write` - Represents an RDMA write operation where data is written from the local
///   memory to a remote memory region.
/// * `WriteWithImm` - Represents an RDMA write that is posted with the
///   `IBV_WR_RDMA_WRITE_WITH_IMM` opcode.
/// * `Read` - Represents an RDMA read operation where data is read from a remote memory
///   region into the local memory.
/// * `Recv` - Represents a receive operation. It has no corresponding work-request
///   opcode: converting it into `ibv_wr_opcode` panics.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum RdmaOperation {
    /// RDMA write operations
    Write,
    /// RDMA write operation carrying immediate data
    WriteWithImm,
    /// RDMA read operation
    Read,
    /// RDMA recv operation
    Recv,
}
679
680impl From<RdmaOperation> for rdmaxcel_sys::ibv_wr_opcode::Type {
681 fn from(op: RdmaOperation) -> Self {
682 match op {
683 RdmaOperation::Write => rdmaxcel_sys::ibv_wr_opcode::IBV_WR_RDMA_WRITE,
684 RdmaOperation::WriteWithImm => rdmaxcel_sys::ibv_wr_opcode::IBV_WR_RDMA_WRITE_WITH_IMM,
685 RdmaOperation::Read => rdmaxcel_sys::ibv_wr_opcode::IBV_WR_RDMA_READ,
686 RdmaOperation::Recv => panic!("Invalid wr opcode"),
687 }
688 }
689}
690
691impl From<rdmaxcel_sys::ibv_wc_opcode::Type> for RdmaOperation {
692 fn from(op: rdmaxcel_sys::ibv_wc_opcode::Type) -> Self {
693 match op {
694 rdmaxcel_sys::ibv_wc_opcode::IBV_WC_RDMA_WRITE => RdmaOperation::Write,
695 rdmaxcel_sys::ibv_wc_opcode::IBV_WC_RDMA_READ => RdmaOperation::Read,
696 _ => panic!("Unsupported operation type"),
697 }
698 }
699}
700
/// Contains information needed to establish an RDMA queue pair with a remote endpoint.
///
/// `RdmaQpInfo` encapsulates the information exchanged to establish a queue pair
/// with a remote RDMA device: the queue pair number, LID (Local Identifier),
/// an optional GID (Global Identifier), and the packet sequence number.
#[derive(Default, Named, Clone, serde::Serialize, serde::Deserialize)]
pub struct RdmaQpInfo {
    /// `qp_num` - Queue Pair Number, uniquely identifies a queue pair on the remote device
    pub qp_num: u32,
    /// `lid` - Local Identifier, used for addressing in InfiniBand subnet
    pub lid: u16,
    /// `gid` - Global Identifier, used for routing across subnets (similar to IPv6 address);
    /// `None` when no GID is available
    pub gid: Option<Gid>,
    /// `psn` - Packet Sequence Number, used for ordering packets
    pub psn: u32,
}
717
718impl std::fmt::Debug for RdmaQpInfo {
719 fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
720 write!(
721 f,
722 "RdmaQpInfo {{ qp_num: {}, lid: {}, gid: {:?}, psn: 0x{:x} }}",
723 self.qp_num, self.lid, self.gid, self.psn
724 )
725 }
726}
727
/// Wrapper around ibv_wc (ibverbs work completion).
///
/// This exposes only the public fields of rdmaxcel_sys::ibv_wc, allowing us to more easily
/// interact with it from Rust. Work completions are used to track the status of
/// RDMA operations and are generated when an operation completes.
///
/// Values are captured once, when converting from `rdmaxcel_sys::ibv_wc`; the
/// wrapper keeps no reference to the original completion entry.
#[derive(Debug, Named, Clone, serde::Serialize, serde::Deserialize)]
pub struct IbvWc {
    /// `wr_id` - Work Request ID, used to identify the completed operation
    wr_id: u64,
    /// `len` - Length of the data transferred
    len: usize,
    /// `valid` - Whether the work completion is valid
    valid: bool,
    /// `error` - Error information if the operation failed
    /// (presumably the completion status plus a vendor error code — confirm
    /// against `rdmaxcel_sys::ibv_wc::error`)
    error: Option<(rdmaxcel_sys::ibv_wc_status::Type, u32)>,
    /// `opcode` - Type of operation that completed (read, write, etc.)
    opcode: rdmaxcel_sys::ibv_wc_opcode::Type,
    /// `bytes` - Immediate data (if any), taken from `ibv_wc::imm_data()`
    bytes: Option<u32>,
    /// `qp_num` - Queue Pair Number
    qp_num: u32,
    /// `src_qp` - Source Queue Pair Number
    src_qp: u32,
    /// `pkey_index` - Partition Key Index
    pkey_index: u16,
    /// `slid` - Source LID
    slid: u16,
    /// `sl` - Service Level
    sl: u8,
    /// `dlid_path_bits` - Destination LID Path Bits
    dlid_path_bits: u8,
}
760
761impl From<rdmaxcel_sys::ibv_wc> for IbvWc {
762 fn from(wc: rdmaxcel_sys::ibv_wc) -> Self {
763 IbvWc {
764 wr_id: wc.wr_id(),
765 len: wc.len(),
766 valid: wc.is_valid(),
767 error: wc.error(),
768 opcode: wc.opcode(),
769 bytes: wc.imm_data(),
770 qp_num: wc.qp_num,
771 src_qp: wc.src_qp,
772 pkey_index: wc.pkey_index,
773 slid: wc.slid,
774 sl: wc.sl,
775 dlid_path_bits: wc.dlid_path_bits,
776 }
777 }
778}
779
780impl IbvWc {
781 /// Returns the Work Request ID associated with this work completion.
782 ///
783 /// The Work Request ID is used to identify the specific operation that completed.
784 /// It is set by the application when posting the work request and is returned
785 /// unchanged in the work completion.
786 pub fn wr_id(&self) -> u64 {
787 self.wr_id
788 }
789
790 /// Returns whether this work completion is valid.
791 ///
792 /// A valid work completion indicates that the operation completed successfully.
793 /// If false, the `error` field may contain additional information about the failure.
794 pub fn is_valid(&self) -> bool {
795 self.valid
796 }
797}
798
#[cfg(test)]
mod tests {
    use super::*;

    /// Enumerated devices expose a name and at least one port.
    #[test]
    fn test_get_all_devices() {
        // Skip test if RDMA devices are not available
        let devices = get_all_devices();
        if devices.is_empty() {
            println!("Skipping test: RDMA devices not available");
            return;
        }
        // Basic validation of first device
        let device = &devices[0];
        assert!(!device.name().is_empty(), "device name should not be empty");
        assert!(
            !device.ports().is_empty(),
            "device should have at least one port"
        );
    }

    /// `RdmaDevice::first_available` yields a device whose accessors mirror its fields.
    #[test]
    fn test_first_available() {
        // Exercise the function under test; skip if RDMA is not available.
        let dev = match RdmaDevice::first_available() {
            Some(dev) => dev,
            None => {
                println!("Skipping test: RDMA devices not available");
                return;
            }
        };

        // Verify getters return expected values
        assert_eq!(dev.name(), &dev.name);
        assert_eq!(dev.vendor_id(), dev.vendor_id);
        assert_eq!(dev.vendor_part_id(), dev.vendor_part_id);
        assert_eq!(dev.hw_ver(), dev.hw_ver);
        assert_eq!(dev.fw_ver(), &dev.fw_ver);
        assert_eq!(dev.node_guid(), dev.node_guid);
        assert_eq!(dev.max_qp(), dev.max_qp);
        assert_eq!(dev.max_cq(), dev.max_cq);
        assert_eq!(dev.max_mr(), dev.max_mr);
        assert_eq!(dev.max_pd(), dev.max_pd);
        assert_eq!(dev.max_qp_wr(), dev.max_qp_wr);
        assert_eq!(dev.max_sge(), dev.max_sge);
    }

    /// The device report mentions the device name and firmware version.
    #[test]
    fn test_device_display() {
        if let Some(device) = RdmaDevice::first_available() {
            let display_output = format!("{}", device);
            assert!(
                display_output.contains(&device.name),
                "display should include device name"
            );
            assert!(
                display_output.contains(&device.fw_ver),
                "display should include firmware version"
            );
        }
    }

    /// The port report mentions the port state and link layer.
    #[test]
    fn test_port_display() {
        if let Some(device) = RdmaDevice::first_available() {
            if let Some(port) = device.ports().first() {
                let display_output = format!("{}", port);
                assert!(
                    display_output.contains(&port.state),
                    "display should include port state"
                );
                assert!(
                    display_output.contains(&port.link_layer),
                    "display should include link layer"
                );
            }
        }
    }

    /// Every operation with a work-request opcode converts to it, and the two
    /// supported completion opcodes convert back.
    #[test]
    fn test_rdma_operation_conversion() {
        assert_eq!(
            rdmaxcel_sys::ibv_wr_opcode::IBV_WR_RDMA_WRITE,
            rdmaxcel_sys::ibv_wr_opcode::Type::from(RdmaOperation::Write)
        );
        assert_eq!(
            rdmaxcel_sys::ibv_wr_opcode::IBV_WR_RDMA_WRITE_WITH_IMM,
            rdmaxcel_sys::ibv_wr_opcode::Type::from(RdmaOperation::WriteWithImm)
        );
        assert_eq!(
            rdmaxcel_sys::ibv_wr_opcode::IBV_WR_RDMA_READ,
            rdmaxcel_sys::ibv_wr_opcode::Type::from(RdmaOperation::Read)
        );

        assert_eq!(
            RdmaOperation::Write,
            RdmaOperation::from(rdmaxcel_sys::ibv_wc_opcode::IBV_WC_RDMA_WRITE)
        );
        assert_eq!(
            RdmaOperation::Read,
            RdmaOperation::from(rdmaxcel_sys::ibv_wc_opcode::IBV_WC_RDMA_READ)
        );
    }

    /// The hand-written `Debug` output lists the fields, with `psn` in hex.
    #[test]
    fn test_rdma_endpoint() {
        let endpoint = RdmaQpInfo {
            qp_num: 42,
            lid: 123,
            gid: None,
            psn: 0x5678,
        };

        let debug_str = format!("{:?}", endpoint);
        assert!(debug_str.contains("qp_num: 42"));
        assert!(debug_str.contains("lid: 123"));
        assert!(debug_str.contains("gid: None"));
        assert!(debug_str.contains("psn: 0x5678"));
    }

    /// A raw completion with a success status converts into a valid `IbvWc`.
    #[test]
    fn test_ibv_wc() {
        let mut wc = rdmaxcel_sys::ibv_wc::default();

        // SAFETY: modifies private fields through pointer manipulation.
        // NOTE(review): this relies on `wr_id` living at byte offset 0 and the
        // status at byte offset 8 of `ibv_wc` — confirm against the bindings.
        unsafe {
            // Cast to pointer and modify the fields directly
            let wc_ptr = &mut wc as *mut rdmaxcel_sys::ibv_wc as *mut u8;

            // Set wr_id (at offset 0, u64)
            *(wc_ptr as *mut u64) = 42;

            // Set status to SUCCESS (at offset 8, written as a 32-bit value)
            *(wc_ptr.add(8) as *mut i32) = rdmaxcel_sys::ibv_wc_status::IBV_WC_SUCCESS as i32;
        }
        let ibv_wc = IbvWc::from(wc);
        assert_eq!(ibv_wc.wr_id(), 42);
        assert!(ibv_wc.is_valid());
    }

    /// GID bytes are rendered as eight colon-separated 16-bit hex groups.
    #[test]
    fn test_format_gid() {
        let gid = [
            0x12, 0x34, 0x56, 0x78, 0x9a, 0xbc, 0xde, 0xf0, 0x11, 0x22, 0x33, 0x44, 0x55, 0x66,
            0x77, 0x88,
        ];

        let formatted = format_gid(&gid);
        assert_eq!(formatted, "1234:5678:9abc:def0:1122:3344:5566:7788");
    }
}
945}