rdmaxcel_sys/lib.rs
1/*
2 * Portions Copyright (c) Meta Platforms, Inc. and affiliates.
3 * All rights reserved.
4 *
5 * This source code is licensed under the BSD-style license found in the
6 * LICENSE file in the root directory of this source tree.
7 */
8
9// sections of code adapted from https://github.com/jonhoo/rust-ibverbs
10// Copyright (c) 2016 Jon Gjengset under MIT License (MIT)
11
12mod inner {
13 #![allow(non_upper_case_globals)]
14 #![allow(non_camel_case_types)]
15 #![allow(non_snake_case)]
16 #![allow(unused_attributes)]
17 #[cfg(not(cargo))]
18 use crate::ibv_wc_flags;
19 #[cfg(not(cargo))]
20 use crate::ibv_wc_opcode;
21 #[cfg(not(cargo))]
22 use crate::ibv_wc_status;
23 #[cfg(cargo)]
24 include!(concat!(env!("OUT_DIR"), "/bindings.rs"));
25
26 // ROCm/HIP compatibility layer
27 //
28 // In ROCm builds, bindgen generates HIP types and constants instead of CUDA equivalents.
29 // These type aliases and const aliases allow Rust code to use CUDA names consistently
30 // across both CUDA and ROCm backends, avoiding the need for conditional compilation
31 // throughout the codebase.
32 #[cfg(use_rocm)]
33 pub use self::rocm_compat::*;
34
35 #[cfg(use_rocm)]
36 mod rocm_compat {
37 use super::*;
38
39 // Basic types
40 pub type CUdevice = hipDevice_t;
41 pub type CUdeviceptr = hipDeviceptr_t;
42 pub type CUcontext = hipCtx_t;
43
44 // Memory management types
45 pub type CUmemGenericAllocationHandle = hipMemGenericAllocationHandle_t;
46 pub type CUmemAllocationProp = hipMemAllocationProp;
47 pub type CUmemAccessDesc = hipMemAccessDesc;
48
49 // Error codes
50 pub const CUDA_SUCCESS: hipError_t = hipSuccess;
51
52 // Pointer attributes
53 pub const CU_POINTER_ATTRIBUTE_MEMORY_TYPE: hipPointer_attribute =
54 HIP_POINTER_ATTRIBUTE_MEMORY_TYPE;
55 pub const CU_POINTER_ATTRIBUTE_DEVICE_ORDINAL: hipPointer_attribute =
56 HIP_POINTER_ATTRIBUTE_DEVICE_ORDINAL;
57
58 // Memory types
59 pub const CU_MEMORYTYPE_DEVICE: u32 = 2; // hipMemoryTypeDevice = 2
60
61 // Memory handle types
62 pub const CU_MEM_RANGE_HANDLE_TYPE_DMA_BUF_FD: hipMemRangeHandleType =
63 hipMemRangeHandleTypeDmaBufFd;
64 pub const CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR: hipMemAllocationHandleType =
65 hipMemHandleTypePosixFileDescriptor;
66
67 // Memory allocation flags
68 pub const CU_MEM_ALLOCATION_TYPE_PINNED: hipMemAllocationType = hipMemAllocationTypePinned;
69 pub const CU_MEM_LOCATION_TYPE_DEVICE: hipMemLocationType = hipMemLocationTypeDevice;
70 pub const CU_MEM_ALLOC_GRANULARITY_MINIMUM: hipMemAllocationGranularity_flags =
71 hipMemAllocationGranularityMinimum;
72 pub const CU_MEM_ACCESS_FLAGS_PROT_READWRITE: hipMemAccessFlags =
73 hipMemAccessFlagsProtReadWrite;
74 }
75
76 #[repr(C, packed(1))]
77 #[derive(Debug, Default, Clone, Copy)]
78 pub struct mlx5_wqe_ctrl_seg {
79 pub opmod_idx_opcode: u32,
80 pub qpn_ds: u32,
81 pub signature: u8,
82 pub dci_stream_channel_id: u16,
83 pub fm_ce_se: u8,
84 pub imm: u32,
85 }
86
87 #[repr(C)]
88 #[derive(Debug, Copy, Clone)]
89 pub struct ibv_wc {
90 wr_id: u64,
91 status: ibv_wc_status::Type,
92 opcode: ibv_wc_opcode::Type,
93 vendor_err: u32,
94 byte_len: u32,
95
96 /// Immediate data OR the local RKey that was invalidated depending on `wc_flags`.
97 /// See `man ibv_poll_cq` for details.
98 pub imm_data: u32,
99 /// Local QP number of completed WR.
100 ///
101 /// Relevant for Receive Work Completions that are associated with an SRQ.
102 pub qp_num: u32,
103 /// Source QP number (remote QP number) of completed WR.
104 ///
105 /// Relevant for Receive Work Completions of a UD QP.
106 pub src_qp: u32,
107 /// Flags of the Work Completion. It is either 0 or the bitwise OR of one or more of the
108 /// following flags:
109 ///
110 /// - `IBV_WC_GRH`: Indicator that GRH is present for a Receive Work Completions of a UD QP.
111 /// If this bit is set, the first 40 bytes of the buffered that were referred to in the
112 /// Receive request will contain the GRH of the incoming message. If this bit is cleared,
113 /// the content of those first 40 bytes is undefined
114 /// - `IBV_WC_WITH_IMM`: Indicator that imm_data is valid. Relevant for Receive Work
115 /// Completions
116 pub wc_flags: ibv_wc_flags,
117 /// P_Key index (valid only for GSI QPs).
118 pub pkey_index: u16,
119 /// Source LID (the base LID that this message was sent from).
120 ///
121 /// Relevant for Receive Work Completions of a UD QP.
122 pub slid: u16,
123 /// Service Level (the SL LID that this message was sent with).
124 ///
125 /// Relevant for Receive Work Completions of a UD QP.
126 pub sl: u8,
127 /// Destination LID path bits.
128 ///
129 /// Relevant for Receive Work Completions of a UD QP (not applicable for multicast messages).
130 pub dlid_path_bits: u8,
131 }
132
133 #[allow(clippy::len_without_is_empty)]
134 impl ibv_wc {
135 /// Returns the 64 bit value that was associated with the corresponding Work Request.
136 pub fn wr_id(&self) -> u64 {
137 self.wr_id
138 }
139
140 /// Returns the number of bytes transferred.
141 ///
142 /// Relevant if the Receive Queue for incoming Send or RDMA Write with immediate operations.
143 /// This value doesn't include the length of the immediate data, if such exists. Relevant in
144 /// the Send Queue for RDMA Read and Atomic operations.
145 ///
146 /// For the Receive Queue of a UD QP that is not associated with an SRQ or for an SRQ that is
147 /// associated with a UD QP this value equals to the payload of the message plus the 40 bytes
148 /// reserved for the GRH. The number of bytes transferred is the payload of the message plus
149 /// the 40 bytes reserved for the GRH, whether or not the GRH is present
150 pub fn len(&self) -> usize {
151 self.byte_len as usize
152 }
153
154 /// Check if this work requested completed successfully.
155 ///
156 /// A successful work completion (`IBV_WC_SUCCESS`) means that the corresponding Work Request
157 /// (and all of the unsignaled Work Requests that were posted previous to it) ended, and the
158 /// memory buffers that this Work Request refers to are ready to be (re)used.
159 pub fn is_valid(&self) -> bool {
160 self.status == ibv_wc_status::IBV_WC_SUCCESS
161 }
162
163 /// Returns the work completion status and vendor error syndrome (`vendor_err`) if the work
164 /// request did not completed successfully.
165 ///
166 /// Possible statuses include:
167 ///
168 /// - `IBV_WC_LOC_LEN_ERR`: Local Length Error: this happens if a Work Request that was posted
169 /// in a local Send Queue contains a message that is greater than the maximum message size
170 /// that is supported by the RDMA device port that should send the message or an Atomic
171 /// operation which its size is different than 8 bytes was sent. This also may happen if a
172 /// Work Request that was posted in a local Receive Queue isn't big enough for holding the
173 /// incoming message or if the incoming message size if greater the maximum message size
174 /// supported by the RDMA device port that received the message.
175 /// - `IBV_WC_LOC_QP_OP_ERR`: Local QP Operation Error: an internal QP consistency error was
176 /// detected while processing this Work Request: this happens if a Work Request that was
177 /// posted in a local Send Queue of a UD QP contains an Address Handle that is associated
178 /// with a Protection Domain to a QP which is associated with a different Protection Domain
179 /// or an opcode which isn't supported by the transport type of the QP isn't supported (for
180 /// example:
181 /// RDMA Write over a UD QP).
182 /// - `IBV_WC_LOC_EEC_OP_ERR`: Local EE Context Operation Error: an internal EE Context
183 /// consistency error was detected while processing this Work Request (unused, since its
184 /// relevant only to RD QPs or EE Context, which aren’t supported).
185 /// - `IBV_WC_LOC_PROT_ERR`: Local Protection Error: the locally posted Work Request’s buffers
186 /// in the scatter/gather list does not reference a Memory Region that is valid for the
187 /// requested operation.
188 /// - `IBV_WC_WR_FLUSH_ERR`: Work Request Flushed Error: A Work Request was in process or
189 /// outstanding when the QP transitioned into the Error State.
190 /// - `IBV_WC_MW_BIND_ERR`: Memory Window Binding Error: A failure happened when tried to bind
191 /// a MW to a MR.
192 /// - `IBV_WC_BAD_RESP_ERR`: Bad Response Error: an unexpected transport layer opcode was
193 /// returned by the responder. Relevant for RC QPs.
194 /// - `IBV_WC_LOC_ACCESS_ERR`: Local Access Error: a protection error occurred on a local data
195 /// buffer during the processing of a RDMA Write with Immediate operation sent from the
196 /// remote node. Relevant for RC QPs.
197 /// - `IBV_WC_REM_INV_REQ_ERR`: Remote Invalid Request Error: The responder detected an
198 /// invalid message on the channel. Possible causes include the operation is not supported
199 /// by this receive queue (qp_access_flags in remote QP wasn't configured to support this
200 /// operation), insufficient buffering to receive a new RDMA or Atomic Operation request, or
201 /// the length specified in a RDMA request is greater than 2^{31} bytes. Relevant for RC
202 /// QPs.
203 /// - `IBV_WC_REM_ACCESS_ERR`: Remote Access Error: a protection error occurred on a remote
204 /// data buffer to be read by an RDMA Read, written by an RDMA Write or accessed by an
205 /// atomic operation. This error is reported only on RDMA operations or atomic operations.
206 /// Relevant for RC QPs.
207 /// - `IBV_WC_REM_OP_ERR`: Remote Operation Error: the operation could not be completed
208 /// successfully by the responder. Possible causes include a responder QP related error that
209 /// prevented the responder from completing the request or a malformed WQE on the Receive
210 /// Queue. Relevant for RC QPs.
211 /// - `IBV_WC_RETRY_EXC_ERR`: Transport Retry Counter Exceeded: The local transport timeout
212 /// retry counter was exceeded while trying to send this message. This means that the remote
213 /// side didn't send any Ack or Nack. If this happens when sending the first message,
214 /// usually this mean that the connection attributes are wrong or the remote side isn't in a
215 /// state that it can respond to messages. If this happens after sending the first message,
216 /// usually it means that the remote QP isn't available anymore. Relevant for RC QPs.
217 /// - `IBV_WC_RNR_RETRY_EXC_ERR`: RNR Retry Counter Exceeded: The RNR NAK retry count was
218 /// exceeded. This usually means that the remote side didn't post any WR to its Receive
219 /// Queue. Relevant for RC QPs.
220 /// - `IBV_WC_LOC_RDD_VIOL_ERR`: Local RDD Violation Error: The RDD associated with the QP
221 /// does not match the RDD associated with the EE Context (unused, since its relevant only
222 /// to RD QPs or EE Context, which aren't supported).
223 /// - `IBV_WC_REM_INV_RD_REQ_ERR`: Remote Invalid RD Request: The responder detected an
224 /// invalid incoming RD message. Causes include a Q_Key or RDD violation (unused, since its
225 /// relevant only to RD QPs or EE Context, which aren't supported)
226 /// - `IBV_WC_REM_ABORT_ERR`: Remote Aborted Error: For UD or UC QPs associated with a SRQ,
227 /// the responder aborted the operation.
228 /// - `IBV_WC_INV_EECN_ERR`: Invalid EE Context Number: An invalid EE Context number was
229 /// detected (unused, since its relevant only to RD QPs or EE Context, which aren't
230 /// supported).
231 /// - `IBV_WC_INV_EEC_STATE_ERR`: Invalid EE Context State Error: Operation is not legal for
232 /// the specified EE Context state (unused, since its relevant only to RD QPs or EE Context,
233 /// which aren't supported).
234 /// - `IBV_WC_FATAL_ERR`: Fatal Error.
235 /// - `IBV_WC_RESP_TIMEOUT_ERR`: Response Timeout Error.
236 /// - `IBV_WC_GENERAL_ERR`: General Error: other error which isn't one of the above errors.
237 pub fn error(&self) -> Option<(ibv_wc_status::Type, u32)> {
238 match self.status {
239 ibv_wc_status::IBV_WC_SUCCESS => None,
240 status => Some((status, self.vendor_err)),
241 }
242 }
243
244 /// Returns the operation that the corresponding Work Request performed.
245 ///
246 /// This value controls the way that data was sent, the direction of the data flow and the
247 /// valid attributes in the Work Completion.
248 pub fn opcode(&self) -> ibv_wc_opcode::Type {
249 self.opcode
250 }
251
252 /// Returns a 32 bits number, in network order, in an SEND or RDMA WRITE opcodes that is being
253 /// sent along with the payload to the remote side and placed in a Receive Work Completion and
254 /// not in a remote memory buffer
255 ///
256 /// Note that IMM is only returned if `IBV_WC_WITH_IMM` is set in `wc_flags`. If this is not
257 /// the case, no immediate value was provided, and `imm_data` should be interpreted
258 /// differently. See `man ibv_poll_cq` for details.
259 pub fn imm_data(&self) -> Option<u32> {
260 if self.is_valid() && ((self.wc_flags & ibv_wc_flags::IBV_WC_WITH_IMM).0 != 0) {
261 Some(self.imm_data)
262 } else {
263 None
264 }
265 }
266 }
267
268 impl Default for ibv_wc {
269 fn default() -> Self {
270 ibv_wc {
271 wr_id: 0,
272 status: ibv_wc_status::IBV_WC_GENERAL_ERR,
273 opcode: ibv_wc_opcode::IBV_WC_LOCAL_INV,
274 vendor_err: 0,
275 byte_len: 0,
276 imm_data: 0,
277 qp_num: 0,
278 src_qp: 0,
279 wc_flags: ibv_wc_flags(0),
280 pkey_index: 0,
281 slid: 0,
282 sl: 0,
283 dlid_path_bits: 0,
284 }
285 }
286 }
287}
288
289pub use inner::*;
290
/// Segment scanner callback type.
///
/// A Rust-style (`UpperCamelCase`) alias for the bindgen-generated
/// `rdmaxcel_segment_scanner_fn`; both names refer to exactly the same type.
pub type RdmaxcelSegmentScannerFn = rdmaxcel_segment_scanner_fn;
293
294// Additional extern "C" declarations for functions that are also auto-generated by bindgen.
295// These provide a place for doc comments and explicit signatures.
296unsafe extern "C" {
297 pub fn rdmaxcel_error_string(error_code: std::os::raw::c_int) -> *const std::os::raw::c_char;
298
299 /// Get PCI address from a CUDA/HIP device pointer
300 ///
301 /// In CUDA builds, cuda_ptr is CUdeviceptr (u64).
302 /// In ROCm builds, cuda_ptr is CUdeviceptr (aliased to hipDeviceptr_t = void*).
303 pub fn get_cuda_pci_address_from_ptr(
304 cuda_ptr: CUdeviceptr,
305 pci_addr_out: *mut std::os::raw::c_char,
306 pci_addr_size: usize,
307 ) -> std::os::raw::c_int;
308
309 /// Debug: Print comprehensive device attributes
310 pub fn rdmaxcel_print_device_info(context: *mut ibv_context);
311
312 // EFA functions
313
314 /// Check if the device is an EFA device (via efadv_query_device)
315 pub fn rdmaxcel_is_efa_dev(ctx: *mut ibv_context) -> std::os::raw::c_int;
316
317 /// EFA connect: INIT->RTR->RTS + AH creation, stored directly in qp struct
318 pub fn rdmaxcel_efa_connect(
319 qp: *mut rdmaxcel_qp_t,
320 port_num: u8,
321 pkey_index: u16,
322 qkey: u32,
323 psn: u32,
324 gid_index: u8,
325 remote_gid: *const u8,
326 remote_qpn: u32,
327 ) -> std::os::raw::c_int;
328
329 /// EFA post operation with ibv_post_recv fallback
330 /// op_type: 0 = write, 1 = read, 2 = recv, 3 = write_with_imm
331 pub fn rdmaxcel_qp_post_op(
332 qp: *mut rdmaxcel_qp_t,
333 local_addr: *mut std::ffi::c_void,
334 lkey: u32,
335 length: usize,
336 remote_addr: *mut std::ffi::c_void,
337 rkey: u32,
338 wr_id: u64,
339 signaled: std::os::raw::c_int,
340 op_type: std::os::raw::c_int,
341 ) -> std::os::raw::c_int;
342}