// rdmaxcel_sys/lib.rs

/*
 * Portions Copyright (c) Meta Platforms, Inc. and affiliates.
 * All rights reserved.
 *
 * This source code is licensed under the BSD-style license found in the
 * LICENSE file in the root directory of this source tree.
 */

// sections of code adapted from https://github.com/jonhoo/rust-ibverbs
// Copyright (c) 2016 Jon Gjengset under MIT License (MIT)
11
12mod inner {
13    #![allow(non_upper_case_globals)]
14    #![allow(non_camel_case_types)]
15    #![allow(non_snake_case)]
16    #![allow(unused_attributes)]
17    #[cfg(not(cargo))]
18    use crate::ibv_wc_flags;
19    #[cfg(not(cargo))]
20    use crate::ibv_wc_opcode;
21    #[cfg(not(cargo))]
22    use crate::ibv_wc_status;
23    #[cfg(cargo)]
24    include!(concat!(env!("OUT_DIR"), "/bindings.rs"));
25
26    // ROCm/HIP compatibility layer
27    //
28    // In ROCm builds, bindgen generates HIP types and constants instead of CUDA equivalents.
29    // These type aliases and const aliases allow Rust code to use CUDA names consistently
30    // across both CUDA and ROCm backends, avoiding the need for conditional compilation
31    // throughout the codebase.
32    #[cfg(use_rocm)]
33    pub use self::rocm_compat::*;
34
35    #[cfg(use_rocm)]
36    mod rocm_compat {
37        use super::*;
38
39        // Basic types
40        pub type CUdevice = hipDevice_t;
41        pub type CUdeviceptr = hipDeviceptr_t;
42        pub type CUcontext = hipCtx_t;
43
44        // Memory management types
45        pub type CUmemGenericAllocationHandle = hipMemGenericAllocationHandle_t;
46        pub type CUmemAllocationProp = hipMemAllocationProp;
47        pub type CUmemAccessDesc = hipMemAccessDesc;
48
49        // Error codes
50        pub const CUDA_SUCCESS: hipError_t = hipSuccess;
51
52        // Pointer attributes
53        pub const CU_POINTER_ATTRIBUTE_MEMORY_TYPE: hipPointer_attribute =
54            HIP_POINTER_ATTRIBUTE_MEMORY_TYPE;
55        pub const CU_POINTER_ATTRIBUTE_DEVICE_ORDINAL: hipPointer_attribute =
56            HIP_POINTER_ATTRIBUTE_DEVICE_ORDINAL;
57        pub const CU_POINTER_ATTRIBUTE_CONTEXT: hipPointer_attribute =
58            HIP_POINTER_ATTRIBUTE_CONTEXT;
59
60        // Memory types
61        pub const CU_MEMORYTYPE_DEVICE: u32 = 2; // hipMemoryTypeDevice = 2
62
63        // Memory handle types
64        pub const CU_MEM_RANGE_HANDLE_TYPE_DMA_BUF_FD: hipMemRangeHandleType =
65            hipMemRangeHandleTypeDmaBufFd;
66        pub const CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR: hipMemAllocationHandleType =
67            hipMemHandleTypePosixFileDescriptor;
68
69        // Memory allocation flags
70        pub const CU_MEM_ALLOCATION_TYPE_PINNED: hipMemAllocationType = hipMemAllocationTypePinned;
71        pub const CU_MEM_LOCATION_TYPE_DEVICE: hipMemLocationType = hipMemLocationTypeDevice;
72        pub const CU_MEM_ALLOC_GRANULARITY_MINIMUM: hipMemAllocationGranularity_flags =
73            hipMemAllocationGranularityMinimum;
74        pub const CU_MEM_ACCESS_FLAGS_PROT_READWRITE: hipMemAccessFlags =
75            hipMemAccessFlagsProtReadWrite;
76    }
77
78    #[repr(C, packed(1))]
79    #[derive(Debug, Default, Clone, Copy)]
80    pub struct mlx5_wqe_ctrl_seg {
81        pub opmod_idx_opcode: u32,
82        pub qpn_ds: u32,
83        pub signature: u8,
84        pub dci_stream_channel_id: u16,
85        pub fm_ce_se: u8,
86        pub imm: u32,
87    }
88
89    #[repr(C)]
90    #[derive(Debug, Copy, Clone)]
91    pub struct ibv_wc {
92        wr_id: u64,
93        status: ibv_wc_status::Type,
94        opcode: ibv_wc_opcode::Type,
95        vendor_err: u32,
96        byte_len: u32,
97
98        /// Immediate data OR the local RKey that was invalidated depending on `wc_flags`.
99        /// See `man ibv_poll_cq` for details.
100        pub imm_data: u32,
101        /// Local QP number of completed WR.
102        ///
103        /// Relevant for Receive Work Completions that are associated with an SRQ.
104        pub qp_num: u32,
105        /// Source QP number (remote QP number) of completed WR.
106        ///
107        /// Relevant for Receive Work Completions of a UD QP.
108        pub src_qp: u32,
109        /// Flags of the Work Completion. It is either 0 or the bitwise OR of one or more of the
110        /// following flags:
111        ///
112        ///  - `IBV_WC_GRH`: Indicator that GRH is present for a Receive Work Completions of a UD QP.
113        ///    If this bit is set, the first 40 bytes of the buffered that were referred to in the
114        ///    Receive request will contain the GRH of the incoming message. If this bit is cleared,
115        ///    the content of those first 40 bytes is undefined
116        ///  - `IBV_WC_WITH_IMM`: Indicator that imm_data is valid. Relevant for Receive Work
117        ///    Completions
118        pub wc_flags: ibv_wc_flags,
119        /// P_Key index (valid only for GSI QPs).
120        pub pkey_index: u16,
121        /// Source LID (the base LID that this message was sent from).
122        ///
123        /// Relevant for Receive Work Completions of a UD QP.
124        pub slid: u16,
125        /// Service Level (the SL LID that this message was sent with).
126        ///
127        /// Relevant for Receive Work Completions of a UD QP.
128        pub sl: u8,
129        /// Destination LID path bits.
130        ///
131        /// Relevant for Receive Work Completions of a UD QP (not applicable for multicast messages).
132        pub dlid_path_bits: u8,
133    }
134
135    #[allow(clippy::len_without_is_empty)]
136    impl ibv_wc {
137        /// Returns the 64 bit value that was associated with the corresponding Work Request.
138        pub fn wr_id(&self) -> u64 {
139            self.wr_id
140        }
141
142        /// Returns the number of bytes transferred.
143        ///
144        /// Relevant if the Receive Queue for incoming Send or RDMA Write with immediate operations.
145        /// This value doesn't include the length of the immediate data, if such exists. Relevant in
146        /// the Send Queue for RDMA Read and Atomic operations.
147        ///
148        /// For the Receive Queue of a UD QP that is not associated with an SRQ or for an SRQ that is
149        /// associated with a UD QP this value equals to the payload of the message plus the 40 bytes
150        /// reserved for the GRH. The number of bytes transferred is the payload of the message plus
151        /// the 40 bytes reserved for the GRH, whether or not the GRH is present
152        pub fn len(&self) -> usize {
153            self.byte_len as usize
154        }
155
156        /// Check if this work requested completed successfully.
157        ///
158        /// A successful work completion (`IBV_WC_SUCCESS`) means that the corresponding Work Request
159        /// (and all of the unsignaled Work Requests that were posted previous to it) ended, and the
160        /// memory buffers that this Work Request refers to are ready to be (re)used.
161        pub fn is_valid(&self) -> bool {
162            self.status == ibv_wc_status::IBV_WC_SUCCESS
163        }
164
165        /// Returns the work completion status and vendor error syndrome (`vendor_err`) if the work
166        /// request did not completed successfully.
167        ///
168        /// Possible statuses include:
169        ///
170        ///  - `IBV_WC_LOC_LEN_ERR`: Local Length Error: this happens if a Work Request that was posted
171        ///    in a local Send Queue contains a message that is greater than the maximum message size
172        ///    that is supported by the RDMA device port that should send the message or an Atomic
173        ///    operation which its size is different than 8 bytes was sent. This also may happen if a
174        ///    Work Request that was posted in a local Receive Queue isn't big enough for holding the
175        ///    incoming message or if the incoming message size if greater the maximum message size
176        ///    supported by the RDMA device port that received the message.
177        ///  - `IBV_WC_LOC_QP_OP_ERR`: Local QP Operation Error: an internal QP consistency error was
178        ///    detected while processing this Work Request: this happens if a Work Request that was
179        ///    posted in a local Send Queue of a UD QP contains an Address Handle that is associated
180        ///    with a Protection Domain to a QP which is associated with a different Protection Domain
181        ///    or an opcode which isn't supported by the transport type of the QP isn't supported (for
182        ///    example:
183        ///    RDMA Write over a UD QP).
184        ///  - `IBV_WC_LOC_EEC_OP_ERR`: Local EE Context Operation Error: an internal EE Context
185        ///    consistency error was detected while processing this Work Request (unused, since its
186        ///    relevant only to RD QPs or EE Context, which aren’t supported).
187        ///  - `IBV_WC_LOC_PROT_ERR`: Local Protection Error: the locally posted Work Request’s buffers
188        ///    in the scatter/gather list does not reference a Memory Region that is valid for the
189        ///    requested operation.
190        ///  - `IBV_WC_WR_FLUSH_ERR`: Work Request Flushed Error: A Work Request was in process or
191        ///    outstanding when the QP transitioned into the Error State.
192        ///  - `IBV_WC_MW_BIND_ERR`: Memory Window Binding Error: A failure happened when tried to bind
193        ///    a MW to a MR.
194        ///  - `IBV_WC_BAD_RESP_ERR`: Bad Response Error: an unexpected transport layer opcode was
195        ///    returned by the responder. Relevant for RC QPs.
196        ///  - `IBV_WC_LOC_ACCESS_ERR`: Local Access Error: a protection error occurred on a local data
197        ///    buffer during the processing of a RDMA Write with Immediate operation sent from the
198        ///    remote node. Relevant for RC QPs.
199        ///  - `IBV_WC_REM_INV_REQ_ERR`: Remote Invalid Request Error: The responder detected an
200        ///    invalid message on the channel. Possible causes include the operation is not supported
201        ///    by this receive queue (qp_access_flags in remote QP wasn't configured to support this
202        ///    operation), insufficient buffering to receive a new RDMA or Atomic Operation request, or
203        ///    the length specified in a RDMA request is greater than 2^{31} bytes. Relevant for RC
204        ///    QPs.
205        ///  - `IBV_WC_REM_ACCESS_ERR`: Remote Access Error: a protection error occurred on a remote
206        ///    data buffer to be read by an RDMA Read, written by an RDMA Write or accessed by an
207        ///    atomic operation. This error is reported only on RDMA operations or atomic operations.
208        ///    Relevant for RC QPs.
209        ///  - `IBV_WC_REM_OP_ERR`: Remote Operation Error: the operation could not be completed
210        ///    successfully by the responder. Possible causes include a responder QP related error that
211        ///    prevented the responder from completing the request or a malformed WQE on the Receive
212        ///    Queue. Relevant for RC QPs.
213        ///  - `IBV_WC_RETRY_EXC_ERR`: Transport Retry Counter Exceeded: The local transport timeout
214        ///    retry counter was exceeded while trying to send this message. This means that the remote
215        ///    side didn't send any Ack or Nack. If this happens when sending the first message,
216        ///    usually this mean that the connection attributes are wrong or the remote side isn't in a
217        ///    state that it can respond to messages. If this happens after sending the first message,
218        ///    usually it means that the remote QP isn't available anymore. Relevant for RC QPs.
219        ///  - `IBV_WC_RNR_RETRY_EXC_ERR`: RNR Retry Counter Exceeded: The RNR NAK retry count was
220        ///    exceeded. This usually means that the remote side didn't post any WR to its Receive
221        ///    Queue. Relevant for RC QPs.
222        ///  - `IBV_WC_LOC_RDD_VIOL_ERR`: Local RDD Violation Error: The RDD associated with the QP
223        ///    does not match the RDD associated with the EE Context (unused, since its relevant only
224        ///    to RD QPs or EE Context, which aren't supported).
225        ///  - `IBV_WC_REM_INV_RD_REQ_ERR`: Remote Invalid RD Request: The responder detected an
226        ///    invalid incoming RD message. Causes include a Q_Key or RDD violation (unused, since its
227        ///    relevant only to RD QPs or EE Context, which aren't supported)
228        ///  - `IBV_WC_REM_ABORT_ERR`: Remote Aborted Error: For UD or UC QPs associated with a SRQ,
229        ///    the responder aborted the operation.
230        ///  - `IBV_WC_INV_EECN_ERR`: Invalid EE Context Number: An invalid EE Context number was
231        ///    detected (unused, since its relevant only to RD QPs or EE Context, which aren't
232        ///    supported).
233        ///  - `IBV_WC_INV_EEC_STATE_ERR`: Invalid EE Context State Error: Operation is not legal for
234        ///    the specified EE Context state (unused, since its relevant only to RD QPs or EE Context,
235        ///    which aren't supported).
236        ///  - `IBV_WC_FATAL_ERR`: Fatal Error.
237        ///  - `IBV_WC_RESP_TIMEOUT_ERR`: Response Timeout Error.
238        ///  - `IBV_WC_GENERAL_ERR`: General Error: other error which isn't one of the above errors.
239        pub fn error(&self) -> Option<(ibv_wc_status::Type, u32)> {
240            match self.status {
241                ibv_wc_status::IBV_WC_SUCCESS => None,
242                status => Some((status, self.vendor_err)),
243            }
244        }
245
246        /// Returns the operation that the corresponding Work Request performed.
247        ///
248        /// This value controls the way that data was sent, the direction of the data flow and the
249        /// valid attributes in the Work Completion.
250        pub fn opcode(&self) -> ibv_wc_opcode::Type {
251            self.opcode
252        }
253
254        /// Returns a 32 bits number, in network order, in an SEND or RDMA WRITE opcodes that is being
255        /// sent along with the payload to the remote side and placed in a Receive Work Completion and
256        /// not in a remote memory buffer
257        ///
258        /// Note that IMM is only returned if `IBV_WC_WITH_IMM` is set in `wc_flags`. If this is not
259        /// the case, no immediate value was provided, and `imm_data` should be interpreted
260        /// differently. See `man ibv_poll_cq` for details.
261        pub fn imm_data(&self) -> Option<u32> {
262            if self.is_valid() && ((self.wc_flags & ibv_wc_flags::IBV_WC_WITH_IMM).0 != 0) {
263                Some(self.imm_data)
264            } else {
265                None
266            }
267        }
268    }
269
270    impl Default for ibv_wc {
271        fn default() -> Self {
272            ibv_wc {
273                wr_id: 0,
274                status: ibv_wc_status::IBV_WC_GENERAL_ERR,
275                opcode: ibv_wc_opcode::IBV_WC_LOCAL_INV,
276                vendor_err: 0,
277                byte_len: 0,
278                imm_data: 0,
279                qp_num: 0,
280                src_qp: 0,
281                wc_flags: ibv_wc_flags(0),
282                pkey_index: 0,
283                slid: 0,
284                sl: 0,
285                dlid_path_bits: 0,
286            }
287        }
288    }
289}
290
291pub use inner::*;
292
293// Segment scanner callback type - type alias for the bindgen-generated type
294pub type RdmaxcelSegmentScannerFn = rdmaxcel_segment_scanner_fn;
295
296// Additional extern "C" declarations for functions that are also auto-generated by bindgen.
297// These provide a place for doc comments and explicit signatures.
298unsafe extern "C" {
299    pub fn rdmaxcel_error_string(error_code: std::os::raw::c_int) -> *const std::os::raw::c_char;
300
301    /// Get PCI address from a CUDA/HIP device pointer
302    ///
303    /// In CUDA builds, cuda_ptr is CUdeviceptr (u64).
304    /// In ROCm builds, cuda_ptr is CUdeviceptr (aliased to hipDeviceptr_t = void*).
305    pub fn get_cuda_pci_address_from_ptr(
306        cuda_ptr: CUdeviceptr,
307        pci_addr_out: *mut std::os::raw::c_char,
308        pci_addr_size: usize,
309    ) -> std::os::raw::c_int;
310
311    /// Debug: Print comprehensive device attributes
312    pub fn rdmaxcel_print_device_info(context: *mut ibv_context);
313
314    // EFA functions
315
316    /// Check if the device is an EFA device (via efadv_query_device)
317    pub fn rdmaxcel_is_efa_dev(ctx: *mut ibv_context) -> std::os::raw::c_int;
318
319    /// EFA connect: INIT->RTR->RTS + AH creation, stored directly in qp struct
320    pub fn rdmaxcel_efa_connect(
321        qp: *mut rdmaxcel_qp_t,
322        port_num: u8,
323        pkey_index: u16,
324        qkey: u32,
325        psn: u32,
326        gid_index: u8,
327        remote_gid: *const u8,
328        remote_qpn: u32,
329    ) -> std::os::raw::c_int;
330
331    /// EFA post operation with ibv_post_recv fallback
332    /// op_type: 0 = write, 1 = read, 2 = recv, 3 = write_with_imm
333    pub fn rdmaxcel_qp_post_op(
334        qp: *mut rdmaxcel_qp_t,
335        local_addr: *mut std::ffi::c_void,
336        lkey: u32,
337        length: usize,
338        remote_addr: *mut std::ffi::c_void,
339        rkey: u32,
340        wr_id: u64,
341        signaled: std::os::raw::c_int,
342        op_type: std::os::raw::c_int,
343    ) -> std::os::raw::c_int;
344}