diff --git a/pyverbs/cq.pyx b/pyverbs/cq.pyx index 0ac73d1a1..06dbe7f6a 100644 --- a/pyverbs/cq.pyx +++ b/pyverbs/cq.pyx @@ -1,6 +1,7 @@ # SPDX-License-Identifier: (GPL-2.0 OR Linux-OpenIB) # Copyright (c) 2019, Mellanox Technologies. All rights reserved. import weakref +from libc.stdint cimport uintptr_t from pyverbs.pyverbs_error import PyverbsError, PyverbsRDMAError from pyverbs.base import PyverbsRDMAErrno @@ -214,7 +215,7 @@ cdef class CQ(PyverbsCM): @property def cq(self): - return self.cq + return self.cq cdef class CqInitAttrEx(PyverbsObject): diff --git a/pyverbs/device.pyx b/pyverbs/device.pyx index 1eaf3cfd2..054f8b4d4 100644 --- a/pyverbs/device.pyx +++ b/pyverbs/device.pyx @@ -28,6 +28,7 @@ from libc.string cimport memset from libc.stdint cimport uint64_t from libc.stdint cimport uint16_t from libc.stdint cimport uint32_t +from libc.stdint cimport uintptr_t from pyverbs.utils import gid_str cdef extern from 'endian.h': @@ -188,7 +189,7 @@ cdef class Context(PyverbsCM): @property def context(self): - return self.context + return self.context @property def num_comp_vectors(self): diff --git a/pyverbs/providers/mlx5/mlx5dv_objects.pyx b/pyverbs/providers/mlx5/mlx5dv_objects.pyx index 0567e81a0..c61bc2b77 100644 --- a/pyverbs/providers/mlx5/mlx5dv_objects.pyx +++ b/pyverbs/providers/mlx5/mlx5dv_objects.pyx @@ -167,7 +167,7 @@ cdef class Mlx5DvObj(PyverbsObject): if cq: dv_cq = Mlx5DvCQ() - self.obj.cq.in_ = cq.cq + self.obj.cq.in_ = cq.cq self.obj.cq.out = &(dv_cq.dv_cq) self.dv_cq = dv_cq if qp: @@ -177,7 +177,7 @@ cdef class Mlx5DvObj(PyverbsObject): dv.MLX5DV_QP_MASK_UAR_MMAP_OFFSET | \ dv.MLX5DV_QP_MASK_RAW_QP_HANDLES | \ dv.MLX5DV_QP_MASK_RAW_QP_TIR_ADDR - self.obj.qp.in_ = qp.qp + self.obj.qp.in_ = qp.qp self.obj.qp.out = &(dv_qp.dv_qp) self.dv_qp = dv_qp if pd: @@ -189,7 +189,7 @@ cdef class Mlx5DvObj(PyverbsObject): dv_srq = Mlx5DvSRQ() comp_mask = kwargs.get('srq_comp_mask') dv_srq.comp_mask = comp_mask if comp_mask else dv.MLX5DV_SRQ_MASK_SRQN - self.obj.srq.in_ = srq.srq + self.obj.srq.in_ = srq.srq self.obj.srq.out = &(dv_srq.dv_srq) self.dv_srq = dv_srq diff --git a/pyverbs/qp.pyx b/pyverbs/qp.pyx index 61b75c638..40550813b 100644 --- a/pyverbs/qp.pyx +++ b/pyverbs/qp.pyx @@ -2,6 +2,7 @@ # Copyright (c) 2019 Mellanox Technologies, Inc. All rights reserved. from libc.stdlib cimport malloc, free +from libc.stdint cimport uintptr_t from libc.string cimport memcpy import weakref @@ -1265,6 +1266,10 @@ cdef class QP(PyverbsCM): def qp_num(self): return self.qp.qp_num + @property + def qp(self): + return self.qp + def __str__(self): print_format = '{:22}: {:<20}\n' return print_format.format('QP type', qp_type_to_str(self.qp_type)) +\ diff --git a/pyverbs/srq.pyx b/pyverbs/srq.pyx index e0098e7d0..b87d46f2f 100644 --- a/pyverbs/srq.pyx +++ b/pyverbs/srq.pyx @@ -2,6 +2,7 @@ import weakref from libc.errno cimport errno from libc.string cimport memcpy from libc.stdlib cimport malloc, free +from libc.stdint cimport uintptr_t from pyverbs.pyverbs_error import PyverbsRDMAError, PyverbsError from pyverbs.wr cimport RecvWR, SGE, copy_sg_array from pyverbs.base import PyverbsRDMAErrno @@ -336,3 +337,7 @@ cdef class SRQ(PyverbsCM): if bad_wr: memcpy(&bad_wr.recv_wr, my_bad_wr, sizeof(bad_wr.recv_wr)) raise PyverbsRDMAError('Failed to post receive to SRQ.', rc) + + @property + def srq(self): + return self.srq diff --git a/tests/mlx5_base.py b/tests/mlx5_base.py index f843940e8..d02c3b3e1 100644 --- a/tests/mlx5_base.py +++ b/tests/mlx5_base.py @@ -63,6 +63,7 @@ 0x1021, # ConnectX-7 0x1023, # ConnectX-8 0x1025, # ConnectX-9 + 0x1027, # ConnectX-10 0xa2d2, # BlueField integrated ConnectX-5 network controller 0xa2d3, # BlueField integrated ConnectX-5 network controller VF 0xa2d6, # BlueField-2 integrated ConnectX-6 Dx network controller @@ -70,9 +71,6 @@ 0xa2df, # BlueField-4 integrated ConnectX-8 network controller } -DCI_TEST_GOOD_FLOW = 0 -DCI_TEST_BAD_FLOW_WITH_RESET = 1 -DCI_TEST_BAD_FLOW_WITHOUT_RESET = 2 IB_SMP_ATTR_PORT_INFO = 0x0015 IB_MGMT_CLASS_SUBN_LID_ROUTED = 0x01 IB_MGMT_METHOD_GET = 0x01 @@ -123,9 +121,16 @@ def setUp(self): class Mlx5DcResources(RoCETrafficResources): def __init__(self, dev_name, ib_port, gid_index, send_ops_flags, - qp_count=1, create_flags=0): + qp_count=1, create_flags=0, mr_access=None): self.send_ops_flags = send_ops_flags self.create_flags = create_flags + if mr_access is None: + self.mr_access = (ibv_access_flags.IBV_ACCESS_REMOTE_WRITE | + ibv_access_flags.IBV_ACCESS_LOCAL_WRITE | + ibv_access_flags.IBV_ACCESS_REMOTE_ATOMIC | + ibv_access_flags.IBV_ACCESS_REMOTE_READ) + else: + self.mr_access = mr_access super().__init__(dev_name, ib_port, gid_index, with_srq=True, qp_count=qp_count) @@ -145,9 +150,7 @@ def create_context(self): raise unittest.SkipTest('Opening mlx5 context is not supported') def create_mr(self): - access = ibv_access_flags.IBV_ACCESS_REMOTE_WRITE | ibv_access_flags.IBV_ACCESS_LOCAL_WRITE | \ - ibv_access_flags.IBV_ACCESS_REMOTE_ATOMIC | ibv_access_flags.IBV_ACCESS_REMOTE_READ - self.mr = MR(self.pd, self.msg_size, access) + self.mr = MR(self.pd, self.msg_size, self.mr_access) def create_qp_cap(self): return QPCap(100, 0, 1, 0) @@ -205,18 +208,14 @@ def create_qps(self): class Mlx5DcStreamsRes(Mlx5DcResources): def __init__(self, dev_name, ib_port, gid_index, send_ops_flags, - qp_count=1, create_flags=0): - self.bad_flow = 0 - self.mr_bad_flow = False - self.stream_check = False + qp_count=1, create_flags=0, mr_access=None): super().__init__(dev_name, ib_port, gid_index, send_ops_flags, - qp_count, create_flags) + qp_count, create_flags, mr_access) def reset_qp(self, qp_idx): qp_attr = QPAttr(qp_state=ibv_qp_state.IBV_QPS_RESET) self.qps[qp_idx].modify(qp_attr, ibv_qp_attr_mask.IBV_QP_STATE) self.qps[qp_idx].to_rts(qp_attr) - self.qp_stream_errors[qp_idx][0] = 0 def get_stream_id(self, qp_idx): return self.current_qp_stream_id[qp_idx] @@ -231,96 +230,6 @@ def generate_stream_id(self, qp_idx): def dci_reset_stream_id(self, qp_idx): stream_id = self.get_stream_id(qp_idx) Mlx5QP.modify_dci_stream_channel_id(self.qps[qp_idx], stream_id) - # Check once if error raised when reset wrong stream id - if self.stream_check: - try: - Mlx5QP.modify_dci_stream_channel_id(self.qps[qp_idx], - stream_id+1) - except PyverbsRDMAError as ex: - self.stream_check = False - - def bad_flow_handler_qp(self, qp_idx, status, reset=False): - str_id = self.get_stream_id(qp_idx) - bt_stream = (1 << str_id) - - if status == ibv_wc_status.IBV_WC_LOC_PROT_ERR: - self.qp_stream_errors[qp_idx][1] += 1 - if (self.qp_stream_errors[qp_idx][0] & bt_stream) != 0: - raise PyverbsError(f'Dublicate error from stream id {str_id}') - self.qp_stream_errors[qp_idx][0] |= bt_stream - if status == ibv_wc_status.IBV_WC_WR_FLUSH_ERR: - qp_attr, _ = self.qps[qp_idx].query(ibv_qp_attr_mask.IBV_QP_STATE) - if qp_attr.cur_qp_state == ibv_qp_state.IBV_QPS_ERR and reset: - if self.qp_stream_errors[qp_idx][1] != self.dcis[qp_idx]['errored']: - msg = f'QP {qp_idx} in ERR state with wrong number of counter' - raise PyverbsError(msg) - self.reset_qp(qp_idx) - self.qp_stream_errors[qp_idx][2] = True - - return True - - def bad_flow_handling(self, qp_idx, status, reset=False): - if self.bad_flow == DCI_TEST_GOOD_FLOW: - return False - if self.bad_flow == DCI_TEST_BAD_FLOW_WITH_RESET: - self.qp_stream_errors[qp_idx][1] += 1 - if reset: - self.dci_reset_stream_id(qp_idx) - return True - if self.bad_flow == DCI_TEST_BAD_FLOW_WITHOUT_RESET: - return self.bad_flow_handler_qp(qp_idx, status, reset) - return False - - def set_bad_flow(self, bad_flow): - self.bad_flow = bad_flow - if self.bad_flow: - if bad_flow == DCI_TEST_BAD_FLOW_WITH_RESET and self.log_dci_errored == 0: - raise unittest.SkipTest('DCS test of bad flow with reset is not ' - 'supported when HCA_CAP.log_dci_errored is 0') - self.pd_bad = PD(self.ctx) - self.mr_bad_flow = False - if bad_flow == DCI_TEST_BAD_FLOW_WITH_RESET: - self.stream_check = True - - def is_bad_flow(self, qp_idx): - cnt = self.get_stream_id(qp_idx) - if self.bad_flow == DCI_TEST_GOOD_FLOW: - return False - if self.bad_flow == DCI_TEST_BAD_FLOW_WITH_RESET: - if (cnt % 3) != 0: - return False - self.qp_stream_errors[qp_idx][0] += 1 - if self.bad_flow == DCI_TEST_BAD_FLOW_WITHOUT_RESET: - if self.qp_stream_errors[qp_idx][2]: - return False - return True - - def check_bad_flow(self, qp_idx): - change_mr = False - if self.is_bad_flow(qp_idx): - if not self.mr_bad_flow: - self.mr_bad_flow = True - pd = self.pd_bad - change_mr = True - else: - if self.mr_bad_flow: - self.mr_bad_flow = False - pd = self.pd - change_mr = True - if change_mr: - self.mr.rereg(flags=ibv_rereg_mr_flags.IBV_REREG_MR_CHANGE_PD, pd=pd, - addr=0, length=0, access=0) - - def check_after_traffic(self): - if self.bad_flow == DCI_TEST_BAD_FLOW_WITH_RESET: - for errs in self.qp_stream_errors: - if errs[0] != errs[1]: - msg = f'Number of qp_stream_errors {errs[0]} not same '\ - f'as number of catches {errs[1]}' - raise PyverbsError(msg) - if self.stream_check: - msg = 'Reset of good stream id does not create exception' - raise PyverbsError(msg) def generate_dci_attr(self, qpn): # This array contains current number of log_dci_streams @@ -347,8 +256,6 @@ def create_qps(self): self.dcis = {} # This array contains current stream id self.current_qp_stream_id = {} - # This array counts different errors in bad_flow - self.qp_stream_errors = [] comp_mask = mlx5dv_qp_init_attr_mask.MLX5DV_QP_INIT_ATTR_MASK_DC | \ mlx5dv_qp_init_attr_mask.MLX5DV_QP_INIT_ATTR_MASK_DCI_STREAMS try: @@ -367,15 +274,6 @@ def create_qps(self): self.qps.append(qp) # Different values for start point of stream id per qp self.current_qp_stream_id[qpn] = qpn - # Array of errors for bad_flow - # For DCI_TEST_BAD_FLOW_WITH_RESET - # First element - number of injected bad flows - # Second element - number of exceptions from bad flows - # For DCI_TEST_BAD_FLOW_WITHOUT_RESET - # First element - bitmap of bad flow streams - # Second element - number of exceptions from bad flows - # Third element - flag if reset of qp been executed - self.qp_stream_errors.append([0, 0, False]) self.qps_num.append(qp.qp_num) self.psns.append(random.getrandbits(24)) # Create the DCT QP. @@ -390,49 +288,6 @@ def create_qps(self): raise unittest.SkipTest('Create DC QP is not supported') raise ex - @staticmethod - def traffic_with_bad_flow(client, server, iters, gid_idx, port): - """ - Runs basic traffic with bad flow between two sides - :param client: client side, clients base class is BaseTraffic - :param server: server side, servers base class is BaseTraffic - :param iters: number of traffic iterations - :param gid_idx: local gid index - :param port: IB port - :return: None - """ - import tests.utils as u - send_op = ibv_wr_opcode.IBV_WR_SEND - ah_client = u.get_global_ah(client, gid_idx, port) - s_recv_wr = u.get_recv_wr(server) - c_recv_wr = u.get_recv_wr(client) - for qp_idx in range(server.qp_count): - # Prepare the receive queue with RecvWR - u.post_recv(client, c_recv_wr, qp_idx=qp_idx) - u.post_recv(server, s_recv_wr, qp_idx=qp_idx) - read_offset = 0 - for _ in range(iters): - for qp_idx in range(server.qp_count): - _, c_send_object = u.get_send_elements(client, False) - u.send(client, c_send_object, send_op, True, qp_idx, - ah_client, False) - try: - wcs = u._poll_cq(client.cq) - except PyverbsError as ex: - if client.bad_flow_handling(qp_idx, ibv_wc_status.IBV_WC_SUCCESS, True): - continue - raise ex - else: - if wcs[0].status != ibv_wc_status.IBV_WC_SUCCESS and \ - client.bad_flow_handling(qp_idx, wcs[0].status, True): - continue - - u.poll_cq(server.cq) - u.post_recv(server, s_recv_wr, qp_idx=qp_idx) - msg_received = server.mr.read(server.msg_size, read_offset) - u.validate(msg_received, True, server.msg_size) - client.check_after_traffic() - class WqAttrs: def __init__(self): diff --git a/tests/test_mlx5_dc.py b/tests/test_mlx5_dc.py index ccf3eb844..176ac286b 100644 --- a/tests/test_mlx5_dc.py +++ b/tests/test_mlx5_dc.py @@ -4,13 +4,11 @@ import unittest import errno -from tests.mlx5_base import Mlx5DcResources, Mlx5RDMATestCase, Mlx5DcStreamsRes,\ - DCI_TEST_GOOD_FLOW, DCI_TEST_BAD_FLOW_WITH_RESET,\ - DCI_TEST_BAD_FLOW_WITHOUT_RESET +from tests.mlx5_base import Mlx5DcResources, Mlx5RDMATestCase, Mlx5DcStreamsRes from pyverbs.pyverbs_error import PyverbsRDMAError from pyverbs.providers.mlx5.mlx5dv import Mlx5QP from pyverbs.libibverbs_enums import ibv_access_flags, ibv_qp_create_send_ops_flags, ibv_wr_opcode, \ - ibv_odp_transport_cap_bits + ibv_odp_transport_cap_bits, ibv_qp_attr_mask, ibv_qp_state import tests.utils as u @@ -106,30 +104,58 @@ def test_dc_rdma_write_stream(self): u.rdma_traffic(**self.traffic_args, new_send=True, send_op=ibv_wr_opcode.IBV_WR_RDMA_WRITE) - def test_dc_send_stream_bad_flow(self): + def test_dc_stream_qp_recovery(self): """ - Check bad flow of DCS with reset stream id. - Create error in dci stream by setting invalid PD so dci stream goes to error. - In the end, the test verifies that the number of errors is as expected. - :raises SkipTest: In case DCI is not supported with HW + Test DC QP error state transition with stream channel error accumulation. + Creates DC QPs with restricted MR access and generates remote access errors + via RDMA_WRITE operations. Verifies QP transitions to ERR state after enough + channels entered error mode. Validates QP recovery after reset. """ - self.create_players(Mlx5DcStreamsRes, - qp_count=1, send_ops_flags=ibv_qp_create_send_ops_flags.IBV_QP_EX_WITH_SEND) - self.client.set_bad_flow(DCI_TEST_BAD_FLOW_WITH_RESET) - self.client.traffic_with_bad_flow(**self.traffic_args) - - def test_dc_send_stream_bad_flow_qp(self): + self.create_players(Mlx5DcStreamsRes, qp_count=2, + send_ops_flags=ibv_qp_create_send_ops_flags.IBV_QP_EX_WITH_RDMA_WRITE, + mr_access=ibv_access_flags.IBV_ACCESS_LOCAL_WRITE) + qp_idx = 0 + error_threshold = self.client.dcis[qp_idx]['errored'] + u.traffic(**self.traffic_args, new_send=True, send_op=ibv_wr_opcode.IBV_WR_SEND) + for _ in range(error_threshold): + with self.assertRaisesRegex(PyverbsRDMAError, r'Remote access error'): + u.rdma_traffic(**self.traffic_args, new_send=True, + send_op=ibv_wr_opcode.IBV_WR_RDMA_WRITE) + # Retry mechanism: QP state update to ERR takes time after errors occur + qp_in_err_state = False + for _ in range(3): + qp_attr, _ = self.client.qps[qp_idx].query(ibv_qp_attr_mask.IBV_QP_STATE) + if qp_attr.cur_qp_state == ibv_qp_state.IBV_QPS_ERR: + qp_in_err_state = True + break + if not qp_in_err_state: + raise PyverbsRDMAError(f'QP is not in ERR state after {error_threshold} errors') + for qp_idx in range(self.client.qp_count): + self.client.reset_qp(qp_idx) + for qp_idx in range(self.server.qp_count): + self.server.reset_qp(qp_idx) + u.traffic(**self.traffic_args, new_send=True, send_op=ibv_wr_opcode.IBV_WR_SEND) + + def test_dc_stream_ids_recovery(self): """ - Check bad flow of DCS with reset qp. - Checked if resetting of wrong dci stream id produces an exception. - This bad flow creates enough errors without resetting the streams, - enforcing the QP to get into ERR state. Then the checking is stopped. - Also has feature that after QP goes in ERR state test will - reset QP to RTS state. - :raises SkipTest: In case DCI is not supported with HW + Test DC stream ID reset functionality after remote access errors. + Creates DC QPs with restricted MR access and generates + remote access errors via RDMA_WRITE operations. After each error, resets + the stream ID and verifies QP remains functional. + Validates normal SEND traffic continues to work after stream resets. """ - self.iters = 20 - self.create_players(Mlx5DcStreamsRes, - qp_count=1, send_ops_flags=ibv_qp_create_send_ops_flags.IBV_QP_EX_WITH_SEND) - self.client.set_bad_flow(DCI_TEST_BAD_FLOW_WITHOUT_RESET) - self.client.traffic_with_bad_flow(**self.traffic_args) + self.create_players(Mlx5DcStreamsRes, qp_count=2, + send_ops_flags=ibv_qp_create_send_ops_flags.IBV_QP_EX_WITH_RDMA_WRITE, + mr_access=ibv_access_flags.IBV_ACCESS_LOCAL_WRITE) + qp_idx = 0 + error_threshold = self.client.dcis[qp_idx]['errored'] + u.traffic(**self.traffic_args, new_send=True, send_op=ibv_wr_opcode.IBV_WR_SEND) + for _ in range(error_threshold): + with self.assertRaisesRegex(PyverbsRDMAError, r'Remote access error'): + u.rdma_traffic(**self.traffic_args, new_send=True, + send_op=ibv_wr_opcode.IBV_WR_RDMA_WRITE) + self.client.dci_reset_stream_id(qp_idx) + qp_attr, _ = self.client.qps[qp_idx].query(ibv_qp_attr_mask.IBV_QP_STATE) + if qp_attr.cur_qp_state == ibv_qp_state.IBV_QPS_ERR: + raise PyverbsRDMAError('QP is in ERR state after reset stream id') + u.traffic(**self.traffic_args, new_send=True, send_op=ibv_wr_opcode.IBV_WR_SEND) diff --git a/tests/utils.py b/tests/utils.py index 333fef7c8..0860b304b 100644 --- a/tests/utils.py +++ b/tests/utils.py @@ -606,7 +606,6 @@ def post_send_ex(agr_obj, send_object, send_op=None, qp_idx=0, ah=None, **kwargs if hasattr(agr_obj, 'remote_dct_num'): if isinstance(agr_obj, Mlx5DcStreamsRes): stream_id = agr_obj.generate_stream_id(qp_idx) - agr_obj.check_bad_flow(qp_idx) qp.wr_set_dc_addr_stream(ah, agr_obj.remote_dct_num, DCT_KEY, stream_id) else: