-
Notifications
You must be signed in to change notification settings - Fork 196
Open
Description
What happened?
When working in a KVM-based environment, we can follow the steps below to set up the RDMA environment:
yum install rdma-core perftest -y
sudo modprobe ib_core
sudo modprobe ib_uverbs
sudo modprobe rdma_cm
cat /boot/config-$(uname -r) | grep RXE
# (output truncated — presumably CONFIG_RDMA_RXE=m, confirming the soft-RoCE module is built)
modprobe rdma_rxe
rdma link add rxe_0 type rxe netdev eth1
rdma link
A (server): ib_send_bw -d rxe_0
B (client): ib_send_bw -d rxe_0 <server_ip>
So that means we can use a virtio device as the RDMA NIC, right?
This is the pcap capture from the perftest run.
rdma.zip
What did you expect to happen?
I expected the virtio NIC to be usable for RDMA (i.e. the device plugin would register the RDMA resources).
What are the minimal steps needed to reproduce the bug?
[root@rocky92 ~]# kk get cm sriovdp-config -o yaml
apiVersion: v1
data:
config.json: |
{
"resourceList": [
{
"resourcePrefix": "mellanox.com",
"resourceName": "sriov_vppdpdk_rdma5",
"selectors":
{
"pciaddresses": ["0000:02:00.0"],
"isRdma": true
}
},
{
"resourcePrefix": "mellanox.com",
"resourceName": "sriov_vppdpdk_rdma8",
"selectors":
{
"pciaddresses": ["0000:03:00.0"],
"isRdma": true
}
}
]
}
kind: ConfigMap
The device-plugin pod's logs are shown below:
[root@rocky92 ~]# kk logs kube-sriov-device-plugin-hcwks
I0830 14:28:06.628921 1 manager.go:57] Using Kubelet Plugin Registry Mode
I0830 14:28:06.629778 1 main.go:46] resource manager reading configs
I0830 14:28:06.630716 1 manager.go:86] raw ResourceList: {
"resourceList": [
{
"resourcePrefix": "mellanox.com",
"resourceName": "sriov_vppdpdk_rdma5",
"selectors":
{
"pciaddresses": ["0000:02:00.0"],
"isRdma": true
}
},
{
"resourcePrefix": "mellanox.com",
"resourceName": "sriov_vppdpdk_rdma8",
"selectors":
{
"pciaddresses": ["0000:03:00.0"],
"isRdma": true
}
}
]
}
I0830 14:28:06.630886 1 factory.go:211] *types.NetDeviceSelectors for resource sriov_vppdpdk_rdma5 is [0xc00043fc20]
I0830 14:28:06.630900 1 factory.go:211] *types.NetDeviceSelectors for resource sriov_vppdpdk_rdma8 is [0xc0004d4000]
I0830 14:28:06.630904 1 manager.go:106] unmarshalled ResourceList: [{ResourcePrefix:mellanox.com ResourceName:sriov_vppdpdk_rdma5 DeviceType:netDevice ExcludeTopology:false Selectors:0xc000469188 AdditionalInfo:map[] SelectorObjs:[0xc00043fc20]} {ResourcePrefix:mellanox.com ResourceName:sriov_vppdpdk_rdma8 DeviceType:netDevice ExcludeTopology:false Selectors:0xc0004691a0 AdditionalInfo:map[] SelectorObjs:[0xc0004d4000]}]
I0830 14:28:06.630947 1 manager.go:217] validating resource name "mellanox.com/sriov_vppdpdk_rdma5"
I0830 14:28:06.630997 1 manager.go:217] validating resource name "mellanox.com/sriov_vppdpdk_rdma8"
I0830 14:28:06.631000 1 main.go:62] Discovering host devices
I0830 14:28:06.710775 1 netDeviceProvider.go:67] netdevice AddTargetDevices(): device found: 0000:01:00.0 02 Red Hat, Inc. Virtio 1.0 network device
I0830 14:28:06.710922 1 netDeviceProvider.go:67] netdevice AddTargetDevices(): device found: 0000:02:00.0 02 Red Hat, Inc. Virtio 1.0 network device
I0830 14:28:06.710946 1 netDeviceProvider.go:67] netdevice AddTargetDevices(): device found: 0000:03:00.0 02 Red Hat, Inc. Virtio 1.0 network device
I0830 14:28:06.710966 1 netDeviceProvider.go:67] netdevice AddTargetDevices(): device found: 0000:04:00.0 02 Red Hat, Inc. Virtio 1.0 network device
I0830 14:28:06.710989 1 auxNetDeviceProvider.go:84] auxnetdevice AddTargetDevices(): device found: 0000:01:00.0 02 Red Hat, Inc. Virtio 1.0 network device
I0830 14:28:06.711005 1 auxNetDeviceProvider.go:84] auxnetdevice AddTargetDevices(): device found: 0000:02:00.0 02 Red Hat, Inc. Virtio 1.0 network device
I0830 14:28:06.711008 1 auxNetDeviceProvider.go:84] auxnetdevice AddTargetDevices(): device found: 0000:03:00.0 02 Red Hat, Inc. Virtio 1.0 network device
I0830 14:28:06.711011 1 auxNetDeviceProvider.go:84] auxnetdevice AddTargetDevices(): device found: 0000:04:00.0 02 Red Hat, Inc. Virtio 1.0 network device
I0830 14:28:06.711014 1 main.go:68] Initializing resource servers
I0830 14:28:06.711021 1 manager.go:117] number of config: 2
I0830 14:28:06.711034 1 manager.go:121] Creating new ResourcePool: sriov_vppdpdk_rdma5
I0830 14:28:06.711037 1 manager.go:122] DeviceType: netDevice
W0830 14:28:06.711121 1 pciNetDevice.go:74] RDMA resources for 0000:01:00.0 not found. Are RDMA modules loaded?
W0830 14:28:06.711668 1 pciNetDevice.go:74] RDMA resources for 0000:02:00.0 not found. Are RDMA modules loaded?
W0830 14:28:06.711888 1 pciNetDevice.go:74] RDMA resources for 0000:03:00.0 not found. Are RDMA modules loaded?
E0830 14:28:06.712100 1 netDeviceProvider.go:50] netdevice GetDevices(): error creating new device: "error getting driver info for device 0000:04:00.0 readlink /sys/bus/pci/devices/0000:04:00.0/driver: no such file or directory"
I0830 14:28:06.712394 1 manager.go:138] initServers(): selector index 0 will register 0 devices
I0830 14:28:06.712500 1 manager.go:142] no devices in device pool, skipping creating resource server for sriov_vppdpdk_rdma5
I0830 14:28:06.712510 1 manager.go:121] Creating new ResourcePool: sriov_vppdpdk_rdma8
I0830 14:28:06.712514 1 manager.go:122] DeviceType: netDevice
W0830 14:28:06.712676 1 pciNetDevice.go:74] RDMA resources for 0000:01:00.0 not found. Are RDMA modules loaded?
W0830 14:28:06.713002 1 pciNetDevice.go:74] RDMA resources for 0000:02:00.0 not found. Are RDMA modules loaded?
W0830 14:28:06.713192 1 pciNetDevice.go:74] RDMA resources for 0000:03:00.0 not found. Are RDMA modules loaded?
E0830 14:28:06.713371 1 netDeviceProvider.go:50] netdevice GetDevices(): error creating new device: "error getting driver info for device 0000:04:00.0 readlink /sys/bus/pci/devices/0000:04:00.0/driver: no such file or directory"
I0830 14:28:06.713394 1 manager.go:138] initServers(): selector index 0 will register 0 devices
I0830 14:28:06.713536 1 manager.go:142] no devices in device pool, skipping creating resource server for sriov_vppdpdk_rdma8
I0830 14:28:06.713651 1 main.go:74] Starting all servers...
I0830 14:28:06.713663 1 main.go:79] All servers started.
I0830 14:28:06.713675 1 main.go:80] Listening for term signals
Anything else we need to know?
Component Versions
Please fill in the below table with the version numbers of components used.
| Component | Version |
|---|---|
| SR-IOV Network Device Plugin | <ghcr.io/k8snetworkplumbingwg/sriov-network-device-plugin:latest> |
| SR-IOV CNI Plugin | <ghcr.io/k8snetworkplumbingwg/sriov-cni:latest> |
| Multus | <k8snetworkplumbingwg/multus-cni@sha256:6568b4762f6e793ab696bf0dc559c22c010e9f3223266c5da9d6c2ac052e251e> |
| Kubernetes | <1.28.2> |
| OS | <rocky9.2/5.14.0-427.31.1.el9_4.x86_64 > |
Config Files
Config file locations may be config dependent.
NA
Device pool config file location (Try '/etc/pcidp/config.json')
[root@rocky92 ~]# kk exec -it kube-sriov-device-plugin-hcwks sh
kubectl exec [POD] [COMMAND] is DEPRECATED and will be removed in a future version. Use kubectl exec [POD] -- [COMMAND] instead.
/ # cat /etc/pcidp/
..2024_08_30_14_28_05.3828926319/ ..data/ config.json
/ # cat /etc/pcidp/config.json
{
"resourceList": [
{
"resourcePrefix": "mellanox.com",
"resourceName": "sriov_vppdpdk_rdma5",
"selectors":
{
"pciaddresses": ["0000:02:00.0"],
"isRdma": true
}
},
{
"resourcePrefix": "mellanox.com",
"resourceName": "sriov_vppdpdk_rdma8",
"selectors":
{
"pciaddresses": ["0000:03:00.0"],
"isRdma": true
}
}
]
}
/ #
Multus config (Try '/etc/cni/multus/net.d')
CNI config (Try '/etc/cni/net.d/')
Kubernetes deployment type ( Bare Metal, Kubeadm etc.)
KVM-based VM
Kubeconfig file
NA
SR-IOV Network Custom Resource Definition
[root@rocky92 ~]# kk exec -it kube-sriov-device-plugin-hcwks sh
kubectl exec [POD] [COMMAND] is DEPRECATED and will be removed in a future version. Use kubectl exec [POD] -- [COMMAND] instead.
/ # cat /etc/pcidp/
..2024_08_30_14_28_05.3828926319/ ..data/ config.json
/ # cat /etc/pcidp/config.json
{
"resourceList": [
{
"resourcePrefix": "mellanox.com",
"resourceName": "sriov_vppdpdk_rdma5",
"selectors":
{
"pciaddresses": ["0000:02:00.0"],
"isRdma": true
}
},
{
"resourcePrefix": "mellanox.com",
"resourceName": "sriov_vppdpdk_rdma8",
"selectors":
{
"pciaddresses": ["0000:03:00.0"],
"isRdma": true
}
}
]
}
/ #
Logs
SR-IOV Network Device Plugin Logs (use kubectl logs $PODNAME)
NA
Multus logs (If enabled. Try '/var/log/multus.log' )
NA
Kubelet logs (journalctl -u kubelet)
[root@rocky92 ~]# journalctl -xe -u kubelet
Aug 30 22:28:34 rocky92 kubelet[14666]: I0830 22:28:34.722461 14666 memory_manager.go:346] "RemoveStaleState removing state" podUID="bd5db651-a44c-42a5-84c1-432734ab2>
Aug 30 22:28:34 rocky92 kubelet[14666]: I0830 22:28:34.812787 14666 reconciler_common.go:172] "operationExecutor.UnmountVolume started for volume \"cni-net-dir\" (Uni>
Aug 30 22:28:34 rocky92 kubelet[14666]: I0830 22:28:34.813123 14666 reconciler_common.go:172] "operationExecutor.UnmountVolume started for volume \"cnibin\" (UniqueNa>
Aug 30 22:28:34 rocky92 kubelet[14666]: I0830 22:28:34.813300 14666 reconciler_common.go:172] "operationExecutor.UnmountVolume started for volume \"kube-api-access-np>
Aug 30 22:28:34 rocky92 kubelet[14666]: I0830 22:28:34.813382 14666 reconciler_common.go:172] "operationExecutor.UnmountVolume started for volume \"cron-scheduler-con>
Aug 30 22:28:34 rocky92 kubelet[14666]: I0830 22:28:34.813599 14666 reconciler_common.go:258] "operationExecutor.VerifyControllerAttachedVolume started for volume \"c>
Aug 30 22:28:34 rocky92 kubelet[14666]: I0830 22:28:34.814307 14666 reconciler_common.go:258] "operationExecutor.VerifyControllerAttachedVolume started for volume \"c>
Aug 30 22:28:34 rocky92 kubelet[14666]: I0830 22:28:34.813075 14666 operation_generator.go:878] UnmountVolume.TearDown succeeded for volume "kubernetes.io/host-path/3>
Aug 30 22:28:34 rocky92 kubelet[14666]: I0830 22:28:34.813240 14666 operation_generator.go:878] UnmountVolume.TearDown succeeded for volume "kubernetes.io/host-path/3>
Aug 30 22:28:34 rocky92 kubelet[14666]: I0830 22:28:34.814223 14666 operation_generator.go:878] UnmountVolume.TearDown succeeded for volume "kubernetes.io/configmap/3>
Aug 30 22:28:34 rocky92 kubelet[14666]: I0830 22:28:34.814575 14666 reconciler_common.go:258] "operationExecutor.VerifyControllerAttachedVolume started for volume \"k>
Aug 30 22:28:34 rocky92 kubelet[14666]: I0830 22:28:34.814784 14666 reconciler_common.go:258] "operationExecutor.VerifyControllerAttachedVolume started for volume \"c>
Aug 30 22:28:34 rocky92 kubelet[14666]: I0830 22:28:34.814843 14666 reconciler_common.go:300] "Volume detached for volume \"cni-net-dir\" (UniqueName: \"kubernetes.io>
Aug 30 22:28:34 rocky92 kubelet[14666]: I0830 22:28:34.814883 14666 reconciler_common.go:300] "Volume detached for volume \"cnibin\" (UniqueName: \"kubernetes.io/host>
Aug 30 22:28:34 rocky92 kubelet[14666]: I0830 22:28:34.814920 14666 reconciler_common.go:300] "Volume detached for volume \"cron-scheduler-configmap\" (UniqueName: \">
Aug 30 22:28:34 rocky92 kubelet[14666]: I0830 22:28:34.817230 14666 operation_generator.go:878] UnmountVolume.TearDown succeeded for volume "kubernetes.io/projected/3>
Aug 30 22:28:34 rocky92 kubelet[14666]: I0830 22:28:34.916371 14666 reconciler_common.go:300] "Volume detached for volume \"kube-api-access-npb7f\" (UniqueName: \"kub>
Aug 30 22:28:35 rocky92 kubelet[14666]: I0830 22:28:35.112691 14666 scope.go:117] "RemoveContainer" containerID="5f788d257595c44574eaccc13c4d0bc2e56acac1a9e590c706bbe>
Aug 30 22:28:35 rocky92 kubelet[14666]: I0830 22:28:35.138837 14666 scope.go:117] "RemoveContainer" containerID="5f788d257595c44574eaccc13c4d0bc2e56acac1a9e590c706bbe>
Aug 30 22:28:35 rocky92 kubelet[14666]: E0830 22:28:35.141829 14666 remote_runtime.go:432] "ContainerStatus from runtime service failed" err="rpc error: code = Unknow>
Aug 30 22:28:35 rocky92 kubelet[14666]: I0830 22:28:35.141892 14666 pod_container_deletor.go:53] "DeleteContainer returned error" containerID={"Type":"docker","ID":"5>
Aug 30 22:28:35 rocky92 kubelet[14666]: I0830 22:28:35.299654 14666 pod_container_deletor.go:80] "Container not found in pod's containers" containerID="0495d7076e9da7>
Aug 30 22:28:35 rocky92 kubelet[14666]: I0830 22:28:35.449034 14666 kubelet_volumes.go:161] "Cleaned up orphaned pod volumes dir" podUID="3d9353dc-e86c-4387-9e8a-caf3>
Aug 30 22:28:36 rocky92 kubelet[14666]: I0830 22:28:36.337505 14666 pod_startup_latency_tracker.go:102] "Observed pod startup duration" pod="kube-system/whereabouts-r>
Aug 30 22:28:42 rocky92 kubelet[14666]: I0830 22:28:42.739108 14666 state_mem.go:80] "Updated desired CPUSet" podUID="12baf6b6-6cfc-4dc3-ba64-c287d33e3294" containerN>
Aug 31 03:00:14 rocky92 kubelet[14666]: E0831 03:00:14.349582 14666 controller.go:193] "Failed to update lease" err="etcdserver: request timed out"
Aug 31 03:00:18 rocky92 kubelet[14666]: E0831 03:00:18.508101 14666 event.go:280] Server rejected event '&v1.Event{TypeMeta:v1.TypeMeta{Kind:"", APIVersion:""}, Objec>
Aug 31 03:00:21 rocky92 kubelet[14666]: E0831 03:00:21.354419 14666 controller.go:193] "Failed to update lease" err="etcdserver: request timed out"
Aug 31 03:00:21 rocky92 kubelet[14666]: E0831 03:00:21.631133 14666 token_manager.go:121] "Couldn't update token" err="etcdserver: request timed out" cacheKey="\"defa>
Aug 31 03:00:25 rocky92 kubelet[14666]: E0831 03:00:25.515955 14666 event.go:280] Server rejected event '&v1.Event{TypeMeta:v1.TypeMeta{Kind:"", APIVersion:""}, Objec>
Aug 31 03:00:26 rocky92 kubelet[14666]: E0831 03:00:26.003222 14666 controller.go:193] "Failed to update lease" err="Operation cannot be fulfilled on leases.coordinat>
Aug 31 03:00:27 rocky92 kubelet[14666]: I0831 03:00:27.086918 14666 scope.go:117] "RemoveContainer" containerID="d49259eb6273b693e2445584d6366e681ff4fa71b9d698e953629>
Aug 31 03:00:27 rocky92 kubelet[14666]: I0831 03:00:27.087361 14666 scope.go:117] "RemoveContainer" containerID="11113dfa3d82843fa81412653ab90875244411809b93dd12ca8ab>
Aug 31 03:00:27 rocky92 kubelet[14666]: I0831 03:00:27.111826 14666 scope.go:117] "RemoveContainer" containerID="13bf60b771ba025b9721cfbd450edeaefdb21bd5e920d0819c882>
Aug 31 03:00:27 rocky92 kubelet[14666]: I0831 03:00:27.130464 14666 scope.go:117] "RemoveContainer" containerID="d712e52ab0df45507a44d76180eae4cdb0b7dc7c03feb9b914aea>
[root@rocky92 ~]#
Metadata
Metadata
Assignees
Labels
No labels