Skip to content

Commit 9bb7c9c

Browse files
committed
Create NFD rule from the AIC controller
Create NFD rule from the AIC controller, to cut down a manual pre-requisite step. Update Qualcomm license markings. Change-Id: Idf89460c7dde6320301e216a1f850540f72338ec
1 parent b918019 commit 9bb7c9c

File tree

9 files changed

+208
-16
lines changed

9 files changed

+208
-16
lines changed

Dockerfile

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@ COPY cmd/main.go cmd/main.go
1515
COPY api/ api/
1616
COPY internal/controller/ internal/controller/
1717
COPY internal/kmmmodule/ internal/kmmmodule/
18+
COPY internal/nfdrule/ internal/nfdrule/
1819

1920
# Build
2021
# the GOARCH has not a default value to allow the binary be built according to the host where the command
@@ -28,6 +29,8 @@ ARG VERSION=none
2829
RUN microdnf -y update
2930
WORKDIR /
3031
COPY --from=builder /opt/app-root/src/manager .
32+
RUN mkdir -p /opt/aic-manifests
33+
COPY --from=builder /opt/app-root/src/internal/nfdrule/qcom-aic-nfr.yaml /opt/aic-manifests/
3134
LABEL name="cloud_ai_openshift_operator" \
3235
maintainer="Qualcomm Innovation Center, Inc." \
3336
vendor="Qualcomm Innovation Center, Inc." \

README.md

Lines changed: 2 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -8,14 +8,9 @@ This Operator relies on the Node Feature Discovery (NFD) and Kernel Module Manag
88
(KMM) Operators. Be sure to install them from the OperatorHub (provided by Red Hat, not
99
the Community).
1010

11-
NFD and KMM both require addition configuration after they're installed. NFD requires the
12-
CRD located in [./manual_install/qcom-aic-rule-nfd.yaml](./manual_install/qcom-aic-rule-nfd.yaml) to be added to the cluster.
11+
The NFD Operator needs a default NodeFeatureDiscovery Custom Resource (CR) to be created after it is installed.
1312

14-
```sh
15-
oc apply -f ./manual_install/qcom-aic-rule-nfd.yaml
16-
```
17-
18-
KMM also requires configuration so that firmware can be located correctly. The following
13+
KMM requires configuration so that firmware can be located correctly. The following
1914
command should work for most clusters, but make sure to check that the
2015
'controller_config.yaml' section matches the existing configuration (note: ordering of the
2116
elements shouldn't matter (so long as they're under the correct heading (e.g. 'webhook',

aic-operator-deploy.yaml

Lines changed: 21 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -295,6 +295,26 @@ rules:
295295
- get
296296
- patch
297297
- update
298+
- apiGroups:
299+
- nfd.openshift.io
300+
resources:
301+
- nodefeaturerules
302+
verbs:
303+
- create
304+
- delete
305+
- get
306+
- list
307+
- patch
308+
- update
309+
- watch
310+
- apiGroups:
311+
- nfd.openshift.io
312+
resources:
313+
- nodefeaturerules/status
314+
verbs:
315+
- get
316+
- patch
317+
- update
298318
- apiGroups:
299319
- rbac.authorization.k8s.io
300320
resources:
@@ -519,7 +539,7 @@ spec:
519539
- --leader-elect
520540
command:
521541
- /manager
522-
image: ##Replace with aic-operator image URL
542+
image: ghcr.io/quic/cloud_ai_openshift_operator:0.1.1 ##Replace the image tag to use a different version of Operator
523543
imagePullPolicy: Always
524544
livenessProbe:
525545
httpGet:

cmd/main.go

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,7 @@ limitations under the License.
1515
1616
Based on code from https://github.com/yevgeny-shnaidman/amd-gpu-operator
1717
18-
Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved.
18+
Copyright (c) 2024-2025 Qualcomm Innovation Center, Inc. All rights reserved.
1919
SPDX-License-Identifier: BSD-3-Clause-Clear
2020
Not a contribution.
2121
*/
@@ -43,7 +43,9 @@ import (
4343
aicv1 "github.com/quic/aic-operator/api/v1"
4444
"github.com/quic/aic-operator/internal/controller"
4545
"github.com/quic/aic-operator/internal/kmmmodule"
46+
"github.com/quic/aic-operator/internal/nfdrule"
4647
kmmv1beta1 "github.com/rh-ecosystem-edge/kernel-module-management/api/v1beta1"
48+
nfr "github.com/openshift/cluster-nfd-operator/api/v1alpha1"
4749
//+kubebuilder:scaffold:imports
4850
)
4951

@@ -56,6 +58,7 @@ func init() {
5658
utilruntime.Must(clientgoscheme.AddToScheme(scheme))
5759
utilruntime.Must(aicv1.AddToScheme(scheme))
5860
utilruntime.Must(kmmv1beta1.AddToScheme(scheme))
61+
utilruntime.Must(nfr.AddToScheme(scheme))
5962
//+kubebuilder:scaffold:scheme
6063
}
6164

@@ -132,10 +135,11 @@ func main() {
132135

133136
client := mgr.GetClient()
134137
kmmHandler := kmmmodule.NewKMMModule(client, scheme)
138+
nfdHandler := nfdrule.NewNFDRule(client, scheme)
135139
aicr := controller.NewAICReconciler(
136140
client,
137141
mgr.GetScheme(),
138-
kmmHandler)
142+
kmmHandler, nfdHandler)
139143

140144
if err = aicr.SetupWithManager(mgr); err != nil {
141145
setupLog.Error(err, "unable to create controller", "controller", "AIC")

go.mod

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@ require (
88
github.com/a8m/envsubst v1.4.2
99
github.com/onsi/ginkgo/v2 v2.17.1
1010
github.com/onsi/gomega v1.32.0
11+
github.com/openshift/cluster-nfd-operator v0.0.0-20250423093037-ad5de3ee96a7
1112
github.com/rh-ecosystem-edge/kernel-module-management v0.0.0-20240412150217-30de2c7d4ede
1213
k8s.io/api v0.29.3
1314
k8s.io/apimachinery v0.29.3

go.sum

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -82,6 +82,8 @@ github.com/onsi/ginkgo/v2 v2.17.1 h1:V++EzdbhI4ZV4ev0UTIj0PzhzOcReJFyJaLjtSF55M8
8282
github.com/onsi/ginkgo/v2 v2.17.1/go.mod h1:llBI3WDLL9Z6taip6f33H76YcWtJv+7R3HigUjbIBOs=
8383
github.com/onsi/gomega v1.32.0 h1:JRYU78fJ1LPxlckP6Txi/EYqJvjtMrDC04/MM5XRHPk=
8484
github.com/onsi/gomega v1.32.0/go.mod h1:a4x4gW6Pz2yK1MAmvluYme5lvYTn61afQ2ETw/8n4Lg=
85+
github.com/openshift/cluster-nfd-operator v0.0.0-20250423093037-ad5de3ee96a7 h1:ljGD4OXMcSGzksUQ4WHjpM9IZ48AhwucvrtZiLUVHmk=
86+
github.com/openshift/cluster-nfd-operator v0.0.0-20250423093037-ad5de3ee96a7/go.mod h1:oXNRM0qmbuFinGmQpC8xh0eLIMx8Z/1R9d/5HPjVCRM=
8587
github.com/pkg/errors v0.9.1 h1:FEBLx1zS214owpjy7qsBeixbURkuhQAwrK5UwLGTwt4=
8688
github.com/pkg/errors v0.9.1/go.mod h1:bwawxfHBFNV+L2hUp1rHADufV3IMtnDRdf1r5NINEl0=
8789
github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM=

internal/controller/aic_controller.go

Lines changed: 58 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,7 @@ limitations under the License.
1515
1616
Based on code from https://github.com/yevgeny-shnaidman/amd-gpu-operator
1717
18-
Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved.
18+
Copyright (c) 2024-2025 Qualcomm Innovation Center, Inc. All rights reserved.
1919
SPDX-License-Identifier: BSD-3-Clause-Clear
2020
Not a contribution.
2121
*/
@@ -28,6 +28,7 @@ import (
2828
"fmt"
2929

3030
kmmv1beta1 "github.com/rh-ecosystem-edge/kernel-module-management/api/v1beta1"
31+
nfr "github.com/openshift/cluster-nfd-operator/api/v1alpha1"
3132
v1 "k8s.io/api/core/v1"
3233
k8serrors "k8s.io/apimachinery/pkg/api/errors"
3334
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
@@ -40,6 +41,7 @@ import (
4041

4142
aicv1 "github.com/quic/aic-operator/api/v1"
4243
"github.com/quic/aic-operator/internal/kmmmodule"
44+
"github.com/quic/aic-operator/internal/nfdrule"
4345
)
4446

4547
// AICReconciler reconciles an AIC object
@@ -54,6 +56,8 @@ type AICReconciler struct {
5456
//+kubebuilder:rbac:groups=aic.quicinc.com,resources=aics/finalizers,verbs=update
5557
//+kubebuilder:rbac:groups=kmm.sigs.x-k8s.io,resources=modules,verbs=get;list;watch;create;patch;update;delete
5658
//+kubebuilder:rbac:groups=kmm.sigs.x-k8s.io,resources=modules/status,verbs=get;update;patch
59+
//+kubebuilder:rbac:groups=nfd.openshift.io,resources=nodefeaturerules,verbs=get;list;watch;create;update;patch;delete
60+
//+kubebuilder:rbac:groups=nfd.openshift.io,resources=nodefeaturerules/status,verbs=get;update;patch
5761
//+kubebuilder:rbac:groups=rbac.authorization.k8s.io,resources=clusterroles;clusterrolebindings;roles;rolebindings,verbs=get;list;watch;create;update;patch;delete
5862
//+kubebuilder:rbac:groups=core,resources=namespaces;serviceaccounts;pods;pods/exec;pods/attach;services;services/finalizers;endpoints,verbs=get;list;watch;create;update;patch;delete
5963
//+kubebuilder:rbac:groups=core,resources=configmaps;secrets;nodes,verbs=get;list;watch;create;update;patch;delete
@@ -63,8 +67,9 @@ type AICReconciler struct {
6367
func NewAICReconciler(
6468
client client.Client,
6569
scheme *runtime.Scheme,
66-
kmmHandler kmmmodule.KMMModuleAPI) *AICReconciler {
67-
helper := newAICReconcilerHelper(client, kmmHandler)
70+
kmmHandler kmmmodule.KMMModuleAPI,
71+
nfdHandler nfdrule.NFDRuleAPI) *AICReconciler {
72+
helper := newAICReconcilerHelper(client, kmmHandler, nfdHandler)
6873
return &AICReconciler{
6974
Client: client,
7075
Scheme: scheme,
@@ -114,6 +119,12 @@ func (r *AICReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.R
114119
return res, fmt.Errorf("failed to set finalizer for AIC %s: %v", req.NamespacedName, err)
115120
}
116121

122+
logger.Info("start NFR reconciliation")
123+
err = r.helper.handleAICNFDRule(ctx, aic)
124+
if err != nil {
125+
return res, fmt.Errorf("failed to handle NFR creation %s: %v", req.NamespacedName, err)
126+
}
127+
117128
logger.Info("start build configmap reconciliation")
118129
// Always want to have the ConfigMap that can build module images
119130
err = r.helper.handleBuildConfigMap(ctx, aic, false)
@@ -156,18 +167,21 @@ type AICReconcilerHelperAPI interface {
156167
setFinalizer(ctx context.Context, aic *aicv1.AIC) error
157168
handleBuildConfigMap(ctx context.Context, devConfig *aicv1.AIC, useInTree bool) error
158169
handleKMMModule(ctx context.Context, devConfig *aicv1.AIC, loadedMods aicv1.LoadedModules) error
170+
handleAICNFDRule(ctx context.Context, aic *aicv1.AIC) error
159171
}
160172

161173
type AICReconcilerHelper struct {
162174
client client.Client
163175
kmmHandler kmmmodule.KMMModuleAPI
176+
nfdHandler nfdrule.NFDRuleAPI
164177
}
165178

166179
func newAICReconcilerHelper(client client.Client,
167-
kmmHandler kmmmodule.KMMModuleAPI) AICReconcilerHelperAPI {
180+
kmmHandler kmmmodule.KMMModuleAPI, nfdHandler nfdrule.NFDRuleAPI) AICReconcilerHelperAPI {
168181
return &AICReconcilerHelper{
169182
client: client,
170183
kmmHandler: kmmHandler,
184+
nfdHandler: nfdHandler,
171185
}
172186
}
173187
func (aicrh *AICReconcilerHelper) getRequestedAIC(ctx context.Context, namespacedName types.NamespacedName) (*aicv1.AIC, error) {
@@ -215,12 +229,31 @@ func (aicrh *AICReconcilerHelper) finalizeAIC(ctx context.Context, aic *aicv1.AI
215229
faults = append(faults, err)
216230
}
217231
}
232+
//Delete NFR owned by AIC.
233+
nfrObj := nfr.NodeFeatureRule{}
234+
nsName := types.NamespacedName{
235+
Namespace: aic.Namespace,
236+
Name: "qcom-aic-nfr",
237+
}
238+
err = aicrh.client.Get(ctx, nsName, &nfrObj)
239+
if err != nil {
240+
if k8serrors.IsNotFound(err) {
241+
deleted = append(deleted, err)
242+
} else {
243+
faults = append(faults, fmt.Errorf("failed to get the requested NFR %s: %w", nsName, err))
244+
}
245+
} else {
246+
logger.Info("deleting NFR CR", "NFR", nsName)
247+
if err = aicrh.client.Delete(ctx, &nfrObj); client.IgnoreNotFound(err) != nil {
248+
faults = append(faults, err)
249+
}
250+
}
218251

219252
err = errors.Join(faults...)
220253

221254
//remove finalizer only if no faults occurred during removal
222-
if len(deleted) == int(aicv1.None_loaded)+1 && err == nil {
223-
logger.Info("modules already deleted, removing finalizer", "module", aic.Name)
255+
if len(deleted) == int(aicv1.None_loaded)+2 && err == nil {
256+
logger.Info("modules & NFR already deleted, removing finalizer", "module, NFR", aic.Name)
224257
aicCopy := aic.DeepCopy()
225258
controllerutil.RemoveFinalizer(aic, aicFinalizer)
226259
return aicrh.client.Patch(ctx, aic, client.MergeFrom(aicCopy))
@@ -271,6 +304,25 @@ func (aicrh *AICReconcilerHelper) handleKMMModule(ctx context.Context, aic *aicv
271304
return err
272305
}
273306

307+
func (aicrh *AICReconcilerHelper) handleAICNFDRule(ctx context.Context, aic *aicv1.AIC) error {
308+
309+
nfrObj := &nfr.NodeFeatureRule{
310+
ObjectMeta: metav1.ObjectMeta{
311+
Namespace: aic.Namespace,
312+
Name: "qcom-aic-nfr",
313+
},
314+
}
315+
logger := log.FromContext(ctx)
316+
opRes, err := controllerutil.CreateOrPatch(ctx, aicrh.client, nfrObj, func() error {
317+
return aicrh.nfdHandler.SetNFRasDesired(nfrObj, aic)
318+
})
319+
if err != nil {
320+
logger.Info("Reconciled NFR", "name", nfrObj.Name, "result", opRes)
321+
return err
322+
}
323+
return nil
324+
325+
}
274326
func getDockerfileCMName(aic *aicv1.AIC, useInTree bool) string {
275327
if useInTree {
276328
return "dockerfile-intree-" + aic.Name

internal/nfdrule/qcom-aic-nfr.go

Lines changed: 68 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,68 @@
1+
/*
2+
Copyright (c) 2025 Qualcomm Innovation Center, Inc. All rights reserved.
3+
SPDX-License-Identifier: BSD-3-Clause-Clear.
4+
*/
5+
6+
package nfdrule
7+
8+
import (
9+
_ "embed"
10+
"fmt"
11+
"io/ioutil"
12+
"regexp"
13+
"strings"
14+
15+
"k8s.io/apimachinery/pkg/runtime"
16+
"sigs.k8s.io/controller-runtime/pkg/client"
17+
"k8s.io/client-go/kubernetes/scheme"
18+
"k8s.io/apimachinery/pkg/runtime/serializer/json"
19+
"sigs.k8s.io/controller-runtime/pkg/controller/controllerutil"
20+
21+
aicv1 "github.com/quic/aic-operator/api/v1"
22+
nfr "github.com/openshift/cluster-nfd-operator/api/v1alpha1"
23+
)
24+
25+
// qcom_aic_nfrmanifest is the absolute path from which the NodeFeatureRule
// manifest is read at runtime.
const qcom_aic_nfrmanifest = "/opt/aic-manifests/qcom-aic-nfr.yaml"
28+
29+
type NFDRuleAPI interface {
30+
SetNFRasDesired(nfrObj *nfr.NodeFeatureRule, aic *aicv1.AIC) error
31+
}
32+
33+
type nfdRule struct {
34+
client client.Client
35+
scheme *runtime.Scheme
36+
}
37+
38+
func NewNFDRule(client client.Client, scheme *runtime.Scheme) NFDRuleAPI {
39+
return &nfdRule{
40+
client: client,
41+
scheme: scheme,
42+
}
43+
}
44+
45+
func (nfrstruct *nfdRule) SetNFRasDesired(nfrObj *nfr.NodeFeatureRule, aic *aicv1.AIC) error {
46+
err := parseNFR_Manifest(nfrObj, aic)
47+
if err != nil {
48+
return fmt.Errorf("failed to set NodeFeatureRule: %v", err)
49+
}
50+
return controllerutil.SetControllerReference(aic, nfrObj, nfrstruct.scheme)
51+
}
52+
53+
func parseNFR_Manifest(nfrObj *nfr.NodeFeatureRule, aic *aicv1.AIC) error {
54+
raw_manifest, err := ioutil.ReadFile(qcom_aic_nfrmanifest)
55+
if err != nil {
56+
return fmt.Errorf("Error encountered while reading NFR manifest %s : %v", qcom_aic_nfrmanifest, err)
57+
}
58+
s := json.NewYAMLSerializer(json.DefaultMetaFactory, scheme.Scheme, scheme.Scheme)
59+
regex, _ := regexp.Compile(`\b(\w*kind:\w*)\B.*\b`)
60+
kind := strings.TrimSpace(strings.Split(regex.FindString(string(raw_manifest)), ":")[1])
61+
fmt.Println("Resource identified kind:", kind)
62+
_, _, err = s.Decode(raw_manifest, nil, nfrObj)
63+
if err != nil {
64+
return fmt.Errorf("Error encountered while decoding %s resource in manifest %s: %v", "NodeFeatureRule", qcom_aic_nfrmanifest, err)
65+
}
66+
nfrObj.ObjectMeta.Namespace = aic.Namespace
67+
return nil
68+
}

internal/nfdrule/qcom-aic-nfr.yaml

Lines changed: 47 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,47 @@
1+
apiVersion: nfd.openshift.io/v1alpha1
2+
kind: NodeFeatureRule
3+
metadata:
4+
name: qcom-aic-nfr
5+
namespace: aic-operator-system
6+
spec:
7+
rules:
8+
- name: "node.has_qaic_hw"
9+
labels:
10+
"qualcomm.com/qaic": "true"
11+
matchFeatures:
12+
- feature: pci.device
13+
matchExpressions:
14+
class: {op: In, value: ["1200", "ff00"]}
15+
vendor: {op: In, value: ["17cb"]}
16+
device: {op: In, value: ["a100"]}
17+
- name: "kernel.mhi_avail"
18+
labels:
19+
"qualcomm.com/mhi_in_kernel": "true"
20+
matchFeatures:
21+
- feature: rule.matched
22+
matchExpressions:
23+
"qualcomm.com/qaic": {op: IsTrue}
24+
- feature: kernel.config
25+
matchExpressions:
26+
MHI_BUS: {op: In, value: ["y","m"]}
27+
- name: "kernel.qaic_avail"
28+
labels:
29+
"qualcomm.com/qaic_in_kernel": "true"
30+
matchFeatures:
31+
- feature: rule.matched
32+
matchExpressions:
33+
"qualcomm.com/qaic": {op: IsTrue}
34+
- feature: kernel.config
35+
matchExpressions:
36+
DRM_ACCEL_QAIC: {op: In, value: ["y","m"]}
37+
- name: "kernel.not_mhi_qaic"
38+
labels:
39+
"qualcomm.com/not_mhi_qaic": "true"
40+
matchFeatures:
41+
- feature: rule.matched
42+
matchExpressions:
43+
"qualcomm.com/qaic": {op: IsTrue}
44+
- feature: kernel.loadedmodule
45+
matchExpressions:
46+
qaic: {op: DoesNotExist}
47+
mhi: {op: DoesNotExist}

0 commit comments

Comments
 (0)