diff --git a/go.mod b/go.mod index 3e3b1e91e..b36707dd0 100644 --- a/go.mod +++ b/go.mod @@ -3,7 +3,7 @@ module github.com/NVIDIA/nvidia-container-toolkit go 1.25.0 require ( - github.com/NVIDIA/go-nvlib v0.8.1 + github.com/NVIDIA/go-nvlib v0.8.2-0.20251202135446-d0f42ba016dd github.com/NVIDIA/go-nvml v0.13.0-1 github.com/google/uuid v1.6.0 github.com/moby/sys/mountinfo v0.7.2 diff --git a/go.sum b/go.sum index dc2afbda1..041498a0a 100644 --- a/go.sum +++ b/go.sum @@ -1,7 +1,7 @@ cyphar.com/go-pathrs v0.2.1 h1:9nx1vOgwVvX1mNBWDu93+vaceedpbsDqo+XuBGL40b8= cyphar.com/go-pathrs v0.2.1/go.mod h1:y8f1EMG7r+hCuFf/rXsKqMJrJAUoADZGNh5/vZPKcGc= -github.com/NVIDIA/go-nvlib v0.8.1 h1:OPEHVvn3zcV5OXB68A7WRpeCnYMRSPl7LdeJH/d3gZI= -github.com/NVIDIA/go-nvlib v0.8.1/go.mod h1:7mzx9FSdO9fXWP9NKuZmWkCwhkEcSWQFe2tmFwtLb9c= +github.com/NVIDIA/go-nvlib v0.8.2-0.20251202135446-d0f42ba016dd h1:9NpcQGNvt/djzbX1uwIed1u2xd1ssiYlHNrt5Av19rQ= +github.com/NVIDIA/go-nvlib v0.8.2-0.20251202135446-d0f42ba016dd/go.mod h1:7mzx9FSdO9fXWP9NKuZmWkCwhkEcSWQFe2tmFwtLb9c= github.com/NVIDIA/go-nvml v0.13.0-1 h1:OLX8Jq3dONuPOQPC7rndB6+iDmDakw0XTYgzMxObkEw= github.com/NVIDIA/go-nvml v0.13.0-1/go.mod h1:+KNA7c7gIBH7SKSJ1ntlwkfN80zdx8ovl4hrK3LmPt4= github.com/blang/semver/v4 v4.0.0 h1:1PFHFE6yCCTv8C1TeyNNarDzntLi7wMI5i/pzqYIsAM= diff --git a/internal/info/auto_test.go b/internal/info/auto_test.go index f6d99c7e5..9301514d6 100644 --- a/internal/info/auto_test.go +++ b/internal/info/auto_test.go @@ -215,13 +215,10 @@ func TestResolveAutoMode(t *testing.T) { HasDXCoreFunc: func() (bool, string) { return tc.info["dxcore"], "dxcore" }, - IsTegraSystemFunc: func() (bool, string) { - return tc.info["tegra"], "tegra" - }, HasTegraFilesFunc: func() (bool, string) { return tc.info["tegra"], "tegra" }, - HasOnlyIntegratedGPUsFunc: func() (bool, string) { + HasAnIntegratedGPUFunc: func() (bool, string) { return tc.info["nvgpu"], "nvgpu" }, } diff --git a/internal/platform-support/tegra/csv.go 
b/internal/platform-support/tegra/csv.go index edb7fdc48..b3076e403 100644 --- a/internal/platform-support/tegra/csv.go +++ b/internal/platform-support/tegra/csv.go @@ -17,103 +17,38 @@ package tegra import ( - "fmt" - - "github.com/NVIDIA/nvidia-container-toolkit/internal/discover" "github.com/NVIDIA/nvidia-container-toolkit/internal/logger" - "github.com/NVIDIA/nvidia-container-toolkit/internal/lookup" "github.com/NVIDIA/nvidia-container-toolkit/internal/platform-support/tegra/csv" ) -// newDiscovererFromCSVFiles creates a discoverer for the specified CSV files. A logger is also supplied. -// The constructed discoverer is comprised of a list, with each element in the list being associated with a -// single CSV files. -func (o tegraOptions) newDiscovererFromCSVFiles() (discover.Discover, error) { - if len(o.csvFiles) == 0 { - o.logger.Warningf("No CSV files specified") - return discover.None{}, nil - } - - targetsByType := getTargetsFromCSVFiles(o.logger, o.csvFiles) - - devices := discover.NewCharDeviceDiscoverer( - o.logger, - o.devRoot, - targetsByType[csv.MountSpecDev], - ) - - directories := discover.NewMounts( - o.logger, - lookup.NewDirectoryLocator(lookup.WithLogger(o.logger), lookup.WithRoot(o.driverRoot)), - o.driverRoot, - targetsByType[csv.MountSpecDir], - ) - - // We create a discoverer for mounted libraries and add additional .so - // symlinks for the driver. - libraries := discover.WithDriverDotSoSymlinks( - o.logger, - discover.NewMounts( - o.logger, - o.symlinkLocator, - o.driverRoot, - targetsByType[csv.MountSpecLib], - ), - "", - o.hookCreator, - ) - - // We process the explicitly requested symlinks. - symlinkTargets := o.ignorePatterns.Apply(targetsByType[csv.MountSpecSym]...) 
- o.logger.Debugf("Filtered symlink targets: %v", symlinkTargets) - symlinks := discover.NewMounts( - o.logger, - o.symlinkLocator, - o.driverRoot, - symlinkTargets, - ) - createSymlinks := o.createCSVSymlinkHooks(symlinkTargets) +// MountSpecsFromCSVFiles returns a MountSpecPathsByTyper for the specified list +// of CSV files. +func MountSpecsFromCSVFiles(logger logger.Interface, csvFiles ...string) MountSpecPathsByTyper { + var tts []MountSpecPathsByTyper - d := discover.Merge( - devices, - directories, - libraries, - symlinks, - createSymlinks, - ) - - return d, nil + for _, filename := range csvFiles { + tts = append(tts, &fromCSVFile{logger, filename}) + } + return Merge(tts...) } -// getTargetsFromCSVFiles returns the list of mount specs from the specified CSV files. -// These are aggregated by mount spec type. -// TODO: We use a function variable here to allow this to be overridden for testing. -// This should be properly mocked. -var getTargetsFromCSVFiles = func(logger logger.Interface, files []string) map[csv.MountSpecType][]string { - targetsByType := make(map[csv.MountSpecType][]string) - for _, filename := range files { - targets, err := loadCSVFile(logger, filename) - if err != nil { - logger.Warningf("Skipping CSV file %v: %v", filename, err) - continue - } - for _, t := range targets { - targetsByType[t.Type] = append(targetsByType[t.Type], t.Path) - } - } - return targetsByType +type fromCSVFile struct { + logger logger.Interface + filename string } -// loadCSVFile loads the specified CSV file and returns the list of mount specs -func loadCSVFile(logger logger.Interface, filename string) ([]*csv.MountSpec, error) { +// MountSpecPathsByType returns mountspecs defined in the specified CSV file. 
+func (t *fromCSVFile) MountSpecPathsByType() MountSpecPathsByType { // Create a discoverer for each file-kind combination - targets, err := csv.NewCSVFileParser(logger, filename).Parse() + targets, err := csv.NewCSVFileParser(t.logger, t.filename).Parse() if err != nil { - return nil, fmt.Errorf("failed to parse CSV file: %v", err) - } - if len(targets) == 0 { - return nil, fmt.Errorf("CSV file is empty") + t.logger.Warningf("failed to parse CSV file %v: %v", t.filename, err) + return nil } - return targets, nil + targetsByType := make(MountSpecPathsByType) + for _, t := range targets { + targetsByType[t.Type] = append(targetsByType[t.Type], t.Path) + } + return targetsByType } diff --git a/internal/platform-support/tegra/csv_test.go b/internal/platform-support/tegra/csv_test.go index 1fcda971b..1e22655e5 100644 --- a/internal/platform-support/tegra/csv_test.go +++ b/internal/platform-support/tegra/csv_test.go @@ -24,7 +24,6 @@ import ( "github.com/stretchr/testify/require" "github.com/NVIDIA/nvidia-container-toolkit/internal/discover" - "github.com/NVIDIA/nvidia-container-toolkit/internal/logger" "github.com/NVIDIA/nvidia-container-toolkit/internal/lookup" "github.com/NVIDIA/nvidia-container-toolkit/internal/platform-support/tegra/csv" @@ -34,7 +33,7 @@ func TestDiscovererFromCSVFiles(t *testing.T) { logger, _ := testlog.NewNullLogger() testCases := []struct { description string - moutSpecs map[csv.MountSpecType][]string + moutSpecs MountSpecPathsByType ignorePatterns []string symlinkLocator lookup.Locator symlinkChainLocator lookup.Locator @@ -186,19 +185,20 @@ func TestDiscovererFromCSVFiles(t *testing.T) { hookCreator := discover.NewHookCreator() for _, tc := range testCases { t.Run(tc.description, func(t *testing.T) { - defer setGetTargetsFromCSVFiles(tc.moutSpecs)() - - o := tegraOptions{ + o := options{ logger: logger, hookCreator: hookCreator, - csvFiles: []string{"dummy"}, - ignorePatterns: tc.ignorePatterns, symlinkLocator: tc.symlinkLocator, 
symlinkChainLocator: tc.symlinkChainLocator, resolveSymlink: tc.symlinkResolver, + + MountSpecPathsByTyper: Transform( + AsIgnorePatternsByType(Symlinks(tc.ignorePatterns...)), + tc.moutSpecs, + ), } - d, err := o.newDiscovererFromCSVFiles() + d, err := o.newDiscovererFromMountSpecs(o.MountSpecPathsByType()) require.ErrorIs(t, err, tc.expectedError) hooks, err := d.Hooks() @@ -212,14 +212,3 @@ func TestDiscovererFromCSVFiles(t *testing.T) { }) } } - -func setGetTargetsFromCSVFiles(override map[csv.MountSpecType][]string) func() { - original := getTargetsFromCSVFiles - getTargetsFromCSVFiles = func(logger logger.Interface, files []string) map[csv.MountSpecType][]string { - return override - } - - return func() { - getTargetsFromCSVFiles = original - } -} diff --git a/internal/platform-support/tegra/filter.go b/internal/platform-support/tegra/filter.go index 03b18bf74..d53d2602a 100644 --- a/internal/platform-support/tegra/filter.go +++ b/internal/platform-support/tegra/filter.go @@ -19,31 +19,92 @@ package tegra import ( "path/filepath" "strings" + + "github.com/NVIDIA/nvidia-container-toolkit/internal/platform-support/tegra/csv" ) -type ignoreMountSpecPatterns []string +// A filter removes elements from an input list and returns the remaining +// elements. +type filter interface { + apply(...string) []string +} + +// A stringMatcher implements the MatchString function. +type stringMatcher interface { + MatchString(string) bool +} + +// A matcherAsFilter is used to ensure that a string matcher can be used as a filter. +type matcherAsFilter struct { + stringMatcher +} -func (d ignoreMountSpecPatterns) Match(name string) bool { +type filterByMountSpecType map[csv.MountSpecType]filter + +type pathPatterns []string +type pathPattern string +type basenamePattern string + +// MatchString for a set of path patterns returns true if any of the patterns +// matches against the input string. 
+func (d pathPatterns) MatchString(input string) bool { for _, pattern := range d { - target := name - if strings.HasPrefix(pattern, "**/") { - target = filepath.Base(name) - pattern = strings.TrimPrefix(pattern, "**/") - } - if match, _ := filepath.Match(pattern, target); match { + if match := pathPattern(pattern).MatchString(input); match { return true } } return false } -func (d ignoreMountSpecPatterns) Apply(input ...string) []string { +// MatchString attempts to match a path pattern to the specified input string. +// If the pattern starts with `**/` the input is treated as a path and only +// the basenames are matched using regular glob rules. +func (d pathPattern) MatchString(input string) bool { + if strings.HasPrefix(string(d), "**/") { + return basenamePattern(d).MatchString(input) + } + match, _ := filepath.Match(string(d), input) + return match +} + +// MatchString for a basename pattern applies the specified pattern against the +// basename of the input. +// If the pattern starts with **/, this is stripped before attempting to match. +func (d basenamePattern) MatchString(input string) bool { + pattern := strings.TrimPrefix(string(d), "**/") + match, _ := filepath.Match(pattern, filepath.Base(input)) + return match +} + +// Apply the specified per-type filters to the input mount specs. +func (p filterByMountSpecType) Apply(input MountSpecPathsByTyper) MountSpecPathsByTyper { + ms := input.MountSpecPathsByType() + for t, filter := range p { + if len(ms[t]) == 0 { + continue + } + ms[t] = filter.apply(ms[t]...) + } + return ms +} + +// apply uses a matcher to filter an input string. +// Each element in the input that matches is skipped and the remaining elements +// are returned. 
+func (f *matcherAsFilter) apply(input ...string) []string { var filtered []string - for _, name := range input { - if d.Match(name) { + for _, path := range input { + if f.MatchString(path) { continue } - filtered = append(filtered, name) + filtered = append(filtered, path) } return filtered } + +// removeAll is a filter that will not return any inputs. +type removeAll struct{} + +func (a removeAll) apply(...string) []string { + return nil +} diff --git a/internal/platform-support/tegra/filter_test.go b/internal/platform-support/tegra/filter_test.go index a3b1a8f7c..f7aae2798 100644 --- a/internal/platform-support/tegra/filter_test.go +++ b/internal/platform-support/tegra/filter_test.go @@ -25,7 +25,7 @@ import ( func TestIgnorePatterns(t *testing.T) { testCases := []struct { description string - blockedFilter []string + blockedFilter pathPatterns input []string expected []string }{ @@ -50,7 +50,8 @@ func TestIgnorePatterns(t *testing.T) { for _, tc := range testCases { t.Run(tc.description, func(t *testing.T) { - filtered := ignoreMountSpecPatterns(tc.blockedFilter).Apply(tc.input...) + filter := &matcherAsFilter{tc.blockedFilter} + filtered := filter.apply(tc.input...) require.ElementsMatch(t, tc.expected, filtered) }) } diff --git a/internal/platform-support/tegra/mount_specs.go b/internal/platform-support/tegra/mount_specs.go new file mode 100644 index 000000000..ea9b43b53 --- /dev/null +++ b/internal/platform-support/tegra/mount_specs.go @@ -0,0 +1,140 @@ +/** +# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +**/ + +package tegra + +import ( + "regexp" + + "github.com/NVIDIA/nvidia-container-toolkit/internal/platform-support/tegra/csv" +) + +// A MountSpecPathsByTyper provides a function to return mount specs paths by +// mount type. +// The MountSpecTypes are one of: dev, dir, lib, sym and define how these should +// be included in a container (or represented in the associated CDI spec). +type MountSpecPathsByTyper interface { + MountSpecPathsByType() MountSpecPathsByType +} + +type MountSpecPathsByType map[csv.MountSpecType][]string + +var _ MountSpecPathsByTyper = (MountSpecPathsByType)(nil) + +// MountSpecPathsByType for a variable of type MountSpecPathsByType returns the +// underlying data structure. +// This allows for using this type in functions such as Merge and Filter. +func (m MountSpecPathsByType) MountSpecPathsByType() MountSpecPathsByType { + return m +} + +type merge []MountSpecPathsByTyper + +// Merge combines the MountSpecPathsByType for the specified sources. +func Merge(sources ...MountSpecPathsByTyper) MountSpecPathsByTyper { + return merge(sources) +} + +// MountSpecPathsByType for a set of merged mount specs combines the list of +// paths per type. +func (ts merge) MountSpecPathsByType() MountSpecPathsByType { + targetsByType := make(MountSpecPathsByType) + for _, t := range ts { + if t == nil { + continue + } + for tType, targets := range t.MountSpecPathsByType() { + targetsByType[tType] = append(targetsByType[tType], targets...) 
+ } + } + return targetsByType +} + +type Transformer interface { + Apply(MountSpecPathsByTyper) MountSpecPathsByTyper +} + +type transformMountSpecByPathsByType struct { + Transformer + input MountSpecPathsByTyper +} + +// Transform applies the specified transform to a set of mount specs by paths. +func Transform(t Transformer, input MountSpecPathsByTyper) MountSpecPathsByTyper { + return transformMountSpecByPathsByType{ + Transformer: t, + input: input, + } +} + +func (m transformMountSpecByPathsByType) MountSpecPathsByType() MountSpecPathsByType { + return m.Apply(m.input).MountSpecPathsByType() +} + +// AsIgnorePatternsByType uses the paths in the specified mount spec paths by +// mount spec type as patterns to ignore. +func AsIgnorePatternsByType(m MountSpecPathsByTyper) Transformer { + patternsByType := m.MountSpecPathsByType() + + ignorePatterns := make(filterByMountSpecType) + for t, patterns := range patternsByType { + ignorePatterns[t] = &matcherAsFilter{pathPatterns(patterns)} + } + return ignorePatterns +} + +// OnlyDeviceNodes creates a transformer that will remove any input mount specs +// that are not of the `MountSpecDev` type. +func OnlyDeviceNodes() Transformer { + return filterByMountSpecType{ + csv.MountSpecDir: removeAll{}, + csv.MountSpecLib: removeAll{}, + csv.MountSpecSym: removeAll{}, + } +} + +// WithoutDeviceNodes creates a transformer that will remove entries with type +// MountSpecDev from the input. +func WithoutDeviceNodes() Transformer { + return filterByMountSpecType{ + csv.MountSpecDev: removeAll{}, + } +} + +// WithoutRegularDeviceNodes creates a transformer which removes +// regular `/dev/nvidia[0-9]+` device nodes from the source. +func WithoutRegularDeviceNodes() Transformer { + return filterByMountSpecType{ + csv.MountSpecDev: &matcherAsFilter{regexp.MustCompile("^/dev/nvidia[0-9]+$")}, + } +} + +// DeviceNodes creates a set of MountSpecPaths for the specified device nodes. +// These have the MountSpecDev type. 
+func DeviceNodes(dn ...string) MountSpecPathsByTyper { + return MountSpecPathsByType{ + csv.MountSpecDev: dn, + } +} + +// Symlinks creates a set of MountSpecPaths for the specified symlinks. +// These have the MountSpecSym type. +func Symlinks(s ...string) MountSpecPathsByTyper { + return MountSpecPathsByType{ + csv.MountSpecSym: s, + } +} diff --git a/internal/platform-support/tegra/options.go b/internal/platform-support/tegra/options.go new file mode 100644 index 000000000..8a005a80c --- /dev/null +++ b/internal/platform-support/tegra/options.go @@ -0,0 +1,105 @@ +/** +# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+**/ + +package tegra + +import ( + "github.com/NVIDIA/nvidia-container-toolkit/internal/discover" + "github.com/NVIDIA/nvidia-container-toolkit/internal/logger" + "github.com/NVIDIA/nvidia-container-toolkit/internal/lookup" +) + +type options struct { + logger logger.Interface + driverRoot string + devRoot string + hookCreator discover.HookCreator + ldconfigPath string + librarySearchPaths []string + + // The following can be overridden for testing + symlinkLocator lookup.Locator + symlinkChainLocator lookup.Locator + // TODO: This should be replaced by a regular mock + resolveSymlink func(string) (string, error) + + MountSpecPathsByTyper +} + +// Option defines a functional option for configuring a Tegra discoverer. +type Option func(*options) + +// WithLogger sets the logger for the discoverer. +func WithLogger(logger logger.Interface) Option { + return func(o *options) { + o.logger = logger + } +} + +// WithDriverRoot sets the driver root for the discoverer. +func WithDriverRoot(driverRoot string) Option { + return func(o *options) { + o.driverRoot = driverRoot + } +} + +// WithDevRoot sets the /dev root. +// If this is unset, the driver root is assumed. +func WithDevRoot(devRoot string) Option { + return func(o *options) { + o.devRoot = devRoot + } +} + +// WithHookCreator sets the hook creator for the discoverer. +func WithHookCreator(hookCreator discover.HookCreator) Option { + return func(o *options) { + o.hookCreator = hookCreator + } +} + +// WithLdconfigPath sets the path to the ldconfig program +func WithLdconfigPath(ldconfigPath string) Option { + return func(o *options) { + o.ldconfigPath = ldconfigPath + } +} + +// WithLibrarySearchPaths sets the library search paths for the discoverer. +func WithLibrarySearchPaths(librarySearchPaths ...string) Option { + return func(o *options) { + o.librarySearchPaths = librarySearchPaths + } +} + +// WithMountSpecsByPath sets the source of MountSpec paths per type. 
+// If multiple values are supplied, these are merged. +func WithMountSpecsByPath(msfp ...MountSpecPathsByTyper) Option { + return func(o *options) { + o.MountSpecPathsByTyper = Merge(msfp...) + } +} + +// MountSpecPathsByType returns the mounts specs by path configured for these +// options. +// For an unconfigured MountSpecPathsByTyper no mountspecs are returned. +func (o options) MountSpecPathsByType() MountSpecPathsByType { + if o.MountSpecPathsByTyper == nil { + return nil + } + return o.MountSpecPathsByTyper.MountSpecPathsByType() +} diff --git a/internal/platform-support/tegra/symlinks.go b/internal/platform-support/tegra/symlinks.go index 822d482fd..00e664a19 100644 --- a/internal/platform-support/tegra/symlinks.go +++ b/internal/platform-support/tegra/symlinks.go @@ -36,7 +36,7 @@ type symlinkHook struct { } // createCSVSymlinkHooks creates a discoverer for a hook that creates required symlinks in the container -func (o tegraOptions) createCSVSymlinkHooks(targets []string) discover.Discover { +func (o options) createCSVSymlinkHooks(targets []string) discover.Discover { return symlinkHook{ logger: o.logger, hookCreator: o.hookCreator, diff --git a/internal/platform-support/tegra/tegra.go b/internal/platform-support/tegra/tegra.go index 6ad774b4e..ee3ab47b7 100644 --- a/internal/platform-support/tegra/tegra.go +++ b/internal/platform-support/tegra/tegra.go @@ -20,34 +20,14 @@ import ( "fmt" "github.com/NVIDIA/nvidia-container-toolkit/internal/discover" - "github.com/NVIDIA/nvidia-container-toolkit/internal/logger" "github.com/NVIDIA/nvidia-container-toolkit/internal/lookup" "github.com/NVIDIA/nvidia-container-toolkit/internal/lookup/symlinks" + "github.com/NVIDIA/nvidia-container-toolkit/internal/platform-support/tegra/csv" ) -type tegraOptions struct { - logger logger.Interface - csvFiles []string - driverRoot string - devRoot string - hookCreator discover.HookCreator - ldconfigPath string - librarySearchPaths []string - ignorePatterns 
ignoreMountSpecPatterns - - // The following can be overridden for testing - symlinkLocator lookup.Locator - symlinkChainLocator lookup.Locator - // TODO: This should be replaced by a regular mock - resolveSymlink func(string) (string, error) -} - -// Option defines a functional option for configuring a Tegra discoverer. -type Option func(*tegraOptions) - -// New creates a new tegra discoverer using the supplied options. +// New creates a new tegra discoverer using the supplied functional options. func New(opts ...Option) (discover.Discover, error) { - o := &tegraOptions{} + o := &options{} for _, opt := range opts { opt(o) } @@ -75,12 +55,12 @@ func New(opts ...Option) (discover.Discover, error) { o.resolveSymlink = symlinks.Resolve } - csvDiscoverer, err := o.newDiscovererFromCSVFiles() + mountSpecDiscoverer, err := o.newDiscovererFromMountSpecs(o.MountSpecPathsByType()) if err != nil { - return nil, fmt.Errorf("failed to create CSV discoverer: %v", err) + return nil, fmt.Errorf("failed to create discoverer for mount specs: %v", err) } - ldcacheUpdateHook, err := discover.NewLDCacheUpdateHook(o.logger, csvDiscoverer, o.hookCreator, o.ldconfigPath) + ldcacheUpdateHook, err := discover.NewLDCacheUpdateHook(o.logger, mountSpecDiscoverer, o.hookCreator, o.ldconfigPath) if err != nil { return nil, fmt.Errorf("failed to create ldcach update hook discoverer: %v", err) } @@ -95,8 +75,9 @@ func New(opts ...Option) (discover.Discover, error) { ) d := discover.Merge( - csvDiscoverer, - // The ldcacheUpdateHook is added last to ensure that the created symlinks are included + mountSpecDiscoverer, + // The ldcacheUpdateHook is added after the mount spec discoverer to + // ensure that the symlinks are included. ldcacheUpdateHook, tegraSystemMounts, ) @@ -104,59 +85,58 @@ func New(opts ...Option) (discover.Discover, error) { return d, nil } -// WithLogger sets the logger for the discoverer. 
-func WithLogger(logger logger.Interface) Option { - return func(o *tegraOptions) { - o.logger = logger - } -} - -// WithDriverRoot sets the driver root for the discoverer. -func WithDriverRoot(driverRoot string) Option { - return func(o *tegraOptions) { - o.driverRoot = driverRoot +// newDiscovererFromMountSpecs creates a discoverer for the specified mount specs. +func (o options) newDiscovererFromMountSpecs(pathsByType MountSpecPathsByType) (discover.Discover, error) { + if len(pathsByType) == 0 { + o.logger.Warningf("No mount specs specified") + return discover.None{}, nil } -} -// WithDevRoot sets the /dev root. -// If this is unset, the driver root is assumed. -func WithDevRoot(devRoot string) Option { - return func(o *tegraOptions) { - o.devRoot = devRoot - } -} + devices := discover.NewCharDeviceDiscoverer( + o.logger, + o.devRoot, + pathsByType[csv.MountSpecDev], + ) -// WithCSVFiles sets the CSV files for the discoverer. -func WithCSVFiles(csvFiles []string) Option { - return func(o *tegraOptions) { - o.csvFiles = csvFiles - } -} + directories := discover.NewMounts( + o.logger, + lookup.NewDirectoryLocator(lookup.WithLogger(o.logger), lookup.WithRoot(o.driverRoot)), + o.driverRoot, + pathsByType[csv.MountSpecDir], + ) -// WithHookCreator sets the hook creator for the discoverer. -func WithHookCreator(hookCreator discover.HookCreator) Option { - return func(o *tegraOptions) { - o.hookCreator = hookCreator - } -} + // We create a discoverer for mounted libraries and add additional .so + // symlinks for the driver. + libraries := discover.WithDriverDotSoSymlinks( + o.logger, + discover.NewMounts( + o.logger, + o.symlinkLocator, + o.driverRoot, + pathsByType[csv.MountSpecLib], + ), + "", + o.hookCreator, + ) -// WithLdconfigPath sets the path to the ldconfig program -func WithLdconfigPath(ldconfigPath string) Option { - return func(o *tegraOptions) { - o.ldconfigPath = ldconfigPath - } -} + // We process the explicitly requested symlinks. 
+ symlinkTargets := pathsByType[csv.MountSpecSym] + o.logger.Debugf("Filtered symlink targets: %v", symlinkTargets) + symlinks := discover.NewMounts( + o.logger, + o.symlinkLocator, + o.driverRoot, + symlinkTargets, + ) + createSymlinks := o.createCSVSymlinkHooks(symlinkTargets) -// WithLibrarySearchPaths sets the library search paths for the discoverer. -func WithLibrarySearchPaths(librarySearchPaths ...string) Option { - return func(o *tegraOptions) { - o.librarySearchPaths = librarySearchPaths - } -} + d := discover.Merge( + devices, + directories, + libraries, + symlinks, + createSymlinks, + ) -// WithIngorePatterns sets patterns to ignore in the CSV files -func WithIngorePatterns(ignorePatterns ...string) Option { - return func(o *tegraOptions) { - o.ignorePatterns = ignoreMountSpecPatterns(ignorePatterns) - } + return d, nil } diff --git a/pkg/nvcdi/api.go b/pkg/nvcdi/api.go index fce32bc88..14cbdb83f 100644 --- a/pkg/nvcdi/api.go +++ b/pkg/nvcdi/api.go @@ -88,4 +88,8 @@ const ( // FeatureEnableCoherentAnnotations enables the addition of annotations // coherent or non-coherent devices. FeatureEnableCoherentAnnotations = FeatureFlag("enable-coherent-annotations") + + // FeatureDisableMultipleCSVDevices disables the handling of multiple devices + // in CSV mode. + FeatureDisableMultipleCSVDevices = FeatureFlag("disable-multiple-csv-devices") ) diff --git a/pkg/nvcdi/common-nvml.go b/pkg/nvcdi/common-nvml.go index fbb5f01d1..b4ebc7dc4 100644 --- a/pkg/nvcdi/common-nvml.go +++ b/pkg/nvcdi/common-nvml.go @@ -25,16 +25,7 @@ import ( // newCommonNVMLDiscoverer returns a discoverer for entities that are not associated with a specific CDI device. // This includes driver libraries and meta devices, for example. 
func (l *nvmllib) newCommonNVMLDiscoverer() (discover.Discover, error) { - metaDevices := discover.NewCharDeviceDiscoverer( - l.logger, - l.devRoot, - []string{ - "/dev/nvidia-modeset", - "/dev/nvidia-uvm-tools", - "/dev/nvidia-uvm", - "/dev/nvidiactl", - }, - ) + metaDevices := l.controlDeviceNodeDiscoverer() graphicsMounts, err := discover.NewGraphicsMountsDiscoverer(l.logger, l.driver, l.hookCreator) if err != nil { @@ -54,3 +45,16 @@ func (l *nvmllib) newCommonNVMLDiscoverer() (discover.Discover, error) { return d, nil } + +func (l *nvmllib) controlDeviceNodeDiscoverer() discover.Discover { + return discover.NewCharDeviceDiscoverer( + l.logger, + l.devRoot, + []string{ + "/dev/nvidia-modeset", + "/dev/nvidia-uvm-tools", + "/dev/nvidia-uvm", + "/dev/nvidiactl", + }, + ) +} diff --git a/pkg/nvcdi/full-gpu-nvml.go b/pkg/nvcdi/full-gpu-nvml.go index c52f44e47..0555b81c9 100644 --- a/pkg/nvcdi/full-gpu-nvml.go +++ b/pkg/nvcdi/full-gpu-nvml.go @@ -37,7 +37,8 @@ type fullGPUDeviceSpecGenerator struct { uuid string index int - featureFlags map[FeatureFlag]bool + featureFlags map[FeatureFlag]bool + additionalDiscoverers []discover.Discover } var _ DeviceSpecGenerator = (*fullGPUDeviceSpecGenerator)(nil) @@ -145,7 +146,6 @@ func (l *fullGPUDeviceSpecGenerator) getDeviceEdits() (*cdi.ContainerEdits, erro if err != nil { return nil, fmt.Errorf("failed to create device discoverer: %v", err) } - editsForDevice, err := edits.FromDiscoverer(deviceDiscoverer) if err != nil { return nil, fmt.Errorf("failed to create container edits for device: %v", err) @@ -177,10 +177,18 @@ func (l *fullGPUDeviceSpecGenerator) newFullGPUDiscoverer(d device.Device) (disc deviceNodes, ) - dd := discover.Merge( + var discoverers []discover.Discover + + discoverers = append(discoverers, deviceNodes, deviceFolderPermissionHooks, ) + discoverers = append(discoverers, l.additionalDiscoverers...) 
+ + dd := discover.Merge( + discoverers..., + ) + return dd, nil } diff --git a/pkg/nvcdi/lib-csv.go b/pkg/nvcdi/lib-csv.go index 6380d79dc..de5b28e96 100644 --- a/pkg/nvcdi/lib-csv.go +++ b/pkg/nvcdi/lib-csv.go @@ -18,10 +18,17 @@ package nvcdi import ( "fmt" + "slices" + "strconv" "tags.cncf.io/container-device-interface/pkg/cdi" "tags.cncf.io/container-device-interface/specs-go" + "github.com/NVIDIA/go-nvlib/pkg/nvlib/device" + "github.com/NVIDIA/go-nvlib/pkg/nvlib/info" + "github.com/NVIDIA/go-nvml/pkg/nvml" + "github.com/google/uuid" + "github.com/NVIDIA/nvidia-container-toolkit/internal/discover" "github.com/NVIDIA/nvidia-container-toolkit/internal/edits" "github.com/NVIDIA/nvidia-container-toolkit/internal/platform-support/tegra" @@ -29,9 +36,31 @@ import ( type csvlib nvcdilib +type mixedcsvlib nvcdilib + var _ deviceSpecGeneratorFactory = (*csvlib)(nil) +// DeviceSpecGenerators creates a set of generators for the specified set of +// devices. +// If NVML is not available or the disable-multiple-csv-devices feature flag is +// enabled, a single device is assumed. func (l *csvlib) DeviceSpecGenerators(ids ...string) (DeviceSpecGenerator, error) { + if l.featureFlags[FeatureDisableMultipleCSVDevices] { + return l.purecsvDeviceSpecGenerators(ids...) + } + hasNVML, _ := l.infolib.HasNvml() + if !hasNVML { + return l.purecsvDeviceSpecGenerators(ids...) + } + mixed, err := l.mixedDeviceSpecGenerators(ids...) + if err != nil { + l.logger.Warningf("Failed to create mixed CSV spec generator; falling back to pure CSV implementation: %v", err) + return l.purecsvDeviceSpecGenerators(ids...) 
+ } + return mixed, nil +} + +func (l *csvlib) purecsvDeviceSpecGenerators(ids ...string) (DeviceSpecGenerator, error) { for _, id := range ids { switch id { case "all": @@ -40,31 +69,42 @@ func (l *csvlib) DeviceSpecGenerators(ids ...string) (DeviceSpecGenerator, error return nil, fmt.Errorf("unsupported device id: %v", id) } } + g := &csvDeviceGenerator{ + csvlib: l, + index: 0, + uuid: "", + } + return g, nil +} + +func (l *csvlib) mixedDeviceSpecGenerators(ids ...string) (DeviceSpecGenerator, error) { + return (*mixedcsvlib)(l).DeviceSpecGenerators(ids...) +} + +// A csvDeviceGenerator generates CDI specs for a device based on a set of +// platform-specific CSV files. +type csvDeviceGenerator struct { + *csvlib + index int + uuid string +} - return l, nil +func (l *csvDeviceGenerator) GetUUID() (string, error) { + return l.uuid, nil } // GetDeviceSpecs returns the CDI device specs for a single device. -func (l *csvlib) GetDeviceSpecs() ([]specs.Device, error) { - d, err := tegra.New( - tegra.WithLogger(l.logger), - tegra.WithDriverRoot(l.driverRoot), - tegra.WithDevRoot(l.devRoot), - tegra.WithHookCreator(l.hookCreator), - tegra.WithLdconfigPath(l.ldconfigPath), - tegra.WithCSVFiles(l.csvFiles), - tegra.WithLibrarySearchPaths(l.librarySearchPaths...), - tegra.WithIngorePatterns(l.csvIgnorePatterns...), - ) +func (l *csvDeviceGenerator) GetDeviceSpecs() ([]specs.Device, error) { + deviceNodeDiscoverer, err := l.deviceNodeDiscoverer() if err != nil { - return nil, fmt.Errorf("failed to create discoverer for CSV files: %v", err) + return nil, fmt.Errorf("failed to create discoverer for device nodes from CSV files: %w", err) } - e, err := edits.FromDiscoverer(d) + e, err := edits.FromDiscoverer(deviceNodeDiscoverer) if err != nil { return nil, fmt.Errorf("failed to create container edits for CSV files: %v", err) } - names, err := l.deviceNamers.GetDeviceNames(0, uuidIgnored{}) + names, err := l.deviceNamers.GetDeviceNames(l.index, l) if err != nil { return nil, 
fmt.Errorf("failed to get device name: %v", err) } @@ -80,7 +120,205 @@ func (l *csvlib) GetDeviceSpecs() ([]specs.Device, error) { return deviceSpecs, nil } +// deviceNodeDiscoverer creates a discoverer for the device nodes associated +// with the specified device. +// The CSV mount specs are used as the source for which device nodes are +// required with the following additions: +// +// - Any regular device nodes (i.e. /dev/nvidia[0-9]+) are removed from the +// input set. +// - The device node (i.e. /dev/nvidia{{ .index }}) associated with this +// particular device is added to the set of device nodes to be discovered. +func (l *csvDeviceGenerator) deviceNodeDiscoverer() (discover.Discover, error) { + mountSpecs := tegra.Transform( + // We remove the regular (nvidia[0-9]+) device nodes. + tegra.WithoutRegularDeviceNodes(), + tegra.Transform( + // We remove non-device nodes. + tegra.OnlyDeviceNodes(), + tegra.MountSpecsFromCSVFiles(l.logger, l.csvFiles...), + ), + ) + // We add the specific device node for this device. + mountSpecs = tegra.Merge( + mountSpecs, + tegra.DeviceNodes(fmt.Sprintf("/dev/nvidia%d", l.index)), + ) + return tegra.New( + tegra.WithLogger(l.logger), + tegra.WithDriverRoot(l.driverRoot), + tegra.WithDevRoot(l.devRoot), + tegra.WithHookCreator(l.hookCreator), + tegra.WithLdconfigPath(l.ldconfigPath), + tegra.WithLibrarySearchPaths(l.librarySearchPaths...), + tegra.WithMountSpecsByPath(mountSpecs), + ) +} + // GetCommonEdits generates a CDI specification that can be used for ANY devices +// These explicitly do not include any device nodes. 
func (l *csvlib) GetCommonEdits() (*cdi.ContainerEdits, error) { - return edits.FromDiscoverer(discover.None{}) + mountSpecs := tegra.Transform( + tegra.AsIgnorePatternsByType(tegra.Symlinks(l.csvIgnorePatterns...)), + tegra.Transform( + tegra.WithoutDeviceNodes(), + tegra.MountSpecsFromCSVFiles(l.logger, l.csvFiles...), + ), + ) + driverDiscoverer, err := tegra.New( + tegra.WithLogger(l.logger), + tegra.WithDriverRoot(l.driverRoot), + tegra.WithDevRoot(l.devRoot), + tegra.WithHookCreator(l.hookCreator), + tegra.WithLdconfigPath(l.ldconfigPath), + tegra.WithLibrarySearchPaths(l.librarySearchPaths...), + tegra.WithMountSpecsByPath(mountSpecs), + ) + if err != nil { + return nil, fmt.Errorf("failed to create driver discoverer from CSV files: %w", err) + } + return edits.FromDiscoverer(driverDiscoverer) +} + +func (l *mixedcsvlib) DeviceSpecGenerators(ids ...string) (DeviceSpecGenerator, error) { + asNvmlLib := (*nvmllib)(l) + err := asNvmlLib.init() + if err != nil { + return nil, fmt.Errorf("failed to initialize nvml: %w", err) + } + defer asNvmlLib.tryShutdown() + + if slices.Contains(ids, "all") { + ids, err = l.getAllDeviceIndices() + if err != nil { + return nil, fmt.Errorf("failed to get device indices: %w", err) + } + } + + var DeviceSpecGenerators DeviceSpecGenerators + for _, id := range ids { + generator, err := l.deviceSpecGeneratorForId(device.Identifier(id)) + if err != nil { + return nil, fmt.Errorf("failed to create device spec generator for device %q: %w", id, err) + } + DeviceSpecGenerators = append(DeviceSpecGenerators, generator) + } + + return DeviceSpecGenerators, nil +} + +func (l *mixedcsvlib) getAllDeviceIndices() ([]string, error) { + numDevices, ret := l.nvmllib.DeviceGetCount() + if ret != nvml.SUCCESS { + return nil, fmt.Errorf("failed to get device count: %v", ret) + } + + var allIndices []string + for index := range numDevices { + allIndices = append(allIndices, fmt.Sprintf("%d", index)) + } + return allIndices, nil +} + +func (l 
*mixedcsvlib) deviceSpecGeneratorForId(id device.Identifier) (DeviceSpecGenerator, error) { + switch { + case id.IsGpuUUID(), isIntegratedGPUID(id): + uuid := string(id) + device, ret := l.nvmllib.DeviceGetHandleByUUID(uuid) + if ret != nvml.SUCCESS { + return nil, fmt.Errorf("failed to get device handle from UUID %q: %v", uuid, ret) + } + index, ret := device.GetIndex() + if ret != nvml.SUCCESS { + return nil, fmt.Errorf("failed to get device index: %v", ret) + } + return l.csvDeviceSpecGenerator(index, uuid, device) + case id.IsGpuIndex(): + index, err := strconv.Atoi(string(id)) + if err != nil { + return nil, fmt.Errorf("failed to convert device index to an int: %w", err) + } + device, ret := l.nvmllib.DeviceGetHandleByIndex(index) + if ret != nvml.SUCCESS { + return nil, fmt.Errorf("failed to get device handle from index: %v", ret) + } + uuid, ret := device.GetUUID() + if ret != nvml.SUCCESS { + return nil, fmt.Errorf("failed to get UUID: %v", ret) + } + return l.csvDeviceSpecGenerator(index, uuid, device) + case id.IsMigUUID(): + fallthrough + case id.IsMigIndex(): + return nil, fmt.Errorf("generating a CDI spec for MIG id %q is not supported in CSV mode", id) + } + return nil, fmt.Errorf("identifier is not a valid UUID or index: %q", id) +} + +func (l *mixedcsvlib) csvDeviceSpecGenerator(index int, uuid string, device nvml.Device) (DeviceSpecGenerator, error) { + isIntegrated, err := isIntegratedGPU(device) + if err != nil { + return nil, fmt.Errorf("is-integrated check failed for device (index=%v,uuid=%v): %w", index, uuid, err) + } + + g := &csvDeviceGenerator{ + csvlib: (*csvlib)(l), + index: index, + uuid: uuid, + } + + if !isIntegrated { + csvDeviceNodeDiscoverer, err := g.deviceNodeDiscoverer() + if err != nil { + return nil, fmt.Errorf("failed to create discoverer for devices nodes: %w", err) + } + + // If this is not an integrated GPU, we also create a spec generator for + // the full GPU. 
+ dgpu := (*nvmllib)(l).withInit(&fullGPUDeviceSpecGenerator{ + nvmllib: (*nvmllib)(l), + uuid: uuid, + index: index, + // For the CSV case, we include the control device nodes at a + // device level. + additionalDiscoverers: []discover.Discover{ + (*nvmllib)(l).controlDeviceNodeDiscoverer(), + csvDeviceNodeDiscoverer, + }, + featureFlags: l.featureFlags, + }) + return dgpu, nil + } + + return g, nil +} + +func isIntegratedGPUID(id device.Identifier) bool { + _, err := uuid.Parse(string(id)) + return err == nil +} + +// isIntegratedGPU checks whether the specified device is an integrated GPU. +// As a proxy we check the PCI info (domain 0, bus 1, device 0), or the device name if PCI info is not supported. +// TODO: This should be replaced by an explicit NVML call once available. +func isIntegratedGPU(d nvml.Device) (bool, error) { + pciInfo, ret := d.GetPciInfo() + if ret == nvml.ERROR_NOT_SUPPORTED { + name, ret := d.GetName() + if ret != nvml.SUCCESS { + return false, fmt.Errorf("failed to get device name: %v", ret) + } + return info.IsIntegratedGPUName(name), nil + } + if ret != nvml.SUCCESS { + return false, fmt.Errorf("failed to get PCI info: %v", ret) + } + + if pciInfo.Domain != 0 { + return false, nil + } + if pciInfo.Bus != 1 { + return false, nil + } + return pciInfo.Device == 0, nil } diff --git a/pkg/nvcdi/namer.go b/pkg/nvcdi/namer.go index 8019f699e..8ebdd33b4 100644 --- a/pkg/nvcdi/namer.go +++ b/pkg/nvcdi/namer.go @@ -105,12 +105,6 @@ type convert struct { nvmlUUIDer } -type uuidIgnored struct{} - -func (m uuidIgnored) GetUUID() (string, error) { - return "", nil -} - type uuidUnsupported struct{} func (m convert) GetUUID() (string, error) { diff --git a/vendor/github.com/NVIDIA/go-nvlib/pkg/nvlib/device/device.go b/vendor/github.com/NVIDIA/go-nvlib/pkg/nvlib/device/device.go index e2089f7d3..a67ce3c1b 100644 --- a/vendor/github.com/NVIDIA/go-nvlib/pkg/nvlib/device/device.go +++ b/vendor/github.com/NVIDIA/go-nvlib/pkg/nvlib/device/device.go @@ -29,6 +29,7 @@ type Device interface { GetArchitectureAsString() 
(string, error) GetBrandAsString() (string, error) GetCudaComputeCapabilityAsString() (string, error) + GetAddressingModeAsString() (string, error) GetMigDevices() ([]MigDevice, error) GetMigProfiles() ([]MigProfile, error) GetPCIBusID() (string, error) @@ -146,6 +147,32 @@ func (d *device) GetBrandAsString() (string, error) { return "", fmt.Errorf("error interpreting device brand as string: %v", brand) } +// GetAddressingModeAsString returns the Device addressing mode as a string. +func (d *device) GetAddressingModeAsString() (string, error) { + mode, ret := d.GetAddressingMode() + + switch ret { + case nvml.SUCCESS: + // continue + case nvml.ERROR_NOT_SUPPORTED: + // Addressing mode is not supported on the current platform. + return "", nil + default: + return "", fmt.Errorf("error getting device addressing mode: %v", ret) + } + + switch nvml.DeviceAddressingModeType(mode.Value) { + case nvml.DEVICE_ADDRESSING_MODE_ATS: + return "ATS", nil + case nvml.DEVICE_ADDRESSING_MODE_HMM: + return "HMM", nil + case nvml.DEVICE_ADDRESSING_MODE_NONE: + return "None", nil + } + + return "", fmt.Errorf("error interpreting addressing mode as string: %v", mode) +} + // GetPCIBusID returns the string representation of the bus ID. func (d *device) GetPCIBusID() (string, error) { info, ret := d.GetPciInfo() diff --git a/vendor/github.com/NVIDIA/go-nvlib/pkg/nvlib/info/api.go b/vendor/github.com/NVIDIA/go-nvlib/pkg/nvlib/info/api.go index bcc9cb69c..48abff5d6 100644 --- a/vendor/github.com/NVIDIA/go-nvlib/pkg/nvlib/info/api.go +++ b/vendor/github.com/NVIDIA/go-nvlib/pkg/nvlib/info/api.go @@ -35,9 +35,5 @@ type PropertyExtractor interface { HasDXCore() (bool, string) HasNvml() (bool, string) HasTegraFiles() (bool, string) - // Deprecated: Use HasTegraFiles instead. 
- IsTegraSystem() (bool, string) - // Deprecated: Use HasOnlyIntegratedGPUs - UsesOnlyNVGPUModule() (bool, string) - HasOnlyIntegratedGPUs() (bool, string) + HasAnIntegratedGPU() (bool, string) } diff --git a/vendor/github.com/NVIDIA/go-nvlib/pkg/nvlib/info/property-extractor.go b/vendor/github.com/NVIDIA/go-nvlib/pkg/nvlib/info/property-extractor.go index 202047133..3ef582dd0 100644 --- a/vendor/github.com/NVIDIA/go-nvlib/pkg/nvlib/info/property-extractor.go +++ b/vendor/github.com/NVIDIA/go-nvlib/pkg/nvlib/info/property-extractor.go @@ -59,6 +59,7 @@ func (i *propertyExtractor) HasNvml() (bool, string) { } // IsTegraSystem returns true if the system is detected as a Tegra-based system. +// // Deprecated: Use HasTegraFiles instead. func (i *propertyExtractor) IsTegraSystem() (bool, string) { return i.HasTegraFiles() @@ -89,25 +90,20 @@ func (i *propertyExtractor) HasTegraFiles() (bool, string) { return false, fmt.Sprintf("%v has no 'tegra' prefix", tegraFamilyFile) } -// UsesOnlyNVGPUModule checks whether the only the nvgpu module is used. -// -// Deprecated: UsesOnlyNVGPUModule is deprecated, use HasOnlyIntegratedGPUs instead. -func (i *propertyExtractor) UsesOnlyNVGPUModule() (uses bool, reason string) { - return i.HasOnlyIntegratedGPUs() -} - -// HasOnlyIntegratedGPUs checks whether all GPUs are iGPUs that use NVML. +// HasAnIntegratedGPU checks whether any of the GPUs reported by NVML is an +// integrated GPU. // // As of Orin-based systems iGPUs also support limited NVML queries. -// In the absence of a robust API, we rely on heuristics to make this decision. +// In the absence of a robust API, we rely on heuristics based on the device +// name to make this decision. 
// -// The following device names are checked: +// Devices with the following names are considered integrated GPUs: // // GPU 0: Orin (nvgpu) (UUID: 54d0709b-558d-5a59-9c65-0c5fc14a21a4) // GPU 0: NVIDIA Thor (UUID: 54d0709b-558d-5a59-9c65-0c5fc14a21a4) // -// This function returns true if ALL devices are detected as iGPUs. -func (i *propertyExtractor) HasOnlyIntegratedGPUs() (uses bool, reason string) { +// (Where this shows the nvidia-smi -L output on these systems). +func (i *propertyExtractor) HasAnIntegratedGPU() (uses bool, reason string) { // We ensure that this function never panics defer func() { if err := recover(); err != nil { @@ -143,14 +139,23 @@ func (i *propertyExtractor) HasOnlyIntegratedGPUs() (uses bool, reason string) { } for _, name := range names { - if !isIntegratedGPUName(name) { - return false, fmt.Sprintf("device %q does not use nvgpu module", name) + if IsIntegratedGPUName(name) { + return true, fmt.Sprintf("device %q is an integrated GPU", name) } } - return true, "all devices use nvgpu module" + return false, "no integrated GPUs found" } -func isIntegratedGPUName(name string) bool { +// IsIntegratedGPUName checks whether the specified device name is associated +// with a known integrated GPU. +// +// Devices with the following names are considered integrated GPUs: +// +// GPU 0: Orin (nvgpu) (UUID: 54d0709b-558d-5a59-9c65-0c5fc14a21a4) +// GPU 0: NVIDIA Thor (UUID: 54d0709b-558d-5a59-9c65-0c5fc14a21a4) +// +// (Where this shows the nvidia-smi -L output on these systems). 
+func IsIntegratedGPUName(name string) bool { if strings.Contains(name, "(nvgpu)") { return true } diff --git a/vendor/github.com/NVIDIA/go-nvlib/pkg/nvlib/info/property-extractor_mock.go b/vendor/github.com/NVIDIA/go-nvlib/pkg/nvlib/info/property-extractor_mock.go index bd7d41353..708c6e833 100644 --- a/vendor/github.com/NVIDIA/go-nvlib/pkg/nvlib/info/property-extractor_mock.go +++ b/vendor/github.com/NVIDIA/go-nvlib/pkg/nvlib/info/property-extractor_mock.go @@ -17,24 +17,18 @@ var _ PropertyExtractor = &PropertyExtractorMock{} // // // make and configure a mocked PropertyExtractor // mockedPropertyExtractor := &PropertyExtractorMock{ +// HasAnIntegratedGPUFunc: func() (bool, string) { +// panic("mock out the HasAnIntegratedGPU method") +// }, // HasDXCoreFunc: func() (bool, string) { // panic("mock out the HasDXCore method") // }, // HasNvmlFunc: func() (bool, string) { // panic("mock out the HasNvml method") // }, -// HasOnlyIntegratedGPUsFunc: func() (bool, string) { -// panic("mock out the HasOnlyIntegratedGPUs method") -// }, // HasTegraFilesFunc: func() (bool, string) { // panic("mock out the HasTegraFiles method") // }, -// IsTegraSystemFunc: func() (bool, string) { -// panic("mock out the IsTegraSystem method") -// }, -// UsesOnlyNVGPUModuleFunc: func() (bool, string) { -// panic("mock out the UsesOnlyNVGPUModule method") -// }, // } // // // use mockedPropertyExtractor in code that requires PropertyExtractor @@ -42,51 +36,64 @@ var _ PropertyExtractor = &PropertyExtractorMock{} // // } type PropertyExtractorMock struct { + // HasAnIntegratedGPUFunc mocks the HasAnIntegratedGPU method. + HasAnIntegratedGPUFunc func() (bool, string) + // HasDXCoreFunc mocks the HasDXCore method. HasDXCoreFunc func() (bool, string) // HasNvmlFunc mocks the HasNvml method. HasNvmlFunc func() (bool, string) - // HasOnlyIntegratedGPUsFunc mocks the HasOnlyIntegratedGPUs method. 
- HasOnlyIntegratedGPUsFunc func() (bool, string) - // HasTegraFilesFunc mocks the HasTegraFiles method. HasTegraFilesFunc func() (bool, string) - // IsTegraSystemFunc mocks the IsTegraSystem method. - IsTegraSystemFunc func() (bool, string) - - // UsesOnlyNVGPUModuleFunc mocks the UsesOnlyNVGPUModule method. - UsesOnlyNVGPUModuleFunc func() (bool, string) - // calls tracks calls to the methods. calls struct { + // HasAnIntegratedGPU holds details about calls to the HasAnIntegratedGPU method. + HasAnIntegratedGPU []struct { + } // HasDXCore holds details about calls to the HasDXCore method. HasDXCore []struct { } // HasNvml holds details about calls to the HasNvml method. HasNvml []struct { } - // HasOnlyIntegratedGPUs holds details about calls to the HasOnlyIntegratedGPUs method. - HasOnlyIntegratedGPUs []struct { - } // HasTegraFiles holds details about calls to the HasTegraFiles method. HasTegraFiles []struct { } - // IsTegraSystem holds details about calls to the IsTegraSystem method. - IsTegraSystem []struct { - } - // UsesOnlyNVGPUModule holds details about calls to the UsesOnlyNVGPUModule method. - UsesOnlyNVGPUModule []struct { - } } - lockHasDXCore sync.RWMutex - lockHasNvml sync.RWMutex - lockHasOnlyIntegratedGPUs sync.RWMutex - lockHasTegraFiles sync.RWMutex - lockIsTegraSystem sync.RWMutex - lockUsesOnlyNVGPUModule sync.RWMutex + lockHasAnIntegratedGPU sync.RWMutex + lockHasDXCore sync.RWMutex + lockHasNvml sync.RWMutex + lockHasTegraFiles sync.RWMutex +} + +// HasAnIntegratedGPU calls HasAnIntegratedGPUFunc. 
+func (mock *PropertyExtractorMock) HasAnIntegratedGPU() (bool, string) { + if mock.HasAnIntegratedGPUFunc == nil { + panic("PropertyExtractorMock.HasAnIntegratedGPUFunc: method is nil but PropertyExtractor.HasAnIntegratedGPU was just called") + } + callInfo := struct { + }{} + mock.lockHasAnIntegratedGPU.Lock() + mock.calls.HasAnIntegratedGPU = append(mock.calls.HasAnIntegratedGPU, callInfo) + mock.lockHasAnIntegratedGPU.Unlock() + return mock.HasAnIntegratedGPUFunc() +} + +// HasAnIntegratedGPUCalls gets all the calls that were made to HasAnIntegratedGPU. +// Check the length with: +// +// len(mockedPropertyExtractor.HasAnIntegratedGPUCalls()) +func (mock *PropertyExtractorMock) HasAnIntegratedGPUCalls() []struct { +} { + var calls []struct { + } + mock.lockHasAnIntegratedGPU.RLock() + calls = mock.calls.HasAnIntegratedGPU + mock.lockHasAnIntegratedGPU.RUnlock() + return calls } // HasDXCore calls HasDXCoreFunc. @@ -143,33 +150,6 @@ func (mock *PropertyExtractorMock) HasNvmlCalls() []struct { return calls } -// HasOnlyIntegratedGPUs calls HasOnlyIntegratedGPUsFunc. -func (mock *PropertyExtractorMock) HasOnlyIntegratedGPUs() (bool, string) { - if mock.HasOnlyIntegratedGPUsFunc == nil { - panic("PropertyExtractorMock.HasOnlyIntegratedGPUsFunc: method is nil but PropertyExtractor.HasOnlyIntegratedGPUs was just called") - } - callInfo := struct { - }{} - mock.lockHasOnlyIntegratedGPUs.Lock() - mock.calls.HasOnlyIntegratedGPUs = append(mock.calls.HasOnlyIntegratedGPUs, callInfo) - mock.lockHasOnlyIntegratedGPUs.Unlock() - return mock.HasOnlyIntegratedGPUsFunc() -} - -// HasOnlyIntegratedGPUsCalls gets all the calls that were made to HasOnlyIntegratedGPUs. 
-// Check the length with: -// -// len(mockedPropertyExtractor.HasOnlyIntegratedGPUsCalls()) -func (mock *PropertyExtractorMock) HasOnlyIntegratedGPUsCalls() []struct { -} { - var calls []struct { - } - mock.lockHasOnlyIntegratedGPUs.RLock() - calls = mock.calls.HasOnlyIntegratedGPUs - mock.lockHasOnlyIntegratedGPUs.RUnlock() - return calls -} - // HasTegraFiles calls HasTegraFilesFunc. func (mock *PropertyExtractorMock) HasTegraFiles() (bool, string) { if mock.HasTegraFilesFunc == nil { @@ -196,57 +176,3 @@ func (mock *PropertyExtractorMock) HasTegraFilesCalls() []struct { mock.lockHasTegraFiles.RUnlock() return calls } - -// IsTegraSystem calls IsTegraSystemFunc. -func (mock *PropertyExtractorMock) IsTegraSystem() (bool, string) { - if mock.IsTegraSystemFunc == nil { - panic("PropertyExtractorMock.IsTegraSystemFunc: method is nil but PropertyExtractor.IsTegraSystem was just called") - } - callInfo := struct { - }{} - mock.lockIsTegraSystem.Lock() - mock.calls.IsTegraSystem = append(mock.calls.IsTegraSystem, callInfo) - mock.lockIsTegraSystem.Unlock() - return mock.IsTegraSystemFunc() -} - -// IsTegraSystemCalls gets all the calls that were made to IsTegraSystem. -// Check the length with: -// -// len(mockedPropertyExtractor.IsTegraSystemCalls()) -func (mock *PropertyExtractorMock) IsTegraSystemCalls() []struct { -} { - var calls []struct { - } - mock.lockIsTegraSystem.RLock() - calls = mock.calls.IsTegraSystem - mock.lockIsTegraSystem.RUnlock() - return calls -} - -// UsesOnlyNVGPUModule calls UsesOnlyNVGPUModuleFunc. 
-func (mock *PropertyExtractorMock) UsesOnlyNVGPUModule() (bool, string) { - if mock.UsesOnlyNVGPUModuleFunc == nil { - panic("PropertyExtractorMock.UsesOnlyNVGPUModuleFunc: method is nil but PropertyExtractor.UsesOnlyNVGPUModule was just called") - } - callInfo := struct { - }{} - mock.lockUsesOnlyNVGPUModule.Lock() - mock.calls.UsesOnlyNVGPUModule = append(mock.calls.UsesOnlyNVGPUModule, callInfo) - mock.lockUsesOnlyNVGPUModule.Unlock() - return mock.UsesOnlyNVGPUModuleFunc() -} - -// UsesOnlyNVGPUModuleCalls gets all the calls that were made to UsesOnlyNVGPUModule. -// Check the length with: -// -// len(mockedPropertyExtractor.UsesOnlyNVGPUModuleCalls()) -func (mock *PropertyExtractorMock) UsesOnlyNVGPUModuleCalls() []struct { -} { - var calls []struct { - } - mock.lockUsesOnlyNVGPUModule.RLock() - calls = mock.calls.UsesOnlyNVGPUModule - mock.lockUsesOnlyNVGPUModule.RUnlock() - return calls -} diff --git a/vendor/github.com/NVIDIA/go-nvlib/pkg/nvlib/info/resolver.go b/vendor/github.com/NVIDIA/go-nvlib/pkg/nvlib/info/resolver.go index 0454d8a6b..824373823 100644 --- a/vendor/github.com/NVIDIA/go-nvlib/pkg/nvlib/info/resolver.go +++ b/vendor/github.com/NVIDIA/go-nvlib/pkg/nvlib/info/resolver.go @@ -48,13 +48,13 @@ func (p platformResolver) ResolvePlatform() Platform { hasNVML, reason := p.propertyExtractor.HasNvml() p.logger.Debugf("Is NVML-based system? %v: %v", hasNVML, reason) - hasOnlyIntegratedGPUs, reason := p.propertyExtractor.HasOnlyIntegratedGPUs() - p.logger.Debugf("Has only integrated GPUs? %v: %v", hasOnlyIntegratedGPUs, reason) + hasAnIntegratedGPU, reason := p.propertyExtractor.HasAnIntegratedGPU() + p.logger.Debugf("Has an integrated GPU? 
%v: %v", hasAnIntegratedGPU, reason) switch { case hasDXCore: return PlatformWSL - case (hasTegraFiles && !hasNVML), hasOnlyIntegratedGPUs: + case (hasTegraFiles && !hasNVML), hasAnIntegratedGPU: return PlatformTegra case hasNVML: return PlatformNVML diff --git a/vendor/github.com/NVIDIA/go-nvlib/pkg/nvpci/mock.go b/vendor/github.com/NVIDIA/go-nvlib/pkg/nvpci/mock.go index f7b4bb186..375cfb584 100644 --- a/vendor/github.com/NVIDIA/go-nvlib/pkg/nvpci/mock.go +++ b/vendor/github.com/NVIDIA/go-nvlib/pkg/nvpci/mock.go @@ -84,6 +84,13 @@ func (m *MockNvpci) AddMockA100(address string, numaNode int, sriov *SriovInfo) return err } + vfioDev := filepath.Join(deviceDir, "vfio-dev") + vfioFD := filepath.Join(vfioDev, "vfio8") + err = os.MkdirAll(vfioFD, 0755) + if err != nil { + return err + } + iommuGroup := 20 _, err = os.Create(filepath.Join(deviceDir, strconv.Itoa(iommuGroup))) if err != nil { diff --git a/vendor/github.com/NVIDIA/go-nvlib/pkg/nvpci/nvpci.go b/vendor/github.com/NVIDIA/go-nvlib/pkg/nvpci/nvpci.go index 1165ab138..f41d0f246 100644 --- a/vendor/github.com/NVIDIA/go-nvlib/pkg/nvpci/nvpci.go +++ b/vendor/github.com/NVIDIA/go-nvlib/pkg/nvpci/nvpci.go @@ -115,6 +115,7 @@ type NvidiaPCIDevice struct { DeviceName string Driver string IommuGroup int + IommuFD string NumaNode int Config *ConfigSpace Resources MemoryResources @@ -292,6 +293,12 @@ func (p *nvpci) getGPUByPciBusID(address string, cache map[string]*NvidiaPCIDevi return nil, fmt.Errorf("unable to detect IOMMU group for %s: %w", address, err) } + iommuFD, err := getIOMMUFD(devicePath) + if err != nil { + // log a warning, do not return an error as this host may not have iommufd configured/supported + p.logger.Warningf("unable to detect IOMMU FD for %s: %v", address, err) + } + numa, err := os.ReadFile(path.Join(devicePath, "numa_node")) if err != nil { return nil, fmt.Errorf("unable to read PCI NUMA node for %s: %v", address, err) @@ -376,6 +383,7 @@ func (p *nvpci) getGPUByPciBusID(address string, cache 
map[string]*NvidiaPCIDevi Device: uint16(deviceID), Driver: driver, IommuGroup: int(iommuGroup), + IommuFD: iommuFD, NumaNode: int(numaNode), Config: config, Resources: resources, @@ -523,6 +531,22 @@ func getDriver(devicePath string) (string, error) { return "", err } +func getIOMMUFD(devicePath string) (string, error) { + content, err := os.ReadDir(path.Join(devicePath, "vfio-dev")) + if err != nil { + return "", err + } + for _, c := range content { + if !c.IsDir() { + continue + } + if strings.HasPrefix(c.Name(), "vfio") { + return c.Name(), nil + } + } + return "", fmt.Errorf("no iommufd device found") +} + func getIOMMUGroup(devicePath string) (int64, error) { var iommuGroup int64 iommu, err := filepath.EvalSymlinks(path.Join(devicePath, "iommu_group")) diff --git a/vendor/modules.txt b/vendor/modules.txt index e405cc0df..bfbdb3f13 100644 --- a/vendor/modules.txt +++ b/vendor/modules.txt @@ -4,7 +4,7 @@ cyphar.com/go-pathrs cyphar.com/go-pathrs/internal/fdutils cyphar.com/go-pathrs/internal/libpathrs cyphar.com/go-pathrs/procfs -# github.com/NVIDIA/go-nvlib v0.8.1 +# github.com/NVIDIA/go-nvlib v0.8.2-0.20251202135446-d0f42ba016dd ## explicit; go 1.20 github.com/NVIDIA/go-nvlib/pkg/nvlib/device github.com/NVIDIA/go-nvlib/pkg/nvlib/info