From 0f27d53723aa2bd2d7552c48f565aea1f441b504 Mon Sep 17 00:00:00 2001 From: Tariq Ibrahim Date: Thu, 1 Feb 2024 23:05:54 -0800 Subject: [PATCH] lookup the nvidia/nvidia-frontend device by device major number Signed-off-by: Tariq Ibrahim --- .../system/create-dev-char-symlinks/all.go | 54 +++++++++----- internal/info/proc/devices/devices.go | 72 ++++++++++++++++--- internal/info/proc/devices/devices_mock.go | 54 +++++++------- internal/info/proc/devices/devices_test.go | 2 +- internal/system/nvdevices/devices.go | 26 ++++--- internal/system/nvdevices/devices_test.go | 53 +++++++++++--- internal/system/nvdevices/options.go | 11 ++- 7 files changed, 195 insertions(+), 77 deletions(-) diff --git a/cmd/nvidia-ctk/system/create-dev-char-symlinks/all.go b/cmd/nvidia-ctk/system/create-dev-char-symlinks/all.go index cafb8f9c..ce964f01 100644 --- a/cmd/nvidia-ctk/system/create-dev-char-symlinks/all.go +++ b/cmd/nvidia-ctk/system/create-dev-char-symlinks/all.go @@ -17,6 +17,7 @@ package devchar import ( + "errors" "fmt" "path/filepath" @@ -28,18 +29,23 @@ import ( ) type allPossible struct { - logger logger.Interface - devRoot string - deviceMajors devices.Devices - migCaps nvcaps.MigCaps + logger logger.Interface + devRoot string + devices devices.Devices[devices.Name, devices.Major] + migCaps nvcaps.MigCaps } // newAllPossible returns a new allPossible device node lister. // This lister lists all possible device nodes for NVIDIA GPUs, control devices, and capability devices. func newAllPossible(logger logger.Interface, devRoot string) (nodeLister, error) { - deviceMajors, err := devices.GetNVIDIADevices() + nvdevices, err := devices.GetNVIDIADevices() if err != nil { - return nil, fmt.Errorf("failed reading device majors: %v", err) + return nil, fmt.Errorf("failed reading devices: %w", err) + } + + nvdeviceMajors, err := devices.GetNVIDIADeviceMajors() + if err != nil { + return nil, fmt.Errorf("failed reading device major map: %w", err) } var requiredMajors []devices.Name @@ -53,18 +59,22 @@ func newAllPossible(logger logger.Interface, devRoot string) (nodeLister, error) requiredMajors = append(requiredMajors, devices.NVIDIACaps) } - requiredMajors = append(requiredMajors, devices.NVIDIAGPU, devices.NVIDIAUVM) + requiredMajors = append(requiredMajors, devices.NVIDIAUVM) for _, name := range requiredMajors { - if !deviceMajors.Exists(name) { - return nil, fmt.Errorf("missing required device major %s", name) + if !nvdevices.Exists(name) { + return nil, fmt.Errorf("missing required device %s", name) } } + if !nvdeviceMajors.Exists(devices.Major(devices.NVIDIAGPUMajor)) { + return nil, errors.New("missing required device nvidia/nvidia-frontend") + } + l := allPossible{ - logger: logger, - devRoot: devRoot, - deviceMajors: deviceMajors, - migCaps: migCaps, + logger: logger, + devRoot: devRoot, + devices: nvdevices, + migCaps: migCaps, } return l, nil @@ -105,8 +115,8 @@ func (m allPossible) getControlDeviceNodes() ([]deviceNode, error) { // Define the control devices for standard GPUs. controlDevices := []deviceNode{ - m.newDeviceNode(devices.NVIDIAGPU, "/dev/nvidia-modeset", devices.NVIDIAModesetMinor), - m.newDeviceNode(devices.NVIDIAGPU, "/dev/nvidiactl", devices.NVIDIACTLMinor), + m.newDeviceNodeFromMajor(devices.NVIDIAGPUMajor, "/dev/nvidia-modeset", devices.NVIDIAModesetMinor), + m.newDeviceNodeFromMajor(devices.NVIDIAGPUMajor, "/dev/nvidiactl", devices.NVIDIACTLMinor), m.newDeviceNode(devices.NVIDIAUVM, "/dev/nvidia-uvm", devices.NVIDIAUVMMinor), m.newDeviceNode(devices.NVIDIAUVM, "/dev/nvidia-uvm-tools", devices.NVIDIAUVMToolsMinor), } @@ -133,8 +143,8 @@ func (m allPossible) getControlDeviceNodes() ([]deviceNode, error) { // getGPUDeviceNodes generates a list of device nodes for a given GPU. func (m allPossible) getGPUDeviceNodes(gpu int) []deviceNode { - d := m.newDeviceNode( - devices.NVIDIAGPU, + d := m.newDeviceNodeFromMajor( + devices.NVIDIAGPUMajor, fmt.Sprintf("/dev/nvidia%d", gpu), gpu, ) @@ -178,7 +188,7 @@ func (m allPossible) getNVCapDeviceNodes(gpu int) []deviceNode { // newDeviceNode creates a new device node with the specified path and major/minor numbers. // The path is adjusted for the specified driver root. func (m allPossible) newDeviceNode(deviceName devices.Name, path string, minor int) deviceNode { - major, _ := m.deviceMajors.Get(deviceName) + major, _ := m.devices.Get(deviceName) return deviceNode{ path: filepath.Join(m.devRoot, path), @@ -186,3 +196,11 @@ func (m allPossible) newDeviceNode(deviceName devices.Name, path string, minor i minor: uint32(minor), } } + +func (m allPossible) newDeviceNodeFromMajor(deviceMajor devices.Major, path string, minor int) deviceNode { + return deviceNode{ + path: filepath.Join(m.devRoot, path), + major: uint32(deviceMajor), + minor: uint32(minor), + } +} diff --git a/internal/info/proc/devices/devices.go b/internal/info/proc/devices/devices.go index 95430428..4008c1b9 100644 --- a/internal/info/proc/devices/devices.go +++ b/internal/info/proc/devices/devices.go @@ -32,10 +32,10 @@ const ( NVIDIACTLMinor = 255 NVIDIAModesetMinor = 254 - NVIDIAFrontend = Name("nvidia-frontend") - NVIDIAGPU = NVIDIAFrontend - NVIDIACaps = Name("nvidia-caps") - NVIDIAUVM = Name("nvidia-uvm") + NVIDIAGPUMajor = 195 + + NVIDIACaps = Name("nvidia-caps") + NVIDIAUVM = Name("nvidia-uvm") procDevicesPath = "/proc/devices" nvidiaDevicePrefix = "nvidia" @@ -50,14 +50,14 @@ type Major int // Devices represents the set of devices under /proc/devices // //go:generate moq -stub -out devices_mock.go . Devices -type Devices interface { - Exists(Name) bool - Get(Name) (Major, bool) +type Devices[A, B any] interface { + Exists(A) bool + Get(A) (B, bool) } type devices map[Name]Major -var _ Devices = devices(nil) +var _ Devices[Name, Major] = devices(nil) // Exists checks if a Device with a given name exists or not func (d devices) Exists(name Name) bool { @@ -72,14 +72,19 @@ func (d devices) Get(name Name) (Major, bool) { } // GetNVIDIADevices returns the set of NVIDIA Devices on the machine -func GetNVIDIADevices() (Devices, error) { +func GetNVIDIADevices() (Devices[Name, Major], error) { return nvidiaDevices(procDevicesPath) } +// GetNVIDIADeviceMajors returns the set of NVIDIA Device Major numbers on the machine +func GetNVIDIADeviceMajors() (Devices[Major, Name], error) { + return nvidiaDeviceMajors(procDevicesPath) +} + // nvidiaDevices returns the set of NVIDIA Devices from the specified devices file. // This is useful for testing since we may be testing on a system where `/proc/devices` does // contain a reference to NVIDIA devices. -func nvidiaDevices(devicesPath string) (Devices, error) { +func nvidiaDevices(devicesPath string) (Devices[Name, Major], error) { devicesFile, err := os.Open(devicesPath) if os.IsNotExist(err) { return nil, nil @@ -92,6 +97,20 @@ func nvidiaDevices(devicesPath string) (Devices, error) { return nvidiaDeviceFrom(devicesFile) } +// nvidiaDeviceMajors returns the map of NVIDIA Device Major numbers and their corresponding device names +func nvidiaDeviceMajors(devicesPath string) (Devices[Major, Name], error) { + devicesFile, err := os.Open(devicesPath) + if os.IsNotExist(err) { + return nil, nil + } + if err != nil { + return nil, fmt.Errorf("error opening devices file: %v", err) + } + defer devicesFile.Close() + + return nvidiaDeviceMajorFrom(devicesFile) +} + var errNoNvidiaDevices = errors.New("no NVIDIA devices found") func nvidiaDeviceFrom(reader io.Reader) (devices, error) { @@ -139,3 +158,36 @@ func processProcDeviceLine(line string) (Name, Major, error) { return "", 0, fmt.Errorf("unparsable line: %v", line) } + +type deviceMajors map[Major]Name + +// Exists checks if a Device with a given name exists or not +func (d deviceMajors) Exists(m Major) bool { + _, exists := d[m] + return exists +} + +// Get a Device from Devices +func (d deviceMajors) Get(m Major) (Name, bool) { + device, exists := d[m] + return device, exists +} + +func nvidiaDeviceMajorFrom(reader io.Reader) (deviceMajors, error) { + allDevices := devicesFrom(reader) + nvidiaDeviceMajors := make(deviceMajors) + + var hasNvidiaDevices bool + for n, d := range allDevices { + if !strings.HasPrefix(string(n), nvidiaDevicePrefix) { + continue + } + nvidiaDeviceMajors[d] = n + hasNvidiaDevices = true + } + + if !hasNvidiaDevices { + return nil, errNoNvidiaDevices + } + return nvidiaDeviceMajors, nil +} diff --git a/internal/info/proc/devices/devices_mock.go b/internal/info/proc/devices/devices_mock.go index 315541c2..5d5ed2ab 100644 --- a/internal/info/proc/devices/devices_mock.go +++ b/internal/info/proc/devices/devices_mock.go @@ -9,7 +9,7 @@ import ( // Ensure, that DevicesMock does implement Devices. // If this is not the case, regenerate this file with moq. -var _ Devices = &DevicesMock{} +var _ Devices[any, any] = &DevicesMock[any, any]{} // DevicesMock is a mock implementation of Devices. // @@ -17,10 +17,10 @@ var _ Devices = &DevicesMock{} // // // make and configure a mocked Devices // mockedDevices := &DevicesMock{ -// ExistsFunc: func(name Name) bool { +// ExistsFunc: func(v A) bool { // panic("mock out the Exists method") // }, -// GetFunc: func(name Name) (Major, bool) { +// GetFunc: func(v A) (B, bool) { // panic("mock out the Get method") // }, // } @@ -29,24 +29,24 @@ var _ Devices = &DevicesMock{} // // and then make assertions. // // } -type DevicesMock struct { +type DevicesMock[A any, B any] struct { // ExistsFunc mocks the Exists method. - ExistsFunc func(name Name) bool + ExistsFunc func(v A) bool // GetFunc mocks the Get method. - GetFunc func(name Name) (Major, bool) + GetFunc func(v A) (B, bool) // calls tracks calls to the methods. calls struct { // Exists holds details about calls to the Exists method. Exists []struct { - // Name is the name argument value. - Name Name + // V is the v argument value. + V A } // Get holds details about calls to the Get method. Get []struct { - // Name is the name argument value. - Name Name + // V is the v argument value. + V A } } lockExists sync.RWMutex @@ -54,11 +54,11 @@ type DevicesMock struct { } // Exists calls ExistsFunc. -func (mock *DevicesMock) Exists(name Name) bool { +func (mock *DevicesMock[A, B]) Exists(v A) bool { callInfo := struct { - Name Name + V A }{ - Name: name, + V: v, } mock.lockExists.Lock() mock.calls.Exists = append(mock.calls.Exists, callInfo) @@ -69,18 +69,18 @@ func (mock *DevicesMock) Exists(name Name) bool { ) return bOut } - return mock.ExistsFunc(name) + return mock.ExistsFunc(v) } // ExistsCalls gets all the calls that were made to Exists. // Check the length with: // // len(mockedDevices.ExistsCalls()) -func (mock *DevicesMock) ExistsCalls() []struct { - Name Name +func (mock *DevicesMock[A, B]) ExistsCalls() []struct { + V A } { var calls []struct { - Name Name + V A } mock.lockExists.RLock() calls = mock.calls.Exists @@ -89,34 +89,34 @@ func (mock *DevicesMock) ExistsCalls() []struct { } // Get calls GetFunc. -func (mock *DevicesMock) Get(name Name) (Major, bool) { +func (mock *DevicesMock[A, B]) Get(v A) (B, bool) { callInfo := struct { - Name Name + V A }{ - Name: name, + V: v, } mock.lockGet.Lock() mock.calls.Get = append(mock.calls.Get, callInfo) mock.lockGet.Unlock() if mock.GetFunc == nil { var ( - majorOut Major - bOut bool + vOut B + bOut bool ) - return majorOut, bOut + return vOut, bOut } - return mock.GetFunc(name) + return mock.GetFunc(v) } // GetCalls gets all the calls that were made to Get. // Check the length with: // // len(mockedDevices.GetCalls()) -func (mock *DevicesMock) GetCalls() []struct { - Name Name +func (mock *DevicesMock[A, B]) GetCalls() []struct { + V A } { var calls []struct { - Name Name + V A } mock.lockGet.RLock() calls = mock.calls.Get diff --git a/internal/info/proc/devices/devices_test.go b/internal/info/proc/devices/devices_test.go index 037ccbe0..c1e899c9 100644 --- a/internal/info/proc/devices/devices_test.go +++ b/internal/info/proc/devices/devices_test.go @@ -99,6 +99,6 @@ func TestProcessDeviceFileLine(t *testing.T) { } // testDevices creates a set of test NVIDIA devices -func testDevices(d map[Name]Major) Devices { +func testDevices(d map[Name]Major) Devices[Name, Major] { return devices(d) } diff --git a/internal/system/nvdevices/devices.go b/internal/system/nvdevices/devices.go index 1da030dc..cde7a492 100644 --- a/internal/system/nvdevices/devices.go +++ b/internal/system/nvdevices/devices.go @@ -31,7 +31,8 @@ var errInvalidDeviceNode = errors.New("invalid device node") // Interface provides a set of utilities for interacting with NVIDIA devices on the system. type Interface struct { - devices.Devices + devices devices.Devices[devices.Name, devices.Major] + deviceMajors devices.Devices[devices.Major, devices.Name] logger logger.Interface @@ -55,12 +56,19 @@ func New(opts ...Option) (*Interface, error) { if i.devRoot == "" { i.devRoot = "/" } - if i.Devices == nil { - devices, err := devices.GetNVIDIADevices() + if i.devices == nil { + nvdevices, err := devices.GetNVIDIADevices() if err != nil { - return nil, fmt.Errorf("failed to create devices info: %v", err) + return nil, fmt.Errorf("failed to create devices info: %w", err) } - i.Devices = devices + i.devices = nvdevices + } + if i.deviceMajors == nil { + nvdeviceMajors, err := devices.GetNVIDIADeviceMajors() + if err != nil { + return nil, fmt.Errorf("failed to create devices info: %w", err) + } + i.deviceMajors = nvdeviceMajors } if i.dryRun { @@ -103,7 +111,7 @@ func (m *Interface) CreateNVIDIADevice(node string) error { return m.createDeviceNode(filepath.Join("dev", node), int(major), int(minor)) } -// createDeviceNode creates the specified device node with the require major and minor numbers. +// createDeviceNode creates the specified device node with the required major and minor numbers. // If a devRoot is configured, this is prepended to the path. func (m *Interface) createDeviceNode(path string, major int, minor int) error { path = filepath.Join(m.devRoot, path) @@ -124,9 +132,11 @@ func (m *Interface) Major(node string) (int64, error) { var major devices.Major switch node { case "nvidia-uvm", "nvidia-uvm-tools": - major, valid = m.Get(devices.NVIDIAUVM) + major, valid = m.devices.Get(devices.NVIDIAUVM) case "nvidia-modeset", "nvidiactl": - major, valid = m.Get(devices.NVIDIAGPU) + if m.deviceMajors.Exists(devices.NVIDIAGPUMajor) { + return int64(devices.NVIDIAGPUMajor), nil + } } if valid { diff --git a/internal/system/nvdevices/devices_test.go b/internal/system/nvdevices/devices_test.go index 388f91dc..770f9d7f 100644 --- a/internal/system/nvdevices/devices_test.go +++ b/internal/system/nvdevices/devices_test.go @@ -29,7 +29,7 @@ import ( func TestCreateControlDevices(t *testing.T) { logger, _ := testlog.NewNullLogger() - nvidiaDevices := &devices.DevicesMock{ + nvidiaDevices := &devices.DevicesMock[devices.Name, devices.Major]{ GetFunc: func(name devices.Name) (devices.Major, bool) { devices := map[devices.Name]devices.Major{ "nvidia-frontend": 195, @@ -39,12 +39,31 @@ func TestCreateControlDevices(t *testing.T) { }, } + nvidiaDeviceMajors := &devices.DevicesMock[devices.Major, devices.Name]{ + GetFunc: func(deviceMajor devices.Major) (devices.Name, bool) { + deviceMajors := map[devices.Major]devices.Name{ + 195: "nvidia-frontend", + 243: "nvidia-uvm", + } + return deviceMajors[deviceMajor], true + }, + ExistsFunc: func(deviceMajor devices.Major) bool { + deviceMajors := map[devices.Major]devices.Name{ + 195: "nvidia-frontend", + 243: "nvidia-uvm", + } + _, ok := deviceMajors[deviceMajor] + return ok + }, + } + mknodeError := errors.New("mknode error") testCases := []struct { description string root string - devices devices.Devices + devices devices.Devices[devices.Name, devices.Major] + deviceMajors devices.Devices[devices.Major, devices.Name] mknodeError error expectedError error expectedCalls []struct { @@ -54,10 +73,11 @@ func TestCreateControlDevices(t *testing.T) { } }{ { - description: "no root specified", - root: "", - devices: nvidiaDevices, - mknodeError: nil, + description: "no root specified", + root: "", + devices: nvidiaDevices, + deviceMajors: nvidiaDeviceMajors, + mknodeError: nil, expectedCalls: []struct { S string N1 int @@ -70,10 +90,11 @@ func TestCreateControlDevices(t *testing.T) { }, }, { - description: "some root specified", - root: "/some/root", - devices: nvidiaDevices, - mknodeError: nil, + description: "some root specified", + root: "/some/root", + devices: nvidiaDevices, + deviceMajors: nvidiaDeviceMajors, + mknodeError: nil, expectedCalls: []struct { S string N1 int @@ -88,6 +109,7 @@ func TestCreateControlDevices(t *testing.T) { { description: "mknod error returns error", devices: nvidiaDevices, + deviceMajors: nvidiaDeviceMajors, mknodeError: mknodeError, expectedError: mknodeError, // We expect the first call to this to fail, and the rest to be skipped @@ -101,11 +123,19 @@ func TestCreateControlDevices(t *testing.T) { }, { description: "missing major returns error", - devices: &devices.DevicesMock{ + devices: &devices.DevicesMock[devices.Name, devices.Major]{ GetFunc: func(name devices.Name) (devices.Major, bool) { return 0, false }, }, + deviceMajors: &devices.DevicesMock[devices.Major, devices.Name]{ + GetFunc: func(v devices.Major) (devices.Name, bool) { + return "", false + }, + ExistsFunc: func(v devices.Major) bool { + return false + }, + }, expectedError: errInvalidDeviceNode, }, } @@ -122,6 +152,7 @@ func TestCreateControlDevices(t *testing.T) { WithLogger(logger), WithDevRoot(tc.root), WithDevices(tc.devices), + WithDeviceMajors(tc.deviceMajors), ) d.mknoder = mknode diff --git a/internal/system/nvdevices/options.go b/internal/system/nvdevices/options.go index 0bcf319d..db71b905 100644 --- a/internal/system/nvdevices/options.go +++ b/internal/system/nvdevices/options.go @@ -46,8 +46,15 @@ func WithDevRoot(devRoot string) Option { } // WithDevices sets the devices for the Interface struct. -func WithDevices(devices devices.Devices) Option { +func WithDevices(devices devices.Devices[devices.Name, devices.Major]) Option { return func(i *Interface) { - i.Devices = devices + i.devices = devices + } +} + +// WithDeviceMajors sets the deviceMajors for the Interface struct. +func WithDeviceMajors(deviceMajors devices.Devices[devices.Major, devices.Name]) Option { + return func(i *Interface) { + i.deviceMajors = deviceMajors } }