From a4301f8e7bdfe4925aab77d9a89e56466490bf33 Mon Sep 17 00:00:00 2001
From: AMIYAMAITY
Date: Mon, 29 Jan 2024 13:01:35 +0530
Subject: [PATCH 1/2] Add socket ID env var for multi-GPU configuration

---
 kubernetes/device-plugin/server.go | 15 +++++++++++++--
 src/comm.c                         |  7 ++++++-
 2 files changed, 19 insertions(+), 3 deletions(-)

diff --git a/kubernetes/device-plugin/server.go b/kubernetes/device-plugin/server.go
index 70bb396..1246ae0 100644
--- a/kubernetes/device-plugin/server.go
+++ b/kubernetes/device-plugin/server.go
@@ -25,6 +25,7 @@ import (
 	"log"
 	"net"
 	"os"
+        "strings"
 
 	"golang.org/x/net/context"
 	"google.golang.org/grpc"
@@ -218,6 +219,12 @@ func (m *NvshareDevicePlugin) ListAndWatch(e *pluginapi.Empty, s pluginapi.Devic
  */
 func (m *NvshareDevicePlugin) Allocate(ctx context.Context, reqs *pluginapi.AllocateRequest) (*pluginapi.AllocateResponse, error) {
 	log.SetOutput(os.Stderr)
+
+	socketId := os.Getenv("NVSHARE_SOCK_ID")
+	if len(socketId) == 0 {
+		socketId = "0"
+	}
+
 	responses := pluginapi.AllocateResponse{}
 	for _, req := range reqs.ContainerRequests {
 		for _, id := range req.DevicesIDs {
@@ -249,10 +256,14 @@ func (m *NvshareDevicePlugin) Allocate(ctx context.Context, reqs *pluginapi.Allo
 			ReadOnly:      true,
 		}
 		mounts = append(mounts, mount)
+
+		SocketHostPathNew := strings.Split(SocketHostPath, ".sock")[0] + socketId + ".sock"
+		SocketContainerPathNew := strings.Split(SocketContainerPath, ".sock")[0] + socketId + ".sock"
+
 		/* Mount scheduler socket */
 		mount = &pluginapi.Mount{
-			HostPath:      SocketHostPath,
-			ContainerPath: SocketContainerPath,
+			HostPath:      SocketHostPathNew,
+			ContainerPath: SocketContainerPathNew,
 			ReadOnly:      true,
 		}
 		mounts = append(mounts, mount)
diff --git a/src/comm.c b/src/comm.c
index 1a2a7fc..eb390a7 100644
--- a/src/comm.c
+++ b/src/comm.c
@@ -80,9 +80,14 @@ int nvshare_get_scheduler_path(char *sock_path)
 
 	offset = ret; /* Start from the trailing NULL byte */
 
+	/* Get the scheduler socket ID from the environment */
+	char *sock_id = getenv("NVSHARE_SOCK_ID");
+	if (sock_id == NULL) {
+		sock_id = "0";
+	}
 	/* TODO: Ensure it fits in sock_path, check return value */
 	ret = snprintf(sock_path + offset, NVSHARE_SOCK_PATH_MAX - offset,
-	               "%s", "scheduler.sock");
+	               "%s%s%s", "scheduler", sock_id, ".sock");
 
 	return 0;
 }

From 2e9cef961355ed6816741780c40385c3d8500227 Mon Sep 17 00:00:00 2001
From: AMIYAMAITY
Date: Fri, 2 Feb 2024 13:31:42 +0530
Subject: [PATCH 2/2] Update socket ID for device plugin and update the manifests

---
 kubernetes/device-plugin/server.go      | 60 ++++++++++++++++++++-----
 kubernetes/manifests/device-plugin.yaml | 38 +++++++++++++---
 kubernetes/manifests/scheduler.yaml     | 28 +++++++++++-
 3 files changed, 109 insertions(+), 17 deletions(-)

diff --git a/kubernetes/device-plugin/server.go b/kubernetes/device-plugin/server.go
index 1246ae0..a7ccd01 100644
--- a/kubernetes/device-plugin/server.go
+++ b/kubernetes/device-plugin/server.go
@@ -25,7 +25,7 @@ import (
 	"log"
 	"net"
 	"os"
-        "strings"
+	"strings"
 
 	"golang.org/x/net/context"
 	"google.golang.org/grpc"
@@ -37,6 +37,8 @@ const (
 	serverSock = pluginapi.DevicePluginPath + "nvshare-device-plugin.sock"
 )
 
+
+
 type NvshareDevicePlugin struct {
 	devs   []*pluginapi.Device
 	socket string
@@ -48,9 +50,15 @@ type NvshareDevicePlugin struct {
 }
 
 func NewNvshareDevicePlugin() *NvshareDevicePlugin {
+	socketId := os.Getenv("NVSHARE_SOCK_ID")
+	if len(socketId) == 0 {
+		socketId = "0"
+	}
+
+	serverSockNew := strings.Split(serverSock, ".sock")[0] + socketId + ".sock"
 	return &NvshareDevicePlugin{
 		devs:   getDevices(),
-		socket: serverSock,
+		socket: serverSockNew,
 		stop:   make(chan interface{}),
 		health: make(chan *pluginapi.Device),
 
@@ -76,13 +84,19 @@ func (m *NvshareDevicePlugin) cleanup() {
 func (m *NvshareDevicePlugin) Start() error {
 	m.initialize()
 
+	socketId := os.Getenv("NVSHARE_SOCK_ID")
+	if len(socketId) == 0 {
+		socketId = "0"
+	}
+	resourceNameNew := resourceName + socketId
+
 	err := m.Serve()
 	if err != nil {
-		log.Printf("Could not start device plugin for '%s': %s", resourceName, err)
+		log.Printf("Could not start device plugin for '%s': %s", resourceNameNew, err)
 		m.cleanup()
 		return err
 	}
-	log.Printf("Starting to serve '%s' on %s", resourceName, m.socket)
+	log.Printf("Starting to serve '%s' on %s", resourceNameNew, m.socket)
 
 	err = m.Register()
 	if err != nil {
@@ -90,17 +104,24 @@ func (m *NvshareDevicePlugin) Start() error {
 		m.Stop()
 		return err
 	}
-	log.Printf("Registered device plugin for '%s' with Kubelet", resourceName)
+	log.Printf("Registered device plugin for '%s' with Kubelet", resourceNameNew)
 
 	return nil
 }
 
 /* Stop the gRPC server and clean up the UNIX socket file */
 func (m *NvshareDevicePlugin) Stop() error {
+
+	socketId := os.Getenv("NVSHARE_SOCK_ID")
+	if len(socketId) == 0 {
+		socketId = "0"
+	}
+	resourceNameNew := resourceName + socketId
+
 	if (m == nil) || (m.server == nil) {
 		return nil
 	}
-	log.Printf("Stopping to serve '%s' on %s\n", resourceName, m.socket)
+	log.Printf("Stopping to serve '%s' on %s\n", resourceNameNew, m.socket)
 	m.server.Stop()
 	err := os.Remove(m.socket)
 	if (err != nil) && (!os.IsNotExist(err)) {
@@ -112,6 +133,13 @@ func (m *NvshareDevicePlugin) Stop() error {
 
 /* Starts the gRPC server which serves incoming requests from kubelet */
 func (m *NvshareDevicePlugin) Serve() error {
+
+	socketId := os.Getenv("NVSHARE_SOCK_ID")
+	if len(socketId) == 0 {
+		socketId = "0"
+	}
+	resourceNameNew := resourceName + socketId
+
 	os.Remove(m.socket)
 	sock, err := net.Listen("unix", m.socket)
 	if err != nil {
@@ -124,17 +152,17 @@ func (m *NvshareDevicePlugin) Serve() error {
 	lastCrashTime := time.Now()
 	restartCount := 0
 	for {
-		log.Printf("Starting gRPC server for '%s'", resourceName)
+		log.Printf("Starting gRPC server for '%s'", resourceNameNew)
 		err := m.server.Serve(sock)
 		if err == nil {
 			break
 		}
 
 		log.Printf("GRPC server for '%s' crashed with error: %v",
-			resourceName, err)
+			resourceNameNew, err)
 
 		if restartCount > 5 {
-			log.Fatalf("GRPC server for '%s' has repeatedly crashed recently. Quitting", resourceName)
+			log.Fatalf("GRPC server for '%s' has repeatedly crashed recently. Quitting", resourceNameNew)
Quitting", resourceNameNew) } timeSinceLastCrash := time.Since(lastCrashTime).Seconds() lastCrashTime = time.Now() @@ -157,6 +185,14 @@ func (m *NvshareDevicePlugin) Serve() error { /* Registers the device plugin for resourceName with kubelet */ func (m *NvshareDevicePlugin) Register() error { + + socketId:= os.Getenv("NVSHARE_SOCK_ID") + if len(socketId) == 0 { + socketId = "0" + } + resourceNameNew := resourceName+socketId + + conn, err := m.dial(pluginapi.KubeletSocket, 5*time.Second) if err != nil { return err @@ -167,7 +203,7 @@ func (m *NvshareDevicePlugin) Register() error { reqt := &pluginapi.RegisterRequest{ Version: pluginapi.Version, Endpoint: path.Base(m.socket), - ResourceName: resourceName, + ResourceName: resourceNameNew, Options: &pluginapi.DevicePluginOptions{ GetPreferredAllocationAvailable: false, }, @@ -224,13 +260,15 @@ func (m *NvshareDevicePlugin) Allocate(ctx context.Context, reqs *pluginapi.Allo if len(socketId) == 0 { socketId = "0" } + + resourceNameNew := resourceName+socketId responses := pluginapi.AllocateResponse{} for _, req := range reqs.ContainerRequests { for _, id := range req.DevicesIDs { log.Printf("Received Allocate request for %s", id) if !m.deviceExists(id) { - return nil, fmt.Errorf("invalid allocation request for '%s' - unknown device: %s", resourceName, id) + return nil, fmt.Errorf("invalid allocation request for '%s' - unknown device: %s", resourceNameNew, id) } } diff --git a/kubernetes/manifests/device-plugin.yaml b/kubernetes/manifests/device-plugin.yaml index 81d67a5..0917e90 100644 --- a/kubernetes/manifests/device-plugin.yaml +++ b/kubernetes/manifests/device-plugin.yaml @@ -29,7 +29,7 @@ spec: priorityClassName: system-node-critical containers: - name: nvshare-lib - image: docker.io/grgalex/nvshare:libnvshare-v0.1-f654c296 + image: nvshare:libnvshare-a4301f8e command: - sleep - infinity @@ -46,6 +46,11 @@ spec: - "/bin/sh" - "-c" - "umount -v /host-var-run-nvshare/libnvshare.so && rm /host-var-run-nvshare/libnvshare.so" + env: + - name: NVIDIA_VISIBLE_DEVICES + value: "1" + - name: NVSHARE_SOCK_ID + value: "1" securityContext: # Necessary for mounts to work. 
           privileged: true
@@ -58,11 +63,37 @@ spec:
             # mount for /var/run/nvshare/libnvshare.so
             mountPropagation: Bidirectional
      - name: nvshare-device-plugin
-        image: docker.io/grgalex/nvshare:nvshare-device-plugin-v0.1-f654c296
+        image: nvshare:nvshare-device-plugin-a4301f8e
+        imagePullPolicy: IfNotPresent
+        env:
+        - name: NVSHARE_VIRTUAL_DEVICES
+          value: "10"
+        - name: NVIDIA_VISIBLE_DEVICES
+          value: "0"
+        - name: NVSHARE_SOCK_ID
+          value: "0"
+        - name: CUDA_VISIBLE_DEVICES
+          value: "0"
+        securityContext:
+          allowPrivilegeEscalation: false
+          capabilities:
+            drop: ["ALL"]
+        volumeMounts:
+        - name: device-plugin-socket
+          mountPath: /var/lib/kubelet/device-plugins
+
+      - name: nvshare-device-plugin-1
+        image: nvshare:nvshare-device-plugin-a4301f8e
         imagePullPolicy: IfNotPresent
         env:
         - name: NVSHARE_VIRTUAL_DEVICES
           value: "10"
+        - name: NVIDIA_VISIBLE_DEVICES
+          value: "1"
+        - name: NVSHARE_SOCK_ID
+          value: "1"
+        - name: CUDA_VISIBLE_DEVICES
+          value: "1"
         securityContext:
           allowPrivilegeEscalation: false
           capabilities:
@@ -70,9 +101,6 @@ spec:
         volumeMounts:
        - name: device-plugin-socket
          mountPath: /var/lib/kubelet/device-plugins
-        resources:
-          limits:
-            nvidia.com/gpu: 1
       volumes:
      - name: host-var-run-nvshare
        hostPath:
diff --git a/kubernetes/manifests/scheduler.yaml b/kubernetes/manifests/scheduler.yaml
index b84cf57..4a472d5 100644
--- a/kubernetes/manifests/scheduler.yaml
+++ b/kubernetes/manifests/scheduler.yaml
@@ -29,7 +29,7 @@ spec:
       priorityClassName: system-node-critical
       containers:
      - name: nvshare-scheduler
-        image: docker.io/grgalex/nvshare:nvshare-scheduler-v0.1-f654c296
+        image: nvshare:nvshare-scheduler-a4301f8e
         imagePullPolicy: IfNotPresent
         securityContext:
           allowPrivilegeEscalation: false
@@ -38,6 +38,32 @@ spec:
         volumeMounts:
        - name: nvshare-socket-directory
          mountPath: /var/run/nvshare
+        env:
+        - name: NVIDIA_VISIBLE_DEVICES
+          value: "0"
+        - name: NVSHARE_SOCK_ID
+          value: "0"
+        - name: CUDA_VISIBLE_DEVICES
+          value: "0"
+
+      - name: nvshare-scheduler-1
+        image: nvshare:nvshare-scheduler-a4301f8e
+        imagePullPolicy: IfNotPresent
+        securityContext:
+          allowPrivilegeEscalation: false
+          capabilities:
+            drop: ["ALL"]
+        volumeMounts:
+        - name: nvshare-socket-directory
+          mountPath: /var/run/nvshare
+        env:
+        - name: NVIDIA_VISIBLE_DEVICES
+          value: "1"
+        - name: NVSHARE_SOCK_ID
+          value: "1"
+        - name: CUDA_VISIBLE_DEVICES
+          value: "1"
+
       volumes:
      - name: nvshare-socket-directory
        hostPath:
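Note on the naming scheme used by both commits: the NVSHARE_SOCK_ID environment variable (defaulting to "0") is spliced in before the ".sock" suffix of each socket path and appended to the device-plugin resource name. The following is a minimal, self-contained Go sketch of that behaviour; the serverSock and resourceName values are illustrative only (the real constants live in the device-plugin sources), and this helper is not part of the patch itself.

package main

import (
	"fmt"
	"os"
	"strings"
)

// socketID mirrors the lookup added throughout server.go:
// read NVSHARE_SOCK_ID and fall back to "0" when it is unset or empty.
func socketID() string {
	id := os.Getenv("NVSHARE_SOCK_ID")
	if id == "" {
		return "0"
	}
	return id
}

// withSocketID inserts the socket ID before the ".sock" suffix,
// e.g. "scheduler.sock" becomes "scheduler1.sock" for ID "1".
func withSocketID(sockPath, id string) string {
	return strings.Split(sockPath, ".sock")[0] + id + ".sock"
}

func main() {
	// Illustrative values; not the actual constants from the plugin sources.
	serverSock := "/var/lib/kubelet/device-plugins/nvshare-device-plugin.sock"
	resourceName := "nvshare.com/gpu"

	id := socketID()
	fmt.Println(withSocketID(serverSock, id)) // per-GPU kubelet socket
	fmt.Println(resourceName + id)            // per-GPU resource name
}

With NVSHARE_SOCK_ID="1" this prints ".../nvshare-device-plugin1.sock" and "nvshare.com/gpu1", which is why the manifests above run one scheduler and one device-plugin container per GPU, each with its own NVSHARE_SOCK_ID value.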