Add support for GDRCopy

Tested log:
```
sudo COS_GPU_INSTALLER=us.gcr.io/cloud-kernel-build/cos-gpu-installer:latest cos-extensions install gpu -- -debug -test -gdr -gcs-download-prefix=lakitu-release-tryjob/R129-19275.0.0-90a6ad3f -gcs-download-bucket=cos-infra-prod-artifacts-presubmit
I0910 05:26:43.969611    1423 installer.go:934] Applying default module parameter: use_persistent_mapping=1
I0910 05:26:43.969635    1423 installer.go:934] Applying default module parameter: dbg_enabled=0
I0910 05:26:43.969642    1423 installer.go:934] Applying default module parameter: info_enabled=0
I0910 05:26:43.969647    1423 installer.go:939] Loading GDRCopy kernel module with dependencies.
I0910 05:26:43.973776    1423 modules.go:190] loading module: /usr/sbin/insmod /usr/local/nvidia/drivers/gdrdrv.ko use_persistent_mapping=1 dbg_enabled=0 info_enabled=0
I0910 05:26:44.024876    1423 installer.go:949] GDRCopy driver major is 241
I0910 05:26:44.024929    1423 installer.go:958] Creating device node /dev/gdrdrv
I0910 05:26:44.024955    1423 install.go:606] GDRCopy driver and device node created successfully.
```

BUG=b/428981220
TEST=Tested in a presubmit GPU VM with GDRCopy kernel module installed.
RELEASE_NOTE=None

Change-Id: Iab353c605ddf3d11643f883391eb30932a0ac911
Reviewed-on: https://cos-review.googlesource.com/c/cos/tools/+/110701
Tested-by: Chenglong Tang <chenglongtang@google.com>
Cloud-Build: GCB Service account <228075978874@cloudbuild.gserviceaccount.com>
Reviewed-by: Kevin Berry <kpberry@google.com>
Reviewed-by: Shuo Yang <gshuoy@google.com>
diff --git a/src/cmd/cos_gpu_installer/internal/commands/install.go b/src/cmd/cos_gpu_installer/internal/commands/install.go
index eb532ee..1ed27c4 100644
--- a/src/cmd/cos_gpu_installer/internal/commands/install.go
+++ b/src/cmd/cos_gpu_installer/internal/commands/install.go
@@ -104,6 +104,7 @@
 	hostInstallDir          string
 	forceFallback           FallBackFlag
 	unsignedDriver          bool
+	gdrCopy                 bool
 	gcsDownloadBucket       string
 	gcsDownloadPrefix       string
 	nvidiaInstallerURL      string
@@ -184,11 +185,12 @@
 	f.BoolVar(&c.noVerify, "no-verify", false, "Skip kernel module loading and installation verification. Useful for preloading drivers without attached GPU.")
 	f.BoolVar(&c.skipNvidiaSmi, "skip-nvidia-smi", false, "This flag disables the execution of nvidia-smi verification.")
 	c.kernelModuleParams = modules.NewModuleParameters()
-	f.Var(&c.kernelModuleParams, "module-arg", "Kernel module parameters can be specified using this flag. These parameters are used while loading the specific kernel mode drivers into the kernel. Usage: -module-arg <module-x>.<parameter-y>=<value> -module-arg <module-y>.<parameter-z>=<value> ..    For eg: –module-arg nvidia_uvm.uvm_debug_prints=1 –module-arg nvidia.NVreg_EnableGpuFirmware=0.")
+	f.Var(&c.kernelModuleParams, "module-arg", "Kernel module parameters can be specified using this flag. These parameters are used while loading the specific kernel mode drivers into the kernel. Usage: -module-arg <module-x>.<parameter-y>=<value> -module-arg <module-y>.<parameter-z>=<value> ..	For eg: –module-arg nvidia_uvm.uvm_debug_prints=1 –module-arg nvidia.NVreg_EnableGpuFirmware=0.")
 	f.Var(&c.forceFallback, "force-fallback", "This flag specify whether to use fallback mechanism when specified GPU driver is not compatible with GPU devices.\n"+
 		"If unspecified, it is `false` for --version=R<major-version> eg. 'R470', 'R525' or --version=<precise-version> eg. '535.129.03', '525.147.05', it is `true` for version is not specified or --version=default or --version=latest.\n"+
 		"When fallback behavior is active, the installer will find a compatible driver to install for the detected GPU on the VM.")
 	f.StringVar(&c.localArtifactsDir, "local-artifacts-dir", "", "Local directory where NVIDIA driver artifacts are stored. If set, artifacts will be copied from this directory instead of downloaded from GCS.")
+	f.BoolVar(&c.gdrCopy, "gdr", false, "Install GDRCopy driver.")
 }
 
 func (c *InstallCommand) validateFlags() error {
@@ -353,6 +355,15 @@
 				c.logError(errors.Wrap(err, "failed to verify GPU driver installation"))
 				return subcommands.ExitFailure
 			}
+
+			if c.gdrCopy {
+				if err := installer.InstallGDRCopy(c.noVerify, c.kernelModuleParams); err != nil {
+					c.logError(err)
+					return subcommands.ExitFailure
+				}
+				log.V(1).Info("GDRCopy driver and device node created successfully.")
+			}
+
 			if err := modules.UpdateHostLdCache(hostRootPath, filepath.Join(c.hostInstallDir, "lib64")); err != nil {
 				c.logError(errors.Wrap(err, "failed to update host ld cache"))
 				return subcommands.ExitFailure
@@ -532,6 +543,14 @@
 	if err := installer.VerifyDriverInstallation(c.noVerify, c.debug, c.skipNvidiaSmi); err != nil {
 		return errors.Wrap(err, "failed to verify installation")
 	}
+
+	if c.gdrCopy {
+		if err := installer.InstallGDRCopy(c.noVerify, c.kernelModuleParams); err != nil {
+			return err
+		}
+		log.V(1).Info("GDRCopy driver and device node created successfully.")
+	}
+
 	if err := modules.UpdateHostLdCache(hostRootPath, filepath.Join(c.hostInstallDir, "lib64")); err != nil {
 		return errors.Wrap(err, "failed to update host ld cache")
 	}
@@ -580,6 +599,12 @@
 	if err := installer.VerifyDriverInstallation(c.noVerify, c.debug, c.skipNvidiaSmi); err != nil {
 		return errors.Wrap(err, "failed to verify installation")
 	}
+	if c.gdrCopy {
+		if err := installer.InstallGDRCopy(c.noVerify, c.kernelModuleParams); err != nil {
+			return err
+		}
+		log.V(1).Info("GDRCopy driver and device node created successfully.")
+	}
 	if err := modules.UpdateHostLdCache(hostRootPath, filepath.Join(c.hostInstallDir, "lib64")); err != nil {
 		return errors.Wrap(err, "failed to update host ld cache")
 	}
diff --git a/src/cmd/cos_gpu_installer/internal/installer/installer.go b/src/cmd/cos_gpu_installer/internal/installer/installer.go
index 43e97ee..58f55e8 100644
--- a/src/cmd/cos_gpu_installer/internal/installer/installer.go
+++ b/src/cmd/cos_gpu_installer/internal/installer/installer.go
@@ -13,6 +13,7 @@
 	"path/filepath"
 	"regexp"
 	"sort"
+	"strconv"
 	"strings"
 	"syscall"
 
@@ -42,6 +43,9 @@
 	LatestVersion                      = "latest"
 	MajorGPUDriverArtifactPrefix       = "gpu_"
 	MajorGPUDriverArtifactSuffix       = "_version"
+	gdrdrvDevicePath                   = "/dev/gdrdrv"
+	gdrdrvModuleName                   = "gdrdrv"
+	procDevicesPath                    = "/proc/devices"
 )
 
 var (
@@ -861,3 +865,114 @@
 	return nil
 
 }
+
+// isDeviceRegistered checks if a device is registered in /proc/devices
+// and returns true and its major number if it is.
+func isDeviceRegistered(deviceName string) (bool, int) {
+	content, err := ioutil.ReadFile(procDevicesPath)
+	if err != nil {
+		log.Errorf("Failed to read %s: %v", procDevicesPath, err)
+		return false, 0
+	}
+
+	lines := strings.Split(string(content), "\n")
+	for _, line := range lines {
+		fields := strings.Fields(line)
+		if len(fields) == 2 && fields[1] == deviceName {
+			major, err := strconv.Atoi(fields[0])
+			if err != nil {
+				log.Errorf("Failed to parse major number for %s: %v", deviceName, err)
+				return false, 0
+			}
+			return true, major
+		}
+	}
+	return false, 0
+}
+
+// mergeModuleParams takes a list of user-provided parameters (as "key=val" strings)
+// and a map of default parameters. It returns a final slice of parameters, ensuring
+// that any key provided by the user is not overridden by a default.
+func mergeModuleParams(userParams []string, defaults map[string]string) []string {
+	userSetKeys := make(map[string]bool)
+	finalParamsList := []string{}
+
+	// 1. Add all user params first and record which keys they set.
+	for _, userParam := range userParams {
+		finalParamsList = append(finalParamsList, userParam)
+		if key, _, found := strings.Cut(userParam, "="); found {
+			userSetKeys[key] = true
+		}
+	}
+
+	// 2. Add defaults ONLY if the key wasn't already set by the user.
+	for key, value := range defaults {
+		if !userSetKeys[key] {
+			paramString := fmt.Sprintf("%s=%s", key, value)
+			finalParamsList = append(finalParamsList, paramString)
+		}
+	}
+	return finalParamsList
+}
+
+// InstallGDRCopy loads the GDRCopy kernel module and creates its device node.
+// This should be run after the main NVIDIA kernel modules are loaded.
+// It follows https://github.com/NVIDIA/gdrcopy/blob/master/insmod.sh.
+func InstallGDRCopy(noVerify bool, moduleParams modules.ModuleParameters) error {
+	if noVerify {
+		log.Info("Flag --no-verify is set, skipping GDRCopy installation.")
+		return nil
+	}
+
+	kernelModulePath := filepath.Join(gpuInstallDirContainer, "drivers")
+
+	// 1. Define the gdrdrv module.
+	gdrModule := &modules.Module{
+		Name: gdrdrvModuleName,
+		Path: filepath.Join(kernelModulePath, "gdrdrv.ko"),
+	}
+
+	// Set default module parameters if the user did not provide them.
+	// Flags are defined here: https://github.com/NVIDIA/gdrcopy/blob/master/insmod.sh#L28.
+	defaults := map[string]string{
+		"dbg_enabled":            "0",
+		"info_enabled":           "0",
+		"use_persistent_mapping": "1",
+	}
+
+	// Call our tested helper function to get the final parameter list.
+	userGDRParams := moduleParams[gdrdrvModuleName]
+	finalGDRParams := mergeModuleParams(userGDRParams, defaults)
+
+	// Assign the merged list back to the global map to be passed to LoadModule.
+	moduleParams[gdrdrvModuleName] = finalGDRParams
+	log.V(1).Infof("Applying final parameters for %s: %v", gdrdrvModuleName, finalGDRParams)
+
+	// 2. Load the module.
+	log.V(1).Info("Loading GDRCopy kernel module with dependencies.")
+	if err := modules.LoadModule(gdrModule, moduleParams); err != nil {
+		return errors.Wrap(err, "failed to load gdrdrv kernel module")
+	}
+
+	// 3. Create the device node
+	isLoaded, major := isDeviceRegistered(gdrdrvModuleName)
+	if !isLoaded {
+		return stderrors.New("gdrdrv module loaded but device not found in /proc/devices")
+	}
+	log.Infof("GDRCopy driver major is %d", major)
+
+	if _, err := os.Stat(gdrdrvDevicePath); err == nil {
+		log.Infof("Removing old inode %s", gdrdrvDevicePath)
+		if err := os.Remove(gdrdrvDevicePath); err != nil {
+			return errors.Wrapf(err, "failed to remove existing device node %s", gdrdrvDevicePath)
+		}
+	}
+
+	log.Infof("Creating device node %s", gdrdrvDevicePath)
+	dev := unix.Mkdev(uint32(major), 0)
+	if err := unix.Mknod(gdrdrvDevicePath, unix.S_IFCHR|0666, int(dev)); err != nil {
+		return errors.Wrapf(err, "failed to create device node for %s", gdrdrvDevicePath)
+	}
+
+	return nil
+}
diff --git a/src/cmd/cos_gpu_installer/internal/installer/installer_test.go b/src/cmd/cos_gpu_installer/internal/installer/installer_test.go
index 4c015ce..a06bc7c 100644
--- a/src/cmd/cos_gpu_installer/internal/installer/installer_test.go
+++ b/src/cmd/cos_gpu_installer/internal/installer/installer_test.go
@@ -8,12 +8,14 @@
 	"os"
 	"path"
 	"path/filepath"
+	"sort"
 	"testing"
 
 	"cos.googlesource.com/cos/tools.git/src/pkg/cos"
 	"cos.googlesource.com/cos/tools.git/src/pkg/fakes"
 	"cos.googlesource.com/cos/tools.git/src/pkg/gpuconfig/pb"
 	"github.com/golang/protobuf/proto"
+	"github.com/google/go-cmp/cmp"
 )
 
 func TestDownloadGPUDriverVersionsProto(t *testing.T) {
@@ -204,3 +206,58 @@
 	return tarPath
 
 }
+
+func TestMergeModuleParams(t *testing.T) {
+	// These are the defaults from InstallGDRCopy
+	defaults := map[string]string{
+		"dbg_enabled":            "0",
+		"info_enabled":           "0",
+		"use_persistent_mapping": "1",
+	}
+
+	testCases := []struct {
+		name       string
+		userParams []string
+		want       []string
+	}{
+		{
+			name:       "No user params, all defaults applied",
+			userParams: []string{},
+			want:       []string{"dbg_enabled=0", "info_enabled=0", "use_persistent_mapping=1"},
+		},
+		{
+			name:       "User overrides one default",
+			userParams: []string{"dbg_enabled=1"},
+			want:       []string{"dbg_enabled=1", "info_enabled=0", "use_persistent_mapping=1"},
+		},
+		{
+			name:       "User adds a custom param",
+			userParams: []string{"foo=bar"},
+			want:       []string{"foo=bar", "dbg_enabled=0", "info_enabled=0", "use_persistent_mapping=1"},
+		},
+		{
+			name:       "User overrides one and adds one",
+			userParams: []string{"use_persistent_mapping=0", "custom=true"},
+			want:       []string{"use_persistent_mapping=0", "custom=true", "dbg_enabled=0", "info_enabled=0"},
+		},
+		{
+			name:       "User overrides all defaults",
+			userParams: []string{"dbg_enabled=1", "info_enabled=1", "use_persistent_mapping=0"},
+			want:       []string{"dbg_enabled=1", "info_enabled=1", "use_persistent_mapping=0"},
+		},
+	}
+
+	for _, tc := range testCases {
+		t.Run(tc.name, func(t *testing.T) {
+			got := mergeModuleParams(tc.userParams, defaults)
+
+			// Sort slices before comparing since parameter order does not matter.
+			sort.Strings(got)
+			sort.Strings(tc.want)
+
+			if diff := cmp.Diff(tc.want, got); diff != "" {
+				t.Errorf("mergeModuleParams() mismatch (-want +got):\n%s", diff)
+			}
+		})
+	}
+}