| /* |
| Copyright 2018 The Kubernetes Authors. |
| |
| Licensed under the Apache License, Version 2.0 (the "License"); |
| you may not use this file except in compliance with the License. |
| You may obtain a copy of the License at |
| |
| http://www.apache.org/licenses/LICENSE-2.0 |
| |
| Unless required by applicable law or agreed to in writing, software |
| distributed under the License is distributed on an "AS IS" BASIS, |
| WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| See the License for the specific language governing permissions and |
| limitations under the License. |
| */ |
| |
| package e2enode |
| |
| import ( |
| "context" |
| "fmt" |
| "time" |
| |
| v1 "k8s.io/api/core/v1" |
| "k8s.io/apimachinery/pkg/api/resource" |
| metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" |
| kubeletconfig "k8s.io/kubernetes/pkg/kubelet/apis/config" |
| admissionapi "k8s.io/pod-security-admission/api" |
| |
| "k8s.io/kubernetes/test/e2e/framework" |
| e2enode "k8s.io/kubernetes/test/e2e/framework/node" |
| e2epod "k8s.io/kubernetes/test/e2e/framework/pod" |
| e2eskipper "k8s.io/kubernetes/test/e2e/framework/skipper" |
| e2enodekubelet "k8s.io/kubernetes/test/e2e_node/kubeletconfig" |
| "k8s.io/kubernetes/test/e2e_node/perf/workloads" |
| |
| "github.com/onsi/ginkgo/v2" |
| "github.com/onsi/gomega" |
| ) |
| |
// makeNodePerfPod returns a pod built from the name and pod spec provided by the workload.
| func makeNodePerfPod(w workloads.NodePerfWorkload) *v1.Pod { |
| return &v1.Pod{ |
| ObjectMeta: metav1.ObjectMeta{ |
| Name: fmt.Sprintf("%s-pod", w.Name()), |
| }, |
| Spec: w.PodSpec(), |
| } |
| } |
| |
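// setKubeletConfig applies the given kubelet configuration: it stops the
// kubelet, writes the new config file, starts the kubelet again, and waits
// for the node to report Ready. A nil cfg leaves the current configuration
// in place and only waits for node readiness.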
| func setKubeletConfig(ctx context.Context, f *framework.Framework, cfg *kubeletconfig.KubeletConfiguration) { |
| if cfg != nil { |
| // Update the Kubelet configuration. |
| ginkgo.By("Stopping the kubelet") |
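		// stopKubelet returns a function that can be called to start the kubelet again.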
| startKubelet := stopKubelet() |
| |
		// Wait until the kubelet health check fails.
| gomega.Eventually(ctx, func() bool { |
| return kubeletHealthCheck(kubeletHealthCheckURL) |
| }, time.Minute, time.Second).Should(gomega.BeFalse()) |
| |
| framework.ExpectNoError(e2enodekubelet.WriteKubeletConfigFile(cfg)) |
| |
| ginkgo.By("Starting the kubelet") |
| startKubelet() |
| |
		// Wait until the kubelet health check succeeds.
| gomega.Eventually(ctx, func() bool { |
| return kubeletHealthCheck(kubeletHealthCheckURL) |
| }, 2*time.Minute, 5*time.Second).Should(gomega.BeTrue()) |
| } |
| |
| // Wait for the Kubelet to be ready. |
| gomega.Eventually(ctx, func(ctx context.Context) bool { |
| nodes, err := e2enode.TotalReady(ctx, f.ClientSet) |
| framework.ExpectNoError(err) |
| return nodes == 1 |
| }, time.Minute, time.Second).Should(gomega.BeTrue()) |
| } |
| |
| // Serial because the test updates kubelet configuration. |
| // Slow by design. |
| var _ = SIGDescribe("Node Performance Testing", framework.WithSerial(), framework.WithSlow(), func() { |
| f := framework.NewDefaultFramework("node-performance-testing") |
| f.NamespacePodSecurityLevel = admissionapi.LevelPrivileged |
| var ( |
| wl workloads.NodePerfWorkload |
| oldCfg *kubeletconfig.KubeletConfiguration |
| newCfg *kubeletconfig.KubeletConfiguration |
| pod *v1.Pod |
| ) |
| ginkgo.JustBeforeEach(func(ctx context.Context) { |
| err := wl.PreTestExec() |
| framework.ExpectNoError(err) |
| oldCfg, err = getCurrentKubeletConfig(ctx) |
| framework.ExpectNoError(err) |
| newCfg, err = wl.KubeletConfig(oldCfg) |
| framework.ExpectNoError(err) |
| setKubeletConfig(ctx, f, newCfg) |
| }) |
| |
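	// cleanup force-deletes the workload pod, runs the workload's post-test
	// hook, and restores the original kubelet configuration.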
| cleanup := func(ctx context.Context) { |
| gp := int64(0) |
| delOpts := metav1.DeleteOptions{ |
| GracePeriodSeconds: &gp, |
| } |
| e2epod.NewPodClient(f).DeleteSync(ctx, pod.Name, delOpts, e2epod.DefaultPodDeletionTimeout) |
| |
		// Give the CPU manager some extra time to finish any cleanup it needs
		// to do now that the pod has been deleted. Otherwise we may hit a race
		// in which PostTestExec deletes the CPU manager's checkpoint file
		// while the CPU manager is still doing work, and we end up with a new
		// checkpoint file after PostTestExec has finished. That would cause
		// the kubelet to panic when we try to set the kubelet config.
| time.Sleep(15 * time.Second) |
| ginkgo.By("running the post test exec from the workload") |
| err := wl.PostTestExec() |
| framework.ExpectNoError(err) |
| setKubeletConfig(ctx, f, oldCfg) |
| } |
| |
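	// runWorkload creates the workload pod, waits for it to complete, and logs
	// the performance measurement extracted from the pod logs.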
| runWorkload := func(ctx context.Context) { |
| ginkgo.By("running the workload and waiting for success") |
| // Make the pod for the workload. |
| pod = makeNodePerfPod(wl) |
| // Create the pod. |
| pod = e2epod.NewPodClient(f).CreateSync(ctx, pod) |
		// Wait for the pod to succeed. Avoid WaitForSuccess so that the
		// container logs can still be collected on failure (see issue #109295).
| podErr := e2epod.WaitForPodCondition(ctx, f.ClientSet, f.Namespace.Name, pod.Name, fmt.Sprintf("%s or %s", v1.PodSucceeded, v1.PodFailed), wl.Timeout(), |
| func(pod *v1.Pod) (bool, error) { |
| switch pod.Status.Phase { |
| case v1.PodFailed: |
| return true, fmt.Errorf("pod %q failed with reason: %q, message: %q", pod.Name, pod.Status.Reason, pod.Status.Message) |
| case v1.PodSucceeded: |
| return true, nil |
| default: |
| return false, nil |
| } |
| }, |
| ) |
| podLogs, err := e2epod.GetPodLogs(ctx, f.ClientSet, f.Namespace.Name, pod.Name, pod.Spec.Containers[0].Name) |
| framework.ExpectNoError(err) |
| if podErr != nil { |
			framework.Logf("dumping pod logs because a pod error was detected:\n%s", podLogs)
| framework.Failf("pod error: %v", podErr) |
| } |
| perf, err := wl.ExtractPerformanceFromLogs(podLogs) |
| framework.ExpectNoError(err) |
| framework.Logf("Time to complete workload %s: %v", wl.Name(), perf) |
	// Using framework.ExpectNoError here for consistency would change the output format.
| gomega.Expect(podErr).To(gomega.Succeed(), "wait for pod %q to succeed", pod.Name) |
| } |
| |
| ginkgo.BeforeEach(func(ctx context.Context) { |
		ginkgo.By("ensuring the environment has enough CPU and memory to run")
		minimumRequiredCPU := resource.MustParse("15")
		minimumRequiredMemory := resource.MustParse("48Gi")
		localNodeAllocatable := getLocalNode(ctx, f).Status.Allocatable
		cpuCap := localNodeAllocatable[v1.ResourceCPU]
		memCap := localNodeAllocatable[v1.ResourceMemory]
		if cpuCap.Cmp(minimumRequiredCPU) < 0 {
			e2eskipper.Skipf("Skipping Node Performance Tests due to lack of CPU. Allocatable CPU %s is less than the required %s.", cpuCap.String(), minimumRequiredCPU.String())
		}
		if memCap.Cmp(minimumRequiredMemory) < 0 {
			e2eskipper.Skipf("Skipping Node Performance Tests due to lack of memory. Allocatable memory %s is less than the required %s.", memCap.String(), minimumRequiredMemory.String())
		}
| }) |
| |
| ginkgo.Context("Run node performance testing with pre-defined workloads", func() { |
| ginkgo.BeforeEach(func() { |
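			// Select the NPB Integer Sort (IS) workload (matches the It below).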
| wl = workloads.NodePerfWorkloads[0] |
| }) |
| ginkgo.It("NAS parallel benchmark (NPB) suite - Integer Sort (IS) workload", func(ctx context.Context) { |
| ginkgo.DeferCleanup(cleanup) |
| runWorkload(ctx) |
| }) |
| }) |
| ginkgo.Context("Run node performance testing with pre-defined workloads", func() { |
| ginkgo.BeforeEach(func() { |
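			// Select the NPB Embarrassingly Parallel (EP) workload (matches the It below).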
| wl = workloads.NodePerfWorkloads[1] |
| }) |
| ginkgo.It("NAS parallel benchmark (NPB) suite - Embarrassingly Parallel (EP) workload", func(ctx context.Context) { |
| ginkgo.DeferCleanup(cleanup) |
| runWorkload(ctx) |
| }) |
| }) |
| ginkgo.Context("Run node performance testing with pre-defined workloads", func() { |
| ginkgo.BeforeEach(func() { |
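			// Select the TensorFlow workload (matches the It below).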
| wl = workloads.NodePerfWorkloads[2] |
| }) |
| ginkgo.It("TensorFlow workload", func(ctx context.Context) { |
| ginkgo.DeferCleanup(cleanup) |
| runWorkload(ctx) |
| }) |
| }) |
| }) |