| /* |
| Copyright 2014 The Kubernetes Authors. |
| |
| Licensed under the Apache License, Version 2.0 (the "License"); |
| you may not use this file except in compliance with the License. |
| You may obtain a copy of the License at |
| |
| http://www.apache.org/licenses/LICENSE-2.0 |
| |
| Unless required by applicable law or agreed to in writing, software |
| distributed under the License is distributed on an "AS IS" BASIS, |
| WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| See the License for the specific language governing permissions and |
| limitations under the License. |
| */ |
| |
| package debug |
| |
| import ( |
| "context" |
| "fmt" |
| "sort" |
| "time" |
| |
| "github.com/onsi/ginkgo/v2" |
| |
| v1 "k8s.io/api/core/v1" |
| metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" |
| "k8s.io/apimachinery/pkg/fields" |
| clientset "k8s.io/client-go/kubernetes" |
| restclient "k8s.io/client-go/rest" |
| "k8s.io/kubernetes/test/e2e/framework" |
| e2emetrics "k8s.io/kubernetes/test/e2e/framework/metrics" |
| e2epod "k8s.io/kubernetes/test/e2e/framework/pod" |
| ) |
| |
| // EventsLister is a func that lists events. |
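| // |
| // A typical implementation just wraps the client-go events API; a minimal |
| // sketch, assuming a clientset.Interface named c and a context.Context named |
| // ctx are in scope (both names are illustrative): |
| // |
| //    lister := EventsLister(func(opts metav1.ListOptions, ns string) (*v1.EventList, error) { |
| //        return c.CoreV1().Events(ns).List(ctx, opts) |
| //    }) |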
| type EventsLister func(opts metav1.ListOptions, ns string) (*v1.EventList, error) |
| |
| // dumpEventsInNamespace dumps events in the given namespace. |
| func dumpEventsInNamespace(eventsLister EventsLister, namespace string) { |
| ginkgo.By(fmt.Sprintf("Collecting events from namespace %q.", namespace)) |
| events, err := eventsLister(metav1.ListOptions{}, namespace) |
| framework.ExpectNoError(err, "failed to list events in namespace %q", namespace) |
| |
| ginkgo.By(fmt.Sprintf("Found %d events.", len(events.Items))) |
| // Sort events by their first timestamp |
| sortedEvents := events.Items |
| if len(sortedEvents) > 1 { |
| sort.Sort(byFirstTimestamp(sortedEvents)) |
| } |
| for _, e := range sortedEvents { |
| framework.Logf("At %v - event for %v: %v %v: %v", e.FirstTimestamp, e.InvolvedObject.Name, e.Source, e.Reason, e.Message) |
| } |
| // Note that we don't wait for any Cleanup to propagate, which means |
| // that if you delete a bunch of pods right before ending your test, |
| // you may or may not see the killing/deletion/Cleanup events. |
| } |
| |
| // DumpAllNamespaceInfo dumps events and pod information for the given namespace, plus node debug information when the cluster is small enough. |
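| // |
| // A hedged usage sketch: e2e suites typically call this from a failure hook |
| // such as a ginkgo.AfterEach; c and ns below stand in for a clientset and the |
| // namespace under test and are illustrative only: |
| // |
| //    ginkgo.AfterEach(func() { |
| //        if ginkgo.CurrentSpecReport().Failed() { |
| //            DumpAllNamespaceInfo(context.TODO(), c, ns) |
| //        } |
| //    }) |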
| func DumpAllNamespaceInfo(ctx context.Context, c clientset.Interface, namespace string) { |
| dumpEventsInNamespace(func(opts metav1.ListOptions, ns string) (*v1.EventList, error) { |
| return c.CoreV1().Events(ns).List(ctx, opts) |
| }, namespace) |
| |
| e2epod.DumpAllPodInfoForNamespace(ctx, c, namespace, framework.TestContext.ReportDir) |
| |
| // If the cluster is large, the following logs are basically useless, because: |
| // 1. it takes tens of minutes or hours to grab all of them |
| // 2. there are so many of them that working with them is mostly impossible |
| // So we dump them only if the cluster is relatively small. |
| maxNodesForDump := framework.TestContext.MaxNodesToGather |
| nodes, err := c.CoreV1().Nodes().List(ctx, metav1.ListOptions{}) |
| if err != nil { |
| framework.Logf("unable to fetch node list: %v", err) |
| return |
| } |
| if len(nodes.Items) <= maxNodesForDump { |
| dumpAllNodeInfo(ctx, c, nodes) |
| } else { |
| framework.Logf("skipping dumping cluster info - cluster too large") |
| } |
| } |
| |
| // byFirstTimestamp sorts a slice of events by first timestamp, using their involvedObject's name as a tie breaker. |
| type byFirstTimestamp []v1.Event |
| |
| func (o byFirstTimestamp) Len() int { return len(o) } |
| func (o byFirstTimestamp) Swap(i, j int) { o[i], o[j] = o[j], o[i] } |
| |
| func (o byFirstTimestamp) Less(i, j int) bool { |
| if o[i].FirstTimestamp.Equal(&o[j].FirstTimestamp) { |
| return o[i].InvolvedObject.Name < o[j].InvolvedObject.Name |
| } |
| return o[i].FirstTimestamp.Before(&o[j].FirstTimestamp) |
| } |
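| |
| // The same ordering could be written inline with sort.Slice; an equivalent |
| // sketch over a slice named events (name illustrative; the named type above |
| // is what the dump code actually uses with sort.Sort): |
| // |
| //    sort.Slice(events, func(i, j int) bool { |
| //        if events[i].FirstTimestamp.Equal(&events[j].FirstTimestamp) { |
| //            return events[i].InvolvedObject.Name < events[j].InvolvedObject.Name |
| //        } |
| //        return events[i].FirstTimestamp.Before(&events[j].FirstTimestamp) |
| //    }) |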
| |
| func dumpAllNodeInfo(ctx context.Context, c clientset.Interface, nodes *v1.NodeList) { |
| names := make([]string, len(nodes.Items)) |
| for ix := range nodes.Items { |
| names[ix] = nodes.Items[ix].Name |
| } |
| DumpNodeDebugInfo(ctx, c, names, framework.Logf) |
| } |
| |
| // DumpNodeDebugInfo dumps debug information for the given nodes using the provided log function. |
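| // |
| // A minimal usage sketch (node names are illustrative; any function matching |
| // the logFunc signature works, framework.Logf being the usual choice): |
| // |
| //    DumpNodeDebugInfo(ctx, c, []string{"node-1", "node-2"}, framework.Logf) |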
| func DumpNodeDebugInfo(ctx context.Context, c clientset.Interface, nodeNames []string, logFunc func(format string, args ...interface{})) { |
| for _, n := range nodeNames { |
| logFunc("\nLogging node info for node %v", n) |
| node, err := c.CoreV1().Nodes().Get(ctx, n, metav1.GetOptions{}) |
| if err != nil { |
| logFunc("Error getting node info: %v", err) |
| } else { |
| logFunc("Node Info: %v", node) |
| } |
| |
| logFunc("\nLogging kubelet events for node %v", n) |
| for _, e := range getNodeEvents(ctx, c, n) { |
| logFunc("source %v type %v message %v reason %v first ts %v last ts %v, involved obj %+v", |
| e.Source, e.Type, e.Message, e.Reason, e.FirstTimestamp, e.LastTimestamp, e.InvolvedObject) |
| } |
| logFunc("\nLogging pods the kubelet thinks is on node %v", n) |
| podList, err := getKubeletPods(ctx, c, n) |
| if err != nil { |
| logFunc("Unable to retrieve kubelet pods for node %v: %v", n, err) |
| continue |
| } |
| for _, p := range podList.Items { |
| logFunc("%v started at %v (%d+%d container statuses recorded)", p.Name, p.Status.StartTime, len(p.Status.InitContainerStatuses), len(p.Status.ContainerStatuses)) |
| for _, c := range p.Status.InitContainerStatuses { |
| logFunc("\tInit container %v ready: %v, restart count %v", |
| c.Name, c.Ready, c.RestartCount) |
| } |
| for _, c := range p.Status.ContainerStatuses { |
| logFunc("\tContainer %v ready: %v, restart count %v", |
| c.Name, c.Ready, c.RestartCount) |
| } |
| } |
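| // Report kubelet operations that exceeded the 10s latency threshold for this |
| // node; the metrics helper prints them through the supplied logFunc. |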
| _, err = e2emetrics.HighLatencyKubeletOperations(ctx, c, 10*time.Second, n, logFunc) |
| framework.ExpectNoError(err) |
| // TODO: Log node resource info |
| } |
| } |
| |
| // getKubeletPods retrieves the list of pods reported by the kubelet on the given node. |
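| // The list is fetched by proxying through the API server, i.e. a request of |
| // roughly the form GET /api/v1/nodes/<node>:<kubelet-port>/proxy/pods (path |
| // shown for illustration; the REST client chain below assembles the real URL). |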
| func getKubeletPods(ctx context.Context, c clientset.Interface, node string) (*v1.PodList, error) { |
| var restResult restclient.Result |
| finished := make(chan struct{}, 1) |
| go func() { |
| // The call chain tends to hang in some cases when the node is not ready. Add an artificial timeout for this call. See issue #22165. |
| restResult = c.CoreV1().RESTClient().Get(). |
| Resource("nodes"). |
| SubResource("proxy"). |
| Name(fmt.Sprintf("%v:%v", node, framework.KubeletPort)). |
| Suffix("pods"). |
| Do(ctx) |
| |
| finished <- struct{}{} |
| }() |
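| // Wait for the proxied request or give up after PodGetTimeout. The channel is |
| // buffered, so the goroutine can still finish and exit even if we time out. |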
| select { |
| case <-finished: |
| result := &v1.PodList{} |
| if err := restResult.Into(result); err != nil { |
| return &v1.PodList{}, err |
| } |
| return result, nil |
| case <-time.After(framework.PodGetTimeout): |
| return &v1.PodList{}, fmt.Errorf("Waiting up to %v for getting the list of pods", framework.PodGetTimeout) |
| } |
| } |
| |
| // getNodeEvents returns kubelet events for the given node. This includes kubelet |
| // restart and node-unhealthy events. Note that listing events like this will mess |
| // with latency metrics, so beware of calling it during a test. |
| func getNodeEvents(ctx context.Context, c clientset.Interface, nodeName string) []v1.Event { |
| selector := fields.Set{ |
| "involvedObject.kind": "Node", |
| "involvedObject.name": nodeName, |
| "involvedObject.namespace": metav1.NamespaceAll, |
| "source": "kubelet", |
| }.AsSelector().String() |
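| // The field selector is evaluated server-side and limits the list to |
| // kubelet-sourced events whose involved object is the given Node. |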
| options := metav1.ListOptions{FieldSelector: selector} |
| events, err := c.CoreV1().Events(metav1.NamespaceSystem).List(ctx, options) |
| if err != nil { |
| framework.Logf("Unexpected error retrieving node events %v", err) |
| return []v1.Event{} |
| } |
| return events.Items |
| } |