CHROMIUM: scheduler: log packages that take a long time to build
If a package gets stuck or takes a long time, it can be hard to see
or notice, as the output from portage just stops while it waits. Let's
add a periodic check that logs a "Still building" message for every
package that takes more than 30min to complete, and logs it every
30min after that.
So if a package takes less than 30min, there is no difference with
today's output. But if it takes longer, we'll get one message at
every 30min interval.
Furthermore, if a package takes more than 2hr to build, we'll
start dumping the current pstree listing for that package. Since
this can be a bit verbose, we reserve it for slow packages only.
The slowest package we have is Chrome, and that should normally
finish in under 2hrs with Goma enabled on bots.
BUG=chromium:1095661
TEST=CQ passes
TEST=looked at output behavior with manual lower timeouts
Change-Id: I930f8065a506c7fe4f4de003e20d1335203505f9
Reviewed-on: https://chromium-review.googlesource.com/c/chromiumos/third_party/portage_tool/+/2337614
Reviewed-by: Chris McDonald <cjmcdonald@chromium.org>
Tested-by: Mike Frysinger <vapier@chromium.org>
Commit-Queue: Mike Frysinger <vapier@chromium.org>
diff --git a/lib/_emerge/MergeListItem.py b/lib/_emerge/MergeListItem.py
index 938f801..ba92409 100644
--- a/lib/_emerge/MergeListItem.py
+++ b/lib/_emerge/MergeListItem.py
@@ -23,7 +23,8 @@
"find_blockers", "logger", "mtimedb", "pkg",
"pkg_count", "pkg_to_replace", "prefetcher",
"settings", "statusMessage", "world_atom") + \
- ("_install_task",)
+ ("_install_task",) + \
+ ("start_time", "next_ping")
def _start(self):
diff --git a/lib/_emerge/Scheduler.py b/lib/_emerge/Scheduler.py
index 3165052..9b483e6 100644
--- a/lib/_emerge/Scheduler.py
+++ b/lib/_emerge/Scheduler.py
@@ -4,10 +4,12 @@
from __future__ import division, print_function, unicode_literals
from collections import deque
+import datetime
import gc
import gzip
import logging
import signal
+import subprocess
import sys
import textwrap
import time
@@ -241,6 +243,7 @@
self._completed_tasks = set()
self._main_exit = None
self._main_loadavg_handle = None
+ self._slow_job_check_handle = None
self._failed_pkgs = []
self._failed_pkgs_all = []
@@ -1343,6 +1346,9 @@
def _main_loop(self):
self._main_exit = self._event_loop.create_future()
+ # Poll for slow jobs.
+ self._slow_job_check_schedule()
+
if self._max_load is not None and \
self._loadavg_latency is not None and \
(self._max_jobs is True or self._max_jobs > 1):
@@ -1405,6 +1411,9 @@
if self._job_delay_timeout_id is not None:
self._job_delay_timeout_id.cancel()
self._job_delay_timeout_id = None
+ if self._slow_job_check_handle is not None:
+ self._slow_job_check_handle.cancel()
+ self._slow_job_check_handle = None
def _choose_pkg(self):
"""
@@ -1632,6 +1641,59 @@
return False
+ def _slow_job_check_schedule(self):
+ """(Re)Schedule a slow job check."""
+ if self._slow_job_check_handle is not None:
+ self._slow_job_check_handle.cancel()
+ self._slow_job_check_handle = None
+ # Poll every 5 minutes. That should balance catching slow jobs while
+ # not adding too much overhead otherwise.
+ self._slow_job_check_handle = self._event_loop.call_later(
+ 5 * 60, self._slow_job_check)
+
+ def _slow_job_check(self):
+ """Check for jobs taking a "long" time and print their status."""
+ # We post updates every 30min, so only recheck packages that often.
+ ping_interval = datetime.timedelta(minutes=30)
+ delta_30min = datetime.timedelta(minutes=30)
+ delta_2hr = datetime.timedelta(hours=2)
+ now = datetime.datetime.utcnow()
+ for task in self._running_tasks.values():
+ if not isinstance(task, MergeListItem):
+ # Ignore non-merge tasks as they should be fast.
+ continue
+
+ # Ignore tasks we've already warned about.
+ if task.next_ping > now:
+ continue
+ # Ignore tasks that haven't run long enough.
+ duration = now - task.start_time
+ if duration < delta_30min:
+ continue
+
+ task.next_ping = now + ping_interval
+ msg = 'Still building %s after %s' % (task.pkg, duration)
+
+ # If the job has been running for a long time, include the current process tree.
+ if duration > delta_2hr:
+ # Portage likes to nest tasks with tasks, so we have to walk down an
+ # arbitrary depth to find the first node that has spawned a process.
+ subtask = task._current_task
+ while getattr(subtask, '_current_task', None) is not None:
+ if hasattr(subtask, 'pid'):
+ break
+ subtask = subtask._current_task
+ if hasattr(subtask, 'pid'):
+ pstree = subprocess.run(
+ ['pstree', '-Apals', str(subtask.pid)],
+ stdout=subprocess.PIPE, encoding='utf-8', errors='replace')
+ msg += pstree.stdout
+
+ task.statusMessage(msg)
+
+ self._slow_job_check_handle = None
+ self._slow_job_check_schedule()
+
def _schedule_tasks_imp(self):
"""
@rtype: bool
@@ -1731,7 +1793,9 @@
scheduler=self._sched_iface,
settings=self._allocate_config(pkg.root),
statusMessage=self._status_msg,
- world_atom=self._world_atom)
+ world_atom=self._world_atom,
+ start_time=datetime.datetime.utcnow(),
+ next_ping=datetime.datetime.utcnow())
return task