CHROMIUM: scheduler: log packages that take a long time to build

If a package gets stuck or takes a long time to build, it can be hard
to notice, as the output from portage simply stops while it waits.
Let's add a periodic check that logs a "Still building" message for
every package that takes more than 30min to complete, and repeats it
every 30min after that.

So if a package takes less than 30min, there is no difference from
today's output.  But if it takes longer, we'll get one message at
every 30min interval.
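
For illustration only, here is a minimal standalone sketch (not the
portage code itself) of the polling pattern this change uses: wake up
every few minutes, warn about any task running longer than 30min, and
rate-limit the warning with a per-task next_ping timestamp.  The task
dict, names, and constants below are made up for the example.

  import asyncio
  import datetime

  POLL_INTERVAL = 5 * 60                       # seconds between checks
  WARN_AFTER = datetime.timedelta(minutes=30)  # slow-task threshold

  def check_slow_tasks(loop, tasks):
      now = datetime.datetime.utcnow()
      for task in tasks:
          if task['next_ping'] > now:
              continue  # warned about this task recently
          duration = now - task['start_time']
          if duration < WARN_AFTER:
              continue  # not slow (yet)
          task['next_ping'] = now + WARN_AFTER
          print('Still building %s after %s' % (task['name'], duration))
      # Reschedule ourselves, like _slow_job_check_schedule() below.
      loop.call_later(POLL_INTERVAL, check_slow_tasks, loop, tasks)

  # Kick it off with, e.g.:
  #   loop = asyncio.get_event_loop()
  #   loop.call_later(POLL_INTERVAL, check_slow_tasks, loop, tasks)
  #   loop.run_forever()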

Further, if a package takes more than 2hr to build, we'll also start
dumping the current pstree listing for that package.  Since this can
be a bit verbose, we reserve it for the slowest packages.  The
slowest package we have is Chrome, and it should normally finish in
under 2hrs with Goma enabled on bots.
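
As a rough sketch of the 2hr case: the idea is to find the PID of the
process the build task actually spawned and append a process-tree
listing to the status message.  The helper below is hypothetical and
only demonstrates the pstree invocation the patch uses (-A ASCII
output, -p show PIDs, -a show arguments, -l don't truncate long
lines, -s include parent processes).

  import os
  import subprocess

  def pstree_listing(pid):
      # Capture the pstree output for the given pid, replacing any
      # undecodable bytes instead of raising.
      result = subprocess.run(
          ['pstree', '-Apals', str(pid)],
          stdout=subprocess.PIPE, encoding='utf-8', errors='replace')
      return result.stdout

  # For example, print the tree for the current process:
  print(pstree_listing(os.getpid()))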

BUG=chromium:1095661
TEST=CQ passes
TEST=looked at output behavior with manual lower timeouts

Change-Id: I930f8065a506c7fe4f4de003e20d1335203505f9
Reviewed-on: https://chromium-review.googlesource.com/c/chromiumos/third_party/portage_tool/+/2337614
Reviewed-by: Chris McDonald <cjmcdonald@chromium.org>
Tested-by: Mike Frysinger <vapier@chromium.org>
Commit-Queue: Mike Frysinger <vapier@chromium.org>
diff --git a/lib/_emerge/MergeListItem.py b/lib/_emerge/MergeListItem.py
index 938f801..ba92409 100644
--- a/lib/_emerge/MergeListItem.py
+++ b/lib/_emerge/MergeListItem.py
@@ -23,7 +23,8 @@
 		"find_blockers", "logger", "mtimedb", "pkg",
 		"pkg_count", "pkg_to_replace", "prefetcher",
 		"settings", "statusMessage", "world_atom") + \
-		("_install_task",)
+		("_install_task",) + \
+		("start_time", "next_ping")
 
 	def _start(self):
 
diff --git a/lib/_emerge/Scheduler.py b/lib/_emerge/Scheduler.py
index 3165052..9b483e6 100644
--- a/lib/_emerge/Scheduler.py
+++ b/lib/_emerge/Scheduler.py
@@ -4,10 +4,12 @@
 from __future__ import division, print_function, unicode_literals
 
 from collections import deque
+import datetime
 import gc
 import gzip
 import logging
 import signal
+import subprocess
 import sys
 import textwrap
 import time
@@ -241,6 +243,7 @@
 		self._completed_tasks = set()
 		self._main_exit = None
 		self._main_loadavg_handle = None
+		self._slow_job_check_handle = None
 
 		self._failed_pkgs = []
 		self._failed_pkgs_all = []
@@ -1343,6 +1346,9 @@
 	def _main_loop(self):
 		self._main_exit = self._event_loop.create_future()
 
+		# Poll for slow jobs.
+		self._slow_job_check_schedule()
+
 		if self._max_load is not None and \
 			self._loadavg_latency is not None and \
 			(self._max_jobs is True or self._max_jobs > 1):
@@ -1405,6 +1411,9 @@
 		if self._job_delay_timeout_id is not None:
 			self._job_delay_timeout_id.cancel()
 			self._job_delay_timeout_id = None
+		if self._slow_job_check_handle is not None:
+			self._slow_job_check_handle.cancel()
+			self._slow_job_check_handle = None
 
 	def _choose_pkg(self):
 		"""
@@ -1632,6 +1641,59 @@
 
 		return False
 
+	def _slow_job_check_schedule(self):
+		"""(Re)Schedule a slow job check."""
+		if self._slow_job_check_handle is not None:
+			self._slow_job_check_handle.cancel()
+			self._slow_job_check_handle = None
+		# Poll every 5 minutes.  That should balance catching slow jobs while
+		# not adding too much overhead otherwise.
+		self._slow_job_check_handle = self._event_loop.call_later(
+			5 * 60, self._slow_job_check)
+
+	def _slow_job_check(self):
+		"""Check for jobs taking a "long" time and print their status."""
+		# We post updates every 30min, so only recheck packages that often.
+		ping_interval = datetime.timedelta(minutes=30)
+		delta_30min = datetime.timedelta(minutes=30)
+		delta_2hr = datetime.timedelta(hours=2)
+		now = datetime.datetime.utcnow()
+		for task in self._running_tasks.values():
+			if not isinstance(task, MergeListItem):
+				# Ignore merged tasks as they should be fast.
+				continue
+
+			# Ignore tasks we've already warned about.
+			if task.next_ping > now:
+				continue
+			# Ignore tasks that haven't run long enough.
+			duration = now - task.start_time
+			if duration < delta_30min:
+				continue
+
+			task.next_ping = now + ping_interval
+			msg = 'Still building %s after %s' % (task.pkg, duration)
+
+			# If the job has been running for a long time, include the current process tree.
+			if duration > delta_2hr:
+				# Portage likes to nest tasks within tasks, so we have to walk down an
+				# arbitrary depth to find the first node that has spawned a process.
+				subtask = task._current_task
+				while getattr(subtask, '_current_task', None) is not None:
+					if hasattr(subtask, 'pid'):
+						break
+					subtask = subtask._current_task
+				if hasattr(subtask, 'pid'):
+					pstree = subprocess.run(
+						['pstree', '-Apals', str(subtask.pid)],
+						stdout=subprocess.PIPE, encoding='utf-8', errors='replace')
+					msg += '\n' + pstree.stdout
+
+			task.statusMessage(msg)
+
+		self._slow_job_check_handle = None
+		self._slow_job_check_schedule()
+
 	def _schedule_tasks_imp(self):
 		"""
 		@rtype: bool
@@ -1731,7 +1793,9 @@
 			scheduler=self._sched_iface,
 			settings=self._allocate_config(pkg.root),
 			statusMessage=self._status_msg,
-			world_atom=self._world_atom)
+			world_atom=self._world_atom,
+			start_time=datetime.datetime.utcnow(),
+			next_ping=datetime.datetime.utcnow())
 
 		return task