portage_util: improve Classify failure message w/non-UTF-8 files

If we try to process an ebuild that isn't UTF-8 compatible, we crash
with a UnicodeDecodeError.  By itself, this is what we want, but the
error message is not easy to diagnose for users as it lacks any file
or line information.  When this script runs over thousands of ebuilds,
trying to find the single bad one can be hard.

Let's rework the file reading logic so that we decode each line
ourselves, and if it's not valid UTF-8, log an explicit message with
the file, line number, and failing bytes.  Now it should be obvious
to the user where things have gone wrong.

BUG=chromium:997354
TEST=`./run_tests` passes
TEST=`cros_mark_as_stable -p sys-power/pm-utils --force commit` shows exact source of failure

Change-Id: I8fec28257e3b99611713bcad91fd884220ee0635
Reviewed-on: https://chromium-review.googlesource.com/c/chromiumos/chromite/+/1929752
Reviewed-by: Alex Klein <saklein@chromium.org>
Commit-Queue: Mike Frysinger <vapier@chromium.org>
Tested-by: Mike Frysinger <vapier@chromium.org>
diff --git a/lib/portage_util.py b/lib/portage_util.py
index 57d1e2f..ea9b75d 100644
--- a/lib/portage_util.py
+++ b/lib/portage_util.py
@@ -537,26 +537,35 @@
     is_stable = False
     is_blacklisted = False
     has_test = False
-    for line in fileinput.input(ebuild_path):
-      if line.startswith('inherit '):
-        eclasses = set(line.split())
-        if 'cros-workon' in eclasses:
-          is_workon = True
-        if EBuild._ECLASS_IMPLIES_TEST & eclasses:
+    with open(ebuild_path, mode='rb') as fp:
+      for i, line in enumerate(fp):
+        # If the file has bad encodings, produce a helpful diagnostic for the
+        # user.  The default encoding exception lacks direct file context.
+        try:
+          line = line.decode('utf-8')
+        except UnicodeDecodeError:
+          logging.exception('%s: line %i: invalid UTF-8: %s',
+                            ebuild_path, i, line)
+          raise
+
+        if line.startswith('inherit '):
+          eclasses = set(line.split())
+          if 'cros-workon' in eclasses:
+            is_workon = True
+          if EBuild._ECLASS_IMPLIES_TEST & eclasses:
+            has_test = True
+        elif line.startswith('KEYWORDS='):
+          # Strip off the comments, then extract the value of the variable, then
+          # strip off any quotes.
+          line = line.split('#', 1)[0].split('=', 1)[1].strip('"\'')
+          for keyword in line.split():
+            if not keyword.startswith('~') and keyword != '-*':
+              is_stable = True
+        elif line.startswith('CROS_WORKON_BLACKLIST='):
+          is_blacklisted = True
+        elif (line.startswith('src_test()') or
+              line.startswith('platform_pkg_test()')):
           has_test = True
-      elif line.startswith('KEYWORDS='):
-        # Strip off the comments, then extract the value of the variable, then
-        # strip off any quotes.
-        line = line.split('#', 1)[0].split('=', 1)[1].strip('"\'')
-        for keyword in line.split():
-          if not keyword.startswith('~') and keyword != '-*':
-            is_stable = True
-      elif line.startswith('CROS_WORKON_BLACKLIST='):
-        is_blacklisted = True
-      elif (line.startswith('src_test()') or
-            line.startswith('platform_pkg_test()')):
-        has_test = True
-    fileinput.close()
     return EBuildClassifyAttributes(
         is_workon, is_stable, is_blacklisted, has_test)
 
diff --git a/lib/portage_util_unittest.py b/lib/portage_util_unittest.py
index cca890f..7f0685d 100644
--- a/lib/portage_util_unittest.py
+++ b/lib/portage_util_unittest.py
@@ -281,6 +281,30 @@
       attrs = portage_util.EBuild.Classify(ebuild_path)
       self.assertTrue(attrs.is_stable, msg='Failing: %s' % (keywords,))
 
+  def testClassifyEncodingASCII(self):
+    """Test Classify with ASCII file encodings."""
+    ebuild_path = os.path.join(self.tempdir, 'foo-1.ebuild')
+    # Generate a valid shell script with all possible ASCII values.
+    osutils.WriteFile(
+        ebuild_path,
+        'cat <<\\EOF\n%s\nEOF\n' % (''.join(chr(x) for x in range(0, 128)),))
+    # Just check that we don't throw an exception.
+    portage_util.EBuild.Classify(ebuild_path)
+
+  def testClassifyEncodingUTF8(self):
+    """Test Classify with UTF-8 file encodings."""
+    ebuild_path = os.path.join(self.tempdir, 'foo-1.ebuild')
+    osutils.WriteFile(ebuild_path, '# FöÖßbäłł')
+    # Just check that we don't throw an exception.
+    portage_util.EBuild.Classify(ebuild_path)
+
+  def testClassifyEncodingLatin1(self):
+    """Test Classify with ISO 8859-1 file encodings."""
+    ebuild_path = os.path.join(self.tempdir, 'foo-1.ebuild')
+    osutils.WriteFile(ebuild_path, b'# This is \xa0 bad UTF-8', mode='wb')
+    with self.assertRaises(UnicodeDecodeError):
+      portage_util.EBuild.Classify(ebuild_path)
+
 
 class ProjectAndPathTest(cros_test_lib.MockTempDirTestCase):
   """Project and Path related tests."""