Try UTF-8 when checking for 80-character lines.

BUG=None
TEST=pre-upload_unittest.py

Change-Id: I8b9070935489c0049201d38a3cc95043a3786006
Reviewed-on: https://gerrit.chromium.org/gerrit/30812
Commit-Ready: Jon Salz <jsalz@chromium.org>
Reviewed-by: Jon Salz <jsalz@chromium.org>
Tested-by: Jon Salz <jsalz@chromium.org>
diff --git a/pre-upload.py b/pre-upload.py
index b3ba2ce..467dc04 100755
--- a/pre-upload.py
+++ b/pre-upload.py
@@ -1,5 +1,5 @@
 #!/usr/bin/env python
-# Copyright (c) 2011 The Chromium OS Authors. All rights reserved.
+# Copyright (c) 2012 The Chromium OS Authors. All rights reserved.
 # Use of this source code is governed by a BSD-style license that can be
 # found in the LICENSE file.
 
@@ -160,6 +160,18 @@
   return _run_command(['git', 'show', commit])
 
 
+def _try_utf8_decode(data):
+  """Attempts to decode a string as UTF-8.
+
+  Returns:
+    The decoded Unicode object, or the original string if parsing fails.
+  """
+  try:
+    return unicode(data, 'utf-8', 'strict')
+  except UnicodeDecodeError:
+    return data
+
+
 def _get_file_diff(file, commit):
   """Returns a list of (linenum, lines) tuples that the commit touched."""
   output = _run_command(['git', 'show', '-p', '--no-ext-diff', commit, file])
@@ -172,7 +184,7 @@
       line_num = int(m.groups(1)[0])
       continue
     if line.startswith('+') and not line.startswith('++'):
-      new_lines.append((line_num, line[1:]))
+      new_lines.append((line_num, _try_utf8_decode(line[1:])))
     if not line.startswith('-'):
       line_num += 1
   return new_lines
@@ -210,6 +222,9 @@
   the text files to be submitted.
   """
   MAX_LEN = 80
+  SKIP_REGEXP = re.compile('|'.join([
+      r'https?://',
+      r'^#\s*(define|include|import|pragma|if|endif)\b']))
 
   errors = []
   files = _filter_files(_get_affected_files(commit),
@@ -219,18 +234,12 @@
   for afile in files:
     for line_num, line in _get_file_diff(afile, commit):
       # Allow certain lines to exceed the maxlen rule.
-      if (len(line) > MAX_LEN and
-          not 'http://' in line and
-          not 'https://' in line and
-          not line.startswith('#define') and
-          not line.startswith('#include') and
-          not line.startswith('#import') and
-          not line.startswith('#pragma') and
-          not line.startswith('#if') and
-          not line.startswith('#endif')):
-        errors.append('%s, line %s, %s chars' % (afile, line_num, len(line)))
-        if len(errors) == 5:  # Just show the first 5 errors.
-          break
+      if (len(line) <= MAX_LEN or SKIP_REGEXP.search(line)):
+        continue
+
+      errors.append('%s, line %s, %s chars' % (afile, line_num, len(line)))
+      if len(errors) == 5:  # Just show the first 5 errors.
+        break
 
   if errors:
     msg = 'Found lines longer than %s characters (first 5 shown):' % MAX_LEN
diff --git a/pre-upload_unittest.py b/pre-upload_unittest.py
new file mode 100755
index 0000000..5c4398d
--- /dev/null
+++ b/pre-upload_unittest.py
@@ -0,0 +1,61 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+# Copyright (c) 2012 The Chromium OS Authors. All rights reserved.
+# Use of this source code is governed by a BSD-style license that can be
+# found in the LICENSE file.
+
+import mox
+import unittest
+
+
+# pylint: disable=W0212
+
+pre_upload = __import__('pre-upload')
+
+
+class TryUTF8DecodeTest(unittest.TestCase):
+  def runTest(self):
+    self.assertEquals(u'', pre_upload._try_utf8_decode(''))
+    self.assertEquals(u'abc', pre_upload._try_utf8_decode('abc'))
+    self.assertEquals(u'你好布萊恩', pre_upload._try_utf8_decode('你好布萊恩'))
+    # Invalid UTF-8
+    self.assertEquals('\x80', pre_upload._try_utf8_decode('\x80'))
+
+
+class CheckNoLongLinesTest(unittest.TestCase):
+  def setUp(self):
+    self.mocker = mox.Mox()
+    self.mocker.StubOutWithMock(pre_upload, '_filter_files')
+    self.mocker.StubOutWithMock(pre_upload, '_get_affected_files')
+    self.mocker.StubOutWithMock(pre_upload, '_get_file_diff')
+    pre_upload._get_affected_files(mox.IgnoreArg()).AndReturn(['x.py'])
+    pre_upload._filter_files(
+        ['x.py'], mox.IgnoreArg(), mox.IgnoreArg()).AndReturn(['x.py'])
+
+  def tearDown(self):
+    self.mocker.UnsetStubs()
+    self.mocker.VerifyAll()
+
+  def runTest(self):
+    pre_upload._get_file_diff(mox.IgnoreArg(), mox.IgnoreArg()).AndReturn(
+        [(1, u"x" * 80),                      # OK
+         (2, "\x80" * 80),                    # OK
+         (3, u"x" * 81),                      # Too long
+         (4, "\x80" * 81),                    # Too long
+         (5, u"See http://" + (u"x" * 80)),   # OK (URL)
+         (6, u"See https://" + (u"x" * 80)),  # OK (URL)
+         (7, u"#  define " + (u"x" * 80)),    # OK (compiler directive)
+         (8, u"#define" + (u"x" * 74)),       # Too long
+         ])
+    self.mocker.ReplayAll()
+    failure = pre_upload._check_no_long_lines('PROJECT', 'COMMIT')
+    self.assertTrue(failure)
+    self.assertEquals('Found lines longer than 80 characters (first 5 shown):',
+                      failure.msg)
+    self.assertEquals(['x.py, line %d, 81 chars' % line
+                       for line in [3, 4, 8]],
+                      failure.items)
+
+
+if __name__ == '__main__':
+  unittest.main()