Ignore symlinks in _get_affected_files

_get_affected_files has historically returned symlinks but ignored directories,
in keeping with git's design. However, symlinks to directories are returned. If
you then pass a symlink to a function like _get_file_diff or _get_file_content,
it will in some cases return the set of changes in that commit under the
referenced directory. The result is that these functions treat the symlink as
an ordinary file whose contents is mangled git diff output, and will generally
report nonsense errors, e.g. that there's trailing whitespace on some
line for a directory.

As none of the existing checks (such as _check_no_long_lines) are sensible
for symlinks, the fix is to ignore changes to symlinks. This is accomplished
by switching from  a --name-status diff to a raw diff, and parsing the mode
bits (so we look at git's data, not the working tree). There is also a new
unit test.

BUG=none
TEST=pre-upload_unittest.py, plus manual verification of a commit that
     both changes a directory and introduces a symlink to that directory

Change-Id: If252c68d34706dc7d11e0e188148619abc0f66c8
Reviewed-on: https://chromium-review.googlesource.com/203648
Reviewed-by: Mike Frysinger <vapier@chromium.org>
Commit-Queue: Peter Ammon <pca@chromium.org>
Tested-by: Peter Ammon <pca@chromium.org>
diff --git a/pre-upload.py b/pre-upload.py
index 14be6ce..704cf3e 100755
--- a/pre-upload.py
+++ b/pre-upload.py
@@ -18,6 +18,7 @@
 import re
 import sys
 import subprocess
+import stat
 
 from errors import (VerifyException, HookFailure, PrintErrorForProject,
                     PrintErrorsForCommit)
@@ -256,31 +257,60 @@
   return new_lines
 
 
-def _get_affected_files(commit, include_deletes=False, relative=False):
-  """Returns list of absolute filepaths that were modified/added.
+def _parse_affected_files(output, include_deletes=False, relative=False):
+  """Parses git diff's 'raw' format, returning a list of modified file paths.
+
+  This excludes directories and symlinks, and optionally includes files that
+  were deleted.
 
   Args:
-    commit: The commit
-    include_deletes: If true we'll include delete in the list.
-    relative: Whether to return full paths to files.
+    output: The result of the 'git diff --raw' command
+    include_deletes: If true, we'll include deleted files in the result
+    relative: Whether to return relative or full paths to files
 
   Returns:
     A list of modified/added (and perhaps deleted) files
   """
-  output = _run_command(['git', 'diff', '--name-status', commit + '^!'])
   files = []
+  # See the documentation for 'git diff --raw' for the relevant format.
   for statusline in output.splitlines():
-    m = re.match('^(\w)+\t(.+)$', statusline.rstrip())
-    # Ignore deleted files, and return absolute paths of files
-    if include_deletes or m.group(1)[0] != 'D':
-      f = m.group(2)
+    attributes, paths = statusline.split('\t', 1)
+    _, mode, _, _, status = attributes.split(' ')
+
+    # Ignore symlinks and directories.
+    imode = int(mode, 8)
+    if stat.S_ISDIR(imode) or stat.S_ISLNK(imode):
+      continue
+
+    # Ignore deleted files, and optionally return absolute paths of files.
+    if include_deletes or status != 'D':
+      # If a file was merely modified, we will have a single file path.
+      # If it was moved, we will have two paths (source and destination).
+      # In either case, we want the last path.
+      f = paths.split('\t')[-1]
       if not relative:
         pwd = os.getcwd()
         f = os.path.join(pwd, f)
       files.append(f)
+
   return files
 
 
+def _get_affected_files(commit, include_deletes=False, relative=False):
+  """Returns list of file paths that were modified/added, excluding symlinks.
+
+  Args:
+    commit: The commit
+    include_deletes: If true, we'll include deleted files in the result
+    relative: Whether to return relative or full paths to files
+
+  Returns:
+    A list of modified/added (and perhaps deleted) files
+  """
+  output = _run_command(['git', 'diff', '--raw', commit + '^!'])
+  return _parse_affected_files(output, include_deletes, relative)
+
+
 def _get_commits():
   """Returns a list of commits for this review."""
   cmd = ['git', 'log', '%s..' % _get_upstream_branch(), '--format=%H']
@@ -330,7 +360,6 @@
   files = _filter_files(_get_affected_files(commit),
                         COMMON_INCLUDED_PATHS,
                         COMMON_EXCLUDED_PATHS)
-
   for afile in files:
     for line_num, line in _get_file_diff(afile, commit):
       if line.rstrip() != line:
diff --git a/pre-upload_unittest.py b/pre-upload_unittest.py
index 8120c69..feb4555 100755
--- a/pre-upload_unittest.py
+++ b/pre-upload_unittest.py
@@ -281,6 +281,28 @@
     ret = pre_upload._check_ebuild_virtual_pv(self.PRIVATE_VARIANT_OVERLAY, 'H')
     self.assertTrue(isinstance (ret, errors.HookFailure))
 
+class CheckGitOutputParsing(cros_test_lib.MockTestCase):
+  """Tests for git output parsing."""
+  def testParseAffectedFiles(self):
+    """Test parsing git diff --raw output."""
+    # Sample from git diff --raw.
+    sample_git_output = '\n'.join([
+        ":100644 100644 ff03961... a198e8b... M\tMakefile",
+        ":100644 000000 e69de29... 0000000... D\tP1/P2",
+        ":100755 100644 454d5ef... 0000000... C86\tP3\tP4",
+        ":100755 100644 454d5ef... 0000000... R86\tP5\tP6/P7",
+        ":100755 120644 454d5ef... 0000000... M\tIsASymlink",
+    ])
+    expected_modified_files_no_deletes = ['Makefile', 'P4', 'P6/P7']
+    expected_modified_files_with_deletes = ['Makefile', 'P1/P2', 'P4', 'P6/P7']
+    result = pre_upload._parse_affected_files(sample_git_output,
+                                              include_deletes=True,
+                                              relative=True)
+    self.assertEqual(result, expected_modified_files_with_deletes)
+    result = pre_upload._parse_affected_files(sample_git_output,
+                                              include_deletes=False,
+                                              relative=True)
+    self.assertEqual(result, expected_modified_files_no_deletes)
 
 if __name__ == '__main__':
   cros_test_lib.main()