pre-submit: Populate blocked_terms.txt

This patch populates the global list of blocked words. The list of
unblocked words is a symlink to the block list. Thus, nothing
will be blocked globally.

unblocked_terms.txt in this directory is referenced only if the file
doesn't exist in the project directory. It's expected to be copied
to each project when a project is starting COIL. It will also serve
as a TODO list to locally track the progress.

See README.md for how to use unblocked_terms.txt for your project.

BUG=b:165908442
TEST=Run repo upload --cbr --dry-run .
  [COMMIT 1/1 7a40d010022f] test: blocked_terms.txt
  [FAILED] path/to/X: _check_keywords
  Found a blocked keyword in:
  .../main.c, line 41: Matched "dog-pile" with regex of "dog.?pile"

Change-Id: Iea03b536d24a1a1c9590aebed162c6624166a343
Reviewed-on: https://chromium-review.googlesource.com/c/chromiumos/repohooks/+/2369239
Tested-by: Daisuke Nojiri <dnojiri@chromium.org>
Reviewed-by: Mike Frysinger <vapier@chromium.org>
Reviewed-by: Bernie Thompson <bhthompson@chromium.org>
Commit-Queue: Mike Frysinger <vapier@chromium.org>
Auto-Submit: Daisuke Nojiri <dnojiri@chromium.org>
diff --git a/README.md b/README.md
index a721c23..e90eabb 100644
--- a/README.md
+++ b/README.md
@@ -126,6 +126,28 @@
 cros_license_check: --exclude_regex=\b(checkpatch\.pl|kernel-doc)$
 ```
 
+## Blocked and Unblocked Word List
+
+`blocked_terms.txt` contains a list of words which are blocked if
+`keyword_check` is enabled in the project. `unblocked_terms.txt` is a copy of
+`blocked_terms.txt`. So, by default nothing will be blocked.
+
+`unblocked_terms.txt` in this directory is referenced only if it doesn't exist in
+a project directory. To transition to the default blocked word list for project
+X, one can do:
+
+1.  Copy `unblocked_terms.txt` to X.
+2.  Remove words which are not used in X.
+    `$ egrep -r -i -I '(keyword1|keyword2|...)'`
+3.  Test and submit CL.
+
+Then, when blocking word 'foo' for project X, one can do:
+
+1.  Remove 'foo' from `unblocked_terms.txt` in X.
+2.  Grep and fix matches.
+    `$ egrep -r -i -I foo`
+3.  Test and submit CL.
+
 # Third Party code
 
 We have many third party repos where you probably want to disable CrOS checks.
diff --git a/blocked_terms.txt b/blocked_terms.txt
index e69de29..1e1f9e3 100644
--- a/blocked_terms.txt
+++ b/blocked_terms.txt
@@ -0,0 +1,39 @@
+# DELETE THIS COMMENT IN YOUR COPY.
+#
+# This is the global list of words to be blocked. unblocked_terms.txt has the
+# same contents. Copy unblocked_terms.txt to your project to enable repo hook to
+# block the words.
+#
+# See repohooks/README.md for more details.
+
+black.?hat
+black.?list
+build.?cop
+crazy
+cripple
+dummy
+first.?class.?citizen
+grandfathered
+gr[ae]y.?hat
+gr[ae]y.?list
+insane
+\bhe\b
+\bshe\b
+\bhim\b
+\bher\b
+\bhis\b
+\bhers\b
+man.?in.?the.?middle
+master
+mitm
+native
+red.?line
+rtfm
+sane
+sanity
+slave
+white.?glove
+white.?hat
+white.?label
+white.?list
+wtf
diff --git a/pre-upload.py b/pre-upload.py
index 0b02750..7c252ea 100755
--- a/pre-upload.py
+++ b/pre-upload.py
@@ -118,6 +118,9 @@
 
 TEST_FIELD_RE = r'\nTEST=\S+'
 
+BLOCKED_TERMS_FILE = 'blocked_terms.txt'
+UNBLOCKED_TERMS_FILE = 'unblocked_terms.txt'
+
 # Exceptions
 
 
@@ -594,10 +597,21 @@
                               'Found a tab character in:')
 
 
+def _read_terms_file(terms_file):
+  """Read list of words from file, skipping comments and blank lines."""
+  file_terms = set()
+  for line in osutils.ReadFile(terms_file).splitlines():
+    # Allow comment and blank lines.
+    line = line.split('#', 1)[0]
+    if not line:
+      continue
+    file_terms.add(line)
+  return file_terms
+
+
 def _check_keywords(_project, commit, options=()):
   """Checks there are no blocked keywords in commit content."""
-  blocked_terms_file = os.path.join(_get_hooks_dir(), 'blocked_terms.txt')
-  common_keywords = set(osutils.ReadFile(blocked_terms_file).splitlines())
+  # Read options from override list.
   parser = argparse.ArgumentParser()
   parser.add_argument('--exclude_regex', action='append', default=[])
   parser.add_argument('--include_regex', action='append', default=[])
@@ -605,8 +619,19 @@
   parser.add_argument('--unblock', action='append', default=[])
   opts = parser.parse_args(options)
 
+  # Read blocked word list.
+  blocked_terms_file = os.path.join(_get_hooks_dir(), BLOCKED_TERMS_FILE)
+  common_keywords = _read_terms_file(blocked_terms_file)
+
+  # Read unblocked word list. Global list is skipped if local list exists.
+  unblocked_terms_file = os.path.join(_get_hooks_dir(), UNBLOCKED_TERMS_FILE)
+  if os.path.isfile(os.path.join(_project.dir, UNBLOCKED_TERMS_FILE)):
+    unblocked_terms_file = os.path.join(_project.dir, UNBLOCKED_TERMS_FILE)
+  unblocked_words = _read_terms_file(unblocked_terms_file)
+  unblocked_words.update(opts.unblock)
+
   keywords = set(common_keywords | set(opts.block))
-  keywords = sorted(keywords - set(opts.unblock))
+  keywords = sorted(keywords - unblocked_words)
   files = _filter_files(_get_affected_files(commit),
                         opts.include_regex + COMMON_INCLUDED_PATHS,
                         opts.exclude_regex + COMMON_EXCLUDED_PATHS)
@@ -619,12 +644,12 @@
         matched = True
         # The unblock values supercede blocked values, so if any unblock
         # regex matches a term found by the block list, we ignore it.
-        for unblocked in opts.unblock:
+        for unblocked in unblocked_words:
           if re.search(unblocked, m.group(0)):
             matched = False
             break
         if matched:
-          return 'Matched ' + word
+          return f'Matched "{m[0]}" with regex of "{word}"'
     return False
 
   diff_errors = _check_lines_in_diff(commit, files, _check_line,
diff --git a/pre-upload_unittest.py b/pre-upload_unittest.py
index d5170ec..59ea3d5 100755
--- a/pre-upload_unittest.py
+++ b/pre-upload_unittest.py
@@ -92,17 +92,21 @@
     self.assertEqual(u'hi \ufffd there', ret)
 
 
-class CheckKeywordsTest(PreUploadTestCase):
+class CheckKeywordsTest(PreUploadTestCase, cros_test_lib.TempDirTestCase):
   """Tests for _check_keywords."""
 
   def setUp(self):
     self.PatchObject(pre_upload, '_get_affected_files',
                      return_value=['x.ebuild'])
     self.PatchObject(pre_upload, '_filter_files', return_value=['x.ebuild'])
-    self.PatchObject(osutils, 'ReadFile',
-                     return_value='scruffy\nmangy\ndog.?pile\ncat.?circle')
+    # First call for blocked_terms.txt and second call for unblocked_terms.txt.
+    self.rf_mock = self.PatchObject(
+        osutils, 'ReadFile',
+        side_effect=['scruffy\nmangy\ndog.?pile\ncat.?circle', 'fox'])
     self.diff_mock = self.PatchObject(pre_upload, '_get_file_diff')
     self.desc_mock = self.PatchObject(pre_upload, '_get_commit_desc')
+    self.project = pre_upload.Project(name='PROJECT', dir=self.tempdir,
+                                      remote=None)
 
   def test_good_cases(self):
     self.desc_mock.return_value = 'Commit Message.\nLine 2'
@@ -110,9 +114,16 @@
         (1, 'Some text without keywords.'),
         (2, 'The dog is black has a partial keyword that does not count.'),
     ]
-    failures = pre_upload._check_keywords(ProjectNamed('PROJECT'), 'COMMIT')
+    failures = pre_upload._check_keywords(self.project, 'COMMIT')
     self.assertEqual(failures, [])
 
+    self.rf_mock.assert_has_calls([
+        mock.call(os.path.join(pre_upload._get_hooks_dir(),
+                               pre_upload.BLOCKED_TERMS_FILE)),
+        mock.call(os.path.join(pre_upload._get_hooks_dir(),
+                               pre_upload.UNBLOCKED_TERMS_FILE)),
+    ])
+
   def test_bad_cases(self):
     self.desc_mock.return_value = 'Commit Message.\nLine 2\nLine 3 scruffy'
     self.diff_mock.return_value = [
@@ -123,20 +134,21 @@
         (5, 'dogpiled substring catch'),
         (6, 'scruffy mangy dog, multiple in a line catch'),
     ]
-    failures = pre_upload._check_keywords(ProjectNamed('PROJECT'),
-                                          'COMMIT')
+    failures = pre_upload._check_keywords(self.project, 'COMMIT')
     self.assertNotEqual(failures, [])
     self.assertEqual('Found a blocked keyword in:', failures[0].msg)
-    self.assertEqual(['x.ebuild, line 1: Matched scruffy',
-                      'x.ebuild, line 2: Matched dog.?pile',
-                      'x.ebuild, line 3: Matched cat.?circle',
-                      'x.ebuild, line 4: Matched dog.?pile',
-                      'x.ebuild, line 5: Matched dog.?pile',
-                      'x.ebuild, line 6: Matched mangy'],
-                     failures[0].items)
+    self.assertEqual(
+        ['x.ebuild, line 1: Matched "Scruffy" with regex of "scruffy"',
+         'x.ebuild, line 2: Matched "dog-pile" with regex of "dog.?pile"',
+         'x.ebuild, line 3: Matched "cat_circle" with regex of "cat.?circle"',
+         'x.ebuild, line 4: Matched "dog pile" with regex of "dog.?pile"',
+         'x.ebuild, line 5: Matched "dogpile" with regex of "dog.?pile"',
+         'x.ebuild, line 6: Matched "mangy" with regex of "mangy"'],
+        failures[0].items)
     self.assertEqual('Found a blocked keyword in:', failures[1].msg)
-    self.assertEqual(['Commit message, line 3: Matched scruffy'],
-                     failures[1].items)
+    self.assertEqual(
+        ['Commit message, line 3: Matched "scruffy" with regex of "scruffy"'],
+        failures[1].items)
 
   def test_block_option_cases(self):
     self.desc_mock.return_value = 'Commit Message.\nLine 2 voldemort'
@@ -144,29 +156,27 @@
         (1, 'Line with a new term voldemort.'),
         (2, 'Line with only they who shall not be named.'),
     ]
-    failures = pre_upload._check_keywords(ProjectNamed('PROJECT'),
+    failures = pre_upload._check_keywords(self.project,
                                           'COMMIT', ['--block', 'voldemort'])
     self.assertNotEqual(failures, [])
     self.assertEqual('Found a blocked keyword in:', failures[0].msg)
-    self.assertEqual(['x.ebuild, line 1: Matched voldemort'], failures[0].items)
+    self.assertEqual(
+        ['x.ebuild, line 1: Matched "voldemort" with regex of "voldemort"'],
+        failures[0].items)
     self.assertEqual('Found a blocked keyword in:', failures[1].msg)
-    self.assertEqual(['Commit message, line 2: Matched voldemort'],
-                     failures[1].items)
+    self.assertEqual(
+        ['Commit message, line 2: '
+         'Matched "voldemort" with regex of "voldemort"'], failures[1].items)
 
   def test_unblock_option_cases(self):
     self.desc_mock.return_value = 'Commit message with scruffy'
     self.diff_mock.return_value = [
-        (1, 'Line with a now unblocked term scruffy'),
-        (2, 'Line with without any blocked terms'),
-    ]
-    failures = pre_upload._check_keywords(ProjectNamed('PROJECT'),
-                                          'COMMIT', ['--unblock', 'scru.?fy'])
-    self.assertEqual(failures, [])
-    self.diff_mock.return_value = [
         (1, 'Line with two unblocked terms scruffy big dog-pile'),
         (2, 'Line with without any blocked terms'),
     ]
-    failures = pre_upload._check_keywords(ProjectNamed('PROJECT'),
+    # scruffy matches regex of 'scruffy' in block list but excluded by
+    # different regex of 'scru.?fy' in unblock list.
+    failures = pre_upload._check_keywords(self.project,
                                           'COMMIT', ['--unblock', 'dog.?pile',
                                                      '--unblock', 'scru.?fy'])
     self.assertEqual(failures, [])
diff --git a/unblocked_terms.txt b/unblocked_terms.txt
new file mode 120000
index 0000000..9f7f7f4
--- /dev/null
+++ b/unblocked_terms.txt
@@ -0,0 +1 @@
+blocked_terms.txt
\ No newline at end of file