[Autotest] Copy only new data in /var/log folder after each test.

Copying all content of the /var/log folder causes overhead in both test time
and test result storage. The /var/log folder's size can be over 40MB after a
test is finished. This change implements diff logic that records the state of
/var/log before each test and copies only the new data after the test.

BUG=chromium:217983
TEST=manually run autoserv with sleep test and verify new data in /var/log
folder is copied over to [test]/sysinfo/var/log_diff. Also use trybot build
trybot-lumpy-paladin/R28-3915.0.0-b741 to confirm autoupdate job's /var/log
is collected as expected.
DEPLOY=manual, before pushing to prod, this CL must be backported to all
branches being tested.

Change-Id: Ic78c5aa6c35eeab63bb3a0d8cb7095b7f55f217b
Reviewed-on: https://gerrit.chromium.org/gerrit/47138
Reviewed-by: Scott Zawalski <scottz@chromium.org>
Tested-by: Dan Shi <dshi@chromium.org>
diff --git a/client/bin/site_sysinfo.py b/client/bin/site_sysinfo.py
index 6266aaa..77f8d3d 100755
--- a/client/bin/site_sysinfo.py
+++ b/client/bin/site_sysinfo.py
@@ -4,7 +4,8 @@
 
 import os
 
-from autotest_lib.client.common_lib import utils, global_config
+from autotest_lib.client.common_lib import log
+from autotest_lib.client.common_lib import error, utils, global_config
 from autotest_lib.client.bin import base_sysinfo
 from autotest_lib.client.cros import constants
 
@@ -69,6 +70,152 @@
                                           log_dir, parent_dir))
 
 
+class file_stat(object):
+    """Snapshot of a file's size and inode, used to find data added later."""
+    def __init__(self, file_path):
+        """Record the current size and inode number of a file.
+
+        @param file_path: full path to the file to inspect.
+
+        """
+        snapshot = os.stat(file_path)
+        # Inode at snapshot time. A different inode later means the file was
+        # replaced, so the whole file is treated as new and copied.
+        self.st_ino = snapshot.st_ino
+        # Size at snapshot time; this many bytes are skipped when diffing.
+        self.st_size = snapshot.st_size
+
+
+class diffable_logdir(logdir):
+    """Represents a log directory of which only new content is copied.
+
+    The run method is meant to be called twice around the code being
+    logged: first with collect_init_status=True to record the size and inode
+    of every file, then with collect_init_status=False to copy the data
+    written since then. Files that were not recorded, were replaced (new
+    inode) or shrank are copied in full; unchanged files are skipped.
+
+    """
+    def __init__(self, directory, additional_exclude=None,
+                 keep_file_hierarchy=True, append_diff_in_name=True):
+        """Constructor of a diffable_logdir instance.
+
+        @param directory: directory to be diffed after an iteration finished.
+        @param additional_exclude: passed on to logdir; the diff ignores it.
+        @param keep_file_hierarchy: True if need to preserve full path, e.g.,
+            sysinfo/var/log/sysstat, vs. sysinfo/sysstat if it's False.
+        @param append_diff_in_name: True if you want to append '_diff' to the
+            folder name to indicate it's a diff, e.g., var/log_diff. Option
+            keep_file_hierarchy must be True for this to take effect.
+
+        """
+        super(diffable_logdir, self).__init__(directory, additional_exclude)
+        self.additional_exclude = additional_exclude
+        self.keep_file_hierarchy = keep_file_hierarchy
+        self.append_diff_in_name = append_diff_in_name
+        # Path -> file_stat of the latest snapshot, and whether one was taken.
+        self._log_stats = {}
+        self.file_stats_collected = False
+
+
+    def _get_init_status_of_src_dir(self, src_dir):
+        """Record the current size and inode of every file in src_dir.
+
+        @param src_dir: directory to be diff-ed.
+
+        """
+        # Rebuild from scratch so stale entries of deleted files are dropped.
+        self._log_stats = dict((path, file_stat(path))
+                               for path in self._get_all_files(src_dir))
+        self.file_stats_collected = True
+
+
+    def _get_all_files(self, path):
+        """Iterate through files in given path including subdirectories.
+
+        @param path: root directory.
+        @return: an iterator over the full paths of all files under path whose
+            name does not start with 'autoserv'; empty if path is missing.
+
+        """
+        if not os.path.exists(path):
+            return
+        for root, dirs, files in os.walk(path):
+            for f in files:
+                if f.startswith('autoserv'):
+                    continue
+                yield os.path.join(root, f)
+
+
+    def _copy_new_data_in_file(self, file_path, src_dir, dest_dir):
+        """Copy all new data in a file to target directory.
+
+        @param file_path: full path to the file to be copied.
+        @param src_dir: source directory to do the diff.
+        @param dest_dir: target directory to store new data of src_dir.
+        """
+        import logging  # NOTE(review): no module-level import is visible.
+        try:
+            bytes_to_skip = 0
+            prev_stat = self._log_stats.get(file_path)
+            if prev_stat is not None:
+                new_stat = os.stat(file_path)
+                if new_stat.st_ino == prev_stat.st_ino:
+                    bytes_to_skip = prev_stat.st_size
+                if new_stat.st_size == bytes_to_skip:
+                    return
+                elif new_stat.st_size < prev_stat.st_size:
+                    # File is modified to a smaller size, copy whole file.
+                    bytes_to_skip = 0
+            # Binary mode so the byte offset taken from st_size is exact.
+            with open(file_path, 'rb') as in_log:
+                in_log.seek(bytes_to_skip)
+                # Skip src_dir in path, e.g., src_dir/[sub_dir]/file_name.
+                rel_path = os.path.relpath(file_path, src_dir)
+                target_path = os.path.join(dest_dir, rel_path)
+                target_dir = os.path.dirname(target_path)
+                if not os.path.exists(target_dir):
+                    os.makedirs(target_dir)
+                with open(target_path, 'wb') as out_log:
+                    out_log.write(in_log.read())
+        except (IOError, OSError) as e:
+            logging.error('Diff %s failed with error: %s', file_path, e)
+
+
+    def _log_diff(self, src_dir, dest_dir):
+        """Log all of the new data in src_dir to dest_dir.
+
+        @param src_dir: source directory to do the diff.
+        @param dest_dir: target directory to store new data of src_dir.
+
+        """
+        if self.keep_file_hierarchy:
+            sub_dir = src_dir.lstrip('/')
+            if self.append_diff_in_name:
+                sub_dir = sub_dir.rstrip('/') + '_diff'
+            dest_dir = os.path.join(dest_dir, sub_dir)
+
+        if not os.path.exists(dest_dir):
+            os.makedirs(dest_dir)
+
+        for src_file in self._get_all_files(src_dir):
+            self._copy_new_data_in_file(src_file, src_dir, dest_dir)
+
+
+    def run(self, log_dir, collect_init_status=True):
+        """Copies new content from self.dir to the destination log_dir.
+
+        @param log_dir: The destination log directory (unused for snapshots).
+        @param collect_init_status: Set to True if run method is called to
+            collect the initial status of files.
+
+        """
+        if collect_init_status:
+            self._get_init_status_of_src_dir(self.dir)
+        elif os.path.exists(self.dir):
+            self._log_diff(self.dir, log_dir)
+
+
 class purgeable_logdir(logdir):
     """Represents a log directory that will be purged."""
     def __init__(self, directory, additional_exclude=None):
@@ -94,6 +241,14 @@
         if not collect_corefiles:
             crash_exclude_string = "*.core"
 
+        # Kept in its own diffable_loggables set rather than the regular
+        # loggable sets: log_before_each_test calls run to collect file
+        # status in the directory, and log_after_each_test calls run again
+        # to execute the diff.
+        diffable_log = diffable_logdir(constants.LOG_DIR)
+        self.diffable_loggables = set()
+        self.diffable_loggables.add(diffable_log)
+
         # add in some extra command logging
         self.boot_loggables.add(command("ls -l /boot",
                                         "boot_file_list"))
@@ -102,7 +257,6 @@
         self.test_loggables.add(
             purgeable_logdir(
                 os.path.join(constants.CRYPTOHOME_MOUNT_PT, "log")))
-        self.test_loggables.add(logdir("/var/log"))
         # We only want to gather and purge crash reports after the client test
         # runs in case a client test is checking that a crash found at boot
         # (such as a kernel crash) is handled.
@@ -127,6 +281,44 @@
             purgeable_logdir(constants.CRASH_REPORTER_RESIDUE_DIR))
 
 
+    @log.log_and_ignore_errors("pre-test sysinfo error:")
+    def log_before_each_test(self, test):
+        """Logging hook called before a test starts.
+
+        @param test: A test object.
+        """
+        super(site_sysinfo, self).log_before_each_test(test)
+        # Snapshot file sizes/inodes now; log_dir is unused in this mode.
+        for loggable in self.diffable_loggables:
+            loggable.run(log_dir=None, collect_init_status=True)
+
+
+    @log.log_and_ignore_errors("post-test sysinfo error:")
+    def log_after_each_test(self, test):
+        """Logging hook called after a test finishes.
+
+        @param test: A test object.
+        """
+        super(site_sysinfo, self).log_after_each_test(test)
+        # Copy only the data written since the pre-test snapshot.
+        test_sysinfodir = self._get_sysinfodir(test.outputdir)
+        for loggable in self.diffable_loggables:
+            loggable.run(log_dir=test_sysinfodir, collect_init_status=False)
+
+
+    def _get_chrome_version(self):
+        """Gets the Chrome version number as a string.
+
+        @return The current Chrome version number as a string.  It is specified
+            in format "X.X.X.X" if it can be parsed in that format, otherwise
+            it is specified as the full output of self._CHROME_VERSION_COMMAND.
+        """
+        import re  # NOTE(review): no module-level import is visible here.
+        version_string = utils.system_output(self._CHROME_VERSION_COMMAND)
+        match = re.search(r'\d+\.\d+\.\d+\.\d+', version_string)
+        return match.group(0) if match else version_string
+
+
     def log_test_keyvals(self, test_sysinfodir):
         keyval = super(site_sysinfo, self).log_test_keyvals(test_sysinfodir)
 
diff --git a/client/bin/site_sysinfo_unittest.py b/client/bin/site_sysinfo_unittest.py
new file mode 100644
index 0000000..c9078fc
--- /dev/null
+++ b/client/bin/site_sysinfo_unittest.py
@@ -0,0 +1,99 @@
+#!/usr/bin/python
+
+"""Tests for site_sysinfo."""
+
+__author__ = 'dshi@google.com (Dan Shi)'
+
+import common
+import os
+import random
+import unittest
+from autotest_lib.client.bin import site_sysinfo
+from autotest_lib.client.common_lib import autotemp
+
+
+class diffable_logdir_test(unittest.TestCase):
+    """Tests for methods in class diffable_logdir."""
+
+
+    def setUp(self):
+        """Initialize a temp directory with test files."""
+        self.tempdir = autotemp.tempdir(unique_id='diffable_logdir')
+        self.src_dir = os.path.join(self.tempdir.name, 'src')
+        self.dest_dir = os.path.join(self.tempdir.name, 'dest')
+
+        self.existing_files = ['existing_file_'+str(i) for i in range(3)]
+        self.existing_files_folder = ['', 'sub', 'sub/sub2']
+        self.existing_files_path = [os.path.join(self.src_dir, folder, f)
+                                    for f, folder in zip(self.existing_files,
+                                                self.existing_files_folder)]
+        self.new_files = ['new_file_'+str(i) for i in range(2)]
+        self.new_files_folder = ['sub', 'sub/sub3']
+        self.new_files_path = [os.path.join(self.src_dir, folder, f)
+                                    for f, folder in zip(self.new_files,
+                                                self.new_files_folder)]
+
+        # Create some file with random data in source directory.
+        for p in self.existing_files_path:
+            self.append_text_to_file(str(random.random()), p)
+
+
+    def tearDown(self):
+        """Clean up."""
+        self.tempdir.clean()
+
+
+    def append_text_to_file(self, text, file_path):
+        """Append text to the end of a file, create the file if not existed.
+
+        @param text: text to be appended to a file.
+        @param file_path: path to the file.
+
+        """
+        dir_name = os.path.dirname(file_path)
+        if not os.path.exists(dir_name):
+            os.makedirs(dir_name)
+        with open(file_path, 'a') as f:
+            f.write(text)
+
+
+    def test_diffable_logdir_success(self):
+        """Test the diff function to save new data from a directory."""
+        info = site_sysinfo.diffable_logdir(self.src_dir,
+                                            keep_file_hierarchy=False,
+                                            append_diff_in_name=False)
+        # Run the first time to collect file status.
+        info.run(log_dir=None, collect_init_status=True)
+
+        # Add new files to the test directory.
+        for file_name, file_path in zip(self.new_files,
+                                         self.new_files_path):
+            self.append_text_to_file(file_name, file_path)
+
+        # Temp file for existing_file_2, used to hold on the inode. If the
+        # file is deleted and recreated, its inode might not change.
+        existing_file_2 = self.existing_files_path[2]
+        existing_file_2_tmp = existing_file_2 + '_tmp'
+        os.rename(existing_file_2, existing_file_2_tmp)
+
+        # Append data to existing file.
+        for file_name, file_path in zip(self.existing_files,
+                                         self.existing_files_path):
+            self.append_text_to_file(file_name, file_path)
+
+        # Remove the tmp file.
+        os.remove(existing_file_2_tmp)
+
+        # Run the second time to do diff.
+        info.run(self.dest_dir, collect_init_status=False)
+
+        # Validate files in dest_dir; only the src_dir prefix is swapped.
+        for file_name, file_path in zip(self.existing_files+self.new_files,
+                                self.existing_files_path+self.new_files_path):
+            file_path = file_path.replace(self.src_dir, self.dest_dir, 1)
+            with open(file_path, 'r') as f:
+                self.assertEqual(file_name, f.read())
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/client/cros/constants.py b/client/cros/constants.py
index eff1402..995df90 100644
--- a/client/cros/constants.py
+++ b/client/cros/constants.py
@@ -7,7 +7,7 @@
 # Constants used by other constants.
 USER_DATA_DIR = '/home/chronos'
 WHITELIST_DIR = '/var/lib/whitelist'
-
+LOG_DIR = '/var/log'
 
 # Rest of constants.
 BROWSER = 'chrome'