blob: f682b56675673c65833579741b2e8c05305e3a67 [file] [log] [blame]
# Copyright 2017 The Chromium OS Authors. All rights reserved.
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.
"""This throttler tries to remove the remove repeated files sharing the same
prefix, for example, screenshots or dumps in the same folder. The dedupe logic
does not compare the file content, instead, it sorts the files with the same
prefix and remove files in the middle part.
"""
import os
import re
import result_info_lib
import throttler_lib
import utils_lib
# Number of files to keep for the oldest files.
OLDEST_FILES_TO_KEEP_COUNT = 2
# Number of files to keep for the newest files.
NEWEST_FILES_TO_KEEP_COUNT = 1
# Files with path matching following patterns should not be deduped.
# Raw strings are used so `\d` is passed to the regex engine verbatim, and
# literal dots are escaped so e.g. `perf.data` does not match `perfXdata`.
NO_DEDUPE_FILE_PATTERNS = [
    'debug/.*',
    r'.*perf\.data$',  # Performance test data.
    '.*/debug/.*',
    r'.*dir_summary_\d+\.json',
]
# Regex pattern to get the prefix of a file name: the leading run of
# letters, underscores and dashes.
PREFIX_PATTERN = '([a-zA-Z_-]*).*'
def _group_by(file_infos, keys):
"""Group the file infos by the given keys.
@param file_infos: A list of ResultInfo objects.
@param keys: A list of names of the attribute to group the file infos by.
@return: A dictionary of grouped_key: [ResultInfo].
"""
grouped_infos = {}
for info in file_infos:
key_values = []
for key in keys:
key_values.append(getattr(info, key))
grouped_key = os.sep.join(key_values)
if grouped_key not in grouped_infos:
grouped_infos[grouped_key] = []
grouped_infos[grouped_key].append(info)
return grouped_infos
def _dedupe_files(summary, file_infos, max_result_size_KB):
    """De-duplicate a group of same-prefix files and update the summary.

    The files are ordered by last-modification time; the oldest
    OLDEST_FILES_TO_KEEP_COUNT and the newest NEWEST_FILES_TO_KEEP_COUNT
    entries are kept and the ones in between are deleted. Deletion stops
    early once the result size drops under the throttle limit.

    @param summary: A ResultInfo object containing result summary.
    @param file_infos: A list of ResultInfo objects to be de-duplicated.
    @param max_result_size_KB: Maximum test result size in KB.
    """
    # Sort in place by modification time so slicing keeps the oldest and
    # newest files of the group.
    file_infos.sort(
            key=lambda info: result_info_lib.get_last_modification_time(
                    info.path))
    candidates = file_infos[
            OLDEST_FILES_TO_KEEP_COUNT:-NEWEST_FILES_TO_KEEP_COUNT]
    for candidate in candidates:
        if not throttler_lib.try_delete_file_on_disk(candidate.path):
            continue
        candidate.trimmed_size = 0
        # Stop as soon as the result size is within the limit.
        if throttler_lib.check_throttle_limit(summary, max_result_size_KB):
            return
def throttle(summary, max_result_size_KB):
    """Throttle the files in summary by de-duplicating files.

    Processing stops once all files have been examined or the result size
    has been reduced to be under the given max_result_size_KB.

    @param summary: A ResultInfo object containing result summary.
    @param max_result_size_KB: Maximum test result size in KB.
    """
    _, grouped_files = throttler_lib.sort_result_files(summary)
    for pattern in throttler_lib.RESULT_THROTTLE_PRIORITY:
        throttleable = list(throttler_lib.get_throttleable_files(
                grouped_files[pattern], NO_DEDUPE_FILE_PATTERNS))
        # Tag each file with its directory and file-name prefix so files
        # can be grouped per (directory, prefix) pair.
        for entry in throttleable:
            entry.parent_dir = os.path.dirname(entry.path)
            entry.prefix = re.match(PREFIX_PATTERN, entry.name).group(1)
        keep_count = OLDEST_FILES_TO_KEEP_COUNT + NEWEST_FILES_TO_KEEP_COUNT
        for group in _group_by(throttleable,
                               ['parent_dir', 'prefix']).values():
            if len(group) <= keep_count:
                # Too few files in this group to be worth de-duplicating.
                continue
            utils_lib.LOG('De-duplicating files in %s with the same prefix of '
                          '"%s"' % (group[0].parent_dir, group[0].prefix))
            _dedupe_files(summary, group, max_result_size_KB)
            if throttler_lib.check_throttle_limit(summary, max_result_size_KB):
                return