blob: b92498b17fb298c64514d5e0fa1b8b6bc2023288 [file] [log] [blame]
#!/usr/bin/python
# Copyright 2014 The Chromium OS Authors. All rights reserved.
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.
"""Script to remove unused gconv charset modules from a build."""
import logging.handlers
import argparse
import operator
import os
import stat
import subprocess
import sys
# Candidate locations of the gconv-modules file, expressed relative to the
# root directory being processed. Which one exists depends on the platform
# (lib64 vs. lib).
GCONV_MODULES_PATHS = (
    'usr/lib64/gconv/gconv-modules',
    'usr/lib/gconv/gconv-modules',
)
# Names of the functions (symbols) known to take a charset as a parameter. A
# binary referencing any of them is scanned for charset names.
GCONV_SYMBOLS = (
    # Provided by glibc.
    'iconv_open',
    'iconv',

    # Provided by glib.
    'g_convert',
    'g_convert_with_fallback',
    'g_iconv',
    'g_locale_to_utf8',
    'g_get_charset',
)
class GconvModules(object):
  """Class to manipulate the gconv/gconv-modules file and referenced modules.

  This class parses the contents of the gconv-modules file installed by glibc
  which provides the definition of the charsets supported by iconv_open(3). It
  allows to load the current gconv-modules file and rewrite it to include only
  a subset of the supported modules, removing the other modules.

  Each charset is involved on some transformation between that charset and an
  internal representation. This transformation is defined on a .so file loaded
  dynamically with dlopen(3) when the charset defined in this file is requested
  to iconv_open(3).

  See the comments on gconv-modules file for syntax details.
  """

  def __init__(self, gconv_modules_fn):
    """Initialize the class.

    Args:
      gconv_modules_fn: Path to gconv/gconv-modules file.
    """
    self._fn = gconv_modules_fn

    # An alias map of charsets. The key (fromcharset) is the alias name and
    # the value (tocharset) is the real charset name. We also support a value
    # that is an alias for another charset.
    self._alias = {}

    # The modules dict goes from charset to module names (the filenames without
    # the .so extension). Since several transformations involving the same
    # charset could be defined in different files, the values of this dict are
    # a set of module names.
    self._modules = {}

  def _ResolveAlias(self, charset):
    """Follow the alias chain starting at |charset| and return its end.

    The walk stops as soon as a name is visited twice, so a file with a cyclic
    alias definition cannot hang the script.

    Args:
      charset: A charset or alias name, without trailing slashes.

    Returns:
      The last name reached following the aliases.
    """
    visited = set()
    while charset in self._alias and charset not in visited:
      visited.add(charset)
      charset = self._alias[charset]
    return charset

  def Load(self):
    """Load the charsets from gconv-modules.

    Returns:
      A sorted list with the name of every charset and alias defined in the
      file, without duplicates and without the 'INTERNAL' pseudo-charset.

    Raises:
      ValueError: A 'module' line has fewer than four fields or an 'alias'
        line does not have exactly three fields.
    """
    with open(self._fn) as f:
      for line in f:
        line = line.rstrip('\n')
        if not line or line[0] == '#':  # Comment
          continue

        lst = line.split()
        if not lst:
          continue
        elif lst[0] == 'module':
          # Any field after the filename (the cost) is ignored.
          _, fromset, toset, filename = lst[:4]
          for charset in (fromset, toset):
            charset = charset.rstrip('/')
            self._modules.setdefault(charset, set()).add(filename)
        elif lst[0] == 'alias':
          _, fromset, toset = lst
          fromset = fromset.rstrip('/')
          toset = toset.rstrip('/')
          # Warn if the same charset is defined as two different aliases. The
          # last definition wins.
          if self._alias.get(fromset, toset) != toset:
            logging.error('charset "%s" already defined as "%s".',
                          fromset, self._alias[fromset])
          self._alias[fromset] = toset
        else:
          logging.error('Unknown line: %s', line)

    logging.debug('Found %d modules and %d alias on %s',
                  len(self._modules), len(self._alias), self._fn)

    charsets = set(self._alias) | set(self._modules)
    # Remove the 'INTERNAL' charset from the list, since it is not a charset
    # but an internal representation used to convert to and from other
    # charsets.
    charsets.discard('INTERNAL')
    return sorted(charsets)

  def Rewrite(self, used_charsets, dry_run=False):
    """Rewrite gconv-modules file with only the used charsets.

    The new file contents are computed before any module is deleted, so a
    malformed line or an unknown charset aborts the call without having
    removed anything.

    Args:
      used_charsets: A list of used charsets. This should be a subset of the
        list returned by Load().
      dry_run: Whether this function should not change any file.

    Raises:
      KeyError: A used charset, or the target of an alias in the file, has no
        module known from Load().
      ValueError: A 'module' or 'alias' line has an unexpected number of
        fields.
    """
    # Compute the used modules.
    used_modules = set()
    for charset in used_charsets:
      used_modules.update(self._modules[self._ResolveAlias(charset)])
    # Starting from an empty set makes the union work with no modules at all.
    unused_modules = set().union(*self._modules.values()) - used_modules

    logging.debug('Used modules: %s', ', '.join(sorted(used_modules)))

    # Recompute the gconv-modules file with only the included gconv modules.
    result = []
    with open(self._fn) as f:
      for ln in f:
        line = ln.rstrip('\n')
        lst = line.split()
        if not line or line[0] == '#' or not lst:
          result.append(ln)  # Keep comments and copyright headers.
        elif lst[0] == 'module':
          _, _, _, filename = lst[:4]
          if filename in used_modules:
            result.append(ln)  # Used module
        elif lst[0] == 'alias':
          _, charset, _ = lst
          charset = self._ResolveAlias(charset.rstrip('/'))
          if used_modules.intersection(self._modules[charset]):
            result.append(ln)  # Alias to an used module
        else:
          logging.error('Unknown line: %s', line)

    # Only now that the file parsed cleanly, remove the unused module files.
    modules_dir = os.path.dirname(self._fn)
    unused_size = 0
    for module in sorted(unused_modules):
      module_path = os.path.join(modules_dir, '%s.so' % module)
      unused_size += os.lstat(module_path).st_size
      logging.debug('rm %s', module_path)
      if not dry_run:
        os.unlink(module_path)
    logging.info('Using %d gconv modules. Removed %d unused modules (%.1f KiB)',
                 len(used_modules), len(unused_modules), unused_size / 1024.)

    if not dry_run:
      with open(self._fn, 'w') as f:
        f.writelines(result)
def MultipleStringMatch(patterns, corpus):
  """Report which of several strings occur in a corpus string.

  Args:
    patterns: A list of strings to look for.
    corpus: The text in which the strings are searched.

  Returns:
    A list of Booleans, one per entry of |patterns| and in the same order,
    telling whether that pattern was found in the corpus.
  """
  # TODO(deymo): Use an implementation of Aho-Corasick to speedup this search.
  found = []
  for needle in patterns:
    found.append(needle in corpus)
  return found
def GconvStrip(args):
  """Process gconv-modules and remove unused modules.

  Locates the gconv-modules file under args.root, uses scanelf to find the
  files referencing any of GCONV_SYMBOLS, searches those files for the charset
  names defined in gconv-modules and rewrites gconv-modules keeping only the
  charsets that were found.

  Args:
    args: The command-line args passed to the script. The attributes read are
      root, verbose and dry_run.

  Returns:
    The exit code number indicating whether the process succeeded: 0 on
    success, 1 when no gconv-modules file or more than one was found.

  Raises:
    Exception: args.root is not a directory.
  """
  root_st = os.lstat(args.root)
  if not stat.S_ISDIR(root_st.st_mode):
    raise Exception('root (%s) must be a directory.' % args.root)

  # Detect the possible locations of the gconv-modules file.
  gconv_modules_files = []
  for rootfs_gconv_modules in GCONV_MODULES_PATHS:
    gconv_modules = os.path.join(args.root, rootfs_gconv_modules)
    if os.path.exists(gconv_modules):
      gconv_modules_files.append(gconv_modules)

  if not gconv_modules_files:
    logging.error('gconv-modules file not found.')
    return 1

  # Only one gconv-modules files should be present, either on /usr/lib or
  # /usr/lib64, but not both.
  if len(gconv_modules_files) > 1:
    logging.error('Found several gconv-modules files.')
    return 1

  gconv_modules_fn = gconv_modules_files[0]
  logging.info('Removing unused gconv files defined in %s', gconv_modules_fn)

  gmods = GconvModules(gconv_modules_fn)
  charsets = gmods.Load()

  # Use scanelf to search for all the binary files on the rootfs that require
  # or define one of the GCONV_SYMBOLS. We also include the binaries that
  # define it since there could be internal calls to it from other functions.
  files = set()
  for symbol in GCONV_SYMBOLS:
    output = subprocess.check_output([
        'scanelf', '--mount', '--quiet', '--recursive', '--symbol', symbol,
        args.root])
    if not isinstance(output, str):
      # The output is bytes where str is a text type; decode it the same way
      # the interpreter decodes file names so the paths can be opened below.
      output = os.fsdecode(output)
    symbol_files = [l.split()[1] for l in output.splitlines()
                    if l.startswith(symbol)]
    logging.debug('Symbol %s found on %d files.', symbol, len(symbol_files))
    files.update(symbol_files)

  # The charsets are represented as null-terminated strings on the binary
  # files, so we append the '\0' to each string. This prevents some false
  # positives when the name of the charset is a substring of some other string.
  # It doesn't prevent false positives when the charset name is the suffix of
  # another string, for example a binary with the string "DON'T DO IT\0" will
  # match the 'IT' charset. Empirical test on ChromeOS images suggests that
  # only 4 charsets could fall in category.
  strings = [s + '\0' for s in charsets]
  # The files are read in binary mode, so the searched strings must be byte
  # strings too. |strings| is kept as is for the log messages.
  needles = [s if isinstance(s, bytes) else s.encode('utf-8') for s in strings]
  logging.debug('Will search for %d strings in %d files',
                len(strings), len(files))

  global_used = [False] * len(strings)
  for fn in files:
    with open(fn, 'rb') as f:
      used_fn = MultipleStringMatch(needles, f.read())

    # Keep a real list (not a lazy map object) since it is combined again on
    # the next file and zipped with |charsets| at the end.
    global_used = [seen or found for seen, found in zip(global_used, used_fn)]
    # Check the verbose flag to avoid running an useless loop.
    if args.verbose and any(used_fn):
      logging.debug('File %s:', fn)
      for string, found in zip(strings, used_fn):
        if found:
          logging.debug(' - %s:', string)

  used_charsets = [charset for charset, used in zip(charsets, global_used)
                   if used]
  gmods.Rewrite(used_charsets, args.dry_run)
  return 0
def main():
  """Parse the command line, configure logging and run GconvStrip().

  Returns:
    The exit code returned by GconvStrip().
  """
  arg_parser = argparse.ArgumentParser()
  arg_parser.add_argument('-V', '--verbose', action='store_true',
                          help='Verbose')
  arg_parser.add_argument('--dry-run', action='store_true',
                          help="process but don't modify any file.")
  arg_parser.add_argument(
      'root', help='path to the directory where the rootfs is mounted.')

  # Logging starts at INFO and is only raised to DEBUG once the options are
  # known.
  logging.basicConfig(
      level=logging.INFO,
      format='%(asctime)s - %(filename)s - %(levelname)-8s: %(message)s',
      datefmt='%Y/%m/%d %H:%M:%S')

  options = arg_parser.parse_args()
  if options.verbose:
    logging.getLogger().setLevel(logging.DEBUG)
  logging.debug('Options are %s ', options)

  return GconvStrip(options)
if __name__ == '__main__':
  # Hand the value returned by main() to the shell as the exit status.
  exit_code = main()
  sys.exit(exit_code)