# Copyright 2014 The ChromiumOS Authors
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.
"""Script to remove unused gconv charset modules from a build."""
import functools
import glob
import logging
import operator
import os
import stat
from chromite.third_party import lddtree
from chromite.lib import commandline
from chromite.lib import cros_build_lib
from chromite.lib import osutils
try:
import pytest # pylint: disable=import-error
ahocorasick = pytest.importorskip("ahocorasick")
except ImportError:
import ahocorasick
# Path pattern to search for the gconv-modules file.
GCONV_MODULES_PATH = "usr/*/gconv/gconv-modules"
# Sticky modules. These charset modules are always included even if they
# aren't used. You can specify any charset name as supported by 'iconv_open',
# for example, 'LATIN1' or 'ISO-8859-1'.
STICKY_MODULES = ("UTF-16", "UTF-32", "UNICODE")
# List of function names (symbols) known to use a charset as a parameter.
GCONV_SYMBOLS = (
# glibc
"iconv_open",
"iconv",
# glib
"g_convert",
"g_convert_with_fallback",
"g_iconv",
"g_locale_to_utf8",
"g_get_charset",
)
class GconvModules:
"""Class to manipulate the gconv/gconv-modules file and referenced modules.
This class parses the contents of the gconv-modules file installed by glibc,
which defines the charsets supported by iconv_open(3). It can load the
current gconv-modules file and rewrite it to include only a subset of the
supported modules, removing the rest.
Each charset is involved in some transformation between that charset and an
internal representation. Each transformation is implemented in a .so file
that is loaded dynamically with dlopen(3) when the charset defined in this
file is requested via iconv_open(3).
See the comments in the gconv-modules file for syntax details.
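For reference, entries in that file look roughly like this (illustrative
excerpt, not taken from any particular build):
alias LATIN1// ISO-8859-1//
module ISO-8859-1// INTERNAL ISO8859-1 1
module INTERNAL ISO-8859-1// ISO8859-1 1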
"""
def __init__(self, gconv_modules_file, modules_dir):
"""Initialize the class.
Args:
gconv_modules_file: Path to gconv/gconv-modules file.
modules_dir: Path to the directory that contains the gconv modules.
"""
self._filename = gconv_modules_file
self._modules_dir = modules_dir
# An alias map of charsets. The key (fromcharset) is the alias name and
# the value (tocharset) is the real charset name. We also support a
# value that is an alias for another charset.
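# For example (illustrative), an 'alias LATIN1// ISO-8859-1//' line would
# result in self._alias["LATIN1"] == "ISO-8859-1" after parsing.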
self._alias = {}
# The modules dict goes from charset to module names (the filenames
# without the .so extension). Since several transformations involving
# the same charset could be defined in different files, the values of
# this dict are a set of module names.
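# For example (illustrative), a 'module ISO-8859-1// INTERNAL ISO8859-1 1'
# line adds the module name "ISO8859-1" to the sets for both the
# "ISO-8859-1" and "INTERNAL" keys.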
self._modules = {}
def Load(self):
"""Load the charsets from gconv-modules."""
with open(self._filename, encoding="utf-8") as fp:
for line in fp:
line = line.split("#", 1)[0].strip()
if not line:
# Ignore blank lines & comments.
continue
lst = line.split()
if lst[0] == "module":
_, fromset, toset, filename = lst[:4]
for charset in (fromset, toset):
charset = charset.rstrip("/")
mods = self._modules.get(charset, set())
mods.add(filename)
self._modules[charset] = mods
elif lst[0] == "alias":
_, fromset, toset = lst
fromset = fromset.rstrip("/")
toset = toset.rstrip("/")
# Warn if the same alias name is mapped to two different
# charsets.
if self._alias.get(fromset, toset) != toset:
logging.error(
'charset "%s" already defined as "%s".',
fromset,
self._alias[fromset],
)
self._alias[fromset] = toset
else:
cros_build_lib.Die("Unknown line: %s", line)
logging.debug(
"Found %d modules and %d alias in %s",
len(self._modules),
len(self._alias),
self._filename,
)
charsets = sorted(list(self._alias) + list(self._modules))
# Remove the 'INTERNAL' charset from the list, since it is not a charset
# but an internal representation used to convert to and from other
# charsets.
if "INTERNAL" in charsets:
charsets.remove("INTERNAL")
return charsets
def Rewrite(self, used_charsets, dryrun=False):
"""Rewrite gconv-modules file with only the used charsets.
Args:
used_charsets: A list of used charsets. This should be a subset of
the list returned by Load().
dryrun: If True, compute everything but do not modify any files.
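Example (illustrative): Rewrite(["UTF-16", "EUC-JP"]) deletes every
module .so not needed by those charsets and rewrites gconv-modules to
list only the remaining entries.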
"""
# Compute the used modules.
used_modules = set()
for charset in used_charsets:
while charset in self._alias:
charset = self._alias[charset]
used_modules.update(self._modules[charset])
unused_modules = (
functools.reduce(set.union, list(self._modules.values()))
- used_modules
)
all_modules = set.union(used_modules, unused_modules)
# The set of modules that depend on a given library. For example,
# libdeps['libCNS.so'] is the set of all the modules that require that
# library. These libraries live in the same directory as the modules.
libdeps = {}
for module in all_modules:
deps = lddtree.ParseELF(
os.path.join(self._modules_dir, "%s.so" % module),
self._modules_dir,
[],
)
if "needed" not in deps:
continue
for lib in deps["needed"]:
# Ignore the libs without a path defined (outside the
# modules_dir).
if deps["libs"][lib]["path"]:
libdeps[lib] = libdeps.get(lib, set()).union([module])
used_libdeps = set(
lib
for lib, deps in libdeps.items()
if deps.intersection(used_modules)
)
unused_libdeps = set(libdeps).difference(used_libdeps)
logging.debug("Used modules: %s", ", ".join(sorted(used_modules)))
logging.debug("Used dependency libs: %s, ".join(sorted(used_libdeps)))
unused_size = 0
for module in sorted(unused_modules):
module_path = os.path.join(self._modules_dir, "%s.so" % module)
unused_size += os.lstat(module_path).st_size
logging.debug("rm %s", module_path)
if not dryrun:
os.unlink(module_path)
unused_libdeps_size = 0
for lib in sorted(unused_libdeps):
lib_path = os.path.join(self._modules_dir, lib)
unused_libdeps_size += os.lstat(lib_path).st_size
logging.debug("rm %s", lib_path)
if not dryrun:
os.unlink(lib_path)
logging.info(
"Done. Using %d gconv modules. Removed %d unused modules"
" (%.1f KiB) and %d unused dependencies (%.1f KiB)",
len(used_modules),
len(unused_modules),
unused_size / 1024.0,
len(unused_libdeps),
unused_libdeps_size / 1024.0,
)
# Recompute the gconv-modules file with only the included gconv modules.
result = []
with open(self._filename, encoding="utf-8") as fp:
for line in fp:
lst = line.split("#", 1)[0].strip().split()
if not lst:
# Keep comments and copyright headers.
result.append(line)
elif lst[0] == "module":
_, _, _, filename = lst[:4]
if filename in used_modules:
# Used module
result.append(line)
elif lst[0] == "alias":
_, charset, _ = lst
charset = charset.rstrip("/")
while charset in self._alias:
charset = self._alias[charset]
if used_modules.intersection(self._modules[charset]):
# Alias to a used module
result.append(line)
else:
cros_build_lib.Die("Unknown line: %s", line)
if not dryrun:
osutils.WriteFile(self._filename, "".join(result))
def MultipleStringMatch(patterns, corpus):
"""Search a list of strings in a corpus string.
Args:
patterns: A list of strings.
corpus: The text where to search for the strings.
Returns:
A list of Booleans stating whether each pattern string was found in the
corpus or not.
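Example (illustrative; in this script both patterns and corpus are bytes):
MultipleStringMatch([b"UTF-8\x00", b"BIG5\x00"], data) -> [True, False]
when data contains b"UTF-8\x00" but not b"BIG5\x00".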
"""
result = [False] * len(patterns)
tree = ahocorasick.Automaton()
for i, word in enumerate(patterns):
tree.add_word(word, i)
tree.make_automaton()
for _, i in tree.iter(corpus):
result[i] = True
return result
def GconvStrip(opts):
"""Process gconv-modules and remove unused modules.
Args:
opts: The command-line args passed to the script.
Returns:
The exit code number indicating whether the process succeeded.
"""
root_st = os.lstat(opts.root)
if not stat.S_ISDIR(root_st.st_mode):
cros_build_lib.Die("root (%s) must be a directory.", opts.root)
# Detect the possible locations of the gconv-modules file.
gconv_modules_files = glob.glob(os.path.join(opts.root, GCONV_MODULES_PATH))
if not gconv_modules_files:
logging.warning("gconv-modules file not found.")
return 1
# Only one gconv-modules file should be present, either in /usr/lib or
# /usr/lib64, but not both.
if len(gconv_modules_files) > 1:
cros_build_lib.Die("Found several gconv-modules files.")
gconv_modules_file = gconv_modules_files[0]
logging.info(
"Searching for unused gconv files defined in %s", gconv_modules_file
)
# Additional gconv-modules configuration files can be present in the
# co-located gconv-modules.d. glibc installs a gconv-modules-extra.conf
# here by default.
modules_dir = os.path.dirname(gconv_modules_file)
extras = glob.glob(
os.path.join(
modules_dir,
os.path.basename(gconv_modules_file) + ".d",
"*.conf",
)
)
gmods_groups = [GconvModules(gconv_modules_file, modules_dir)]
gmods_groups.extend(GconvModules(x, modules_dir) for x in extras)
# Use scanelf to search for all the binary files on the rootfs that require
# or define the symbol iconv_open. We also include the binaries that define
# it since there could be internal calls to it from other functions.
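# The expected output is one matching file path per line; the "#s%F"
# format appears to ask scanelf to print only the full path of files that
# contain any of the requested symbols (assumption based on scanelf's
# format-string syntax).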
symbols = ",".join(GCONV_SYMBOLS)
cmd = [
"scanelf",
"--mount",
"--quiet",
"--recursive",
"--format",
"#s%F",
"--symbol",
symbols,
opts.root,
]
result = cros_build_lib.run(
cmd, stdout=True, print_cmd=False, encoding="utf-8"
)
files = set(result.stdout.splitlines())
logging.debug("Symbols %s found on %d files.", symbols, len(files))
for gmods in gmods_groups:
charsets = gmods.Load()
# The charsets are represented as nul-terminated strings in the binary
# files, so we append the '\0' to each string. This prevents some false
# positives when the name of the charset is a substring of some other
# string. It doesn't prevent false positives when the charset name is
# the suffix of another string, for example a binary with the string
# "DON'T DO IT\0" will match the 'IT' charset. Empirical test on
# ChromeOS images suggests that only 4 charsets could fall in category.
strings = [s.encode("utf-8") + b"x\00" for s in charsets]
logging.info(
"Will search for %d strings in %d files", len(strings), len(files)
)
# Charsets listed in STICKY_MODULES are initialized as used. Note that
# those strings should be listed in the gconv-modules file.
unknown_sticky_modules = set(STICKY_MODULES) - set(charsets)
if unknown_sticky_modules:
logging.warning(
"The following charsets were explicitly requested in "
"STICKY_MODULES even though they don't exist: %s",
", ".join(unknown_sticky_modules),
)
global_used = [charset in STICKY_MODULES for charset in charsets]
for filename in files:
used_filenames = MultipleStringMatch(
strings, osutils.ReadFile(filename, mode="rb")
)
global_used = [
operator.or_(*x) for x in zip(global_used, used_filenames)
]
# Check the debug flag to avoid running a useless loop.
if opts.debug and any(used_filenames):
logging.debug("File %s:", filename)
for i, used in enumerate(used_filenames):
if used:
logging.debug(" - %s", strings[i])
used_charsets = [cs for cs, used in zip(charsets, global_used) if used]
gmods.Rewrite(used_charsets, opts.dryrun)
return 0
def ParseArgs(argv):
"""Return parsed commandline arguments."""
parser = commandline.ArgumentParser(description=__doc__, dryrun=True)
parser.add_argument(
"root",
type="path",
help="path to the directory where the rootfs is mounted.",
)
opts = parser.parse_args(argv)
opts.Freeze()
return opts
def main(argv):
"""Main function to start the script."""
opts = ParseArgs(argv)
logging.debug("Options are %s", opts)
return GconvStrip(opts)