scripts/dep_tracker.py - mirrors/cros/chromiumos/chromite - Git at Google

 # -*- coding: utf-8 -*-
 # Copyright 2014 The Chromium OS Authors. All rights reserved.
 # Use of this source code is governed by a BSD-style license that can be
 # found in the LICENSE file.

 """Script to discover dependencies and other file information from a build.

 Some files in the image are installed to provide some functionality, such as
 chrome, shill or bluetoothd provide different functionality that can be
 present or not on a given build. Many other files are dependencies from these
 files that need to be present in the image for them to work. These dependencies
 come from needed shared libraries, executed files and other configuration files
 read.

 This script currently discovers dependencies between ELF files for libraries
 required at load time (libraries loaded by the dynamic linker) but not
 libraries loaded at runtime with dlopen(). It also computes size and file type
 in several cases to help understand the contents of the built image.
 """

 from __future__ import print_function

 import itertools
 import json
 import multiprocessing
 import os
 import stat

 from chromite.lib import commandline
 from chromite.lib import cros_logging as logging
 from chromite.lib import filetype
 from chromite.lib import parseelf
 from chromite.lib import portage_util
 from chromite.scripts import lddtree


 # Regex to parse Gentoo atoms. This should match the following ebuild names,
 # splitting the package name from the version.
 # without version:
 #   chromeos-base/tty
 #   chromeos-base/libchrome-271506
 #   sys-kernel/chromeos-kernel-3_8
 # with version:
 #   chromeos-base/tty-0.0.1-r4
 #   chromeos-base/libchrome-271506-r5
 #   sys-kernel/chromeos-kernel-3_8-3.8.11-r35
 RE_EBUILD_WITHOUT_VERSION = r'^([a-z0-9\-]+/[a-zA-Z0-9\_\+\-]+)$'
 RE_EBUILD_WITH_VERSION = (
     r'^=?([a-z0-9\-]+/[a-zA-Z0-9\_\+\-]+)\-([^\-]+(\-r\d+)?)$')


 def ParseELFWithArgs(args):
   """Wrapper to parseelf.ParseELF accepting a single arg.

   This wrapper is required to use multiprocessing.Pool.map function.

   Returns:
     A 2-tuple with the passed relative path and the result of ParseELF(). On
     error, when ParseELF() returns None, this function returns None.
   """
   elf = parseelf.ParseELF(*args)
   if elf is None:
     return
   return args[1], elf


 class DepTracker(object):
   """Tracks dependencies and file information in a root directory.

   This class computes dependencies and other information related to the files
   in the root image.
   """

   def __init__(self, root, jobs=1):
     root_st = os.lstat(root)
     if not stat.S_ISDIR(root_st.st_mode):
       raise Exception('root (%s) must be a directory' % root)
     self._root = root.rstrip('/') + '/'
     self._file_type_decoder = filetype.FileTypeDecoder(root)

     # A wrapper to the multiprocess map function. We avoid launching a pool
     # of processes when jobs is 1 so python exceptions kill the main process,
     # useful for debugging.
     if jobs > 1:
       self._pool = multiprocessing.Pool(jobs)
       self._imap = self._pool.map
     else:
       self._imap = itertools.imap

     self._files = {}
     self._ebuilds = {}

     # Mapping of rel_paths for symlinks and hardlinks. Hardlinks are assumed
     # to point to the lowest lexicographically file with the same inode.
     self._symlinks = {}
     self._hardlinks = {}

   def Init(self):
     """Generates the initial list of files."""
     # First iteration over all the files in root searching for symlinks and
     # non-regular files.
     seen_inodes = {}
     for basepath, _, filenames in sorted(os.walk(self._root)):
       for filename in sorted(filenames):
         full_path = os.path.join(basepath, filename)
         rel_path = full_path[len(self._root):]
         st = os.lstat(full_path)

         file_data = {
             'size': st.st_size,
         }
         self._files[rel_path] = file_data

         # Track symlinks.
         if stat.S_ISLNK(st.st_mode):
           link_path = os.readlink(full_path)
           # lddtree's normpath handles a little more cases than the os.path
           # version. In particular, it handles the '//' case.
           self._symlinks[rel_path] = (
               link_path.lstrip('/') if link_path and link_path[0] == '/' else
               lddtree.normpath(os.path.join(os.path.dirname(rel_path),
                                             link_path)))
           file_data['deps'] = {
               'symlink': [self._symlinks[rel_path]]
           }

         # Track hardlinks.
         if st.st_ino in seen_inodes:
           self._hardlinks[rel_path] = seen_inodes[st.st_ino]
           continue
         seen_inodes[st.st_ino] = rel_path

   def SaveJSON(self, filename):
     """Save the computed information to a JSON file.

     Args:
       filename: The destination JSON file.
     """
     data = {
         'files': self._files,
         'ebuilds': self._ebuilds,
     }
     json.dump(data, open(filename, 'w'))

   def ComputeEbuildDeps(self, sysroot):
     """Compute the dependencies between ebuilds and files.

     Iterates over the list of ebuilds in the database and annotates the files
     with the ebuilds they are in. For each ebuild installing a file in the root,
     also compute the direct dependencies. Stores the information internally.

     Args:
       sysroot: The path to the sysroot, for example "/build/link".
     """
     portage_db = portage_util.PortageDB(sysroot)
     if not os.path.exists(portage_db.db_path):
       logging.warning('PortageDB directory not found: %s', portage_db.db_path)
       return

     for pkg in portage_db.InstalledPackages():
       pkg_files = []
       pkg_size = 0
       cpf = '%s/%s' % (pkg.category, pkg.pf)
       for typ, rel_path in pkg.ListContents():
         # We ignore other entries like for example "dir".
         if not typ in (pkg.OBJ, pkg.SYM):
           continue
         # We ignore files installed in the SYSROOT that weren't copied to the
         # image.
         if not rel_path in self._files:
           continue
         pkg_files.append(rel_path)
         file_data = self._files[rel_path]
         if 'ebuild' in file_data:
           logging.warning('Duplicated entry for %s: %s and %s',
                           rel_path, file_data['ebuild'], cpf)
         file_data['ebuild'] = cpf
         pkg_size += file_data['size']
       # Ignore packages that don't install any file.
       if not pkg_files:
         continue
       self._ebuilds[cpf] = {
           'size': pkg_size,
           'files': len(pkg_files),
           'atom': '%s/%s' % (pkg.category, pkg.package),
           'version': pkg.version,
       }
     # TODO(deymo): Parse dependencies between ebuilds.

   def ComputeELFFileDeps(self):
     """Computes the dependencies between files.

     Computes the dependencies between the files in the root directory passed
     during construction. The dependencies are inferred for ELF files.
     The list of dependencies for each file in the passed rootfs as a dict().
     The result's keys are the relative path of the files and the value of each
     file is a list of dependencies. A dependency is a tuple (dep_path,
     dep_type) where the dep_path is relative path from the passed root to the
     dependent file and dep_type is one the following strings stating how the
     dependency was discovered:
       'ldd': The dependent ELF file is listed as needed in the dynamic section.
       'symlink': The dependent file is a symlink to the depending.
     If there are dependencies of a given type whose target file wasn't
     determined, a tuple (None, dep_type) is included. This is the case for
     example is a program uses library that wasn't found.
     """
     ldpaths = lddtree.LoadLdpaths(self._root)

     # First iteration over all the files in root searching for symlinks and
     # non-regular files.
     parseelf_args = []
     for rel_path, file_data in self._files.iteritems():
       if rel_path in self._symlinks or rel_path in self._hardlinks:
         continue

       full_path = os.path.join(self._root, rel_path)
       st = os.lstat(full_path)
       if not stat.S_ISREG(st.st_mode):
         continue
       parseelf_args.append((self._root, rel_path, ldpaths))

     # Parallelize the ELF lookup step since it is quite expensive.
     elfs = dict(x for x in self._imap(ParseELFWithArgs, parseelf_args)
                 if not x is None)

     for rel_path, elf in elfs.iteritems():
       file_data = self._files[rel_path]
       # Fill in the ftype if not set yet. We complete this value at this point
       # to avoid re-parsing the ELF file later.
       if not 'ftype' in file_data:
         ftype = self._file_type_decoder.GetType(rel_path, elf=elf)
         if ftype:
           file_data['ftype'] = ftype

       file_deps = file_data.get('deps', {})
       # Dependencies based on the result of ldd.
       for lib in elf.get('needed', []):
         lib_path = elf['libs'][lib]['path']
         if not 'ldd' in file_deps:
           file_deps['ldd'] = []
         file_deps['ldd'].append(lib_path)

       if file_deps:
         file_data['deps'] = file_deps

   def ComputeFileTypes(self):
     """Computes all the missing file type for the files in the root."""
     for rel_path, file_data in self._files.iteritems():
       if 'ftype' in file_data:
         continue
       ftype = self._file_type_decoder.GetType(rel_path)
       if ftype:
         file_data['ftype'] = ftype


 def ParseArgs(argv):
   """Return parsed commandline arguments."""

   parser = commandline.ArgumentParser()
   parser.add_argument(
       '-j', '--jobs', type=int, default=multiprocessing.cpu_count(),
       help='number of simultaneous jobs.')
   parser.add_argument(
       '--sysroot', type='path', metavar='SYSROOT',
       help='parse portage DB for ebuild information from the provided sysroot.')
   parser.add_argument(
       '--json', type='path',
       help='store information in JSON file.')

   parser.add_argument(
       'root', type='path',
       help='path to the directory where the rootfs is mounted.')

   opts = parser.parse_args(argv)
   opts.Freeze()
   return opts


 def main(argv):
   """Main function to start the script."""
   opts = ParseArgs(argv)
   logging.debug('Options are %s', opts)

   dt = DepTracker(opts.root, jobs=opts.jobs)
   dt.Init()

   dt.ComputeELFFileDeps()
   dt.ComputeFileTypes()

   if opts.sysroot:
     dt.ComputeEbuildDeps(opts.sysroot)

   if opts.json:
     dt.SaveJSON(opts.json)
	# -- coding: utf-8 --
	# Copyright 2014 The Chromium OS Authors. All rights reserved.
	# Use of this source code is governed by a BSD-style license that can be
	# found in the LICENSE file.

	"""Script to discover dependencies and other file information from a build.

	Some files in the image are installed to provide some functionality, such as
	chrome, shill or bluetoothd provide different functionality that can be
	present or not on a given build. Many other files are dependencies from these
	files that need to be present in the image for them to work. These dependencies
	come from needed shared libraries, executed files and other configuration files
	read.

	This script currently discovers dependencies between ELF files for libraries
	required at load time (libraries loaded by the dynamic linker) but not
	libraries loaded at runtime with dlopen(). It also computes size and file type
	in several cases to help understand the contents of the built image.
	"""

	from __future__ import print_function

	import itertools
	import json
	import multiprocessing
	import os
	import stat

	from chromite.lib import commandline
	from chromite.lib import cros_logging as logging
	from chromite.lib import filetype
	from chromite.lib import parseelf
	from chromite.lib import portage_util
	from chromite.scripts import lddtree


	# Regex to parse Gentoo atoms. This should match the following ebuild names,
	# splitting the package name from the version.
	# without version:
	# chromeos-base/tty
	# chromeos-base/libchrome-271506
	# sys-kernel/chromeos-kernel-3_8
	# with version:
	# chromeos-base/tty-0.0.1-r4
	# chromeos-base/libchrome-271506-r5
	# sys-kernel/chromeos-kernel-3_8-3.8.11-r35
	RE_EBUILD_WITHOUT_VERSION = r'^([a-z0-9\-]+/[a-zA-Z0-9\_\+\-]+)$'
	RE_EBUILD_WITH_VERSION = (
	r'^=?([a-z0-9\-]+/[a-zA-Z0-9\_\+\-]+)\-([^\-]+(\-r\d+)?)$')


	def ParseELFWithArgs(args):
	"""Wrapper to parseelf.ParseELF accepting a single arg.

	This wrapper is required to use multiprocessing.Pool.map function.

	Returns:
	A 2-tuple with the passed relative path and the result of ParseELF(). On
	error, when ParseELF() returns None, this function returns None.
	"""
	elf = parseelf.ParseELF(*args)
	if elf is None:
	return
	return args[1], elf


	class DepTracker(object):
	"""Tracks dependencies and file information in a root directory.

	This class computes dependencies and other information related to the files
	in the root image.
	"""

	def __init__(self, root, jobs=1):
	root_st = os.lstat(root)
	if not stat.S_ISDIR(root_st.st_mode):
	raise Exception('root (%s) must be a directory' % root)
	self._root = root.rstrip('/') + '/'
	self._file_type_decoder = filetype.FileTypeDecoder(root)

	# A wrapper to the multiprocess map function. We avoid launching a pool
	# of processes when jobs is 1 so python exceptions kill the main process,
	# useful for debugging.
	if jobs > 1:
	self._pool = multiprocessing.Pool(jobs)
	self._imap = self._pool.map
	else:
	self._imap = itertools.imap

	self._files = {}
	self._ebuilds = {}

	# Mapping of rel_paths for symlinks and hardlinks. Hardlinks are assumed
	# to point to the lowest lexicographically file with the same inode.
	self._symlinks = {}
	self._hardlinks = {}

	def Init(self):
	"""Generates the initial list of files."""
	# First iteration over all the files in root searching for symlinks and
	# non-regular files.
	seen_inodes = {}
	for basepath, _, filenames in sorted(os.walk(self._root)):
	for filename in sorted(filenames):
	full_path = os.path.join(basepath, filename)
	rel_path = full_path[len(self._root):]
	st = os.lstat(full_path)

	file_data = {
	'size': st.st_size,
	}
	self._files[rel_path] = file_data

	# Track symlinks.
	if stat.S_ISLNK(st.st_mode):
	link_path = os.readlink(full_path)
	# lddtree's normpath handles a little more cases than the os.path
	# version. In particular, it handles the '//' case.
	self._symlinks[rel_path] = (
	link_path.lstrip('/') if link_path and link_path[0] == '/' else
	lddtree.normpath(os.path.join(os.path.dirname(rel_path),
	link_path)))
	file_data['deps'] = {
	'symlink': [self._symlinks[rel_path]]
	}

	# Track hardlinks.
	if st.st_ino in seen_inodes:
	self._hardlinks[rel_path] = seen_inodes[st.st_ino]
	continue
	seen_inodes[st.st_ino] = rel_path

	def SaveJSON(self, filename):
	"""Save the computed information to a JSON file.

	Args:
	filename: The destination JSON file.
	"""
	data = {
	'files': self._files,
	'ebuilds': self._ebuilds,
	}
	json.dump(data, open(filename, 'w'))

	def ComputeEbuildDeps(self, sysroot):
	"""Compute the dependencies between ebuilds and files.

	Iterates over the list of ebuilds in the database and annotates the files
	with the ebuilds they are in. For each ebuild installing a file in the root,
	also compute the direct dependencies. Stores the information internally.

	Args:
	sysroot: The path to the sysroot, for example "/build/link".
	"""
	portage_db = portage_util.PortageDB(sysroot)
	if not os.path.exists(portage_db.db_path):
	logging.warning('PortageDB directory not found: %s', portage_db.db_path)
	return

	for pkg in portage_db.InstalledPackages():
	pkg_files = []
	pkg_size = 0
	cpf = '%s/%s' % (pkg.category, pkg.pf)
	for typ, rel_path in pkg.ListContents():
	# We ignore other entries like for example "dir".
	if not typ in (pkg.OBJ, pkg.SYM):
	continue
	# We ignore files installed in the SYSROOT that weren't copied to the
	# image.
	if not rel_path in self._files:
	continue
	pkg_files.append(rel_path)
	file_data = self._files[rel_path]
	if 'ebuild' in file_data:
	logging.warning('Duplicated entry for %s: %s and %s',
	rel_path, file_data['ebuild'], cpf)
	file_data['ebuild'] = cpf
	pkg_size += file_data['size']
	# Ignore packages that don't install any file.
	if not pkg_files:
	continue
	self._ebuilds[cpf] = {
	'size': pkg_size,
	'files': len(pkg_files),
	'atom': '%s/%s' % (pkg.category, pkg.package),
	'version': pkg.version,
	}
	# TODO(deymo): Parse dependencies between ebuilds.

	def ComputeELFFileDeps(self):
	"""Computes the dependencies between files.

	Computes the dependencies between the files in the root directory passed
	during construction. The dependencies are inferred for ELF files.
	The list of dependencies for each file in the passed rootfs as a dict().
	The result's keys are the relative path of the files and the value of each
	file is a list of dependencies. A dependency is a tuple (dep_path,
	dep_type) where the dep_path is relative path from the passed root to the
	dependent file and dep_type is one the following strings stating how the
	dependency was discovered:
	'ldd': The dependent ELF file is listed as needed in the dynamic section.
	'symlink': The dependent file is a symlink to the depending.
	If there are dependencies of a given type whose target file wasn't
	determined, a tuple (None, dep_type) is included. This is the case for
	example is a program uses library that wasn't found.
	"""
	ldpaths = lddtree.LoadLdpaths(self._root)

	# First iteration over all the files in root searching for symlinks and
	# non-regular files.
	parseelf_args = []
	for rel_path, file_data in self._files.iteritems():
	if rel_path in self._symlinks or rel_path in self._hardlinks:
	continue

	full_path = os.path.join(self._root, rel_path)
	st = os.lstat(full_path)
	if not stat.S_ISREG(st.st_mode):
	continue
	parseelf_args.append((self._root, rel_path, ldpaths))

	# Parallelize the ELF lookup step since it is quite expensive.
	elfs = dict(x for x in self._imap(ParseELFWithArgs, parseelf_args)
	if not x is None)

	for rel_path, elf in elfs.iteritems():
	file_data = self._files[rel_path]
	# Fill in the ftype if not set yet. We complete this value at this point
	# to avoid re-parsing the ELF file later.
	if not 'ftype' in file_data:
	ftype = self._file_type_decoder.GetType(rel_path, elf=elf)
	if ftype:
	file_data['ftype'] = ftype

	file_deps = file_data.get('deps', {})
	# Dependencies based on the result of ldd.
	for lib in elf.get('needed', []):
	lib_path = elf['libs'][lib]['path']
	if not 'ldd' in file_deps:
	file_deps['ldd'] = []
	file_deps['ldd'].append(lib_path)

	if file_deps:
	file_data['deps'] = file_deps

	def ComputeFileTypes(self):
	"""Computes all the missing file type for the files in the root."""
	for rel_path, file_data in self._files.iteritems():
	if 'ftype' in file_data:
	continue
	ftype = self._file_type_decoder.GetType(rel_path)
	if ftype:
	file_data['ftype'] = ftype


	def ParseArgs(argv):
	"""Return parsed commandline arguments."""

	parser = commandline.ArgumentParser()
	parser.add_argument(
	'-j', '--jobs', type=int, default=multiprocessing.cpu_count(),
	help='number of simultaneous jobs.')
	parser.add_argument(
	'--sysroot', type='path', metavar='SYSROOT',
	help='parse portage DB for ebuild information from the provided sysroot.')
	parser.add_argument(
	'--json', type='path',
	help='store information in JSON file.')

	parser.add_argument(
	'root', type='path',
	help='path to the directory where the rootfs is mounted.')

	opts = parser.parse_args(argv)
	opts.Freeze()
	return opts


	def main(argv):
	"""Main function to start the script."""
	opts = ParseArgs(argv)
	logging.debug('Options are %s', opts)

	dt = DepTracker(opts.root, jobs=opts.jobs)
	dt.Init()

	dt.ComputeELFFileDeps()
	dt.ComputeFileTypes()

	if opts.sysroot:
	dt.ComputeEbuildDeps(opts.sysroot)

	if opts.json:
	dt.SaveJSON(opts.json)