lib/filetype.py - mirrors/cros/chromiumos/chromite - Git at Google

 # -*- coding: utf-8 -*-
 # Copyright 2014 The Chromium OS Authors. All rights reserved.
 # Use of this source code is governed by a BSD-style license that can be
 # found in the LICENSE file.

 """File type decoding class for Chromium OS rootfs file bucketing.

 This file decodes the type of file based on the contents, filename and other
 metadata. The result is a string that represents the file type and subtypes
 of the file, separated by slashes (/). The first level is one of the following:
 "text", "binary" and "inode". The first two refer to the contents of the file
 for regular files, while the third one is used for special files such as
 directories, symlinks, block devices, etc.

 The file type can have more than one level, for example "binary/elf/static",
 "binary/image/png", or "text/conf". See the filetype_unittest.py file for more
 examples.

 The purpose of this module is to provide a file type that splits the contents
 of a Chromium OS build into small buckets, separating cases that other
 standard classifications would keep in the same set.
 """

 from __future__ import print_function

 import itertools
 import mmap
 import os
 import re
 import stat
 import sys

 import six

 from chromite.lib import parseelf

 try:
   import pytest  # pylint: disable=import-error
   magic = pytest.importorskip('magic')
 except ImportError:
   import magic  # pylint: disable=import-error


 # Refuse to be imported on interpreters older than Python 3.6.
 assert sys.version_info >= (3, 6), 'This module requires Python 3.6+'


 # Chunk size, in bytes (32 KiB), used when scanning file contents.
 FILE_BUFFER_SIZE = 32 * 1024


 def SplitShebang(header):
   r"""Splits a shebang (#!) line into command and arguments.

   Args:
     header: The first line of a shebang file, as str or bytes, for example
         "#!/usr/bin/env -uPWD python foo.py\n". The referenced command must be
         an absolute path, optionally followed by some arguments.

   Returns:
     A tuple of strings (command, args) where the first string is the called
     command and the second is the rest of the line: the arguments as passed
     in the header, as a single string stripped of surrounding whitespace.

   Raises:
     ValueError: If the passed header is not a valid shebang line, or if the
         command or arguments are not valid UTF-8.
   """
   # We convert strings to bytes and then scan the bytes so we don't have to
   # worry about being given non-UTF-8 binary data.  If we're unable to decode
   # back into UTF-8, we reject the shebang.  There's no situation that we
   # care to support that would matter here.
   # This module only supports Python 3, where all text is str.
   if isinstance(header, str):
     header = header.encode('utf-8')

   m = re.match(br'#!\s*(/[^\s]+)\s*(.*)$', header)
   if not m:
     raise ValueError('shebang (#!) line expected')

   try:
     return m.group(1).decode('utf-8'), m.group(2).strip().decode('utf-8')
   except UnicodeDecodeError as e:
     # Keep the original decode error attached for easier debugging.
     raise ValueError('shebang (#!) line is not valid UTF-8') from e


 class FileTypeDecoder(object):
   """Class to help decode the type of a file.

   This class implements a single GetType() method that decodes the type of a
   file based on the contents and metadata. This class holds some global data
   (the root directory and the magic handle) shared between several calls to
   that method.
   """

   # Allowlist of mime types and their mapping to file type.
   MIME_TYPE_MAPPING = {
       'application/x-gzip': 'binary/compressed/gzip',
       'application/x-bzip2': 'binary/compressed/bzip2',
       'application/x-xz': 'binary/compressed/xz',

       # Goobuntu magic database returns 'gzip' instead of 'x-gzip'. This
       # supports running dep_tracker outside the chroot for development.
       'application/gzip': 'binary/compressed/gzip',
   }

   def __init__(self, root='/'):
     """Initializes the internal state.

     Args:
       root: Path to the root directory where all the files live. This will be
           assumed as the root directory for absolute symlinks.
     """
     self._root = root
     self._mime = magic.open(magic.MIME_TYPE)
     self._mime.load()

   def __del__(self):
     """Closes the magic handle, if one was ever opened."""
     # __init__ may have failed before the handle was created, so we can't
     # assume the attribute exists.
     mime = getattr(self, '_mime', None)
     if mime is not None:
       mime.close()

   def GetType(self, rel_path, st=None, elf=None):
     """Return the file type of the passed file.

     Does a best-effort attempt to infer the file type of the passed file. If
     only rel_path is provided, the stat_struct information and parsed ELF data
     will be computed. If the information is already available, such as if the
     ELF file is already parsed, passing st and elf will speed up the file
     detection.

     Args:
       rel_path: The path to the file, relative to the root directory passed
           to the constructor. Used to read the contents of the file.
       st: The stat_result struct of the file.
       elf: The result of parseelf.ParseELF().

     Returns:
       A string with the file type classified in categories separated by /. For
       example, a dynamic library will return 'binary/elf/dynamic-so'. If the
       file can't be read (IOError) it returns None.
     """
     full_path = os.path.join(self._root, rel_path)

     # Analysis based on inode data.
     if st is None:
       st = os.lstat(full_path)
     if stat.S_ISDIR(st.st_mode):
       return 'inode/directory'
     if stat.S_ISLNK(st.st_mode):
       return 'inode/symlink'
     if not stat.S_ISREG(st.st_mode):
       return 'inode/special'
     if st.st_size == 0:
       return 'inode/empty'

     # Analysis based on the ELF header and contents.
     if elf:
       return self._GetELFType(elf)

     # Analysis based on the file contents. The context managers make sure
     # both the file and the mapping are released even if the analysis raises.
     try:
       with open(full_path, 'rb') as fobj:
         with mmap.mmap(fobj.fileno(), 0, prot=mmap.PROT_READ) as fmap:
           return self._GetTypeFromContent(rel_path, fobj, fmap)
     except IOError:
       return None

   def _GetTypeFromContent(self, rel_path, fobj, fmap):
     """Return the file type based on the file contents.

     This helper function detects the file type based on the contents of the
     file, using the file name only as a hint for a few cases.

     Args:
       rel_path: The path to the file, used for the file name based checks and
           to parse the file when it turns out to be an ELF.
       fobj: a file object for random access to rel_path.
       fmap: a mmap object mapping the whole rel_path file for reading.

     Returns:
       A string with the file type, starting with either 'binary' or 'text'.
     """
     # Detect if the file is binary based on the presence of bytes outside of
     # the ASCII text range. We accept a few of the first 32 chars often used
     # in text files but we exclude the rest.
     ascii_chars = set(b'\a\b\t\n\v\f\r\x1b')
     ascii_chars.update(range(32, 128))
     is_binary = any(
         set(chunk) - ascii_chars
         for chunk in iter(lambda: fmap.read(FILE_BUFFER_SIZE), b''))

     # We use the first part of the file in several checks.
     fmap.seek(0)
     first_kib = fmap.read(1024)

     # Binary files.
     if is_binary:
       # The elf argument was not passed, so compute it now if the file is an
       # ELF.
       if first_kib.startswith(b'\x7fELF'):
         return self._GetELFType(parseelf.ParseELF(self._root, rel_path,
                                                   parse_symbols=False))

       if first_kib.startswith(b'MZ\x90\0'):
         return 'binary/dos-bin'

       if len(first_kib) >= 512 and first_kib[510:512] == b'\x55\xaa':
         return 'binary/bootsector/x86'

       # Firmware files depend on the technical details of the device they run
       # on, so there's no easy way to detect them. We use the filename to
       # guess that case.
       if '/firmware/' in rel_path and rel_path.endswith(
           ('.fw', '.bin', '.cis', '.csp', '.dsp')):
         return 'binary/firmware'

       # TZif (timezone) files. See tzfile(5) for details.
       if first_kib.startswith((b'TZif' + b'\0' * 16,
                                b'TZif2' + b'\0' * 15,
                                b'TZif3' + b'\0' * 15)):
         return 'binary/tzfile'

       # Allowlist some binary mime types.
       fobj.seek(0)
       # _mime.descriptor() will close the passed file descriptor.
       mime_type = self._mime.descriptor(os.dup(fobj.fileno()))
       if mime_type.startswith('image/'):
         return 'binary/' + mime_type
       if mime_type in self.MIME_TYPE_MAPPING:
         return self.MIME_TYPE_MAPPING[mime_type]

       # Other binary files.
       return 'binary'

     # Text files.
     # Read the first couple of lines used in the following checks. This will
     # only read the required lines, with the '\n' char at the end of each line
     # except on the last one if it is not present on that line. At this point
     # we know that the file is not empty, so at least one line exists.
     fmap.seek(0)
     first_lines = list(itertools.islice(iter(fmap.readline, b''), 0, 10))
     head_line = first_lines[0]

     # #! or "shebangs". Only those files with a single line are considered
     # shebangs. Some files start with "#!" but are other kind of files, such
     # as python or bash scripts.
     try:
       prog_name, args = SplitShebang(head_line)
     except ValueError:
       # Not a shebang line; keep going with the other text checks.
       pass
     else:
       if len(first_lines) == 1:
         return 'text/shebang'

       prog_name = os.path.basename(prog_name)
       args = args.split()
       if prog_name == 'env':
         # If "env" is called, we skip all the arguments passed to env (flags,
         # VAR=value) and treat the program name as the program to use.
         for i, arg in enumerate(args):
           if arg == '--' and (i + 1) < len(args):
             prog_name = args[i + 1]
             break
           if not arg or arg[0] == '-' or '=' in arg:
             continue
           prog_name = arg
           break

       # Strip the version number from common programs like "python2.7".
       prog_name = prog_name.rstrip('0123456789-.')

       if prog_name in ('awk', 'bash', 'dash', 'ksh', 'perl', 'python', 'sh'):
         return 'text/script/' + prog_name
       # Other unknown script.
       return 'text/script'

     # PEM files.
     stripped_head = head_line.strip()
     if stripped_head == b'-----BEGIN CERTIFICATE-----':
       return 'text/pem/cert'
     if stripped_head == b'-----BEGIN RSA PRIVATE KEY-----':
       return 'text/pem/rsa-private'

     # Linker script.
     if stripped_head == b'/* GNU ld script':
       return 'text/ld-script'

     # Protobuf files.
     if rel_path.endswith('.proto'):
       return 'text/proto'

     if len(first_lines) == 1:
       if re.match(br'[0-9\.]+$', head_line):
         return 'text/oneline/number'
       return 'text/oneline'

     return 'text'

   @staticmethod
   def _GetELFType(elf):
     """Returns the file type for ELF files.

     Args:
       elf: The result of parseelf.ParseELF().

     Returns:
       One of 'binary/elf/object', 'binary/elf/static', 'binary/elf/dynamic-so'
       or 'binary/elf/dynamic-bin'.
     """
     if elf['type'] == 'ET_REL':
       elf_type = 'object'
     elif ('.dynamic' not in elf['sections'] and
           'PT_DYNAMIC' not in elf['segments']):
       elf_type = 'static'
     elif elf['is_lib']:
       elf_type = 'dynamic-so'
     else:
       elf_type = 'dynamic-bin'
     return 'binary/elf/' + elf_type

   @classmethod
   def DecodeFile(cls, path):
     """Decodes the file type of the passed file.

     This function is a wrapper to the FileTypeDecoder class to decode the type
     of a single file. If you need to decode multiple files please use
     FileTypeDecoder class instead.

     Args:
       path: The path to the file or directory.

     Returns:
       A string with the decoded file type or None if it couldn't be decoded.
     """
     return cls('.').GetType(path)
	# -*- coding: utf-8 -*-
	# Copyright 2014 The Chromium OS Authors. All rights reserved.
	# Use of this source code is governed by a BSD-style license that can be
	# found in the LICENSE file.

	"""File type decoding class for Chromium OS rootfs file bucketing.

	This file decodes the type of file based on the contents, filename and other
	metadata. The result is a string that represents the file type and subtypes
	of the file, separated by slashes (/). The first level is one of the following:
	"text", "binary" and "inode". The first two refer to the contents of the file
	for regular files, while the third one is used for special files such as
	directories, symlinks, block devices, etc.

	The file type can have more than one level, for example "binary/elf/static",
	"binary/image/png", or "text/conf". See the filetype_unittest.py file for more
	examples.

	The purpose of this module is to provide a file type that splits the contents
	of a Chromium OS build into small buckets, separating cases that other
	standard classifications would keep in the same set.
	"""

	from __future__ import print_function

	import itertools
	import mmap
	import os
	import re
	import stat
	import sys

	import six

	from chromite.lib import parseelf

	try:
	import pytest # pylint: disable=import-error
	magic = pytest.importorskip('magic')
	except ImportError:
	import magic # pylint: disable=import-error


	# Refuse to be imported on interpreters older than Python 3.6.
	assert sys.version_info >= (3, 6), 'This module requires Python 3.6+'


	# Chunk size, in bytes (32 KiB), used when scanning file contents.
	FILE_BUFFER_SIZE = 32 * 1024


	def SplitShebang(header):
	  r"""Splits a shebang (#!) line into command and arguments.

	  Args:
	    header: The first line of a shebang file, as str or bytes, for example
	        "#!/usr/bin/env -uPWD python foo.py\n". The referenced command must be
	        an absolute path, optionally followed by some arguments.

	  Returns:
	    A tuple of strings (command, args) where the first string is the called
	    command and the second is the rest of the line: the arguments as passed
	    in the header, as a single string stripped of surrounding whitespace.

	  Raises:
	    ValueError: If the passed header is not a valid shebang line, or if the
	        command or arguments are not valid UTF-8.
	  """
	  # We convert strings to bytes and then scan the bytes so we don't have to
	  # worry about being given non-UTF-8 binary data. If we're unable to decode
	  # back into UTF-8, we reject the shebang. There's no situation that we
	  # care to support that would matter here.
	  # This module only supports Python 3, where all text is str.
	  if isinstance(header, str):
	    header = header.encode('utf-8')

	  # Whitespace after "#!" and between the command and its arguments is
	  # optional, hence the "*" quantifiers.
	  m = re.match(br'#!\s*(/[^\s]+)\s*(.*)$', header)
	  if not m:
	    raise ValueError('shebang (#!) line expected')

	  try:
	    return m.group(1).decode('utf-8'), m.group(2).strip().decode('utf-8')
	  except UnicodeDecodeError as e:
	    # Keep the original decode error attached for easier debugging.
	    raise ValueError('shebang (#!) line is not valid UTF-8') from e


	class FileTypeDecoder(object):
	  """Class to help decode the type of a file.

	  This class implements a single GetType() method that decodes the type of a
	  file based on the contents and metadata. This class holds some global data
	  (the root directory and the magic handle) shared between several calls to
	  that method.
	  """

	  # Allowlist of mime types and their mapping to file type.
	  MIME_TYPE_MAPPING = {
	      'application/x-gzip': 'binary/compressed/gzip',
	      'application/x-bzip2': 'binary/compressed/bzip2',
	      'application/x-xz': 'binary/compressed/xz',

	      # Goobuntu magic database returns 'gzip' instead of 'x-gzip'. This
	      # supports running dep_tracker outside the chroot for development.
	      'application/gzip': 'binary/compressed/gzip',
	  }

	  def __init__(self, root='/'):
	    """Initializes the internal state.

	    Args:
	      root: Path to the root directory where all the files live. This will be
	          assumed as the root directory for absolute symlinks.
	    """
	    self._root = root
	    self._mime = magic.open(magic.MIME_TYPE)
	    self._mime.load()

	  def __del__(self):
	    """Closes the magic handle, if one was ever opened."""
	    # __init__ may have failed before the handle was created, so we can't
	    # assume the attribute exists.
	    mime = getattr(self, '_mime', None)
	    if mime is not None:
	      mime.close()

	  def GetType(self, rel_path, st=None, elf=None):
	    """Return the file type of the passed file.

	    Does a best-effort attempt to infer the file type of the passed file. If
	    only rel_path is provided, the stat_struct information and parsed ELF data
	    will be computed. If the information is already available, such as if the
	    ELF file is already parsed, passing st and elf will speed up the file
	    detection.

	    Args:
	      rel_path: The path to the file, relative to the root directory passed
	          to the constructor. Used to read the contents of the file.
	      st: The stat_result struct of the file.
	      elf: The result of parseelf.ParseELF().

	    Returns:
	      A string with the file type classified in categories separated by /. For
	      example, a dynamic library will return 'binary/elf/dynamic-so'. If the
	      file can't be read (IOError) it returns None.
	    """
	    full_path = os.path.join(self._root, rel_path)

	    # Analysis based on inode data.
	    if st is None:
	      st = os.lstat(full_path)
	    if stat.S_ISDIR(st.st_mode):
	      return 'inode/directory'
	    if stat.S_ISLNK(st.st_mode):
	      return 'inode/symlink'
	    if not stat.S_ISREG(st.st_mode):
	      return 'inode/special'
	    if st.st_size == 0:
	      return 'inode/empty'

	    # Analysis based on the ELF header and contents.
	    if elf:
	      return self._GetELFType(elf)

	    # Analysis based on the file contents. The context managers make sure
	    # both the file and the mapping are released even if the analysis raises.
	    try:
	      with open(full_path, 'rb') as fobj:
	        with mmap.mmap(fobj.fileno(), 0, prot=mmap.PROT_READ) as fmap:
	          return self._GetTypeFromContent(rel_path, fobj, fmap)
	    except IOError:
	      return None

	  def _GetTypeFromContent(self, rel_path, fobj, fmap):
	    """Return the file type based on the file contents.

	    This helper function detects the file type based on the contents of the
	    file, using the file name only as a hint for a few cases.

	    Args:
	      rel_path: The path to the file, used for the file name based checks and
	          to parse the file when it turns out to be an ELF.
	      fobj: a file object for random access to rel_path.
	      fmap: a mmap object mapping the whole rel_path file for reading.

	    Returns:
	      A string with the file type, starting with either 'binary' or 'text'.
	    """
	    # Detect if the file is binary based on the presence of bytes outside of
	    # the ASCII text range. We accept a few of the first 32 chars often used
	    # in text files but we exclude the rest.
	    ascii_chars = set(b'\a\b\t\n\v\f\r\x1b')
	    ascii_chars.update(range(32, 128))
	    is_binary = any(
	        set(chunk) - ascii_chars
	        for chunk in iter(lambda: fmap.read(FILE_BUFFER_SIZE), b''))

	    # We use the first part of the file in several checks.
	    fmap.seek(0)
	    first_kib = fmap.read(1024)

	    # Binary files.
	    if is_binary:
	      # The elf argument was not passed, so compute it now if the file is an
	      # ELF.
	      if first_kib.startswith(b'\x7fELF'):
	        return self._GetELFType(parseelf.ParseELF(self._root, rel_path,
	                                                  parse_symbols=False))

	      if first_kib.startswith(b'MZ\x90\0'):
	        return 'binary/dos-bin'

	      if len(first_kib) >= 512 and first_kib[510:512] == b'\x55\xaa':
	        return 'binary/bootsector/x86'

	      # Firmware files depend on the technical details of the device they run
	      # on, so there's no easy way to detect them. We use the filename to
	      # guess that case.
	      if '/firmware/' in rel_path and rel_path.endswith(
	          ('.fw', '.bin', '.cis', '.csp', '.dsp')):
	        return 'binary/firmware'

	      # TZif (timezone) files. See tzfile(5) for details.
	      if first_kib.startswith((b'TZif' + b'\0' * 16,
	                               b'TZif2' + b'\0' * 15,
	                               b'TZif3' + b'\0' * 15)):
	        return 'binary/tzfile'

	      # Allowlist some binary mime types.
	      fobj.seek(0)
	      # _mime.descriptor() will close the passed file descriptor.
	      mime_type = self._mime.descriptor(os.dup(fobj.fileno()))
	      if mime_type.startswith('image/'):
	        return 'binary/' + mime_type
	      if mime_type in self.MIME_TYPE_MAPPING:
	        return self.MIME_TYPE_MAPPING[mime_type]

	      # Other binary files.
	      return 'binary'

	    # Text files.
	    # Read the first couple of lines used in the following checks. This will
	    # only read the required lines, with the '\n' char at the end of each line
	    # except on the last one if it is not present on that line. At this point
	    # we know that the file is not empty, so at least one line exists.
	    fmap.seek(0)
	    first_lines = list(itertools.islice(iter(fmap.readline, b''), 0, 10))
	    head_line = first_lines[0]

	    # #! or "shebangs". Only those files with a single line are considered
	    # shebangs. Some files start with "#!" but are other kind of files, such
	    # as python or bash scripts.
	    try:
	      prog_name, args = SplitShebang(head_line)
	    except ValueError:
	      # Not a shebang line; keep going with the other text checks.
	      pass
	    else:
	      if len(first_lines) == 1:
	        return 'text/shebang'

	      prog_name = os.path.basename(prog_name)
	      args = args.split()
	      if prog_name == 'env':
	        # If "env" is called, we skip all the arguments passed to env (flags,
	        # VAR=value) and treat the program name as the program to use.
	        for i, arg in enumerate(args):
	          if arg == '--' and (i + 1) < len(args):
	            prog_name = args[i + 1]
	            break
	          if not arg or arg[0] == '-' or '=' in arg:
	            continue
	          prog_name = arg
	          break

	      # Strip the version number from common programs like "python2.7".
	      prog_name = prog_name.rstrip('0123456789-.')

	      if prog_name in ('awk', 'bash', 'dash', 'ksh', 'perl', 'python', 'sh'):
	        return 'text/script/' + prog_name
	      # Other unknown script.
	      return 'text/script'

	    # PEM files.
	    stripped_head = head_line.strip()
	    if stripped_head == b'-----BEGIN CERTIFICATE-----':
	      return 'text/pem/cert'
	    if stripped_head == b'-----BEGIN RSA PRIVATE KEY-----':
	      return 'text/pem/rsa-private'

	    # Linker script.
	    if stripped_head == b'/* GNU ld script':
	      return 'text/ld-script'

	    # Protobuf files.
	    if rel_path.endswith('.proto'):
	      return 'text/proto'

	    if len(first_lines) == 1:
	      if re.match(br'[0-9\.]+$', head_line):
	        return 'text/oneline/number'
	      return 'text/oneline'

	    return 'text'

	  @staticmethod
	  def _GetELFType(elf):
	    """Returns the file type for ELF files.

	    Args:
	      elf: The result of parseelf.ParseELF().

	    Returns:
	      One of 'binary/elf/object', 'binary/elf/static', 'binary/elf/dynamic-so'
	      or 'binary/elf/dynamic-bin'.
	    """
	    if elf['type'] == 'ET_REL':
	      elf_type = 'object'
	    elif ('.dynamic' not in elf['sections'] and
	          'PT_DYNAMIC' not in elf['segments']):
	      elf_type = 'static'
	    elif elf['is_lib']:
	      elf_type = 'dynamic-so'
	    else:
	      elf_type = 'dynamic-bin'
	    return 'binary/elf/' + elf_type

	  @classmethod
	  def DecodeFile(cls, path):
	    """Decodes the file type of the passed file.

	    This function is a wrapper to the FileTypeDecoder class to decode the type
	    of a single file. If you need to decode multiple files please use
	    FileTypeDecoder class instead.

	    Args:
	      path: The path to the file or directory.

	    Returns:
	      A string with the decoded file type or None if it couldn't be decoded.
	    """
	    return cls('.').GetType(path)