| This patch is not necessary for the bug fix; it just makes concurrency |
| code review easier (it removes a data race and an overflow from a |
| broken code path). |
| |
| dlopen can OOM-crash anyway in _dl_resize_dtv, and it's probably |
| better to crash than to leave half-set-up modules around. |
| |
| 2016-11-30 Szabolcs Nagy <szabolcs.nagy@arm.com> |
| |
| * elf/dl-tls.c (_dl_add_to_slotinfo): Call oom () on allocation |
| failure instead of bumping the generation counter and signaling |
| an error. |
| diff --git a/elf/dl-tls.c b/elf/dl-tls.c |
| index 60f4c1d..2feee67 100644 |
| --- a/elf/dl-tls.c |
| +++ b/elf/dl-tls.c |
| @@ -959,18 +959,10 @@ _dl_add_to_slotinfo (struct link_map *l) |
| + TLS_SLOTINFO_SURPLUS * sizeof (struct dtv_slotinfo)); |
| if (listp == NULL) |
| { |
| - /* We ran out of memory. We will simply fail this |
| - call but don't undo anything we did so far. The |
| - application will crash or be terminated anyway very |
| - soon. */ |
| - |
| - /* We have to do this since some entries in the dtv |
| - slotinfo array might already point to this |
| - generation. */ |
| - ++GL(dl_tls_generation); |
| - |
| - _dl_signal_error (ENOMEM, "dlopen", NULL, N_("\ |
| -cannot create TLS data structures")); |
| + /* We ran out of memory in dlopen while updating tls structures. |
| + TODO: side-effects should be rolled back and the failure should |
| + be reported to the caller, but that's hard. */ |
| + oom (); |
| } |
| |
| listp->len = TLS_SLOTINFO_SURPLUS; |
| |
| |
| This fixes a subset of the issues described in |
| https://sourceware.org/ml/libc-alpha/2016-11/msg01026.html |
| without adding locks to pthread_create. |
| |
| Only races between dlopen and pthread_create were considered; the |
| asserts that tried to check for concurrency issues were removed. |
| |
| The patch is incomplete because dlclose, tls access and |
| dl_iterate_phdr related code paths are not modified. |
| |
| dlclose should be updated in a similar fashion to dlopen to make |
| the patch complete. Alternatively, pthread_create may take the |
| GL(dl_load_write_lock) to sync with dlclose, or the GL(dl_load_lock) |
| to sync with dlopen and dlclose (that would simplify the concurrency |
| design, but increase contention on those locks). |
| |
| 2016-11-30 Szabolcs Nagy <szabolcs.nagy@arm.com> |
| |
| [BZ #19329] |
| * elf/dl-open.c (dl_open_worker): Write GL(dl_tls_generation) |
| atomically. |
| * elf/dl-tls.c (_dl_allocate_tls_init): Read GL(dl_tls_generation), |
| GL(dl_tls_max_dtv_idx), slotinfo entries and listp->next atomically. |
| Remove assertions that cannot be guaranteed. |
| (_dl_add_to_slotinfo): Write the slotinfo entries and listp->next |
| atomically. |
| diff --git a/elf/dl-open.c b/elf/dl-open.c |
| index f5ca261..e6f6e0b 100644 |
| --- a/elf/dl-open.c |
| +++ b/elf/dl-open.c |
| @@ -524,9 +524,17 @@ dl_open_worker (void *a) |
| } |
| |
| /* Bump the generation number if necessary. */ |
| - if (any_tls && __builtin_expect (++GL(dl_tls_generation) == 0, 0)) |
| - _dl_fatal_printf (N_("\ |
| + if (any_tls) |
| + { |
| + /* This cannot be in a data-race so non-atomic load is valid too. */ |
| + size_t newgen = atomic_load_relaxed (&GL(dl_tls_generation)) + 1; |
| + /* Synchronize with _dl_allocate_tls_init (see notes there) and |
| + avoid storing an overflowed counter. */ |
| + if (__builtin_expect (newgen == 0, 0)) |
| + _dl_fatal_printf (N_("\ |
| TLS generation counter wrapped! Please report this.")); |
| + atomic_store_release (&GL(dl_tls_generation), newgen); |
| + } |
| |
| /* We need a second pass for static tls data, because _dl_update_slotinfo |
| must not be run while calls to _dl_add_to_slotinfo are still pending. */ |
| diff --git a/elf/dl-tls.c b/elf/dl-tls.c |
| index 2feee67..7dbee0d 100644 |
| --- a/elf/dl-tls.c |
| +++ b/elf/dl-tls.c |
| @@ -470,6 +470,36 @@ _dl_resize_dtv (dtv_t *dtv) |
| } |
| |
| |
| +/* |
| +CONCURRENCY NOTES |
| + |
| +dlopen (and dlclose) holds the GL(dl_load_lock) while writing shared state, |
| +which may be concurrently read by pthread_create and tls access without taking |
| +the lock, so atomic access should be used. The shared state: |
| + |
| + GL(dl_tls_max_dtv_idx) - max modid assigned, (modid can be reused). |
| + GL(dl_tls_generation) - generation count, incremented by dlopen and dlclose. |
| + GL(dl_tls_dtv_slotinfo_list) - list of entries, contains generation count |
| + and link_map for each module with a modid. |
| + |
| +A module gets a modid assigned if it has tls, a modid identifies a slotinfo |
| +entry and it is the index of the corresponding dtv slot. The generation count |
| +is assigned to slotinfo entries of a newly loaded or unloaded module and its |
| +newly loaded or unloaded dependencies. |
| + |
| +TODO: dlclose may free memory read by a concurrent pthread_create or tls |
| +access. This is broken now, so it is assumed that dlclose does not free |
| +link_map structures while pthread_create or __tls_get_addr is reading them. |
| + |
| +pthread_create calls _dl_allocate_tls_init (before creating the new thread), |
| +which should guarantee that the dtv is in a consistent state at the end: |
| + |
| +All slotinfo updates with generation <= dtv[0].counter are reflected in the |
| +dtv and arbitrary later module unloads may also be reflected as unallocated |
| +entries. (Note: a modid reuse implies a module unload and accessing tls in |
| +an unloaded module is undefined.) |
| +*/ |
| + |
| void * |
| internal_function |
| _dl_allocate_tls_init (void *result) |
| @@ -482,12 +512,24 @@ _dl_allocate_tls_init (void *result) |
| struct dtv_slotinfo_list *listp; |
| size_t total = 0; |
| size_t maxgen = 0; |
| + /* Synchronizes with the increments in dl_{open,close}_worker. |
| + Slotinfo updates of this generation are sequenced before the |
| + write we read from here. */ |
| + size_t gen_count = atomic_load_acquire (&GL(dl_tls_generation)); |
| + /* Either reads from the last write that is sequenced before the |
| + generation counter increment we synchronized with or a write |
| + made by a later dlopen/dlclose. dlclose may decrement this, |
| + but only if related modules are unloaded. So it is an upper |
| + bound on non-unloaded modids up to gen_count generation. */ |
| + size_t dtv_slots = atomic_load_relaxed (&GL(dl_tls_max_dtv_idx)); |
| |
| /* Check if the current dtv is big enough. */ |
| - if (dtv[-1].counter < GL(dl_tls_max_dtv_idx)) |
| + if (dtv[-1].counter < dtv_slots) |
| { |
| /* Resize the dtv. */ |
| dtv = _dl_resize_dtv (dtv); |
| + /* _dl_resize_dtv rereads GL(dl_tls_max_dtv_idx) which may decrease. */ |
| + dtv_slots = dtv[-1].counter; |
| |
| /* Install this new dtv in the thread data structures. */ |
| INSTALL_DTV (result, &dtv[-1]); |
| @@ -504,22 +546,33 @@ _dl_allocate_tls_init (void *result) |
| for (cnt = total == 0 ? 1 : 0; cnt < listp->len; ++cnt) |
| { |
| struct link_map *map; |
| + size_t gen; |
| void *dest; |
| |
| /* Check for the total number of used slots. */ |
| - if (total + cnt > GL(dl_tls_max_dtv_idx)) |
| + if (total + cnt > dtv_slots) |
| break; |
| |
| - map = listp->slotinfo[cnt].map; |
| + /* Synchronize with dl_add_to_slotinfo and remove_slotinfo. */ |
| + map = atomic_load_acquire (&listp->slotinfo[cnt].map); |
| if (map == NULL) |
| /* Unused entry. */ |
| continue; |
| |
| + /* Consistent generation count with the map read above. |
| + Inconsistent gen may be read if the entry is being reused, |
| + in which case it is larger than gen_count and we skip it. */ |
| + gen = atomic_load_relaxed (&listp->slotinfo[cnt].gen); |
| + if (gen > gen_count) |
| + /* New entry. */ |
| + continue; |
| + |
| /* Keep track of the maximum generation number. This might |
| not be the generation counter. */ |
| - assert (listp->slotinfo[cnt].gen <= GL(dl_tls_generation)); |
| - maxgen = MAX (maxgen, listp->slotinfo[cnt].gen); |
| + maxgen = MAX (maxgen, gen); |
| |
| + /* TODO: concurrent dlclose may free map which would break |
| + the rest of the code below. */ |
| dtv[map->l_tls_modid].pointer.val = TLS_DTV_UNALLOCATED; |
| dtv[map->l_tls_modid].pointer.to_free = NULL; |
| |
| @@ -549,11 +602,15 @@ _dl_allocate_tls_init (void *result) |
| } |
| |
| total += cnt; |
| - if (total >= GL(dl_tls_max_dtv_idx)) |
| + if (total > dtv_slots) |
| break; |
| |
| - listp = listp->next; |
| - assert (listp != NULL); |
| + /* Synchronize with dl_add_to_slotinfo. */ |
| + listp = atomic_load_acquire (&listp->next); |
| + /* dtv_slots is an upper bound on the number of entries we care |
| + about, the list may end sooner. */ |
| + if (listp == NULL) |
| + break; |
| } |
| |
| /* The DTV version is up-to-date now. */ |
| @@ -954,7 +1011,7 @@ _dl_add_to_slotinfo (struct link_map *l) |
| the first slot. */ |
| assert (idx == 0); |
| |
| - listp = prevp->next = (struct dtv_slotinfo_list *) |
| + listp = (struct dtv_slotinfo_list *) |
| malloc (sizeof (struct dtv_slotinfo_list) |
| + TLS_SLOTINFO_SURPLUS * sizeof (struct dtv_slotinfo)); |
| if (listp == NULL) |
| @@ -969,9 +1026,17 @@ _dl_add_to_slotinfo (struct link_map *l) |
| listp->next = NULL; |
| memset (listp->slotinfo, '\0', |
| TLS_SLOTINFO_SURPLUS * sizeof (struct dtv_slotinfo)); |
| + /* Add the new list item and synchronize with _dl_allocate_tls_init. */ |
| + atomic_store_release (&prevp->next, listp); |
| } |
| |
| /* Add the information into the slotinfo data structure. */ |
| - listp->slotinfo[idx].map = l; |
| - listp->slotinfo[idx].gen = GL(dl_tls_generation) + 1; |
| + |
| + /* This cannot be in a data-race so non-atomic load would be valid too. */ |
| + size_t newgen = atomic_load_relaxed (&GL(dl_tls_generation)) + 1; |
| + /* TODO: Concurrent readers may see an overflowed gen, which is bad, |
| + but overflow is guaranteed to crash the dlopen that is executing. */ |
| + atomic_store_relaxed (&listp->slotinfo[idx].gen, newgen); |
| + /* Synchronize with _dl_allocate_tls_init (see notes there). */ |
| + atomic_store_release (&listp->slotinfo[idx].map, l); |
| } |