/* tools/include/linux/ring_buffer.h - third_party/kernel - Git at Google */

 #ifndef _TOOLS_LINUX_RING_BUFFER_H_
 #define _TOOLS_LINUX_RING_BUFFER_H_

 #include <asm/barrier.h>
 #include <linux/perf_event.h>

 /*
  * Contract with kernel for walking the perf ring buffer from
  * user space requires the following barrier pairing (quote
  * from kernel/events/ring_buffer.c):
  *
  *   Since the mmap() consumer (userspace) can run on a
  *   different CPU:
  *
  *   kernel                             user
  *
  *   if (LOAD ->data_tail) {            LOAD ->data_head
  *                      (A)             smp_rmb()       (C)
  *      STORE $data                     LOAD $data
  *      smp_wmb()       (B)             smp_mb()        (D)
  *      STORE ->data_head               STORE ->data_tail
  *   }
  *
  *   Where A pairs with D, and B pairs with C.
  *
  *   In our case A is a control dependency that separates the
  *   load of the ->data_tail and the stores of $data. In case
  *   ->data_tail indicates there is no room in the buffer to
  *   store $data we do not.
  *
  *   D needs to be a full barrier since it separates the data
  *   READ from the tail WRITE.
  *
  *   For B a WMB is sufficient since it separates two WRITEs,
  *   and for C an RMB is sufficient since it separates two READs.
  *
  * Note, instead of B, C, D we could also use smp_store_release()
  * in B and D as well as smp_load_acquire() in C.
  *
  * However, this optimization does not make sense for all kernel
  * supported architectures since for a fair number it would
  * resolve into READ_ONCE() + smp_mb() pair for smp_load_acquire(),
  * and smp_mb() + WRITE_ONCE() pair for smp_store_release().
  *
  * Thus for those smp_wmb() in B and smp_rmb() in C would still
  * be less expensive. For D, smp_store_release() has either the
  * same cost as smp_mb() or is less expensive; for example, thanks
  * to TSO, x86 can avoid the CPU barrier entirely.
  */

 /*
  * Load ->data_head from the perf mmap page, ordered before any later
  * loads of the record data.
  *
  * This is the user-space "LOAD ->data_head; smp_rmb() (C)" step of the
  * barrier pairing described at the top of this file.
  *
  * @base: pointer to the struct perf_event_mmap_page whose ->data_head
  *        is read.
  *
  * Returns the value loaded from base->data_head.
  */
 static inline u64 ring_buffer_read_head(struct perf_event_mmap_page *base)
 {
 	u64 head;

 /*
  * Only use smp_load_acquire() on architectures where it does not
  * degrade to a READ_ONCE() + smp_mb() pair; everywhere else the
  * READ_ONCE() + smp_rmb() sequence is the cheaper choice.
  */
 #if defined(__x86_64__) || defined(__aarch64__) || defined(__powerpc64__) || \
     defined(__ia64__) || (defined(__sparc__) && defined(__arch64__))
 	head = smp_load_acquire(&base->data_head);
 #else
 	head = READ_ONCE(base->data_head);
 	/* Barrier C: keep the head load ahead of subsequent data loads. */
 	smp_rmb();
 #endif
 	return head;
 }

 /*
  * Store @tail into base->data_tail with release semantics.
  *
  * This is the user-space "smp_mb() (D); STORE ->data_tail" step of the
  * barrier pairing described at the top of this file, expressed as a
  * single smp_store_release() so that the preceding data reads are
  * ordered before the tail update.
  *
  * @base: pointer to the struct perf_event_mmap_page whose ->data_tail
  *        is written.
  * @tail: new value for ->data_tail.
  */
 static inline void ring_buffer_write_tail(struct perf_event_mmap_page *base,
 					  u64 tail)
 {
 	smp_store_release(&base->data_tail, tail);
 }

 #endif /* _TOOLS_LINUX_RING_BUFFER_H_ */
	#ifndef _TOOLS_LINUX_RING_BUFFER_H_
	#define _TOOLS_LINUX_RING_BUFFER_H_

	#include <asm/barrier.h>
	#include <linux/perf_event.h>

	/*
	* Contract with kernel for walking the perf ring buffer from
	* user space requires the following barrier pairing (quote
	* from kernel/events/ring_buffer.c):
	*
	* Since the mmap() consumer (userspace) can run on a
	* different CPU:
	*
	*   kernel                             user
	*
	*   if (LOAD ->data_tail) {            LOAD ->data_head
	*                      (A)             smp_rmb()       (C)
	*      STORE $data                     LOAD $data
	*      smp_wmb()       (B)             smp_mb()        (D)
	*      STORE ->data_head               STORE ->data_tail
	*   }
	*
	* Where A pairs with D, and B pairs with C.
	*
	* In our case A is a control dependency that separates the
	* load of the ->data_tail and the stores of $data. In case
	* ->data_tail indicates there is no room in the buffer to
	* store $data we do not.
	*
	* D needs to be a full barrier since it separates the data
	* READ from the tail WRITE.
	*
	* For B a WMB is sufficient since it separates two WRITEs,
	* and for C an RMB is sufficient since it separates two READs.
	*
	* Note, instead of B, C, D we could also use smp_store_release()
	* in B and D as well as smp_load_acquire() in C.
	*
	* However, this optimization does not make sense for all kernel
	* supported architectures since for a fair number it would
	* resolve into READ_ONCE() + smp_mb() pair for smp_load_acquire(),
	* and smp_mb() + WRITE_ONCE() pair for smp_store_release().
	*
	* Thus for those smp_wmb() in B and smp_rmb() in C would still
	* be less expensive. For D, smp_store_release() has either the
	* same cost as smp_mb() or is less expensive; for example, thanks
	* to TSO, x86 can avoid the CPU barrier entirely.
	*/

	/*
	 * Load ->data_head from the perf mmap page, ordered before any later
	 * loads of the record data.
	 *
	 * This is the user-space "LOAD ->data_head; smp_rmb() (C)" step of the
	 * barrier pairing described in the comment above.
	 *
	 * @base: pointer to the struct perf_event_mmap_page whose ->data_head
	 *        is read.
	 *
	 * Returns the value loaded from base->data_head.
	 */
	static inline u64 ring_buffer_read_head(struct perf_event_mmap_page *base)
	{
	/*
	 * Architectures where smp_load_acquire() does not fall back to a
	 * READ_ONCE() + smp_mb() pair. The sparc test is parenthesised to
	 * make the intended grouping (64-bit sparc only) explicit.
	 */
	#if defined(__x86_64__) || defined(__aarch64__) || defined(__powerpc64__) || \
	    defined(__ia64__) || (defined(__sparc__) && defined(__arch64__))
		return smp_load_acquire(&base->data_head);
	#else
		u64 head = READ_ONCE(base->data_head);

		/* Barrier C: keep the head load ahead of subsequent data loads. */
		smp_rmb();
		return head;
	#endif
	}

	/*
	 * Store @tail into base->data_tail with release semantics.
	 *
	 * This is the user-space "smp_mb() (D); STORE ->data_tail" step of the
	 * barrier pairing described in the comment above, expressed as a
	 * single smp_store_release().
	 *
	 * @base: pointer to the struct perf_event_mmap_page whose ->data_tail
	 *        is written.
	 * @tail: new value for ->data_tail.
	 */
	static inline void ring_buffer_write_tail(struct perf_event_mmap_page *base,
						  u64 tail)
	{
		smp_store_release(&base->data_tail, tail);
	}

	#endif /* _TOOLS_LINUX_RING_BUFFER_H_ */