merge-upstream/v5.4.210 from branch/tag: upstream/v5.4.210 into branch: main-R85-13310.B-cos-5.4
Changelog:
-------------------------------------------------------------
Alexey Kardashevskiy (1):
KVM: Don't null dereference ops->destroy
Chen-Yu Tsai (1):
media: v4l2-mem2mem: Apply DST_QUEUE_OFF_BASE on MMAP buffers across ioctls
Daniel Sneddon (1):
x86/speculation: Add RSB VM Exit protections
Greg Kroah-Hartman (1):
Linux 5.4.210
Jakub Sitnicki (1):
selftests/bpf: Extend verifier and bpf_sock tests for dst_port loads
Jean-Philippe Brucker (1):
selftests/bpf: Fix "dubious pointer arithmetic" test
John Fastabend (2):
bpf: Verifer, adjust_scalar_min_max_vals to always call update_reg_bounds()
bpf: Test_verifier, #70 error message updates for 32-bit right shift
Ning Qiang (1):
macintosh/adb: fix oob read in do_adb_query() function
Pawan Gupta (1):
x86/speculation: Add LFENCE to RSB fill sequence
Raghavendra Rao Ananta (1):
selftests: KVM: Handle compiler optimizations in ucall
Stanislav Fomichev (1):
selftests/bpf: Fix test_align verifier log patterns
Subbaraman Narayanamurthy (1):
thermal: Fix NULL pointer dereferences in of_thermal_ functions
Tony Luck (1):
ACPI: APEI: Better fix to avoid spamming the console with old error logs
Werner Sembach (2):
ACPI: video: Force backlight native for some TongFang devices
ACPI: video: Shortening quirk list by identifying Clevo by board_name only
BUG=b/242276560
TEST=tryjob, validation and K8s e2e
RELEASE_NOTE=Updated the Linux kernel to v5.4.210.
Change-Id: I7a938532130f60c6b2a5e329ce6fe282fbe99356
Signed-off-by: COS Kernel Merge Bot <cloud-image-merge-automation@prod.google.com>
diff --git a/Makefile b/Makefile
index 74abb7e..07c23b6 100644
--- a/Makefile
+++ b/Makefile
@@ -789,11 +789,20 @@
endif
endif
-# Initialize all stack variables with a pattern, if desired.
-ifdef CONFIG_INIT_STACK_ALL
+# Initialize all stack variables with a 0xAA pattern.
+ifdef CONFIG_INIT_STACK_ALL_PATTERN
KBUILD_CFLAGS += -ftrivial-auto-var-init=pattern
endif
+# Initialize all stack variables with a zero value.
+ifdef CONFIG_INIT_STACK_ALL_ZERO
+# Future support for zero initialization is still being debated, see
+# https://bugs.llvm.org/show_bug.cgi?id=45497. These flags are subject to being
+# renamed or dropped.
+KBUILD_CFLAGS += -ftrivial-auto-var-init=zero
+KBUILD_CFLAGS += -enable-trivial-auto-var-init-zero-knowing-it-will-be-removed-from-clang
+endif
+
DEBUG_CFLAGS := $(call cc-option, -fno-var-tracking-assignments)
ifdef CONFIG_DEBUG_INFO
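For reference, a minimal standalone sketch (not part of this patch; the file name and struct are made up) of what the auto-init flags change. Build it once plainly and once with clang -ftrivial-auto-var-init=zero -enable-trivial-auto-var-init-zero-knowing-it-will-be-removed-from-clang; without the flags the printed values are indeterminate stack garbage, with =zero they read as 0, and with =pattern as a repeating fill byte:

#include <stdio.h>

struct request {
        int fd;
        int flags;
        char comm[16];
};

int main(void)
{
        struct request r;       /* deliberately left uninitialized */

        /* Reading uninitialized locals is undefined behavior in plain C;
         * the point of the flags is to make this read deterministic. */
        printf("fd=%d flags=%d comm[0]=%d\n", r.fd, r.flags, r.comm[0]);
        return 0;
}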
diff --git a/PRESUBMIT.cfg b/PRESUBMIT.cfg
new file mode 100644
index 0000000..2fcead1
--- /dev/null
+++ b/PRESUBMIT.cfg
@@ -0,0 +1,14 @@
+[Hook Overrides]
+aosp_license_check: false
+cros_license_check: false
+long_line_check: false
+stray_whitespace_check: false
+tab_check: false
+tabbed_indent_required_check: false
+signoff_check: true
+
+# Make sure RELEASE_NOTE field is present.
+release_note_field_check: true
+
+# Make sure cos_patch trailer is present.
+cos_patch_trailer_check: true
diff --git a/arch/powerpc/perf/core-book3s.c b/arch/powerpc/perf/core-book3s.c
index 6f013e4..04d20dc 100644
--- a/arch/powerpc/perf/core-book3s.c
+++ b/arch/powerpc/perf/core-book3s.c
@@ -96,7 +96,7 @@
{
return 0;
}
-static inline void perf_get_data_addr(struct pt_regs *regs, u64 *addrp) { }
+static inline void perf_get_data_addr(struct perf_event *event, struct pt_regs *regs, u64 *addrp) { }
static inline u32 perf_get_misc_flags(struct pt_regs *regs)
{
return 0;
@@ -127,7 +127,7 @@
static inline void power_pmu_bhrb_enable(struct perf_event *event) {}
static inline void power_pmu_bhrb_disable(struct perf_event *event) {}
static void power_pmu_sched_task(struct perf_event_context *ctx, bool sched_in) {}
-static inline void power_pmu_bhrb_read(struct cpu_hw_events *cpuhw) {}
+static inline void power_pmu_bhrb_read(struct perf_event *event, struct cpu_hw_events *cpuhw) {}
static void pmao_restore_workaround(bool ebb) { }
#endif /* CONFIG_PPC32 */
@@ -182,7 +182,7 @@
* pointed to by SIAR; this is indicated by the [POWER6_]MMCRA_SDSYNC, the
* [POWER7P_]MMCRA_SDAR_VALID bit in MMCRA, or the SDAR_VALID bit in SIER.
*/
-static inline void perf_get_data_addr(struct pt_regs *regs, u64 *addrp)
+static inline void perf_get_data_addr(struct perf_event *event, struct pt_regs *regs, u64 *addrp)
{
unsigned long mmcra = regs->dsisr;
bool sdar_valid;
@@ -207,8 +207,7 @@
if (!(mmcra & MMCRA_SAMPLE_ENABLE) || sdar_valid)
*addrp = mfspr(SPRN_SDAR);
- if (perf_paranoid_kernel() && !capable(CAP_SYS_ADMIN) &&
- is_kernel_addr(mfspr(SPRN_SDAR)))
+ if (is_kernel_addr(mfspr(SPRN_SDAR)) && perf_allow_kernel(&event->attr) != 0)
*addrp = 0;
}
@@ -447,7 +446,7 @@
}
/* Processing BHRB entries */
-static void power_pmu_bhrb_read(struct cpu_hw_events *cpuhw)
+static void power_pmu_bhrb_read(struct perf_event *event, struct cpu_hw_events *cpuhw)
{
u64 val;
u64 addr;
@@ -475,8 +474,7 @@
* exporting it to userspace (avoid exposure of regions
* where we could have speculative execution)
*/
- if (perf_paranoid_kernel() && !capable(CAP_SYS_ADMIN) &&
- is_kernel_addr(addr))
+ if (is_kernel_addr(addr) && perf_allow_kernel(&event->attr) != 0)
continue;
/* Branches are read most recent first (ie. mfbhrb 0 is
@@ -2120,12 +2118,12 @@
if (event->attr.sample_type &
(PERF_SAMPLE_ADDR | PERF_SAMPLE_PHYS_ADDR))
- perf_get_data_addr(regs, &data.addr);
+ perf_get_data_addr(event, regs, &data.addr);
if (event->attr.sample_type & PERF_SAMPLE_BRANCH_STACK) {
struct cpu_hw_events *cpuhw;
cpuhw = this_cpu_ptr(&cpu_hw_events);
- power_pmu_bhrb_read(cpuhw);
+ power_pmu_bhrb_read(event, cpuhw);
data.br_stack = &cpuhw->bhrb_stack;
}
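For context: the hunks above replace open-coded perf_paranoid_kernel()/capable() checks with the perf_allow_kernel() helper. As a sketch, the v5.4-era helper in include/linux/perf_event.h reads roughly as follows (quoted from memory; the tree is authoritative):

static inline int perf_allow_kernel(struct perf_event_attr *attr)
{
        if (sysctl_perf_event_paranoid > 1 && !capable(CAP_SYS_ADMIN))
                return -EACCES;

        /* Beyond the sysctl, an LSM may also veto the event. */
        return security_perf_event_open(attr, PERF_SECURITY_KERNEL);
}

A nonzero return carries the precise errno, which is why the callers above test perf_allow_kernel(&event->attr) != 0 rather than a boolean paranoia predicate.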
diff --git a/arch/x86/events/intel/bts.c b/arch/x86/events/intel/bts.c
index 741540d..6a3b599 100644
--- a/arch/x86/events/intel/bts.c
+++ b/arch/x86/events/intel/bts.c
@@ -555,9 +555,11 @@
* Note that the default paranoia setting permits unprivileged
* users to profile the kernel.
*/
- if (event->attr.exclude_kernel && perf_paranoid_kernel() &&
- !capable(CAP_SYS_ADMIN))
- return -EACCES;
+ if (event->attr.exclude_kernel) {
+ ret = perf_allow_kernel(&event->attr);
+ if (ret)
+ return ret;
+ }
if (x86_add_exclusive(x86_lbr_exclusive_bts))
return -EBUSY;
diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c
index f297620..bba40f7 100644
--- a/arch/x86/events/intel/core.c
+++ b/arch/x86/events/intel/core.c
@@ -3320,8 +3320,9 @@
if (x86_pmu.version < 3)
return -EINVAL;
- if (perf_paranoid_cpu() && !capable(CAP_SYS_ADMIN))
- return -EACCES;
+ ret = perf_allow_cpu(&event->attr);
+ if (ret)
+ return ret;
event->hw.config |= ARCH_PERFMON_EVENTSEL_ANY;
diff --git a/arch/x86/events/intel/p4.c b/arch/x86/events/intel/p4.c
index dee579e..a4cc660 100644
--- a/arch/x86/events/intel/p4.c
+++ b/arch/x86/events/intel/p4.c
@@ -776,8 +776,9 @@
* the user needs special permissions to be able to use it
*/
if (p4_ht_active() && p4_event_bind_map[v].shared) {
- if (perf_paranoid_cpu() && !capable(CAP_SYS_ADMIN))
- return -EACCES;
+ v = perf_allow_cpu(&event->attr);
+ if (v)
+ return v;
}
/* ESCR EventMask bits may be invalid */
diff --git a/arch/x86/include/asm/mem_encrypt.h b/arch/x86/include/asm/mem_encrypt.h
index 848ce43..6c36956 100644
--- a/arch/x86/include/asm/mem_encrypt.h
+++ b/arch/x86/include/asm/mem_encrypt.h
@@ -36,6 +36,7 @@
void __init sme_unmap_bootdata(char *real_mode_data);
void __init sme_early_init(void);
+void __init sev_setup_arch(void);
void __init sme_encrypt_kernel(struct boot_params *bp);
void __init sme_enable(struct boot_params *bp);
@@ -65,6 +66,7 @@
static inline void __init sme_unmap_bootdata(char *real_mode_data) { }
static inline void __init sme_early_init(void) { }
+static inline void __init sev_setup_arch(void) { }
static inline void __init sme_encrypt_kernel(struct boot_params *bp) { }
static inline void __init sme_enable(struct boot_params *bp) { }
diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index 366875a..66be1cc 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -1120,6 +1120,12 @@
memblock_set_current_limit(ISA_END_ADDRESS);
e820__memblock_setup();
+ /*
+ * Needs to run after memblock setup because it needs the physical
+ * memory size.
+ */
+ sev_setup_arch();
+
reserve_bios_regions();
if (efi_enabled(EFI_MEMMAP)) {
diff --git a/arch/x86/mm/mem_encrypt.c b/arch/x86/mm/mem_encrypt.c
index 7b55893..0007a51 100644
--- a/arch/x86/mm/mem_encrypt.c
+++ b/arch/x86/mm/mem_encrypt.c
@@ -196,6 +196,37 @@
swiotlb_force = SWIOTLB_FORCE;
}
+void __init sev_setup_arch(void)
+{
+ phys_addr_t total_mem = memblock_phys_mem_size();
+ unsigned long size;
+
+ if (!sev_active())
+ return;
+
+ /*
+ * For SEV, all DMA has to occur via shared/unencrypted pages.
+ * SEV uses SWIOTLB to make this happen without changing device
+ * drivers. However, depending on the workload being run, the
+ * default 64MB of SWIOTLB may not be enough and SWIOTLB may
+ * run out of buffers for DMA, resulting in I/O errors and/or
+ * performance degradation especially with high I/O workloads.
+ *
+ * Adjust the default size of SWIOTLB for SEV guests using
+ * a percentage of guest memory for SWIOTLB buffers.
+ * Also, as the SWIOTLB bounce buffer memory is allocated
+ * from low memory, ensure that the adjusted size is within
+ * the limits of low available memory.
+ *
+ * The percentage of guest memory used here for SWIOTLB buffers
+ * is an approximation of the static adjustment used previously:
+ * 64MB for <1G, and ~128M to 256M for 1G-to-4G, i.e., roughly 6%.
+ */
+ size = total_mem * 6 / 100;
+ size = clamp_val(size, IO_TLB_DEFAULT_SIZE, SZ_1G);
+ swiotlb_adjust_size(size);
+}
+
static void __init __set_clr_pte_enc(pte_t *kpte, int level, bool enc)
{
pgprot_t old_prot, new_prot;
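To make the 6%-with-clamp sizing concrete, a standalone userspace sketch of the same arithmetic (guest sizes are made-up examples; assumes 64-bit unsigned long):

#include <stdio.h>

#define IO_TLB_DEFAULT_SIZE     (64UL << 20)    /* 64 MiB */
#define SZ_1G                   (1UL << 30)

static unsigned long clamp_val(unsigned long v, unsigned long lo,
                               unsigned long hi)
{
        return v < lo ? lo : v > hi ? hi : v;
}

int main(void)
{
        unsigned long guests[] = { 512UL << 20, 4UL << 30, 64UL << 30 };

        for (int i = 0; i < 3; i++) {
                unsigned long size = guests[i] * 6 / 100;

                size = clamp_val(size, IO_TLB_DEFAULT_SIZE, SZ_1G);
                /* 512M guest -> 64M (clamped up), 4G -> ~245M, 64G -> 1G (capped) */
                printf("%5lu MiB guest -> %4lu MiB SWIOTLB\n",
                       guests[i] >> 20, size >> 20);
        }
        return 0;
}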
diff --git a/drivers/base/Kconfig b/drivers/base/Kconfig
index 28b92e3..796d660 100644
--- a/drivers/base/Kconfig
+++ b/drivers/base/Kconfig
@@ -59,6 +59,15 @@
rescue mode with init=/bin/sh, even when the /dev directory
on the rootfs is completely empty.
+config DEVTMPFS_SAFE
+ bool "Automount devtmpfs with nosuid/noexec"
+ depends on DEVTMPFS_MOUNT
+ default y
+ help
+ This instructs the kernel to automount devtmpfs with the
+ MS_NOEXEC and MS_NOSUID mount flags, which can prevent
+ certain kinds of code-execution attack on embedded platforms.
+
config STANDALONE
bool "Select only drivers that don't need compile-time external firmware"
default y
diff --git a/drivers/base/devtmpfs.c b/drivers/base/devtmpfs.c
index 5e9b007..dbe0830 100644
--- a/drivers/base/devtmpfs.c
+++ b/drivers/base/devtmpfs.c
@@ -370,6 +370,7 @@
int devtmpfs_mount(const char *mntdir)
{
int err;
+ int mflags = MS_SILENT;
if (!mount_dev)
return 0;
@@ -377,7 +378,10 @@
if (!thread)
return 0;
- err = ksys_mount("devtmpfs", mntdir, "devtmpfs", MS_SILENT, NULL);
+#ifdef CONFIG_DEVTMPFS_SAFE
+ mflags |= MS_NOEXEC | MS_NOSUID;
+#endif
+ err = ksys_mount("devtmpfs", mntdir, "devtmpfs", mflags, NULL);
if (err)
printk(KERN_INFO "devtmpfs: error mounting %i\n", err);
else
diff --git a/drivers/md/dm-init.c b/drivers/md/dm-init.c
index b869316..7ef3cec 100644
--- a/drivers/md/dm-init.c
+++ b/drivers/md/dm-init.c
@@ -301,3 +301,254 @@
module_param(create, charp, 0);
MODULE_PARM_DESC(create, "Create a mapped device in early boot");
+
+/* ---------------------------------------------------------------
+ * ChromeOS shim - convert dm= format to dm-mod.create= format
+ * ---------------------------------------------------------------
+ */
+
+struct dm_chrome_target {
+ char *field[4];
+};
+
+struct dm_chrome_dev {
+ char *name, *uuid, *mode;
+ unsigned int num_targets;
+ struct dm_chrome_target targets[DM_MAX_TARGETS];
+};
+
+static char __init *dm_chrome_parse_target(char *str, struct dm_chrome_target *tgt)
+{
+ unsigned int i;
+
+ tgt->field[0] = str;
+ /* Delimit first 3 fields that are separated by space */
+ for (i = 0; i < ARRAY_SIZE(tgt->field) - 1; i++) {
+ tgt->field[i + 1] = str_field_delimit(&tgt->field[i], ' ');
+ if (!tgt->field[i + 1])
+ return NULL;
+ }
+ /* Delimit last field that can be terminated by comma */
+ return str_field_delimit(&tgt->field[i], ',');
+}
+
+static char __init *dm_chrome_parse_dev(char *str, struct dm_chrome_dev *dev)
+{
+ char *target, *num;
+ unsigned int i;
+
+ if (!str)
+ return ERR_PTR(-EINVAL);
+
+ target = str_field_delimit(&str, ',');
+ if (!target)
+ return ERR_PTR(-EINVAL);
+
+ /* Delimit first 3 fields that are separated by space */
+ dev->name = str;
+ dev->uuid = str_field_delimit(&dev->name, ' ');
+ if (!dev->uuid)
+ return ERR_PTR(-EINVAL);
+
+ dev->mode = str_field_delimit(&dev->uuid, ' ');
+ if (!dev->mode)
+ return ERR_PTR(-EINVAL);
+
+ /* num is optional */
+ num = str_field_delimit(&dev->mode, ' ');
+ if (!num)
+ dev->num_targets = 1;
+ else {
+ /* Delimit num and check whether it is the last field */
+ if (str_field_delimit(&num, ' '))
+ return ERR_PTR(-EINVAL);
+ if (kstrtouint(num, 0, &dev->num_targets))
+ return ERR_PTR(-EINVAL);
+ }
+
+ if (dev->num_targets > DM_MAX_TARGETS) {
+ DMERR("too many targets %u > %d",
+ dev->num_targets, DM_MAX_TARGETS);
+ return ERR_PTR(-EINVAL);
+ }
+
+ for (i = 0; i < dev->num_targets - 1; i++) {
+ target = dm_chrome_parse_target(target, &dev->targets[i]);
+ if (!target)
+ return ERR_PTR(-EINVAL);
+ }
+ /* The last one can return NULL if it reaches the end of str */
+ return dm_chrome_parse_target(target, &dev->targets[i]);
+}
+
+static char __init *dm_chrome_convert(struct dm_chrome_dev *devs, unsigned int num_devs)
+{
+ char *str = kmalloc(DM_MAX_STR_SIZE, GFP_KERNEL);
+ char *p = str;
+ unsigned int i, j;
+ int ret;
+
+ if (!str)
+ return ERR_PTR(-ENOMEM);
+
+ for (i = 0; i < num_devs; i++) {
+ if (!strcmp(devs[i].uuid, "none"))
+ devs[i].uuid = "";
+ ret = snprintf(p, DM_MAX_STR_SIZE - (p - str),
+ "%s,%s,,%s",
+ devs[i].name,
+ devs[i].uuid,
+ devs[i].mode);
+ if (ret < 0)
+ goto out;
+ p += ret;
+
+ for (j = 0; j < devs[i].num_targets; j++) {
+ ret = snprintf(p, DM_MAX_STR_SIZE - (p - str),
+ ",%s %s %s %s",
+ devs[i].targets[j].field[0],
+ devs[i].targets[j].field[1],
+ devs[i].targets[j].field[2],
+ devs[i].targets[j].field[3]);
+ if (ret < 0)
+ goto out;
+ p += ret;
+ }
+ if (i < num_devs - 1) {
+ ret = snprintf(p, DM_MAX_STR_SIZE - (p - str), ";");
+ if (ret < 0)
+ goto out;
+ p += ret;
+ }
+ }
+
+ return str;
+
+out:
+ kfree(str);
+ return ERR_PTR(ret);
+}
+
+/**
+ * dm_chrome_shim - convert old dm= format used in chromeos to the new
+ * upstream format.
+ *
+ * ChromeOS old format
+ * -------------------
+ * <device> ::= [<num>] <device-mapper>+
+ * <device-mapper> ::= <head> "," <target>+
+ * <head> ::= <name> <uuid> <mode> [<num>]
+ * <target> ::= <start> <length> <type> <options> ","
+ * <mode> ::= "ro" | "rw"
+ * <uuid> ::= xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx | "none"
+ * <type> ::= "verity" | "bootcache" | ...
+ *
+ * Example:
+ * 2 vboot none ro 1,
+ * 0 1768000 bootcache
+ * device=aa55b119-2a47-8c45-946a-5ac57765011f+1
+ * signature=76e9be054b15884a9fa85973e9cb274c93afadb6
+ * cache_start=1768000 max_blocks=100000 size_limit=23 max_trace=20000,
+ * vroot none ro 1,
+ * 0 1740800 verity payload=254:0 hashtree=254:0 hashstart=1740800 alg=sha1
+ * root_hexdigest=76e9be054b15884a9fa85973e9cb274c93afadb6
+ * salt=5b3549d54d6c7a3837b9b81ed72e49463a64c03680c47835bef94d768e5646fe
+ *
+ * Notes:
+ * 1. uuid is a label for the device and we set it to "none".
+ * 2. The <num> field will be optional initially and assumed to be 1.
+ *    Once all the scripts that set this field have been updated, it
+ *    will be made mandatory.
+ */
+
+static char *chrome_create;
+
+static int __init dm_chrome_shim(char *arg)
+{
+ if (!arg || create)
+ return -EINVAL;
+ chrome_create = arg;
+ return 0;
+}
+
+static int __init dm_chrome_parse_devices(void)
+{
+ struct dm_chrome_dev *devs;
+ unsigned int num_devs, i;
+ char *next, *base_str;
+ int ret = 0;
+
+ /* Verify if dm-mod.create was not used */
+ if (!chrome_create || create)
+ return -EINVAL;
+
+ if (strlen(chrome_create) >= DM_MAX_STR_SIZE) {
+ DMERR("Argument is too big. Limit is %d\n", DM_MAX_STR_SIZE);
+ return -EINVAL;
+ }
+
+ base_str = kstrdup(chrome_create, GFP_KERNEL);
+ if (!base_str)
+ return -ENOMEM;
+
+ next = str_field_delimit(&base_str, ' ');
+ if (!next) {
+ ret = -EINVAL;
+ goto out_str;
+ }
+
+ /* if first field is not the optional <num> field */
+ if (kstrtouint(base_str, 0, &num_devs)) {
+ num_devs = 1;
+ /* rewind next pointer */
+ next = base_str;
+ }
+
+ if (num_devs > DM_MAX_DEVICES) {
+ DMERR("too many devices %u > %d", num_devs, DM_MAX_DEVICES);
+ ret = -EINVAL;
+ goto out_str;
+ }
+
+ devs = kcalloc(num_devs, sizeof(*devs), GFP_KERNEL);
+ if (!devs) {
+ ret = -ENOMEM;
+ goto out_str;
+ }
+
+ /* restore string */
+ strcpy(base_str, chrome_create);
+
+ /* parse devices */
+ for (i = 0; i < num_devs; i++) {
+ next = dm_chrome_parse_dev(next, &devs[i]);
+ if (IS_ERR(next)) {
+ DMERR("couldn't parse device");
+ ret = PTR_ERR(next);
+ goto out_devs;
+ }
+ }
+
+ create = dm_chrome_convert(devs, num_devs);
+ if (IS_ERR(create)) {
+ ret = PTR_ERR(create);
+ goto out_devs;
+ }
+
+ DMDEBUG("Converting:\n\tdm=\"%s\"\n\tdm-mod.create=\"%s\"\n",
+ chrome_create, create);
+
+ /* Call upstream code */
+ dm_init_init();
+
+ kfree(create);
+
+out_devs:
+ create = NULL;
+ kfree(devs);
+out_str:
+ kfree(base_str);
+
+ return ret;
+}
+
+late_initcall(dm_chrome_parse_devices);
+
+__setup("dm=", dm_chrome_shim);
diff --git a/drivers/md/dm-verity-target.c b/drivers/md/dm-verity-target.c
index 9dcdf34b..727363b 100644
--- a/drivers/md/dm-verity-target.c
+++ b/drivers/md/dm-verity-target.c
@@ -16,8 +16,10 @@
#include "dm-verity.h"
#include "dm-verity-fec.h"
#include "dm-verity-verify-sig.h"
+#include <linux/delay.h>
#include <linux/module.h>
#include <linux/reboot.h>
+#include <crypto/hash.h>
#define DM_MSG_PREFIX "verity"
@@ -32,8 +34,9 @@
#define DM_VERITY_OPT_RESTART "restart_on_corruption"
#define DM_VERITY_OPT_IGN_ZEROES "ignore_zero_blocks"
#define DM_VERITY_OPT_AT_MOST_ONCE "check_at_most_once"
+#define DM_VERITY_OPT_ERROR_BEHAVIOR "error_behavior"
-#define DM_VERITY_OPTS_MAX (3 + DM_VERITY_OPTS_FEC + \
+#define DM_VERITY_OPTS_MAX (4 + DM_VERITY_OPTS_FEC + \
DM_VERITY_ROOT_HASH_VERIFICATION_OPTS)
static unsigned dm_verity_prefetch_cluster = DM_VERITY_DEFAULT_PREFETCH_SIZE;
@@ -47,6 +50,120 @@
unsigned n_blocks;
};
+/* Provide a lightweight means of specifying the global default for
+ * error behavior: eio, panic, none, or notify.
+ * Legacy support for 0 = eio, 1 = reboot/panic, 2 = none, 3 = notify.
+ * This is matched to the enum in dm-verity.h.
+ */
+static char *error_behavior_istring[] = { "0", "1", "2", "3" };
+static const char *allowed_error_behaviors[] = { "eio", "panic", "none",
+ "notify", NULL };
+static char *error_behavior = "eio";
+module_param(error_behavior, charp, 0644);
+MODULE_PARM_DESC(error_behavior, "Behavior on error "
+ "(eio, panic, none, notify)");
+
+/* Controls whether verity_get_device will wait forever for a device. */
+static int dev_wait;
+module_param(dev_wait, int, 0444);
+MODULE_PARM_DESC(dev_wait, "Wait forever for a backing device");
+
+static BLOCKING_NOTIFIER_HEAD(verity_error_notifier);
+
+int dm_verity_register_error_notifier(struct notifier_block *nb)
+{
+ return blocking_notifier_chain_register(&verity_error_notifier, nb);
+}
+EXPORT_SYMBOL_GPL(dm_verity_register_error_notifier);
+
+int dm_verity_unregister_error_notifier(struct notifier_block *nb)
+{
+ return blocking_notifier_chain_unregister(&verity_error_notifier, nb);
+}
+EXPORT_SYMBOL_GPL(dm_verity_unregister_error_notifier);
+
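A hypothetical client of these exports (module, callback name, and policy are illustrative, not from this patch) would look roughly like:

static int example_verity_cb(struct notifier_block *nb,
                             unsigned long transient, void *arg)
{
        struct dm_verity_error_state *state = arg;

        /* Downgrade transient (non-hash) failures to -EIO; keep the
         * panic default for real corruption. */
        state->behavior = transient ? DM_VERITY_ERROR_BEHAVIOR_EIO
                                    : DM_VERITY_ERROR_BEHAVIOR_PANIC;
        /* Return NOTIFY_DONE (0) so verity_error() adopts state->behavior;
         * a nonzero chain result makes it fall back to panic. */
        return NOTIFY_DONE;
}

static struct notifier_block example_verity_nb = {
        .notifier_call = example_verity_cb,
};

/* In the client's init: dm_verity_register_error_notifier(&example_verity_nb); */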
+/* If the request is not successful, this handler takes action.
+ * TODO make this call a registered handler.
+ */
+static void verity_error(struct dm_verity *v, struct dm_verity_io *io,
+ blk_status_t status)
+{
+ const char *message = v->hash_failed ? "integrity" : "block";
+ int error_behavior = DM_VERITY_ERROR_BEHAVIOR_PANIC;
+ dev_t devt = 0;
+ u64 block = ~0;
+ struct dm_verity_error_state error_state;
+ /* If the hash did not fail, then this is likely transient. */
+ int transient = !v->hash_failed;
+
+ devt = v->data_dev->bdev->bd_dev;
+ error_behavior = v->error_behavior;
+
+ DMERR_LIMIT("verification failure occurred: %s failure", message);
+
+ if (error_behavior == DM_VERITY_ERROR_BEHAVIOR_NOTIFY) {
+ error_state.code = status;
+ error_state.transient = transient;
+ error_state.block = block;
+ error_state.message = message;
+ error_state.dev_start = v->data_start;
+ error_state.dev_len = v->data_blocks;
+ error_state.dev = v->data_dev->bdev;
+ error_state.hash_dev_start = v->hash_start;
+ error_state.hash_dev_len = v->hash_blocks;
+ error_state.hash_dev = v->hash_dev->bdev;
+
+ /* Set default fallthrough behavior. */
+ error_state.behavior = DM_VERITY_ERROR_BEHAVIOR_PANIC;
+ error_behavior = DM_VERITY_ERROR_BEHAVIOR_PANIC;
+
+ if (!blocking_notifier_call_chain(
+ &verity_error_notifier, transient, &error_state)) {
+ error_behavior = error_state.behavior;
+ }
+ }
+
+ switch (error_behavior) {
+ case DM_VERITY_ERROR_BEHAVIOR_EIO:
+ break;
+ case DM_VERITY_ERROR_BEHAVIOR_NONE:
+ break;
+ default:
+ if (!transient)
+ goto do_panic;
+ }
+ return;
+
+do_panic:
+ panic("dm-verity failure: "
+ "device:%u:%u status:%d block:%llu message:%s",
+ MAJOR(devt), MINOR(devt), status, (u64)block, message);
+}
+
+/**
+ * verity_parse_error_behavior - parse a behavior charp to the enum
+ * @behavior: NUL-terminated char array
+ *
+ * Checks if the behavior is valid either as text or as an index digit
+ * and returns the proper enum value in string form or ERR_PTR(-EINVAL)
+ * on error.
+ */
+static char *verity_parse_error_behavior(const char *behavior)
+{
+ const char **allowed = allowed_error_behaviors;
+ int index;
+
+ for (index = 0; *allowed; allowed++, index++)
+ if (!strcmp(*allowed, behavior) || behavior[0] == index + '0')
+ break;
+
+ if (!*allowed)
+ return ERR_PTR(-EINVAL);
+
+ /* Convert to the integer index matching the enum. */
+ return error_behavior_istring[index];
+}
+
/*
* Auxiliary structure appended to each dm-bufio buffer. If the value
* hash_verified is nonzero, hash of the block has been verified.
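For reference, the lookup above maps names and legacy digits onto the same enum indices:

  "eio"    or "0" -> "0" (DM_VERITY_ERROR_BEHAVIOR_EIO)
  "panic"  or "1" -> "1" (DM_VERITY_ERROR_BEHAVIOR_PANIC)
  "none"   or "2" -> "2" (DM_VERITY_ERROR_BEHAVIOR_NONE)
  "notify" or "3" -> "3" (DM_VERITY_ERROR_BEHAVIOR_NOTIFY)
  anything else   -> ERR_PTR(-EINVAL)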
@@ -550,6 +667,8 @@
struct dm_verity *v = io->v;
struct bio *bio = dm_bio_from_per_bio_data(io, v->ti->per_io_data_size);
+ if (status && !verity_fec_is_enabled(io->v))
+ verity_error(v, io, status);
bio->bi_end_io = io->orig_bi_end_io;
bio->bi_status = status;
@@ -917,6 +1036,22 @@
return r;
continue;
+ } else if (!strcasecmp(arg_name, DM_VERITY_OPT_ERROR_BEHAVIOR)) {
+ int behavior;
+
+ if (!argc) {
+ ti->error = "Missing error behavior parameter";
+ return -EINVAL;
+ }
+ if (kstrtoint(dm_shift_arg(as), 0, &behavior) ||
+ behavior < 0) {
+ ti->error = "Bad error behavior parameter";
+ return -EINVAL;
+ }
+ v->error_behavior = behavior;
+ argc--;
+ continue;
+
} else if (verity_is_fec_opt_arg(arg_name)) {
r = verity_fec_parse_opt_args(as, v, &argc, arg_name);
if (r)
@@ -939,6 +1074,132 @@
return r;
}
+static int verity_get_device(struct dm_target *ti, const char *devname,
+ struct dm_dev **dm_dev)
+{
+ do {
+ /* Try the normal path first; if everything is ready, it
+ * will be the fastest.
+ */
+ if (!dm_get_device(ti, devname,
+ dm_table_get_mode(ti->table), dm_dev))
+ return 0;
+
+ if (!dev_wait)
+ break;
+
+ /* No need to be too aggressive since this is a slow path. */
+ msleep(500);
+ } while (dev_wait && (driver_probe_done() != 0 || *dm_dev == NULL));
+ return -1;
+}
+
+static void splitarg(char *arg, char **key, char **val)
+{
+ *key = strsep(&arg, "=");
+ *val = strsep(&arg, "");
+}
+
+/* Convert Chrome OS arguments into standard arguments */
+
+static char *chromeos_args(unsigned *pargc, char ***pargv)
+{
+ char *hashstart = NULL;
+ char **argv = *pargv;
+ int argc = *pargc;
+ char *key, *val;
+ int nargc = 10;
+ char **nargv;
+ char *errstr;
+ int i;
+
+ nargv = kcalloc(14, sizeof(char *), GFP_KERNEL);
+ if (!nargv)
+ return "Failed to allocate memory";
+
+ nargv[0] = "0"; /* version */
+ nargv[3] = "4096"; /* hash block size */
+ nargv[4] = "4096"; /* data block size */
+ nargv[9] = "-"; /* salt (optional) */
+
+ for (i = 0; i < argc; ++i) {
+ DMDEBUG("Argument %d: '%s'", i, argv[i]);
+ splitarg(argv[i], &key, &val);
+ if (!key) {
+ DMWARN("Bad argument %d: missing key?", i);
+ errstr = "Bad argument: missing key";
+ goto err;
+ }
+ if (!val) {
+ DMWARN("Bad argument %d='%s': missing value", i, key);
+ errstr = "Bad argument: missing value";
+ goto err;
+ }
+ if (!strcmp(key, "alg")) {
+ nargv[7] = val;
+ } else if (!strcmp(key, "payload")) {
+ nargv[1] = val;
+ } else if (!strcmp(key, "hashtree")) {
+ nargv[2] = val;
+ } else if (!strcmp(key, "root_hexdigest")) {
+ nargv[8] = val;
+ } else if (!strcmp(key, "hashstart")) {
+ unsigned long num;
+
+ if (kstrtoul(val, 10, &num)) {
+ errstr = "Invalid hashstart";
+ goto err;
+ }
+ num >>= (12 - SECTOR_SHIFT);
+ hashstart = kmalloc(24, GFP_KERNEL);
+ if (!hashstart) {
+ errstr = "Failed to allocate memory";
+ goto err;
+ }
+ scnprintf(hashstart, 24, "%lu", num); /* 24 matches the kmalloc above */
+ nargv[5] = hashstart;
+ nargv[6] = hashstart;
+ } else if (!strcmp(key, "salt")) {
+ nargv[9] = val;
+ } else if (!strcmp(key, DM_VERITY_OPT_ERROR_BEHAVIOR)) {
+ char *behavior = verity_parse_error_behavior(val);
+
+ if (IS_ERR(behavior)) {
+ errstr = "Invalid error behavior";
+ goto err;
+ }
+ nargv[10] = "2";
+ nargv[11] = key;
+ nargv[12] = behavior;
+ nargc = 13;
+ }
+ }
+
+ if (!nargv[1] || !nargv[2] || !nargv[5] || !nargv[7] || !nargv[8]) {
+ errstr = "Missing argument";
+ goto err;
+ }
+
+ *pargc = nargc;
+ *pargv = nargv;
+ return NULL;
+
+err:
+ kfree(nargv);
+ kfree(hashstart);
+ return errstr;
+}
+
+/* Release memory allocated for Chrome OS parameter conversion */
+
+static void free_chromeos_argv(char **argv)
+{
+ if (argv) {
+ kfree(argv[5]);
+ kfree(argv);
+ }
+}
+
/*
* Target parameters:
* <version> The current format is version 1.
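As a worked example of the mapping in chromeos_args() above, the key=value arguments fill the standard verity target's positional slots like this (devices and digest illustrative):

  payload=/dev/sda3 hashtree=/dev/sda3 hashstart=1740800 alg=sha1
  root_hexdigest=<digest> salt=<salt>

becomes

  0 /dev/sda3 /dev/sda3 4096 4096 217600 217600 sha1 <digest> <salt>

where nargv[0] is the version "0", nargv[3]/nargv[4] are the fixed 4096-byte block sizes, and 217600 = 1740800 >> (12 - SECTOR_SHIFT) converts 512-byte sectors to 4 KiB blocks, used both as the data-block count (nargv[5]) and the hash start block (nargv[6]). An error_behavior=... option appends "2 error_behavior <digit>" as optional arguments.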
@@ -965,10 +1226,19 @@
sector_t hash_position;
char dummy;
char *root_hash_digest_to_validate;
+ char **chromeos_argv = NULL;
+
+ if (argc < 10) {
+ ti->error = chromeos_args(&argc, &argv);
+ if (ti->error)
+ return -EINVAL;
+ chromeos_argv = argv;
+ }
v = kzalloc(sizeof(struct dm_verity), GFP_KERNEL);
if (!v) {
ti->error = "Cannot allocate verity structure";
+ free_chromeos_argv(chromeos_argv);
return -ENOMEM;
}
ti->private = v;
@@ -998,13 +1268,13 @@
}
v->version = num;
- r = dm_get_device(ti, argv[1], FMODE_READ, &v->data_dev);
+ r = verity_get_device(ti, argv[1], &v->data_dev);
if (r) {
ti->error = "Data device lookup failed";
goto bad;
}
- r = dm_get_device(ti, argv[2], FMODE_READ, &v->hash_dev);
+ r = verity_get_device(ti, argv[2], &v->hash_dev);
if (r) {
ti->error = "Hash device lookup failed";
goto bad;
@@ -1204,14 +1474,14 @@
__alignof__(struct dm_verity_io));
verity_verify_sig_opts_cleanup(&verify_args);
-
+ free_chromeos_argv(chromeos_argv);
return 0;
bad:
verity_verify_sig_opts_cleanup(&verify_args);
verity_dtr(ti);
-
+ free_chromeos_argv(chromeos_argv);
return r;
}
diff --git a/drivers/md/dm-verity.h b/drivers/md/dm-verity.h
index 641b9e3..f453caf 100644
--- a/drivers/md/dm-verity.h
+++ b/drivers/md/dm-verity.h
@@ -14,6 +14,7 @@
#include <linux/dm-bufio.h>
#include <linux/device-mapper.h>
#include <crypto/hash.h>
+#include <linux/notifier.h>
#define DM_VERITY_MAX_LEVELS 63
@@ -55,6 +56,7 @@
int hash_failed; /* set to 1 if hash of any block failed */
enum verity_mode mode; /* mode for handling verification errors */
unsigned corrupted_errs;/* Number of errors for corrupted blocks */
+ int error_behavior; /* selects error behavior on io errors */
struct workqueue_struct *verify_wq;
@@ -92,6 +94,40 @@
*/
};
+struct verity_result {
+ struct completion completion;
+ int err;
+};
+
+struct dm_verity_error_state {
+ int code;
+ int transient; /* Likely to not happen after a reboot */
+ u64 block;
+ const char *message;
+
+ sector_t dev_start;
+ sector_t dev_len;
+ struct block_device *dev;
+
+ sector_t hash_dev_start;
+ sector_t hash_dev_len;
+ struct block_device *hash_dev;
+
+ /* Final behavior after all notifications are completed. */
+ int behavior;
+};
+
+/* This enum must be matched to allowed_error_behaviors in dm-verity.c */
+enum dm_verity_error_behavior {
+ DM_VERITY_ERROR_BEHAVIOR_EIO = 0,
+ DM_VERITY_ERROR_BEHAVIOR_PANIC,
+ DM_VERITY_ERROR_BEHAVIOR_NONE,
+ DM_VERITY_ERROR_BEHAVIOR_NOTIFY
+};
+
+int dm_verity_register_error_notifier(struct notifier_block *nb);
+int dm_verity_unregister_error_notifier(struct notifier_block *nb);
+
static inline struct ahash_request *verity_io_hash_req(struct dm_verity *v,
struct dm_verity_io *io)
{
diff --git a/drivers/net/ethernet/google/gve/gve.h b/drivers/net/ethernet/google/gve/gve.h
index f19edd4..90d7ac5 100644
--- a/drivers/net/ethernet/google/gve/gve.h
+++ b/drivers/net/ethernet/google/gve/gve.h
@@ -27,6 +27,17 @@
/* 1 for management, 1 for rx, 1 for tx */
#define GVE_MIN_MSIX 3
+/* Numbers of gve tx/rx stats in stats report. */
+#define GVE_TX_STATS_REPORT_NUM 5
+#define GVE_RX_STATS_REPORT_NUM 2
+
+/* Numbers of NIC tx/rx stats in stats report. */
+#define NIC_TX_STATS_REPORT_NUM 0
+#define NIC_RX_STATS_REPORT_NUM 4
+
+/* Interval to schedule a service task, 20000ms. */
+#define GVE_SERVICE_TIMER_PERIOD 20000
+
/* Each slot in the desc ring has a 1:1 mapping to a slot in the data ring */
struct gve_rx_desc_queue {
struct gve_rx_desc *desc_ring; /* the descriptor ring */
@@ -39,6 +50,8 @@
struct page *page;
void *page_address;
u32 page_offset; /* offset to write to in page */
+ int pagecnt_bias; /* expected pagecnt if only the driver has a ref */
+ bool can_flip; /* page can be flipped and reused */
};
/* A list of pages registered with the device during setup and used by a queue
@@ -57,6 +70,7 @@
dma_addr_t data_bus; /* dma mapping of the slots */
struct gve_rx_slot_page_info *page_info; /* page info of the buffers */
struct gve_queue_page_list *qpl; /* qpl assigned to this queue */
+ bool raw_addressing; /* use raw_addressing? */
};
struct gve_priv;
@@ -71,6 +85,13 @@
u32 cnt; /* free-running total number of completed packets */
u32 fill_cnt; /* free-running total number of descs and buffs posted */
u32 mask; /* masks the cnt and fill_cnt to the size of the ring */
+ u32 db_threshold; /* threshold for posting new buffs and descs */
+ u64 rx_copybreak_pkt; /* free-running count of copybreak packets */
+ u64 rx_copied_pkt; /* free-running total number of copied packets */
+ u64 rx_skb_alloc_fail; /* free-running count of skb alloc fails */
+ u64 rx_buf_alloc_fail; /* free-running count of buffer alloc fails */
+ u64 rx_desc_err_dropped_pkt; /* free-running count of packets dropped by descriptor error */
+ u64 rx_no_refill_dropped_pkt; /* free-running count of packets dropped because of lack of buffer refill */
u32 q_num; /* queue index */
u32 ntfy_id; /* notification block index */
struct gve_queue_resources *q_resources; /* head and tail pointer idx */
@@ -91,12 +112,20 @@
u32 iov_padding; /* padding associated with this segment */
};
+struct gve_tx_dma_buf {
+ DEFINE_DMA_UNMAP_ADDR(dma);
+ DEFINE_DMA_UNMAP_LEN(len);
+};
+
/* Tracks the memory in the fifo occupied by the skb. Mapped 1:1 to a desc
* ring entry but only used for a pkt_desc not a seg_desc
*/
struct gve_tx_buffer_state {
struct sk_buff *skb; /* skb for this pkt */
- struct gve_tx_iovec iov[GVE_TX_MAX_IOVEC]; /* segments of this pkt */
+ union {
+ struct gve_tx_iovec iov[GVE_TX_MAX_IOVEC]; /* segments of this pkt */
+ struct gve_tx_dma_buf buf;
+ };
};
/* A TX buffer - each queue has one */
@@ -119,13 +148,16 @@
__be32 last_nic_done ____cacheline_aligned; /* NIC tail pointer */
u64 pkt_done; /* free-running - total packets completed */
u64 bytes_done; /* free-running - total bytes completed */
+ u32 dropped_pkt; /* free-running - total packets dropped */
/* Cacheline 2 -- Read-mostly fields */
union gve_tx_desc *desc ____cacheline_aligned;
struct gve_tx_buffer_state *info; /* Maps 1:1 to a desc */
struct netdev_queue *netdev_txq;
struct gve_queue_resources *q_resources; /* head and tail pointer idx */
+ struct device *dev;
u32 mask; /* masks req and done down to queue size */
+ bool raw_addressing; /* use raw_addressing? */
/* Slow-path fields */
u32 q_num ____cacheline_aligned; /* queue idx */
@@ -141,13 +173,13 @@
* associated with that irq.
*/
struct gve_notify_block {
- __be32 irq_db_index; /* idx into Bar2 - set by device, must be 1st */
+ __be32 *irq_db_index; /* pointer to idx into Bar2 */
char name[IFNAMSIZ + 16]; /* name registered with the kernel */
struct napi_struct napi; /* kernel napi struct for this block */
struct gve_priv *priv;
struct gve_tx_ring *tx; /* tx rings on this block */
struct gve_rx_ring *rx; /* rx rings on this block */
-} ____cacheline_aligned;
+};
/* Tracks allowed and current queue settings */
struct gve_queue_config {
@@ -161,13 +193,18 @@
unsigned long *qpl_id_map; /* bitmap of used qpl ids */
};
+struct gve_irq_db {
+ __be32 index;
+} ____cacheline_aligned;
+
struct gve_priv {
struct net_device *dev;
struct gve_tx_ring *tx; /* array of tx_cfg.num_queues */
struct gve_rx_ring *rx; /* array of rx_cfg.num_queues */
struct gve_queue_page_list *qpls; /* array of num qpls */
struct gve_notify_block *ntfy_blocks; /* array of num_ntfy_blks */
- dma_addr_t ntfy_block_bus;
+ struct gve_irq_db *irq_db_indices; /* array of num_ntfy_blks */
+ dma_addr_t irq_db_indices_bus;
struct msix_entry *msix_vectors; /* array of num_ntfy_blks + 1 */
char mgmt_msix_name[IFNAMSIZ + 16];
u32 mgmt_msix_idx;
@@ -178,11 +215,12 @@
u16 tx_desc_cnt; /* num desc per ring */
u16 rx_desc_cnt; /* num desc per ring */
u16 tx_pages_per_qpl; /* tx buffer length */
- u16 rx_pages_per_qpl; /* rx buffer length */
+ u16 rx_data_slot_cnt; /* rx buffer length */
u64 max_registered_pages;
u64 num_registered_pages; /* num pages registered with NIC */
u32 rx_copybreak; /* copy packets smaller than this */
u16 default_num_queues; /* default num queues to set up */
+ bool raw_addressing; /* true if this dev supports raw addressing */
struct gve_queue_config tx_cfg;
struct gve_queue_config rx_cfg;
@@ -202,24 +240,63 @@
dma_addr_t adminq_bus_addr;
u32 adminq_mask; /* masks prod_cnt to adminq size */
u32 adminq_prod_cnt; /* free-running count of AQ cmds executed */
+ u32 adminq_cmd_fail; /* free-running count of AQ cmds failed */
+ u32 adminq_timeouts; /* free-running count of AQ cmds timeouts */
+ /* free-running count of per AQ cmd executed */
+ u32 adminq_describe_device_cnt;
+ u32 adminq_cfg_device_resources_cnt;
+ u32 adminq_register_page_list_cnt;
+ u32 adminq_unregister_page_list_cnt;
+ u32 adminq_create_tx_queue_cnt;
+ u32 adminq_create_rx_queue_cnt;
+ u32 adminq_destroy_tx_queue_cnt;
+ u32 adminq_destroy_rx_queue_cnt;
+ u32 adminq_dcfg_device_resources_cnt;
+ u32 adminq_set_driver_parameter_cnt;
+ u32 adminq_report_stats_cnt;
+ /* Global stats */
+ u32 interface_up_cnt; /* count of times interface turned up */
+ u32 interface_down_cnt; /* count of times interface turned down */
+ u32 reset_cnt; /* count of reset */
+ u32 page_alloc_fail; /* count of page alloc fails */
+ u32 dma_mapping_error; /* count of dma mapping errors */
struct workqueue_struct *gve_wq;
struct work_struct service_task;
unsigned long service_task_flags;
unsigned long state_flags;
+
+ struct gve_stats_report *stats_report;
+ u64 stats_report_len;
+ dma_addr_t stats_report_bus; /* dma address for the stats report */
+ unsigned long ethtool_flags;
+
+ unsigned long service_timer_period;
+ struct timer_list service_timer;
+
+ /* Gvnic device link speed from hypervisor. */
+ u64 link_speed;
+
+ /* Gvnic device's dma mask, set during probe. */
+ u8 dma_mask;
};
-enum gve_service_task_flags {
- GVE_PRIV_FLAGS_DO_RESET = BIT(1),
- GVE_PRIV_FLAGS_RESET_IN_PROGRESS = BIT(2),
- GVE_PRIV_FLAGS_PROBE_IN_PROGRESS = BIT(3),
+enum gve_service_task_flags_bit {
+ GVE_PRIV_FLAGS_DO_RESET = 1,
+ GVE_PRIV_FLAGS_RESET_IN_PROGRESS = 2,
+ GVE_PRIV_FLAGS_PROBE_IN_PROGRESS = 3,
+ GVE_PRIV_FLAGS_DO_REPORT_STATS = 4,
};
-enum gve_state_flags {
- GVE_PRIV_FLAGS_ADMIN_QUEUE_OK = BIT(1),
- GVE_PRIV_FLAGS_DEVICE_RESOURCES_OK = BIT(2),
- GVE_PRIV_FLAGS_DEVICE_RINGS_OK = BIT(3),
- GVE_PRIV_FLAGS_NAPI_ENABLED = BIT(4),
+enum gve_state_flags_bit {
+ GVE_PRIV_FLAGS_ADMIN_QUEUE_OK = 1,
+ GVE_PRIV_FLAGS_DEVICE_RESOURCES_OK = 2,
+ GVE_PRIV_FLAGS_DEVICE_RINGS_OK = 3,
+ GVE_PRIV_FLAGS_NAPI_ENABLED = 4,
+};
+
+enum gve_ethtool_flags_bit {
+ GVE_PRIV_FLAGS_REPORT_STATS = 0,
};
static inline bool gve_get_do_reset(struct gve_priv *priv)
@@ -269,6 +346,22 @@
clear_bit(GVE_PRIV_FLAGS_PROBE_IN_PROGRESS, &priv->service_task_flags);
}
+static inline bool gve_get_do_report_stats(struct gve_priv *priv)
+{
+ return test_bit(GVE_PRIV_FLAGS_DO_REPORT_STATS,
+ &priv->service_task_flags);
+}
+
+static inline void gve_set_do_report_stats(struct gve_priv *priv)
+{
+ set_bit(GVE_PRIV_FLAGS_DO_REPORT_STATS, &priv->service_task_flags);
+}
+
+static inline void gve_clear_do_report_stats(struct gve_priv *priv)
+{
+ clear_bit(GVE_PRIV_FLAGS_DO_REPORT_STATS, &priv->service_task_flags);
+}
+
static inline bool gve_get_admin_queue_ok(struct gve_priv *priv)
{
return test_bit(GVE_PRIV_FLAGS_ADMIN_QUEUE_OK, &priv->state_flags);
@@ -329,12 +422,27 @@
clear_bit(GVE_PRIV_FLAGS_NAPI_ENABLED, &priv->state_flags);
}
+static inline bool gve_get_report_stats(struct gve_priv *priv)
+{
+ return test_bit(GVE_PRIV_FLAGS_REPORT_STATS, &priv->ethtool_flags);
+}
+
+static inline void gve_set_report_stats(struct gve_priv *priv)
+{
+ set_bit(GVE_PRIV_FLAGS_REPORT_STATS, &priv->ethtool_flags);
+}
+
+static inline void gve_clear_report_stats(struct gve_priv *priv)
+{
+ clear_bit(GVE_PRIV_FLAGS_REPORT_STATS, &priv->ethtool_flags);
+}
+
/* Returns the address of the ntfy_blocks irq doorbell
*/
static inline __be32 __iomem *gve_irq_doorbell(struct gve_priv *priv,
struct gve_notify_block *block)
{
- return &priv->db_bar2[be32_to_cpu(block->irq_db_index)];
+ return &priv->db_bar2[be32_to_cpu(*block->irq_db_index)];
}
/* Returns the index into ntfy_blocks of the given tx ring's block
@@ -355,14 +463,22 @@
*/
static inline u32 gve_num_tx_qpls(struct gve_priv *priv)
{
- return priv->tx_cfg.num_queues;
+ if (priv->raw_addressing)
+ return 0;
+ return priv->tx_cfg.num_queues;
}
/* Returns the number of rx queue page lists
*/
static inline u32 gve_num_rx_qpls(struct gve_priv *priv)
{
- return priv->rx_cfg.num_queues;
+ if (priv->raw_addressing)
+ return 0;
+ return priv->rx_cfg.num_queues;
}
/* Returns a pointer to the next available tx qpl in the list of qpls
@@ -416,18 +532,10 @@
return DMA_FROM_DEVICE;
}
-/* Returns true if the max mtu allows page recycling */
-static inline bool gve_can_recycle_pages(struct net_device *dev)
-{
- /* We can't recycle the pages if we can't fit a packet into half a
- * page.
- */
- return dev->max_mtu <= PAGE_SIZE / 2;
-}
-
/* buffers */
-int gve_alloc_page(struct device *dev, struct page **page, dma_addr_t *dma,
- enum dma_data_direction);
+int gve_alloc_page(struct gve_priv *priv, struct device *dev,
+ struct page **page, dma_addr_t *dma,
+ enum dma_data_direction, gfp_t gfp_flags);
void gve_free_page(struct device *dev, struct page *page, dma_addr_t dma,
enum dma_data_direction);
/* tx handling */
@@ -450,6 +558,8 @@
int gve_adjust_queues(struct gve_priv *priv,
struct gve_queue_config new_rx_config,
struct gve_queue_config new_tx_config);
+/* report stats handling */
+void gve_handle_report_stats(struct gve_priv *priv);
/* exported by ethtool.c */
extern const struct ethtool_ops gve_ethtool_ops;
/* needed by ethtool */
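On the flag-enum rework above: set_bit()/test_bit()/clear_bit() take a bit number, not a mask, so the old BIT(n) enum values selected different bit positions than their names suggested (consistently, so it still worked, but it wasted bit 0 and read misleadingly). A standalone sketch of the difference:

#include <stdio.h>

#define BIT(nr) (1UL << (nr))

/* Userspace stand-in for the kernel helper, which takes a bit NUMBER. */
static void set_bit(unsigned int nr, unsigned long *addr)
{
        *addr |= 1UL << nr;
}

int main(void)
{
        unsigned long flags = 0;

        set_bit(BIT(1), &flags);        /* old enums: BIT(1) == 2, sets bit 2 */
        set_bit(1, &flags);             /* new enums: sets bit 1, as named */
        printf("flags = %#lx\n", flags);        /* 0x6 */
        return 0;
}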
diff --git a/drivers/net/ethernet/google/gve/gve_adminq.c b/drivers/net/ethernet/google/gve/gve_adminq.c
index c3ba7ba..b02336b 100644
--- a/drivers/net/ethernet/google/gve/gve_adminq.c
+++ b/drivers/net/ethernet/google/gve/gve_adminq.c
@@ -23,6 +23,19 @@
priv->adminq_mask = (PAGE_SIZE / sizeof(union gve_adminq_command)) - 1;
priv->adminq_prod_cnt = 0;
+ priv->adminq_cmd_fail = 0;
+ priv->adminq_timeouts = 0;
+ priv->adminq_describe_device_cnt = 0;
+ priv->adminq_cfg_device_resources_cnt = 0;
+ priv->adminq_register_page_list_cnt = 0;
+ priv->adminq_unregister_page_list_cnt = 0;
+ priv->adminq_create_tx_queue_cnt = 0;
+ priv->adminq_create_rx_queue_cnt = 0;
+ priv->adminq_destroy_tx_queue_cnt = 0;
+ priv->adminq_destroy_rx_queue_cnt = 0;
+ priv->adminq_dcfg_device_resources_cnt = 0;
+ priv->adminq_set_driver_parameter_cnt = 0;
+ priv->adminq_report_stats_cnt = 0;
/* Setup Admin queue with the device */
iowrite32be(priv->adminq_bus_addr / PAGE_SIZE,
@@ -81,17 +94,18 @@
return false;
}
-static int gve_adminq_parse_err(struct device *dev, u32 status)
+static int gve_adminq_parse_err(struct gve_priv *priv, u32 status)
{
if (status != GVE_ADMINQ_COMMAND_PASSED &&
- status != GVE_ADMINQ_COMMAND_UNSET)
- dev_err(dev, "AQ command failed with status %d\n", status);
-
+ status != GVE_ADMINQ_COMMAND_UNSET) {
+ dev_err(&priv->pdev->dev, "AQ command failed with status %d\n", status);
+ priv->adminq_cmd_fail++;
+ }
switch (status) {
case GVE_ADMINQ_COMMAND_PASSED:
return 0;
case GVE_ADMINQ_COMMAND_UNSET:
- dev_err(dev, "parse_aq_err: err and status both unset, this should not be possible.\n");
+ dev_err(&priv->pdev->dev, "parse_aq_err: err and status both unset, this should not be possible.\n");
return -EINVAL;
case GVE_ADMINQ_COMMAND_ERROR_ABORTED:
case GVE_ADMINQ_COMMAND_ERROR_CANCELLED:
@@ -116,36 +130,143 @@
case GVE_ADMINQ_COMMAND_ERROR_UNIMPLEMENTED:
return -ENOTSUPP;
default:
- dev_err(dev, "parse_aq_err: unknown status code %d\n", status);
+ dev_err(&priv->pdev->dev, "parse_aq_err: unknown status code %d\n", status);
return -EINVAL;
}
}
+/* Flushes all AQ commands currently queued and waits for them to complete.
+ * If there are failures, it will return the first error.
+ */
+static int gve_adminq_kick_and_wait(struct gve_priv *priv)
+{
+ u32 tail, head;
+ int i;
+
+ tail = ioread32be(&priv->reg_bar0->adminq_event_counter);
+ head = priv->adminq_prod_cnt;
+
+ gve_adminq_kick_cmd(priv, head);
+ if (!gve_adminq_wait_for_cmd(priv, head)) {
+ dev_err(&priv->pdev->dev, "AQ commands timed out, need to reset AQ\n");
+ priv->adminq_timeouts++;
+ return -ENOTRECOVERABLE;
+ }
+
+ for (i = tail; i < head; i++) {
+ union gve_adminq_command *cmd;
+ u32 status, err;
+
+ cmd = &priv->adminq[i & priv->adminq_mask];
+ status = be32_to_cpu(READ_ONCE(cmd->status));
+ err = gve_adminq_parse_err(priv, status);
+ if (err)
+ // Return the first error if we failed.
+ return err;
+ }
+
+ return 0;
+}
+
/* This function is not threadsafe - the caller is responsible for any
* necessary locks.
*/
-int gve_adminq_execute_cmd(struct gve_priv *priv,
- union gve_adminq_command *cmd_orig)
+static int gve_adminq_issue_cmd(struct gve_priv *priv,
+ union gve_adminq_command *cmd_orig)
{
union gve_adminq_command *cmd;
- u32 status = 0;
- u32 prod_cnt;
+ u32 tail;
+ u32 opcode;
+
+ tail = ioread32be(&priv->reg_bar0->adminq_event_counter);
+
+ // Check if next command will overflow the buffer.
+ if (((priv->adminq_prod_cnt + 1) & priv->adminq_mask) == (tail & priv->adminq_mask)) {
+ int err;
+
+ // Flush existing commands to make room.
+ err = gve_adminq_kick_and_wait(priv);
+ if (err)
+ return err;
+
+ // Retry.
+ tail = ioread32be(&priv->reg_bar0->adminq_event_counter);
+ if (((priv->adminq_prod_cnt + 1) & priv->adminq_mask) == (tail & priv->adminq_mask)) {
+ // This should never happen. We just flushed the
+ // command queue so there should be enough space.
+ return -ENOMEM;
+ }
+ }
cmd = &priv->adminq[priv->adminq_prod_cnt & priv->adminq_mask];
priv->adminq_prod_cnt++;
- prod_cnt = priv->adminq_prod_cnt;
memcpy(cmd, cmd_orig, sizeof(*cmd_orig));
+ opcode = be32_to_cpu(READ_ONCE(cmd->opcode));
- gve_adminq_kick_cmd(priv, prod_cnt);
- if (!gve_adminq_wait_for_cmd(priv, prod_cnt)) {
- dev_err(&priv->pdev->dev, "AQ command timed out, need to reset AQ\n");
- return -ENOTRECOVERABLE;
+ switch (opcode) {
+ case GVE_ADMINQ_DESCRIBE_DEVICE:
+ priv->adminq_describe_device_cnt++;
+ break;
+ case GVE_ADMINQ_CONFIGURE_DEVICE_RESOURCES:
+ priv->adminq_cfg_device_resources_cnt++;
+ break;
+ case GVE_ADMINQ_REGISTER_PAGE_LIST:
+ priv->adminq_register_page_list_cnt++;
+ break;
+ case GVE_ADMINQ_UNREGISTER_PAGE_LIST:
+ priv->adminq_unregister_page_list_cnt++;
+ break;
+ case GVE_ADMINQ_CREATE_TX_QUEUE:
+ priv->adminq_create_tx_queue_cnt++;
+ break;
+ case GVE_ADMINQ_CREATE_RX_QUEUE:
+ priv->adminq_create_rx_queue_cnt++;
+ break;
+ case GVE_ADMINQ_DESTROY_TX_QUEUE:
+ priv->adminq_destroy_tx_queue_cnt++;
+ break;
+ case GVE_ADMINQ_DESTROY_RX_QUEUE:
+ priv->adminq_destroy_rx_queue_cnt++;
+ break;
+ case GVE_ADMINQ_DECONFIGURE_DEVICE_RESOURCES:
+ priv->adminq_dcfg_device_resources_cnt++;
+ break;
+ case GVE_ADMINQ_SET_DRIVER_PARAMETER:
+ priv->adminq_set_driver_parameter_cnt++;
+ break;
+ case GVE_ADMINQ_REPORT_STATS:
+ priv->adminq_report_stats_cnt++;
+ break;
+ default:
+ dev_err(&priv->pdev->dev, "unknown AQ command opcode %d\n", opcode);
}
- memcpy(cmd_orig, cmd, sizeof(*cmd));
- status = be32_to_cpu(READ_ONCE(cmd->status));
- return gve_adminq_parse_err(&priv->pdev->dev, status);
+ return 0;
+}
+
+/* This function is not threadsafe - the caller is responsible for any
+ * necessary locks.
+ * The caller is also responsible for making sure there are no commands
+ * waiting to be executed.
+ */
+static int gve_adminq_execute_cmd(struct gve_priv *priv,
+ union gve_adminq_command *cmd_orig)
+{
+ u32 tail, head;
+ int err;
+
+ tail = ioread32be(&priv->reg_bar0->adminq_event_counter);
+ head = priv->adminq_prod_cnt;
+ if (tail != head)
+ // This is not a valid path
+ return -EINVAL;
+
+ err = gve_adminq_issue_cmd(priv, cmd_orig);
+ if (err)
+ return err;
+
+ return gve_adminq_kick_and_wait(priv);
}
/* The device specifies that the management vector can either be the first irq
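The batching above leans on two free-running counters: adminq_prod_cnt counts commands issued, the device's adminq_event_counter counts commands completed, and cnt & adminq_mask maps either onto a ring slot. A standalone sketch of the occupancy math (ring size and counts made up; the size must be a power of two):

#include <stdio.h>

int main(void)
{
        unsigned int size = 64, mask = size - 1;
        unsigned int head = 70;         /* free-running: commands issued */
        unsigned int tail = 20;         /* free-running: commands completed */

        printf("in flight: %u\n", head - tail);         /* 50 */
        printf("next slot: %u\n", head & mask);         /* 70 mod 64 == 6 */
        printf("ring full: %d\n", head - tail == size); /* 0 */
        return 0;
}

Unsigned wraparound keeps head - tail correct even after the 32-bit counters overflow, which is why the overflow check above compares masked slot indices rather than raw counters.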
@@ -172,7 +293,7 @@
.num_counters = cpu_to_be32(num_counters),
.irq_db_addr = cpu_to_be64(db_array_bus_addr),
.num_irq_dbs = cpu_to_be32(num_ntfy_blks),
- .irq_db_stride = cpu_to_be32(sizeof(priv->ntfy_blocks[0])),
+ .irq_db_stride = cpu_to_be32(sizeof(*priv->irq_db_indices)),
.ntfy_blk_msix_base_idx =
cpu_to_be32(GVE_NTFY_BLK_BASE_MSIX_IDX),
};
@@ -190,80 +311,120 @@
return gve_adminq_execute_cmd(priv, &cmd);
}
-int gve_adminq_create_tx_queue(struct gve_priv *priv, u32 queue_index)
+int gve_adminq_create_tx_queues(struct gve_priv *priv, u32 num_queues)
{
- struct gve_tx_ring *tx = &priv->tx[queue_index];
union gve_adminq_command cmd;
+ struct gve_tx_ring *tx;
+ u32 qpl_id;
+ int err;
+ int i;
- memset(&cmd, 0, sizeof(cmd));
- cmd.opcode = cpu_to_be32(GVE_ADMINQ_CREATE_TX_QUEUE);
- cmd.create_tx_queue = (struct gve_adminq_create_tx_queue) {
- .queue_id = cpu_to_be32(queue_index),
- .reserved = 0,
- .queue_resources_addr = cpu_to_be64(tx->q_resources_bus),
- .tx_ring_addr = cpu_to_be64(tx->bus),
- .queue_page_list_id = cpu_to_be32(tx->tx_fifo.qpl->id),
- .ntfy_id = cpu_to_be32(tx->ntfy_id),
- };
+ for (i = 0; i < num_queues; i++) {
+ tx = &priv->tx[i];
+ qpl_id = priv->raw_addressing ? GVE_RAW_ADDRESSING_QPL_ID :
+ tx->tx_fifo.qpl->id;
+ memset(&cmd, 0, sizeof(cmd));
+ cmd.opcode = cpu_to_be32(GVE_ADMINQ_CREATE_TX_QUEUE);
+ cmd.create_tx_queue = (struct gve_adminq_create_tx_queue) {
+ .queue_id = cpu_to_be32(i),
+ .reserved = 0,
+ .queue_resources_addr =
+ cpu_to_be64(tx->q_resources_bus),
+ .tx_ring_addr = cpu_to_be64(tx->bus),
+ .queue_page_list_id = cpu_to_be32(qpl_id),
+ .ntfy_id = cpu_to_be32(tx->ntfy_id),
+ };
+ err = gve_adminq_issue_cmd(priv, &cmd);
+ if (err)
+ return err;
+ }
- return gve_adminq_execute_cmd(priv, &cmd);
+ return gve_adminq_kick_and_wait(priv);
}
-int gve_adminq_create_rx_queue(struct gve_priv *priv, u32 queue_index)
+int gve_adminq_create_rx_queues(struct gve_priv *priv, u32 num_queues)
{
- struct gve_rx_ring *rx = &priv->rx[queue_index];
union gve_adminq_command cmd;
+ struct gve_rx_ring *rx;
+ u32 qpl_id;
+ int err;
+ int i;
- memset(&cmd, 0, sizeof(cmd));
- cmd.opcode = cpu_to_be32(GVE_ADMINQ_CREATE_RX_QUEUE);
- cmd.create_rx_queue = (struct gve_adminq_create_rx_queue) {
- .queue_id = cpu_to_be32(queue_index),
- .index = cpu_to_be32(queue_index),
- .reserved = 0,
- .ntfy_id = cpu_to_be32(rx->ntfy_id),
- .queue_resources_addr = cpu_to_be64(rx->q_resources_bus),
- .rx_desc_ring_addr = cpu_to_be64(rx->desc.bus),
- .rx_data_ring_addr = cpu_to_be64(rx->data.data_bus),
- .queue_page_list_id = cpu_to_be32(rx->data.qpl->id),
- };
+ for (i = 0; i < num_queues; i++) {
+ rx = &priv->rx[i];
+ qpl_id = priv->raw_addressing ? GVE_RAW_ADDRESSING_QPL_ID :
+ rx->data.qpl->id;
+ memset(&cmd, 0, sizeof(cmd));
+ cmd.opcode = cpu_to_be32(GVE_ADMINQ_CREATE_RX_QUEUE);
+ cmd.create_rx_queue = (struct gve_adminq_create_rx_queue) {
+ .queue_id = cpu_to_be32(i),
+ .index = cpu_to_be32(i),
+ .reserved = 0,
+ .ntfy_id = cpu_to_be32(rx->ntfy_id),
+ .queue_resources_addr = cpu_to_be64(rx->q_resources_bus),
+ .rx_desc_ring_addr = cpu_to_be64(rx->desc.bus),
+ .rx_data_ring_addr = cpu_to_be64(rx->data.data_bus),
+ .queue_page_list_id = cpu_to_be32(qpl_id),
+ };
+ err = gve_adminq_issue_cmd(priv, &cmd);
+ if (err)
+ return err;
+ }
- return gve_adminq_execute_cmd(priv, &cmd);
+ return gve_adminq_kick_and_wait(priv);
}
-int gve_adminq_destroy_tx_queue(struct gve_priv *priv, u32 queue_index)
+int gve_adminq_destroy_tx_queues(struct gve_priv *priv, u32 num_queues)
{
union gve_adminq_command cmd;
+ int err;
+ int i;
- memset(&cmd, 0, sizeof(cmd));
- cmd.opcode = cpu_to_be32(GVE_ADMINQ_DESTROY_TX_QUEUE);
- cmd.destroy_tx_queue = (struct gve_adminq_destroy_tx_queue) {
- .queue_id = cpu_to_be32(queue_index),
- };
+ for (i = 0; i < num_queues; i++) {
+ memset(&cmd, 0, sizeof(cmd));
+ cmd.opcode = cpu_to_be32(GVE_ADMINQ_DESTROY_TX_QUEUE);
+ cmd.destroy_tx_queue = (struct gve_adminq_destroy_tx_queue) {
+ .queue_id = cpu_to_be32(i),
+ };
+ err = gve_adminq_issue_cmd(priv, &cmd);
+ if (err)
+ return err;
+ }
- return gve_adminq_execute_cmd(priv, &cmd);
+ return gve_adminq_kick_and_wait(priv);
}
-int gve_adminq_destroy_rx_queue(struct gve_priv *priv, u32 queue_index)
+int gve_adminq_destroy_rx_queues(struct gve_priv *priv, u32 num_queues)
{
union gve_adminq_command cmd;
+ int err;
+ int i;
- memset(&cmd, 0, sizeof(cmd));
- cmd.opcode = cpu_to_be32(GVE_ADMINQ_DESTROY_RX_QUEUE);
- cmd.destroy_rx_queue = (struct gve_adminq_destroy_rx_queue) {
- .queue_id = cpu_to_be32(queue_index),
- };
+ for (i = 0; i < num_queues; i++) {
+ memset(&cmd, 0, sizeof(cmd));
+ cmd.opcode = cpu_to_be32(GVE_ADMINQ_DESTROY_RX_QUEUE);
+ cmd.destroy_rx_queue = (struct gve_adminq_destroy_rx_queue) {
+ .queue_id = cpu_to_be32(i),
+ };
+ err = gve_adminq_issue_cmd(priv, &cmd);
+ if (err)
+ return err;
+ }
- return gve_adminq_execute_cmd(priv, &cmd);
+ return gve_adminq_kick_and_wait(priv);
}
int gve_adminq_describe_device(struct gve_priv *priv)
{
struct gve_device_descriptor *descriptor;
+ struct gve_device_option *dev_opt;
union gve_adminq_command cmd;
dma_addr_t descriptor_bus;
+ u16 num_options;
int err = 0;
u8 *mac;
u16 mtu;
+ int i;
memset(&cmd, 0, sizeof(cmd));
descriptor = dma_alloc_coherent(&priv->pdev->dev, PAGE_SIZE,
@@ -283,8 +444,8 @@
priv->tx_desc_cnt = be16_to_cpu(descriptor->tx_queue_entries);
if (priv->tx_desc_cnt * sizeof(priv->tx->desc[0]) < PAGE_SIZE) {
- netif_err(priv, drv, priv->dev, "Tx desc count %d too low\n",
- priv->tx_desc_cnt);
+ dev_err(&priv->pdev->dev, "Tx desc count %d too low\n",
+ priv->tx_desc_cnt);
err = -EINVAL;
goto free_device_descriptor;
}
@@ -293,7 +454,7 @@
< PAGE_SIZE ||
priv->rx_desc_cnt * sizeof(priv->rx->data.data_ring[0])
< PAGE_SIZE) {
- netif_err(priv, drv, priv->dev, "Rx desc count %d too low\n",
+ dev_err(&priv->pdev->dev, "Rx desc count %d too low\n",
priv->rx_desc_cnt);
err = -EINVAL;
goto free_device_descriptor;
@@ -302,7 +463,7 @@
be64_to_cpu(descriptor->max_registered_pages);
mtu = be16_to_cpu(descriptor->mtu);
if (mtu < ETH_MIN_MTU) {
- netif_err(priv, drv, priv->dev, "MTU %d below minimum MTU\n",
+ dev_err(&priv->pdev->dev, "MTU %d below minimum MTU\n",
mtu);
err = -EINVAL;
goto free_device_descriptor;
@@ -311,18 +472,64 @@
priv->num_event_counters = be16_to_cpu(descriptor->counters);
ether_addr_copy(priv->dev->dev_addr, descriptor->mac);
mac = descriptor->mac;
- netif_info(priv, drv, priv->dev, "MAC addr: %pM\n", mac);
+ dev_info(&priv->pdev->dev, "MAC addr: %pM\n", mac);
priv->tx_pages_per_qpl = be16_to_cpu(descriptor->tx_pages_per_qpl);
- priv->rx_pages_per_qpl = be16_to_cpu(descriptor->rx_pages_per_qpl);
- if (priv->rx_pages_per_qpl < priv->rx_desc_cnt) {
- netif_err(priv, drv, priv->dev, "rx_pages_per_qpl cannot be smaller than rx_desc_cnt, setting rx_desc_cnt down to %d.\n",
- priv->rx_pages_per_qpl);
- priv->rx_desc_cnt = priv->rx_pages_per_qpl;
+ priv->rx_data_slot_cnt = be16_to_cpu(descriptor->rx_pages_per_qpl);
+ if (priv->rx_data_slot_cnt < priv->rx_desc_cnt) {
+ dev_err(&priv->pdev->dev, "rx_data_slot_cnt cannot be smaller than rx_desc_cnt, setting rx_desc_cnt down to %d.\n",
+ priv->rx_data_slot_cnt);
+ priv->rx_desc_cnt = priv->rx_data_slot_cnt;
}
priv->default_num_queues = be16_to_cpu(descriptor->default_num_queues);
+ dev_opt = (struct gve_device_option *)((void *)descriptor +
+ sizeof(*descriptor));
+
+ num_options = be16_to_cpu(descriptor->num_device_options);
+ for (i = 0; i < num_options; i++) {
+ u16 option_id;
+ u16 option_length;
+
+ if ((void *)dev_opt + sizeof(*dev_opt) > (void *)descriptor +
+ be16_to_cpu(descriptor->total_length)) {
+ dev_err(&priv->dev->dev,
+ "num_options in device_descriptor does not match total length.\n");
+ err = -EINVAL;
+ goto free_device_descriptor;
+ }
+
+ option_id = be16_to_cpu(dev_opt->option_id);
+ option_length = be16_to_cpu(dev_opt->option_length);
+ switch (option_id) {
+ case GVE_DEV_OPT_ID_RAW_ADDRESSING:
+ /* If the length or feature mask doesn't match,
+ * continue without enabling the feature.
+ */
+ if (option_length != GVE_DEV_OPT_LEN_RAW_ADDRESSING ||
+ be32_to_cpu(dev_opt->feat_mask) !=
+ GVE_DEV_OPT_FEAT_MASK_RAW_ADDRESSING) {
+ dev_info(&priv->pdev->dev,
+ "Raw addressing device option not enabled, length or features mask did not match expected.\n");
+ priv->raw_addressing = false;
+ } else {
+ dev_info(&priv->pdev->dev,
+ "Raw addressing device option enabled.\n");
+ priv->raw_addressing = true;
+ }
+ break;
+ default:
+ /* If we don't recognize the option just continue
+ * without doing anything.
+ */
+ dev_info(&priv->pdev->dev,
+ "Unrecognized device option 0x%hx not enabled.\n",
+ option_id);
+ break;
+ }
+ dev_opt = (void *)dev_opt + sizeof(*dev_opt) + option_length;
+ }
free_device_descriptor:
- dma_free_coherent(&priv->pdev->dev, sizeof(*descriptor), descriptor,
+ dma_free_coherent(&priv->pdev->dev, PAGE_SIZE, descriptor,
descriptor_bus);
return err;
}
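The option parsing above is a plain TLV walk: read a fixed gve_device_option header, bounds-check it against total_length, then advance by sizeof(header) + option_length. A standalone sketch over a fabricated buffer (layout per gve_adminq.h; the contents are made up):

#include <arpa/inet.h>          /* htons()/ntohs() for big-endian fields */
#include <stdint.h>
#include <stdio.h>
#include <string.h>

struct gve_device_option {
        uint16_t option_id;     /* __be16 on the wire */
        uint16_t option_length;
        uint32_t feat_mask;
};

int main(void)
{
        uint8_t buf[64] = { 0 };
        struct gve_device_option raw = {
                .option_id = htons(0x1),        /* GVE_DEV_OPT_ID_RAW_ADDRESSING */
                .option_length = htons(0),      /* GVE_DEV_OPT_LEN_RAW_ADDRESSING */
        };
        size_t total_length = sizeof(raw);
        uint8_t *p = buf;

        memcpy(buf, &raw, sizeof(raw));

        while (p + sizeof(struct gve_device_option) <= buf + total_length) {
                struct gve_device_option opt;

                memcpy(&opt, p, sizeof(opt));   /* avoid alignment pitfalls */
                printf("option id=%#x len=%u\n",
                       ntohs(opt.option_id), ntohs(opt.option_length));
                p += sizeof(opt) + ntohs(opt.option_length);
        }
        return 0;
}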
@@ -385,3 +592,45 @@
return gve_adminq_execute_cmd(priv, &cmd);
}
+
+int gve_adminq_report_stats(struct gve_priv *priv, u64 stats_report_len,
+ dma_addr_t stats_report_addr, u64 interval)
+{
+ union gve_adminq_command cmd;
+
+ memset(&cmd, 0, sizeof(cmd));
+ cmd.opcode = cpu_to_be32(GVE_ADMINQ_REPORT_STATS);
+ cmd.report_stats = (struct gve_adminq_report_stats) {
+ .stats_report_len = cpu_to_be64(stats_report_len),
+ .stats_report_addr = cpu_to_be64(stats_report_addr),
+ .interval = cpu_to_be64(interval),
+ };
+
+ return gve_adminq_execute_cmd(priv, &cmd);
+}
+
+int gve_adminq_report_link_speed(struct gve_priv *priv)
+{
+ union gve_adminq_command gvnic_cmd;
+ dma_addr_t link_speed_region_bus;
+ u64 *link_speed_region;
+ int err;
+
+ link_speed_region = dma_alloc_coherent(&priv->pdev->dev,
+ sizeof(*link_speed_region), &link_speed_region_bus, GFP_KERNEL);
+
+ if (!link_speed_region)
+ return -ENOMEM;
+
+ memset(&gvnic_cmd, 0, sizeof(gvnic_cmd));
+ gvnic_cmd.opcode = cpu_to_be32(GVE_ADMINQ_REPORT_LINK_SPEED);
+ gvnic_cmd.report_link_speed.link_speed_address =
+ cpu_to_be64(link_speed_region_bus);
+
+ err = gve_adminq_execute_cmd(priv, &gvnic_cmd);
+
+ priv->link_speed = be64_to_cpu(*link_speed_region);
+ dma_free_coherent(&priv->pdev->dev, sizeof(*link_speed_region),
+ link_speed_region, link_speed_region_bus);
+ return err;
+}
diff --git a/drivers/net/ethernet/google/gve/gve_adminq.h b/drivers/net/ethernet/google/gve/gve_adminq.h
index 4dfa06e..c2fc677 100644
--- a/drivers/net/ethernet/google/gve/gve_adminq.h
+++ b/drivers/net/ethernet/google/gve/gve_adminq.h
@@ -21,6 +21,8 @@
GVE_ADMINQ_DESTROY_RX_QUEUE = 0x8,
GVE_ADMINQ_DECONFIGURE_DEVICE_RESOURCES = 0x9,
GVE_ADMINQ_SET_DRIVER_PARAMETER = 0xB,
+ GVE_ADMINQ_REPORT_STATS = 0xC,
+ GVE_ADMINQ_REPORT_LINK_SPEED = 0xD
};
/* Admin queue status codes */
@@ -77,12 +79,17 @@
static_assert(sizeof(struct gve_device_descriptor) == 40);
-struct device_option {
- __be32 option_id;
- __be32 option_length;
+struct gve_device_option {
+ __be16 option_id;
+ __be16 option_length;
+ __be32 feat_mask;
};
-static_assert(sizeof(struct device_option) == 8);
+static_assert(sizeof(struct gve_device_option) == 8);
+
+#define GVE_DEV_OPT_ID_RAW_ADDRESSING 0x1
+#define GVE_DEV_OPT_LEN_RAW_ADDRESSING 0x0
+#define GVE_DEV_OPT_FEAT_MASK_RAW_ADDRESSING 0x0
struct gve_adminq_configure_device_resources {
__be64 counter_array;
@@ -109,6 +116,8 @@
static_assert(sizeof(struct gve_adminq_unregister_page_list) == 4);
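+
+/* Sentinel QPL ID: with raw addressing enabled, queues are not backed by a
+ * registered page list.
+ */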
+#define GVE_RAW_ADDRESSING_QPL_ID 0xFFFFFFFF
+
struct gve_adminq_create_tx_queue {
__be32 queue_id;
__be32 reserved;
@@ -172,6 +181,51 @@
static_assert(sizeof(struct gve_adminq_set_driver_parameter) == 16);
+struct gve_adminq_report_stats {
+ __be64 stats_report_len;
+ __be64 stats_report_addr;
+ __be64 interval;
+};
+
+static_assert(sizeof(struct gve_adminq_report_stats) == 24);
+
+struct gve_adminq_report_link_speed {
+ __be64 link_speed_address;
+};
+
+static_assert(sizeof(struct gve_adminq_report_link_speed) == 8);
+
+struct stats {
+ __be32 stat_name;
+ __be32 queue_id;
+ __be64 value;
+};
+
+static_assert(sizeof(struct stats) == 16);
+
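+/* Shared stats report: the driver bumps written_count each time it refreshes
+ * its own entries; NIC-written entries follow the driver's in the flexible
+ * stats array.
+ */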
+struct gve_stats_report {
+ __be64 written_count;
+ struct stats stats[0];
+};
+
+static_assert(sizeof(struct gve_stats_report) == 8);
+
+enum gve_stat_names {
+ /* stats from gve */
+ TX_WAKE_CNT = 1,
+ TX_STOP_CNT = 2,
+ TX_FRAMES_SENT = 3,
+ TX_BYTES_SENT = 4,
+ TX_LAST_COMPLETION_PROCESSED = 5,
+ RX_NEXT_EXPECTED_SEQUENCE = 6,
+ RX_BUFFERS_POSTED = 7,
+ /* stats from NIC */
+ RX_QUEUE_DROP_CNT = 65,
+ RX_NO_BUFFERS_POSTED = 66,
+ RX_DROPS_PACKET_OVER_MRU = 67,
+ RX_DROPS_INVALID_CHECKSUM = 68,
+};
+
union gve_adminq_command {
struct {
__be32 opcode;
@@ -187,6 +241,8 @@
struct gve_adminq_register_page_list reg_page_list;
struct gve_adminq_unregister_page_list unreg_page_list;
struct gve_adminq_set_driver_parameter set_driver_param;
+ struct gve_adminq_report_stats report_stats;
+ struct gve_adminq_report_link_speed report_link_speed;
};
};
u8 reserved[64];
@@ -197,8 +253,6 @@
int gve_adminq_alloc(struct device *dev, struct gve_priv *priv);
void gve_adminq_free(struct device *dev, struct gve_priv *priv);
void gve_adminq_release(struct gve_priv *priv);
-int gve_adminq_execute_cmd(struct gve_priv *priv,
- union gve_adminq_command *cmd_orig);
int gve_adminq_describe_device(struct gve_priv *priv);
int gve_adminq_configure_device_resources(struct gve_priv *priv,
dma_addr_t counter_array_bus_addr,
@@ -206,12 +260,15 @@
dma_addr_t db_array_bus_addr,
u32 num_ntfy_blks);
int gve_adminq_deconfigure_device_resources(struct gve_priv *priv);
-int gve_adminq_create_tx_queue(struct gve_priv *priv, u32 queue_id);
-int gve_adminq_destroy_tx_queue(struct gve_priv *priv, u32 queue_id);
-int gve_adminq_create_rx_queue(struct gve_priv *priv, u32 queue_id);
-int gve_adminq_destroy_rx_queue(struct gve_priv *priv, u32 queue_id);
+int gve_adminq_create_tx_queues(struct gve_priv *priv, u32 num_queues);
+int gve_adminq_destroy_tx_queues(struct gve_priv *priv, u32 queue_id);
+int gve_adminq_create_rx_queues(struct gve_priv *priv, u32 num_queues);
+int gve_adminq_destroy_rx_queues(struct gve_priv *priv, u32 queue_id);
int gve_adminq_register_page_list(struct gve_priv *priv,
struct gve_queue_page_list *qpl);
int gve_adminq_unregister_page_list(struct gve_priv *priv, u32 page_list_id);
int gve_adminq_set_mtu(struct gve_priv *priv, u64 mtu);
+int gve_adminq_report_stats(struct gve_priv *priv, u64 stats_report_len,
+ dma_addr_t stats_report_addr, u64 interval);
+int gve_adminq_report_link_speed(struct gve_priv *priv);
#endif /* _GVE_ADMINQ_H */
diff --git a/drivers/net/ethernet/google/gve/gve_desc.h b/drivers/net/ethernet/google/gve/gve_desc.h
index 54779871..a7da364 100644
--- a/drivers/net/ethernet/google/gve/gve_desc.h
+++ b/drivers/net/ethernet/google/gve/gve_desc.h
@@ -16,9 +16,11 @@
* Base addresses encoded in seg_addr are not assumed to be physical
* addresses. The ring format assumes these come from some linear address
* space. This could be physical memory, kernel virtual memory, user virtual
- * memory. gVNIC uses lists of registered pages. Each queue is assumed
- * to be associated with a single such linear address space to ensure a
- * consistent meaning for seg_addrs posted to its rings.
+ * memory.
+ * If raw dma addressing is not supported then gVNIC uses lists of registered
+ * pages. Each queue is assumed to be associated with a single such linear
+ * address space to ensure a consistent meaning for seg_addrs posted to its
+ * rings.
*/
struct gve_tx_pkt_desc {
@@ -72,12 +74,14 @@
} __packed;
static_assert(sizeof(struct gve_rx_desc) == 64);
-/* As with the Tx ring format, the qpl_offset entries below are offsets into an
- * ordered list of registered pages.
+/* If the device supports raw DMA addressing then the addr in a data slot is
+ * the DMA address of the buffer.
+ * If the device only supports registered segments then the addr is a byte
+ * offset into the registered segment (an ordered list of pages) where the
+ * buffer lives.
+ */
struct gve_rx_data_slot {
- /* byte offset into the rx registered segment of this slot */
- __be64 qpl_offset;
+ __be64 addr;
};
/* GVE Receive Packet Descriptor Seq No */
diff --git a/drivers/net/ethernet/google/gve/gve_ethtool.c b/drivers/net/ethernet/google/gve/gve_ethtool.c
index d8fa816..ae76683 100644
--- a/drivers/net/ethernet/google/gve/gve_ethtool.c
+++ b/drivers/net/ethernet/google/gve/gve_ethtool.c
@@ -6,6 +6,7 @@
#include <linux/rtnetlink.h>
#include "gve.h"
+#include "gve_adminq.h"
static void gve_get_drvinfo(struct net_device *netdev,
struct ethtool_drvinfo *info)
@@ -32,43 +33,86 @@
}
static const char gve_gstrings_main_stats[][ETH_GSTRING_LEN] = {
- "rx_packets", "tx_packets", "rx_bytes", "tx_bytes",
- "rx_dropped", "tx_dropped", "tx_timeouts",
+ "rx_packets", "rx_total_bytes", "rx_total_dropped_pkt",
+ "rx_skb_alloc_fail", "rx_buf_alloc_fail", "rx_desc_err_dropped_pkt",
+ "tx_packets", "tx_total_bytes", "tx_total_dropped_pkt", "tx_timeouts",
+ "interface_up_cnt", "interface_down_cnt", "reset_cnt",
+ "page_alloc_fail", "dma_mapping_error",
+};
+
+static const char gve_gstrings_rx_stats[][ETH_GSTRING_LEN] = {
+ "rx_posted_desc[%u]", "rx_completed_desc[%u]", "rx_bytes[%u]",
+ "rx_dropped_pkt[%u]", "rx_copybreak_pkt[%u]", "rx_copied_pkt[%u]",
+ "rx_queue_drop_cnt[%u]", "rx_no_buffers_posted[%u]",
+ "rx_drops_packet_over_mru[%u]", "rx_drops_invalid_checksum[%u]",
+};
+
+static const char gve_gstrings_tx_stats[][ETH_GSTRING_LEN] = {
+ "tx_posted_desc[%u]", "tx_completed_desc[%u]", "tx_bytes[%u]",
+ "tx_wake[%u]", "tx_stop[%u]", "tx_event_counter[%u]",
+};
+
+static const char gve_gstrings_adminq_stats[][ETH_GSTRING_LEN] = {
+ "adminq_prod_cnt", "adminq_cmd_fail", "adminq_timeouts",
+ "adminq_describe_device_cnt", "adminq_cfg_device_resources_cnt",
+ "adminq_register_page_list_cnt", "adminq_unregister_page_list_cnt",
+ "adminq_create_tx_queue_cnt", "adminq_create_rx_queue_cnt",
+ "adminq_destroy_tx_queue_cnt", "adminq_destroy_rx_queue_cnt",
+ "adminq_dcfg_device_resources_cnt", "adminq_set_driver_parameter_cnt",
+ "adminq_report_stats_cnt",
+};
+
+static const char gve_gstrings_priv_flags[][ETH_GSTRING_LEN] = {
+ "report-stats",
};
#define GVE_MAIN_STATS_LEN ARRAY_SIZE(gve_gstrings_main_stats)
-#define NUM_GVE_TX_CNTS 5
-#define NUM_GVE_RX_CNTS 2
+#define GVE_ADMINQ_STATS_LEN ARRAY_SIZE(gve_gstrings_adminq_stats)
+#define NUM_GVE_TX_CNTS ARRAY_SIZE(gve_gstrings_tx_stats)
+#define NUM_GVE_RX_CNTS ARRAY_SIZE(gve_gstrings_rx_stats)
+#define GVE_PRIV_FLAGS_STR_LEN ARRAY_SIZE(gve_gstrings_priv_flags)
static void gve_get_strings(struct net_device *netdev, u32 stringset, u8 *data)
{
struct gve_priv *priv = netdev_priv(netdev);
char *s = (char *)data;
- int i;
+ int i, j;
- if (stringset != ETH_SS_STATS)
- return;
+ switch (stringset) {
+ case ETH_SS_STATS:
+ memcpy(s, *gve_gstrings_main_stats,
+ sizeof(gve_gstrings_main_stats));
+ s += sizeof(gve_gstrings_main_stats);
- memcpy(s, *gve_gstrings_main_stats,
- sizeof(gve_gstrings_main_stats));
- s += sizeof(gve_gstrings_main_stats);
- for (i = 0; i < priv->rx_cfg.num_queues; i++) {
- snprintf(s, ETH_GSTRING_LEN, "rx_desc_cnt[%u]", i);
- s += ETH_GSTRING_LEN;
- snprintf(s, ETH_GSTRING_LEN, "rx_desc_fill_cnt[%u]", i);
- s += ETH_GSTRING_LEN;
- }
- for (i = 0; i < priv->tx_cfg.num_queues; i++) {
- snprintf(s, ETH_GSTRING_LEN, "tx_req[%u]", i);
- s += ETH_GSTRING_LEN;
- snprintf(s, ETH_GSTRING_LEN, "tx_done[%u]", i);
- s += ETH_GSTRING_LEN;
- snprintf(s, ETH_GSTRING_LEN, "tx_wake[%u]", i);
- s += ETH_GSTRING_LEN;
- snprintf(s, ETH_GSTRING_LEN, "tx_stop[%u]", i);
- s += ETH_GSTRING_LEN;
- snprintf(s, ETH_GSTRING_LEN, "tx_event_counter[%u]", i);
- s += ETH_GSTRING_LEN;
+ for (i = 0; i < priv->rx_cfg.num_queues; i++) {
+ for (j = 0; j < NUM_GVE_RX_CNTS; j++) {
+ snprintf(s, ETH_GSTRING_LEN,
+ gve_gstrings_rx_stats[j], i);
+ s += ETH_GSTRING_LEN;
+ }
+ }
+
+ for (i = 0; i < priv->tx_cfg.num_queues; i++) {
+ for (j = 0; j < NUM_GVE_TX_CNTS; j++) {
+ snprintf(s, ETH_GSTRING_LEN,
+ gve_gstrings_tx_stats[j], i);
+ s += ETH_GSTRING_LEN;
+ }
+ }
+
+ memcpy(s, *gve_gstrings_adminq_stats,
+ sizeof(gve_gstrings_adminq_stats));
+ s += sizeof(gve_gstrings_adminq_stats);
+ break;
+
+ case ETH_SS_PRIV_FLAGS:
+ memcpy(s, *gve_gstrings_priv_flags,
+ sizeof(gve_gstrings_priv_flags));
+ s += sizeof(gve_gstrings_priv_flags);
+ break;
+
+ default:
+ break;
}
}
@@ -78,9 +122,11 @@
switch (sset) {
case ETH_SS_STATS:
- return GVE_MAIN_STATS_LEN +
+ return GVE_MAIN_STATS_LEN + GVE_ADMINQ_STATS_LEN +
(priv->rx_cfg.num_queues * NUM_GVE_RX_CNTS) +
(priv->tx_cfg.num_queues * NUM_GVE_TX_CNTS);
+ case ETH_SS_PRIV_FLAGS:
+ return GVE_PRIV_FLAGS_STR_LEN;
default:
return -EOPNOTSUPP;
}
@@ -90,24 +136,57 @@
gve_get_ethtool_stats(struct net_device *netdev,
struct ethtool_stats *stats, u64 *data)
{
+ u64 tmp_rx_pkts, tmp_rx_bytes, tmp_rx_skb_alloc_fail,
+ tmp_rx_buf_alloc_fail, tmp_rx_desc_err_dropped_pkt,
+ tmp_tx_pkts, tmp_tx_bytes;
+ u64 rx_pkts, rx_bytes, rx_skb_alloc_fail, rx_buf_alloc_fail,
+ rx_desc_err_dropped_pkt, tx_pkts, tx_bytes;
struct gve_priv *priv = netdev_priv(netdev);
- u64 rx_pkts, rx_bytes, tx_pkts, tx_bytes;
+ int *rx_qid_to_stats_idx;
+ int *tx_qid_to_stats_idx;
+ struct stats *report_stats = priv->stats_report->stats;
+ int stats_idx, base_stats_idx, max_stats_idx;
+ bool skip_nic_stats;
unsigned int start;
int ring;
- int i;
+ int i, j;
ASSERT_RTNL();
- for (rx_pkts = 0, rx_bytes = 0, ring = 0;
+ rx_qid_to_stats_idx = kmalloc_array(priv->rx_cfg.num_queues,
+ sizeof(int), GFP_KERNEL);
+ if (!rx_qid_to_stats_idx)
+ return;
+ tx_qid_to_stats_idx = kmalloc_array(priv->tx_cfg.num_queues,
+ sizeof(int), GFP_KERNEL);
+ if (!tx_qid_to_stats_idx) {
+ kfree(rx_qid_to_stats_idx);
+ return;
+ }
+
+ for (rx_pkts = 0, rx_bytes = 0, rx_skb_alloc_fail = 0,
+ rx_buf_alloc_fail = 0, rx_desc_err_dropped_pkt = 0, ring = 0;
ring < priv->rx_cfg.num_queues; ring++) {
if (priv->rx) {
do {
+ struct gve_rx_ring *rx = &priv->rx[ring];
start =
u64_stats_fetch_begin(&priv->rx[ring].statss);
- rx_pkts += priv->rx[ring].rpackets;
- rx_bytes += priv->rx[ring].rbytes;
+ tmp_rx_pkts = rx->rpackets;
+ tmp_rx_bytes = rx->rbytes;
+ tmp_rx_skb_alloc_fail = rx->rx_skb_alloc_fail;
+ tmp_rx_buf_alloc_fail = rx->rx_buf_alloc_fail;
+ tmp_rx_desc_err_dropped_pkt =
+ rx->rx_desc_err_dropped_pkt;
} while (u64_stats_fetch_retry(&priv->rx[ring].statss,
start));
+ rx_pkts += tmp_rx_pkts;
+ rx_bytes += tmp_rx_bytes;
+ rx_skb_alloc_fail += tmp_rx_skb_alloc_fail;
+ rx_buf_alloc_fail += tmp_rx_buf_alloc_fail;
+ rx_desc_err_dropped_pkt += tmp_rx_desc_err_dropped_pkt;
}
}
for (tx_pkts = 0, tx_bytes = 0, ring = 0;
@@ -116,34 +195,111 @@
do {
start =
u64_stats_fetch_begin(&priv->tx[ring].statss);
- tx_pkts += priv->tx[ring].pkt_done;
- tx_bytes += priv->tx[ring].bytes_done;
+ tmp_tx_pkts = priv->tx[ring].pkt_done;
+ tmp_tx_bytes = priv->tx[ring].bytes_done;
} while (u64_stats_fetch_retry(&priv->tx[ring].statss,
start));
+ tx_pkts += tmp_tx_pkts;
+ tx_bytes += tmp_tx_bytes;
}
}
i = 0;
data[i++] = rx_pkts;
- data[i++] = tx_pkts;
data[i++] = rx_bytes;
+ /* total rx dropped packets */
+ data[i++] = rx_skb_alloc_fail + rx_buf_alloc_fail +
+ rx_desc_err_dropped_pkt;
+ data[i++] = rx_skb_alloc_fail;
+ data[i++] = rx_buf_alloc_fail;
+ data[i++] = rx_desc_err_dropped_pkt;
+ data[i++] = tx_pkts;
data[i++] = tx_bytes;
- /* Skip rx_dropped and tx_dropped */
- i += 2;
+ /* Skip tx_dropped */
+ i++;
data[i++] = priv->tx_timeo_cnt;
+ data[i++] = priv->interface_up_cnt;
+ data[i++] = priv->interface_down_cnt;
+ data[i++] = priv->reset_cnt;
+ data[i++] = priv->page_alloc_fail;
+ data[i++] = priv->dma_mapping_error;
i = GVE_MAIN_STATS_LEN;
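+ /* The report is laid out as driver tx entries, driver rx entries, NIC
+ * rx entries, then NIC tx entries; skip past the driver-written part.
+ */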
+ /* For rx cross-reporting stats, start from nic rx stats in report */
+ base_stats_idx = GVE_TX_STATS_REPORT_NUM * priv->tx_cfg.num_queues +
+ GVE_RX_STATS_REPORT_NUM * priv->rx_cfg.num_queues;
+ max_stats_idx = NIC_RX_STATS_REPORT_NUM * priv->rx_cfg.num_queues +
+ base_stats_idx;
+ /* Preprocess the stats report for rx, map queue id to start index */
+ skip_nic_stats = false;
+ for (stats_idx = base_stats_idx; stats_idx < max_stats_idx;
+ stats_idx += NIC_RX_STATS_REPORT_NUM) {
+ u32 stat_name = be32_to_cpu(report_stats[stats_idx].stat_name);
+ u32 queue_id = be32_to_cpu(report_stats[stats_idx].queue_id);
+ if (stat_name == 0) {
+ /* no stats written by NIC yet */
+ skip_nic_stats = true;
+ break;
+ }
+ rx_qid_to_stats_idx[queue_id] = stats_idx;
+ }
/* walk RX rings */
if (priv->rx) {
for (ring = 0; ring < priv->rx_cfg.num_queues; ring++) {
struct gve_rx_ring *rx = &priv->rx[ring];
- data[i++] = rx->cnt;
data[i++] = rx->fill_cnt;
+ data[i++] = rx->cnt;
+ do {
+ start =
+ u64_stats_fetch_begin(&priv->rx[ring].statss);
+ tmp_rx_bytes = rx->rbytes;
+ tmp_rx_skb_alloc_fail = rx->rx_skb_alloc_fail;
+ tmp_rx_buf_alloc_fail = rx->rx_buf_alloc_fail;
+ tmp_rx_desc_err_dropped_pkt =
+ rx->rx_desc_err_dropped_pkt;
+ } while (u64_stats_fetch_retry(&priv->rx[ring].statss,
+ start));
+ data[i++] = tmp_rx_bytes;
+ /* rx dropped packets */
+ data[i++] = tmp_rx_skb_alloc_fail +
+ tmp_rx_buf_alloc_fail +
+ tmp_rx_desc_err_dropped_pkt;
+ data[i++] = rx->rx_copybreak_pkt;
+ data[i++] = rx->rx_copied_pkt;
+ /* stats from NIC */
+ if (skip_nic_stats) {
+ /* skip NIC rx stats */
+ i += NIC_RX_STATS_REPORT_NUM;
+ continue;
+ }
+ for (j = 0; j < NIC_RX_STATS_REPORT_NUM; j++) {
+ u64 value = be64_to_cpu(report_stats[
+ rx_qid_to_stats_idx[ring] + j].value);
+ data[i++] = value;
+ }
}
} else {
i += priv->rx_cfg.num_queues * NUM_GVE_RX_CNTS;
}
+ /* For tx cross-reporting stats, start from nic tx stats in report */
+ base_stats_idx = max_stats_idx;
+ max_stats_idx = NIC_TX_STATS_REPORT_NUM * priv->tx_cfg.num_queues +
+ max_stats_idx;
+ /* Preprocess the stats report for tx, map queue id to start index */
+ skip_nic_stats = false;
+ for (stats_idx = base_stats_idx; stats_idx < max_stats_idx;
+ stats_idx += NIC_TX_STATS_REPORT_NUM) {
+ u32 stat_name = be32_to_cpu(report_stats[stats_idx].stat_name);
+ u32 queue_id = be32_to_cpu(report_stats[stats_idx].queue_id);
+ if (stat_name == 0) {
+ /* no stats written by NIC yet */
+ skip_nic_stats = true;
+ break;
+ }
+ tx_qid_to_stats_idx[queue_id] = stats_idx;
+ }
/* walk TX rings */
if (priv->tx) {
for (ring = 0; ring < priv->tx_cfg.num_queues; ring++) {
@@ -151,14 +307,51 @@
data[i++] = tx->req;
data[i++] = tx->done;
+ do {
+ start =
+ u64_stats_fetch_begin(&priv->tx[ring].statss);
+ tmp_tx_bytes = tx->bytes_done;
+ } while (u64_stats_fetch_retry(&priv->tx[ring].statss,
+ start));
+ data[i++] = tmp_tx_bytes;
data[i++] = tx->wake_queue;
data[i++] = tx->stop_queue;
data[i++] = be32_to_cpu(gve_tx_load_event_counter(priv,
tx));
+ /* stats from NIC */
+ if (skip_nic_stats) {
+ /* skip NIC tx stats */
+ i += NIC_TX_STATS_REPORT_NUM;
+ continue;
+ }
+ for (j = 0; j < NIC_TX_STATS_REPORT_NUM; j++) {
+ u64 value = be64_to_cpu(report_stats[
+ tx_qid_to_stats_idx[ring] + j].value);
+ data[i++] = value;
+ }
}
} else {
i += priv->tx_cfg.num_queues * NUM_GVE_TX_CNTS;
}
+
+ kfree(rx_qid_to_stats_idx);
+ kfree(tx_qid_to_stats_idx);
+
+ /* AQ Stats */
+ data[i++] = priv->adminq_prod_cnt;
+ data[i++] = priv->adminq_cmd_fail;
+ data[i++] = priv->adminq_timeouts;
+ data[i++] = priv->adminq_describe_device_cnt;
+ data[i++] = priv->adminq_cfg_device_resources_cnt;
+ data[i++] = priv->adminq_register_page_list_cnt;
+ data[i++] = priv->adminq_unregister_page_list_cnt;
+ data[i++] = priv->adminq_create_tx_queue_cnt;
+ data[i++] = priv->adminq_create_rx_queue_cnt;
+ data[i++] = priv->adminq_destroy_tx_queue_cnt;
+ data[i++] = priv->adminq_destroy_rx_queue_cnt;
+ data[i++] = priv->adminq_dcfg_device_resources_cnt;
+ data[i++] = priv->adminq_set_driver_parameter_cnt;
+ data[i++] = priv->adminq_report_stats_cnt;
}
static void gve_get_channels(struct net_device *netdev,
@@ -230,6 +423,99 @@
return -EOPNOTSUPP;
}
+static int gve_get_tunable(struct net_device *netdev,
+ const struct ethtool_tunable *etuna, void *value)
+{
+ struct gve_priv *priv = netdev_priv(netdev);
+
+ switch (etuna->id) {
+ case ETHTOOL_RX_COPYBREAK:
+ *(u32 *)value = priv->rx_copybreak;
+ return 0;
+ default:
+ return -EINVAL;
+ }
+}
+
+static int gve_set_tunable(struct net_device *netdev,
+ const struct ethtool_tunable *etuna,
+ const void *value)
+{
+ struct gve_priv *priv = netdev_priv(netdev);
+ u32 len;
+
+ switch (etuna->id) {
+ case ETHTOOL_RX_COPYBREAK:
+ len = *(u32 *)value;
+ if (len > priv->dev->mtu)
+ return -EINVAL;
+ priv->rx_copybreak = len;
+ return 0;
+ default:
+ return -EINVAL;
+ }
+}
+
+static u32 gve_get_priv_flags(struct net_device *netdev)
+{
+ struct gve_priv *priv = netdev_priv(netdev);
+ u32 i, ret_flags = 0;
+
+ for (i = 0; i < GVE_PRIV_FLAGS_STR_LEN; i++) {
+ if (priv->ethtool_flags & BIT(i))
+ ret_flags |= BIT(i);
+ }
+ return ret_flags;
+}
+
+static int gve_set_priv_flags(struct net_device *netdev, u32 flags)
+{
+ struct gve_priv *priv = netdev_priv(netdev);
+ u64 ori_flags, new_flags;
+ u32 i;
+
+ ori_flags = READ_ONCE(priv->ethtool_flags);
+ new_flags = ori_flags;
+
+ for (i = 0; i < GVE_PRIV_FLAGS_STR_LEN; i++) {
+ if (flags & BIT(i))
+ new_flags |= BIT(i);
+ else
+ new_flags &= ~(BIT(i));
+ priv->ethtool_flags = new_flags;
+ /* set report-stats */
+ if (strcmp(gve_gstrings_priv_flags[i], "report-stats") == 0) {
+ /* update the stats when user turns report-stats on */
+ if (flags & BIT(i))
+ gve_handle_report_stats(priv);
+ /* zero out gve stats when report-stats is turned off */
+ if (!(flags & BIT(i)) && (ori_flags & BIT(i))) {
+ int tx_stats_num = GVE_TX_STATS_REPORT_NUM *
+ priv->tx_cfg.num_queues;
+ int rx_stats_num = GVE_RX_STATS_REPORT_NUM *
+ priv->rx_cfg.num_queues;
+ memset(priv->stats_report->stats, 0,
+ (tx_stats_num + rx_stats_num) *
+ sizeof(struct stats));
+ }
+ }
+ }
+
+ return 0;
+}
+
+static int gve_get_link_ksettings(struct net_device *netdev,
+ struct ethtool_link_ksettings *cmd)
+{
+ struct gve_priv *priv = netdev_priv(netdev);
+ int err = gve_adminq_report_link_speed(priv);
+
+ cmd->base.speed = priv->link_speed;
+ return err;
+}
+
const struct ethtool_ops gve_ethtool_ops = {
.get_drvinfo = gve_get_drvinfo,
.get_strings = gve_get_strings,
@@ -242,4 +528,9 @@
.get_link = ethtool_op_get_link,
.get_ringparam = gve_get_ringparam,
.reset = gve_user_reset,
+ .get_tunable = gve_get_tunable,
+ .set_tunable = gve_set_tunable,
+ .get_priv_flags = gve_get_priv_flags,
+ .set_priv_flags = gve_set_priv_flags,
+ .get_link_ksettings = gve_get_link_ksettings,
};
diff --git a/drivers/net/ethernet/google/gve/gve_main.c b/drivers/net/ethernet/google/gve/gve_main.c
index 5b450c6..c821a87 100644
--- a/drivers/net/ethernet/google/gve/gve_main.c
+++ b/drivers/net/ethernet/google/gve/gve_main.c
@@ -20,7 +20,7 @@
#define GVE_DEFAULT_RX_COPYBREAK (256)
#define DEFAULT_MSG_LEVEL (NETIF_MSG_DRV | NETIF_MSG_LINK)
-#define GVE_VERSION "1.0.0"
+#define GVE_VERSION "1.1.0"
#define GVE_VERSION_PREFIX "GVE-"
const char gve_version_str[] = GVE_VERSION;
@@ -83,6 +83,60 @@
priv->counter_array = NULL;
}
+void gve_service_task_schedule(struct gve_priv *priv)
+{
+ if (!gve_get_probe_in_progress(priv) &&
+ !gve_get_reset_in_progress(priv)) {
+ gve_set_do_report_stats(priv);
+ queue_work(priv->gve_wq, &priv->service_task);
+ }
+}
+
+static void gve_service_timer(struct timer_list *t)
+{
+ struct gve_priv *priv = from_timer(priv, t, service_timer);
+
+ mod_timer(&priv->service_timer,
+ round_jiffies(jiffies +
+ msecs_to_jiffies(priv->service_timer_period)));
+ gve_service_task_schedule(priv);
+}
+
+static int gve_alloc_stats_report(struct gve_priv *priv)
+{
+ int tx_stats_num, rx_stats_num;
+
+ tx_stats_num = (GVE_TX_STATS_REPORT_NUM + NIC_TX_STATS_REPORT_NUM) *
+ priv->tx_cfg.num_queues;
+ rx_stats_num = (GVE_RX_STATS_REPORT_NUM + NIC_RX_STATS_REPORT_NUM) *
+ priv->rx_cfg.num_queues;
+ priv->stats_report_len = sizeof(struct gve_stats_report) +
+ (tx_stats_num + rx_stats_num) *
+ sizeof(struct stats);
+ priv->stats_report =
+ dma_alloc_coherent(&priv->pdev->dev, priv->stats_report_len,
+ &priv->stats_report_bus, GFP_KERNEL);
+ if (!priv->stats_report)
+ return -ENOMEM;
+ /* Set up timer for periodic task */
+ timer_setup(&priv->service_timer, gve_service_timer, 0);
+ priv->service_timer_period = GVE_SERVICE_TIMER_PERIOD;
+ /* Start the service task timer */
+ mod_timer(&priv->service_timer,
+ round_jiffies(jiffies +
+ msecs_to_jiffies(priv->service_timer_period)));
+ return 0;
+}
+
+static void gve_free_stats_report(struct gve_priv *priv)
+{
+ del_timer_sync(&priv->service_timer);
+ dma_free_coherent(&priv->pdev->dev, priv->stats_report_len,
+ priv->stats_report, priv->stats_report_bus);
+ priv->stats_report = NULL;
+}
+
static irqreturn_t gve_mgmnt_intr(int irq, void *arg)
{
struct gve_priv *priv = arg;
@@ -192,15 +246,24 @@
dev_err(&priv->pdev->dev, "Did not receive management vector.\n");
goto abort_with_msix_enabled;
}
- priv->ntfy_blocks =
+
+ priv->irq_db_indices =
dma_alloc_coherent(&priv->pdev->dev,
priv->num_ntfy_blks *
- sizeof(*priv->ntfy_blocks),
- &priv->ntfy_block_bus, GFP_KERNEL);
- if (!priv->ntfy_blocks) {
+ sizeof(*priv->irq_db_indices),
+ &priv->irq_db_indices_bus, GFP_KERNEL);
+ if (!priv->irq_db_indices) {
err = -ENOMEM;
goto abort_with_mgmt_vector;
}
+
+ priv->ntfy_blocks = kvzalloc(priv->num_ntfy_blks *
+ sizeof(*priv->ntfy_blocks), GFP_KERNEL);
+ if (!priv->ntfy_blocks) {
+ err = -ENOMEM;
+ goto abort_with_irq_db_indices;
+ }
+
/* Setup the other blocks - the first n-1 vectors */
for (i = 0; i < priv->num_ntfy_blks; i++) {
struct gve_notify_block *block = &priv->ntfy_blocks[i];
@@ -218,6 +281,7 @@
}
irq_set_affinity_hint(priv->msix_vectors[msix_idx].vector,
get_cpu_mask(i % active_cpus));
+ block->irq_db_index = &priv->irq_db_indices[i].index;
}
return 0;
abort_with_some_ntfy_blocks:
@@ -229,10 +293,13 @@
NULL);
free_irq(priv->msix_vectors[msix_idx].vector, block);
}
- dma_free_coherent(&priv->pdev->dev, priv->num_ntfy_blks *
- sizeof(*priv->ntfy_blocks),
- priv->ntfy_blocks, priv->ntfy_block_bus);
+ kvfree(priv->ntfy_blocks);
priv->ntfy_blocks = NULL;
+abort_with_irq_db_indices:
+ dma_free_coherent(&priv->pdev->dev, priv->num_ntfy_blks *
+ sizeof(*priv->irq_db_indices),
+ priv->irq_db_indices, priv->irq_db_indices_bus);
+ priv->irq_db_indices = NULL;
abort_with_mgmt_vector:
free_irq(priv->msix_vectors[priv->mgmt_msix_idx].vector, priv);
abort_with_msix_enabled:
@@ -259,10 +326,12 @@
}
free_irq(priv->msix_vectors[priv->mgmt_msix_idx].vector, priv);
}
- dma_free_coherent(&priv->pdev->dev,
- priv->num_ntfy_blks * sizeof(*priv->ntfy_blocks),
- priv->ntfy_blocks, priv->ntfy_block_bus);
+ kvfree(priv->ntfy_blocks);
priv->ntfy_blocks = NULL;
+ dma_free_coherent(&priv->pdev->dev, priv->num_ntfy_blks *
+ sizeof(*priv->irq_db_indices),
+ priv->irq_db_indices, priv->irq_db_indices_bus);
+ priv->irq_db_indices = NULL;
pci_disable_msix(priv->pdev);
kvfree(priv->msix_vectors);
priv->msix_vectors = NULL;
@@ -278,19 +347,30 @@
err = gve_alloc_notify_blocks(priv);
if (err)
goto abort_with_counter;
+ err = gve_alloc_stats_report(priv);
+ if (err)
+ goto abort_with_ntfy_blocks;
err = gve_adminq_configure_device_resources(priv,
priv->counter_array_bus,
priv->num_event_counters,
- priv->ntfy_block_bus,
+ priv->irq_db_indices_bus,
priv->num_ntfy_blks);
if (unlikely(err)) {
dev_err(&priv->pdev->dev,
"could not setup device_resources: err=%d\n", err);
err = -ENXIO;
- goto abort_with_ntfy_blocks;
+ goto abort_with_stats_report;
}
+ err = gve_adminq_report_stats(priv, priv->stats_report_len,
+ priv->stats_report_bus,
+ GVE_SERVICE_TIMER_PERIOD);
+ if (err)
+ dev_err(&priv->pdev->dev,
+ "Failed to report stats: err=%d\n", err);
gve_set_device_resources_ok(priv);
return 0;
+abort_with_stats_report:
+ gve_free_stats_report(priv);
abort_with_ntfy_blocks:
gve_free_notify_blocks(priv);
abort_with_counter:
@@ -306,6 +386,14 @@
/* Tell device its resources are being freed */
if (gve_get_device_resources_ok(priv)) {
+ /* detach the stats report */
+ err = gve_adminq_report_stats(priv, 0, 0x0,
+ GVE_SERVICE_TIMER_PERIOD);
+ if (err) {
+ dev_err(&priv->pdev->dev,
+ "Failed to detach stats report: err=%d\n", err);
+ gve_trigger_reset(priv);
+ }
err = gve_adminq_deconfigure_device_resources(priv);
if (err) {
dev_err(&priv->pdev->dev,
@@ -316,6 +404,7 @@
}
gve_free_counter_array(priv);
gve_free_notify_blocks(priv);
+ gve_free_stats_report(priv);
gve_clear_device_resources_ok(priv);
}
@@ -379,35 +468,37 @@
int err;
int i;
- for (i = 0; i < priv->tx_cfg.num_queues; i++) {
- err = gve_adminq_create_tx_queue(priv, i);
- if (err) {
- netif_err(priv, drv, priv->dev, "failed to create tx queue %d\n",
- i);
- /* This failure will trigger a reset - no need to clean
- * up
- */
- return err;
- }
- netif_dbg(priv, drv, priv->dev, "created tx queue %d\n", i);
- }
- for (i = 0; i < priv->rx_cfg.num_queues; i++) {
- err = gve_adminq_create_rx_queue(priv, i);
- if (err) {
- netif_err(priv, drv, priv->dev, "failed to create rx queue %d\n",
- i);
- /* This failure will trigger a reset - no need to clean
- * up
- */
- return err;
- }
- /* Rx data ring has been prefilled with packet buffers at
- * queue allocation time.
- * Write the doorbell to provide descriptor slots and packet
- * buffers to the NIC.
+ err = gve_adminq_create_tx_queues(priv, priv->tx_cfg.num_queues);
+ if (err) {
+ netif_err(priv, drv, priv->dev, "failed to create %d tx queues\n",
+ priv->tx_cfg.num_queues);
+ /* This failure will trigger a reset - no need to clean
+ * up
*/
+ return err;
+ }
+ netif_dbg(priv, drv, priv->dev, "created %d tx queues \n",
+ priv->tx_cfg.num_queues);
+
+ err = gve_adminq_create_rx_queues(priv, priv->rx_cfg.num_queues);
+ if (err) {
+ netif_err(priv, drv, priv->dev, "failed to create %d rx queues\n",
+ priv->rx_cfg.num_queues);
+ /* This failure will trigger a reset - no need to clean
+ * up
+ */
+ return err;
+ }
+ netif_dbg(priv, drv, priv->dev, "created %d rx queues \n",
+ priv->rx_cfg.num_queues);
+
+ /* Rx data ring has been prefilled with packet buffers at queue
+ * allocation time.
+ * Write the doorbell to provide descriptor slots and packet buffers
+ * to the NIC.
+ */
+ for (i = 0; i < priv->rx_cfg.num_queues; i++) {
gve_rx_write_doorbell(priv, &priv->rx[i]);
- netif_dbg(priv, drv, priv->dev, "created rx queue %d\n", i);
}
return 0;
@@ -466,34 +557,23 @@
static int gve_destroy_rings(struct gve_priv *priv)
{
int err;
- int i;
- for (i = 0; i < priv->tx_cfg.num_queues; i++) {
- err = gve_adminq_destroy_tx_queue(priv, i);
- if (err) {
- netif_err(priv, drv, priv->dev,
- "failed to destroy tx queue %d\n",
- i);
- /* This failure will trigger a reset - no need to clean
- * up
- */
- return err;
- }
- netif_dbg(priv, drv, priv->dev, "destroyed tx queue %d\n", i);
+ err = gve_adminq_destroy_tx_queues(priv, priv->tx_cfg.num_queues);
+ if (err) {
+ netif_err(priv, drv, priv->dev,
+ "failed to destroy tx queues\n");
+ /* This failure will trigger a reset - no need to clean up */
+ return err;
}
- for (i = 0; i < priv->rx_cfg.num_queues; i++) {
- err = gve_adminq_destroy_rx_queue(priv, i);
- if (err) {
- netif_err(priv, drv, priv->dev,
- "failed to destroy rx queue %d\n",
- i);
- /* This failure will trigger a reset - no need to clean
- * up
- */
- return err;
- }
- netif_dbg(priv, drv, priv->dev, "destroyed rx queue %d\n", i);
+ netif_dbg(priv, drv, priv->dev, "destroyed tx queues\n");
+ err = gve_adminq_destroy_rx_queues(priv, priv->rx_cfg.num_queues);
+ if (err) {
+ netif_err(priv, drv, priv->dev,
+ "failed to destroy rx queues\n");
+ /* This failure will trigger a reset - no need to clean up */
+ return err;
}
+ netif_dbg(priv, drv, priv->dev, "destroyed rx queues\n");
return 0;
}
@@ -522,15 +602,25 @@
}
}
-int gve_alloc_page(struct device *dev, struct page **page, dma_addr_t *dma,
- enum dma_data_direction dir)
+int gve_alloc_page(struct gve_priv *priv, struct device *dev,
+ struct page **page, dma_addr_t *dma,
+ enum dma_data_direction dir, gfp_t gfp_flags)
{
- *page = alloc_page(GFP_KERNEL);
- if (!*page)
+ if (priv->dma_mask == 24)
+ gfp_flags |= GFP_DMA;
+ else if (priv->dma_mask == 32)
+ gfp_flags |= GFP_DMA32;
+
+ *page = alloc_page(gfp_flags);
+ if (!*page) {
+ priv->page_alloc_fail++;
return -ENOMEM;
+ }
*dma = dma_map_page(dev, *page, 0, PAGE_SIZE, dir);
if (dma_mapping_error(dev, *dma)) {
+ priv->dma_mapping_error++;
put_page(*page);
+ *page = NULL;
return -ENOMEM;
}
return 0;
@@ -564,9 +654,9 @@
return -ENOMEM;
for (i = 0; i < pages; i++) {
- err = gve_alloc_page(&priv->pdev->dev, &qpl->pages[i],
+ err = gve_alloc_page(priv, &priv->pdev->dev, &qpl->pages[i],
&qpl->page_buses[i],
- gve_qpl_dma_dir(priv, id));
+ gve_qpl_dma_dir(priv, id), GFP_KERNEL);
/* caller handles clean up */
if (err)
return -ENOMEM;
@@ -613,6 +703,10 @@
int i, j;
int err;
+ /* Raw addressing means no QPLs */
+ if (priv->raw_addressing)
+ return 0;
+
priv->qpls = kvzalloc(num_qpls * sizeof(*priv->qpls), GFP_KERNEL);
if (!priv->qpls)
return -ENOMEM;
@@ -625,7 +719,7 @@
}
for (; i < num_qpls; i++) {
err = gve_alloc_queue_page_list(priv, i,
- priv->rx_pages_per_qpl);
+ priv->rx_data_slot_cnt);
if (err)
goto free_qpls;
}
@@ -653,6 +747,10 @@
int num_qpls = gve_num_tx_qpls(priv) + gve_num_rx_qpls(priv);
int i;
+ /* Raw addressing means no QPLs */
+ if (priv->raw_addressing)
+ return;
+
kvfree(priv->qpl_cfg.qpl_id_map);
for (i = 0; i < num_qpls; i++)
@@ -704,7 +802,8 @@
gve_set_device_rings_ok(priv);
gve_turnup(priv);
- netif_carrier_on(dev);
+ queue_work(priv->gve_wq, &priv->service_task);
+ priv->interface_up_cnt++;
return 0;
free_rings:
@@ -746,6 +845,7 @@
gve_free_rings(priv);
gve_free_qpls(priv);
+ priv->interface_down_cnt++;
return 0;
err:
@@ -825,6 +925,7 @@
netif_tx_disable(priv->dev);
gve_clear_napi_enabled(priv);
+ gve_clear_report_stats(priv);
}
static void gve_turnup(struct gve_priv *priv)
@@ -875,6 +976,10 @@
dev_info(&priv->pdev->dev, "Device requested reset.\n");
gve_set_do_reset(priv);
}
+ if (GVE_DEVICE_STATUS_REPORT_STATS_MASK & status) {
+ dev_info(&priv->pdev->dev, "Device report stats on.\n");
+ gve_set_do_report_stats(priv);
+ }
}
static void gve_handle_reset(struct gve_priv *priv)
@@ -893,16 +998,100 @@
}
}
-/* Handle NIC status register changes and reset requests */
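+/* Copy the driver-owned tx/rx counters into the shared stats report that the
+ * device reads at the configured interval.
+ */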
+void gve_handle_report_stats(struct gve_priv *priv)
+{
+ int idx, stats_idx = 0;
+ u64 tx_bytes;
+ unsigned int start = 0;
+ struct stats *stats = priv->stats_report->stats;
+
+ if (!gve_get_report_stats(priv))
+ return;
+
+ be64_add_cpu(&priv->stats_report->written_count, 1);
+ /* tx stats */
+ if (priv->tx) {
+ for (idx = 0; idx < priv->tx_cfg.num_queues; idx++) {
+ do {
+ start = u64_stats_fetch_begin(&priv->tx[idx].statss);
+ tx_bytes = priv->tx[idx].bytes_done;
+ } while (u64_stats_fetch_retry(&priv->tx[idx].statss, start));
+ stats[stats_idx++] = (struct stats) {
+ .stat_name = cpu_to_be32(TX_WAKE_CNT),
+ .value = cpu_to_be64(priv->tx[idx].wake_queue),
+ .queue_id = cpu_to_be32(idx),
+ };
+ stats[stats_idx++] = (struct stats) {
+ .stat_name = cpu_to_be32(TX_STOP_CNT),
+ .value = cpu_to_be64(priv->tx[idx].stop_queue),
+ .queue_id = cpu_to_be32(idx),
+ };
+ stats[stats_idx++] = (struct stats) {
+ .stat_name = cpu_to_be32(TX_FRAMES_SENT),
+ .value = cpu_to_be64(priv->tx[idx].req),
+ .queue_id = cpu_to_be32(idx),
+ };
+ stats[stats_idx++] = (struct stats) {
+ .stat_name = cpu_to_be32(TX_BYTES_SENT),
+ .value = cpu_to_be64(tx_bytes),
+ .queue_id = cpu_to_be32(idx),
+ };
+ stats[stats_idx++] = (struct stats) {
+ .stat_name = cpu_to_be32(
+ TX_LAST_COMPLETION_PROCESSED),
+ .value = cpu_to_be64(priv->tx[idx].done),
+ .queue_id = cpu_to_be32(idx),
+ };
+ }
+ }
+ /* rx stats */
+ if (priv->rx) {
+ for (idx = 0; idx < priv->rx_cfg.num_queues; idx++) {
+ stats[stats_idx++] = (struct stats) {
+ .stat_name = cpu_to_be32(
+ RX_NEXT_EXPECTED_SEQUENCE),
+ .value = cpu_to_be64(priv->rx[idx].desc.seqno),
+ .queue_id = cpu_to_be32(idx),
+ };
+ stats[stats_idx++] = (struct stats) {
+ .stat_name = cpu_to_be32(RX_BUFFERS_POSTED),
+ .value = cpu_to_be64(priv->rx[idx].fill_cnt),
+ .queue_id = cpu_to_be32(idx),
+ };
+ }
+ }
+}
+
+void gve_handle_link_status(struct gve_priv *priv, bool link_status)
+{
+ if (!gve_get_napi_enabled(priv))
+ return;
+
+ if (link_status == netif_carrier_ok(priv->dev))
+ return;
+
+ if (link_status) {
+ netif_carrier_on(priv->dev);
+ } else {
+ dev_info(&priv->pdev->dev, "Device link is down.\n");
+ netif_carrier_off(priv->dev);
+ }
+}
+
+/* Handle NIC status register changes, reset requests and report stats */
static void gve_service_task(struct work_struct *work)
{
struct gve_priv *priv = container_of(work, struct gve_priv,
service_task);
+ u32 status = ioread32be(&priv->reg_bar0->device_status);
- gve_handle_status(priv,
- ioread32be(&priv->reg_bar0->device_status));
+ gve_handle_status(priv, status);
gve_handle_reset(priv);
+ gve_handle_link_status(priv, GVE_DEVICE_STATUS_LINK_STATUS_MASK & status);
+ if (gve_get_do_report_stats(priv)) {
+ gve_handle_report_stats(priv);
+ gve_clear_do_report_stats(priv);
+ }
}
static int gve_init_priv(struct gve_priv *priv, bool skip_describe_device)
@@ -921,6 +1110,7 @@
if (skip_describe_device)
goto setup_device;
+ priv->raw_addressing = false;
/* Get the initial information we need from the device */
err = gve_adminq_describe_device(priv);
if (err) {
@@ -932,7 +1122,7 @@
priv->dev->max_mtu = PAGE_SIZE;
err = gve_adminq_set_mtu(priv, priv->dev->mtu);
if (err) {
- netif_err(priv, drv, priv->dev, "Could not set mtu");
+ dev_err(&priv->pdev->dev, "Could not set mtu");
goto err;
}
}
@@ -972,10 +1162,10 @@
priv->rx_cfg.num_queues);
}
- netif_info(priv, drv, priv->dev, "TX queues %d, RX queues %d\n",
- priv->tx_cfg.num_queues, priv->rx_cfg.num_queues);
- netif_info(priv, drv, priv->dev, "Max TX queues %d, Max RX queues %d\n",
- priv->tx_cfg.max_queues, priv->rx_cfg.max_queues);
+ dev_info(&priv->pdev->dev, "TX queues %d, RX queues %d\n",
+ priv->tx_cfg.num_queues, priv->rx_cfg.num_queues);
+ dev_info(&priv->pdev->dev, "Max TX queues %d, Max RX queues %d\n",
+ priv->tx_cfg.max_queues, priv->rx_cfg.max_queues);
setup_device:
err = gve_setup_device_resources(priv);
@@ -1055,6 +1245,9 @@
/* Set it all back up */
err = gve_reset_recovery(priv, was_up);
gve_clear_reset_in_progress(priv);
+ priv->reset_cnt++;
+ priv->interface_up_cnt = 0;
+ priv->interface_down_cnt = 0;
return err;
}
@@ -1082,6 +1275,7 @@
__be32 __iomem *db_bar;
struct gve_registers __iomem *reg_bar;
struct gve_priv *priv;
+ u8 dma_mask;
int err;
err = pci_enable_device(pdev);
@@ -1094,19 +1288,6 @@
pci_set_master(pdev);
- err = pci_set_dma_mask(pdev, DMA_BIT_MASK(64));
- if (err) {
- dev_err(&pdev->dev, "Failed to set dma mask: err=%d\n", err);
- goto abort_with_pci_region;
- }
-
- err = pci_set_consistent_dma_mask(pdev, DMA_BIT_MASK(64));
- if (err) {
- dev_err(&pdev->dev,
- "Failed to set consistent dma mask: err=%d\n", err);
- goto abort_with_pci_region;
- }
-
reg_bar = pci_iomap(pdev, GVE_REGISTER_BAR, 0);
if (!reg_bar) {
dev_err(&pdev->dev, "Failed to map pci bar!\n");
@@ -1121,10 +1302,28 @@
goto abort_with_reg_bar;
}
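+ /* The dma_mask register advertises the device's addressable width in
+ * bits (e.g. 24, 32 or 64); gve_alloc_page picks GFP_DMA/GFP_DMA32 to
+ * match.
+ */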
+ dma_mask = readb(&reg_bar->dma_mask);
+ /* Default to 64 if the register isn't set */
+ if (!dma_mask)
+ dma_mask = 64;
gve_write_version(&reg_bar->driver_version);
/* Get max queues to alloc etherdev */
max_tx_queues = ioread32be(&reg_bar->max_tx_queues);
max_rx_queues = ioread32be(&reg_bar->max_rx_queues);
+
+ err = pci_set_dma_mask(pdev, DMA_BIT_MASK(dma_mask));
+ if (err) {
+ dev_err(&pdev->dev, "Failed to set dma mask: err=%d\n", err);
+ goto abort_with_reg_bar;
+ }
+
+ err = pci_set_consistent_dma_mask(pdev, DMA_BIT_MASK(dma_mask));
+ if (err) {
+ dev_err(&pdev->dev,
+ "Failed to set consistent dma mask: err=%d\n", err);
+ goto abort_with_reg_bar;
+ }
+
/* Alloc and setup the netdev and priv */
dev = alloc_etherdev_mqs(sizeof(*priv), max_tx_queues, max_rx_queues);
if (!dev) {
@@ -1132,7 +1331,9 @@
goto abort_with_db_bar;
}
SET_NETDEV_DEV(dev, &pdev->dev);
+
pci_set_drvdata(pdev, dev);
+
dev->ethtool_ops = &gve_ethtool_ops;
dev->netdev_ops = &gve_netdev_ops;
/* advertise features */
@@ -1157,8 +1358,11 @@
priv->db_bar2 = db_bar;
priv->service_task_flags = 0x0;
priv->state_flags = 0x0;
+ priv->ethtool_flags = 0x0;
+ priv->dma_mask = dma_mask;
gve_set_probe_in_progress(priv);
+
priv->gve_wq = alloc_ordered_workqueue("gve", 0);
if (!priv->gve_wq) {
dev_err(&pdev->dev, "Could not allocate workqueue");
@@ -1180,6 +1384,7 @@
dev_info(&pdev->dev, "GVE version %s\n", gve_version_str);
gve_clear_probe_in_progress(priv);
queue_work(priv->gve_wq, &priv->service_task);
+
return 0;
abort_with_gve_init:
diff --git a/drivers/net/ethernet/google/gve/gve_register.h b/drivers/net/ethernet/google/gve/gve_register.h
index 84ab889..776c291 100644
--- a/drivers/net/ethernet/google/gve/gve_register.h
+++ b/drivers/net/ethernet/google/gve/gve_register.h
@@ -16,12 +16,14 @@
__be32 adminq_pfn;
__be32 adminq_doorbell;
__be32 adminq_event_counter;
- u8 reserved[3];
+ u8 reserved[2];
+ u8 dma_mask;
u8 driver_version;
};
enum gve_device_status_flags {
GVE_DEVICE_STATUS_RESET_MASK = BIT(1),
GVE_DEVICE_STATUS_LINK_STATUS_MASK = BIT(2),
+ GVE_DEVICE_STATUS_REPORT_STATS_MASK = BIT(3),
};
#endif /* _GVE_REGISTER_H_ */
diff --git a/drivers/net/ethernet/google/gve/gve_rx.c b/drivers/net/ethernet/google/gve/gve_rx.c
index 9f52e72..a9bbe0e 100644
--- a/drivers/net/ethernet/google/gve/gve_rx.c
+++ b/drivers/net/ethernet/google/gve/gve_rx.c
@@ -16,12 +16,41 @@
block->rx = NULL;
}
+static void gve_rx_free_buffer(struct device *dev,
+ struct gve_rx_slot_page_info *page_info,
+ struct gve_rx_data_slot *data_slot)
+{
+ dma_addr_t dma = (dma_addr_t)(be64_to_cpu(data_slot->addr) -
+ page_info->page_offset);
+
+ page_ref_sub(page_info->page, page_info->pagecnt_bias - 1);
+ gve_free_page(dev, page_info->page, dma, DMA_FROM_DEVICE);
+}
+
+static void gve_rx_unfill_pages(struct gve_priv *priv, struct gve_rx_ring *rx)
+{
+ u32 slots = rx->mask + 1;
+ int i;
+
+ if (rx->data.raw_addressing) {
+ for (i = 0; i < slots; i++)
+ gve_rx_free_buffer(&priv->pdev->dev, &rx->data.page_info[i],
+ &rx->data.data_ring[i]);
+ } else {
+ for (i = 0; i < slots; i++)
+ page_ref_sub(rx->data.page_info[i].page,
+ rx->data.page_info[i].pagecnt_bias - 1);
+ gve_unassign_qpl(priv, rx->data.qpl->id);
+ rx->data.qpl = NULL;
+ }
+ kvfree(rx->data.page_info);
+ rx->data.page_info = NULL;
+}
+
static void gve_rx_free_ring(struct gve_priv *priv, int idx)
{
struct gve_rx_ring *rx = &priv->rx[idx];
struct device *dev = &priv->pdev->dev;
size_t bytes;
- u32 slots;
+ u32 slots = rx->mask + 1;
gve_rx_remove_from_block(priv, idx);
@@ -33,11 +62,8 @@
rx->q_resources, rx->q_resources_bus);
rx->q_resources = NULL;
- gve_unassign_qpl(priv, rx->data.qpl->id);
- rx->data.qpl = NULL;
- kvfree(rx->data.page_info);
+ gve_rx_unfill_pages(priv, rx);
- slots = rx->mask + 1;
bytes = sizeof(*rx->data.data_ring) * slots;
dma_free_coherent(dev, bytes, rx->data.data_ring,
rx->data.data_bus);
@@ -52,13 +78,17 @@
page_info->page = page;
page_info->page_offset = 0;
page_info->page_address = page_address(page);
- slot->qpl_offset = cpu_to_be64(addr);
+ slot->addr = cpu_to_be64(addr);
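+ /* Take one large page reference up front and track the driver's
+ * remaining share in pagecnt_bias; the hot path then avoids an atomic
+ * page_ref operation per packet (see gve_rx_update_pagecnt_bias).
+ */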
+ /* The page already has 1 ref */
+ page_ref_add(page, INT_MAX - 1);
+ page_info->pagecnt_bias = INT_MAX;
}
static int gve_prefill_rx_pages(struct gve_rx_ring *rx)
{
struct gve_priv *priv = rx->gve;
u32 slots;
+ int err;
int i;
/* Allocate one page per Rx queue slot. Each page is split into two
@@ -71,12 +101,32 @@
if (!rx->data.page_info)
return -ENOMEM;
- rx->data.qpl = gve_assign_rx_qpl(priv);
-
+ if (!rx->data.raw_addressing)
+ rx->data.qpl = gve_assign_rx_qpl(priv);
for (i = 0; i < slots; i++) {
- struct page *page = rx->data.qpl->pages[i];
- dma_addr_t addr = i * PAGE_SIZE;
+ struct page *page;
+ dma_addr_t addr;
+ if (rx->data.raw_addressing) {
+ err = gve_alloc_page(priv, &priv->pdev->dev, &page,
+ &addr, DMA_FROM_DEVICE,
+ GFP_KERNEL);
+ if (err) {
+ int j;
+
+ u64_stats_update_begin(&rx->statss);
+ rx->rx_buf_alloc_fail++;
+ u64_stats_update_end(&rx->statss);
+ for (j = 0; j < i; j++)
+ gve_rx_free_buffer(&priv->pdev->dev,
+ &rx->data.page_info[j],
+ &rx->data.data_ring[j]);
+ return err;
+ }
+ } else {
+ page = rx->data.qpl->pages[i];
+ addr = i * PAGE_SIZE;
+ }
gve_setup_rx_buffer(&rx->data.page_info[i],
&rx->data.data_ring[i], addr, page);
}
@@ -110,8 +160,9 @@
rx->gve = priv;
rx->q_num = idx;
- slots = priv->rx_pages_per_qpl;
+ slots = priv->rx_data_slot_cnt;
rx->mask = slots - 1;
+ rx->data.raw_addressing = priv->raw_addressing;
/* alloc rx data ring */
bytes = sizeof(*rx->data.data_ring) * slots;
@@ -156,8 +207,8 @@
err = -ENOMEM;
goto abort_with_q_resources;
}
- rx->mask = slots - 1;
rx->cnt = 0;
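+ /* With raw addressing, buffers are refilled once the number of free
+ * slots falls to half the ring (see gve_clean_rx_done).
+ */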
+ rx->db_threshold = priv->rx_desc_cnt / 2;
rx->desc.seqno = 1;
gve_rx_add_to_block(priv, idx);
@@ -168,7 +219,7 @@
rx->q_resources, rx->q_resources_bus);
rx->q_resources = NULL;
abort_filled:
- kvfree(rx->data.page_info);
+ gve_rx_unfill_pages(priv, rx);
abort_with_slots:
bytes = sizeof(*rx->data.data_ring) * slots;
dma_free_coherent(hdev, bytes, rx->data.data_ring, rx->data.data_bus);
@@ -242,11 +293,11 @@
skb_copy_to_linear_data(skb, va, len);
skb->protocol = eth_type_trans(skb, dev);
+
return skb;
}
-static struct sk_buff *gve_rx_add_frags(struct net_device *dev,
- struct napi_struct *napi,
+static struct sk_buff *gve_rx_add_frags(struct napi_struct *napi,
struct gve_rx_slot_page_info *page_info,
u16 len)
{
@@ -262,14 +313,135 @@
return skb;
}
-static void gve_rx_flip_buff(struct gve_rx_slot_page_info *page_info,
- struct gve_rx_data_slot *data_ring)
+static int gve_rx_alloc_buffer(struct gve_priv *priv, struct device *dev,
+ struct gve_rx_slot_page_info *page_info,
+ struct gve_rx_data_slot *data_slot,
+ struct gve_rx_ring *rx)
{
- u64 addr = be64_to_cpu(data_ring->qpl_offset);
+ struct page *page;
+ dma_addr_t dma;
+ int err;
+ err = gve_alloc_page(priv, dev, &page, &dma, DMA_FROM_DEVICE,
+ GFP_ATOMIC);
+ if (err) {
+ u64_stats_update_begin(&rx->statss);
+ rx->rx_buf_alloc_fail++;
+ u64_stats_update_end(&rx->statss);
+ return err;
+ }
+
+ gve_setup_rx_buffer(page_info, data_slot, dma, page);
+ return 0;
+}
+
+static void gve_rx_flip_buffer(struct gve_rx_slot_page_info *page_info,
+ struct gve_rx_data_slot *data_slot)
+{
+ u64 addr = be64_to_cpu(data_slot->addr);
+
+ /* "flip" to other packet buffer on this page */
page_info->page_offset ^= PAGE_SIZE / 2;
addr ^= PAGE_SIZE / 2;
- data_ring->qpl_offset = cpu_to_be64(addr);
+ data_slot->addr = cpu_to_be64(addr);
+}
+
+static bool gve_rx_can_flip_buffers(struct net_device *netdev)
+{
+#if PAGE_SIZE == 4096
+ /* We can't flip a buffer if we can't fit a packet
+ * into half a page.
+ */
+ if (netdev->max_mtu + GVE_RX_PAD + ETH_HLEN > PAGE_SIZE / 2)
+ return false;
+ return true;
+#else
+ /* PAGE_SIZE != 4096 - don't try to reuse */
+ return false;
+#endif
+}
+
+static int gve_rx_can_recycle_buffer(struct gve_rx_slot_page_info *page_info)
+{
+ int pagecount = page_count(page_info->page);
+
+ /* This page is not being used by any SKBs - reuse */
+ if (pagecount == page_info->pagecnt_bias)
+ return 1;
+ /* This page is still being used by an SKB - we can't reuse */
+ if (pagecount > page_info->pagecnt_bias)
+ return 0;
+ WARN(pagecount < page_info->pagecnt_bias,
+ "Pagecount should never be less than the bias.");
+ return -1;
+}
+
+static void gve_rx_update_pagecnt_bias(struct gve_rx_slot_page_info *page_info)
+{
+ page_info->pagecnt_bias--;
+ if (page_info->pagecnt_bias == 0) {
+ int pagecount = page_count(page_info->page);
+
+ /* If we have run out of bias - set it back up to INT_MAX
+ * minus the existing refs.
+ */
+ page_info->pagecnt_bias = INT_MAX - pagecount;
+ /* Set pagecount back up to max */
+ page_ref_add(page_info->page, INT_MAX - pagecount);
+ }
+}
+
+static struct sk_buff *
+gve_rx_raw_addressing(struct device *dev, struct net_device *netdev,
+ struct gve_rx_slot_page_info *page_info, u16 len,
+ struct napi_struct *napi,
+ struct gve_rx_data_slot *data_slot, bool can_flip)
+{
+ struct sk_buff *skb = gve_rx_add_frags(napi, page_info, len);
+
+ if (!skb)
+ return NULL;
+
+ /* Optimistically stop the kernel from freeing the page.
+ * We will check again in refill to determine if we need to alloc a
+ * new page.
+ */
+ gve_rx_update_pagecnt_bias(page_info);
+ page_info->can_flip = can_flip;
+
+ return skb;
+}
+
+static struct sk_buff *
+gve_rx_qpl(struct device *dev, struct net_device *netdev,
+ struct gve_rx_ring *rx, struct gve_rx_slot_page_info *page_info,
+ u16 len, struct napi_struct *napi,
+ struct gve_rx_data_slot *data_slot, bool recycle)
+{
+ struct sk_buff *skb;
+ /* if raw_addressing mode is not enabled gvnic can only receive into
+ * registered segments. If the buffer can't be recycled, our only
+ * choice is to copy the data out of it so that we can return it to the
+ * device.
+ */
+ if (recycle) {
+ skb = gve_rx_add_frags(napi, page_info, len);
+ /* No point in recycling if we didn't get the skb */
+ if (skb) {
+ /* Make sure the networking stack can't free the page */
+ gve_rx_update_pagecnt_bias(page_info);
+ gve_rx_flip_buffer(page_info, data_slot);
+ }
+ } else {
+ skb = gve_rx_copy(netdev, napi, page_info, len);
+ if (skb) {
+ u64_stats_update_begin(&rx->statss);
+ rx->rx_copied_pkt++;
+ u64_stats_update_end(&rx->statss);
+ }
+ }
+ return skb;
}
static bool gve_rx(struct gve_rx_ring *rx, struct gve_rx_desc *rx_desc,
@@ -278,67 +450,68 @@
struct gve_rx_slot_page_info *page_info;
struct gve_priv *priv = rx->gve;
struct napi_struct *napi = &priv->ntfy_blocks[rx->ntfy_id].napi;
- struct net_device *dev = priv->dev;
- struct sk_buff *skb;
- int pagecount;
+ struct net_device *netdev = priv->dev;
+ struct gve_rx_data_slot *data_slot;
+ struct sk_buff *skb = NULL;
+ dma_addr_t page_bus;
u16 len;
/* drop this packet */
- if (unlikely(rx_desc->flags_seq & GVE_RXF_ERR))
- return true;
+ if (unlikely(rx_desc->flags_seq & GVE_RXF_ERR)) {
+ u64_stats_update_begin(&rx->statss);
+ rx->rx_desc_err_dropped_pkt++;
+ u64_stats_update_end(&rx->statss);
+ return false;
+ }
len = be16_to_cpu(rx_desc->len) - GVE_RX_PAD;
page_info = &rx->data.page_info[idx];
- dma_sync_single_for_cpu(&priv->pdev->dev, rx->data.qpl->page_buses[idx],
+
+ data_slot = &rx->data.data_ring[idx];
+ page_bus = (rx->data.raw_addressing) ?
+ be64_to_cpu(data_slot->addr) - page_info->page_offset :
+ rx->data.qpl->page_buses[idx];
+ dma_sync_single_for_cpu(&priv->pdev->dev, page_bus,
PAGE_SIZE, DMA_FROM_DEVICE);
- /* gvnic can only receive into registered segments. If the buffer
- * can't be recycled, our only choice is to copy the data out of
- * it so that we can return it to the device.
- */
-
- if (PAGE_SIZE == 4096) {
- if (len <= priv->rx_copybreak) {
- /* Just copy small packets */
- skb = gve_rx_copy(dev, napi, page_info, len);
- goto have_skb;
- }
- if (unlikely(!gve_can_recycle_pages(dev))) {
- skb = gve_rx_copy(dev, napi, page_info, len);
- goto have_skb;
- }
- pagecount = page_count(page_info->page);
- if (pagecount == 1) {
- /* No part of this page is used by any SKBs; we attach
- * the page fragment to a new SKB and pass it up the
- * stack.
- */
- skb = gve_rx_add_frags(dev, napi, page_info, len);
- if (!skb)
- return true;
- /* Make sure the kernel stack can't release the page */
- get_page(page_info->page);
- /* "flip" to other packet buffer on this page */
- gve_rx_flip_buff(page_info, &rx->data.data_ring[idx]);
- } else if (pagecount >= 2) {
- /* We have previously passed the other half of this
- * page up the stack, but it has not yet been freed.
- */
- skb = gve_rx_copy(dev, napi, page_info, len);
- } else {
- WARN(pagecount < 1, "Pagecount should never be < 1");
- return false;
- }
+ if (len <= priv->rx_copybreak) {
+ /* Just copy small packets */
+ skb = gve_rx_copy(netdev, napi, page_info, len);
+ if (skb) {
+ u64_stats_update_begin(&rx->statss);
+ rx->rx_copied_pkt++;
+ rx->rx_copybreak_pkt++;
+ u64_stats_update_end(&rx->statss);
+ }
} else {
- skb = gve_rx_copy(dev, napi, page_info, len);
+ bool can_flip = gve_rx_can_flip_buffers(netdev);
+ int recycle = 0;
+
+ if (can_flip) {
+ recycle = gve_rx_can_recycle_buffer(page_info);
+ if (recycle < 0) {
+ gve_schedule_reset(priv);
+ return false;
+ }
+ }
+ if (rx->data.raw_addressing) {
+ skb = gve_rx_raw_addressing(&priv->pdev->dev, netdev,
+ page_info, len, napi,
+ data_slot,
+ can_flip && recycle);
+ } else {
+ skb = gve_rx_qpl(&priv->pdev->dev, netdev, rx,
+ page_info, len, napi, data_slot,
+ can_flip && recycle);
+ }
}
-have_skb:
- /* We didn't manage to allocate an skb but we haven't had any
- * reset worthy failures.
- */
- if (!skb)
- return true;
+ if (!skb) {
+ u64_stats_update_begin(&rx->statss);
+ rx->rx_skb_alloc_fail++;
+ u64_stats_update_end(&rx->statss);
+ return false;
+ }
if (likely(feat & NETIF_F_RXCSUM)) {
/* NIC passes up the partial sum */
@@ -359,6 +532,7 @@
napi_gro_frags(napi);
else
napi_gro_receive(napi, skb);
+
return true;
}
@@ -371,26 +545,80 @@
next_idx = rx->cnt & rx->mask;
desc = rx->desc.desc_ring + next_idx;
+ /* make sure we have synchronized the seq no with the device */
+ smp_mb();
flags_seq = desc->flags_seq;
- /* Make sure we have synchronized the seq no with the device */
- smp_rmb();
+
return (GVE_SEQNO(flags_seq) == rx->desc.seqno);
}
+static bool gve_rx_refill_buffers(struct gve_priv *priv, struct gve_rx_ring *rx)
+{
+ u32 fill_cnt = rx->fill_cnt;
+
+ while ((fill_cnt & rx->mask) != (rx->cnt & rx->mask)) {
+ u32 idx = fill_cnt & rx->mask;
+ struct gve_rx_slot_page_info *page_info =
+ &rx->data.page_info[idx];
+
+ if (page_info->can_flip) {
+ /* The other half of the page is free because it was
+ * free when we processed the descriptor. Flip to it.
+ */
+ struct gve_rx_data_slot *data_slot =
+ &rx->data.data_ring[idx];
+
+ gve_rx_flip_buffer(page_info, data_slot);
+ page_info->can_flip = false;
+ } else {
+ /* It is possible that the networking stack has already
+ * finished processing all outstanding packets in the buffer
+ * and it can be reused.
+ * Flipping is unnecessary here - if the networking stack still
+ * owns half the page it is impossible to tell which half. Either
+ * the whole page is free or it needs to be replaced.
+ */
+ int recycle = gve_rx_can_recycle_buffer(page_info);
+
+ if (recycle < 0) {
+ gve_schedule_reset(priv);
+ return false;
+ }
+ if (!recycle) {
+ /* We can't reuse the buffer - alloc a new one */
+ struct gve_rx_data_slot *data_slot =
+ &rx->data.data_ring[idx];
+ struct device *dev = &priv->pdev->dev;
+
+ gve_rx_free_buffer(dev, page_info, data_slot);
+ page_info->page = NULL;
+ if (gve_rx_alloc_buffer(priv, dev, page_info,
+ data_slot, rx))
+ break;
+ }
+ }
+ fill_cnt++;
+ }
+ rx->fill_cnt = fill_cnt;
+ return true;
+}
+
bool gve_clean_rx_done(struct gve_rx_ring *rx, int budget,
netdev_features_t feat)
{
struct gve_priv *priv = rx->gve;
+ u32 work_done = 0, packets = 0;
struct gve_rx_desc *desc;
u32 cnt = rx->cnt;
u32 idx = cnt & rx->mask;
- u32 work_done = 0;
u64 bytes = 0;
desc = rx->desc.desc_ring + idx;
while ((GVE_SEQNO(desc->flags_seq) == rx->desc.seqno) &&
work_done < budget) {
+ bool dropped;
netif_info(priv, rx_status, priv->dev,
"[%d] idx=%d desc=%p desc->flags_seq=0x%x\n",
rx->q_num, idx, desc, desc->flags_seq);
@@ -398,9 +626,11 @@
"[%d] seqno=%d rx->desc.seqno=%d\n",
rx->q_num, GVE_SEQNO(desc->flags_seq),
rx->desc.seqno);
- bytes += be16_to_cpu(desc->len) - GVE_RX_PAD;
- if (!gve_rx(rx, desc, feat, idx))
- gve_schedule_reset(priv);
+ dropped = !gve_rx(rx, desc, feat, idx);
+ if (!dropped) {
+ bytes += be16_to_cpu(desc->len) - GVE_RX_PAD;
+ packets++;
+ }
cnt++;
idx = cnt & rx->mask;
desc = rx->desc.desc_ring + idx;
@@ -412,13 +642,27 @@
return false;
u64_stats_update_begin(&rx->statss);
- rx->rpackets += work_done;
+ rx->rpackets += packets;
rx->rbytes += bytes;
u64_stats_update_end(&rx->statss);
rx->cnt = cnt;
- rx->fill_cnt += work_done;
+ /* restock ring slots */
+ if (!rx->data.raw_addressing) {
+ /* In QPL mode buffs are refilled as the desc are processed */
+ rx->fill_cnt += work_done;
+ dma_wmb(); /* Ensure descs are visible before ringing doorbell */
+ gve_rx_write_doorbell(priv, rx);
+ } else if (rx->fill_cnt - cnt <= rx->db_threshold) {
+ /* In raw addressing mode buffs are only refilled if the avail
+ * falls below a threshold.
+ */
+ if (!gve_rx_refill_buffers(priv, rx))
+ return false;
+ /* restock desc ring slots */
+ dma_wmb(); /* Ensure descs are visible before ringing doorbell */
+ gve_rx_write_doorbell(priv, rx);
+ }
- gve_rx_write_doorbell(priv, rx);
return gve_rx_work_pending(rx);
}
diff --git a/drivers/net/ethernet/google/gve/gve_tx.c b/drivers/net/ethernet/google/gve/gve_tx.c
index b653197..c5b8db9 100644
--- a/drivers/net/ethernet/google/gve/gve_tx.c
+++ b/drivers/net/ethernet/google/gve/gve_tx.c
@@ -158,9 +158,11 @@
tx->q_resources, tx->q_resources_bus);
tx->q_resources = NULL;
- gve_tx_fifo_release(priv, &tx->tx_fifo);
- gve_unassign_qpl(priv, tx->tx_fifo.qpl->id);
- tx->tx_fifo.qpl = NULL;
+ if (!tx->raw_addressing) {
+ gve_tx_fifo_release(priv, &tx->tx_fifo);
+ gve_unassign_qpl(priv, tx->tx_fifo.qpl->id);
+ tx->tx_fifo.qpl = NULL;
+ }
bytes = sizeof(*tx->desc) * slots;
dma_free_coherent(hdev, bytes, tx->desc, tx->bus);
@@ -174,12 +176,16 @@
static void gve_tx_add_to_block(struct gve_priv *priv, int queue_idx)
{
+ unsigned int active_cpus = min_t(int, priv->num_ntfy_blks / 2,
+ num_online_cpus());
int ntfy_idx = gve_tx_idx_to_ntfy(priv, queue_idx);
struct gve_notify_block *block = &priv->ntfy_blocks[ntfy_idx];
struct gve_tx_ring *tx = &priv->tx[queue_idx];
block->tx = tx;
tx->ntfy_id = ntfy_idx;
+ netif_set_xps_queue(priv->dev, get_cpu_mask(ntfy_idx % active_cpus),
+ queue_idx);
}
static int gve_tx_alloc_ring(struct gve_priv *priv, int idx)
@@ -206,13 +212,18 @@
if (!tx->desc)
goto abort_with_info;
- tx->tx_fifo.qpl = gve_assign_tx_qpl(priv);
- if (!tx->tx_fifo.qpl)
- goto abort_with_desc;
+ tx->raw_addressing = priv->raw_addressing;
+ tx->dev = &priv->pdev->dev;
+ if (!tx->raw_addressing) {
+ tx->tx_fifo.qpl = gve_assign_tx_qpl(priv);
- /* map Tx FIFO */
- if (gve_tx_fifo_init(priv, &tx->tx_fifo))
- goto abort_with_qpl;
+ if (!tx->tx_fifo.qpl)
+ goto abort_with_desc;
+
+ /* map Tx FIFO */
+ if (gve_tx_fifo_init(priv, &tx->tx_fifo))
+ goto abort_with_qpl;
+ }
tx->q_resources =
dma_alloc_coherent(hdev,
@@ -230,9 +241,11 @@
return 0;
abort_with_fifo:
- gve_tx_fifo_release(priv, &tx->tx_fifo);
+ if (!tx->raw_addressing)
+ gve_tx_fifo_release(priv, &tx->tx_fifo);
abort_with_qpl:
- gve_unassign_qpl(priv, tx->tx_fifo.qpl->id);
+ if (!tx->raw_addressing)
+ gve_unassign_qpl(priv, tx->tx_fifo.qpl->id);
abort_with_desc:
dma_free_coherent(hdev, bytes, tx->desc, tx->bus);
tx->desc = NULL;
@@ -310,22 +323,44 @@
* payload wraps to the beginning of the FIFO.
*/
#define MAX_TX_DESC_NEEDED 3
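+/* The sign of the stored unmap length records how a buffer was mapped:
+ * positive lengths came from dma_map_single() (linear skb data), negative
+ * lengths from skb_frag_dma_map() (frag pages, stored as -len).
+ */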
+static void gve_tx_unmap_buf(struct device *dev,
+ struct gve_tx_dma_buf *buf)
+{
+ const int buf_len = (int)dma_unmap_len(buf, len);
+
+ if (buf_len > 0) {
+ dma_unmap_single(dev, dma_unmap_addr(buf, dma),
+ dma_unmap_len(buf, len),
+ DMA_TO_DEVICE);
+ dma_unmap_len_set(buf, len, 0);
+ } else if (buf_len < 0) {
+ dma_unmap_page(dev, dma_unmap_addr(buf, dma),
+ -dma_unmap_len(buf, len),
+ DMA_TO_DEVICE);
+ dma_unmap_len_set(buf, len, 0);
+ }
+}
/* Check if sufficient resources (descriptor ring space, FIFO space) are
* available to transmit the given number of bytes.
*/
static inline bool gve_can_tx(struct gve_tx_ring *tx, int bytes_required)
{
- return (gve_tx_avail(tx) >= MAX_TX_DESC_NEEDED &&
- gve_tx_fifo_can_alloc(&tx->tx_fifo, bytes_required));
+ bool can_alloc = true;
+
+ if (!tx->raw_addressing)
+ can_alloc = gve_tx_fifo_can_alloc(&tx->tx_fifo, bytes_required);
+
+ return (gve_tx_avail(tx) >= MAX_TX_DESC_NEEDED && can_alloc);
}
/* Stops the queue if the skb cannot be transmitted. */
static int gve_maybe_stop_tx(struct gve_tx_ring *tx, struct sk_buff *skb)
{
- int bytes_required;
+ int bytes_required = 0;
- bytes_required = gve_skb_fifo_bytes_required(tx, skb);
+ if (!tx->raw_addressing)
+ bytes_required = gve_skb_fifo_bytes_required(tx, skb);
+
if (likely(gve_can_tx(tx, bytes_required)))
return 0;
@@ -394,22 +429,23 @@
seg_desc->seg.seg_addr = cpu_to_be64(addr);
}
-static void gve_dma_sync_for_device(struct device *dev, dma_addr_t *page_buses,
- u64 iov_offset, u64 iov_len)
+static void gve_dma_sync_for_device(struct gve_priv *priv,
+ dma_addr_t *page_buses,
+ u64 iov_offset, u64 iov_len)
{
u64 last_page = (iov_offset + iov_len - 1) / PAGE_SIZE;
u64 first_page = iov_offset / PAGE_SIZE;
- dma_addr_t dma;
u64 page;
for (page = first_page; page <= last_page; page++) {
- dma = page_buses[page];
- dma_sync_single_for_device(dev, dma, PAGE_SIZE, DMA_TO_DEVICE);
+ dma_addr_t dma = page_buses[page];
+ dma_sync_single_for_device(&priv->pdev->dev, dma, PAGE_SIZE,
+ DMA_TO_DEVICE);
}
}
-static int gve_tx_add_skb(struct gve_tx_ring *tx, struct sk_buff *skb,
- struct device *dev)
+static int gve_tx_add_skb_copy(struct gve_priv *priv, struct gve_tx_ring *tx,
+ struct sk_buff *skb)
{
int pad_bytes, hlen, hdr_nfrags, payload_nfrags, l4_hdr_offset;
union gve_tx_desc *pkt_desc, *seg_desc;
@@ -451,7 +487,7 @@
skb_copy_bits(skb, 0,
tx->tx_fifo.base + info->iov[hdr_nfrags - 1].iov_offset,
hlen);
- gve_dma_sync_for_device(dev, tx->tx_fifo.qpl->page_buses,
+ gve_dma_sync_for_device(priv, tx->tx_fifo.qpl->page_buses,
info->iov[hdr_nfrags - 1].iov_offset,
info->iov[hdr_nfrags - 1].iov_len);
copy_offset = hlen;
@@ -467,7 +503,7 @@
skb_copy_bits(skb, copy_offset,
tx->tx_fifo.base + info->iov[i].iov_offset,
info->iov[i].iov_len);
- gve_dma_sync_for_device(dev, tx->tx_fifo.qpl->page_buses,
+ gve_dma_sync_for_device(priv, tx->tx_fifo.qpl->page_buses,
info->iov[i].iov_offset,
info->iov[i].iov_len);
copy_offset += info->iov[i].iov_len;
@@ -476,6 +512,96 @@
return 1 + payload_nfrags;
}
+static int gve_tx_add_skb_no_copy(struct gve_priv *priv, struct gve_tx_ring *tx,
+ struct sk_buff *skb)
+{
+ const struct skb_shared_info *shinfo = skb_shinfo(skb);
+ int hlen, payload_nfrags, l4_hdr_offset, seg_idx_bias;
+ union gve_tx_desc *pkt_desc, *seg_desc;
+ struct gve_tx_buffer_state *info;
+ bool is_gso = skb_is_gso(skb);
+ u32 idx = tx->req & tx->mask;
+ struct gve_tx_dma_buf *buf;
+ int last_mapped = 0;
+ u64 addr;
+ u32 len;
+ int i;
+
+ info = &tx->info[idx];
+ pkt_desc = &tx->desc[idx];
+
+ l4_hdr_offset = skb_checksum_start_offset(skb);
+	/* If the skb is gso, then we want the tcp header in the first segment;
+ * otherwise we want the linear portion of the skb (which will contain
+ * the checksum because skb->csum_start and skb->csum_offset are given
+ * relative to skb->head) in the first segment.
+ */
+ hlen = is_gso ? l4_hdr_offset + tcp_hdrlen(skb) :
+ skb_headlen(skb);
+ len = skb_headlen(skb);
+
+ info->skb = skb;
+
+ addr = dma_map_single(tx->dev, skb->data, len, DMA_TO_DEVICE);
+ if (unlikely(dma_mapping_error(tx->dev, addr))) {
+ priv->dma_mapping_error++;
+ goto drop;
+ }
+ buf = &info->buf;
+ dma_unmap_len_set(buf, len, len);
+ dma_unmap_addr_set(buf, dma, addr);
+
+ payload_nfrags = shinfo->nr_frags;
+ if (hlen < len) {
+ /* For gso the rest of the linear portion of the skb needs to
+ * be in its own descriptor.
+ */
+ payload_nfrags++;
+ gve_tx_fill_pkt_desc(pkt_desc, skb, is_gso, l4_hdr_offset,
+ 1 + payload_nfrags, hlen, addr);
+
+ len -= hlen;
+ addr += hlen;
+ seg_desc = &tx->desc[(tx->req + 1) & tx->mask];
+ seg_idx_bias = 2;
+ gve_tx_fill_seg_desc(seg_desc, skb, is_gso, len, addr);
+ } else {
+ seg_idx_bias = 1;
+ gve_tx_fill_pkt_desc(pkt_desc, skb, is_gso, l4_hdr_offset,
+ 1 + payload_nfrags, hlen, addr);
+ }
+
+ for (i = 0; i < payload_nfrags - (seg_idx_bias - 1); i++) {
+		const skb_frag_t *frag = &shinfo->frags[i];
+
+ idx = (tx->req + i + seg_idx_bias) & tx->mask;
+ seg_desc = &tx->desc[idx];
+ len = skb_frag_size(frag);
+ addr = skb_frag_dma_map(tx->dev, frag, 0, len, DMA_TO_DEVICE);
+ if (unlikely(dma_mapping_error(tx->dev, addr))) {
+ priv->dma_mapping_error++;
+ goto unmap_drop;
+ }
+ buf = &tx->info[idx].buf;
+ dma_unmap_len_set(buf, len, -len);
+ dma_unmap_addr_set(buf, dma, addr);
+
+ gve_tx_fill_seg_desc(seg_desc, skb, is_gso, len, addr);
+ }
+
+ return 1 + payload_nfrags;
+
+unmap_drop:
+ i--;
+ for (last_mapped = i + seg_idx_bias; last_mapped >= 0; last_mapped--) {
+ idx = (tx->req + last_mapped) & tx->mask;
+ gve_tx_unmap_buf(tx->dev, &tx->info[idx].buf);
+ }
+drop:
+ tx->dropped_pkt++;
+ return 0;
+}
+
netdev_tx_t gve_tx(struct sk_buff *skb, struct net_device *dev)
{
struct gve_priv *priv = netdev_priv(dev);
@@ -491,20 +617,34 @@
* may have added descriptors without ringing the doorbell.
*/
+ /* Ensure tx descs from a prior gve_tx are visible before
+ * ringing doorbell.
+ */
+ dma_wmb();
gve_tx_put_doorbell(priv, tx->q_resources, tx->req);
return NETDEV_TX_BUSY;
}
- nsegs = gve_tx_add_skb(tx, skb, &priv->pdev->dev);
+ if (tx->raw_addressing)
+ nsegs = gve_tx_add_skb_no_copy(priv, tx, skb);
+ else
+ nsegs = gve_tx_add_skb_copy(priv, tx, skb);
- netdev_tx_sent_queue(tx->netdev_txq, skb->len);
- skb_tx_timestamp(skb);
+ /* If the packet is getting sent, we need to update the skb */
+ if (nsegs) {
+ netdev_tx_sent_queue(tx->netdev_txq, skb->len);
+ skb_tx_timestamp(skb);
+ }
- /* give packets to NIC */
+	/* Give packets to NIC. Even if this packet failed to send, the
+	 * doorbell might still need to be rung because of xmit_more.
+	 */
tx->req += nsegs;
if (!netif_xmit_stopped(tx->netdev_txq) && netdev_xmit_more())
return NETDEV_TX_OK;
+ /* Ensure tx descs are visible before ringing doorbell */
+ dma_wmb();
gve_tx_put_doorbell(priv, tx->q_resources, tx->req);
return NETDEV_TX_OK;
}
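
The doorbell handling above follows the standard xmit_more batching pattern: descriptor writes are ordered ahead of the MMIO doorbell write with dma_wmb(), and the doorbell is deferred while the stack signals more packets, unless the queue has been stopped. A userspace sketch of just the batching decision (values invented):

  #include <stdbool.h>
  #include <stdio.h>

  /* Sketch of the xmit_more rule used above: ring the doorbell only
   * when no more packets are promised or the txq was stopped; in the
   * driver a dma_wmb() orders descriptor writes before the MMIO write. */
  static void maybe_ring(bool xmit_more, bool txq_stopped, int req)
  {
  	if (!txq_stopped && xmit_more)
  		return;				/* defer: more packets coming */
  	printf("doorbell <- %d\n", req);	/* after the write barrier */
  }

  int main(void)
  {
  	int req = 0;

  	maybe_ring(true,  false, ++req);	/* deferred */
  	maybe_ring(true,  false, ++req);	/* deferred */
  	maybe_ring(false, false, ++req);	/* rings once with req=3 */
  	return 0;
  }
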
@@ -529,24 +669,31 @@
info = &tx->info[idx];
skb = info->skb;
+ /* Unmap the buffer */
+ if (tx->raw_addressing)
+ gve_tx_unmap_buf(tx->dev, &tx->info[idx].buf);
/* Mark as free */
if (skb) {
info->skb = NULL;
bytes += skb->len;
pkts++;
dev_consume_skb_any(skb);
- /* FIFO free */
- for (i = 0; i < ARRAY_SIZE(info->iov); i++) {
- space_freed += info->iov[i].iov_len +
- info->iov[i].iov_padding;
- info->iov[i].iov_len = 0;
- info->iov[i].iov_padding = 0;
+ if (!tx->raw_addressing) {
+ /* FIFO free */
+ for (i = 0; i < ARRAY_SIZE(info->iov); i++) {
+ space_freed += info->iov[i].iov_len +
+ info->iov[i].iov_padding;
+ info->iov[i].iov_len = 0;
+ info->iov[i].iov_padding = 0;
+ }
}
}
tx->done++;
}
- gve_tx_free_fifo(&tx->tx_fifo, space_freed);
+ if (!tx->raw_addressing) {
+ gve_tx_free_fifo(&tx->tx_fifo, space_freed);
+ }
u64_stats_update_begin(&tx->statss);
tx->bytes_done += bytes;
tx->pkt_done += pkts;
diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
index 10fe7a7..0f2f2dd 100644
--- a/drivers/nvme/host/pci.c
+++ b/drivers/nvme/host/pci.c
@@ -2631,6 +2631,7 @@
* Don't limit the IOMMU merged segment size.
*/
dma_set_max_seg_size(dev->dev, 0xffffffff);
+ dma_set_min_align_mask(dev->dev, dev->ctrl.page_size - 1);
mutex_unlock(&dev->shutdown_lock);
diff --git a/drivers/virtio/virtio_balloon.c b/drivers/virtio/virtio_balloon.c
index 1e44482..fb86ab8 100644
--- a/drivers/virtio/virtio_balloon.c
+++ b/drivers/virtio/virtio_balloon.c
@@ -14,6 +14,7 @@
#include <linux/slab.h>
#include <linux/module.h>
#include <linux/balloon_compaction.h>
+#include <linux/oom.h>
#include <linux/wait.h>
#include <linux/mm.h>
#include <linux/mount.h>
@@ -27,7 +28,9 @@
*/
#define VIRTIO_BALLOON_PAGES_PER_PAGE (unsigned)(PAGE_SIZE >> VIRTIO_BALLOON_PFN_SHIFT)
#define VIRTIO_BALLOON_ARRAY_PFNS_MAX 256
-#define VIRTBALLOON_OOM_NOTIFY_PRIORITY 80
+/* Maximum number of (4k) pages to deflate on OOM notifications. */
+#define VIRTIO_BALLOON_OOM_NR_PAGES 256
+#define VIRTIO_BALLOON_OOM_NOTIFY_PRIORITY 80
#define VIRTIO_BALLOON_FREE_PAGE_ALLOC_FLAG (__GFP_NORETRY | __GFP_NOWARN | \
__GFP_NOMEMALLOC)
@@ -111,8 +114,11 @@
/* Memory statistics */
struct virtio_balloon_stat stats[VIRTIO_BALLOON_S_NR];
- /* To register a shrinker to shrink memory upon memory pressure */
+ /* Shrinker to return free pages - VIRTIO_BALLOON_F_FREE_PAGE_HINT */
struct shrinker shrinker;
+
+ /* OOM notifier to deflate on OOM - VIRTIO_BALLOON_F_DEFLATE_ON_OOM */
+ struct notifier_block oom_nb;
};
static struct virtio_device_id id_table[] = {
@@ -791,50 +797,13 @@
return blocks_freed << VIRTIO_BALLOON_FREE_PAGE_ORDER;
}
-static unsigned long leak_balloon_pages(struct virtio_balloon *vb,
- unsigned long pages_to_free)
-{
- return leak_balloon(vb, pages_to_free * VIRTIO_BALLOON_PAGES_PER_PAGE) /
- VIRTIO_BALLOON_PAGES_PER_PAGE;
-}
-
-static unsigned long shrink_balloon_pages(struct virtio_balloon *vb,
- unsigned long pages_to_free)
-{
- unsigned long pages_freed = 0;
-
- /*
- * One invocation of leak_balloon can deflate at most
- * VIRTIO_BALLOON_ARRAY_PFNS_MAX balloon pages, so we call it
- * multiple times to deflate pages till reaching pages_to_free.
- */
- while (vb->num_pages && pages_freed < pages_to_free)
- pages_freed += leak_balloon_pages(vb,
- pages_to_free - pages_freed);
-
- update_balloon_size(vb);
-
- return pages_freed;
-}
-
static unsigned long virtio_balloon_shrinker_scan(struct shrinker *shrinker,
struct shrink_control *sc)
{
- unsigned long pages_to_free, pages_freed = 0;
struct virtio_balloon *vb = container_of(shrinker,
struct virtio_balloon, shrinker);
- pages_to_free = sc->nr_to_scan;
-
- if (virtio_has_feature(vb->vdev, VIRTIO_BALLOON_F_FREE_PAGE_HINT))
- pages_freed = shrink_free_pages(vb, pages_to_free);
-
- if (pages_freed >= pages_to_free)
- return pages_freed;
-
- pages_freed += shrink_balloon_pages(vb, pages_to_free - pages_freed);
-
- return pages_freed;
+ return shrink_free_pages(vb, sc->nr_to_scan);
}
static unsigned long virtio_balloon_shrinker_count(struct shrinker *shrinker,
@@ -842,12 +811,22 @@
{
struct virtio_balloon *vb = container_of(shrinker,
struct virtio_balloon, shrinker);
- unsigned long count;
- count = vb->num_pages / VIRTIO_BALLOON_PAGES_PER_PAGE;
- count += vb->num_free_page_blocks << VIRTIO_BALLOON_FREE_PAGE_ORDER;
+ return vb->num_free_page_blocks << VIRTIO_BALLOON_FREE_PAGE_ORDER;
+}
- return count;
+static int virtio_balloon_oom_notify(struct notifier_block *nb,
+ unsigned long dummy, void *parm)
+{
+ struct virtio_balloon *vb = container_of(nb,
+ struct virtio_balloon, oom_nb);
+ unsigned long *freed = parm;
+
+ *freed += leak_balloon(vb, VIRTIO_BALLOON_OOM_NR_PAGES) /
+ VIRTIO_BALLOON_PAGES_PER_PAGE;
+ update_balloon_size(vb);
+
+ return NOTIFY_OK;
}
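
The notifier above bounds each OOM-triggered deflation at VIRTIO_BALLOON_OOM_NR_PAGES balloon pages and reports the freed count in host-page units through *freed. With 4KiB pages on both sides that is 1MiB returned per notification; the arithmetic, with illustrative page sizes:

  #include <stdio.h>

  /* Arithmetic behind virtio_balloon_oom_notify() above; page sizes
   * here are assumptions for illustration. */
  int main(void)
  {
  	unsigned long host_page = 4096, balloon_page = 4096;
  	unsigned long per_page = host_page / balloon_page; /* PAGES_PER_PAGE */
  	unsigned long nr = 256;				   /* OOM_NR_PAGES   */

  	printf("freed per OOM event: %lu KiB\n", nr * balloon_page / 1024);
  	printf("reported to the OOM killer: %lu pages\n", nr / per_page);
  	return 0;
  }
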
static void virtio_balloon_unregister_shrinker(struct virtio_balloon *vb)
@@ -933,27 +912,37 @@
VIRTIO_BALLOON_CMD_ID_STOP);
spin_lock_init(&vb->free_page_list_lock);
INIT_LIST_HEAD(&vb->free_page_list);
+ /*
+ * We're allowed to reuse any free pages, even if they are
+ * still to be processed by the host.
+ */
+ err = virtio_balloon_register_shrinker(vb);
+ if (err)
+ goto out_del_balloon_wq;
if (virtio_has_feature(vdev, VIRTIO_BALLOON_F_PAGE_POISON)) {
memset(&poison_val, PAGE_POISON, sizeof(poison_val));
virtio_cwrite(vb->vdev, struct virtio_balloon_config,
poison_val, &poison_val);
}
}
- /*
- * We continue to use VIRTIO_BALLOON_F_DEFLATE_ON_OOM to decide if a
- * shrinker needs to be registered to relieve memory pressure.
- */
+
if (virtio_has_feature(vb->vdev, VIRTIO_BALLOON_F_DEFLATE_ON_OOM)) {
- err = virtio_balloon_register_shrinker(vb);
- if (err)
- goto out_del_balloon_wq;
+ vb->oom_nb.notifier_call = virtio_balloon_oom_notify;
+ vb->oom_nb.priority = VIRTIO_BALLOON_OOM_NOTIFY_PRIORITY;
+ err = register_oom_notifier(&vb->oom_nb);
+ if (err < 0)
+ goto out_unregister_shrinker;
}
+
virtio_device_ready(vdev);
if (towards_target(vb))
virtballoon_changed(vdev);
return 0;
+out_unregister_shrinker:
+ if (virtio_has_feature(vb->vdev, VIRTIO_BALLOON_F_FREE_PAGE_HINT))
+ virtio_balloon_unregister_shrinker(vb);
out_del_balloon_wq:
if (virtio_has_feature(vdev, VIRTIO_BALLOON_F_FREE_PAGE_HINT))
destroy_workqueue(vb->balloon_wq);
@@ -993,6 +982,8 @@
struct virtio_balloon *vb = vdev->priv;
if (virtio_has_feature(vb->vdev, VIRTIO_BALLOON_F_DEFLATE_ON_OOM))
+ unregister_oom_notifier(&vb->oom_nb);
+ if (virtio_has_feature(vb->vdev, VIRTIO_BALLOON_F_FREE_PAGE_HINT))
virtio_balloon_unregister_shrinker(vb);
spin_lock_irq(&vb->stop_update_lock);
vb->stop_update = true;
diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c
index 031ff3f..29f10d5 100644
--- a/fs/ext4/balloc.c
+++ b/fs/ext4/balloc.c
@@ -412,7 +412,8 @@
* Return buffer_head on success or NULL in case of failure.
*/
struct buffer_head *
-ext4_read_block_bitmap_nowait(struct super_block *sb, ext4_group_t block_group)
+ext4_read_block_bitmap_nowait(struct super_block *sb, ext4_group_t block_group,
+ bool ignore_locked)
{
struct ext4_group_desc *desc;
struct ext4_sb_info *sbi = EXT4_SB(sb);
@@ -440,6 +441,12 @@
return ERR_PTR(-ENOMEM);
}
+ if (ignore_locked && buffer_locked(bh)) {
+ /* buffer under IO already, return if called for prefetching */
+ put_bh(bh);
+ return NULL;
+ }
+
if (bitmap_uptodate(bh))
goto verify;
@@ -486,10 +493,11 @@
* submit the buffer_head for reading
*/
set_buffer_new(bh);
- trace_ext4_read_block_bitmap_load(sb, block_group);
+ trace_ext4_read_block_bitmap_load(sb, block_group, ignore_locked);
bh->b_end_io = ext4_end_bitmap_read;
get_bh(bh);
- submit_bh(REQ_OP_READ, REQ_META | REQ_PRIO, bh);
+ submit_bh(REQ_OP_READ, REQ_META | REQ_PRIO |
+ (ignore_locked ? REQ_RAHEAD : 0), bh);
return bh;
verify:
err = ext4_validate_block_bitmap(sb, desc, block_group, bh);
@@ -532,7 +540,7 @@
struct buffer_head *bh;
int err;
- bh = ext4_read_block_bitmap_nowait(sb, block_group);
+ bh = ext4_read_block_bitmap_nowait(sb, block_group, false);
if (IS_ERR(bh))
return bh;
err = ext4_wait_block_bitmap(sb, block_group, bh);
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index e932e84..76210d0 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -198,6 +198,12 @@
*/
#define EXT4_IO_END_UNWRITTEN 0x0001
+struct ext4_io_end_vec {
+ struct list_head list; /* list of io_end_vec */
+ loff_t offset; /* offset in the file */
+ ssize_t size; /* size of the extent */
+};
+
/*
* For converting unwritten extents on a work queue. 'handle' is used for
* buffered writeback.
@@ -211,8 +217,7 @@
* bios covering the extent */
unsigned int flag; /* unwritten or not */
atomic_t count; /* reference counter */
- loff_t offset; /* offset in the file */
- ssize_t size; /* size of the extent */
+ struct list_head list_vec; /* list of ext4_io_end_vec */
} ext4_io_end_t;
struct ext4_io_submit {
@@ -1148,6 +1153,7 @@
#define EXT4_MOUNT_JOURNAL_CHECKSUM 0x800000 /* Journal checksums */
#define EXT4_MOUNT_JOURNAL_ASYNC_COMMIT 0x1000000 /* Journal Async Commit */
#define EXT4_MOUNT_WARN_ON_ERROR 0x2000000 /* Trigger WARN_ON on error */
+#define EXT4_MOUNT_PREFETCH_BLOCK_BITMAPS 0x4000000
#define EXT4_MOUNT_DELALLOC 0x8000000 /* Delalloc support */
#define EXT4_MOUNT_DATA_ERR_ABORT 0x10000000 /* Abort on file data write */
#define EXT4_MOUNT_BLOCK_VALIDITY 0x20000000 /* Block validity checking */
@@ -1481,6 +1487,8 @@
/* where last allocation was done - for stream allocation */
unsigned long s_mb_last_group;
unsigned long s_mb_last_start;
+ unsigned int s_mb_prefetch;
+ unsigned int s_mb_prefetch_limit;
/* stats for buddy allocator */
atomic_t s_bal_reqs; /* number of reqs with len > 1 */
@@ -2225,9 +2233,15 @@
struct mutex li_list_mtx;
};
+enum ext4_li_mode {
+ EXT4_LI_MODE_PREFETCH_BBITMAP,
+ EXT4_LI_MODE_ITABLE,
+};
+
struct ext4_li_request {
struct super_block *lr_super;
- struct ext4_sb_info *lr_sbi;
+ enum ext4_li_mode lr_mode;
+ ext4_group_t lr_first_not_zeroed;
ext4_group_t lr_next_group;
struct list_head lr_request;
unsigned long lr_next_sched;
@@ -2358,7 +2372,8 @@
extern int ext4_should_retry_alloc(struct super_block *sb, int *retries);
extern struct buffer_head *ext4_read_block_bitmap_nowait(struct super_block *sb,
- ext4_group_t block_group);
+ ext4_group_t block_group,
+ bool ignore_locked);
extern int ext4_wait_block_bitmap(struct super_block *sb,
ext4_group_t block_group,
struct buffer_head *bh);
@@ -2567,6 +2582,12 @@
extern void ext4_discard_preallocations(struct inode *);
extern int __init ext4_init_mballoc(void);
extern void ext4_exit_mballoc(void);
+extern ext4_group_t ext4_mb_prefetch(struct super_block *sb,
+ ext4_group_t group,
+ unsigned int nr, int *cnt);
+extern void ext4_mb_prefetch_fini(struct super_block *sb, ext4_group_t group,
+ unsigned int nr);
+
extern void ext4_free_blocks(handle_t *handle, struct inode *inode,
struct buffer_head *bh, ext4_fsblk_t block,
unsigned long count, int flags);
@@ -3045,6 +3066,7 @@
(1 << EXT4_GROUP_INFO_BBITMAP_CORRUPT_BIT)
#define EXT4_GROUP_INFO_IBITMAP_CORRUPT \
(1 << EXT4_GROUP_INFO_IBITMAP_CORRUPT_BIT)
+#define EXT4_GROUP_INFO_BBITMAP_READ_BIT 4
#define EXT4_MB_GRP_NEED_INIT(grp) \
(test_bit(EXT4_GROUP_INFO_NEED_INIT_BIT, &((grp)->bb_state)))
@@ -3059,6 +3081,8 @@
(set_bit(EXT4_GROUP_INFO_WAS_TRIMMED_BIT, &((grp)->bb_state)))
#define EXT4_MB_GRP_CLEAR_TRIMMED(grp) \
(clear_bit(EXT4_GROUP_INFO_WAS_TRIMMED_BIT, &((grp)->bb_state)))
+#define EXT4_MB_GRP_TEST_AND_SET_READ(grp) \
+ (test_and_set_bit(EXT4_GROUP_INFO_BBITMAP_READ_BIT, &((grp)->bb_state)))
#define EXT4_MAX_CONTENTION 8
#define EXT4_CONTENTION_THRESHOLD 2
@@ -3294,6 +3318,8 @@
loff_t len);
extern int ext4_convert_unwritten_extents(handle_t *handle, struct inode *inode,
loff_t offset, ssize_t len);
+extern int ext4_convert_unwritten_io_end_vec(handle_t *handle,
+ ext4_io_end_t *io_end);
extern int ext4_map_blocks(handle_t *handle, struct inode *inode,
struct ext4_map_blocks *map, int flags);
extern int ext4_ext_calc_metadata_amount(struct inode *inode,
@@ -3352,6 +3378,8 @@
int len,
struct writeback_control *wbc,
bool keep_towrite);
+extern struct ext4_io_end_vec *ext4_alloc_io_end_vec(ext4_io_end_t *io_end);
+extern struct ext4_io_end_vec *ext4_last_io_end_vec(ext4_io_end_t *io_end);
/* mmp.c */
extern int ext4_multi_mount_protect(struct super_block *, ext4_fsblk_t);
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index d5e649e..5e6a065 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -5011,23 +5011,13 @@
int ret = 0;
int ret2 = 0;
struct ext4_map_blocks map;
- unsigned int credits, blkbits = inode->i_blkbits;
+ unsigned int blkbits = inode->i_blkbits;
+ unsigned int credits = 0;
map.m_lblk = offset >> blkbits;
max_blocks = EXT4_MAX_BLOCKS(len, offset, blkbits);
- /*
- * This is somewhat ugly but the idea is clear: When transaction is
- * reserved, everything goes into it. Otherwise we rather start several
- * smaller transactions for conversion of each extent separately.
- */
- if (handle) {
- handle = ext4_journal_start_reserved(handle,
- EXT4_HT_EXT_CONVERT);
- if (IS_ERR(handle))
- return PTR_ERR(handle);
- credits = 0;
- } else {
+ if (!handle) {
/*
* credits to insert 1 extent into extent tree
*/
@@ -5058,11 +5048,40 @@
if (ret <= 0 || ret2)
break;
}
- if (!credits)
- ret2 = ext4_journal_stop(handle);
return ret > 0 ? ret2 : ret;
}
+int ext4_convert_unwritten_io_end_vec(handle_t *handle, ext4_io_end_t *io_end)
+{
+ int ret, err = 0;
+ struct ext4_io_end_vec *io_end_vec;
+
+ /*
+ * This is somewhat ugly but the idea is clear: When transaction is
+ * reserved, everything goes into it. Otherwise we rather start several
+ * smaller transactions for conversion of each extent separately.
+ */
+ if (handle) {
+ handle = ext4_journal_start_reserved(handle,
+ EXT4_HT_EXT_CONVERT);
+ if (IS_ERR(handle))
+ return PTR_ERR(handle);
+ }
+
+ list_for_each_entry(io_end_vec, &io_end->list_vec, list) {
+ ret = ext4_convert_unwritten_extents(handle, io_end->inode,
+ io_end_vec->offset,
+ io_end_vec->size);
+ if (ret)
+ break;
+ }
+
+ if (handle)
+ err = ext4_journal_stop(handle);
+
+ return ret < 0 ? ret : err;
+}
+
/*
* If newes is not existing extent (newes->ec_pblk equals zero) find
* delayed extent at start of newes and update newes accordingly and
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index d8fee91..2e356e2 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -2376,6 +2376,79 @@
}
/*
+ * mpage_process_page - update page buffers corresponding to changed extent and
+ *		       may submit a fully mapped page for IO
+ *
+ * @mpd - description of extent to map, on return next extent to map
+ * @m_lblk - logical block mapping.
+ * @m_pblk - corresponding physical mapping.
+ * @map_bh - determines on return whether this page requires any further
+ * mapping or not.
+ * Scan given page buffers corresponding to changed extent and update buffer
+ * state according to new extent state.
+ * We map delalloc buffers to their physical location, clear unwritten bits.
+ * If the given page is not fully mapped, we update @map to the next extent in
+ * the given page that needs mapping and return @map_bh as true.
+ */
+static int mpage_process_page(struct mpage_da_data *mpd, struct page *page,
+ ext4_lblk_t *m_lblk, ext4_fsblk_t *m_pblk,
+ bool *map_bh)
+{
+ struct buffer_head *head, *bh;
+ ext4_io_end_t *io_end = mpd->io_submit.io_end;
+ ext4_lblk_t lblk = *m_lblk;
+ ext4_fsblk_t pblock = *m_pblk;
+ int err = 0;
+ int blkbits = mpd->inode->i_blkbits;
+ ssize_t io_end_size = 0;
+ struct ext4_io_end_vec *io_end_vec = ext4_last_io_end_vec(io_end);
+
+ bh = head = page_buffers(page);
+ do {
+ if (lblk < mpd->map.m_lblk)
+ continue;
+ if (lblk >= mpd->map.m_lblk + mpd->map.m_len) {
+ /*
+ * Buffer after end of mapped extent.
+ * Find next buffer in the page to map.
+ */
+ mpd->map.m_len = 0;
+ mpd->map.m_flags = 0;
+ io_end_vec->size += io_end_size;
+ io_end_size = 0;
+
+ err = mpage_process_page_bufs(mpd, head, bh, lblk);
+ if (err > 0)
+ err = 0;
+ if (!err && mpd->map.m_len && mpd->map.m_lblk > lblk) {
+ io_end_vec = ext4_alloc_io_end_vec(io_end);
+ if (IS_ERR(io_end_vec)) {
+ err = PTR_ERR(io_end_vec);
+ goto out;
+ }
+ io_end_vec->offset = mpd->map.m_lblk << blkbits;
+ }
+ *map_bh = true;
+ goto out;
+ }
+ if (buffer_delay(bh)) {
+ clear_buffer_delay(bh);
+ bh->b_blocknr = pblock++;
+ }
+ clear_buffer_unwritten(bh);
+ io_end_size += (1 << blkbits);
+ } while (lblk++, (bh = bh->b_this_page) != head);
+
+ io_end_vec->size += io_end_size;
+ io_end_size = 0;
+ *map_bh = false;
+out:
+ *m_lblk = lblk;
+ *m_pblk = pblock;
+ return err;
+}
+
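
mpage_process_page() above accumulates per-buffer byte counts into the current ext4_io_end_vec and starts a new vector only when the mapped extent ends inside the page; this is the bookkeeping that makes dioread_nolock safe for blocksize < pagesize. A toy model of the size accounting (block layout invented):

  #include <stdio.h>

  /* Toy model of the io_end_size accounting above for blocksize <
   * pagesize: each buffer inside the current mapped extent adds
   * (1 << blkbits), flushed into the vector when the extent ends. */
  int main(void)
  {
  	int blkbits = 10;		/* 1KiB blocks, 4 per 4KiB page */
  	int mapped[4] = { 1, 1, 0, 0 };	/* first two blocks mapped */
  	long io_end_size = 0;
  	int b;

  	for (b = 0; b < 4 && mapped[b]; b++)
  		io_end_size += 1L << blkbits;
  	printf("io_end_vec->size += %ld\n", io_end_size);	/* 2048 */
  	return 0;
  }
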
+/*
* mpage_map_buffers - update buffers corresponding to changed extent and
* submit fully mapped pages for IO
*
@@ -2394,12 +2467,12 @@
struct pagevec pvec;
int nr_pages, i;
struct inode *inode = mpd->inode;
- struct buffer_head *head, *bh;
int bpp_bits = PAGE_SHIFT - inode->i_blkbits;
pgoff_t start, end;
ext4_lblk_t lblk;
- sector_t pblock;
+ ext4_fsblk_t pblock;
int err;
+ bool map_bh = false;
start = mpd->map.m_lblk >> bpp_bits;
end = (mpd->map.m_lblk + mpd->map.m_len - 1) >> bpp_bits;
@@ -2415,50 +2488,19 @@
for (i = 0; i < nr_pages; i++) {
struct page *page = pvec.pages[i];
- bh = head = page_buffers(page);
- do {
- if (lblk < mpd->map.m_lblk)
- continue;
- if (lblk >= mpd->map.m_lblk + mpd->map.m_len) {
- /*
- * Buffer after end of mapped extent.
- * Find next buffer in the page to map.
- */
- mpd->map.m_len = 0;
- mpd->map.m_flags = 0;
- /*
- * FIXME: If dioread_nolock supports
- * blocksize < pagesize, we need to make
- * sure we add size mapped so far to
- * io_end->size as the following call
- * can submit the page for IO.
- */
- err = mpage_process_page_bufs(mpd, head,
- bh, lblk);
- pagevec_release(&pvec);
- if (err > 0)
- err = 0;
- return err;
- }
- if (buffer_delay(bh)) {
- clear_buffer_delay(bh);
- bh->b_blocknr = pblock++;
- }
- clear_buffer_unwritten(bh);
- } while (lblk++, (bh = bh->b_this_page) != head);
-
+ err = mpage_process_page(mpd, page, &lblk, &pblock,
+ &map_bh);
/*
- * FIXME: This is going to break if dioread_nolock
- * supports blocksize < pagesize as we will try to
- * convert potentially unmapped parts of inode.
+			 * If map_bh is true, the page may require further bh
+			 * mapping, or the page may already have been submitted
+			 * for IO, so we return to the caller to map the next extent.
*/
- mpd->io_submit.io_end->size += PAGE_SIZE;
+ if (err < 0 || map_bh == true)
+ goto out;
/* Page fully mapped - let IO run! */
err = mpage_submit_page(mpd, page);
- if (err < 0) {
- pagevec_release(&pvec);
- return err;
- }
+ if (err < 0)
+ goto out;
}
pagevec_release(&pvec);
}
@@ -2466,6 +2508,9 @@
mpd->map.m_len = 0;
mpd->map.m_flags = 0;
return 0;
+out:
+ pagevec_release(&pvec);
+ return err;
}
static int mpage_map_one_extent(handle_t *handle, struct mpage_da_data *mpd)
@@ -2545,9 +2590,13 @@
int err;
loff_t disksize;
int progress = 0;
+ ext4_io_end_t *io_end = mpd->io_submit.io_end;
+ struct ext4_io_end_vec *io_end_vec;
- mpd->io_submit.io_end->offset =
- ((loff_t)map->m_lblk) << inode->i_blkbits;
+ io_end_vec = ext4_alloc_io_end_vec(io_end);
+ if (IS_ERR(io_end_vec))
+ return PTR_ERR(io_end_vec);
+ io_end_vec->offset = ((loff_t)map->m_lblk) << inode->i_blkbits;
do {
err = mpage_map_one_extent(handle, mpd);
if (err < 0) {
@@ -3670,6 +3719,7 @@
ssize_t size, void *private)
{
ext4_io_end_t *io_end = private;
+ struct ext4_io_end_vec *io_end_vec;
/* if not async direct IO just return */
if (!io_end)
@@ -3687,8 +3737,9 @@
ext4_clear_io_unwritten_flag(io_end);
size = 0;
}
- io_end->offset = offset;
- io_end->size = size;
+ io_end_vec = ext4_alloc_io_end_vec(io_end);
+ io_end_vec->offset = offset;
+ io_end_vec->size = size;
ext4_put_io_end(io_end);
return 0;
diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index 3c3166b..3eb021b 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -861,7 +861,7 @@
bh[i] = NULL;
continue;
}
- bh[i] = ext4_read_block_bitmap_nowait(sb, group);
+ bh[i] = ext4_read_block_bitmap_nowait(sb, group, false);
if (IS_ERR(bh[i])) {
err = PTR_ERR(bh[i]);
bh[i] = NULL;
@@ -2045,6 +2045,7 @@
ext4_group_t group, int cr)
{
unsigned free, fragments;
+ struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb);
int flex_size = ext4_flex_bg_size(EXT4_SB(ac->ac_sb));
struct ext4_group_info *grp = ext4_get_group_info(ac->ac_sb, group);
@@ -2061,7 +2062,26 @@
/* We only do this if the grp has never been initialized */
if (unlikely(EXT4_MB_GRP_NEED_INIT(grp))) {
- int ret = ext4_mb_init_group(ac->ac_sb, group, GFP_NOFS);
+ struct ext4_group_desc *gdp =
+ ext4_get_group_desc(ac->ac_sb, group, NULL);
+ int ret;
+
+ /* cr=0/1 is a very optimistic search to find large
+ * good chunks almost for free. If buddy data is not
+ * ready, then this optimization makes no sense. But
+ * we never skip the first block group in a flex_bg,
+ * since this gets used for metadata block allocation,
+ * and we want to make sure we locate metadata blocks
+ * in the first block group in the flex_bg if
+ * possible.
+ */
+ if (cr < 2 &&
+ (!sbi->s_log_groups_per_flex ||
+ ((group & ((1 << sbi->s_log_groups_per_flex) - 1)) != 0)) &&
+ !(ext4_has_group_desc_csum(ac->ac_sb) &&
+ (gdp->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT))))
+ return 0;
+ ret = ext4_mb_init_group(ac->ac_sb, group, GFP_NOFS);
if (ret)
return ret;
}
@@ -2105,12 +2125,91 @@
return 0;
}
+/*
+ * Start prefetching @nr block bitmaps starting at @group.
+ * Return the next group which needs to be prefetched.
+ */
+ext4_group_t ext4_mb_prefetch(struct super_block *sb, ext4_group_t group,
+ unsigned int nr, int *cnt)
+{
+ ext4_group_t ngroups = ext4_get_groups_count(sb);
+ struct buffer_head *bh;
+ struct blk_plug plug;
+
+ blk_start_plug(&plug);
+ while (nr-- > 0) {
+ struct ext4_group_desc *gdp = ext4_get_group_desc(sb, group,
+ NULL);
+ struct ext4_group_info *grp = ext4_get_group_info(sb, group);
+
+ /*
+ * Prefetch block groups with free blocks; but don't
+ * bother if it is marked uninitialized on disk, since
+ * it won't require I/O to read. Also only try to
+ * prefetch once, so we avoid getblk() call, which can
+		 * prefetch once, so we avoid the getblk() call, which can
+ */
+ if (!EXT4_MB_GRP_TEST_AND_SET_READ(grp) &&
+ EXT4_MB_GRP_NEED_INIT(grp) &&
+ ext4_free_group_clusters(sb, gdp) > 0 &&
+ !(ext4_has_group_desc_csum(sb) &&
+ (gdp->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)))) {
+ bh = ext4_read_block_bitmap_nowait(sb, group, true);
+ if (bh && !IS_ERR(bh)) {
+ if (!buffer_uptodate(bh) && cnt)
+ (*cnt)++;
+ brelse(bh);
+ }
+ }
+ if (++group >= ngroups)
+ group = 0;
+ }
+ blk_finish_plug(&plug);
+ return group;
+}
+
+/*
+ * Prefetching reads the block bitmap into the buffer cache; but we
+ * need to make sure that the buddy bitmap in the page cache has been
+ * initialized. Note that ext4_mb_init_group() will block if the I/O
+ * is not yet completed, or indeed if the I/O was not initiated by
+ * ext4_mb_prefetch.
+ *
+ * TODO: We should actually kick off the buddy bitmap setup in a work
+ * queue when the buffer I/O is completed, so that we don't block
+ * waiting for the block allocation bitmap read to finish when
+ * ext4_mb_prefetch_fini is called from ext4_mb_regular_allocator().
+ */
+void ext4_mb_prefetch_fini(struct super_block *sb, ext4_group_t group,
+ unsigned int nr)
+{
+ while (nr-- > 0) {
+ struct ext4_group_desc *gdp = ext4_get_group_desc(sb, group,
+ NULL);
+ struct ext4_group_info *grp = ext4_get_group_info(sb, group);
+
+ if (!group)
+ group = ext4_get_groups_count(sb);
+ group--;
+ grp = ext4_get_group_info(sb, group);
+
+ if (EXT4_MB_GRP_NEED_INIT(grp) &&
+ ext4_free_group_clusters(sb, gdp) > 0 &&
+ !(ext4_has_group_desc_csum(sb) &&
+ (gdp->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)))) {
+ if (ext4_mb_init_group(sb, group, GFP_NOFS))
+ break;
+ }
+ }
+}
+
static noinline_for_stack int
ext4_mb_regular_allocator(struct ext4_allocation_context *ac)
{
- ext4_group_t ngroups, group, i;
+ ext4_group_t prefetch_grp = 0, ngroups, group, i;
int cr;
int err = 0, first_err = 0;
+ unsigned int nr = 0, prefetch_ios = 0;
struct ext4_sb_info *sbi;
struct super_block *sb;
struct ext4_buddy e4b;
@@ -2178,6 +2277,7 @@
* from the goal value specified
*/
group = ac->ac_g_ex.fe_group;
+ prefetch_grp = group;
for (i = 0; i < ngroups; group++, i++) {
int ret = 0;
@@ -2189,6 +2289,29 @@
if (group >= ngroups)
group = 0;
+ /*
+ * Batch reads of the block allocation bitmaps
+ * to get multiple READs in flight; limit
+ * prefetching at cr=0/1, otherwise mballoc can
+ * spend a lot of time loading imperfect groups
+ */
+ if ((prefetch_grp == group) &&
+ (cr > 1 ||
+ prefetch_ios < sbi->s_mb_prefetch_limit)) {
+ unsigned int curr_ios = prefetch_ios;
+
+ nr = sbi->s_mb_prefetch;
+ if (ext4_has_feature_flex_bg(sb)) {
+ nr = (group / sbi->s_mb_prefetch) *
+ sbi->s_mb_prefetch;
+ nr = nr + sbi->s_mb_prefetch - group;
+ }
+ prefetch_grp = ext4_mb_prefetch(sb, group,
+ nr, &prefetch_ios);
+ if (prefetch_ios == curr_ios)
+ nr = 0;
+ }
+
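
The nr computation above trims each batch so it ends on an s_mb_prefetch-aligned boundary, keeping prefetch windows aligned to flex_bg-sized chunks rather than sliding with the goal group. Worked example with invented numbers:

  #include <stdio.h>

  /* Illustration of the window arithmetic above: nr is cut back so the
   * batch ends at the next s_mb_prefetch-aligned boundary. */
  int main(void)
  {
  	unsigned int s_mb_prefetch = 128, group = 200;
  	unsigned int nr = (group / s_mb_prefetch) * s_mb_prefetch;	/* 128 */

  	nr = nr + s_mb_prefetch - group;				/* 56  */
  	printf("prefetch %u groups starting at group %u\n", nr, group);
  	return 0;
  }
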
/* This now checks without needing the buddy page */
ret = ext4_mb_good_group(ac, group, cr);
if (ret <= 0) {
@@ -2261,6 +2384,10 @@
out:
if (!err && ac->ac_status != AC_STATUS_FOUND && first_err)
err = first_err;
+
+ if (nr)
+ ext4_mb_prefetch_fini(sb, prefetch_grp, nr);
+
return err;
}
@@ -2520,6 +2647,26 @@
goto err_freebuddy;
}
+ if (ext4_has_feature_flex_bg(sb)) {
+ /* a single flex group is supposed to be read by a single IO */
+ sbi->s_mb_prefetch = 1 << sbi->s_es->s_log_groups_per_flex;
+ sbi->s_mb_prefetch *= 8; /* 8 prefetch IOs in flight at most */
+ } else {
+ sbi->s_mb_prefetch = 32;
+ }
+ if (sbi->s_mb_prefetch > ext4_get_groups_count(sb))
+ sbi->s_mb_prefetch = ext4_get_groups_count(sb);
+	/* How many real IOs to prefetch within a single allocation at cr=0:
+	 * since cr=0 is a CPU-bound optimization we shouldn't try to
+	 * load too many groups; at some point we should start to use what
+	 * we've got in memory.
+	 * With an average random access time of 5ms, it'd take a second to get
+	 * 200 groups (* N with flex_bg), so let's make this limit 4.
+	 */
+ sbi->s_mb_prefetch_limit = sbi->s_mb_prefetch * 4;
+ if (sbi->s_mb_prefetch_limit > ext4_get_groups_count(sb))
+ sbi->s_mb_prefetch_limit = ext4_get_groups_count(sb);
+
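
For a concrete sizing under these defaults: with flex_bg and s_log_groups_per_flex = 4, a flex group holds 16 block groups, so one batch prefetches 128 bitmaps and the cr=0/1 budget is 512 IOs; both values are additionally clamped to the total group count as in the hunk. The arithmetic, with assumed values:

  #include <stdio.h>

  /* Worked example of the prefetch sizing above, assuming
   * s_log_groups_per_flex = 4 (16 groups per flex group). */
  int main(void)
  {
  	unsigned int log_groups_per_flex = 4;
  	unsigned int prefetch = (1u << log_groups_per_flex) * 8;	/* 128 */
  	unsigned int limit = prefetch * 4;				/* 512 */

  	printf("s_mb_prefetch=%u s_mb_prefetch_limit=%u\n", prefetch, limit);
  	return 0;
  }
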
return 0;
err_freebuddy:
diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c
index b66b335..e80999f 100644
--- a/fs/ext4/page-io.c
+++ b/fs/ext4/page-io.c
@@ -31,18 +31,56 @@
#include "acl.h"
static struct kmem_cache *io_end_cachep;
+static struct kmem_cache *io_end_vec_cachep;
int __init ext4_init_pageio(void)
{
io_end_cachep = KMEM_CACHE(ext4_io_end, SLAB_RECLAIM_ACCOUNT);
if (io_end_cachep == NULL)
return -ENOMEM;
+
+ io_end_vec_cachep = KMEM_CACHE(ext4_io_end_vec, 0);
+ if (io_end_vec_cachep == NULL) {
+ kmem_cache_destroy(io_end_cachep);
+ return -ENOMEM;
+ }
return 0;
}
void ext4_exit_pageio(void)
{
kmem_cache_destroy(io_end_cachep);
+ kmem_cache_destroy(io_end_vec_cachep);
+}
+
+struct ext4_io_end_vec *ext4_alloc_io_end_vec(ext4_io_end_t *io_end)
+{
+ struct ext4_io_end_vec *io_end_vec;
+
+ io_end_vec = kmem_cache_zalloc(io_end_vec_cachep, GFP_NOFS);
+ if (!io_end_vec)
+ return ERR_PTR(-ENOMEM);
+ INIT_LIST_HEAD(&io_end_vec->list);
+ list_add_tail(&io_end_vec->list, &io_end->list_vec);
+ return io_end_vec;
+}
+
+static void ext4_free_io_end_vec(ext4_io_end_t *io_end)
+{
+ struct ext4_io_end_vec *io_end_vec, *tmp;
+
+ if (list_empty(&io_end->list_vec))
+ return;
+ list_for_each_entry_safe(io_end_vec, tmp, &io_end->list_vec, list) {
+ list_del(&io_end_vec->list);
+ kmem_cache_free(io_end_vec_cachep, io_end_vec);
+ }
+}
+
+struct ext4_io_end_vec *ext4_last_io_end_vec(ext4_io_end_t *io_end)
+{
+ BUG_ON(list_empty(&io_end->list_vec));
+ return list_last_entry(&io_end->list_vec, struct ext4_io_end_vec, list);
}
/*
@@ -127,6 +165,7 @@
ext4_finish_bio(bio);
bio_put(bio);
}
+ ext4_free_io_end_vec(io_end);
kmem_cache_free(io_end_cachep, io_end);
}
@@ -138,29 +177,26 @@
* cannot get to ext4_ext_truncate() before all IOs overlapping that range are
* completed (happens from ext4_free_ioend()).
*/
-static int ext4_end_io(ext4_io_end_t *io)
+static int ext4_end_io_end(ext4_io_end_t *io_end)
{
- struct inode *inode = io->inode;
- loff_t offset = io->offset;
- ssize_t size = io->size;
- handle_t *handle = io->handle;
+ struct inode *inode = io_end->inode;
+ handle_t *handle = io_end->handle;
int ret = 0;
- ext4_debug("ext4_end_io_nolock: io 0x%p from inode %lu,list->next 0x%p,"
+ ext4_debug("ext4_end_io_nolock: io_end 0x%p from inode %lu,list->next 0x%p,"
"list->prev 0x%p\n",
- io, inode->i_ino, io->list.next, io->list.prev);
+ io_end, inode->i_ino, io_end->list.next, io_end->list.prev);
- io->handle = NULL; /* Following call will use up the handle */
- ret = ext4_convert_unwritten_extents(handle, inode, offset, size);
+ io_end->handle = NULL; /* Following call will use up the handle */
+ ret = ext4_convert_unwritten_io_end_vec(handle, io_end);
if (ret < 0 && !ext4_forced_shutdown(EXT4_SB(inode->i_sb))) {
ext4_msg(inode->i_sb, KERN_EMERG,
"failed to convert unwritten extents to written "
"extents -- potential data loss! "
- "(inode %lu, offset %llu, size %zd, error %d)",
- inode->i_ino, offset, size, ret);
+ "(inode %lu, error %d)", inode->i_ino, ret);
}
- ext4_clear_io_unwritten_flag(io);
- ext4_release_io_end(io);
+ ext4_clear_io_unwritten_flag(io_end);
+ ext4_release_io_end(io_end);
return ret;
}
@@ -168,21 +204,21 @@
{
#ifdef EXT4FS_DEBUG
struct list_head *cur, *before, *after;
- ext4_io_end_t *io, *io0, *io1;
+ ext4_io_end_t *io_end, *io_end0, *io_end1;
if (list_empty(head))
return;
ext4_debug("Dump inode %lu completed io list\n", inode->i_ino);
- list_for_each_entry(io, head, list) {
- cur = &io->list;
+ list_for_each_entry(io_end, head, list) {
+ cur = &io_end->list;
before = cur->prev;
- io0 = container_of(before, ext4_io_end_t, list);
+ io_end0 = container_of(before, ext4_io_end_t, list);
after = cur->next;
- io1 = container_of(after, ext4_io_end_t, list);
+ io_end1 = container_of(after, ext4_io_end_t, list);
ext4_debug("io 0x%p from inode %lu,prev 0x%p,next 0x%p\n",
- io, inode->i_ino, io0, io1);
+ io_end, inode->i_ino, io_end0, io_end1);
}
#endif
}
@@ -209,7 +245,7 @@
static int ext4_do_flush_completed_IO(struct inode *inode,
struct list_head *head)
{
- ext4_io_end_t *io;
+ ext4_io_end_t *io_end;
struct list_head unwritten;
unsigned long flags;
struct ext4_inode_info *ei = EXT4_I(inode);
@@ -221,11 +257,11 @@
spin_unlock_irqrestore(&ei->i_completed_io_lock, flags);
while (!list_empty(&unwritten)) {
- io = list_entry(unwritten.next, ext4_io_end_t, list);
- BUG_ON(!(io->flag & EXT4_IO_END_UNWRITTEN));
- list_del_init(&io->list);
+ io_end = list_entry(unwritten.next, ext4_io_end_t, list);
+ BUG_ON(!(io_end->flag & EXT4_IO_END_UNWRITTEN));
+ list_del_init(&io_end->list);
- err = ext4_end_io(io);
+ err = ext4_end_io_end(io_end);
if (unlikely(!ret && err))
ret = err;
}
@@ -244,19 +280,22 @@
ext4_io_end_t *ext4_init_io_end(struct inode *inode, gfp_t flags)
{
- ext4_io_end_t *io = kmem_cache_zalloc(io_end_cachep, flags);
- if (io) {
- io->inode = inode;
- INIT_LIST_HEAD(&io->list);
- atomic_set(&io->count, 1);
+ ext4_io_end_t *io_end = kmem_cache_zalloc(io_end_cachep, flags);
+
+ if (io_end) {
+ io_end->inode = inode;
+ INIT_LIST_HEAD(&io_end->list);
+ INIT_LIST_HEAD(&io_end->list_vec);
+ atomic_set(&io_end->count, 1);
}
- return io;
+ return io_end;
}
void ext4_put_io_end_defer(ext4_io_end_t *io_end)
{
if (atomic_dec_and_test(&io_end->count)) {
- if (!(io_end->flag & EXT4_IO_END_UNWRITTEN) || !io_end->size) {
+ if (!(io_end->flag & EXT4_IO_END_UNWRITTEN) ||
+ list_empty(&io_end->list_vec)) {
ext4_release_io_end(io_end);
return;
}
@@ -270,9 +309,8 @@
if (atomic_dec_and_test(&io_end->count)) {
if (io_end->flag & EXT4_IO_END_UNWRITTEN) {
- err = ext4_convert_unwritten_extents(io_end->handle,
- io_end->inode, io_end->offset,
- io_end->size);
+ err = ext4_convert_unwritten_io_end_vec(io_end->handle,
+ io_end);
io_end->handle = NULL;
ext4_clear_io_unwritten_flag(io_end);
}
@@ -309,10 +347,8 @@
struct inode *inode = io_end->inode;
ext4_warning(inode->i_sb, "I/O error %d writing to inode %lu "
- "(offset %llu size %ld starting block %llu)",
+			     "starting block %llu",
bio->bi_status, inode->i_ino,
- (unsigned long long) io_end->offset,
- (long) io_end->size,
(unsigned long long)
bi_sector >> (inode->i_blkbits - 9));
mapping_set_error(inode->i_mapping,
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index eba2506..f878d91 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -1475,6 +1475,7 @@
Opt_dioread_nolock, Opt_dioread_lock,
Opt_discard, Opt_nodiscard, Opt_init_itable, Opt_noinit_itable,
Opt_max_dir_size_kb, Opt_nojournal_checksum, Opt_nombcache,
+ Opt_prefetch_block_bitmaps,
};
static const match_table_t tokens = {
@@ -1561,6 +1562,7 @@
{Opt_test_dummy_encryption, "test_dummy_encryption"},
{Opt_nombcache, "nombcache"},
{Opt_nombcache, "no_mbcache"}, /* for backward compatibility */
+ {Opt_prefetch_block_bitmaps, "prefetch_block_bitmaps"},
{Opt_removed, "check=none"}, /* mount option from ext2/3 */
{Opt_removed, "nocheck"}, /* mount option from ext2/3 */
{Opt_removed, "reservation"}, /* mount option from ext2/3 */
@@ -1772,6 +1774,8 @@
{Opt_max_dir_size_kb, 0, MOPT_GTE0},
{Opt_test_dummy_encryption, 0, MOPT_GTE0},
{Opt_nombcache, EXT4_MOUNT_NO_MBCACHE, MOPT_SET},
+ {Opt_prefetch_block_bitmaps, EXT4_MOUNT_PREFETCH_BLOCK_BITMAPS,
+ MOPT_SET},
{Opt_err, 0, 0}
};
@@ -2085,7 +2089,7 @@
unsigned int *journal_ioprio,
int is_remount)
{
- struct ext4_sb_info *sbi = EXT4_SB(sb);
+ struct ext4_sb_info __maybe_unused *sbi = EXT4_SB(sb);
char *p, __maybe_unused *usr_qf_name, __maybe_unused *grp_qf_name;
substring_t args[MAX_OPT_ARGS];
int token;
@@ -2139,16 +2143,6 @@
}
}
#endif
- if (test_opt(sb, DIOREAD_NOLOCK)) {
- int blocksize =
- BLOCK_SIZE << le32_to_cpu(sbi->s_es->s_log_block_size);
-
- if (blocksize < PAGE_SIZE) {
- ext4_msg(sb, KERN_ERR, "can't mount with "
- "dioread_nolock if block size != PAGE_SIZE");
- return 0;
- }
- }
return 1;
}
@@ -3070,15 +3064,34 @@
static int ext4_run_li_request(struct ext4_li_request *elr)
{
struct ext4_group_desc *gdp = NULL;
- ext4_group_t group, ngroups;
- struct super_block *sb;
+ struct super_block *sb = elr->lr_super;
+ ext4_group_t ngroups = EXT4_SB(sb)->s_groups_count;
+ ext4_group_t group = elr->lr_next_group;
+ unsigned int prefetch_ios = 0;
int ret = 0;
u64 start_time;
- sb = elr->lr_super;
- ngroups = EXT4_SB(sb)->s_groups_count;
+ if (elr->lr_mode == EXT4_LI_MODE_PREFETCH_BBITMAP) {
+ elr->lr_next_group = ext4_mb_prefetch(sb, group,
+ EXT4_SB(sb)->s_mb_prefetch, &prefetch_ios);
+ if (prefetch_ios)
+ ext4_mb_prefetch_fini(sb, elr->lr_next_group,
+ prefetch_ios);
+ trace_ext4_prefetch_bitmaps(sb, group, elr->lr_next_group,
+ prefetch_ios);
+ if (group >= elr->lr_next_group) {
+ ret = 1;
+ if (elr->lr_first_not_zeroed != ngroups &&
+ !sb_rdonly(sb) && test_opt(sb, INIT_INODE_TABLE)) {
+ elr->lr_next_group = elr->lr_first_not_zeroed;
+ elr->lr_mode = EXT4_LI_MODE_ITABLE;
+ ret = 0;
+ }
+ }
+ return ret;
+ }
- for (group = elr->lr_next_group; group < ngroups; group++) {
+ for (; group < ngroups; group++) {
gdp = ext4_get_group_desc(sb, group, NULL);
if (!gdp) {
ret = 1;
@@ -3096,9 +3109,10 @@
start_time = ktime_get_real_ns();
ret = ext4_init_inode_table(sb, group,
elr->lr_timeout ? 0 : 1);
+ trace_ext4_lazy_itable_init(sb, group);
if (elr->lr_timeout == 0) {
elr->lr_timeout = nsecs_to_jiffies((ktime_get_real_ns() - start_time) *
- elr->lr_sbi->s_li_wait_mult);
+ EXT4_SB(elr->lr_super)->s_li_wait_mult);
}
elr->lr_next_sched = jiffies + elr->lr_timeout;
elr->lr_next_group = group + 1;
@@ -3112,15 +3126,11 @@
*/
static void ext4_remove_li_request(struct ext4_li_request *elr)
{
- struct ext4_sb_info *sbi;
-
if (!elr)
return;
- sbi = elr->lr_sbi;
-
list_del(&elr->lr_request);
- sbi->s_li_request = NULL;
+ EXT4_SB(elr->lr_super)->s_li_request = NULL;
kfree(elr);
}
@@ -3329,7 +3339,6 @@
static struct ext4_li_request *ext4_li_request_new(struct super_block *sb,
ext4_group_t start)
{
- struct ext4_sb_info *sbi = EXT4_SB(sb);
struct ext4_li_request *elr;
elr = kzalloc(sizeof(*elr), GFP_KERNEL);
@@ -3337,8 +3346,13 @@
return NULL;
elr->lr_super = sb;
- elr->lr_sbi = sbi;
- elr->lr_next_group = start;
+ elr->lr_first_not_zeroed = start;
+ if (test_opt(sb, PREFETCH_BLOCK_BITMAPS))
+ elr->lr_mode = EXT4_LI_MODE_PREFETCH_BBITMAP;
+ else {
+ elr->lr_mode = EXT4_LI_MODE_ITABLE;
+ elr->lr_next_group = start;
+ }
/*
* Randomize first schedule time of the request to
@@ -3356,6 +3370,7 @@
struct ext4_sb_info *sbi = EXT4_SB(sb);
struct ext4_li_request *elr = NULL;
ext4_group_t ngroups = sbi->s_groups_count;
+ enum ext4_li_mode lr_mode = EXT4_LI_MODE_ITABLE;
int ret = 0;
mutex_lock(&ext4_li_mtx);
@@ -3368,8 +3383,10 @@
goto out;
}
- if (first_not_zeroed == ngroups || sb_rdonly(sb) ||
- !test_opt(sb, INIT_INODE_TABLE))
+ if (test_opt(sb, PREFETCH_BLOCK_BITMAPS)) {
+ lr_mode = EXT4_LI_MODE_PREFETCH_BBITMAP;
+ } else if (first_not_zeroed == ngroups || sb_rdonly(sb) ||
+ !test_opt(sb, INIT_INODE_TABLE))
goto out;
elr = ext4_li_request_new(sb, first_not_zeroed);
diff --git a/fs/ext4/sysfs.c b/fs/ext4/sysfs.c
index 9394360..254435e 100644
--- a/fs/ext4/sysfs.c
+++ b/fs/ext4/sysfs.c
@@ -200,6 +200,8 @@
EXT4_ATTR(first_error_time, 0444, first_error_time);
EXT4_ATTR(last_error_time, 0444, last_error_time);
EXT4_ATTR(journal_task, 0444, journal_task);
+EXT4_RW_ATTR_SBI_UI(mb_prefetch, s_mb_prefetch);
+EXT4_RW_ATTR_SBI_UI(mb_prefetch_limit, s_mb_prefetch_limit);
static unsigned int old_bump_val = 128;
EXT4_ATTR_PTR(max_writeback_mb_bump, 0444, pointer_ui, &old_bump_val);
@@ -231,6 +233,8 @@
ATTR_LIST(first_error_time),
ATTR_LIST(last_error_time),
ATTR_LIST(journal_task),
+ ATTR_LIST(mb_prefetch),
+ ATTR_LIST(mb_prefetch_limit),
NULL,
};
ATTRIBUTE_GROUPS(ext4);
diff --git a/fs/file_table.c b/fs/file_table.c
index 70e8fb6..5604fe8d 100644
--- a/fs/file_table.c
+++ b/fs/file_table.c
@@ -278,6 +278,7 @@
}
if (file->f_op->release)
file->f_op->release(inode, file);
+ security_file_pre_free(file);
if (unlikely(S_ISCHR(inode->i_mode) && inode->i_cdev != NULL &&
!(mode & FMODE_PATH))) {
cdev_put(inode->i_cdev);
diff --git a/fs/proc/Kconfig b/fs/proc/Kconfig
index cb5629bd..9f650c7 100644
--- a/fs/proc/Kconfig
+++ b/fs/proc/Kconfig
@@ -103,3 +103,11 @@
config PROC_PID_ARCH_STATUS
def_bool n
depends on PROC_FS
+
+config PROC_SELF_MEM_READONLY
+ bool "Force /proc/<pid>/mem paths to be read-only"
+ default y
+ help
+ When enabled, attempts to open /proc/self/mem for write access
+ will always fail. Write access to this file allows bypassing
+ of memory map permissions (such as modifying read-only code).
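
A quick userspace check of the behavior this option promises: the mem node loses S_IWUSR, so an unprivileged open(O_RDWR) fails outright, and even a privileged opener (e.g. via CAP_DAC_OVERRIDE) hits the -EACCES returned by mem_write() in the hunk below.

  #include <fcntl.h>
  #include <stdio.h>
  #include <unistd.h>

  /* Demonstration of CONFIG_PROC_SELF_MEM_READONLY=y behavior. */
  int main(void)
  {
  	char c = 0;
  	int fd = open("/proc/self/mem", O_RDWR);

  	if (fd < 0) {
  		perror("open");		/* expected: EACCES */
  		return 1;
  	}
  	if (pwrite(fd, &c, 1, 0) < 0)
  		perror("pwrite");	/* expected: EACCES */
  	close(fd);
  	return 0;
  }
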
diff --git a/fs/proc/base.c b/fs/proc/base.c
index 5a187e9..020839e 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -148,6 +148,12 @@
NULL, &proc_pid_attr_operations, \
{ .lsm = LSM })
+#ifdef CONFIG_PROC_SELF_MEM_READONLY
+# define PROC_PID_MEM_MODE S_IRUSR
+#else
+# define PROC_PID_MEM_MODE (S_IRUSR|S_IWUSR)
+#endif
+
/*
* Count the number of hardlinks for the pid_entry table, excluding the .
* and .. links.
@@ -886,7 +892,11 @@
static ssize_t mem_write(struct file *file, const char __user *buf,
size_t count, loff_t *ppos)
{
+#ifdef CONFIG_PROC_SELF_MEM_READONLY
+ return -EACCES;
+#else
return mem_rw(file, (char __user*)buf, count, ppos, 1);
+#endif
}
loff_t mem_lseek(struct file *file, loff_t offset, int orig)
@@ -3048,7 +3058,7 @@
#ifdef CONFIG_NUMA
REG("numa_maps", S_IRUGO, proc_pid_numa_maps_operations),
#endif
- REG("mem", S_IRUSR|S_IWUSR, proc_mem_operations),
+ REG("mem", PROC_PID_MEM_MODE, proc_mem_operations),
LNK("cwd", proc_cwd_link),
LNK("root", proc_root_link),
LNK("exe", proc_exe_link),
@@ -3449,7 +3459,7 @@
#ifdef CONFIG_NUMA
REG("numa_maps", S_IRUGO, proc_pid_numa_maps_operations),
#endif
- REG("mem", S_IRUSR|S_IWUSR, proc_mem_operations),
+ REG("mem", PROC_PID_MEM_MODE, proc_mem_operations),
LNK("cwd", proc_cwd_link),
LNK("root", proc_root_link),
LNK("exe", proc_exe_link),
diff --git a/include/linux/audit.h b/include/linux/audit.h
index aee3dc9..d887925 100644
--- a/include/linux/audit.h
+++ b/include/linux/audit.h
@@ -94,6 +94,17 @@
struct audit_ntp_data {};
#endif
+struct audit_task_info {
+ kuid_t loginuid;
+ unsigned int sessionid;
+ u64 contid;
+#ifdef CONFIG_AUDITSYSCALL
+ struct audit_context *ctx;
+#endif
+};
+
+extern struct audit_task_info init_struct_audit;
+
extern int is_audit_feature_set(int which);
extern int __init audit_register_class(int class, unsigned *list);
@@ -130,6 +141,9 @@
#ifdef CONFIG_AUDIT
/* These are defined in audit.c */
/* Public API */
+extern int audit_alloc(struct task_struct *task);
+extern void audit_free(struct task_struct *task);
+extern void __init audit_task_init(void);
extern __printf(4, 5)
void audit_log(struct audit_context *ctx, gfp_t gfp_mask, int type,
const char *fmt, ...);
@@ -172,12 +186,25 @@
static inline kuid_t audit_get_loginuid(struct task_struct *tsk)
{
- return tsk->loginuid;
+ if (!tsk->audit)
+ return INVALID_UID;
+ return tsk->audit->loginuid;
}
static inline unsigned int audit_get_sessionid(struct task_struct *tsk)
{
- return tsk->sessionid;
+ if (!tsk->audit)
+ return AUDIT_SID_UNSET;
+ return tsk->audit->sessionid;
+}
+
+extern int audit_set_contid(struct task_struct *tsk, u64 contid);
+
+static inline u64 audit_get_contid(struct task_struct *tsk)
+{
+ if (!tsk->audit)
+ return AUDIT_CID_UNSET;
+ return tsk->audit->contid;
}
extern u32 audit_enabled;
@@ -185,6 +212,14 @@
extern int audit_signal_info(int sig, struct task_struct *t);
#else /* CONFIG_AUDIT */
+static inline int audit_alloc(struct task_struct *task)
+{
+ return 0;
+}
+static inline void audit_free(struct task_struct *task)
+{ }
+static inline void __init audit_task_init(void)
+{ }
static inline __printf(4, 5)
void audit_log(struct audit_context *ctx, gfp_t gfp_mask, int type,
const char *fmt, ...)
@@ -236,6 +271,11 @@
return AUDIT_SID_UNSET;
}
+static inline u64 audit_get_contid(struct task_struct *tsk)
+{
+ return AUDIT_CID_UNSET;
+}
+
#define audit_enabled AUDIT_OFF
static inline int audit_signal_info(int sig, struct task_struct *t)
@@ -260,8 +300,6 @@
/* These are defined in auditsc.c */
/* Public API */
-extern int audit_alloc(struct task_struct *task);
-extern void __audit_free(struct task_struct *task);
extern void __audit_syscall_entry(int major, unsigned long a0, unsigned long a1,
unsigned long a2, unsigned long a3);
extern void __audit_syscall_exit(int ret_success, long ret_value);
@@ -281,12 +319,14 @@
static inline void audit_set_context(struct task_struct *task, struct audit_context *ctx)
{
- task->audit_context = ctx;
+ task->audit->ctx = ctx;
}
static inline struct audit_context *audit_context(void)
{
- return current->audit_context;
+ if (!current->audit)
+ return NULL;
+ return current->audit->ctx;
}
static inline bool audit_dummy_context(void)
@@ -294,11 +334,7 @@
void *p = audit_context();
return !p || *(int *)p;
}
-static inline void audit_free(struct task_struct *task)
-{
- if (unlikely(task->audit_context))
- __audit_free(task);
-}
+
static inline void audit_syscall_entry(int major, unsigned long a0,
unsigned long a1, unsigned long a2,
unsigned long a3)
@@ -516,12 +552,6 @@
extern int audit_n_rules;
extern int audit_signals;
#else /* CONFIG_AUDITSYSCALL */
-static inline int audit_alloc(struct task_struct *task)
-{
- return 0;
-}
-static inline void audit_free(struct task_struct *task)
-{ }
static inline void audit_syscall_entry(int major, unsigned long a0,
unsigned long a1, unsigned long a2,
unsigned long a3)
@@ -654,6 +684,16 @@
return uid_valid(audit_get_loginuid(tsk));
}
+static inline bool audit_contid_valid(u64 contid)
+{
+ return contid != AUDIT_CID_UNSET;
+}
+
+static inline bool audit_contid_set(struct task_struct *tsk)
+{
+ return audit_contid_valid(audit_get_contid(tsk));
+}
+
static inline void audit_log_string(struct audit_buffer *ab, const char *buf)
{
audit_log_n_string(ab, buf, strlen(buf));
diff --git a/include/linux/bpf-cgroup.h b/include/linux/bpf-cgroup.h
index 169fd25..359fdae 100644
--- a/include/linux/bpf-cgroup.h
+++ b/include/linux/bpf-cgroup.h
@@ -378,6 +378,7 @@
}
#define cgroup_bpf_enabled (0)
+#define BPF_CGROUP_RUN_SA_PROG_LOCK(sk, uaddr, type, t_ctx) ({ 0; })
#define BPF_CGROUP_PRE_CONNECT_ENABLED(sk) (0)
#define BPF_CGROUP_RUN_PROG_INET_INGRESS(sk,skb) ({ 0; })
#define BPF_CGROUP_RUN_PROG_INET_EGRESS(sk,skb) ({ 0; })
diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 5705cda..a30c7fe 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -205,6 +205,7 @@
ARG_CONST_SIZE_OR_ZERO, /* number of bytes accessed from memory or 0 */
ARG_PTR_TO_CTX, /* pointer to context */
+ ARG_PTR_TO_CTX_OR_NULL, /* pointer to context or NULL */
ARG_ANYTHING, /* any (initialized) argument is ok */
ARG_PTR_TO_SPIN_LOCK, /* pointer to bpf_spin_lock */
ARG_PTR_TO_SOCK_COMMON, /* pointer to sock_common */
@@ -1082,6 +1083,7 @@
extern const struct bpf_func_proto bpf_strtol_proto;
extern const struct bpf_func_proto bpf_strtoul_proto;
extern const struct bpf_func_proto bpf_tcp_sock_proto;
+extern const struct bpf_func_proto bpf_jiffies64_proto;
/* Shared helpers among cBPF and eBPF. */
void bpf_user_rnd_init_once(void);
diff --git a/include/linux/device.h b/include/linux/device.h
index 3414b5a..4204396 100644
--- a/include/linux/device.h
+++ b/include/linux/device.h
@@ -993,6 +993,7 @@
* sg limitations.
*/
unsigned int max_segment_size;
+ unsigned int min_align_mask;
unsigned long segment_boundary_mask;
};
diff --git a/include/linux/dma-mapping.h b/include/linux/dma-mapping.h
index 4d45067..953e1e3 100644
--- a/include/linux/dma-mapping.h
+++ b/include/linux/dma-mapping.h
@@ -751,6 +751,22 @@
return -EIO;
}
+static inline unsigned int dma_get_min_align_mask(struct device *dev)
+{
+ if (dev->dma_parms)
+ return dev->dma_parms->min_align_mask;
+ return 0;
+}
+
+static inline int dma_set_min_align_mask(struct device *dev,
+ unsigned int min_align_mask)
+{
+ if (WARN_ON_ONCE(!dev->dma_parms))
+ return -EIO;
+ dev->dma_parms->min_align_mask = min_align_mask;
+ return 0;
+}
+
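
dma_set_min_align_mask() lets a driver demand that bounce buffering (e.g. swiotlb) preserve the buffer's low address bits under the mask, so intra-page offsets the device computes remain valid; the NVMe hunk earlier sets it to the controller page size minus one. Illustrative sketch of the guarantee, with invented addresses:

  #include <stdio.h>

  /* Sketch of what min_align_mask guarantees: a bounce address keeps
   * the original address's low bits under the mask. */
  int main(void)
  {
  	unsigned long mask = 4096 - 1;		/* dev page size - 1   */
  	unsigned long orig = 0x12345678;
  	unsigned long slot = 0xabcde000;	/* aligned bounce slot */
  	unsigned long bounce = slot | (orig & mask);

  	printf("orig offset 0x%lx == bounce offset 0x%lx\n",
  	       orig & mask, bounce & mask);
  	return 0;
  }
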
static inline int dma_get_cache_alignment(void)
{
#ifdef ARCH_DMA_MINALIGN
diff --git a/include/linux/lsm_hooks.h b/include/linux/lsm_hooks.h
index a21dc54..22d4ce1 100644
--- a/include/linux/lsm_hooks.h
+++ b/include/linux/lsm_hooks.h
@@ -489,6 +489,10 @@
* @file_free_security:
* Deallocate and free any security structures stored in file->f_security.
* @file contains the file structure being modified.
+ * @file_pre_free_security:
+ * Perform any logging or LSM state updates for a file being deleted
+ * using fields of the file before they have been cleared.
+ *	@file contains the file structure being freed.
* @file_ioctl:
* @file contains the file structure.
* @cmd contains the operation to perform.
@@ -565,6 +569,10 @@
* @clone_flags contains the flags indicating what should be shared.
* Handle allocation of task-related resources.
* Returns a zero on success, negative values on failure.
+ * @task_post_alloc:
+ * @task task being allocated.
+ * Handle allocation of task-related resources after all task fields are
+ * filled in.
* @task_free:
* @task task about to be freed.
* Handle release of task-related resources. (Note that this can be called
@@ -713,6 +721,9 @@
* @cred contains the cred of the process where the signal originated, or
* NULL if the current task is the originator.
* Return 0 if permission is granted.
+ * @task_exit:
+ * Called early when a task is exiting before all state is lost.
+ *	@p contains the task_struct for the process.
* @task_prctl:
* Check permission before performing a process control operation on the
* current process.
@@ -1600,6 +1611,7 @@
int (*file_permission)(struct file *file, int mask);
int (*file_alloc_security)(struct file *file);
void (*file_free_security)(struct file *file);
+ void (*file_pre_free_security)(struct file *file);
int (*file_ioctl)(struct file *file, unsigned int cmd,
unsigned long arg);
int (*mmap_addr)(unsigned long addr);
@@ -1617,6 +1629,7 @@
int (*file_open)(struct file *file);
int (*task_alloc)(struct task_struct *task, unsigned long clone_flags);
+	void (*task_post_alloc)(struct task_struct *task); /* Do not upstream. */
void (*task_free)(struct task_struct *task);
int (*cred_alloc_blank)(struct cred *cred, gfp_t gfp);
void (*cred_free)(struct cred *cred);
@@ -1649,6 +1662,7 @@
int (*task_movememory)(struct task_struct *p);
int (*task_kill)(struct task_struct *p, struct kernel_siginfo *info,
int sig, const struct cred *cred);
+ void (*task_exit)(struct task_struct *p);
int (*task_prctl)(int option, unsigned long arg2, unsigned long arg3,
unsigned long arg4, unsigned long arg5);
void (*task_to_inode)(struct task_struct *p, struct inode *inode);
@@ -1818,6 +1832,14 @@
void (*bpf_prog_free_security)(struct bpf_prog_aux *aux);
#endif /* CONFIG_BPF_SYSCALL */
int (*locked_down)(enum lockdown_reason what);
+#ifdef CONFIG_PERF_EVENTS
+ int (*perf_event_open)(struct perf_event_attr *attr, int type);
+ int (*perf_event_alloc)(struct perf_event *event);
+ void (*perf_event_free)(struct perf_event *event);
+ int (*perf_event_read)(struct perf_event *event);
+ int (*perf_event_write)(struct perf_event *event);
+
+#endif
};
struct security_hook_heads {
@@ -1906,6 +1928,7 @@
struct hlist_head file_permission;
struct hlist_head file_alloc_security;
struct hlist_head file_free_security;
+ struct hlist_head file_pre_free_security;
struct hlist_head file_ioctl;
struct hlist_head mmap_addr;
struct hlist_head mmap_file;
@@ -1917,6 +1940,7 @@
struct hlist_head file_receive;
struct hlist_head file_open;
struct hlist_head task_alloc;
+ struct hlist_head task_post_alloc;
struct hlist_head task_free;
struct hlist_head cred_alloc_blank;
struct hlist_head cred_free;
@@ -1943,6 +1967,7 @@
struct hlist_head task_getscheduler;
struct hlist_head task_movememory;
struct hlist_head task_kill;
+ struct hlist_head task_exit;
struct hlist_head task_prctl;
struct hlist_head task_to_inode;
struct hlist_head ipc_permission;
@@ -2060,6 +2085,13 @@
struct hlist_head bpf_prog_free_security;
#endif /* CONFIG_BPF_SYSCALL */
struct hlist_head locked_down;
+#ifdef CONFIG_PERF_EVENTS
+ struct hlist_head perf_event_open;
+ struct hlist_head perf_event_alloc;
+ struct hlist_head perf_event_free;
+ struct hlist_head perf_event_read;
+ struct hlist_head perf_event_write;
+#endif
} __randomize_layout;
/*
diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index b7ac395..6fc704f 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -56,6 +56,7 @@
#include <linux/perf_regs.h>
#include <linux/cgroup.h>
#include <linux/refcount.h>
+#include <linux/security.h>
#include <asm/local.h>
struct perf_callchain_entry {
@@ -721,6 +722,9 @@
struct perf_cgroup *cgrp; /* cgroup event is attach to */
#endif
+#ifdef CONFIG_SECURITY
+ void *security;
+#endif
struct list_head sb_list;
#endif /* CONFIG_PERF_EVENTS */
};
@@ -1252,19 +1256,41 @@
int perf_event_max_stack_handler(struct ctl_table *table, int write,
void __user *buffer, size_t *lenp, loff_t *ppos);
-static inline bool perf_paranoid_tracepoint_raw(void)
+/* Access to perf_event_open(2) syscall. */
+#define PERF_SECURITY_OPEN 0
+
+/* Finer grained perf_event_open(2) access control. */
+#define PERF_SECURITY_CPU 1
+#define PERF_SECURITY_KERNEL 2
+#define PERF_SECURITY_TRACEPOINT 3
+
+static inline int perf_is_paranoid(void)
{
return sysctl_perf_event_paranoid > -1;
}
-static inline bool perf_paranoid_cpu(void)
+static inline int perf_allow_kernel(struct perf_event_attr *attr)
{
- return sysctl_perf_event_paranoid > 0;
+ if (sysctl_perf_event_paranoid > 1 && !capable(CAP_SYS_ADMIN))
+ return -EACCES;
+
+ return security_perf_event_open(attr, PERF_SECURITY_KERNEL);
}
-static inline bool perf_paranoid_kernel(void)
+static inline int perf_allow_cpu(struct perf_event_attr *attr)
{
- return sysctl_perf_event_paranoid > 1;
+ if (sysctl_perf_event_paranoid > 0 && !capable(CAP_SYS_ADMIN))
+ return -EACCES;
+
+ return security_perf_event_open(attr, PERF_SECURITY_CPU);
+}
+
+static inline int perf_allow_tracepoint(struct perf_event_attr *attr)
+{
+ if (sysctl_perf_event_paranoid > -1 && !capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
+ return security_perf_event_open(attr, PERF_SECURITY_TRACEPOINT);
}
extern void perf_event_init(void);
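
/*
 * Usage sketch (not part of the patch): how a caller is expected to use
 * the perf_allow_*() helpers above. The function name is illustrative;
 * the real call sites are in kernel/events/core.c.
 */
static int example_check_event(struct perf_event_attr *attr)
{
	int err;

	if (!attr->exclude_kernel) {
		/* -EACCES if paranoid and not CAP_SYS_ADMIN, else LSM verdict */
		err = perf_allow_kernel(attr);
		if (err)
			return err;
	}
	return 0;
}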
diff --git a/include/linux/pid_namespace.h b/include/linux/pid_namespace.h
index 49538b1..dded498 100644
--- a/include/linux/pid_namespace.h
+++ b/include/linux/pid_namespace.h
@@ -45,6 +45,9 @@
int hide_pid;
int reboot; /* group exit code if this pidns was rebooted */
struct ns_common ns;
+#ifdef CONFIG_SECURITY_CONTAINER_MONITOR
+ u64 cid; /* Main container identifier, zero if not assigned. */
+#endif
} __randomize_layout;
extern struct pid_namespace init_pid_ns;
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 171cb74..7ec87690 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -33,7 +33,6 @@
#include <linux/rseq.h>
/* task_struct member predeclarations (sorted alphabetically): */
-struct audit_context;
struct backing_dev_info;
struct bio_list;
struct blk_plug;
@@ -925,11 +924,7 @@
struct callback_head *task_works;
#ifdef CONFIG_AUDIT
-#ifdef CONFIG_AUDITSYSCALL
- struct audit_context *audit_context;
-#endif
- kuid_t loginuid;
- unsigned int sessionid;
+ struct audit_task_info *audit;
#endif
struct seccomp seccomp;
diff --git a/include/linux/security.h b/include/linux/security.h
index aa5c714..f77bdf5 100644
--- a/include/linux/security.h
+++ b/include/linux/security.h
@@ -361,6 +361,7 @@
int security_file_permission(struct file *file, int mask);
int security_file_alloc(struct file *file);
void security_file_free(struct file *file);
+void security_file_pre_free(struct file *file);
int security_file_ioctl(struct file *file, unsigned int cmd, unsigned long arg);
int security_mmap_file(struct file *file, unsigned long prot,
unsigned long flags);
@@ -375,6 +376,7 @@
int security_file_receive(struct file *file);
int security_file_open(struct file *file);
int security_task_alloc(struct task_struct *task, unsigned long clone_flags);
+void security_task_post_alloc(struct task_struct *task);
void security_task_free(struct task_struct *task);
int security_cred_alloc_blank(struct cred *cred, gfp_t gfp);
void security_cred_free(struct cred *cred);
@@ -406,6 +408,7 @@
int security_task_movememory(struct task_struct *p);
int security_task_kill(struct task_struct *p, struct kernel_siginfo *info,
int sig, const struct cred *cred);
+void security_task_exit(struct task_struct *p);
int security_task_prctl(int option, unsigned long arg2, unsigned long arg3,
unsigned long arg4, unsigned long arg5);
void security_task_to_inode(struct task_struct *p, struct inode *inode);
@@ -901,6 +904,9 @@
static inline void security_file_free(struct file *file)
{ }
+static inline void security_file_pre_free(struct file *file)
+{ }
+
static inline int security_file_ioctl(struct file *file, unsigned int cmd,
unsigned long arg)
{
@@ -964,6 +970,9 @@
return 0;
}
+static inline void security_task_post_alloc(struct task_struct *task)
+{ }
+
static inline void security_task_free(struct task_struct *task)
{ }
@@ -1104,6 +1113,9 @@
return 0;
}
+static inline void security_task_exit(struct task_struct *p)
+{ }
+
static inline int security_task_prctl(int option, unsigned long arg2,
unsigned long arg3,
unsigned long arg4,
@@ -1904,5 +1916,41 @@
#endif /* CONFIG_SECURITY */
#endif /* CONFIG_BPF_SYSCALL */
-#endif /* ! __LINUX_SECURITY_H */
+#ifdef CONFIG_PERF_EVENTS
+struct perf_event_attr;
+#ifdef CONFIG_SECURITY
+extern int security_perf_event_open(struct perf_event_attr *attr, int type);
+extern int security_perf_event_alloc(struct perf_event *event);
+extern void security_perf_event_free(struct perf_event *event);
+extern int security_perf_event_read(struct perf_event *event);
+extern int security_perf_event_write(struct perf_event *event);
+#else
+static inline int security_perf_event_open(struct perf_event_attr *attr,
+ int type)
+{
+ return 0;
+}
+
+static inline int security_perf_event_alloc(struct perf_event *event)
+{
+ return 0;
+}
+
+static inline void security_perf_event_free(struct perf_event *event)
+{
+}
+
+static inline int security_perf_event_read(struct perf_event *event)
+{
+ return 0;
+}
+
+static inline int security_perf_event_write(struct perf_event *event)
+{
+ return 0;
+}
+#endif /* CONFIG_SECURITY */
+#endif /* CONFIG_PERF_EVENTS */
+
+#endif /* ! __LINUX_SECURITY_H */
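
/*
 * LSM-side sketch (not part of the patch): a security module wires the new
 * perf hooks up through LSM_HOOK_INIT from <linux/lsm_hooks.h>; the
 * PERF_SECURITY_* type values come from <linux/perf_event.h>. The
 * example_* names are illustrative.
 */
static int example_perf_event_open(struct perf_event_attr *attr, int type)
{
	/* e.g. deny raw tracepoint access while allowing plain counters */
	return type == PERF_SECURITY_TRACEPOINT ? -EPERM : 0;
}

static struct security_hook_list example_hooks[] __lsm_ro_after_init = {
	LSM_HOOK_INIT(perf_event_open, example_perf_event_open),
};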
diff --git a/include/linux/swiotlb.h b/include/linux/swiotlb.h
index 0a8fced..a060b90 100644
--- a/include/linux/swiotlb.h
+++ b/include/linux/swiotlb.h
@@ -29,6 +29,10 @@
* controllable.
*/
#define IO_TLB_SHIFT 11
+#define IO_TLB_SIZE (1 << IO_TLB_SHIFT)
+
+/* default to 64MB */
+#define IO_TLB_DEFAULT_SIZE (64UL<<20)
extern void swiotlb_init(int verbose);
int swiotlb_init_with_tbl(char *tlb, unsigned long nslabs, int verbose);
@@ -80,6 +84,7 @@
unsigned int swiotlb_max_segment(void);
size_t swiotlb_max_mapping_size(struct device *dev);
bool is_swiotlb_active(void);
+void __init swiotlb_adjust_size(unsigned long new_size);
#else
#define swiotlb_force SWIOTLB_NO_FORCE
static inline bool is_swiotlb_buffer(phys_addr_t paddr)
@@ -108,6 +113,10 @@
{
return false;
}
+
+static inline void swiotlb_adjust_size(unsigned long new_size)
+{
+}
#endif /* CONFIG_SWIOTLB */
extern void swiotlb_print_info(void);
diff --git a/include/net/cls_cgroup.h b/include/net/cls_cgroup.h
index 4295de3..7e78e7d 100644
--- a/include/net/cls_cgroup.h
+++ b/include/net/cls_cgroup.h
@@ -45,9 +45,14 @@
sock_cgroup_set_classid(skcd, classid);
}
+static inline u32 __task_get_classid(struct task_struct *task)
+{
+ return task_cls_state(task)->classid;
+}
+
static inline u32 task_get_classid(const struct sk_buff *skb)
{
- u32 classid = task_cls_state(current)->classid;
+ u32 classid = __task_get_classid(current);
/* Due to the nature of the classifier it is required to ignore all
* packets originating from softirq context as accessing `current'
diff --git a/include/net/inet6_hashtables.h b/include/net/inet6_hashtables.h
index fe96bf2..81b9659 100644
--- a/include/net/inet6_hashtables.h
+++ b/include/net/inet6_hashtables.h
@@ -85,9 +85,8 @@
int iif, int sdif,
bool *refcounted)
{
- struct sock *sk = skb_steal_sock(skb);
+ struct sock *sk = skb_steal_sock(skb, refcounted);
- *refcounted = true;
if (sk)
return sk;
diff --git a/include/net/inet_hashtables.h b/include/net/inet_hashtables.h
index a186c24..d4d6110 100644
--- a/include/net/inet_hashtables.h
+++ b/include/net/inet_hashtables.h
@@ -386,10 +386,9 @@
const int sdif,
bool *refcounted)
{
- struct sock *sk = skb_steal_sock(skb);
+ struct sock *sk = skb_steal_sock(skb, refcounted);
const struct iphdr *iph = ip_hdr(skb);
- *refcounted = true;
if (sk)
return sk;
diff --git a/include/net/net_namespace.h b/include/net/net_namespace.h
index 167e390..c8bddcf0 100644
--- a/include/net/net_namespace.h
+++ b/include/net/net_namespace.h
@@ -165,6 +165,9 @@
#ifdef CONFIG_XFRM
struct netns_xfrm xfrm;
#endif
+
+ atomic64_t net_cookie; /* written once */
+
#if IS_ENABLED(CONFIG_IP_VS)
struct netns_ipvs *ipvs;
#endif
@@ -229,6 +232,8 @@
struct net *get_net_ns_by_pid(pid_t pid);
struct net *get_net_ns_by_fd(int fd);
+u64 net_gen_cookie(struct net *net);
+
#ifdef CONFIG_SYSCTL
void ipx_register_sysctl(void);
void ipx_unregister_sysctl(void);
diff --git a/include/net/sock.h b/include/net/sock.h
index 9d68707..89d2cfc 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -1623,6 +1623,7 @@
void sock_efree(struct sk_buff *skb);
#ifdef CONFIG_INET
void sock_edemux(struct sk_buff *skb);
+void sock_pfree(struct sk_buff *skb);
#else
#define sock_edemux sock_efree
#endif
@@ -2501,16 +2502,14 @@
write_pnet(&sk->sk_net, net);
}
-static inline struct sock *skb_steal_sock(struct sk_buff *skb)
+static inline bool
+skb_sk_is_prefetched(struct sk_buff *skb)
{
- if (skb->sk) {
- struct sock *sk = skb->sk;
-
- skb->destructor = NULL;
- skb->sk = NULL;
- return sk;
- }
- return NULL;
+#ifdef CONFIG_INET
+ return skb->destructor == sock_pfree;
+#else
+ return false;
+#endif /* CONFIG_INET */
}
/* This helper checks if a socket is a full socket,
@@ -2521,6 +2520,35 @@
return (1 << sk->sk_state) & ~(TCPF_TIME_WAIT | TCPF_NEW_SYN_RECV);
}
+static inline bool
+sk_is_refcounted(struct sock *sk)
+{
+ /* Only full sockets have sk->sk_flags. */
+ return !sk_fullsock(sk) || !sock_flag(sk, SOCK_RCU_FREE);
+}
+
+/**
+ * skb_steal_sock - steal a socket from an skb
+ * @skb: socket buffer to steal the socket from
+ * @refcounted: set to true if the stolen socket is reference-counted
+ */
+static inline struct sock *
+skb_steal_sock(struct sk_buff *skb, bool *refcounted)
+{
+ if (skb->sk) {
+ struct sock *sk = skb->sk;
+
+ *refcounted = true;
+ if (skb_sk_is_prefetched(skb))
+ *refcounted = sk_is_refcounted(sk);
+ skb->destructor = NULL;
+ skb->sk = NULL;
+ return sk;
+ }
+ *refcounted = false;
+ return NULL;
+}
+
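
/*
 * Caller sketch (not part of the patch): the reworked skb_steal_sock()
 * tells the caller whether the stolen socket holds a reference, so only
 * refcounted sockets get a sock_put() when the caller is done. The
 * function below is illustrative; the real callers are the inet/inet6
 * lookup helpers changed earlier in this diff.
 */
static inline void example_deliver(struct sk_buff *skb)
{
	bool refcounted;
	struct sock *sk = skb_steal_sock(skb, &refcounted);

	if (!sk)
		return;		/* fall back to a normal lookup */
	/* ... deliver skb to sk ... */
	if (refcounted)
		sock_put(sk);
}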
/* Checks if this SKB belongs to an HW offloaded socket
* and whether any SW fallbacks are required based on dev.
* Check decrypted mark in case skb_orphan() cleared socket.
diff --git a/include/trace/events/ext4.h b/include/trace/events/ext4.h
index d68e9e5..038f163 100644
--- a/include/trace/events/ext4.h
+++ b/include/trace/events/ext4.h
@@ -1292,18 +1292,34 @@
TP_ARGS(sb, group)
);
-DEFINE_EVENT(ext4__bitmap_load, ext4_read_block_bitmap_load,
+DEFINE_EVENT(ext4__bitmap_load, ext4_load_inode_bitmap,
TP_PROTO(struct super_block *sb, unsigned long group),
TP_ARGS(sb, group)
);
-DEFINE_EVENT(ext4__bitmap_load, ext4_load_inode_bitmap,
+TRACE_EVENT(ext4_read_block_bitmap_load,
+ TP_PROTO(struct super_block *sb, unsigned long group, bool prefetch),
- TP_PROTO(struct super_block *sb, unsigned long group),
+ TP_ARGS(sb, group, prefetch),
- TP_ARGS(sb, group)
+ TP_STRUCT__entry(
+ __field( dev_t, dev )
+ __field( __u32, group )
+ __field( bool, prefetch )
+
+ ),
+
+ TP_fast_assign(
+ __entry->dev = sb->s_dev;
+ __entry->group = group;
+ __entry->prefetch = prefetch;
+ ),
+
+ TP_printk("dev %d,%d group %u prefetch %d",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->group, __entry->prefetch)
);
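
/*
 * Consumption sketch (not part of the patch): the extended tracepoint can
 * be observed via tracefs; the paths below assume the usual debugfs mount
 * point and the sample output is illustrative.
 *
 *   # echo 1 > /sys/kernel/debug/tracing/events/ext4/ext4_read_block_bitmap_load/enable
 *   # cat /sys/kernel/debug/tracing/trace_pipe
 *   ... ext4_read_block_bitmap_load: dev 8,1 group 42 prefetch 1
 */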
TRACE_EVENT(ext4_direct_IO_enter,
@@ -2703,6 +2719,50 @@
__entry->function, __entry->line)
);
+TRACE_EVENT(ext4_prefetch_bitmaps,
+ TP_PROTO(struct super_block *sb, ext4_group_t group,
+ ext4_group_t next, unsigned int prefetch_ios),
+
+ TP_ARGS(sb, group, next, prefetch_ios),
+
+ TP_STRUCT__entry(
+ __field( dev_t, dev )
+ __field( __u32, group )
+ __field( __u32, next )
+ __field( __u32, ios )
+ ),
+
+ TP_fast_assign(
+ __entry->dev = sb->s_dev;
+ __entry->group = group;
+ __entry->next = next;
+ __entry->ios = prefetch_ios;
+ ),
+
+ TP_printk("dev %d,%d group %u next %u ios %u",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->group, __entry->next, __entry->ios)
+);
+
+TRACE_EVENT(ext4_lazy_itable_init,
+ TP_PROTO(struct super_block *sb, ext4_group_t group),
+
+ TP_ARGS(sb, group),
+
+ TP_STRUCT__entry(
+ __field( dev_t, dev )
+ __field( __u32, group )
+ ),
+
+ TP_fast_assign(
+ __entry->dev = sb->s_dev;
+ __entry->group = group;
+ ),
+
+ TP_printk("dev %d,%d group %u",
+ MAJOR(__entry->dev), MINOR(__entry->dev), __entry->group)
+);
+
#endif /* _TRACE_EXT4_H */
/* This part must be outside protection */
diff --git a/include/uapi/linux/audit.h b/include/uapi/linux/audit.h
index c89c649..5d0ea2a 100644
--- a/include/uapi/linux/audit.h
+++ b/include/uapi/linux/audit.h
@@ -71,6 +71,7 @@
#define AUDIT_TTY_SET 1017 /* Set TTY auditing status */
#define AUDIT_SET_FEATURE 1018 /* Turn an audit feature on or off */
#define AUDIT_GET_FEATURE 1019 /* Get which features are enabled */
+#define AUDIT_CONTAINER_OP 1020 /* Define the container id and info */
#define AUDIT_FIRST_USER_MSG 1100 /* Userspace messages mostly uninteresting to kernel */
#define AUDIT_USER_AVC 1107 /* We filter this differently */
@@ -488,6 +489,7 @@
#define AUDIT_UID_UNSET (unsigned int)-1
#define AUDIT_SID_UNSET ((unsigned int)-1)
+#define AUDIT_CID_UNSET ((u64)-1)
/* audit_rule_data supports filter rules with both integer and string
* fields. It corresponds with AUDIT_ADD_RULE, AUDIT_DEL_RULE and
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index cb06310..edaf3fe7 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -199,6 +199,16 @@
BPF_CGROUP_UDP6_RECVMSG,
BPF_CGROUP_GETSOCKOPT,
BPF_CGROUP_SETSOCKOPT,
+ BPF_TRACE_RAW_TP,
+ BPF_TRACE_FENTRY,
+ BPF_TRACE_FEXIT,
+ BPF_MODIFY_RETURN,
+ BPF_LSM_MAC,
+ BPF_TRACE_ITER,
+ BPF_CGROUP_INET4_GETPEERNAME,
+ BPF_CGROUP_INET6_GETPEERNAME,
+ BPF_CGROUP_INET4_GETSOCKNAME,
+ BPF_CGROUP_INET6_GETSOCKNAME,
__MAX_BPF_ATTACH_TYPE
};
@@ -2750,6 +2760,231 @@
* **-EOPNOTSUPP** kernel configuration does not enable SYN cookies
*
* **-EPROTONOSUPPORT** IP packet version is not 4 or 6
+ *
+ * int bpf_skb_output(void *ctx, struct bpf_map *map, u64 flags, void *data, u64 size)
+ * Description
+ * Write raw *data* blob into a special BPF perf event held by
+ * *map* of type **BPF_MAP_TYPE_PERF_EVENT_ARRAY**. This perf
+ * event must have the following attributes: **PERF_SAMPLE_RAW**
+ * as **sample_type**, **PERF_TYPE_SOFTWARE** as **type**, and
+ * **PERF_COUNT_SW_BPF_OUTPUT** as **config**.
+ *
+ * The *flags* are used to indicate the index in *map* for which
+ * the value must be put, masked with **BPF_F_INDEX_MASK**.
+ * Alternatively, *flags* can be set to **BPF_F_CURRENT_CPU**
+ * to indicate that the index of the current CPU core should be
+ * used.
+ *
+ * The value to write, of *size*, is passed through eBPF stack and
+ * pointed to by *data*.
+ *
+ * *ctx* is a pointer to in-kernel struct sk_buff.
+ *
+ * This helper is similar to **bpf_perf_event_output**\ () but
+ * restricted to raw_tracepoint bpf programs.
+ * Return
+ * 0 on success, or a negative error in case of failure.
+ *
+ * int bpf_probe_read_user(void *dst, u32 size, const void *unsafe_ptr)
+ * Description
+ * Safely attempt to read *size* bytes from user space address
+ * *unsafe_ptr* and store the data in *dst*.
+ * Return
+ * 0 on success, or a negative error in case of failure.
+ *
+ * int bpf_probe_read_kernel(void *dst, u32 size, const void *unsafe_ptr)
+ * Description
+ * Safely attempt to read *size* bytes from kernel space address
+ * *unsafe_ptr* and store the data in *dst*.
+ * Return
+ * 0 on success, or a negative error in case of failure.
+ *
+ * int bpf_probe_read_user_str(void *dst, u32 size, const void *unsafe_ptr)
+ * Description
+ * Copy a NUL terminated string from an unsafe user address
+ * *unsafe_ptr* to *dst*. The *size* should include the
+ * terminating NUL byte. In case the string length is smaller than
+ * *size*, the target is not padded with further NUL bytes. If the
+ * string length is larger than *size*, just *size*-1 bytes are
+ * copied and the last byte is set to NUL.
+ *
+ * On success, the length of the copied string is returned. This
+ * makes this helper useful in tracing programs for reading
+ * strings, and more importantly to get their length at runtime. See
+ * the following snippet:
+ *
+ * ::
+ *
+ * SEC("kprobe/sys_open")
+ * void bpf_sys_open(struct pt_regs *ctx)
+ * {
+ * char buf[PATHLEN]; // PATHLEN is defined to 256
+ * int res = bpf_probe_read_user_str(buf, sizeof(buf),
+ * ctx->di);
+ *
+ * // Consume buf, for example push it to
+ * // userspace via bpf_perf_event_output(); we
+ * // can use res (the string length) as event
+ * // size, after checking its boundaries.
+ * }
+ *
+ * In comparison, using **bpf_probe_read_user()** helper here
+ * instead to read the string would require estimating the length
+ * at compile time, and would often result in copying more memory
+ * than necessary.
+ *
+ * Another useful use case is when parsing individual process
+ * arguments or individual environment variables navigating
+ * *current*\ **->mm->arg_start** and *current*\
+ * **->mm->env_start**: using this helper and the return value,
+ * one can quickly iterate at the right offset of the memory area.
+ * Return
+ * On success, the strictly positive length of the string,
+ * including the trailing NUL character. On error, a negative
+ * value.
+ *
+ * int bpf_probe_read_kernel_str(void *dst, u32 size, const void *unsafe_ptr)
+ * Description
+ * Copy a NUL terminated string from an unsafe kernel address *unsafe_ptr*
+ * to *dst*. Same semantics as with bpf_probe_read_user_str() apply.
+ * Return
+ * On success, the strictly positive length of the string, including
+ * the trailing NUL character. On error, a negative value.
+ *
+ * int bpf_tcp_send_ack(void *tp, u32 rcv_nxt)
+ * Description
+ * Send out a tcp-ack. *tp* is the in-kernel struct tcp_sock.
+ * *rcv_nxt* is the ack_seq to be sent out.
+ * Return
+ * 0 on success, or a negative error in case of failure.
+ *
+ * int bpf_send_signal_thread(u32 sig)
+ * Description
+ * Send signal *sig* to the thread corresponding to the current task.
+ * Return
+ * 0 on success or if the signal was successfully queued.
+ *
+ * **-EBUSY** if the work queue under NMI is full.
+ *
+ * **-EINVAL** if *sig* is invalid.
+ *
+ * **-EPERM** if no permission to send the *sig*.
+ *
+ * **-EAGAIN** if bpf program can try again.
+ *
+ * u64 bpf_jiffies64(void)
+ * Description
+ * Obtain the 64-bit jiffies counter.
+ * Return
+ * The 64-bit jiffies value.
+ *
+ * int bpf_read_branch_records(struct bpf_perf_event_data *ctx, void *buf, u32 size, u64 flags)
+ * Description
+ * For an eBPF program attached to a perf event, retrieve the
+ * branch records (struct perf_branch_entry) associated to *ctx*
+ * and store it in the buffer pointed by *buf* up to size
+ * *size* bytes.
+ * Return
+ * On success, number of bytes written to *buf*. On error, a
+ * negative value.
+ *
+ * The *flags* can be set to **BPF_F_GET_BRANCH_RECORDS_SIZE** to
+ * instead return the number of bytes required to store all the
+ * branch entries. If this flag is set, *buf* may be NULL.
+ *
+ * **-EINVAL** if arguments invalid or **size** not a multiple
+ * of sizeof(struct perf_branch_entry).
+ *
+ * **-ENOENT** if architecture does not support branch records.
+ *
+ * int bpf_get_ns_current_pid_tgid(u64 dev, u64 ino, struct bpf_pidns_info *nsdata, u32 size)
+ * Description
+ * Returns 0 on success; the values for *pid* and *tgid*, as seen from
+ * the current *namespace*, are returned in *nsdata*.
+ *
+ * On failure, the returned value is one of the following:
+ *
+ * **-EINVAL** if the *dev* and *ino* supplied don't match the dev_t and
+ * inode number of the current task's nsfs, or if the *dev* conversion
+ * to dev_t lost high bits.
+ *
+ * **-ENOENT** if the pidns does not exist for the current task.
+ *
+ * int bpf_xdp_output(void *ctx, struct bpf_map *map, u64 flags, void *data, u64 size)
+ * Description
+ * Write raw *data* blob into a special BPF perf event held by
+ * *map* of type **BPF_MAP_TYPE_PERF_EVENT_ARRAY**. This perf
+ * event must have the following attributes: **PERF_SAMPLE_RAW**
+ * as **sample_type**, **PERF_TYPE_SOFTWARE** as **type**, and
+ * **PERF_COUNT_SW_BPF_OUTPUT** as **config**.
+ *
+ * The *flags* are used to indicate the index in *map* for which
+ * the value must be put, masked with **BPF_F_INDEX_MASK**.
+ * Alternatively, *flags* can be set to **BPF_F_CURRENT_CPU**
+ * to indicate that the index of the current CPU core should be
+ * used.
+ *
+ * The value to write, of *size*, is passed through eBPF stack and
+ * pointed to by *data*.
+ *
+ * *ctx* is a pointer to in-kernel struct xdp_buff.
+ *
+ * This helper is similar to **bpf_perf_event_output**\ () but
+ * restricted to raw_tracepoint bpf programs.
+ * Return
+ * 0 on success, or a negative error in case of failure.
+ *
+ * u64 bpf_get_netns_cookie(void *ctx)
+ * Description
+ * Retrieve the cookie (generated by the kernel) of the network
+ * namespace the input *ctx* is associated with. The network
+ * namespace cookie remains stable for its lifetime and provides
+ * a global identifier that can be assumed unique. If *ctx* is
+ * NULL, then the helper returns the cookie for the initial
+ * network namespace. The cookie itself is very similar to that
+ * of the bpf_get_socket_cookie() helper, but for network namespaces
+ * instead of sockets.
+ * Return
+ * An 8-byte long opaque number.
+ *
+ * u64 bpf_get_current_ancestor_cgroup_id(int ancestor_level)
+ * Description
+ * Return id of cgroup v2 that is ancestor of the cgroup associated
+ * with the current task at the *ancestor_level*. The root cgroup
+ * is at *ancestor_level* zero and each step down the hierarchy
+ * increments the level. If *ancestor_level* equals the level of the
+ * cgroup associated with the current task, the return value is the
+ * same as that of **bpf_get_current_cgroup_id**\ ().
+ *
+ * The helper is useful for implementing policies based on cgroups
+ * that are higher in the hierarchy than the immediate cgroup
+ * associated with the current task.
+ *
+ * The format of returned id and helper limitations are same as in
+ * **bpf_get_current_cgroup_id**\ ().
+ * Return
+ * The id is returned, or 0 if the id could not be retrieved.
+ *
+ * int bpf_sk_assign(struct sk_buff *skb, struct bpf_sock *sk, u64 flags)
+ * Description
+ * Assign the *sk* to the *skb*. When combined with appropriate
+ * routing configuration to receive the packet towards the socket,
+ * this will cause *skb* to be delivered to the specified socket.
+ * Subsequent redirection of *skb* via **bpf_redirect**\ (),
+ * **bpf_clone_redirect**\ () or other methods outside of BPF may
+ * interfere with successful delivery to the socket.
+ *
+ * This operation is only valid from the TC ingress path.
+ *
+ * The *flags* argument must be zero.
+ * Return
+ * 0 on success, or a negative errno in case of failure.
+ *
+ * * **-EINVAL** Unsupported flags specified.
+ * * **-ENOENT** Socket is unavailable for assignment.
+ * * **-ENETUNREACH** Socket is unreachable (wrong netns).
+ * * **-EOPNOTSUPP** Unsupported operation, for example a
+ * call from outside of TC ingress.
+ * * **-ESOCKTNOSUPPORT** Socket type not supported (reuseport).
*/
#define __BPF_FUNC_MAPPER(FN) \
FN(unspec), \
@@ -2862,7 +3097,21 @@
FN(sk_storage_get), \
FN(sk_storage_delete), \
FN(send_signal), \
- FN(tcp_gen_syncookie),
+ FN(tcp_gen_syncookie), \
+ FN(skb_output), \
+ FN(probe_read_user), \
+ FN(probe_read_kernel), \
+ FN(probe_read_user_str), \
+ FN(probe_read_kernel_str), \
+ FN(tcp_send_ack), \
+ FN(send_signal_thread), \
+ FN(jiffies64), \
+ FN(read_branch_records), \
+ FN(get_ns_current_pid_tgid), \
+ FN(xdp_output), \
+ FN(get_netns_cookie), \
+ FN(get_current_ancestor_cgroup_id), \
+ FN(sk_assign),
/* integer value in 'imm' field of BPF_CALL instruction selects which helper
* function eBPF program intends to call
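
/*
 * Program sketch (not part of the patch): a cgroup sock_addr program
 * exercising two of the new helpers, written against libbpf's
 * bpf_helpers.h conventions; section and symbol names are illustrative.
 * Note bpf_get_netns_cookie() also accepts a NULL ctx (the new
 * ARG_PTR_TO_CTX_OR_NULL), returning the initial-netns cookie.
 */
#include <linux/bpf.h>
#include <bpf/bpf_helpers.h>

SEC("cgroup/connect4")
int log_netns(struct bpf_sock_addr *ctx)
{
	__u64 cookie = bpf_get_netns_cookie(ctx);	/* stable netns id */
	__u64 now = bpf_jiffies64();

	bpf_printk("netns %llu jiffies %llu\n", cookie, now);
	return 1;	/* 1 == allow for cgroup sock_addr programs */
}

char _license[] SEC("license") = "GPL";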
diff --git a/init/init_task.c b/init/init_task.c
index 5d8359c..e40f14b 100644
--- a/init/init_task.c
+++ b/init/init_task.c
@@ -123,8 +123,7 @@
.thread_group = LIST_HEAD_INIT(init_task.thread_group),
.thread_node = LIST_HEAD_INIT(init_signals.thread_head),
#ifdef CONFIG_AUDIT
- .loginuid = INVALID_UID,
- .sessionid = AUDIT_SID_UNSET,
+ .audit = &init_struct_audit,
#endif
#ifdef CONFIG_PERF_EVENTS
.perf_event_mutex = __MUTEX_INITIALIZER(init_task.perf_event_mutex),
diff --git a/init/main.c b/init/main.c
index a17a111..6fda736 100644
--- a/init/main.c
+++ b/init/main.c
@@ -94,6 +94,7 @@
#include <linux/rodata_test.h>
#include <linux/jump_label.h>
#include <linux/mem_encrypt.h>
+#include <linux/audit.h>
#include <asm/io.h>
#include <asm/bugs.h>
@@ -526,14 +527,16 @@
{
const char *stack;
- if (IS_ENABLED(CONFIG_INIT_STACK_ALL))
- stack = "all";
+ if (IS_ENABLED(CONFIG_INIT_STACK_ALL_PATTERN))
+ stack = "all(pattern)";
+ else if (IS_ENABLED(CONFIG_INIT_STACK_ALL_ZERO))
+ stack = "all(zero)";
else if (IS_ENABLED(CONFIG_GCC_PLUGIN_STRUCTLEAK_BYREF_ALL))
- stack = "byref_all";
+ stack = "byref_all(zero)";
else if (IS_ENABLED(CONFIG_GCC_PLUGIN_STRUCTLEAK_BYREF))
- stack = "byref";
+ stack = "byref(zero)";
else if (IS_ENABLED(CONFIG_GCC_PLUGIN_STRUCTLEAK_USER))
- stack = "__user";
+ stack = "__user(zero)";
else
stack = "off";
@@ -768,6 +771,7 @@
nsfs_init();
cpuset_init();
cgroup_init();
+ audit_task_init();
taskstats_init_early();
delayacct_init();
diff --git a/kernel/audit.c b/kernel/audit.c
index db81418..e1d9316 100644
--- a/kernel/audit.c
+++ b/kernel/audit.c
@@ -202,6 +202,75 @@
struct sk_buff *skb;
};
+static struct kmem_cache *audit_task_cache;
+
+void __init audit_task_init(void)
+{
+ audit_task_cache = kmem_cache_create("audit_task",
+ sizeof(struct audit_task_info),
+ 0, SLAB_PANIC, NULL);
+}
+
+/**
+ * audit_alloc - allocate an audit info block for a task
+ * @tsk: task
+ *
+ * Call audit_alloc_syscall to filter on the task information and
+ * allocate a per-task audit context if necessary. This is called from
+ * copy_process, so no lock is needed.
+ */
+int audit_alloc(struct task_struct *tsk)
+{
+ int ret = 0;
+ struct audit_task_info *info;
+
+ info = kmem_cache_alloc(audit_task_cache, GFP_KERNEL);
+ if (!info) {
+ ret = -ENOMEM;
+ goto out;
+ }
+ info->loginuid = audit_get_loginuid(current);
+ info->sessionid = audit_get_sessionid(current);
+ info->contid = audit_get_contid(current);
+ tsk->audit = info;
+
+ ret = audit_alloc_syscall(tsk);
+ if (ret) {
+ tsk->audit = NULL;
+ kmem_cache_free(audit_task_cache, info);
+ }
+out:
+ return ret;
+}
+
+struct audit_task_info init_struct_audit = {
+ .loginuid = INVALID_UID,
+ .sessionid = AUDIT_SID_UNSET,
+ .contid = AUDIT_CID_UNSET,
+#ifdef CONFIG_AUDITSYSCALL
+ .ctx = NULL,
+#endif
+};
+
+/**
+ * audit_free - free per-task audit info
+ * @tsk: task whose audit info block to free
+ *
+ * Called from copy_process and do_exit
+ */
+void audit_free(struct task_struct *tsk)
+{
+ struct audit_task_info *info = tsk->audit;
+
+ audit_free_syscall(tsk);
+ /* Freeing the audit_task_info struct must be performed after
+ * audit_log_exit() due to need for loginuid and sessionid.
+ */
+ info = tsk->audit;
+ tsk->audit = NULL;
+ kmem_cache_free(audit_task_cache, info);
+}
+
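
/*
 * Lifecycle sketch (not part of the patch): the pairing expected by the
 * two functions above. example_fork_path() is illustrative; the real
 * callers are copy_process() and do_exit().
 */
static int example_fork_path(struct task_struct *child)
{
	int err = audit_alloc(child);	/* inherits loginuid/sessionid/contid */

	if (err)
		return err;		/* child->audit stays NULL */
	/* ... task runs; at exit, audit_free(child) releases child->audit */
	return 0;
}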
/**
* auditd_test_task - Check to see if a given task is an audit daemon
* @task: the task to check
@@ -2305,8 +2374,8 @@
sessionid = (unsigned int)atomic_inc_return(&session_id);
}
- current->sessionid = sessionid;
- current->loginuid = loginuid;
+ current->audit->sessionid = sessionid;
+ current->audit->loginuid = loginuid;
out:
audit_log_set_loginuid(oldloginuid, loginuid, oldsessionid, sessionid, rc);
return rc;
@@ -2339,6 +2408,62 @@
return audit_signal_info_syscall(t);
}
+/*
+ * audit_set_contid - set a task's audit container identifier
+ * @task: target task
+ * @contid: contid value
+ *
+ * Returns 0 on success and a negative errno on failure.
+ *
+ * Called (set) from fs/proc/base.c::proc_contid_write().
+ */
+int audit_set_contid(struct task_struct *task, u64 contid)
+{
+ u64 oldcontid;
+ int rc = 0;
+ struct audit_buffer *ab;
+
+ task_lock(task);
+ /* Can't set if audit disabled */
+ if (!task->audit) {
+ task_unlock(task);
+ return -ENOPROTOOPT;
+ }
+ oldcontid = audit_get_contid(task);
+ read_lock(&tasklist_lock);
+ /* Don't allow the audit containerid to be unset */
+ if (!audit_contid_valid(contid))
+ rc = -EINVAL;
+ /* if we don't have caps, reject */
+ else if (!capable(CAP_AUDIT_CONTROL))
+ rc = -EPERM;
+ /* if task has children or is not single-threaded, deny */
+ else if (!list_empty(&task->children))
+ rc = -EBUSY;
+ else if (!(thread_group_leader(task) && thread_group_empty(task)))
+ rc = -EALREADY;
+ /* if contid is already set, deny */
+ else if (audit_contid_set(task))
+ rc = -ECHILD;
+ read_unlock(&tasklist_lock);
+ if (!rc)
+ task->audit->contid = contid;
+ task_unlock(task);
+
+ if (!audit_enabled)
+ return rc;
+
+ ab = audit_log_start(audit_context(), GFP_KERNEL, AUDIT_CONTAINER_OP);
+ if (!ab)
+ return rc;
+
+ audit_log_format(ab,
+ "op=set opid=%d contid=%llu old-contid=%llu",
+ task_tgid_nr(task), contid, oldcontid);
+ audit_log_end(ab);
+ return rc;
+}
+
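
/*
 * Userspace sketch (not part of the patch): setting a container id through
 * the proc interface added elsewhere in this series. The exact proc file
 * name (/proc/<pid>/audit_containerid) is an assumption based on the
 * proc_contid_write() reference above.
 */
#include <stdio.h>
#include <sys/types.h>

static int set_contid(pid_t pid, unsigned long long contid)
{
	char path[64];
	FILE *f;

	snprintf(path, sizeof(path), "/proc/%d/audit_containerid", (int)pid);
	f = fopen(path, "w");
	if (!f)
		return -1;	/* needs CAP_AUDIT_CONTROL */
	fprintf(f, "%llu", contid);
	return fclose(f);	/* EOF if the kernel rejected the write */
}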
/**
* audit_log_end - end one audit record
* @ab: the audit_buffer
diff --git a/kernel/audit.h b/kernel/audit.h
index fed8e93..c703a0c 100644
--- a/kernel/audit.h
+++ b/kernel/audit.h
@@ -135,6 +135,7 @@
kuid_t target_uid;
unsigned int target_sessionid;
u32 target_sid;
+ u64 target_cid;
char target_comm[TASK_COMM_LEN];
struct audit_tree_refs *trees, *first_trees;
@@ -255,6 +256,8 @@
extern unsigned int audit_serial(void);
extern int auditsc_get_stamp(struct audit_context *ctx,
struct timespec64 *t, unsigned int *serial);
+extern int audit_alloc_syscall(struct task_struct *tsk);
+extern void audit_free_syscall(struct task_struct *tsk);
extern void audit_put_watch(struct audit_watch *watch);
extern void audit_get_watch(struct audit_watch *watch);
@@ -296,6 +299,9 @@
extern struct list_head *audit_killed_trees(void);
#else /* CONFIG_AUDITSYSCALL */
#define auditsc_get_stamp(c, t, s) 0
+#define audit_alloc_syscall(t) 0
+#define audit_free_syscall(t) {}
+
#define audit_put_watch(w) {}
#define audit_get_watch(w) {}
#define audit_to_watch(k, p, l, o) (-EINVAL)
diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index e8e90c0..9258b24 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -113,6 +113,7 @@
kuid_t target_uid[AUDIT_AUX_PIDS];
unsigned int target_sessionid[AUDIT_AUX_PIDS];
u32 target_sid[AUDIT_AUX_PIDS];
+ u64 target_cid[AUDIT_AUX_PIDS];
char target_comm[AUDIT_AUX_PIDS][TASK_COMM_LEN];
int pid_count;
};
@@ -903,23 +904,25 @@
return context;
}
-/**
- * audit_alloc - allocate an audit context block for a task
+/*
+ * audit_alloc_syscall - allocate an audit context block for a task
* @tsk: task
*
* Filter on the task information and allocate a per-task audit context
* if necessary. Doing so turns on system call auditing for the
- * specified task. This is called from copy_process, so no lock is
- * needed.
+ * specified task. This is called from copy_process via audit_alloc, so
+ * no lock is needed.
*/
-int audit_alloc(struct task_struct *tsk)
+int audit_alloc_syscall(struct task_struct *tsk)
{
struct audit_context *context;
enum audit_state state;
char *key = NULL;
- if (likely(!audit_ever_enabled))
+ if (likely(!audit_ever_enabled)) {
+ audit_set_context(tsk, NULL);
return 0; /* Return if not auditing. */
+ }
state = audit_filter_task(tsk, &key);
if (state == AUDIT_DISABLED) {
@@ -929,7 +932,7 @@
if (!(context = audit_alloc_context(state))) {
kfree(key);
- audit_log_lost("out of memory in audit_alloc");
+ audit_log_lost("out of memory in audit_alloc_syscall");
return -ENOMEM;
}
context->filterkey = key;
@@ -1626,14 +1629,15 @@
}
/**
- * __audit_free - free a per-task audit context
+ * audit_free_syscall - free per-task audit context info
* @tsk: task whose audit context block to free
*
- * Called from copy_process and do_exit
+ * Called from audit_free
*/
-void __audit_free(struct task_struct *tsk)
+void audit_free_syscall(struct task_struct *tsk)
{
- struct audit_context *context = tsk->audit_context;
+ struct audit_task_info *info = tsk->audit;
+ struct audit_context *context = info->ctx;
if (!context)
return;
@@ -1656,7 +1660,6 @@
if (context->current_state == AUDIT_RECORD_CONTEXT)
audit_log_exit();
}
-
audit_set_context(tsk, NULL);
audit_free_context(context);
}
@@ -2425,6 +2428,7 @@
context->target_uid = task_uid(t);
context->target_sessionid = audit_get_sessionid(t);
security_task_getsecid(t, &context->target_sid);
+ context->target_cid = audit_get_contid(t);
memcpy(context->target_comm, t->comm, TASK_COMM_LEN);
}
@@ -2452,6 +2456,7 @@
ctx->target_uid = t_uid;
ctx->target_sessionid = audit_get_sessionid(t);
security_task_getsecid(t, &ctx->target_sid);
+ ctx->target_cid = audit_get_contid(t);
memcpy(ctx->target_comm, t->comm, TASK_COMM_LEN);
return 0;
}
@@ -2473,6 +2478,7 @@
axp->target_uid[axp->pid_count] = t_uid;
axp->target_sessionid[axp->pid_count] = audit_get_sessionid(t);
security_task_getsecid(t, &axp->target_sid[axp->pid_count]);
+ axp->target_cid[axp->pid_count] = audit_get_contid(t);
memcpy(axp->target_comm[axp->pid_count], t->comm, TASK_COMM_LEN);
axp->pid_count++;
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index 6b33a8a..627cd72 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -2076,6 +2076,7 @@
const struct bpf_func_proto bpf_map_peek_elem_proto __weak;
const struct bpf_func_proto bpf_spin_lock_proto __weak;
const struct bpf_func_proto bpf_spin_unlock_proto __weak;
+const struct bpf_func_proto bpf_jiffies64_proto __weak;
const struct bpf_func_proto bpf_get_prandom_u32_proto __weak;
const struct bpf_func_proto bpf_get_smp_processor_id_proto __weak;
diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c
index a77d281..91fe4e0 100644
--- a/kernel/bpf/helpers.c
+++ b/kernel/bpf/helpers.c
@@ -11,6 +11,7 @@
#include <linux/uidgid.h>
#include <linux/filter.h>
#include <linux/ctype.h>
+#include <linux/jiffies.h>
#include "../../lib/kstrtox.h"
@@ -312,6 +313,17 @@
preempt_enable();
}
+BPF_CALL_0(bpf_jiffies64)
+{
+ return get_jiffies_64();
+}
+
+const struct bpf_func_proto bpf_jiffies64_proto = {
+ .func = bpf_jiffies64,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+};
+
#ifdef CONFIG_CGROUPS
BPF_CALL_0(bpf_get_current_cgroup_id)
{
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 9ebdcda..ae34c99 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -1599,6 +1599,10 @@
case BPF_CGROUP_INET6_BIND:
case BPF_CGROUP_INET4_CONNECT:
case BPF_CGROUP_INET6_CONNECT:
+ case BPF_CGROUP_INET4_GETPEERNAME:
+ case BPF_CGROUP_INET6_GETPEERNAME:
+ case BPF_CGROUP_INET4_GETSOCKNAME:
+ case BPF_CGROUP_INET6_GETSOCKNAME:
case BPF_CGROUP_UDP4_SENDMSG:
case BPF_CGROUP_UDP6_SENDMSG:
case BPF_CGROUP_UDP4_RECVMSG:
@@ -1926,6 +1930,10 @@
case BPF_CGROUP_INET6_BIND:
case BPF_CGROUP_INET4_CONNECT:
case BPF_CGROUP_INET6_CONNECT:
+ case BPF_CGROUP_INET4_GETPEERNAME:
+ case BPF_CGROUP_INET6_GETPEERNAME:
+ case BPF_CGROUP_INET4_GETSOCKNAME:
+ case BPF_CGROUP_INET6_GETSOCKNAME:
case BPF_CGROUP_UDP4_SENDMSG:
case BPF_CGROUP_UDP6_SENDMSG:
case BPF_CGROUP_UDP4_RECVMSG:
@@ -2074,6 +2082,10 @@
case BPF_CGROUP_INET6_POST_BIND:
case BPF_CGROUP_INET4_CONNECT:
case BPF_CGROUP_INET6_CONNECT:
+ case BPF_CGROUP_INET4_GETPEERNAME:
+ case BPF_CGROUP_INET6_GETPEERNAME:
+ case BPF_CGROUP_INET4_GETSOCKNAME:
+ case BPF_CGROUP_INET6_GETSOCKNAME:
case BPF_CGROUP_UDP4_SENDMSG:
case BPF_CGROUP_UDP6_SENDMSG:
case BPF_CGROUP_UDP4_RECVMSG:
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index f705d37..53b4dc8 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -3341,13 +3341,17 @@
expected_type = CONST_PTR_TO_MAP;
if (type != expected_type)
goto err_type;
- } else if (arg_type == ARG_PTR_TO_CTX) {
+ } else if (arg_type == ARG_PTR_TO_CTX ||
+ arg_type == ARG_PTR_TO_CTX_OR_NULL) {
expected_type = PTR_TO_CTX;
- if (type != expected_type)
- goto err_type;
- err = check_ctx_reg(env, reg, regno);
- if (err < 0)
- return err;
+ if (!(register_is_null(reg) &&
+ arg_type == ARG_PTR_TO_CTX_OR_NULL)) {
+ if (type != expected_type)
+ goto err_type;
+ err = check_ctx_reg(env, reg, regno);
+ if (err < 0)
+ return err;
+ }
} else if (arg_type == ARG_PTR_TO_SOCK_COMMON) {
expected_type = PTR_TO_SOCK_COMMON;
/* Any sk pointer can be ARG_PTR_TO_SOCK_COMMON */
@@ -6415,7 +6419,11 @@
switch (env->prog->type) {
case BPF_PROG_TYPE_CGROUP_SOCK_ADDR:
if (env->prog->expected_attach_type == BPF_CGROUP_UDP4_RECVMSG ||
- env->prog->expected_attach_type == BPF_CGROUP_UDP6_RECVMSG)
+ env->prog->expected_attach_type == BPF_CGROUP_UDP6_RECVMSG ||
+ env->prog->expected_attach_type == BPF_CGROUP_INET4_GETPEERNAME ||
+ env->prog->expected_attach_type == BPF_CGROUP_INET6_GETPEERNAME ||
+ env->prog->expected_attach_type == BPF_CGROUP_INET4_GETSOCKNAME ||
+ env->prog->expected_attach_type == BPF_CGROUP_INET6_GETSOCKNAME)
range = tnum_range(1, 1);
break;
case BPF_PROG_TYPE_CGROUP_SKB:
@@ -9444,6 +9452,30 @@
goto patch_call_imm;
}
+ if (prog->jit_requested && BITS_PER_LONG == 64 &&
+ insn->imm == BPF_FUNC_jiffies64) {
+ struct bpf_insn ld_jiffies_addr[2] = {
+ BPF_LD_IMM64(BPF_REG_0,
+ (unsigned long)&jiffies),
+ };
+
+ insn_buf[0] = ld_jiffies_addr[0];
+ insn_buf[1] = ld_jiffies_addr[1];
+ insn_buf[2] = BPF_LDX_MEM(BPF_DW, BPF_REG_0,
+ BPF_REG_0, 0);
+ cnt = 3;
+
+ new_prog = bpf_patch_insn_data(env, i + delta, insn_buf,
+ cnt);
+ if (!new_prog)
+ return -ENOMEM;
+
+ delta += cnt - 1;
+ env->prog = prog = new_prog;
+ insn = new_prog->insnsi + i + delta;
+ continue;
+ }
+
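
	/*
	 * Effect of the rewrite above (sketch): on 64-bit JITs a program's
	 *     call bpf_jiffies64
	 * is patched into a direct read of the jiffies variable:
	 *     r0 = &jiffies           (BPF_LD_IMM64, two insns)
	 *     r0 = *(u64 *)(r0 + 0)   (BPF_LDX_MEM)
	 * which avoids the helper-call overhead entirely.
	 */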
patch_call_imm:
fn = env->ops->get_func_proto(insn->imm, env->prog);
/* all functions that have prototype and verifier allowed
diff --git a/kernel/dma/swiotlb.c b/kernel/dma/swiotlb.c
index 913cb71..08f1827 100644
--- a/kernel/dma/swiotlb.c
+++ b/kernel/dma/swiotlb.c
@@ -49,9 +49,6 @@
#define CREATE_TRACE_POINTS
#include <trace/events/swiotlb.h>
-#define OFFSET(val,align) ((unsigned long) \
- ( (val) & ( (align) - 1)))
-
#define SLABS_PER_PAGE (1 << (PAGE_SHIFT - IO_TLB_SHIFT))
/*
@@ -151,8 +148,6 @@
max_segment = rounddown(val, PAGE_SIZE);
}
-/* default to 64MB */
-#define IO_TLB_DEFAULT_SIZE (64UL<<20)
unsigned long swiotlb_size_or_default(void)
{
unsigned long size;
@@ -162,6 +157,24 @@
return size ? size : (IO_TLB_DEFAULT_SIZE);
}
+void __init swiotlb_adjust_size(unsigned long new_size)
+{
+ unsigned long size;
+
+ /*
+ * If the swiotlb= parameter has not been specified, give architectures
+ * such as those supporting memory encryption a chance to adjust/expand
+ * the SWIOTLB size for their use.
+ */
+ if (!io_tlb_nslabs) {
+ size = ALIGN(new_size, 1 << IO_TLB_SHIFT);
+ io_tlb_nslabs = size >> IO_TLB_SHIFT;
+ io_tlb_nslabs = ALIGN(io_tlb_nslabs, IO_TLB_SEGSIZE);
+
+ pr_info("SWIOTLB bounce buffer size adjusted to %luMB", size >> 20);
+ }
+}
+
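
/*
 * Caller sketch (not part of the patch): early architecture setup code,
 * e.g. memory-encryption initialization, is the intended user. The
 * function name and predicate below are hypothetical.
 */
void __init example_arch_init_bounce(void)
{
	if (example_mem_encrypt_active())		/* hypothetical check */
		swiotlb_adjust_size(256UL << 20);	/* grow to 256MB */
}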
void swiotlb_print_info(void)
{
unsigned long bytes = io_tlb_nslabs << IO_TLB_SHIFT;
@@ -177,6 +190,16 @@
bytes >> 20);
}
+static inline unsigned long io_tlb_offset(unsigned long val)
+{
+ return val & (IO_TLB_SEGSIZE - 1);
+}
+
+static inline unsigned long nr_slots(u64 val)
+{
+ return DIV_ROUND_UP(val, IO_TLB_SIZE);
+}
+
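
/*
 * Worked example (not part of the patch): with IO_TLB_SHIFT == 11 a slot
 * is IO_TLB_SIZE == 2048 bytes, so nr_slots(4096) == 2 and
 * nr_slots(4097) == 3; io_tlb_offset(i) is i's position within its
 * IO_TLB_SEGSIZE-slot segment, e.g. io_tlb_offset(129) == 1 when
 * IO_TLB_SEGSIZE == 128.
 */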
/*
* Early SWIOTLB allocation may be too early to allow an architecture to
* perform the desired operations. This function allows the architecture to
@@ -226,7 +249,7 @@
__func__, alloc_size, PAGE_SIZE);
for (i = 0; i < io_tlb_nslabs; i++) {
- io_tlb_list[i] = IO_TLB_SEGSIZE - OFFSET(i, IO_TLB_SEGSIZE);
+ io_tlb_list[i] = IO_TLB_SEGSIZE - io_tlb_offset(i);
io_tlb_orig_addr[i] = INVALID_PHYS_ADDR;
}
io_tlb_index = 0;
@@ -360,7 +383,7 @@
goto cleanup4;
for (i = 0; i < io_tlb_nslabs; i++) {
- io_tlb_list[i] = IO_TLB_SEGSIZE - OFFSET(i, IO_TLB_SEGSIZE);
+ io_tlb_list[i] = IO_TLB_SEGSIZE - io_tlb_offset(i);
io_tlb_orig_addr[i] = INVALID_PHYS_ADDR;
}
io_tlb_index = 0;
@@ -446,22 +469,121 @@
}
}
-phys_addr_t swiotlb_tbl_map_single(struct device *hwdev,
- dma_addr_t tbl_dma_addr,
- phys_addr_t orig_addr,
- size_t mapping_size,
+#define slot_addr(start, idx) ((start) + ((idx) << IO_TLB_SHIFT))
+/*
+ * Return the offset into a iotlb slot required to keep the device happy.
+ */
+static unsigned int swiotlb_align_offset(struct device *dev, u64 addr)
+{
+ return addr & dma_get_min_align_mask(dev) & (IO_TLB_SIZE - 1);
+}
+
+/*
+ * Carefully handle integer overflow which can occur when boundary_mask == ~0UL.
+ */
+static inline unsigned long get_max_slots(unsigned long boundary_mask)
+{
+ if (boundary_mask == ~0UL)
+ return 1UL << (BITS_PER_LONG - IO_TLB_SHIFT);
+ return nr_slots(boundary_mask + 1);
+}
+
+static unsigned int wrap_index(unsigned int index)
+{
+ if (index >= io_tlb_nslabs)
+ return 0;
+ return index;
+}
+
+/*
+ * Find a suitable number of IO TLB entries size that will fit this request and
+ * allocate a buffer from that IO TLB pool.
+ */
+static int find_slots(struct device *dev, phys_addr_t orig_addr,
+ size_t alloc_size)
+{
+ unsigned long boundary_mask = dma_get_seg_boundary(dev);
+ dma_addr_t tbl_dma_addr =
+ __phys_to_dma(dev, io_tlb_start) & boundary_mask;
+ unsigned long max_slots = get_max_slots(boundary_mask);
+ unsigned int iotlb_align_mask =
+ dma_get_min_align_mask(dev) & ~(IO_TLB_SIZE - 1);
+ unsigned int nslots = nr_slots(alloc_size), stride;
+ unsigned int index, wrap, count = 0, i;
+ unsigned long flags;
+
+ BUG_ON(!nslots);
+
+ /*
+ * For mappings with an alignment requirement don't bother looping to
+ * unaligned slots once we found an aligned one. For allocations of
+ * PAGE_SIZE or larger only look for page aligned allocations.
+ */
+ stride = (iotlb_align_mask >> IO_TLB_SHIFT) + 1;
+ if (alloc_size >= PAGE_SIZE)
+ stride = max(stride, stride << (PAGE_SHIFT - IO_TLB_SHIFT));
+
+ spin_lock_irqsave(&io_tlb_lock, flags);
+ if (unlikely(nslots > io_tlb_nslabs - io_tlb_used))
+ goto not_found;
+
+ index = wrap = wrap_index(ALIGN(io_tlb_index, stride));
+ do {
+ if ((slot_addr(tbl_dma_addr, index) & iotlb_align_mask) !=
+ (orig_addr & iotlb_align_mask)) {
+ index = wrap_index(index + 1);
+ continue;
+ }
+
+ /*
+ * If we find a slot that indicates we have 'nslots' number of
+ * contiguous buffers, we allocate the buffers from that slot
+ * and mark the entries as '0' indicating unavailable.
+ */
+ if (!iommu_is_span_boundary(index, nslots,
+ nr_slots(tbl_dma_addr),
+ max_slots)) {
+ if (io_tlb_list[index] >= nslots)
+ goto found;
+ }
+ index = wrap_index(index + stride);
+ } while (index != wrap);
+
+not_found:
+ spin_unlock_irqrestore(&io_tlb_lock, flags);
+ return -1;
+
+found:
+ for (i = index; i < index + nslots; i++)
+ io_tlb_list[i] = 0;
+ for (i = index - 1;
+ io_tlb_offset(i) != IO_TLB_SEGSIZE - 1 &&
+ io_tlb_list[i]; i--)
+ io_tlb_list[i] = ++count;
+
+ /*
+ * Update the indices to avoid searching in the next round.
+ */
+ if (index + nslots < io_tlb_nslabs)
+ io_tlb_index = index + nslots;
+ else
+ io_tlb_index = 0;
+
+ io_tlb_used += nslots;
+
+ spin_unlock_irqrestore(&io_tlb_lock, flags);
+ return index;
+}
+
+phys_addr_t swiotlb_tbl_map_single(struct device *dev, dma_addr_t dma_addr,
+ phys_addr_t orig_addr, size_t mapping_size,
size_t alloc_size,
enum dma_data_direction dir,
unsigned long attrs)
{
- unsigned long flags;
+ unsigned int offset = swiotlb_align_offset(dev, orig_addr);
+ unsigned int index, i;
phys_addr_t tlb_addr;
- unsigned int nslots, stride, index, wrap;
- int i;
- unsigned long mask;
- unsigned long offset_slots;
- unsigned long max_slots;
- unsigned long tmp_io_tlb_used;
if (no_iotlb_memory)
panic("Can not allocate SWIOTLB buffer earlier and can't now provide you with the DMA bounce buffer");
@@ -470,107 +592,29 @@
pr_warn_once("Memory encryption is active and system is using DMA bounce buffers\n");
if (mapping_size > alloc_size) {
- dev_warn_once(hwdev, "Invalid sizes (mapping: %zd bytes, alloc: %zd bytes)",
+ dev_warn_once(dev, "Invalid sizes (mapping: %zd bytes, alloc: %zd bytes)",
mapping_size, alloc_size);
return (phys_addr_t)DMA_MAPPING_ERROR;
}
- mask = dma_get_seg_boundary(hwdev);
-
- tbl_dma_addr &= mask;
-
- offset_slots = ALIGN(tbl_dma_addr, 1 << IO_TLB_SHIFT) >> IO_TLB_SHIFT;
-
- /*
- * Carefully handle integer overflow which can occur when mask == ~0UL.
- */
- max_slots = mask + 1
- ? ALIGN(mask + 1, 1 << IO_TLB_SHIFT) >> IO_TLB_SHIFT
- : 1UL << (BITS_PER_LONG - IO_TLB_SHIFT);
-
- /*
- * For mappings greater than or equal to a page, we limit the stride
- * (and hence alignment) to a page size.
- */
- nslots = ALIGN(alloc_size, 1 << IO_TLB_SHIFT) >> IO_TLB_SHIFT;
- if (alloc_size >= PAGE_SIZE)
- stride = (1 << (PAGE_SHIFT - IO_TLB_SHIFT));
- else
- stride = 1;
-
- BUG_ON(!nslots);
-
- /*
- * Find suitable number of IO TLB entries size that will fit this
- * request and allocate a buffer from that IO TLB pool.
- */
- spin_lock_irqsave(&io_tlb_lock, flags);
-
- if (unlikely(nslots > io_tlb_nslabs - io_tlb_used))
- goto not_found;
-
- index = ALIGN(io_tlb_index, stride);
- if (index >= io_tlb_nslabs)
- index = 0;
- wrap = index;
-
- do {
- while (iommu_is_span_boundary(index, nslots, offset_slots,
- max_slots)) {
- index += stride;
- if (index >= io_tlb_nslabs)
- index = 0;
- if (index == wrap)
- goto not_found;
- }
-
- /*
- * If we find a slot that indicates we have 'nslots' number of
- * contiguous buffers, we allocate the buffers from that slot
- * and mark the entries as '0' indicating unavailable.
- */
- if (io_tlb_list[index] >= nslots) {
- int count = 0;
-
- for (i = index; i < (int) (index + nslots); i++)
- io_tlb_list[i] = 0;
- for (i = index - 1; (OFFSET(i, IO_TLB_SEGSIZE) != IO_TLB_SEGSIZE - 1) && io_tlb_list[i]; i--)
- io_tlb_list[i] = ++count;
- tlb_addr = io_tlb_start + (index << IO_TLB_SHIFT);
-
- /*
- * Update the indices to avoid searching in the next
- * round.
- */
- io_tlb_index = ((index + nslots) < io_tlb_nslabs
- ? (index + nslots) : 0);
-
- goto found;
- }
- index += stride;
- if (index >= io_tlb_nslabs)
- index = 0;
- } while (index != wrap);
-
-not_found:
- tmp_io_tlb_used = io_tlb_used;
-
- spin_unlock_irqrestore(&io_tlb_lock, flags);
- if (!(attrs & DMA_ATTR_NO_WARN) && printk_ratelimit())
- dev_warn(hwdev, "swiotlb buffer is full (sz: %zd bytes), total %lu (slots), used %lu (slots)\n",
- alloc_size, io_tlb_nslabs, tmp_io_tlb_used);
- return (phys_addr_t)DMA_MAPPING_ERROR;
-found:
- io_tlb_used += nslots;
- spin_unlock_irqrestore(&io_tlb_lock, flags);
+ index = find_slots(dev, orig_addr, alloc_size + offset);
+ if (index == -1) {
+ if (!(attrs & DMA_ATTR_NO_WARN))
+ dev_warn_ratelimited(dev,
+ "swiotlb buffer is full (sz: %zd bytes), total %lu (slots), used %lu (slots)\n",
+ alloc_size, io_tlb_nslabs, io_tlb_used);
+ return (phys_addr_t)DMA_MAPPING_ERROR;
+ }
/*
* Save away the mapping from the original address to the DMA address.
* This is needed when we sync the memory. Then we sync the buffer if
* needed.
*/
- for (i = 0; i < nslots; i++)
- io_tlb_orig_addr[index+i] = orig_addr + (i << IO_TLB_SHIFT);
+ for (i = 0; i < nr_slots(alloc_size + offset); i++)
+ io_tlb_orig_addr[index + i] = slot_addr(orig_addr, i);
+
+ tlb_addr = slot_addr(io_tlb_start, index) + offset;
/*
* When dir == DMA_FROM_DEVICE we could omit the copy from the orig
* to the tlb buffer, if we knew for sure the device will
@@ -590,8 +634,9 @@
enum dma_data_direction dir, unsigned long attrs)
{
unsigned long flags;
- int i, count, nslots = ALIGN(alloc_size, 1 << IO_TLB_SHIFT) >> IO_TLB_SHIFT;
- int index = (tlb_addr - io_tlb_start) >> IO_TLB_SHIFT;
+ unsigned int offset = swiotlb_align_offset(hwdev, tlb_addr);
+ int i, count, nslots = nr_slots(alloc_size + offset);
+ int index = (tlb_addr - offset - io_tlb_start) >> IO_TLB_SHIFT;
phys_addr_t orig_addr = io_tlb_orig_addr[index];
/*
@@ -609,26 +654,29 @@
* with slots below and above the pool being returned.
*/
spin_lock_irqsave(&io_tlb_lock, flags);
- {
- count = ((index + nslots) < ALIGN(index + 1, IO_TLB_SEGSIZE) ?
- io_tlb_list[index + nslots] : 0);
- /*
- * Step 1: return the slots to the free list, merging the
- * slots with superceeding slots
- */
- for (i = index + nslots - 1; i >= index; i--) {
- io_tlb_list[i] = ++count;
- io_tlb_orig_addr[i] = INVALID_PHYS_ADDR;
- }
- /*
- * Step 2: merge the returned slots with the preceding slots,
- * if available (non zero)
- */
- for (i = index - 1; (OFFSET(i, IO_TLB_SEGSIZE) != IO_TLB_SEGSIZE -1) && io_tlb_list[i]; i--)
- io_tlb_list[i] = ++count;
+ if (index + nslots < ALIGN(index + 1, IO_TLB_SEGSIZE))
+ count = io_tlb_list[index + nslots];
+ else
+ count = 0;
- io_tlb_used -= nslots;
+ /*
+ * Step 1: return the slots to the free list, merging the slots with
+ * superseding slots
+ */
+ for (i = index + nslots - 1; i >= index; i--) {
+ io_tlb_list[i] = ++count;
+ io_tlb_orig_addr[i] = INVALID_PHYS_ADDR;
}
+
+ /*
+ * Step 2: merge the returned slots with the preceding slots, if
+ * available (non-zero)
+ */
+ for (i = index - 1;
+ io_tlb_offset(i) != IO_TLB_SEGSIZE - 1 && io_tlb_list[i];
+ i--)
+ io_tlb_list[i] = ++count;
+ io_tlb_used -= nslots;
spin_unlock_irqrestore(&io_tlb_lock, flags);
}
@@ -641,7 +689,6 @@
if (orig_addr == INVALID_PHYS_ADDR)
return;
- orig_addr += (unsigned long)tlb_addr & ((1 << IO_TLB_SHIFT) - 1);
switch (target) {
case SYNC_FOR_CPU:
@@ -697,7 +744,7 @@
size_t swiotlb_max_mapping_size(struct device *dev)
{
- return ((size_t)1 << IO_TLB_SHIFT) * IO_TLB_SEGSIZE;
+ return ((size_t)IO_TLB_SIZE) * IO_TLB_SEGSIZE;
}
bool is_swiotlb_active(void)
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 0a54780..0253f36 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -4243,8 +4243,9 @@
if (!task) {
/* Must be root to operate on a CPU event: */
- if (perf_paranoid_cpu() && !capable(CAP_SYS_ADMIN))
- return ERR_PTR(-EACCES);
+ err = perf_allow_cpu(&event->attr);
+ if (err)
+ return ERR_PTR(err);
cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu);
ctx = &cpuctx->ctx;
@@ -4555,6 +4556,8 @@
unaccount_event(event);
+ security_perf_event_free(event);
+
if (event->rb) {
/*
* Can happen when we close an event with re-directed output.
@@ -5008,6 +5011,10 @@
struct perf_event_context *ctx;
int ret;
+ ret = security_perf_event_read(event);
+ if (ret)
+ return ret;
+
ctx = perf_event_ctx_lock(event);
ret = __perf_read(event, buf, count);
perf_event_ctx_unlock(event, ctx);
@@ -5272,6 +5279,11 @@
struct perf_event_context *ctx;
long ret;
+ /* Treat ioctl like writes as it is likely a mutating operation. */
+ ret = security_perf_event_write(event);
+ if (ret)
+ return ret;
+
ctx = perf_event_ctx_lock(event);
ret = _perf_ioctl(event, cmd, arg);
perf_event_ctx_unlock(event, ctx);
@@ -5736,6 +5748,10 @@
if (!(vma->vm_flags & VM_SHARED))
return -EINVAL;
+ ret = security_perf_event_read(event);
+ if (ret)
+ return ret;
+
vma_size = vma->vm_end - vma->vm_start;
if (vma->vm_pgoff == 0) {
@@ -5869,7 +5885,7 @@
lock_limit >>= PAGE_SHIFT;
locked = atomic64_read(&vma->vm_mm->pinned_vm) + extra;
- if ((locked > lock_limit) && perf_paranoid_tracepoint_raw() &&
+ if ((locked > lock_limit) && perf_is_paranoid() &&
!capable(CAP_IPC_LOCK)) {
ret = -EPERM;
goto unlock;
@@ -10630,11 +10646,20 @@
}
}
+ err = security_perf_event_alloc(event);
+ if (err)
+ goto err_callchain_buffer;
+
/* symmetric to unaccount_event() in _free_event() */
account_event(event);
return event;
+err_callchain_buffer:
+ if (!event->parent) {
+ if (event->attr.sample_type & PERF_SAMPLE_CALLCHAIN)
+ put_callchain_buffers();
+ }
err_addr_filters:
kfree(event->addr_filter_ranges);
@@ -10723,9 +10748,11 @@
attr->branch_sample_type = mask;
}
/* privileged levels capture (kernel, hv): check permissions */
- if ((mask & PERF_SAMPLE_BRANCH_PERM_PLM)
- && perf_paranoid_kernel() && !capable(CAP_SYS_ADMIN))
- return -EACCES;
+ if (mask & PERF_SAMPLE_BRANCH_PERM_PLM) {
+ ret = perf_allow_kernel(attr);
+ if (ret)
+ return ret;
+ }
}
if (attr->sample_type & PERF_SAMPLE_REGS_USER) {
@@ -10955,13 +10982,19 @@
if (flags & ~PERF_FLAG_ALL)
return -EINVAL;
+ /* Do we allow access to perf_event_open(2) ? */
+ err = security_perf_event_open(&attr, PERF_SECURITY_OPEN);
+ if (err)
+ return err;
+
err = perf_copy_attr(attr_uptr, &attr);
if (err)
return err;
if (!attr.exclude_kernel) {
- if (perf_paranoid_kernel() && !capable(CAP_SYS_ADMIN))
- return -EACCES;
+ err = perf_allow_kernel(&attr);
+ if (err)
+ return err;
}
if (attr.namespaces) {
@@ -10978,9 +11011,11 @@
}
/* Only privileged users can get physical addresses */
- if ((attr.sample_type & PERF_SAMPLE_PHYS_ADDR) &&
- perf_paranoid_kernel() && !capable(CAP_SYS_ADMIN))
- return -EACCES;
+ if ((attr.sample_type & PERF_SAMPLE_PHYS_ADDR)) {
+ err = perf_allow_kernel(&attr);
+ if (err)
+ return err;
+ }
/* REGS_INTR can leak data, lockdown must prevent this */
if (attr.sample_type & PERF_SAMPLE_REGS_INTR) {
diff --git a/kernel/exit.c b/kernel/exit.c
index ece6477..db8f9ee 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -63,6 +63,7 @@
#include <linux/random.h>
#include <linux/rcuwait.h>
#include <linux/compat.h>
+#include <linux/security.h>
#include <linux/uaccess.h>
#include <asm/unistd.h>
@@ -786,6 +787,8 @@
#endif
if (tsk->mm)
setmax_mm_hiwater_rss(&tsk->signal->maxrss, tsk->mm);
+
+ security_task_exit(tsk);
}
acct_collect(code, group_dead);
if (group_dead)
diff --git a/kernel/fork.c b/kernel/fork.c
index 5b4a196..dc14860 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1950,7 +1950,6 @@
posix_cputimers_init(&p->posix_cputimers);
p->io_context = NULL;
- audit_set_context(p, NULL);
cgroup_fork(p);
#ifdef CONFIG_NUMA
p->mempolicy = mpol_dup(p->mempolicy);
@@ -2242,6 +2241,7 @@
uprobe_copy_process(p, clone_flags);
copy_oom_score_adj(clone_flags, p);
+ security_task_post_alloc(p);
return p;
diff --git a/kernel/trace/trace_event_perf.c b/kernel/trace/trace_event_perf.c
index a9dfa04..643e0b1 100644
--- a/kernel/trace/trace_event_perf.c
+++ b/kernel/trace/trace_event_perf.c
@@ -8,6 +8,7 @@
#include <linux/module.h>
#include <linux/kprobes.h>
+#include <linux/security.h>
#include "trace.h"
#include "trace_probe.h"
@@ -26,8 +27,10 @@
static int perf_trace_event_perm(struct trace_event_call *tp_event,
struct perf_event *p_event)
{
+ int ret;
+
if (tp_event->perf_perm) {
- int ret = tp_event->perf_perm(tp_event, p_event);
+ ret = tp_event->perf_perm(tp_event, p_event);
if (ret)
return ret;
}
@@ -46,8 +49,9 @@
/* The ftrace function trace is allowed only for root. */
if (ftrace_event_is_function(tp_event)) {
- if (perf_paranoid_tracepoint_raw() && !capable(CAP_SYS_ADMIN))
- return -EPERM;
+ ret = perf_allow_tracepoint(&p_event->attr);
+ if (ret)
+ return ret;
if (!is_sampling_event(p_event))
return 0;
@@ -82,8 +86,9 @@
* ...otherwise raw tracepoint data can be a severe data leak,
* only allow root to have these.
*/
- if (perf_paranoid_tracepoint_raw() && !capable(CAP_SYS_ADMIN))
- return -EPERM;
+ ret = perf_allow_tracepoint(&p_event->attr);
+ if (ret)
+ return ret;
return 0;
}
diff --git a/net/core/filter.c b/net/core/filter.c
index 72bf780..350a8df 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -2647,6 +2647,19 @@
.arg4_type = ARG_ANYTHING,
};
+#ifdef CONFIG_CGROUP_NET_CLASSID
+BPF_CALL_0(bpf_get_cgroup_classid_curr)
+{
+ return __task_get_classid(current);
+}
+
+static const struct bpf_func_proto bpf_get_cgroup_classid_curr_proto = {
+ .func = bpf_get_cgroup_classid_curr,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+};
+#endif
+
BPF_CALL_1(bpf_get_cgroup_classid, const struct sk_buff *, skb)
{
return task_get_classid(skb);
@@ -4182,6 +4195,18 @@
.arg1_type = ARG_PTR_TO_CTX,
};
+BPF_CALL_1(bpf_get_socket_cookie_sock, struct sock *, ctx)
+{
+ return sock_gen_cookie(ctx);
+}
+
+static const struct bpf_func_proto bpf_get_socket_cookie_sock_proto = {
+ .func = bpf_get_socket_cookie_sock,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_CTX,
+};
+
BPF_CALL_1(bpf_get_socket_cookie_sock_ops, struct bpf_sock_ops_kern *, ctx)
{
return sock_gen_cookie(ctx->sk);
@@ -4194,6 +4219,39 @@
.arg1_type = ARG_PTR_TO_CTX,
};
+static u64 __bpf_get_netns_cookie(struct sock *sk)
+{
+#ifdef CONFIG_NET_NS
+ return net_gen_cookie(sk ? sk->sk_net.net : &init_net);
+#else
+ return 0;
+#endif
+}
+
+BPF_CALL_1(bpf_get_netns_cookie_sock, struct sock *, ctx)
+{
+ return __bpf_get_netns_cookie(ctx);
+}
+
+static const struct bpf_func_proto bpf_get_netns_cookie_sock_proto = {
+ .func = bpf_get_netns_cookie_sock,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_CTX_OR_NULL,
+};
+
+BPF_CALL_1(bpf_get_netns_cookie_sock_addr, struct bpf_sock_addr_kern *, ctx)
+{
+ return __bpf_get_netns_cookie(ctx ? ctx->sk : NULL);
+}
+
+static const struct bpf_func_proto bpf_get_netns_cookie_sock_addr_proto = {
+ .func = bpf_get_netns_cookie_sock_addr,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_CTX_OR_NULL,
+};
+
BPF_CALL_1(bpf_get_socket_uid, struct sk_buff *, skb)
{
struct sock *sk = sk_to_full_sk(skb->sk);
@@ -4212,8 +4270,8 @@
.arg1_type = ARG_PTR_TO_CTX,
};
-BPF_CALL_5(bpf_sockopt_event_output, struct bpf_sock_ops_kern *, bpf_sock,
- struct bpf_map *, map, u64, flags, void *, data, u64, size)
+BPF_CALL_5(bpf_event_output_data, void *, ctx, struct bpf_map *, map, u64, flags,
+ void *, data, u64, size)
{
if (unlikely(flags & ~(BPF_F_INDEX_MASK)))
return -EINVAL;
@@ -4221,8 +4279,8 @@
return bpf_event_output(map, flags, data, size, NULL, 0, NULL);
}
-static const struct bpf_func_proto bpf_sockopt_event_output_proto = {
- .func = bpf_sockopt_event_output,
+static const struct bpf_func_proto bpf_event_output_data_proto = {
+ .func = bpf_event_output_data,
.gpl_only = true,
.ret_type = RET_INTEGER,
.arg1_type = ARG_PTR_TO_CTX,
@@ -5440,8 +5498,7 @@
BPF_CALL_1(bpf_sk_release, struct sock *, sk)
{
- /* Only full sockets have sk->sk_flags. */
- if (!sk_fullsock(sk) || !sock_flag(sk, SOCK_RCU_FREE))
+ if (sk_is_refcounted(sk))
sock_gen_put(sk);
return 0;
}
@@ -5970,6 +6027,36 @@
.arg5_type = ARG_CONST_SIZE,
};
+BPF_CALL_3(bpf_sk_assign, struct sk_buff *, skb, struct sock *, sk, u64, flags)
+{
+ if (flags != 0)
+ return -EINVAL;
+ if (!skb_at_tc_ingress(skb))
+ return -EOPNOTSUPP;
+ if (unlikely(dev_net(skb->dev) != sock_net(sk)))
+ return -ENETUNREACH;
+ if (unlikely(sk_fullsock(sk) && sk->sk_reuseport))
+ return -ESOCKTNOSUPPORT;
+ if (sk_is_refcounted(sk) &&
+ unlikely(!refcount_inc_not_zero(&sk->sk_refcnt)))
+ return -ENOENT;
+
+ skb_orphan(skb);
+ skb->sk = sk;
+ skb->destructor = sock_pfree;
+
+ return 0;
+}
+
+static const struct bpf_func_proto bpf_sk_assign_proto = {
+ .func = bpf_sk_assign,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_CTX,
+ .arg2_type = ARG_PTR_TO_SOCK_COMMON,
+ .arg3_type = ARG_ANYTHING,
+};
+
#endif /* CONFIG_INET */
bool bpf_helper_changes_pkt_data(void *func)
@@ -6046,6 +6133,8 @@
return &bpf_spin_unlock_proto;
case BPF_FUNC_trace_printk:
return bpf_get_trace_printk_proto();
+ case BPF_FUNC_jiffies64:
+ return &bpf_jiffies64_proto;
default:
return NULL;
}
@@ -6062,6 +6151,20 @@
return &bpf_get_current_uid_gid_proto;
case BPF_FUNC_get_local_storage:
return &bpf_get_local_storage_proto;
+ case BPF_FUNC_get_socket_cookie:
+ return &bpf_get_socket_cookie_sock_proto;
+ case BPF_FUNC_get_netns_cookie:
+ return &bpf_get_netns_cookie_sock_proto;
+ case BPF_FUNC_perf_event_output:
+ return &bpf_event_output_data_proto;
+ case BPF_FUNC_get_current_pid_tgid:
+ return &bpf_get_current_pid_tgid_proto;
+ case BPF_FUNC_get_current_comm:
+ return &bpf_get_current_comm_proto;
+#ifdef CONFIG_CGROUP_NET_CLASSID
+ case BPF_FUNC_get_cgroup_classid:
+ return &bpf_get_cgroup_classid_curr_proto;
+#endif
default:
return bpf_base_func_proto(func_id);
}
@@ -6086,8 +6189,20 @@
}
case BPF_FUNC_get_socket_cookie:
return &bpf_get_socket_cookie_sock_addr_proto;
+ case BPF_FUNC_get_netns_cookie:
+ return &bpf_get_netns_cookie_sock_addr_proto;
case BPF_FUNC_get_local_storage:
return &bpf_get_local_storage_proto;
+ case BPF_FUNC_perf_event_output:
+ return &bpf_event_output_data_proto;
+ case BPF_FUNC_get_current_pid_tgid:
+ return &bpf_get_current_pid_tgid_proto;
+ case BPF_FUNC_get_current_comm:
+ return &bpf_get_current_comm_proto;
+#ifdef CONFIG_CGROUP_NET_CLASSID
+ case BPF_FUNC_get_cgroup_classid:
+ return &bpf_get_cgroup_classid_curr_proto;
+#endif
#ifdef CONFIG_INET
case BPF_FUNC_sk_lookup_tcp:
return &bpf_sock_addr_sk_lookup_tcp_proto;
@@ -6261,6 +6376,8 @@
return &bpf_skb_ecn_set_ce_proto;
case BPF_FUNC_tcp_gen_syncookie:
return &bpf_tcp_gen_syncookie_proto;
+ case BPF_FUNC_sk_assign:
+ return &bpf_sk_assign_proto;
#endif
default:
return bpf_base_func_proto(func_id);
@@ -6330,7 +6447,7 @@
case BPF_FUNC_get_local_storage:
return &bpf_get_local_storage_proto;
case BPF_FUNC_perf_event_output:
- return &bpf_sockopt_event_output_proto;
+ return &bpf_event_output_data_proto;
case BPF_FUNC_sk_storage_get:
return &bpf_sk_storage_get_proto;
case BPF_FUNC_sk_storage_delete:
@@ -6365,6 +6482,16 @@
return &bpf_msg_push_data_proto;
case BPF_FUNC_msg_pop_data:
return &bpf_msg_pop_data_proto;
+ case BPF_FUNC_perf_event_output:
+ return &bpf_event_output_data_proto;
+ case BPF_FUNC_get_current_uid_gid:
+ return &bpf_get_current_uid_gid_proto;
+ case BPF_FUNC_get_current_pid_tgid:
+ return &bpf_get_current_pid_tgid_proto;
+#ifdef CONFIG_CGROUP_NET_CLASSID
+ case BPF_FUNC_get_cgroup_classid:
+ return &bpf_get_cgroup_classid_curr_proto;
+#endif
default:
return bpf_base_func_proto(func_id);
}
@@ -6979,6 +7106,8 @@
switch (prog->expected_attach_type) {
case BPF_CGROUP_INET4_BIND:
case BPF_CGROUP_INET4_CONNECT:
+ case BPF_CGROUP_INET4_GETPEERNAME:
+ case BPF_CGROUP_INET4_GETSOCKNAME:
case BPF_CGROUP_UDP4_SENDMSG:
case BPF_CGROUP_UDP4_RECVMSG:
break;
@@ -6990,6 +7119,8 @@
switch (prog->expected_attach_type) {
case BPF_CGROUP_INET6_BIND:
case BPF_CGROUP_INET6_CONNECT:
+ case BPF_CGROUP_INET6_GETPEERNAME:
+ case BPF_CGROUP_INET6_GETSOCKNAME:
case BPF_CGROUP_UDP6_SENDMSG:
case BPF_CGROUP_UDP6_RECVMSG:
break;
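The bpf_sk_assign() helper above, together with sock_pfree() and the skb_steal_sock() changes below, enables TPROXY-style socket steering from TC ingress without iptables. A minimal BPF-C sketch of a caller (section name, tuple filling, and error handling are illustrative assumptions, not part of this patch):

    /* Hypothetical tc ingress program steering TCP flows to a local
     * listener found via bpf_skc_lookup_tcp(); sketch only.
     */
    #include <linux/bpf.h>
    #include <linux/pkt_cls.h>
    #include <bpf/bpf_helpers.h>

    SEC("classifier")
    int steer(struct __sk_buff *skb)
    {
            struct bpf_sock_tuple tuple = {};       /* fill from headers */
            struct bpf_sock *sk;

            sk = bpf_skc_lookup_tcp(skb, &tuple, sizeof(tuple.ipv4),
                                    BPF_F_CURRENT_NETNS, 0);
            if (!sk)
                    return TC_ACT_OK;

            /* Prefetch the socket onto the skb; flags must be 0. */
            if (bpf_sk_assign(skb, sk, 0))
                    bpf_printk("sk_assign failed");
            bpf_sk_release(sk);
            return TC_ACT_OK;
    }

    char _license[] SEC("license") = "GPL";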
diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c
index 62a972f..1e0723f 100644
--- a/net/core/net_namespace.c
+++ b/net/core/net_namespace.c
@@ -69,6 +69,20 @@
static unsigned int max_gen_ptrs = INITIAL_NET_GEN_PTRS;
+static atomic64_t cookie_gen;
+
+u64 net_gen_cookie(struct net *net)
+{
+ while (1) {
+ u64 res = atomic64_read(&net->net_cookie);
+
+ if (res)
+ return res;
+ res = atomic64_inc_return(&cookie_gen);
+ atomic64_cmpxchg(&net->net_cookie, 0, res);
+ }
+}
+
static struct net_generic *net_alloc_generic(void)
{
struct net_generic *ng;
@@ -1114,6 +1128,7 @@
panic("Could not allocate generic netns");
rcu_assign_pointer(init_net.gen, ng);
+ net_gen_cookie(&init_net);
down_write(&pernet_ops_rwsem);
if (setup_net(&init_net, &init_user_ns))
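net_gen_cookie() assigns each netns a stable non-zero cookie lazily: the atomic64_cmpxchg() lets concurrent first callers race and then converge on a single value. From BPF, a cgroup sock program can read it through the helper wired up above; a hedged sketch (program layout is illustrative):

    /* Hypothetical cgroup/sock program logging the netns and socket
     * cookies at socket creation; sketch only.
     */
    #include <linux/bpf.h>
    #include <bpf/bpf_helpers.h>

    SEC("cgroup/sock")
    int log_cookies(struct bpf_sock *ctx)
    {
            __u64 netns = bpf_get_netns_cookie(ctx);
            __u64 sock = bpf_get_socket_cookie(ctx);

            bpf_printk("netns cookie %llu sock cookie %llu", netns, sock);
            return 1;       /* allow the socket */
    }

    char _license[] SEC("license") = "GPL";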
diff --git a/net/core/sock.c b/net/core/sock.c
index c84f68b..a4ee174 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -2074,6 +2074,18 @@
}
EXPORT_SYMBOL(sock_efree);
+/* Buffer destructor for prefetch/receive path where reference count may
+ * not be held, e.g. for listen sockets.
+ */
+#ifdef CONFIG_INET
+void sock_pfree(struct sk_buff *skb)
+{
+ if (sk_is_refcounted(skb->sk))
+ sock_gen_put(skb->sk);
+}
+EXPORT_SYMBOL(sock_pfree);
+#endif /* CONFIG_INET */
+
kuid_t sock_i_uid(struct sock *sk)
{
kuid_t uid;
diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c
index d61ca7b..906802a 100644
--- a/net/ipv4/af_inet.c
+++ b/net/ipv4/af_inet.c
@@ -753,12 +753,11 @@
}
EXPORT_SYMBOL(inet_accept);
-
/*
* This does both peername and sockname.
*/
int inet_getname(struct socket *sock, struct sockaddr *uaddr,
- int peer)
+ int peer)
{
struct sock *sk = sock->sk;
struct inet_sock *inet = inet_sk(sk);
@@ -779,6 +778,11 @@
sin->sin_port = inet->inet_sport;
sin->sin_addr.s_addr = addr;
}
+ if (cgroup_bpf_enabled)
+ BPF_CGROUP_RUN_SA_PROG_LOCK(sk, (struct sockaddr *)sin,
+ peer ? BPF_CGROUP_INET4_GETPEERNAME :
+ BPF_CGROUP_INET4_GETSOCKNAME,
+ NULL);
memset(sin->sin_zero, 0, sizeof(sin->sin_zero));
return sizeof(*sin);
}
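With the GETSOCKNAME/GETPEERNAME attach types plumbed into inet_getname(), a cgroup sock_addr program can rewrite what getsockname(2)/getpeername(2) report, e.g. to hide an address rewrite done at connect time. A hedged sketch (the substituted address is an arbitrary example):

    /* Hypothetical cgroup/getsockname4 program; sketch only. */
    #include <linux/bpf.h>
    #include <bpf/bpf_helpers.h>
    #include <bpf/bpf_endian.h>

    SEC("cgroup/getsockname4")
    int hide_rewrite(struct bpf_sock_addr *ctx)
    {
            /* Report 127.0.0.1:4040 instead of the real address. */
            ctx->user_ip4 = bpf_htonl(0x7f000001);
            ctx->user_port = bpf_htons(4040);
            return 1;
    }

    char _license[] SEC("license") = "GPL";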
diff --git a/net/ipv4/ip_input.c b/net/ipv4/ip_input.c
index c59a78a..72cbfc7 100644
--- a/net/ipv4/ip_input.c
+++ b/net/ipv4/ip_input.c
@@ -494,7 +494,8 @@
IPCB(skb)->iif = skb->skb_iif;
/* Must drop socket now because of tproxy. */
- skb_orphan(skb);
+ if (!skb_sk_is_prefetched(skb))
+ skb_orphan(skb);
return skb;
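skb_sk_is_prefetched() and the two-argument skb_steal_sock() used here are defined in include/net/sock.h elsewhere in this series; roughly they look like the following, reproduced as a sketch for context:

    /* Approximate include/net/sock.h helpers from this series. */
    static inline bool skb_sk_is_prefetched(struct sk_buff *skb)
    {
    #ifdef CONFIG_INET
            return skb->destructor == sock_pfree;
    #else
            return false;
    #endif
    }

    static inline struct sock *
    skb_steal_sock(struct sk_buff *skb, bool *refcounted)
    {
            if (skb->sk) {
                    struct sock *sk = skb->sk;

                    *refcounted = true;
                    if (skb_sk_is_prefetched(skb))
                            *refcounted = sk_is_refcounted(sk);
                    skb->destructor = NULL;
                    skb->sk = NULL;
                    return sk;
            }
            *refcounted = false;
            return NULL;
    }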
diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index 83fd4fa..febd69b 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -2296,6 +2296,7 @@
struct rtable *rt = skb_rtable(skb);
__be32 saddr, daddr;
struct net *net = dev_net(skb->dev);
+ bool refcounted;
/*
* Validate the packet.
@@ -2321,7 +2322,7 @@
if (udp4_csum_init(skb, uh, proto))
goto csum_error;
- sk = skb_steal_sock(skb);
+ sk = skb_steal_sock(skb, &refcounted);
if (sk) {
struct dst_entry *dst = skb_dst(skb);
int ret;
@@ -2330,7 +2331,8 @@
udp_sk_rx_dst_set(sk, dst);
ret = udp_unicast_rcv_skb(sk, skb, uh);
- sock_put(sk);
+ if (refcounted)
+ sock_put(sk);
return ret;
}
diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c
index 56f396e..7883d84 100644
--- a/net/ipv6/af_inet6.c
+++ b/net/ipv6/af_inet6.c
@@ -501,9 +501,8 @@
/*
* This does both peername and sockname.
*/
-
int inet6_getname(struct socket *sock, struct sockaddr *uaddr,
- int peer)
+ int peer)
{
struct sockaddr_in6 *sin = (struct sockaddr_in6 *)uaddr;
struct sock *sk = sock->sk;
@@ -528,9 +527,13 @@
sin->sin6_addr = np->saddr;
else
sin->sin6_addr = sk->sk_v6_rcv_saddr;
-
sin->sin6_port = inet->inet_sport;
}
+ if (cgroup_bpf_enabled)
+ BPF_CGROUP_RUN_SA_PROG_LOCK(sk, (struct sockaddr *)sin,
+ peer ? BPF_CGROUP_INET6_GETPEERNAME :
+ BPF_CGROUP_INET6_GETSOCKNAME,
+ NULL);
sin->sin6_scope_id = ipv6_iface_scope_id(&sin->sin6_addr,
sk->sk_bound_dev_if);
return sizeof(*sin);
diff --git a/net/ipv6/ip6_input.c b/net/ipv6/ip6_input.c
index e6c4966..99da86a 100644
--- a/net/ipv6/ip6_input.c
+++ b/net/ipv6/ip6_input.c
@@ -253,7 +253,8 @@
rcu_read_unlock();
/* Must drop socket now because of tproxy. */
- skb_orphan(skb);
+ if (!skb_sk_is_prefetched(skb))
+ skb_orphan(skb);
return skb;
err:
diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c
index 040869f..bab45fd 100644
--- a/net/ipv6/udp.c
+++ b/net/ipv6/udp.c
@@ -848,6 +848,7 @@
struct net *net = dev_net(skb->dev);
struct udphdr *uh;
struct sock *sk;
+ bool refcounted;
u32 ulen = 0;
if (!pskb_may_pull(skb, sizeof(struct udphdr)))
@@ -884,7 +885,7 @@
goto csum_error;
/* Check if the socket is already available, e.g. due to early demux */
- sk = skb_steal_sock(skb);
+ sk = skb_steal_sock(skb, &refcounted);
if (sk) {
struct dst_entry *dst = skb_dst(skb);
int ret;
@@ -893,12 +894,14 @@
udp6_sk_rx_dst_set(sk, dst);
if (!uh->check && !udp_sk(sk)->no_check6_rx) {
- sock_put(sk);
+ if (refcounted)
+ sock_put(sk);
goto report_csum_error;
}
ret = udp6_unicast_rcv_skb(sk, skb, uh);
- sock_put(sk);
+ if (refcounted)
+ sock_put(sk);
return ret;
}
diff --git a/net/sched/act_bpf.c b/net/sched/act_bpf.c
index b7d83d0..0b015ce 100644
--- a/net/sched/act_bpf.c
+++ b/net/sched/act_bpf.c
@@ -12,6 +12,7 @@
#include <linux/bpf.h>
#include <net/netlink.h>
+#include <net/sock.h>
#include <net/pkt_sched.h>
#include <net/pkt_cls.h>
@@ -53,6 +54,8 @@
bpf_compute_data_pointers(skb);
filter_res = BPF_PROG_RUN(filter, skb);
}
+ if (skb_sk_is_prefetched(skb) && filter_res != TC_ACT_OK)
+ skb_orphan(skb);
rcu_read_unlock();
/* A BPF program may overwrite the default action opcode.
diff --git a/scripts/Kbuild.include b/scripts/Kbuild.include
index b14a7d4..238fb47 100644
--- a/scripts/Kbuild.include
+++ b/scripts/Kbuild.include
@@ -82,21 +82,20 @@
$(if $(shell command -v -- $(c)gcc 2>/dev/null), $(c))))
# output directory for tests below
-TMPOUT = $(if $(KBUILD_EXTMOD),$(firstword $(KBUILD_EXTMOD))/).tmp_$$$$
+TMPOUT := $(if $(KBUILD_EXTMOD),$(firstword $(KBUILD_EXTMOD))/)
# try-run
# Usage: option = $(call try-run, $(CC)...-o "$$TMP",option-ok,otherwise)
# Exit code chooses option. "$$TMP" serves as a temporary file and is
# automatically cleaned up.
try-run = $(shell set -e; \
- TMP=$(TMPOUT)/tmp; \
- TMPO=$(TMPOUT)/tmp.o; \
- mkdir -p $(TMPOUT); \
- trap "rm -rf $(TMPOUT)" EXIT; \
+ TMP="$(TMPOUT).$$$$.tmp"; \
+ TMPO="$(TMPOUT).$$$$.o"; \
if ($(1)) >/dev/null 2>&1; \
then echo "$(2)"; \
else echo "$(3)"; \
- fi)
+ fi; \
+ rm -f "$$TMP" "$$TMPO")
# as-option
# Usage: cflags-y += $(call as-option,-Wa$(comma)-isa=foo,)
diff --git a/security/Kconfig b/security/Kconfig
index 52e5109..d652a8e 100644
--- a/security/Kconfig
+++ b/security/Kconfig
@@ -239,6 +239,7 @@
source "security/apparmor/Kconfig"
source "security/loadpin/Kconfig"
source "security/yama/Kconfig"
+source "security/container/Kconfig"
source "security/safesetid/Kconfig"
source "security/lockdown/Kconfig"
diff --git a/security/Kconfig.hardening b/security/Kconfig.hardening
index af4c979b..269967c 100644
--- a/security/Kconfig.hardening
+++ b/security/Kconfig.hardening
@@ -19,13 +19,16 @@
menu "Memory initialization"
-config CC_HAS_AUTO_VAR_INIT
+config CC_HAS_AUTO_VAR_INIT_PATTERN
def_bool $(cc-option,-ftrivial-auto-var-init=pattern)
+config CC_HAS_AUTO_VAR_INIT_ZERO
+ def_bool $(cc-option,-ftrivial-auto-var-init=zero -enable-trivial-auto-var-init-zero-knowing-it-will-be-removed-from-clang)
+
choice
prompt "Initialize kernel stack variables at function entry"
default GCC_PLUGIN_STRUCTLEAK_BYREF_ALL if COMPILE_TEST && GCC_PLUGINS
- default INIT_STACK_ALL if COMPILE_TEST && CC_HAS_AUTO_VAR_INIT
+ default INIT_STACK_ALL_PATTERN if COMPILE_TEST && CC_HAS_AUTO_VAR_INIT_PATTERN
default INIT_STACK_NONE
help
This option enables initialization of stack variables at
@@ -88,9 +91,9 @@
of uninitialized stack variable exploits and information
exposures.
- config INIT_STACK_ALL
+ config INIT_STACK_ALL_PATTERN
bool "0xAA-init everything on the stack (strongest)"
- depends on CC_HAS_AUTO_VAR_INIT
+ depends on CC_HAS_AUTO_VAR_INIT_PATTERN
help
Initializes everything on the stack with a 0xAA
pattern. This is intended to eliminate all classes
@@ -98,6 +101,24 @@
exposures, even variables that were warned to have been
left uninitialized.
+ Pattern initialization is known to provoke many existing bugs
+ related to uninitialized locals, e.g. pointers receive
+ non-NULL values, and buffer sizes and indices become very large.
+
+ config INIT_STACK_ALL_ZERO
+ bool "zero-init everything on the stack (strongest and safest)"
+ depends on CC_HAS_AUTO_VAR_INIT_ZERO
+ help
+ Initializes everything on the stack with a zero
+ value. This is intended to eliminate all classes
+ of uninitialized stack variable exploits and information
+ exposures, even variables that were warned to have been
+ left uninitialized.
+
+ Zero initialization provides safe defaults for strings,
+ pointers, indices and sizes, and is therefore
+ more suitable as a security mitigation measure.
+
endchoice
config GCC_PLUGIN_STRUCTLEAK_VERBOSE
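As a concrete illustration of the difference between the two modes, a hedged userspace analogy using the same clang flags (not kernel code):

    /* Build with:
     *   clang -ftrivial-auto-var-init=zero \
     *     -enable-trivial-auto-var-init-zero-knowing-it-will-be-removed-from-clang demo.c
     * Under pattern init, buf would hold 0xAA bytes; under zero init it
     * holds zeroes; without either flag its contents are indeterminate.
     */
    #include <stdio.h>

    int main(void)
    {
            char buf[16];   /* deliberately not initialized */

            printf("buf[0] = %d\n", buf[0]);
            return 0;
    }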
diff --git a/security/Makefile b/security/Makefile
index be1dd9d..a5d987b 100644
--- a/security/Makefile
+++ b/security/Makefile
@@ -10,6 +10,7 @@
subdir-$(CONFIG_SECURITY_APPARMOR) += apparmor
subdir-$(CONFIG_SECURITY_YAMA) += yama
subdir-$(CONFIG_SECURITY_LOADPIN) += loadpin
+subdir-$(CONFIG_SECURITY_CONTAINER_MONITOR) += container
subdir-$(CONFIG_SECURITY_SAFESETID) += safesetid
subdir-$(CONFIG_SECURITY_LOCKDOWN_LSM) += lockdown
@@ -30,6 +31,7 @@
obj-$(CONFIG_SECURITY_SAFESETID) += safesetid/
obj-$(CONFIG_SECURITY_LOCKDOWN_LSM) += lockdown/
obj-$(CONFIG_CGROUP_DEVICE) += device_cgroup.o
+obj-$(CONFIG_SECURITY_CONTAINER_MONITOR) += container/
# Object integrity file lists
subdir-$(CONFIG_INTEGRITY) += integrity
diff --git a/security/container/Kconfig b/security/container/Kconfig
new file mode 100644
index 0000000..72a51eb
--- /dev/null
+++ b/security/container/Kconfig
@@ -0,0 +1,17 @@
+config SECURITY_CONTAINER_MONITOR
+ bool "Monitor containerized processes"
+ depends on SECURITY
+ depends on MMU
+ depends on X86_64
+ select SECURITYFS
+ help
+ Instrument the Linux kernel to collect more information about containers
+ and identify security threats.
+
+config SECURITY_CONTAINER_MONITOR_DEBUG
+ bool "Enable debug pr_devel logs"
+ depends on SECURITY_CONTAINER_MONITOR
+ help
+ Define DEBUG for CSM files to compile verbose debugging messages.
+
+ Only for debugging/testing; do not enable in production.
diff --git a/security/container/Makefile b/security/container/Makefile
new file mode 100644
index 0000000..9be2528
--- /dev/null
+++ b/security/container/Makefile
@@ -0,0 +1,16 @@
+PB_CCFLAGS := -DPB_SYSTEM_HEADER="<pbsystem.h>" \
+ -DPB_NO_ERRMSG \
+ -DPB_FIELD_16BIT \
+ -DPB_BUFFER_ONLY
+export PB_CCFLAGS
+
+subdir-$(CONFIG_SECURITY_CONTAINER_MONITOR) += protos
+
+obj-$(CONFIG_SECURITY_CONTAINER_MONITOR) += protos/
+obj-$(CONFIG_SECURITY_CONTAINER_MONITOR) += monitor.o pb.o process.o pipe.o
+
+ccflags-y := -I$(srctree)/security/container/protos \
+ -I$(srctree)/security/container/protos/nanopb \
+ -I$(srctree)/fs \
+ $(PB_CCFLAGS)
+ccflags-$(CONFIG_SECURITY_CONTAINER_MONITOR_DEBUG) += -DDEBUG
diff --git a/security/container/monitor.c b/security/container/monitor.c
new file mode 100644
index 0000000..947a79b
--- /dev/null
+++ b/security/container/monitor.c
@@ -0,0 +1,768 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Container Security Monitor module
+ *
+ * Copyright (c) 2018 Google, Inc
+ */
+
+#include "monitor.h"
+#include "process.h"
+
+#include <linux/audit.h>
+#include <linux/file.h>
+#include <linux/lsm_hooks.h>
+#include <linux/module.h>
+#include <linux/pipe_fs_i.h>
+#include <linux/poll.h>
+#include <linux/rwsem.h>
+#include <linux/seq_file.h>
+#include <linux/string.h>
+#include <linux/sysctl.h>
+#include <overlayfs/overlayfs.h>
+#include <uapi/linux/magic.h>
+
+/* protects csm_*_enabled and configurations. */
+DECLARE_RWSEM(csm_rwsem_config);
+
+/* queue used for poll wait on config changes. */
+static DECLARE_WAIT_QUEUE_HEAD(config_wait);
+
+/* Incremented each time a new configuration is applied. */
+static unsigned long config_version;
+
+/* Stats gathered from the LSM. */
+struct container_stats csm_stats;
+
+struct container_stats_mapping {
+ const char *key;
+ size_t *value;
+};
+
+/* Key-value pair mapping for the securityfs stats entry. */
+struct container_stats_mapping csm_stats_mapping[] = {
+ { "ProtoEncodingFailed", &csm_stats.proto_encoding_failed },
+ { "WorkQueueFailed", &csm_stats.workqueue_failed },
+ { "EventWritingFailed", &csm_stats.event_writing_failed },
+ { "SizePickingFailed", &csm_stats.size_picking_failed },
+ { "PipeAlreadyOpened", &csm_stats.pipe_already_opened },
+ { "CsmSetxattr", &csm_stats.csm_setxattr },
+};
+
+/*
+ * Is monitoring enabled? Defaults to disabled.
+ * These variables might be used without locking csm_rwsem_config to check if an
+ * LSM hook can bail quickly. The semaphore is taken later to ensure CSM is
+ * still enabled.
+ *
+ * csm_enabled is true if any collector is enabled.
+ */
+bool csm_enabled;
+static bool csm_container_enabled;
+bool csm_execute_enabled;
+bool csm_memexec_enabled;
+
+/* securityfs control files */
+static struct dentry *csm_dir;
+static struct dentry *csm_enabled_file;
+static struct dentry *csm_container_file;
+static struct dentry *csm_config_file;
+static struct dentry *csm_config_vers_file;
+static struct dentry *csm_pipe_file;
+static struct dentry *csm_stats_file;
+
+/* Pipes to forward data to user-mode. */
+DECLARE_RWSEM(csm_rwsem_pipe);
+static struct file *csm_user_read_pipe;
+struct file *csm_user_write_pipe;
+
+/* Option to disable the CSM features at boot. */
+static bool cmdline_boot_disabled;
+bool cmdline_boot_vsock_enabled;
+
+/* Options disabled by default. */
+static bool cmdline_boot_pipe_enabled;
+static bool cmdline_boot_config_enabled;
+
+/* Option to fully enable the LSM at boot for automated testing. */
+static bool cmdline_default_enabled;
+
+static int csm_boot_disabled_setup(char *str)
+{
+ return kstrtobool(str, &cmdline_boot_disabled);
+}
+early_param("csm.disabled", csm_boot_disabled_setup);
+
+static int csm_default_enabled_setup(char *str)
+{
+ return kstrtobool(str, &cmdline_default_enabled);
+}
+early_param("csm.default.enabled", csm_default_enabled_setup);
+
+static int csm_boot_vsock_enabled_setup(char *str)
+{
+ return kstrtobool(str, &cmdline_boot_vsock_enabled);
+}
+early_param("csm.vsock.enabled", csm_boot_vsock_enabled_setup);
+
+static int csm_boot_pipe_enabled_setup(char *str)
+{
+ return kstrtobool(str, &cmdline_boot_pipe_enabled);
+}
+early_param("csm.pipe.enabled", csm_boot_pipe_enabled_setup);
+
+static int csm_boot_config_enabled_setup(char *str)
+{
+ return kstrtobool(str, &cmdline_boot_config_enabled);
+}
+early_param("csm.config.enabled", csm_boot_config_enabled_setup);
+
+static bool pipe_in_use(void)
+{
+ struct pipe_inode_info *pipe;
+
+ lockdep_assert_held_write(&csm_rwsem_config);
+ if (csm_user_read_pipe) {
+ pipe = get_pipe_info(csm_user_read_pipe);
+ if (pipe)
+ return READ_ONCE(pipe->readers) > 1;
+ }
+ return false;
+}
+
+/* Close the pipe; force must be true to close it while it is still in use. */
+int close_pipe_files(bool force)
+{
+ if (csm_user_read_pipe) {
+ /* Pipe is still used. */
+ if (pipe_in_use()) {
+ if (!force)
+ return -EBUSY;
+ pr_warn("pipe is closed while it is still being used.\n");
+ }
+
+ fput(csm_user_read_pipe);
+ fput(csm_user_write_pipe);
+ csm_user_read_pipe = NULL;
+ csm_user_write_pipe = NULL;
+ }
+ return 0;
+}
+
+static void csm_update_config(schema_ConfigurationRequest *req)
+{
+ schema_ExecuteCollectorConfig *econf;
+ size_t i;
+ bool enumerate_processes = false;
+
+ /* Expect the lock to be held for write before this call. */
+ lockdep_assert_held_write(&csm_rwsem_config);
+
+ /* This covers the scenario where a client is connected and the config
+ * transitions the execute collector from disabled to enabled. In that
+ * case execute events may have been missed, so existing processes
+ * are enumerated.
+ */
+ if (!csm_execute_enabled && req->execute_config.enabled &&
+ pipe_in_use())
+ enumerate_processes = true;
+
+ csm_container_enabled = req->container_config.enabled;
+ csm_execute_enabled = req->execute_config.enabled;
+ csm_memexec_enabled = req->memexec_config.enabled;
+
+ /* csm_enabled is true if any collector is enabled. */
+ csm_enabled = csm_container_enabled || csm_execute_enabled ||
+ csm_memexec_enabled;
+
+ /* Clean-up existing configurations. */
+ kfree(csm_execute_config.envp_allowlist);
+ memset(&csm_execute_config, 0, sizeof(csm_execute_config));
+
+ if (csm_execute_enabled) {
+ econf = &req->execute_config;
+ csm_execute_config.argv_limit = econf->argv_limit;
+ csm_execute_config.envp_limit = econf->envp_limit;
+
+ /* Swap the allowlist so it is not freed on return. */
+ csm_execute_config.envp_allowlist = econf->envp_allowlist.arg;
+ econf->envp_allowlist.arg = NULL;
+ }
+
+ /* Reset all stats and close pipe if disabled. */
+ if (!csm_enabled) {
+ for (i = 0; i < ARRAY_SIZE(csm_stats_mapping); i++)
+ *csm_stats_mapping[i].value = 0;
+
+ close_pipe_files(true);
+ }
+
+ config_version++;
+ if (enumerate_processes)
+ csm_enumerate_processes();
+ wake_up(&config_wait);
+}
+
+int csm_update_config_from_buffer(void *data, size_t size)
+{
+ schema_ConfigurationRequest c = {};
+ pb_istream_t istream;
+
+ c.execute_config.envp_allowlist.funcs.decode = pb_decode_string_array;
+
+ istream = pb_istream_from_buffer(data, size);
+ if (!pb_decode(&istream, schema_ConfigurationRequest_fields, &c)) {
+ kfree(c.execute_config.envp_allowlist.arg);
+ return -EINVAL;
+ }
+
+ down_write(&csm_rwsem_config);
+ csm_update_config(&c);
+ up_write(&csm_rwsem_config);
+
+ return 0;
+}
+
+static ssize_t csm_config_write(struct file *file, const char __user *buf,
+ size_t count, loff_t *ppos)
+{
+ ssize_t err = 0;
+ void *mem;
+
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
+ /* No partial writes. */
+ if (*ppos != 0)
+ return -EINVAL;
+
+ /* Duplicate user memory to safely parse protobuf. */
+ mem = memdup_user(buf, count);
+ if (IS_ERR(mem))
+ return PTR_ERR(mem);
+
+ err = csm_update_config_from_buffer(mem, count);
+ if (!err)
+ err = count;
+
+ kfree(mem);
+ return err;
+}
+
+static const struct file_operations csm_config_fops = {
+ .write = csm_config_write,
+};
+
+static void csm_enable(void)
+{
+ schema_ConfigurationRequest req = {};
+
+ /* Expect the lock to be held for write before this call. */
+ lockdep_assert_held_write(&csm_rwsem_config);
+
+ /* Default configuration */
+ req.container_config.enabled = true;
+ req.execute_config.enabled = true;
+ req.execute_config.argv_limit = UINT_MAX;
+ req.execute_config.envp_limit = UINT_MAX;
+ req.memexec_config.enabled = true;
+ csm_update_config(&req);
+}
+
+static void csm_disable(void)
+{
+ schema_ConfigurationRequest req = {};
+
+ /* Expect the lock to be held for write before this call. */
+ lockdep_assert_held_write(&csm_rwsem_config);
+
+ /* A zeroed configuration disables all collectors. */
+ csm_update_config(&req);
+ pr_info("disabled\n");
+}
+
+static ssize_t csm_enabled_read(struct file *file, char __user *buf,
+ size_t count, loff_t *ppos)
+{
+ const char *str = csm_enabled ? "1\n" : "0\n";
+
+ return simple_read_from_buffer(buf, count, ppos, str, 2);
+}
+
+static ssize_t csm_enabled_write(struct file *file, const char __user *buf,
+ size_t count, loff_t *ppos)
+{
+ bool enabled;
+ int err;
+
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
+ if (count <= 0 || count > PAGE_SIZE || *ppos)
+ return -EINVAL;
+
+ err = kstrtobool_from_user(buf, count, &enabled);
+ if (err)
+ return err;
+
+ down_write(&csm_rwsem_config);
+
+ if (enabled)
+ csm_enable();
+ else
+ csm_disable();
+
+ up_write(&csm_rwsem_config);
+
+ return count;
+}
+
+static const struct file_operations csm_enabled_fops = {
+ .read = csm_enabled_read,
+ .write = csm_enabled_write,
+};
+
+static int csm_config_version_open(struct inode *inode, struct file *file)
+{
+ /* private_data is used to keep the latest config version read. */
+ file->private_data = (void *)-1;
+ return 0;
+}
+
+static ssize_t csm_config_version_read(struct file *file, char __user *buf,
+ size_t count, loff_t *ppos)
+{
+ unsigned long version = config_version;
+ file->private_data = (void *)version;
+ return simple_read_from_buffer(buf, count, ppos, &version,
+ sizeof(version));
+}
+
+static __poll_t csm_config_version_poll(struct file *file,
+ struct poll_table_struct *poll_tab)
+{
+ if ((unsigned long)file->private_data != config_version)
+ return EPOLLIN;
+ poll_wait(file, &config_wait, poll_tab);
+ if ((unsigned long)file->private_data != config_version)
+ return EPOLLIN;
+ return 0;
+}
+
+static const struct file_operations csm_config_version_fops = {
+ .open = csm_config_version_open,
+ .read = csm_config_version_read,
+ .poll = csm_config_version_poll,
+};
+
+static int csm_pipe_open(struct inode *inode, struct file *file)
+{
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+ if (!csm_enabled)
+ return -EAGAIN;
+ return 0;
+}
+
+/* Similar to file_clone_open that is available only in 4.19 and up. */
+static inline struct file *pipe_clone_open(struct file *file)
+{
+ return dentry_open(&file->f_path, file->f_flags, file->f_cred);
+}
+
+/* Check if the pipe is still used, else recreate and dup it. */
+static struct file *csm_dup_pipe(void)
+{
+ long pipe_size = 1024 * PAGE_SIZE;
+ long actual_size;
+ struct file *pipes[2] = {NULL, NULL};
+ struct file *ret;
+ int err;
+
+ down_write(&csm_rwsem_pipe);
+
+ err = close_pipe_files(false);
+ if (err) {
+ ret = ERR_PTR(err);
+ csm_stats.pipe_already_opened++;
+ goto out;
+ }
+
+ err = create_pipe_files(pipes, O_NONBLOCK);
+ if (err) {
+ ret = ERR_PTR(err);
+ goto out;
+ }
+
+ /*
+ * Try to increase the pipe size to 1024 pages; if there is not
+ * enough memory, the pipe stays at its default size.
+ */
+ actual_size = pipe_fcntl(pipes[0], F_SETPIPE_SZ, pipe_size);
+ if (actual_size != pipe_size)
+ pr_err("failed to resize pipe to 1024 pages, error: %ld, fallback to the default value\n",
+ actual_size);
+
+ csm_user_read_pipe = pipes[0];
+ csm_user_write_pipe = pipes[1];
+
+ /* Clone the file so we can track whether the reader is still in use. */
+ ret = pipe_clone_open(csm_user_read_pipe);
+
+out:
+ up_write(&csm_rwsem_pipe);
+ return ret;
+}
+
+static ssize_t csm_pipe_read(struct file *file, char __user *buf,
+ size_t count, loff_t *ppos)
+{
+ int fd;
+ ssize_t err;
+ struct file *local_pipe;
+
+ /* No partial reads. */
+ if (*ppos != 0)
+ return -EINVAL;
+
+ fd = get_unused_fd_flags(0);
+ if (fd < 0)
+ return fd;
+
+ local_pipe = csm_dup_pipe();
+ if (IS_ERR(local_pipe)) {
+ err = PTR_ERR(local_pipe);
+ local_pipe = NULL;
+ goto error;
+ }
+
+ err = simple_read_from_buffer(buf, count, ppos, &fd, sizeof(fd));
+ if (err < 0)
+ goto error;
+
+ if (err < sizeof(fd)) {
+ err = -EINVAL;
+ goto error;
+ }
+
+ /* Install the file descriptor when we know everything succeeded. */
+ fd_install(fd, local_pipe);
+
+ csm_enumerate_processes();
+
+ return err;
+
+error:
+ if (local_pipe)
+ fput(local_pipe);
+ put_unused_fd(fd);
+ return err;
+}
+
+static const struct file_operations csm_pipe_fops = {
+ .open = csm_pipe_open,
+ .read = csm_pipe_read,
+};
+
+static void set_container_decode_callbacks(schema_Container *container)
+{
+ container->pod_namespace.funcs.decode = pb_decode_string_field;
+ container->pod_name.funcs.decode = pb_decode_string_field;
+ container->container_name.funcs.decode = pb_decode_string_field;
+ container->container_image_uri.funcs.decode = pb_decode_string_field;
+ container->labels.funcs.decode = pb_decode_string_array;
+}
+
+static void set_container_encode_callbacks(schema_Container *container)
+{
+ container->pod_namespace.funcs.encode = pb_encode_string_field;
+ container->pod_name.funcs.encode = pb_encode_string_field;
+ container->container_name.funcs.encode = pb_encode_string_field;
+ container->container_image_uri.funcs.encode = pb_encode_string_field;
+ container->labels.funcs.encode = pb_encode_string_array;
+}
+
+static void free_container_callbacks_args(schema_Container *container)
+{
+ kfree(container->pod_namespace.arg);
+ kfree(container->pod_name.arg);
+ kfree(container->container_name.arg);
+ kfree(container->container_image_uri.arg);
+ kfree(container->labels.arg);
+}
+
+static ssize_t csm_container_write(struct file *file, const char __user *buf,
+ size_t count, loff_t *ppos)
+{
+ ssize_t err = 0;
+ void *mem;
+ u64 cid;
+ pb_istream_t istream;
+ struct task_struct *task;
+ schema_ContainerReport report = {};
+ schema_Event event = {};
+ schema_Container *container;
+ char *uuid = NULL;
+
+ /* Notify that this collector is not yet enabled. */
+ if (!csm_container_enabled)
+ return -EAGAIN;
+
+ /* No partial writes. */
+ if (*ppos != 0)
+ return -EINVAL;
+
+ /* Duplicate user memory to safely parse protobuf. */
+ mem = memdup_user(buf, count);
+ if (IS_ERR(mem))
+ return PTR_ERR(mem);
+
+ /* Callback to decode string in protobuf. */
+ set_container_decode_callbacks(&report.container);
+
+ istream = pb_istream_from_buffer(mem, count);
+ if (!pb_decode(&istream, schema_ContainerReport_fields, &report)) {
+ err = -EINVAL;
+ goto out;
+ }
+
+ /* Check protobuf is as expected */
+ if (report.pid == 0 ||
+ report.container.container_id != 0) {
+ err = -EINVAL;
+ goto out;
+ }
+
+ /* Find if the process id is linked to an existing container-id. */
+ rcu_read_lock();
+ task = find_task_by_pid_ns(report.pid, &init_pid_ns);
+ if (task) {
+ cid = audit_get_contid(task);
+ if (cid == AUDIT_CID_UNSET)
+ err = -ENOENT;
+ } else {
+ err = -ENOENT;
+ }
+ rcu_read_unlock();
+
+ if (err)
+ goto out;
+
+ uuid = kzalloc(PROCESS_UUID_SIZE, GFP_KERNEL);
+ if (!uuid) {
+ err = -ENOMEM;
+ goto out;
+ }
+
+ /* Provide the uuid for the top process of the container. */
+ err = get_process_uuid_by_pid(report.pid, uuid, PROCESS_UUID_SIZE);
+ if (err)
+ goto out;
+
+ /* Correct the container-id and feed the event to pipe */
+ report.has_container = true;
+ report.container.container_id = cid;
+ report.container.init_uuid.funcs.encode = pb_encode_uuid_field;
+ report.container.init_uuid.arg = uuid;
+ container = &event.event.container.container;
+ *container = report.container;
+
+ /* Use encode callback to generate the final proto. */
+ set_container_encode_callbacks(container);
+
+ event.which_event = schema_Event_container_tag;
+ event.event.container.has_container = true;
+
+ err = csm_sendeventproto(schema_Event_fields, &event);
+ if (!err)
+ err = count;
+
+out:
+ /* Free any allocated nanopb callback arguments. */
+ free_container_callbacks_args(&report.container);
+ kfree(uuid);
+ kfree(mem);
+ return err;
+}
+
+static const struct file_operations csm_container_fops = {
+ .write = csm_container_write,
+};
+
+static int csm_show_stats(struct seq_file *p, void *v)
+{
+ size_t i;
+
+ for (i = 0; i < ARRAY_SIZE(csm_stats_mapping); i++) {
+ seq_printf(p, "%s:\t%zu\n",
+ csm_stats_mapping[i].key,
+ *csm_stats_mapping[i].value);
+ }
+
+ return 0;
+}
+
+static int csm_stats_open(struct inode *inode, struct file *file)
+{
+ size_t i, size = 1; /* Start at one for the null byte. */
+
+ for (i = 0; i < ARRAY_SIZE(csm_stats_mapping); i++) {
+ /*
+ * Calculate the maximum length:
+ * - Length of the key
+ * - 3 additional chars :\t\n
+ * - longest unsigned 64-bit integer.
+ */
+ size += strlen(csm_stats_mapping[i].key)
+ + 3 + sizeof("18446744073709551615");
+ }
+
+ return single_open_size(file, csm_show_stats, NULL, size);
+}
+
+static const struct file_operations csm_stats_fops = {
+ .open = csm_stats_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = single_release,
+};
+
+static bool is_d_overlayfs_mounted(struct dentry *dentry)
+{
+ struct super_block *mnt_sb;
+
+ if (dentry == NULL || dentry->d_inode == NULL)
+ return false;
+
+ mnt_sb = dentry->d_inode->i_sb;
+ if (mnt_sb == NULL || mnt_sb->s_magic != OVERLAYFS_SUPER_MAGIC)
+ return false;
+
+ return true;
+}
+
+static int csm_setxattr(struct dentry *dentry, const char *name,
+ const void *value, size_t size, int flags)
+{
+ if (csm_enabled &&
+ (audit_get_contid(current) != AUDIT_CID_UNSET) &&
+ is_d_overlayfs_mounted(dentry) &&
+ (strcmp(name, XATTR_SECURITY_CSM) == 0))
+ csm_stats.csm_setxattr++;
+
+ return 0;
+}
+
+static struct security_hook_list csm_hooks[] __lsm_ro_after_init = {
+ /* Track process execution. */
+ LSM_HOOK_INIT(bprm_check_security, csm_bprm_check_security),
+ LSM_HOOK_INIT(task_post_alloc, csm_task_post_alloc),
+ LSM_HOOK_INIT(task_exit, csm_task_exit),
+
+ /* Track memory execution */
+ LSM_HOOK_INIT(file_mprotect, csm_mprotect),
+ LSM_HOOK_INIT(mmap_file, csm_mmap_file),
+
+ /* Track file modification provenance. */
+ LSM_HOOK_INIT(file_pre_free_security, csm_file_pre_free),
+
+ /* Block modifying the csm xattr. */
+ LSM_HOOK_INIT(inode_setxattr, csm_setxattr),
+};
+
+static int __init csm_init(void)
+{
+ int err;
+
+ if (cmdline_boot_disabled)
+ return 0;
+
+ if (cmdline_boot_vsock_enabled)
+ pr_debug("vsock is deprecated, but was enabled at boot\n");
+
+ csm_dir = securityfs_create_dir("container_monitor", NULL);
+ if (IS_ERR(csm_dir)) {
+ err = PTR_ERR(csm_dir);
+ goto error;
+ }
+
+ csm_enabled_file = securityfs_create_file("enabled", 0644, csm_dir,
+ NULL, &csm_enabled_fops);
+ if (IS_ERR(csm_enabled_file)) {
+ err = PTR_ERR(csm_enabled_file);
+ goto error_rmdir;
+ }
+
+ csm_container_file = securityfs_create_file("container", 0200, csm_dir,
+ NULL, &csm_container_fops);
+ if (IS_ERR(csm_container_file)) {
+ err = PTR_ERR(csm_container_file);
+ goto error_rm_enabled;
+ }
+
+ csm_config_vers_file = securityfs_create_file("config_version", 0400,
+ csm_dir, NULL,
+ &csm_config_version_fops);
+ if (IS_ERR(csm_config_vers_file)) {
+ err = PTR_ERR(csm_config_vers_file);
+ goto error_rm_container;
+ }
+
+ if (cmdline_boot_config_enabled) {
+ csm_config_file = securityfs_create_file("config", 0200,
+ csm_dir, NULL,
+ &csm_config_fops);
+ if (IS_ERR(csm_config_file)) {
+ err = PTR_ERR(csm_config_file);
+ goto error_rm_config_vers;
+ }
+ }
+
+ if (cmdline_boot_pipe_enabled) {
+ csm_pipe_file = securityfs_create_file("pipe", 0400, csm_dir,
+ NULL, &csm_pipe_fops);
+ if (IS_ERR(csm_pipe_file)) {
+ err = PTR_ERR(csm_pipe_file);
+ goto error_rm_config;
+ }
+ }
+
+ csm_stats_file = securityfs_create_file("stats", 0400, csm_dir,
+ NULL, &csm_stats_fops);
+ if (IS_ERR(csm_stats_file)) {
+ err = PTR_ERR(csm_stats_file);
+ goto error_rm_pipe;
+ }
+
+ pr_debug("created securityfs control files\n");
+
+ security_add_hooks(csm_hooks, ARRAY_SIZE(csm_hooks), "csm");
+ pr_debug("registered hooks\n");
+
+ /* Off-by-default, only used for testing images. */
+ if (cmdline_default_enabled) {
+ down_write(&csm_rwsem_config);
+ csm_enable();
+ up_write(&csm_rwsem_config);
+ }
+
+ return 0;
+
+error_rm_pipe:
+ if (cmdline_boot_pipe_enabled)
+ securityfs_remove(csm_pipe_file);
+error_rm_config:
+ if (cmdline_boot_config_enabled)
+ securityfs_remove(csm_config_file);
+error_rm_config_vers:
+ securityfs_remove(csm_config_vers_file);
+error_rm_container:
+ securityfs_remove(csm_container_file);
+error_rm_enabled:
+ securityfs_remove(csm_enabled_file);
+error_rmdir:
+ securityfs_remove(csm_dir);
+error:
+ pr_warn("fs initialization error: %d", err);
+ return err;
+}
+
+late_initcall(csm_init);
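With all collectors registered, the module exposes its control surface under securityfs, typically /sys/kernel/security/container_monitor/{enabled,container,config_version,config,pipe,stats}. A hedged userspace sketch toggling the LSM (path assumes the usual securityfs mount; requires CAP_SYS_ADMIN):

    /* Hypothetical helper enabling CSM via securityfs; sketch only. */
    #include <fcntl.h>
    #include <stdio.h>
    #include <unistd.h>

    int main(void)
    {
            int fd = open("/sys/kernel/security/container_monitor/enabled",
                          O_WRONLY);

            if (fd < 0) {
                    perror("open");
                    return 1;
            }
            /* kstrtobool_from_user() accepts "1"/"0", "y"/"n", "on"/"off". */
            if (write(fd, "1", 1) != 1)
                    perror("write");
            close(fd);
            return 0;
    }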
diff --git a/security/container/monitor.h b/security/container/monitor.h
new file mode 100644
index 0000000..654a808
--- /dev/null
+++ b/security/container/monitor.h
@@ -0,0 +1,110 @@
+/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
+/*
+ * Container Security Monitor module
+ *
+ * Copyright (c) 2018 Google, Inc
+ */
+
+#define pr_fmt(fmt) "container-security-monitor: " fmt
+
+#include <linux/kernel.h>
+#include <linux/security.h>
+#include <linux/fs.h>
+#include <linux/rwsem.h>
+#include <linux/binfmts.h>
+#include <linux/xattr.h>
+#include <config.pb.h>
+#include <event.pb.h>
+#include <pb_encode.h>
+#include <pb_decode.h>
+
+#include "monitoring_protocol.h"
+
+/* Part of the CSM configuration response. */
+#define CSM_VERSION 1
+
+/* protects csm_*_enabled and configurations. */
+extern struct rw_semaphore csm_rwsem_config;
+
+/*
+ * Is monitoring enabled? Defaults to disabled.
+ * These variables may be used as gates without locking (the processor
+ * guarantees atomic loads of aligned native scalar values) so hooks can
+ * bail out quickly.
+ */
+extern bool csm_enabled;
+extern bool csm_execute_enabled;
+extern bool csm_memexec_enabled;
+
+/* Configuration options for execute collector. */
+struct execute_config {
+ size_t argv_limit;
+ size_t envp_limit;
+ char *envp_allowlist;
+};
+
+extern struct execute_config csm_execute_config;
+
+/* pipe to forward events to user-mode. */
+extern struct rw_semaphore csm_rwsem_pipe;
+extern struct file *csm_user_write_pipe;
+
+/* Stats on LSM events. */
+struct container_stats {
+ size_t proto_encoding_failed;
+ size_t event_writing_failed;
+ size_t workqueue_failed;
+ size_t size_picking_failed;
+ size_t pipe_already_opened;
+ size_t csm_setxattr;
+};
+
+extern struct container_stats csm_stats;
+
+/* Standard stream file numbers are not defined in kernel headers. */
+#define STDIN_FILENO 0
+#define STDOUT_FILENO 1
+#define STDERR_FILENO 2
+
+/* security attribute for file provenance. */
+#define XATTR_SECURITY_CSM XATTR_SECURITY_PREFIX "csm"
+
+/* monitor functions */
+int csm_update_config_from_buffer(void *data, size_t size);
+
+/* send event to userland */
+int csm_sendeventproto(const pb_msgdesc_t *fields, schema_Event *event);
+
+/* process events functions */
+int csm_bprm_check_security(struct linux_binprm *bprm);
+void csm_task_exit(struct task_struct *task);
+void csm_task_post_alloc(struct task_struct *task);
+int get_process_uuid_by_pid(pid_t pid_nr, char *buffer, size_t size);
+
+/* memory execution events functions */
+int csm_mprotect(struct vm_area_struct *vma, unsigned long reqprot,
+ unsigned long prot);
+int csm_mmap_file(struct file *file, unsigned long reqprot,
+ unsigned long prot, unsigned long flags);
+
+/* Tracking of file modification provenance. */
+void csm_file_pre_free(struct file *file);
+
+/* nanopb helper functions */
+bool pb_encode_string_field(pb_ostream_t *stream, const pb_field_t *field,
+ void * const *arg);
+bool pb_decode_string_field(pb_istream_t *stream, const pb_field_t *field,
+ void **arg);
+ssize_t pb_encode_string_field_limit(pb_ostream_t *stream,
+ const pb_field_t *field,
+ void * const *arg, size_t limit);
+bool pb_encode_string_array(pb_ostream_t *stream, const pb_field_t *field,
+ void * const *arg);
+bool pb_decode_string_array(pb_istream_t *stream, const pb_field_t *field,
+ void **arg);
+bool pb_encode_uuid_field(pb_ostream_t *stream, const pb_field_t *field,
+ void * const *arg);
+bool pb_encode_ip4(pb_ostream_t *stream, const pb_field_t *field,
+ void * const *arg);
+bool pb_encode_ip6(pb_ostream_t *stream, const pb_field_t *field,
+ void * const *arg);
+
diff --git a/security/container/monitoring_protocol.h b/security/container/monitoring_protocol.h
new file mode 100644
index 0000000..dbdfc9c
--- /dev/null
+++ b/security/container/monitoring_protocol.h
@@ -0,0 +1,30 @@
+/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
+
+/* Container security monitoring protocol definitions */
+
+#include <linux/types.h>
+
+enum csm_msgtype {
+ CSM_MSG_TYPE_HEARTBEAT = 1,
+ CSM_MSG_EVENT_PROTO = 2,
+ CSM_MSG_CONFIG_REQUEST_PROTO = 3,
+ CSM_MSG_CONFIG_RESPONSE_PROTO = 4,
+};
+
+struct csm_msg_hdr {
+ __le32 msg_type;
+ __le32 msg_length;
+};
+
+/* The process uuid is a 128-bit identifier. */
+#define PROCESS_UUID_SIZE 16
+
+/* The entire structure forms the collision domain. */
+union process_uuid {
+ struct {
+ __u32 machineid;
+ __u64 start_time;
+ __u32 tgid;
+ } __attribute__((packed));
+ __u8 data[PROCESS_UUID_SIZE];
+};
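Every message written to the event pipe is a csm_msg_hdr immediately followed by msg_length - sizeof(hdr) bytes of protobuf payload, with both header fields little-endian. A hedged userspace reader for this framing (buffer sizing and short-read handling simplified):

    /* Hypothetical consumer of CSM message framing; sketch only. */
    #include <endian.h>
    #include <stdint.h>
    #include <stdio.h>
    #include <unistd.h>

    struct csm_msg_hdr {
            uint32_t msg_type;      /* little-endian on the wire */
            uint32_t msg_length;    /* header + payload length */
    };

    static int read_one_event(int fd)
    {
            struct csm_msg_hdr hdr;
            uint8_t payload[4096];
            uint32_t len;

            if (read(fd, &hdr, sizeof(hdr)) != (ssize_t)sizeof(hdr))
                    return -1;
            len = le32toh(hdr.msg_length) - sizeof(hdr);
            if (len > sizeof(payload))
                    return -1;      /* payload larger than our buffer */
            if (read(fd, payload, len) != (ssize_t)len)
                    return -1;
            printf("type=%u payload=%u bytes\n",
                   le32toh(hdr.msg_type), len);
            return 0;
    }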
diff --git a/security/container/pb.c b/security/container/pb.c
new file mode 100644
index 0000000..1cc7ecf
--- /dev/null
+++ b/security/container/pb.c
@@ -0,0 +1,174 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Container Security Monitor module
+ *
+ * Copyright (c) 2018 Google, Inc
+ */
+
+#include "monitor.h"
+
+#include <linux/string.h>
+#include <net/tcp.h>
+#include <net/ipv6.h>
+
+bool pb_encode_string_field(pb_ostream_t *stream, const pb_field_t *field,
+ void * const *arg)
+{
+ const uint8_t *str = (const uint8_t *)*arg;
+
+ /* If the string is not set, skip this string. */
+ if (!str)
+ return true;
+
+ if (!pb_encode_tag_for_field(stream, field))
+ return false;
+
+ return pb_encode_string(stream, str, strlen(str));
+}
+
+bool pb_decode_string_field(pb_istream_t *stream, const pb_field_t *field,
+ void **arg)
+{
+ size_t size;
+ void *data;
+
+ *arg = NULL;
+
+ size = stream->bytes_left;
+
+ /* Ensure a null-byte at the end */
+ if (size + 1 < size)
+ return false;
+
+ data = kzalloc(size + 1, GFP_KERNEL);
+ if (!data)
+ return false;
+
+ if (!pb_read(stream, data, size)) {
+ kfree(data);
+ return false;
+ }
+
+ *arg = data;
+
+ return true;
+}
+
+bool pb_encode_string_array(pb_ostream_t *stream, const pb_field_t *field,
+ void * const *arg)
+{
+ char *strs = (char *)*arg;
+
+ /* If the string array is not set, skip this string array. */
+ if (!strs)
+ return true;
+
+ do {
+ if (!pb_encode_string_field(stream, field,
+ (void * const *) &strs))
+ return false;
+
+ strs += strlen(strs) + 1;
+ } while (*strs != 0);
+
+ return true;
+}
+
+/* Limit the encoded string size and return how many characters were added. */
+ssize_t pb_encode_string_field_limit(pb_ostream_t *stream,
+ const pb_field_t *field,
+ void * const *arg, size_t limit)
+{
+ char *str = (char *)*arg;
+ size_t length;
+
+ /* If the string is not set, skip this string. */
+ if (!str)
+ return 0;
+
+ if (!pb_encode_tag_for_field(stream, field))
+ return -EINVAL;
+
+ length = strlen(str);
+ if (length > limit)
+ length = limit;
+
+ if (!pb_encode_string(stream, (uint8_t *)str, length))
+ return -EINVAL;
+
+ return length;
+}
+
+bool pb_decode_string_array(pb_istream_t *stream, const pb_field_t *field,
+ void **arg)
+{
+ size_t needed, used = 0;
+ char *data, *strs;
+
+ /* String length, and two null-bytes for the end of the list. */
+ needed = stream->bytes_left + 2;
+ if (needed < stream->bytes_left)
+ return false;
+
+ if (*arg) {
+ /* Calculate used space from the current list. */
+ strs = (char *)*arg;
+ do {
+ used += strlen(strs + used) + 1;
+ } while (strs[used] != 0);
+
+ if (used + needed < needed)
+ return false;
+ }
+
+ data = krealloc(*arg, used + needed, GFP_KERNEL);
+ if (!data)
+ return false;
+
+ /* Will always be freed by the caller */
+ *arg = data;
+
+ /* Reset the new part of the buffer. */
+ memset(data + used, 0, needed);
+
+ /* Read what's in the stream buffer only. */
+ if (!pb_read(stream, data + used, stream->bytes_left))
+ return false;
+
+ return true;
+}
+
+bool pb_encode_fixed_string(pb_ostream_t *stream, const pb_field_t *field,
+ const uint8_t *data, size_t length)
+{
+ /* If the data is not set, skip this string. */
+ if (!data)
+ return true;
+
+ if (!pb_encode_tag_for_field(stream, field))
+ return false;
+
+ return pb_encode_string(stream, data, length);
+}
+
+bool pb_encode_uuid_field(pb_ostream_t *stream, const pb_field_t *field,
+ void * const *arg)
+{
+ return pb_encode_fixed_string(stream, field, (const uint8_t *)*arg,
+ PROCESS_UUID_SIZE);
+}
+
+bool pb_encode_ip4(pb_ostream_t *stream, const pb_field_t *field,
+ void * const *arg)
+{
+ return pb_encode_fixed_string(stream, field, (const uint8_t *)*arg,
+ sizeof(struct in_addr));
+}
+
+bool pb_encode_ip6(pb_ostream_t *stream, const pb_field_t *field,
+ void * const *arg)
+{
+ return pb_encode_fixed_string(stream, field, (const uint8_t *)*arg,
+ sizeof(struct in6_addr));
+}
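The array callbacks deliberately use a flat, doubly-NUL-terminated buffer ("a\0b\0\0") instead of a char * vector, which keeps allocation to a single krealloc(). A sketch of walking such a list, mirroring the do/while loops above:

    /* Walking a CSM-style string list: entries are NUL-separated and
     * the list ends with an empty string; sketch only.
     */
    #include <stdio.h>
    #include <string.h>

    static void walk(const char *strs)
    {
            do {
                    printf("entry: %s\n", strs);
                    strs += strlen(strs) + 1;
            } while (*strs != 0);
    }

    int main(void)
    {
            /* The literal's implicit trailing NUL forms the second NUL. */
            walk("PATH\0HOME\0LANG\0");
            return 0;
    }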
diff --git a/security/container/pipe.c b/security/container/pipe.c
new file mode 100644
index 0000000..16abb23
--- /dev/null
+++ b/security/container/pipe.c
@@ -0,0 +1,218 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Container Security Monitor module
+ *
+ * Copyright (c) 2018 Google, Inc
+ */
+
+#include "monitor.h"
+
+#include <linux/pipe_fs_i.h>
+#include <linux/printk.h>
+#include <linux/ratelimit.h>
+#include <linux/uio.h>
+#include <linux/workqueue.h>
+
+/* csm protobuf work */
+static void csm_sendmsg_pipe_handler(struct work_struct *work);
+
+/* csm message work container */
+struct msg_work_data {
+ struct work_struct msg_work;
+ size_t pos_bytes_written;
+ char msg[];
+};
+
+/* Mutex to ensure sequential dumping of protos */
+static DEFINE_MUTEX(protodump);
+
+static ssize_t csm_user_pipe_write(struct kvec *vecs, size_t vecs_size,
+ size_t total_length)
+{
+ ssize_t perr = 0;
+ struct iov_iter io = { };
+ loff_t pos = 0;
+ struct pipe_inode_info *pipe;
+ unsigned int readers;
+
+ if (!csm_user_write_pipe)
+ return 0;
+
+ down_read(&csm_rwsem_pipe);
+
+ if (csm_user_write_pipe == NULL)
+ goto end;
+
+ /* The pipe info is the same for the reader and writer files. */
+ pipe = get_pipe_info(csm_user_write_pipe);
+
+ /* If nobody is listening, don't write events. */
+ readers = READ_ONCE(pipe->readers);
+ if (readers <= 1) {
+ WARN_ON(readers == 0);
+ goto end;
+ }
+
+ iov_iter_kvec(&io, WRITE, vecs, vecs_size, total_length);
+
+ file_start_write(csm_user_write_pipe);
+ perr = vfs_iter_write(csm_user_write_pipe, &io, &pos, 0);
+ file_end_write(csm_user_write_pipe);
+
+end:
+ up_read(&csm_rwsem_pipe);
+ return perr;
+}
+
+static int csm_sendmsg(int type, const void *buf, size_t len)
+{
+ struct csm_msg_hdr hdr = {
+ .msg_type = cpu_to_le32(type),
+ .msg_length = cpu_to_le32(sizeof(hdr) + len),
+ };
+ struct kvec vecs[] = {
+ {
+ .iov_base = &hdr,
+ .iov_len = sizeof(hdr),
+ }, {
+ .iov_base = (void *)buf,
+ .iov_len = len,
+ }
+ };
+ ssize_t perr;
+
+ perr = csm_user_pipe_write(vecs, ARRAY_SIZE(vecs),
+ le32_to_cpu(hdr.msg_length));
+ if (perr < 0) {
+ pr_warn_ratelimited("vfs_iter_write error (msg_type=%d, msg_length=%u): %zd\n",
+ type, le32_to_cpu(hdr.msg_length), perr);
+ csm_stats.event_writing_failed++;
+ }
+
+ return perr;
+}
+
+static bool csm_get_expected_size(size_t *size, const pb_msgdesc_t *fields,
+ const void *src_struct)
+{
+ schema_Event *event;
+
+ if (fields != schema_Event_fields)
+ goto other;
+
+ /* Sizes covering over 99% of events from 100 containers running k8s. */
+ event = (schema_Event *)src_struct;
+ switch (event->which_event) {
+ case schema_Event_execute_tag:
+ *size = 3344;
+ return true;
+ case schema_Event_memexec_tag:
+ *size = 176;
+ return true;
+ case schema_Event_clone_tag:
+ *size = 50;
+ return true;
+ case schema_Event_exit_tag:
+ *size = 30;
+ return true;
+ }
+
+other:
+ /* If unknown, do the pre-computation. */
+ return pb_get_encoded_size(size, fields, src_struct);
+}
+
+static struct msg_work_data *csm_encodeproto(size_t size,
+ const pb_msgdesc_t *fields,
+ const void *src_struct)
+{
+ pb_ostream_t pos;
+ struct msg_work_data *wd;
+ size_t total;
+
+ total = size + sizeof(*wd);
+ if (total < size)
+ return ERR_PTR(-EINVAL);
+
+ wd = kmalloc(total, GFP_KERNEL);
+ if (!wd)
+ return ERR_PTR(-ENOMEM);
+
+ pos = pb_ostream_from_buffer(wd->msg, size);
+ if (!pb_encode(&pos, fields, src_struct)) {
+ kfree(wd);
+ return ERR_PTR(-EINVAL);
+ }
+
+ INIT_WORK(&wd->msg_work, csm_sendmsg_pipe_handler);
+ wd->pos_bytes_written = pos.bytes_written;
+ return wd;
+}
+
+static int csm_sendproto(int type, const pb_msgdesc_t *fields,
+ const void *src_struct)
+{
+ int err = 0;
+ size_t size, previous_size;
+ struct msg_work_data *wd;
+
+ /* Use the expected size first. */
+ if (!csm_get_expected_size(&size, fields, src_struct))
+ return -EINVAL;
+
+ wd = csm_encodeproto(size, fields, src_struct);
+ if (IS_ERR(wd)) {
+ /* If it failed, retry with the exact size. */
+ csm_stats.size_picking_failed++;
+ previous_size = size;
+
+ if (!pb_get_encoded_size(&size, fields, src_struct))
+ return -EINVAL;
+
+ wd = csm_encodeproto(size, fields, src_struct);
+ if (IS_ERR(wd)) {
+ csm_stats.proto_encoding_failed++;
+ return PTR_ERR(wd);
+ }
+
+ pr_debug("size picking failed %lu vs %lu\n", previous_size,
+ size);
+ }
+
+ /* The work handler takes care of cleanup, if successfully scheduled. */
+ if (likely(schedule_work(&wd->msg_work)))
+ return 0;
+
+ csm_stats.workqueue_failed++;
+ pr_err_ratelimited("Sent msg to workqueue unsuccessfully (assume dropped).\n");
+
+ kfree(wd);
+ return err;
+}
+
+static void csm_sendmsg_pipe_handler(struct work_struct *work)
+{
+ int err;
+ int type = CSM_MSG_EVENT_PROTO;
+ struct msg_work_data *wd = container_of(work, struct msg_work_data,
+ msg_work);
+
+ err = csm_sendmsg(type, wd->msg, wd->pos_bytes_written);
+ if (err < 0)
+ pr_err_ratelimited("csm_sendmsg failed in work handler %s\n",
+ __func__);
+
+ kfree(wd);
+}
+
+int csm_sendeventproto(const pb_msgdesc_t *fields, schema_Event *event)
+{
+ /* Last check before generating and sending an event. */
+ if (!csm_enabled)
+ return -ENOTSUPP;
+
+ event->timestamp = ktime_get_real_ns();
+
+ return csm_sendproto(CSM_MSG_EVENT_PROTO, fields, event);
+}
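Note the unusual contract of the securityfs "pipe" file (see csm_pipe_read() in monitor.c): reading it creates a fresh read end of the event pipe, installs it in the caller's file table, and returns the new descriptor's number in the read buffer; events then arrive csm_msg_hdr-framed on that descriptor. A hedged userspace sketch:

    /* Hypothetical retrieval of the CSM event pipe; sketch only. */
    #include <fcntl.h>
    #include <stdio.h>
    #include <unistd.h>

    int main(void)
    {
            int ctl, pipefd;

            ctl = open("/sys/kernel/security/container_monitor/pipe",
                       O_RDONLY);
            if (ctl < 0)
                    return 1;
            /* The read returns a descriptor number, not event data. */
            if (read(ctl, &pipefd, sizeof(pipefd)) != sizeof(pipefd))
                    return 1;
            printf("event pipe installed as fd %d\n", pipefd);
            /* pipefd now yields framed protobuf events (see pipe.c). */
            return 0;
    }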
diff --git a/security/container/process.c b/security/container/process.c
new file mode 100644
index 0000000..0bf84c4
--- /dev/null
+++ b/security/container/process.c
@@ -0,0 +1,1167 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Container Security Monitor module
+ *
+ * Copyright (c) 2018 Google, Inc
+ */
+
+#include "monitor.h"
+
+#include <linux/atomic.h>
+#include <linux/audit.h>
+#include <linux/file.h>
+#include <linux/highmem.h>
+#include <linux/mempool.h>
+#include <linux/mm.h>
+#include <linux/mount.h>
+#include <linux/notifier.h>
+#include <linux/net.h>
+#include <linux/path.h>
+#include <linux/pid.h>
+#include <linux/pid_namespace.h>
+#include <linux/random.h>
+#include <linux/rcupdate.h>
+#include <linux/sched.h>
+#include <linux/sched/signal.h>
+#include <linux/sched/task.h>
+#include <linux/slab.h>
+#include <linux/socket.h>
+#include <linux/timekeeping.h>
+#include <linux/vmalloc.h>
+#include <linux/workqueue.h>
+#include <linux/xattr.h>
+#include <net/ipv6.h>
+#include <net/sock.h>
+#include <net/tcp.h>
+#include <overlayfs/overlayfs.h>
+#include <uapi/linux/magic.h>
+#include <uapi/asm/mman.h>
+
+/* Configuration options for execute collector. */
+struct execute_config csm_execute_config;
+
+/* unique atomic value for the machine boot instance */
+static atomic_t machine_rand = ATOMIC_INIT(0);
+
+/* sequential container identifier */
+static atomic_t contid = ATOMIC_INIT(0);
+
+/* Generation id for each enumeration invocation. */
+static atomic_t enumeration_count = ATOMIC_INIT(0);
+
+struct file_provenance {
+ /* pid of the process doing the first write. */
+ pid_t tgid;
+ /* start_time of the process to uniquely identify it. */
+ u64 start_time;
+};
+
+struct csm_enumerate_processes_work_data {
+ struct work_struct work;
+ int enumeration_count;
+};
+
+static void *kmap_argument_stack(struct linux_binprm *bprm, void **ctx)
+{
+ char *argv;
+ int err;
+ unsigned long i, pos, count;
+ void *map;
+ struct page *page;
+
+ /* vma_pages() returns the number of pages reserved for the stack */
+ count = vma_pages(bprm->vma);
+
+ if (likely(count == 1)) {
+ err = get_user_pages_remote(current, bprm->mm, bprm->p, 1,
+ FOLL_FORCE, &page, NULL, NULL);
+ if (err != 1)
+ return NULL;
+
+ argv = kmap(page);
+ *ctx = page;
+ } else {
+ /*
+ * If more than one page is needed, copy all of them into a single
+ * buffer. Parsing the arguments across kmap'd pages at different
+ * addresses would be impractical.
+ */
+ argv = vmalloc(count * PAGE_SIZE);
+ if (!argv)
+ return NULL;
+
+ for (i = 0; i < count; i++) {
+ pos = ALIGN_DOWN(bprm->p, PAGE_SIZE) + i * PAGE_SIZE;
+ err = get_user_pages_remote(current, bprm->mm, pos, 1,
+ FOLL_FORCE, &page, NULL,
+ NULL);
+ if (err <= 0) {
+ vfree(argv);
+ return NULL;
+ }
+
+ map = kmap(page);
+ memcpy(argv + i * PAGE_SIZE, map, PAGE_SIZE);
+ kunmap(page);
+ put_page(page);
+ }
+ *ctx = bprm;
+ }
+
+ return argv;
+}
+
+static void kunmap_argument_stack(struct linux_binprm *bprm, void *addr,
+ void *ctx)
+{
+ struct page *page;
+
+ if (!addr)
+ return;
+
+ if (likely(vma_pages(bprm->vma) == 1)) {
+ page = (struct page *)ctx;
+ kunmap(page);
+ put_page(ctx);
+ } else {
+ vfree(addr);
+ }
+}
+
+static char *find_array_next_entry(char *array, unsigned long *offset,
+ unsigned long end)
+{
+ char *entry;
+ unsigned long off = *offset;
+
+ if (off >= end)
+ return NULL;
+
+ /* Check the entry is null-terminated and in bounds. */
+ entry = array + off;
+ while (array[off]) {
+ if (++off >= end)
+ return NULL;
+ }
+
+ /* Skip past the null byte for the next iteration. */
+ *offset = off + 1;
+
+ return entry;
+}
+
+struct string_arr_ctx {
+ struct linux_binprm *bprm;
+ void *stack;
+};
+
+static size_t get_config_limit(size_t *config_ptr)
+{
+ lockdep_assert_held_read(&csm_rwsem_config);
+
+ /*
+ * If execute is not enabled, do not capture arguments.
+ * The event proto won't be sent anyway.
+ */
+ if (!csm_execute_enabled)
+ return 0;
+
+ return *config_ptr;
+}
+
+static bool encode_current_argv(pb_ostream_t *stream, const pb_field_t *field,
+ void * const *arg)
+{
+ struct string_arr_ctx *ctx = (struct string_arr_ctx *)*arg;
+ int i;
+ struct linux_binprm *bprm = ctx->bprm;
+ unsigned long offset = bprm->p % PAGE_SIZE;
+ unsigned long end = vma_pages(bprm->vma) * PAGE_SIZE;
+ char *argv = ctx->stack;
+ char *entry;
+ size_t limit, used = 0;
+ ssize_t ret;
+
+ limit = get_config_limit(&csm_execute_config.argv_limit);
+ if (!limit)
+ return true;
+
+ for (i = 0; i < bprm->argc; i++) {
+ entry = find_array_next_entry(argv, &offset, end);
+ if (!entry)
+ return false;
+
+ ret = pb_encode_string_field_limit(stream, field,
+ (void * const *)&entry,
+ limit - used);
+ if (ret < 0)
+ return false;
+
+ used += ret;
+
+ if (used >= limit)
+ break;
+ }
+
+ return true;
+}
+
+static bool check_envp_allowlist(char *envp)
+{
+ bool ret = false;
+ char *strs, *equal;
+ size_t str_size, equal_pos;
+
+ /* If execute is not enabled, skip all. */
+ if (!csm_execute_enabled)
+ goto out;
+
+ /* No filter, allow all. */
+ strs = csm_execute_config.envp_allowlist;
+ if (!strs) {
+ ret = true;
+ goto out;
+ }
+
+ /*
+ * Identify the key=value separation.
+ * If none exists, use the whole string as the key.
+ */
+ equal = strchr(envp, '=');
+ equal_pos = equal ? (equal - envp) : strlen(envp);
+
+ /* Default to skip if no match found. */
+ ret = false;
+
+ do {
+ str_size = strlen(strs);
+
+ /*
+ * If the filter length aligns with the position of the equal
+ * sign, it might be a match; compare the keys.
+ */
+ if (str_size == equal_pos &&
+ !strncmp(strs, envp, str_size)) {
+ ret = true;
+ goto out;
+ }
+
+ strs += str_size + 1;
+ } while (*strs != 0);
+
+out:
+ return ret;
+}
+
+static bool encode_current_envp(pb_ostream_t *stream, const pb_field_t *field,
+ void * const *arg)
+{
+ struct string_arr_ctx *ctx = (struct string_arr_ctx *)*arg;
+ int i;
+ struct linux_binprm *bprm = ctx->bprm;
+ unsigned long offset = bprm->p % PAGE_SIZE;
+ unsigned long end = vma_pages(bprm->vma) * PAGE_SIZE;
+ char *argv = ctx->stack;
+ char *entry;
+ size_t limit, used = 0;
+ ssize_t ret;
+
+ limit = get_config_limit(&csm_execute_config.envp_limit);
+ if (!limit)
+ return true;
+
+ /* Skip arguments */
+ for (i = 0; i < bprm->argc; i++) {
+ if (!find_array_next_entry(argv, &offset, end))
+ return false;
+ }
+
+ for (i = 0; i < bprm->envc; i++) {
+ entry = find_array_next_entry(argv, &offset, end);
+ if (!entry)
+ return false;
+
+ if (!check_envp_allowlist(entry))
+ continue;
+
+ ret = pb_encode_string_field_limit(stream, field,
+ (void * const *)&entry,
+ limit - used);
+ if (ret < 0)
+ return false;
+
+ used += ret;
+
+ if (used >= limit)
+ break;
+ }
+
+ return true;
+}
+
+static bool is_overlayfs_mounted(struct file *file)
+{
+ struct vfsmount *mnt;
+ struct super_block *mnt_sb;
+
+ mnt = file->f_path.mnt;
+ if (mnt == NULL)
+ return false;
+
+ mnt_sb = mnt->mnt_sb;
+ if (mnt_sb == NULL || mnt_sb->s_magic != OVERLAYFS_SUPER_MAGIC)
+ return false;
+
+ return true;
+}
+
+/*
+ * Before the process starts, identify a possible container by checking whether
+ * the task is in a non-initial pid namespace and the target file sits on an
+ * overlayfs mount point. This check is valid for COS and GKE but not for all
+ * existing containers.
+ */
+static bool is_possible_container(struct task_struct *task,
+ struct file *file)
+{
+ if (task_active_pid_ns(task) == &init_pid_ns)
+ return false;
+
+ return is_overlayfs_mounted(file);
+}
+
+/*
+ * Generates a random identifier for this boot instance.
+ * The identifier is generated lazily, on first use, to increase the entropy
+ * available compared to generating it at early boot.
+ */
+static u32 get_machine_id(void)
+{
+ int machineid, old;
+
+ machineid = atomic_read(&machine_rand);
+
+ if (unlikely(machineid == 0)) {
+ machineid = (int)get_random_int();
+ if (machineid == 0)
+ machineid = 1;
+ old = atomic_cmpxchg(&machine_rand, 0, machineid);
+
+ /* If someone beat us, use their value. */
+ if (old != 0)
+ machineid = old;
+ }
+
+ return (u32)machineid;
+}
+
+/*
+ * Generate a 128-bit unique identifier for the process by appending:
+ * - A machine identifier unique per boot.
+ * - The start time of the process in nanoseconds.
+ * - The tgid for the set of threads in a process.
+ */
+static int get_process_uuid(struct task_struct *task, char *buffer, size_t size)
+{
+ union process_uuid *id = (union process_uuid *)buffer;
+
+ memset(buffer, 0, size);
+
+ if (WARN_ON(size < PROCESS_UUID_SIZE))
+ return -EINVAL;
+
+ id->machineid = get_machine_id();
+ id->start_time = ktime_mono_to_real(task->group_leader->start_time);
+ id->tgid = task_tgid_nr(task);
+
+ return 0;
+}
+
+int get_process_uuid_by_pid(pid_t pid_nr, char *buffer, size_t size)
+{
+ int err;
+ struct task_struct *task = NULL;
+
+ rcu_read_lock();
+ task = find_task_by_pid_ns(pid_nr, &init_pid_ns);
+ if (!task) {
+ err = -ENOENT;
+ goto out;
+ }
+ err = get_process_uuid(task, buffer, size);
+out:
+ rcu_read_unlock();
+ return err;
+}
+
+static int get_process_uuid_from_xattr(struct file *file, char *buffer,
+ size_t size)
+{
+ struct dentry *dentry;
+ int err;
+ struct file_provenance prov;
+ union process_uuid *id = (union process_uuid *)buffer;
+
+ memset(buffer, 0, size);
+
+ if (WARN_ON(size < PROCESS_UUID_SIZE))
+ return -EINVAL;
+
+ /* The file is part of overlayfs on the upper layer. */
+ if (!is_overlayfs_mounted(file))
+ return -ENODATA;
+
+ dentry = ovl_dentry_upper(file->f_path.dentry);
+ if (!dentry)
+ return -ENODATA;
+
+ err = __vfs_getxattr(dentry, dentry->d_inode,
+ XATTR_SECURITY_CSM, &prov, sizeof(prov));
+ /* returns -ENODATA if the xattr does not exist. */
+ if (err < 0)
+ return err;
+ if (err != sizeof(prov)) {
+		pr_err("unexpected xattr size: expected %zu, got %d\n",
+		       sizeof(prov), err);
+ return -ENODATA;
+ }
+
+ id->machineid = get_machine_id();
+ id->start_time = prov.start_time;
+ id->tgid = prov.tgid;
+ return 0;
+}
+
+u64 csm_set_contid(struct task_struct *task)
+{
+ u64 cid;
+ struct pid_namespace *ns;
+
+ ns = task_active_pid_ns(task);
+ if (WARN_ON(!task->audit) || WARN_ON(!ns))
+ return AUDIT_CID_UNSET;
+
+ cid = atomic_inc_return(&contid);
+ task->audit->contid = cid;
+
+ /*
+ * If the namespace container-id is not set, use the one assigned
+ * to the first process created.
+ */
+ cmpxchg(&ns->cid, 0, cid);
+ return cid;
+}
+
+u64 csm_get_ns_contid(struct pid_namespace *ns)
+{
+ if (!ns || !ns->cid)
+ return AUDIT_CID_UNSET;
+
+ return ns->cid;
+}
+
+union ip_data {
+ struct in_addr ip4;
+ struct in6_addr ip6;
+};
+
+struct file_data {
+ void *allocated;
+ union ip_data local;
+ union ip_data remote;
+ char modified_uuid[PROCESS_UUID_SIZE];
+};
+
+static void free_file_data(struct file_data *fdata)
+{
+ free_page((unsigned long)fdata->allocated);
+ fdata->allocated = NULL;
+}
+
+static void fill_socket_description(struct sockaddr_storage *saddr,
+ union ip_data *idata,
+ schema_SocketIp *schema_socketip)
+{
+ struct sockaddr_in *sin4 = (struct sockaddr_in *)saddr;
+ struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)saddr;
+
+ schema_socketip->family = saddr->ss_family;
+
+ switch (saddr->ss_family) {
+ case AF_INET:
+ schema_socketip->port = ntohs(sin4->sin_port);
+ idata->ip4 = sin4->sin_addr;
+ schema_socketip->ip.funcs.encode = pb_encode_ip4;
+ schema_socketip->ip.arg = &idata->ip4;
+ break;
+ case AF_INET6:
+ schema_socketip->port = ntohs(sin6->sin6_port);
+ idata->ip6 = sin6->sin6_addr;
+ schema_socketip->ip.funcs.encode = pb_encode_ip6;
+ schema_socketip->ip.arg = &idata->ip6;
+ break;
+ }
+}
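+
+/*
+ * Note: the address bytes are copied into the caller-provided file_data so
+ * they stay valid when nanopb runs the encode callbacks later; the
+ * sockaddr_storage itself lives on a stack frame that is gone by then.
+ */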
+
+static int fill_file_overlayfs(struct file *file, schema_File *schema_file,
+ struct file_data *fdata)
+{
+ struct dentry *dentry;
+ int err;
+ schema_Overlay *overlayfs;
+
+ /* If not an overlayfs superblock, done. */
+ if (!is_overlayfs_mounted(file))
+ return 0;
+
+ dentry = file->f_path.dentry;
+ schema_file->which_filesystem = schema_File_overlayfs_tag;
+ overlayfs = &schema_file->filesystem.overlayfs;
+ overlayfs->lower_layer = ovl_dentry_lower(dentry);
+ overlayfs->upper_layer = ovl_dentry_upper(dentry);
+
+ err = get_process_uuid_from_xattr(file, fdata->modified_uuid,
+ sizeof(fdata->modified_uuid));
+ /* If there is no xattr, just skip the modified_uuid field. */
+ if (err == -ENODATA)
+ return 0;
+ if (err < 0)
+ return err;
+
+ overlayfs->modified_uuid.funcs.encode = pb_encode_uuid_field;
+ overlayfs->modified_uuid.arg = fdata->modified_uuid;
+ return 0;
+}
+
+static int fill_file_description(struct file *file, schema_File *schema_file,
+ struct file_data *fdata)
+{
+ char *buf;
+ int err;
+ u32 mode;
+ char *path;
+ struct socket *socket;
+ schema_Socket *socketfs;
+ struct sockaddr_storage saddr;
+
+ memset(fdata, 0, sizeof(*fdata));
+
+ if (file == NULL)
+ return 0;
+
+ schema_file->ino = file_inode(file)->i_ino;
+ mode = file_inode(file)->i_mode;
+
+ /* For pipes, no need to resolve the path. */
+ if (S_ISFIFO(mode))
+ return 0;
+
+ if (S_ISSOCK(mode)) {
+ socket = (struct socket *)file->private_data;
+ socketfs = &schema_file->filesystem.socket;
+
+ /* Local socket */
+ err = kernel_getsockname(socket, (struct sockaddr *)&saddr);
+ if (err >= 0) {
+ socketfs->has_local = true;
+ fill_socket_description(&saddr, &fdata->local,
+ &socketfs->local);
+ }
+
+ /* Remote socket, might not be connected. */
+ err = kernel_getpeername(socket, (struct sockaddr *)&saddr);
+ if (err >= 0) {
+ socketfs->has_remote = true;
+ fill_socket_description(&saddr, &fdata->remote,
+ &socketfs->remote);
+ }
+
+ schema_file->which_filesystem = schema_File_socket_tag;
+ return 0;
+ }
+
+ /*
+ * From this point, we care about all the other types of files as their
+ * path provides interesting insight.
+ */
+ buf = (char *)__get_free_page(GFP_KERNEL);
+ if (buf == NULL)
+ return -ENOMEM;
+
+ fdata->allocated = buf;
+
+ path = d_path(&file->f_path, buf, PAGE_SIZE);
+ if (IS_ERR(path)) {
+ free_file_data(fdata);
+ return PTR_ERR(path);
+ }
+
+ schema_file->fullpath.funcs.encode = pb_encode_string_field;
+ schema_file->fullpath.arg = path; /* buf is freed in free_file_data. */
+
+ err = fill_file_overlayfs(file, schema_file, fdata);
+ if (err) {
+ free_file_data(fdata);
+ return err;
+ }
+
+ return 0;
+}
+
+static int fill_stream_description(schema_Descriptor *desc, int fd,
+ struct file_data *fdata)
+{
+ struct fd sfd;
+ struct file *file;
+ int err = 0;
+
+ sfd = fdget(fd);
+ file = sfd.file;
+
+ if (file == NULL) {
+ memset(fdata, 0, sizeof(*fdata));
+ goto end;
+ }
+
+ desc->mode = file_inode(file)->i_mode;
+ desc->has_file = true;
+ err = fill_file_description(file, &desc->file, fdata);
+
+end:
+ fdput(sfd);
+ return err;
+}
+
+static int populate_proc_uuid_common(schema_Process *proc, char *uuid,
+ size_t uuid_size, char *parent_uuid,
+ size_t parent_uuid_size,
+ struct task_struct *task)
+{
+ int err;
+ struct task_struct *parent;
+ /* Generate unique identifier for the process and its parent */
+ err = get_process_uuid(task, uuid, uuid_size);
+ if (err)
+ return err;
+
+ proc->uuid.funcs.encode = pb_encode_uuid_field;
+ proc->uuid.arg = uuid;
+
+ rcu_read_lock();
+
+ if (!pid_alive(task))
+ goto out;
+	/*
+	 * task_rcu_dereference should not be needed here because real_parent
+	 * is only supposed to be accessed under RCU protection.
+	 */
+ parent = rcu_dereference(task->real_parent);
+
+ if (parent) {
+ err = get_process_uuid(parent, parent_uuid, parent_uuid_size);
+ if (!err) {
+ proc->parent_uuid.funcs.encode = pb_encode_uuid_field;
+ proc->parent_uuid.arg = parent_uuid;
+ }
+ }
+
+out:
+ rcu_read_unlock();
+
+ return err;
+}
+
+/* Populate the fields that we always want to set in Process messages. */
+static int populate_proc_common(schema_Process *proc, char *uuid,
+ size_t uuid_size, char *parent_uuid,
+ size_t parent_uuid_size,
+ struct task_struct *task)
+{
+ u64 cid;
+ struct pid_namespace *ns = task_active_pid_ns(task);
+
+ /* Container identifier for the current namespace. */
+ proc->container_id = csm_get_ns_contid(ns);
+
+ /*
+ * If the process container-id is different, the process tree is part of
+ * a different session within the namespace (kubectl/docker exec,
+ * liveness probe or others).
+ */
+ cid = audit_get_contid(task);
+ if (proc->container_id != cid)
+ proc->exec_session_id = cid;
+
+ /* Add information about pid in different namespaces */
+ proc->pid = task_tgid_nr(task);
+ proc->parent_pid = task_ppid_nr(task);
+ proc->container_pid = task_tgid_nr_ns(task, ns);
+ proc->container_parent_pid = task_ppid_nr_ns(task, ns);
+
+ return populate_proc_uuid_common(proc, uuid, uuid_size, parent_uuid,
+ parent_uuid_size, task);
+}
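+
+/*
+ * Example of the split above: the first process entering a new pid namespace
+ * seeds ns->cid, so its descendants report only container_id. A later
+ * "kubectl exec" in the same namespace gets a fresh contid from
+ * csm_set_contid, which differs from ns->cid and is surfaced as
+ * exec_session_id.
+ */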
+
+int csm_bprm_check_security(struct linux_binprm *bprm)
+{
+ char uuid[PROCESS_UUID_SIZE];
+ char parent_uuid[PROCESS_UUID_SIZE];
+ int err;
+ schema_Event event = {};
+ schema_Process *proc;
+ struct string_arr_ctx argv_ctx;
+ void *stack = NULL, *ctx = NULL;
+ u64 cid;
+ struct file_data path_data = {};
+ struct file_data stdin_data = {};
+ struct file_data stdout_data = {};
+ struct file_data stderr_data = {};
+
+ /*
+ * Always create a container-id for containerized processes.
+ * If the LSM is enabled later, we can track existing containers.
+ */
+ cid = audit_get_contid(current);
+
+ if (cid == AUDIT_CID_UNSET) {
+ if (!is_possible_container(current, bprm->file))
+ return 0;
+
+ cid = csm_set_contid(current);
+
+ if (cid == AUDIT_CID_UNSET)
+ return 0;
+ }
+
+ if (!csm_execute_enabled)
+ return 0;
+
+ /* The interpreter will call us again with more context. */
+ if (bprm->buf[0] == '#' && bprm->buf[1] == '!')
+ return 0;
+
+ proc = &event.event.execute.proc;
+ err = populate_proc_common(proc, uuid, sizeof(uuid), parent_uuid,
+ sizeof(parent_uuid), current);
+ if (err)
+ goto out_free_buf;
+
+ proc->creation_timestamp = ktime_get_real_ns();
+
+ /* Provide information about the launched binary. */
+ proc->has_binary = true;
+ err = fill_file_description(bprm->file, &proc->binary, &path_data);
+ if (err)
+ goto out_free_buf;
+
+ /* Information about streams */
+ proc->has_streams = true;
+
+ proc->streams.has_stdin = true;
+ err = fill_stream_description(&proc->streams.stdin, STDIN_FILENO,
+ &stdin_data);
+ if (err)
+ goto out_free_buf;
+
+ proc->streams.has_stdout = true;
+ err = fill_stream_description(&proc->streams.stdout, STDOUT_FILENO,
+ &stdout_data);
+ if (err)
+ goto out_free_buf;
+
+ proc->streams.has_stderr = true;
+ err = fill_stream_description(&proc->streams.stderr, STDERR_FILENO,
+ &stderr_data);
+ if (err)
+ goto out_free_buf;
+
+ stack = kmap_argument_stack(bprm, &ctx);
+ if (!stack) {
+ err = -EFAULT;
+ goto out_free_buf;
+ }
+
+ /* Capture process argument */
+ argv_ctx.bprm = bprm;
+ argv_ctx.stack = stack;
+ proc->args.argv.funcs.encode = encode_current_argv;
+ proc->args.argv.arg = &argv_ctx;
+
+ /* Capture process environment variables */
+ proc->args.envp.funcs.encode = encode_current_envp;
+ proc->args.envp.arg = &argv_ctx;
+
+ event.which_event = schema_Event_execute_tag;
+ event.event.execute.has_proc = true;
+ proc->has_args = true;
+
+	/*
+	 * Configuration options are checked when computing the serialized
+	 * protobufs.
+	 */
+ down_read(&csm_rwsem_config);
+ err = csm_sendeventproto(schema_Event_fields, &event);
+ up_read(&csm_rwsem_config);
+
+ if (err)
+ pr_err("csm_sendeventproto returned %d on execve\n", err);
+ err = 0;
+
+out_free_buf:
+ kunmap_argument_stack(bprm, stack, ctx);
+ free_file_data(&path_data);
+ free_file_data(&stdin_data);
+ free_file_data(&stdout_data);
+ free_file_data(&stderr_data);
+
+	/*
+	 * On failure, enforce the error only if the execute collector is
+	 * enabled. If the collector was disabled, prefer to succeed so the
+	 * system is not impacted.
+	 */
+ if (unlikely(err < 0 && !csm_execute_enabled))
+ err = 0;
+
+ return err;
+}
+
+/* Create a clone event when a new task leader is created. */
+void csm_task_post_alloc(struct task_struct *task)
+{
+ int err;
+ char uuid[PROCESS_UUID_SIZE];
+ char parent_uuid[PROCESS_UUID_SIZE];
+ schema_Event event = {};
+ schema_Process *proc;
+
+ if (!csm_execute_enabled ||
+ audit_get_contid(task) == AUDIT_CID_UNSET ||
+ !thread_group_leader(task))
+ return;
+
+ proc = &event.event.clone.proc;
+
+	err = populate_proc_uuid_common(proc, uuid, sizeof(uuid), parent_uuid,
+					sizeof(parent_uuid), task);
+	if (err)
+		return;
+
+	event.which_event = schema_Event_clone_tag;
+	event.event.clone.has_proc = true;
+	err = csm_sendeventproto(schema_Event_fields, &event);
+	if (err)
+		pr_err("csm_sendeventproto returned %d on clone\n", err);
+}
+
+/*
+ * This LSM hook callback doesn't exist upstream and is called only when the
+ * last thread of a thread group exits.
+ */
+void csm_task_exit(struct task_struct *task)
+{
+ int err;
+ schema_Event event = {};
+ schema_ExitEvent *exit;
+ char uuid[PROCESS_UUID_SIZE];
+
+ if (!csm_execute_enabled ||
+ audit_get_contid(task) == AUDIT_CID_UNSET)
+ return;
+
+ exit = &event.event.exit;
+
+ /* Fetch the unique identifier for this process */
+ err = get_process_uuid(task, uuid, sizeof(uuid));
+ if (err) {
+ pr_err("failed to get process uuid on exit\n");
+ return;
+ }
+
+ exit->process_uuid.funcs.encode = pb_encode_uuid_field;
+ exit->process_uuid.arg = uuid;
+
+ event.which_event = schema_Event_exit_tag;
+
+ err = csm_sendeventproto(schema_Event_fields, &event);
+ if (err)
+ pr_err("csm_sendeventproto returned %d on exit\n", err);
+}
+
+int csm_mprotect(struct vm_area_struct *vma, unsigned long reqprot,
+ unsigned long prot)
+{
+ char uuid[PROCESS_UUID_SIZE];
+ char parent_uuid[PROCESS_UUID_SIZE];
+ int err;
+ schema_Event event = {};
+ schema_MemoryExecEvent *memexec;
+ u64 cid;
+ struct file_data path_data = {};
+
+ cid = audit_get_contid(current);
+
+ if (!csm_memexec_enabled ||
+ !(prot & PROT_EXEC) ||
+ vma->vm_file == NULL ||
+ cid == AUDIT_CID_UNSET)
+ return 0;
+
+ memexec = &event.event.memexec;
+
+ err = fill_file_description(vma->vm_file, &memexec->mapped_file,
+ &path_data);
+ if (err)
+ return err;
+
+ err = populate_proc_common(&memexec->proc, uuid, sizeof(uuid),
+ parent_uuid, sizeof(parent_uuid), current);
+ if (err)
+ goto out;
+
+ memexec->prot_exec_timestamp = ktime_get_real_ns();
+ memexec->new_flags = prot;
+ memexec->req_flags = reqprot;
+ memexec->old_vm_flags = vma->vm_flags;
+
+ memexec->action = schema_MemoryExecEvent_Action_MPROTECT;
+ memexec->start_addr = vma->vm_start;
+ memexec->end_addr = vma->vm_end;
+
+ event.which_event = schema_Event_memexec_tag;
+ event.event.memexec.has_proc = true;
+ event.event.memexec.has_mapped_file = true;
+
+ err = csm_sendeventproto(schema_Event_fields, &event);
+ if (err)
+ pr_err("csm_sendeventproto returned %d on mprotect\n", err);
+	err = 0;
+
+out:
+	free_file_data(&path_data);
+
+	/*
+	 * On failure, enforce the error only if the memexec collector is
+	 * enabled; if it was disabled, prefer to succeed so the system is not
+	 * impacted.
+	 */
+	if (unlikely(err < 0 && !csm_memexec_enabled))
+		err = 0;
+
+	return err;
+}
+
+int csm_mmap_file(struct file *file, unsigned long reqprot,
+ unsigned long prot, unsigned long flags)
+{
+ char uuid[PROCESS_UUID_SIZE];
+ char parent_uuid[PROCESS_UUID_SIZE];
+ int err;
+ schema_Event event = {};
+ schema_MemoryExecEvent *memexec;
+ struct file *exe_file;
+ u64 cid;
+ struct file_data path_data = {};
+
+ cid = audit_get_contid(current);
+
+ if (!csm_memexec_enabled ||
+ !(prot & PROT_EXEC) ||
+ file == NULL ||
+ cid == AUDIT_CID_UNSET)
+ return 0;
+
+ memexec = &event.event.memexec;
+ err = fill_file_description(file, &memexec->mapped_file,
+ &path_data);
+ if (err)
+ return err;
+
+ err = populate_proc_common(&memexec->proc, uuid, sizeof(uuid),
+ parent_uuid, sizeof(parent_uuid), current);
+ if (err)
+ goto out;
+
+ /* get_mm_exe_file does its own locking on mm_sem. */
+ exe_file = get_mm_exe_file(current->mm);
+ if (exe_file) {
+ if (path_equal(&file->f_path, &exe_file->f_path))
+ memexec->is_initial_mmap = 1;
+ fput(exe_file);
+ }
+
+ memexec->prot_exec_timestamp = ktime_get_real_ns();
+ memexec->new_flags = prot;
+ memexec->req_flags = reqprot;
+ memexec->mmap_flags = flags;
+ memexec->action = schema_MemoryExecEvent_Action_MMAP_FILE;
+ event.which_event = schema_Event_memexec_tag;
+ event.event.memexec.has_proc = true;
+ event.event.memexec.has_mapped_file = true;
+
+ err = csm_sendeventproto(schema_Event_fields, &event);
+ if (err)
+ pr_err("csm_sendeventproto returned %d on mmap_file\n", err);
+	err = 0;
+
+out:
+	free_file_data(&path_data);
+
+	/*
+	 * On failure, enforce the error only if the memexec collector is
+	 * enabled; if it was disabled, prefer to succeed so the system is not
+	 * impacted.
+	 */
+	if (unlikely(err < 0 && !csm_memexec_enabled))
+		err = 0;
+
+	return err;
+}
+
+void csm_file_pre_free(struct file *file)
+{
+ struct dentry *dentry;
+ int err;
+ struct file_provenance prov;
+
+ /* The file was opened to be modified and the LSM is enabled */
+ if (!(file->f_mode & FMODE_WRITE) ||
+ !csm_enabled)
+ return;
+
+ /* The current process is containerized. */
+ if (audit_get_contid(current) == AUDIT_CID_UNSET)
+ return;
+
+ /* The file is part of overlayfs on the upper layer. */
+ if (!is_overlayfs_mounted(file))
+ return;
+
+ dentry = ovl_dentry_upper(file->f_path.dentry);
+ if (!dentry)
+ return;
+
+ err = __vfs_getxattr(dentry, dentry->d_inode, XATTR_SECURITY_CSM,
+ NULL, 0);
+ if (err != -ENODATA) {
+ if (err < 0)
+ pr_err("failed to get security attribute: %d\n", err);
+ return;
+ }
+
+ prov.tgid = task_tgid_nr(current);
+ prov.start_time = ktime_mono_to_real(current->group_leader->start_time);
+
+ err = __vfs_setxattr(dentry, dentry->d_inode, XATTR_SECURITY_CSM, &prov,
+ sizeof(prov), 0);
+ if (err < 0)
+ pr_err("failed to set security attribute: %d\n", err);
+}
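+
+/*
+ * The xattr written above is set only by the first writer (later opens see
+ * the attribute already present and bail out), and it is read back by
+ * get_process_uuid_from_xattr to attribute file modifications to the
+ * originating process.
+ */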
+
+/*
+ * Based on fs/proc/base.c:next_tgid
+ *
+ * next_thread_group_leader returns the task_struct of the next thread group
+ * leader with a pid greater than or equal to *tgid and a container-id set.
+ * The reference count is increased so that rcu_read_unlock may be called and
+ * preemption re-enabled.
+ */
+static struct task_struct *next_thread_group_leader(pid_t *tgid)
+{
+ struct pid *pid;
+ struct task_struct *task;
+
+ cond_resched();
+ rcu_read_lock();
+retry:
+ task = NULL;
+ pid = find_ge_pid(*tgid, &init_pid_ns);
+ if (pid) {
+ *tgid = pid_nr_ns(pid, &init_pid_ns);
+ task = pid_task(pid, PIDTYPE_PID);
+ if (!task || !has_group_leader_pid(task) ||
+ audit_get_contid(task) == AUDIT_CID_UNSET) {
+ (*tgid) += 1;
+ goto retry;
+ }
+
+ /*
+ * Increment the reference count on the task before leaving
+ * the RCU grace period.
+ */
+ get_task_struct(task);
+ (*tgid) += 1;
+ }
+
+ rcu_read_unlock();
+ return task;
+}
+
+void delayed_enumerate_processes(struct work_struct *work)
+{
+ pid_t tgid = 0;
+ struct task_struct *task;
+ struct csm_enumerate_processes_work_data *wd = container_of(
+ work, struct csm_enumerate_processes_work_data, work);
+ int wd_enumeration_count = wd->enumeration_count;
+
+ kfree(wd);
+ wd = NULL;
+ work = NULL;
+
+	/*
+	 * Allow only a single enumeration routine at a time, and only while
+	 * the execute collector remains enabled.
+	 */
+ while ((wd_enumeration_count == atomic_read(&enumeration_count)) &&
+ READ_ONCE(csm_execute_enabled) &&
+ (task = next_thread_group_leader(&tgid))) {
+ int err;
+ char uuid[PROCESS_UUID_SIZE];
+ char parent_uuid[PROCESS_UUID_SIZE];
+ struct file *exe_file = NULL;
+ struct file_data path_data = {};
+ schema_Event event = {};
+ schema_Process *proc = &event.event.enumproc.proc;
+
+ exe_file = get_task_exe_file(task);
+ if (!exe_file) {
+ pr_err("failed to get enumerated process executable, pid: %u\n",
+ task_pid_nr(task));
+ goto next;
+ }
+
+ proc->has_binary = true;
+ err = fill_file_description(exe_file, &proc->binary,
+ &path_data);
+ if (err) {
+ pr_err("failed to fill enumerated process %u executable description: %d\n",
+ task_pid_nr(task), err);
+ goto next;
+ }
+
+ err = populate_proc_common(proc, uuid, sizeof(uuid),
+ parent_uuid, sizeof(parent_uuid),
+ task);
+ if (err) {
+ pr_err("failed to set pid %u common fields: %d\n",
+ task_pid_nr(task), err);
+ goto next;
+ }
+
+ if (task->flags & PF_EXITING)
+ goto next;
+
+ event.which_event = schema_Event_enumproc_tag;
+		event.event.enumproc.has_proc = true;
+ err = csm_sendeventproto(schema_Event_fields,
+ &event);
+ if (err) {
+ pr_err("failed to send pid %u enumerated process: %d\n",
+ task_pid_nr(task), err);
+ goto next;
+ }
+next:
+ free_file_data(&path_data);
+ if (exe_file)
+ fput(exe_file);
+
+ put_task_struct(task);
+ }
+}
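+
+/*
+ * Cancellation note: csm_enumerate_processes bumps enumeration_count, and the
+ * loop above rechecks its snapshot against the atomic on every iteration, so
+ * scheduling a new enumeration (or disabling the execute collector) stops an
+ * in-flight walk at the next task.
+ */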
+
+void csm_enumerate_processes(void)
+{
+ struct csm_enumerate_processes_work_data *wd;
+
+ wd = kmalloc(sizeof(*wd), GFP_KERNEL);
+ if (!wd)
+ return;
+
+ INIT_WORK(&wd->work, delayed_enumerate_processes);
+ wd->enumeration_count = atomic_add_return(1, &enumeration_count);
+ schedule_work(&wd->work);
+}
diff --git a/security/container/process.h b/security/container/process.h
new file mode 100644
index 0000000..1c98134
--- /dev/null
+++ b/security/container/process.h
@@ -0,0 +1,8 @@
+/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
+/*
+ * Container Security Monitor module
+ *
+ * Copyright (c) 2019 Google, Inc
+ */
+
+void csm_enumerate_processes(void);
diff --git a/security/container/protos/Makefile b/security/container/protos/Makefile
new file mode 100644
index 0000000..a88068b
--- /dev/null
+++ b/security/container/protos/Makefile
@@ -0,0 +1,10 @@
+subdir-$(CONFIG_SECURITY_CONTAINER_MONITOR) += nanopb
+
+obj-$(CONFIG_SECURITY_CONTAINER_MONITOR) += nanopb/
+obj-$(CONFIG_SECURITY_CONTAINER_MONITOR) += protos.o
+
+protos-y := config.pb.o event.pb.o
+
+ccflags-y := -I$(srctree)/security/container/protos \
+ -I$(srctree)/security/container/protos/nanopb \
+ $(PB_CCFLAGS)
diff --git a/security/container/protos/README b/security/container/protos/README
new file mode 100644
index 0000000..1b0628a
--- /dev/null
+++ b/security/container/protos/README
@@ -0,0 +1,18 @@
+This document provides guidance on how to change the protos used in this directory.
+
+Any change made to a proto file requires reformatting it and regenerating the
+nanopb sources. The proto files must also remain compatible with previously
+released versions.
+
+To reformat any proto file run: "clang-format -style=Google -i <file.proto>"
+
+To regenerate nanopb files:
+ - Install protoc
+ - apt-get install protobuf-compiler
+ - Clone/setup nanopb for version 0.4.5, matching the generated sources in
+   this directory (or clone the internal depot)
+ - git clone --depth=1 https://github.com/nanopb/nanopb.git
+ - cd nanopb
+ - git fetch --tags
+ - git checkout tags/0.4.5
+ - make -C generator/proto
+ - Run protoc with the nanopb definition
+ - protoc --plugin=<path_to_nanopb>/generator/protoc-gen-nanopb --nanopb_out=<path_to_linux>/security/container/protos/ <path_to_linux>/security/container/protos/<file.proto> --proto_path=<path_to_linux>/security/container/protos
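+
+Example invocation with illustrative paths (adjust to your checkouts):
+ - protoc --plugin=$HOME/nanopb/generator/protoc-gen-nanopb --nanopb_out=$HOME/linux/security/container/protos/ $HOME/linux/security/container/protos/event.proto --proto_path=$HOME/linux/security/container/protos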
diff --git a/security/container/protos/config.pb.c b/security/container/protos/config.pb.c
new file mode 100644
index 0000000..08436ee
--- /dev/null
+++ b/security/container/protos/config.pb.c
@@ -0,0 +1,25 @@
+/* Automatically generated nanopb constant definitions */
+/* Generated by nanopb-0.4.5 */
+
+#include "config.pb.h"
+#if PB_PROTO_HEADER_VERSION != 40
+#error Regenerate this file with the current version of nanopb generator.
+#endif
+
+PB_BIND(schema_ContainerCollectorConfig, schema_ContainerCollectorConfig, AUTO)
+
+
+PB_BIND(schema_ExecuteCollectorConfig, schema_ExecuteCollectorConfig, AUTO)
+
+
+PB_BIND(schema_MemExecCollectorConfig, schema_MemExecCollectorConfig, AUTO)
+
+
+PB_BIND(schema_ConfigurationRequest, schema_ConfigurationRequest, AUTO)
+
+
+PB_BIND(schema_ConfigurationResponse, schema_ConfigurationResponse, AUTO)
+
+
+
+
diff --git a/security/container/protos/config.pb.h b/security/container/protos/config.pb.h
new file mode 100644
index 0000000..893961e
--- /dev/null
+++ b/security/container/protos/config.pb.h
@@ -0,0 +1,157 @@
+/* Automatically generated nanopb header */
+/* Generated by nanopb-0.4.5 */
+
+#ifndef PB_SCHEMA_CONFIG_PB_H_INCLUDED
+#define PB_SCHEMA_CONFIG_PB_H_INCLUDED
+#include <pb.h>
+
+#if PB_PROTO_HEADER_VERSION != 40
+#error Regenerate this file with the current version of nanopb generator.
+#endif
+
+/* Enum definitions */
+typedef enum _schema_ConfigurationResponse_ErrorCode {
+ schema_ConfigurationResponse_ErrorCode_NO_ERROR = 0,
+ schema_ConfigurationResponse_ErrorCode_UNKNOWN = 2
+} schema_ConfigurationResponse_ErrorCode;
+
+/* Struct definitions */
+/* Report success or failure of previous ConfigurationRequest */
+typedef struct _schema_ConfigurationResponse {
+ schema_ConfigurationResponse_ErrorCode error;
+ pb_callback_t msg;
+ uint64_t version; /* Version of the LSM */
+ uint32_t kernel_version; /* LINUX_VERSION_CODE */
+} schema_ConfigurationResponse;
+
+/* Collect information about running containers */
+typedef struct _schema_ContainerCollectorConfig {
+ bool enabled;
+} schema_ContainerCollectorConfig;
+
+typedef struct _schema_ExecuteCollectorConfig {
+ bool enabled;
+ /* truncate argv/envp if cumulative length exceeds limit */
+ uint32_t argv_limit;
+ uint32_t envp_limit;
+ /* If specified, only report the named environment variables. An
+ empty envp_allowlist indicates that all environment variables
+ should be reported up to a cumulative total of envp_limit bytes. */
+ pb_callback_t envp_allowlist;
+} schema_ExecuteCollectorConfig;
+
+/* Collect information about executable memory mappings. */
+typedef struct _schema_MemExecCollectorConfig {
+ bool enabled;
+} schema_MemExecCollectorConfig;
+
+/* Convey configuration information to Guest LSM */
+typedef struct _schema_ConfigurationRequest {
+ bool has_container_config;
+ schema_ContainerCollectorConfig container_config;
+ bool has_execute_config;
+ schema_ExecuteCollectorConfig execute_config;
+ bool has_memexec_config;
+ schema_MemExecCollectorConfig memexec_config;
+} schema_ConfigurationRequest;
+
+
+/* Helper constants for enums */
+#define _schema_ConfigurationResponse_ErrorCode_MIN schema_ConfigurationResponse_ErrorCode_NO_ERROR
+#define _schema_ConfigurationResponse_ErrorCode_MAX schema_ConfigurationResponse_ErrorCode_UNKNOWN
+#define _schema_ConfigurationResponse_ErrorCode_ARRAYSIZE ((schema_ConfigurationResponse_ErrorCode)(schema_ConfigurationResponse_ErrorCode_UNKNOWN+1))
+
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/* Initializer values for message structs */
+#define schema_ContainerCollectorConfig_init_default {0}
+#define schema_ExecuteCollectorConfig_init_default {0, 0, 0, {{NULL}, NULL}}
+#define schema_MemExecCollectorConfig_init_default {0}
+#define schema_ConfigurationRequest_init_default {false, schema_ContainerCollectorConfig_init_default, false, schema_ExecuteCollectorConfig_init_default, false, schema_MemExecCollectorConfig_init_default}
+#define schema_ConfigurationResponse_init_default {_schema_ConfigurationResponse_ErrorCode_MIN, {{NULL}, NULL}, 0, 0}
+#define schema_ContainerCollectorConfig_init_zero {0}
+#define schema_ExecuteCollectorConfig_init_zero {0, 0, 0, {{NULL}, NULL}}
+#define schema_MemExecCollectorConfig_init_zero {0}
+#define schema_ConfigurationRequest_init_zero {false, schema_ContainerCollectorConfig_init_zero, false, schema_ExecuteCollectorConfig_init_zero, false, schema_MemExecCollectorConfig_init_zero}
+#define schema_ConfigurationResponse_init_zero {_schema_ConfigurationResponse_ErrorCode_MIN, {{NULL}, NULL}, 0, 0}
+
+/* Field tags (for use in manual encoding/decoding) */
+#define schema_ConfigurationResponse_error_tag 1
+#define schema_ConfigurationResponse_msg_tag 2
+#define schema_ConfigurationResponse_version_tag 3
+#define schema_ConfigurationResponse_kernel_version_tag 4
+#define schema_ContainerCollectorConfig_enabled_tag 1
+#define schema_ExecuteCollectorConfig_enabled_tag 1
+#define schema_ExecuteCollectorConfig_argv_limit_tag 2
+#define schema_ExecuteCollectorConfig_envp_limit_tag 3
+#define schema_ExecuteCollectorConfig_envp_allowlist_tag 4
+#define schema_MemExecCollectorConfig_enabled_tag 1
+#define schema_ConfigurationRequest_container_config_tag 1
+#define schema_ConfigurationRequest_execute_config_tag 2
+#define schema_ConfigurationRequest_memexec_config_tag 3
+
+/* Struct field encoding specification for nanopb */
+#define schema_ContainerCollectorConfig_FIELDLIST(X, a) \
+X(a, STATIC, SINGULAR, BOOL, enabled, 1)
+#define schema_ContainerCollectorConfig_CALLBACK NULL
+#define schema_ContainerCollectorConfig_DEFAULT NULL
+
+#define schema_ExecuteCollectorConfig_FIELDLIST(X, a) \
+X(a, STATIC, SINGULAR, BOOL, enabled, 1) \
+X(a, STATIC, SINGULAR, UINT32, argv_limit, 2) \
+X(a, STATIC, SINGULAR, UINT32, envp_limit, 3) \
+X(a, CALLBACK, REPEATED, STRING, envp_allowlist, 4)
+#define schema_ExecuteCollectorConfig_CALLBACK pb_default_field_callback
+#define schema_ExecuteCollectorConfig_DEFAULT NULL
+
+#define schema_MemExecCollectorConfig_FIELDLIST(X, a) \
+X(a, STATIC, SINGULAR, BOOL, enabled, 1)
+#define schema_MemExecCollectorConfig_CALLBACK NULL
+#define schema_MemExecCollectorConfig_DEFAULT NULL
+
+#define schema_ConfigurationRequest_FIELDLIST(X, a) \
+X(a, STATIC, OPTIONAL, MESSAGE, container_config, 1) \
+X(a, STATIC, OPTIONAL, MESSAGE, execute_config, 2) \
+X(a, STATIC, OPTIONAL, MESSAGE, memexec_config, 3)
+#define schema_ConfigurationRequest_CALLBACK NULL
+#define schema_ConfigurationRequest_DEFAULT NULL
+#define schema_ConfigurationRequest_container_config_MSGTYPE schema_ContainerCollectorConfig
+#define schema_ConfigurationRequest_execute_config_MSGTYPE schema_ExecuteCollectorConfig
+#define schema_ConfigurationRequest_memexec_config_MSGTYPE schema_MemExecCollectorConfig
+
+#define schema_ConfigurationResponse_FIELDLIST(X, a) \
+X(a, STATIC, SINGULAR, UENUM, error, 1) \
+X(a, CALLBACK, SINGULAR, STRING, msg, 2) \
+X(a, STATIC, SINGULAR, UINT64, version, 3) \
+X(a, STATIC, SINGULAR, UINT32, kernel_version, 4)
+#define schema_ConfigurationResponse_CALLBACK pb_default_field_callback
+#define schema_ConfigurationResponse_DEFAULT NULL
+
+extern const pb_msgdesc_t schema_ContainerCollectorConfig_msg;
+extern const pb_msgdesc_t schema_ExecuteCollectorConfig_msg;
+extern const pb_msgdesc_t schema_MemExecCollectorConfig_msg;
+extern const pb_msgdesc_t schema_ConfigurationRequest_msg;
+extern const pb_msgdesc_t schema_ConfigurationResponse_msg;
+
+/* Defines for backwards compatibility with code written before nanopb-0.4.0 */
+#define schema_ContainerCollectorConfig_fields &schema_ContainerCollectorConfig_msg
+#define schema_ExecuteCollectorConfig_fields &schema_ExecuteCollectorConfig_msg
+#define schema_MemExecCollectorConfig_fields &schema_MemExecCollectorConfig_msg
+#define schema_ConfigurationRequest_fields &schema_ConfigurationRequest_msg
+#define schema_ConfigurationResponse_fields &schema_ConfigurationResponse_msg
+
+/* Maximum encoded size of messages (where known) */
+/* schema_ExecuteCollectorConfig_size depends on runtime parameters */
+/* schema_ConfigurationRequest_size depends on runtime parameters */
+/* schema_ConfigurationResponse_size depends on runtime parameters */
+#define schema_ContainerCollectorConfig_size 2
+#define schema_MemExecCollectorConfig_size 2
+
+#ifdef __cplusplus
+} /* extern "C" */
+#endif
+
+#endif
diff --git a/security/container/protos/config.proto b/security/container/protos/config.proto
new file mode 100644
index 0000000..e32a517
--- /dev/null
+++ b/security/container/protos/config.proto
@@ -0,0 +1,51 @@
+syntax = "proto3";
+
+package schema;
+
+// Collect information about running containers
+message ContainerCollectorConfig {
+ bool enabled = 1;
+}
+
+message ExecuteCollectorConfig {
+ bool enabled = 1;
+
+ // truncate argv/envp if cumulative length exceeds limit
+ uint32 argv_limit = 2;
+ uint32 envp_limit = 3;
+
+ // If specified, only report the named environment variables. An
+ // empty envp_allowlist indicates that all environment variables
+ // should be reported up to a cumulative total of envp_limit bytes.
+ repeated string envp_allowlist = 4;
+}
+
+// Collect information about executable memory mappings.
+message MemExecCollectorConfig {
+ bool enabled = 1;
+}
+
+// Convey configuration information to Guest LSM
+message ConfigurationRequest {
+ ContainerCollectorConfig container_config = 1;
+ ExecuteCollectorConfig execute_config = 2;
+ MemExecCollectorConfig memexec_config = 3;
+
+ // Additional configuration messages will be added as new collectors
+ // are implemented
+}
+
+// Report success or failure of previous ConfigurationRequest
+message ConfigurationResponse {
+ enum ErrorCode {
+ // Keep values in sync with
+ // https://github.com/googleapis/googleapis/blob/master/google/rpc/code.proto
+ NO_ERROR = 0;
+ UNKNOWN = 2;
+ }
+
+ ErrorCode error = 1;
+ string msg = 2;
+ uint64 version = 3; // Version of the LSM
+ uint32 kernel_version = 4; // LINUX_VERSION_CODE
+}
diff --git a/security/container/protos/event.pb.c b/security/container/protos/event.pb.c
new file mode 100644
index 0000000..2293566
--- /dev/null
+++ b/security/container/protos/event.pb.c
@@ -0,0 +1,61 @@
+/* Automatically generated nanopb constant definitions */
+/* Generated by nanopb-0.4.5 */
+
+#include "event.pb.h"
+#if PB_PROTO_HEADER_VERSION != 40
+#error Regenerate this file with the current version of nanopb generator.
+#endif
+
+PB_BIND(schema_SocketIp, schema_SocketIp, AUTO)
+
+
+PB_BIND(schema_Socket, schema_Socket, AUTO)
+
+
+PB_BIND(schema_Overlay, schema_Overlay, AUTO)
+
+
+PB_BIND(schema_File, schema_File, AUTO)
+
+
+PB_BIND(schema_ProcessArguments, schema_ProcessArguments, AUTO)
+
+
+PB_BIND(schema_Descriptor, schema_Descriptor, AUTO)
+
+
+PB_BIND(schema_Streams, schema_Streams, 2)
+
+
+PB_BIND(schema_Process, schema_Process, 2)
+
+
+PB_BIND(schema_Container, schema_Container, AUTO)
+
+
+PB_BIND(schema_ExecuteEvent, schema_ExecuteEvent, 2)
+
+
+PB_BIND(schema_CloneEvent, schema_CloneEvent, 2)
+
+
+PB_BIND(schema_EnumerateProcessEvent, schema_EnumerateProcessEvent, 2)
+
+
+PB_BIND(schema_MemoryExecEvent, schema_MemoryExecEvent, 2)
+
+
+PB_BIND(schema_ContainerInfoEvent, schema_ContainerInfoEvent, AUTO)
+
+
+PB_BIND(schema_ExitEvent, schema_ExitEvent, AUTO)
+
+
+PB_BIND(schema_Event, schema_Event, 2)
+
+
+PB_BIND(schema_ContainerReport, schema_ContainerReport, AUTO)
+
+
+
+
diff --git a/security/container/protos/event.pb.h b/security/container/protos/event.pb.h
new file mode 100644
index 0000000..9535068
--- /dev/null
+++ b/security/container/protos/event.pb.h
@@ -0,0 +1,518 @@
+/* Automatically generated nanopb header */
+/* Generated by nanopb-0.4.5 */
+
+#ifndef PB_SCHEMA_EVENT_PB_H_INCLUDED
+#define PB_SCHEMA_EVENT_PB_H_INCLUDED
+#include <pb.h>
+
+#if PB_PROTO_HEADER_VERSION != 40
+#error Regenerate this file with the current version of nanopb generator.
+#endif
+
+/* Enum definitions */
+typedef enum _schema_MemoryExecEvent_Action {
+ schema_MemoryExecEvent_Action_UNDEFINED = 0,
+ schema_MemoryExecEvent_Action_MPROTECT = 1,
+ schema_MemoryExecEvent_Action_MMAP_FILE = 2
+} schema_MemoryExecEvent_Action;
+
+/* Struct definitions */
+/* The process with the indicated pid has exited. */
+typedef struct _schema_ExitEvent {
+ pb_callback_t process_uuid;
+} schema_ExitEvent;
+
+typedef struct _schema_Container {
+ uint64_t creation_timestamp; /* container create time in ns */
+ pb_callback_t pod_namespace;
+ pb_callback_t pod_name;
+ uint64_t container_id; /* unique across lifetime of Node */
+ pb_callback_t container_name;
+ pb_callback_t container_image_uri;
+ pb_callback_t labels;
+ pb_callback_t init_uuid;
+ pb_callback_t container_image_id;
+} schema_Container;
+
+typedef struct _schema_Overlay {
+ bool lower_layer;
+ bool upper_layer;
+    pb_callback_t modified_uuid; /* The process that first modified the file. */
+} schema_Overlay;
+
+typedef struct _schema_ProcessArguments {
+ pb_callback_t argv; /* process arguments */
+ uint32_t argv_truncated; /* number of characters truncated from argv */
+ pb_callback_t envp; /* process environment variables */
+ uint32_t envp_truncated; /* number of characters truncated from envp */
+} schema_ProcessArguments;
+
+typedef struct _schema_SocketIp {
+ uint32_t family; /* AF_* for socket type. */
+ pb_callback_t ip; /* ip4 or ip6 address. */
+ uint32_t port; /* port bind or connected. */
+} schema_SocketIp;
+
+/* Associate the following container information with all processes
+ that have the indicated container_id. */
+typedef struct _schema_ContainerInfoEvent {
+ bool has_container;
+ schema_Container container;
+} schema_ContainerInfoEvent;
+
+/* Message sent by the daemonset to the LSM for container enlightenment. */
+typedef struct _schema_ContainerReport {
+ uint32_t pid; /* Top pid of the running container. */
+ bool has_container;
+ schema_Container container; /* Information collected about the container. */
+} schema_ContainerReport;
+
+typedef struct _schema_Socket {
+ bool has_local;
+ schema_SocketIp local;
+ bool has_remote;
+ schema_SocketIp remote; /* unset if not connected. */
+} schema_Socket;
+
+typedef struct _schema_File {
+ pb_callback_t fullpath;
+ pb_size_t which_filesystem;
+    union {
+        schema_Overlay overlayfs;
+        schema_Socket socket;
+    } filesystem;
+    uint32_t ino; /* inode number. */
+ uint64_t ctime;
+} schema_File;
+
+typedef struct _schema_Descriptor {
+ uint32_t mode; /* file mode (stat st_mode) */
+ bool has_file;
+ schema_File file;
+} schema_Descriptor;
+
+typedef struct _schema_Streams {
+ bool has_stdin;
+ schema_Descriptor stdin;
+ bool has_stdout;
+ schema_Descriptor stdout;
+ bool has_stderr;
+ schema_Descriptor stderr;
+} schema_Streams;
+
+typedef struct _schema_Process {
+ uint64_t creation_timestamp; /* Only populated in ExecuteEvent, in ns. */
+ pb_callback_t uuid;
+ uint32_t pid;
+ bool has_binary;
+ schema_File binary; /* Only populated in ExecuteEvent. */
+ uint32_t parent_pid;
+ pb_callback_t parent_uuid;
+ uint64_t container_id; /* unique id of process's container */
+    uint32_t container_pid; /* pid inside the container's pid namespace */
+ uint32_t container_parent_pid; /* optional */
+ bool has_args;
+ schema_ProcessArguments args; /* Only populated in ExecuteEvent. */
+ bool has_streams;
+ schema_Streams streams; /* Only populated in ExecuteEvent. */
+ uint64_t exec_session_id; /* identifier set for kubectl exec sessions. */
+} schema_Process;
+
+/* A process clone is being created. This message means that a cloning operation
+ is being attempted. It may be sent even if fork fails. */
+typedef struct _schema_CloneEvent {
+ bool has_proc;
+ schema_Process proc;
+} schema_CloneEvent;
+
+/* Processes that are enumerated at startup will be sent with this event. There
+ is no distinction from events we would have seen from fork or exec. */
+typedef struct _schema_EnumerateProcessEvent {
+ bool has_proc;
+ schema_Process proc;
+} schema_EnumerateProcessEvent;
+
+/* A binary being executed.
+ e.g., execve() */
+typedef struct _schema_ExecuteEvent {
+ bool has_proc;
+ schema_Process proc;
+} schema_ExecuteEvent;
+
+/* Collect information about mmap/mprotect calls with the PROT_EXEC flag set. */
+typedef struct _schema_MemoryExecEvent {
+ bool has_proc;
+ schema_Process proc; /* The origin process */
+ /* The timestamp in ns when the memory was set executable */
+ uint64_t prot_exec_timestamp;
+ /* The prot flags granted by the kernel for the operation */
+ uint64_t new_flags;
+ /* The prot flags requested for the mprotect/mmap operation */
+ uint64_t req_flags;
+ /* The vm_flags prior to the mprotect operation, if relevant */
+ uint64_t old_vm_flags;
+ /* The operational flags for the mmap operation, if relevant */
+ uint64_t mmap_flags;
+ /* Derived from the file struct describing the fd being mapped */
+ bool has_mapped_file;
+ schema_File mapped_file;
+ schema_MemoryExecEvent_Action action;
+ uint64_t start_addr; /* The executable memory region start addr */
+ uint64_t end_addr; /* The executable memory region end addr */
+ /* True if this event is a mmap of the process' binary */
+ bool is_initial_mmap;
+} schema_MemoryExecEvent;
+
+/* Next ID: 8 */
+typedef struct _schema_Event {
+ pb_size_t which_event;
+ union {
+ schema_ExecuteEvent execute;
+ schema_ContainerInfoEvent container;
+ schema_ExitEvent exit;
+ schema_MemoryExecEvent memexec;
+ schema_CloneEvent clone;
+ schema_EnumerateProcessEvent enumproc;
+ } event;
+ uint64_t timestamp;
+} schema_Event;
+
+
+/* Helper constants for enums */
+#define _schema_MemoryExecEvent_Action_MIN schema_MemoryExecEvent_Action_UNDEFINED
+#define _schema_MemoryExecEvent_Action_MAX schema_MemoryExecEvent_Action_MMAP_FILE
+#define _schema_MemoryExecEvent_Action_ARRAYSIZE ((schema_MemoryExecEvent_Action)(schema_MemoryExecEvent_Action_MMAP_FILE+1))
+
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/* Initializer values for message structs */
+#define schema_SocketIp_init_default {0, {{NULL}, NULL}, 0}
+#define schema_Socket_init_default {false, schema_SocketIp_init_default, false, schema_SocketIp_init_default}
+#define schema_Overlay_init_default {0, 0, {{NULL}, NULL}}
+#define schema_File_init_default {{{NULL}, NULL}, 0, {schema_Overlay_init_default}, 0, 0}
+#define schema_ProcessArguments_init_default {{{NULL}, NULL}, 0, {{NULL}, NULL}, 0}
+#define schema_Descriptor_init_default {0, false, schema_File_init_default}
+#define schema_Streams_init_default {false, schema_Descriptor_init_default, false, schema_Descriptor_init_default, false, schema_Descriptor_init_default}
+#define schema_Process_init_default {0, {{NULL}, NULL}, 0, false, schema_File_init_default, 0, {{NULL}, NULL}, 0, 0, 0, false, schema_ProcessArguments_init_default, false, schema_Streams_init_default, 0}
+#define schema_Container_init_default {0, {{NULL}, NULL}, {{NULL}, NULL}, 0, {{NULL}, NULL}, {{NULL}, NULL}, {{NULL}, NULL}, {{NULL}, NULL}, {{NULL}, NULL}}
+#define schema_ExecuteEvent_init_default {false, schema_Process_init_default}
+#define schema_CloneEvent_init_default {false, schema_Process_init_default}
+#define schema_EnumerateProcessEvent_init_default {false, schema_Process_init_default}
+#define schema_MemoryExecEvent_init_default {false, schema_Process_init_default, 0, 0, 0, 0, 0, false, schema_File_init_default, _schema_MemoryExecEvent_Action_MIN, 0, 0, 0}
+#define schema_ContainerInfoEvent_init_default {false, schema_Container_init_default}
+#define schema_ExitEvent_init_default {{{NULL}, NULL}}
+#define schema_Event_init_default {0, {schema_ExecuteEvent_init_default}, 0}
+#define schema_ContainerReport_init_default {0, false, schema_Container_init_default}
+#define schema_SocketIp_init_zero {0, {{NULL}, NULL}, 0}
+#define schema_Socket_init_zero {false, schema_SocketIp_init_zero, false, schema_SocketIp_init_zero}
+#define schema_Overlay_init_zero {0, 0, {{NULL}, NULL}}
+#define schema_File_init_zero {{{NULL}, NULL}, 0, {schema_Overlay_init_zero}, 0, 0}
+#define schema_ProcessArguments_init_zero {{{NULL}, NULL}, 0, {{NULL}, NULL}, 0}
+#define schema_Descriptor_init_zero {0, false, schema_File_init_zero}
+#define schema_Streams_init_zero {false, schema_Descriptor_init_zero, false, schema_Descriptor_init_zero, false, schema_Descriptor_init_zero}
+#define schema_Process_init_zero {0, {{NULL}, NULL}, 0, false, schema_File_init_zero, 0, {{NULL}, NULL}, 0, 0, 0, false, schema_ProcessArguments_init_zero, false, schema_Streams_init_zero, 0}
+#define schema_Container_init_zero {0, {{NULL}, NULL}, {{NULL}, NULL}, 0, {{NULL}, NULL}, {{NULL}, NULL}, {{NULL}, NULL}, {{NULL}, NULL}, {{NULL}, NULL}}
+#define schema_ExecuteEvent_init_zero {false, schema_Process_init_zero}
+#define schema_CloneEvent_init_zero {false, schema_Process_init_zero}
+#define schema_EnumerateProcessEvent_init_zero {false, schema_Process_init_zero}
+#define schema_MemoryExecEvent_init_zero {false, schema_Process_init_zero, 0, 0, 0, 0, 0, false, schema_File_init_zero, _schema_MemoryExecEvent_Action_MIN, 0, 0, 0}
+#define schema_ContainerInfoEvent_init_zero {false, schema_Container_init_zero}
+#define schema_ExitEvent_init_zero {{{NULL}, NULL}}
+#define schema_Event_init_zero {0, {schema_ExecuteEvent_init_zero}, 0}
+#define schema_ContainerReport_init_zero {0, false, schema_Container_init_zero}
+
+/* Field tags (for use in manual encoding/decoding) */
+#define schema_ExitEvent_process_uuid_tag 1
+#define schema_Container_creation_timestamp_tag 1
+#define schema_Container_pod_namespace_tag 2
+#define schema_Container_pod_name_tag 3
+#define schema_Container_container_id_tag 4
+#define schema_Container_container_name_tag 5
+#define schema_Container_container_image_uri_tag 6
+#define schema_Container_labels_tag 7
+#define schema_Container_init_uuid_tag 8
+#define schema_Container_container_image_id_tag 9
+#define schema_Overlay_lower_layer_tag 1
+#define schema_Overlay_upper_layer_tag 2
+#define schema_Overlay_modified_uuid_tag 3
+#define schema_ProcessArguments_argv_tag 1
+#define schema_ProcessArguments_argv_truncated_tag 2
+#define schema_ProcessArguments_envp_tag 3
+#define schema_ProcessArguments_envp_truncated_tag 4
+#define schema_SocketIp_family_tag 1
+#define schema_SocketIp_ip_tag 2
+#define schema_SocketIp_port_tag 3
+#define schema_ContainerInfoEvent_container_tag 1
+#define schema_ContainerReport_pid_tag 1
+#define schema_ContainerReport_container_tag 2
+#define schema_Socket_local_tag 1
+#define schema_Socket_remote_tag 2
+#define schema_File_fullpath_tag 1
+#define schema_File_overlayfs_tag 2
+#define schema_File_socket_tag 4
+#define schema_File_ino_tag 3
+#define schema_File_ctime_tag 5
+#define schema_Descriptor_mode_tag 1
+#define schema_Descriptor_file_tag 2
+#define schema_Streams_stdin_tag 1
+#define schema_Streams_stdout_tag 2
+#define schema_Streams_stderr_tag 3
+#define schema_Process_creation_timestamp_tag 1
+#define schema_Process_uuid_tag 2
+#define schema_Process_pid_tag 3
+#define schema_Process_binary_tag 4
+#define schema_Process_parent_pid_tag 5
+#define schema_Process_parent_uuid_tag 6
+#define schema_Process_container_id_tag 7
+#define schema_Process_container_pid_tag 8
+#define schema_Process_container_parent_pid_tag 9
+#define schema_Process_args_tag 10
+#define schema_Process_streams_tag 11
+#define schema_Process_exec_session_id_tag 12
+#define schema_CloneEvent_proc_tag 1
+#define schema_EnumerateProcessEvent_proc_tag 1
+#define schema_ExecuteEvent_proc_tag 1
+#define schema_MemoryExecEvent_proc_tag 1
+#define schema_MemoryExecEvent_prot_exec_timestamp_tag 2
+#define schema_MemoryExecEvent_new_flags_tag 3
+#define schema_MemoryExecEvent_req_flags_tag 4
+#define schema_MemoryExecEvent_old_vm_flags_tag 5
+#define schema_MemoryExecEvent_mmap_flags_tag 6
+#define schema_MemoryExecEvent_mapped_file_tag 7
+#define schema_MemoryExecEvent_action_tag 8
+#define schema_MemoryExecEvent_start_addr_tag 9
+#define schema_MemoryExecEvent_end_addr_tag 10
+#define schema_MemoryExecEvent_is_initial_mmap_tag 11
+#define schema_Event_execute_tag 1
+#define schema_Event_container_tag 2
+#define schema_Event_exit_tag 3
+#define schema_Event_memexec_tag 4
+#define schema_Event_clone_tag 5
+#define schema_Event_enumproc_tag 7
+#define schema_Event_timestamp_tag 6
+
+/* Struct field encoding specification for nanopb */
+#define schema_SocketIp_FIELDLIST(X, a) \
+X(a, STATIC, SINGULAR, UINT32, family, 1) \
+X(a, CALLBACK, SINGULAR, BYTES, ip, 2) \
+X(a, STATIC, SINGULAR, UINT32, port, 3)
+#define schema_SocketIp_CALLBACK pb_default_field_callback
+#define schema_SocketIp_DEFAULT NULL
+
+#define schema_Socket_FIELDLIST(X, a) \
+X(a, STATIC, OPTIONAL, MESSAGE, local, 1) \
+X(a, STATIC, OPTIONAL, MESSAGE, remote, 2)
+#define schema_Socket_CALLBACK NULL
+#define schema_Socket_DEFAULT NULL
+#define schema_Socket_local_MSGTYPE schema_SocketIp
+#define schema_Socket_remote_MSGTYPE schema_SocketIp
+
+#define schema_Overlay_FIELDLIST(X, a) \
+X(a, STATIC, SINGULAR, BOOL, lower_layer, 1) \
+X(a, STATIC, SINGULAR, BOOL, upper_layer, 2) \
+X(a, CALLBACK, SINGULAR, BYTES, modified_uuid, 3)
+#define schema_Overlay_CALLBACK pb_default_field_callback
+#define schema_Overlay_DEFAULT NULL
+
+#define schema_File_FIELDLIST(X, a) \
+X(a, CALLBACK, SINGULAR, BYTES, fullpath, 1) \
+X(a, STATIC, ONEOF, MESSAGE, (filesystem,overlayfs,filesystem.overlayfs), 2) \
+X(a, STATIC, SINGULAR, UINT32, ino, 3) \
+X(a, STATIC, ONEOF, MESSAGE, (filesystem,socket,filesystem.socket), 4) \
+X(a, STATIC, SINGULAR, UINT64, ctime, 5)
+#define schema_File_CALLBACK pb_default_field_callback
+#define schema_File_DEFAULT NULL
+#define schema_File_filesystem_overlayfs_MSGTYPE schema_Overlay
+#define schema_File_filesystem_socket_MSGTYPE schema_Socket
+
+#define schema_ProcessArguments_FIELDLIST(X, a) \
+X(a, CALLBACK, REPEATED, BYTES, argv, 1) \
+X(a, STATIC, SINGULAR, UINT32, argv_truncated, 2) \
+X(a, CALLBACK, REPEATED, BYTES, envp, 3) \
+X(a, STATIC, SINGULAR, UINT32, envp_truncated, 4)
+#define schema_ProcessArguments_CALLBACK pb_default_field_callback
+#define schema_ProcessArguments_DEFAULT NULL
+
+#define schema_Descriptor_FIELDLIST(X, a) \
+X(a, STATIC, SINGULAR, UINT32, mode, 1) \
+X(a, STATIC, OPTIONAL, MESSAGE, file, 2)
+#define schema_Descriptor_CALLBACK NULL
+#define schema_Descriptor_DEFAULT NULL
+#define schema_Descriptor_file_MSGTYPE schema_File
+
+#define schema_Streams_FIELDLIST(X, a) \
+X(a, STATIC, OPTIONAL, MESSAGE, stdin, 1) \
+X(a, STATIC, OPTIONAL, MESSAGE, stdout, 2) \
+X(a, STATIC, OPTIONAL, MESSAGE, stderr, 3)
+#define schema_Streams_CALLBACK NULL
+#define schema_Streams_DEFAULT NULL
+#define schema_Streams_stdin_MSGTYPE schema_Descriptor
+#define schema_Streams_stdout_MSGTYPE schema_Descriptor
+#define schema_Streams_stderr_MSGTYPE schema_Descriptor
+
+#define schema_Process_FIELDLIST(X, a) \
+X(a, STATIC, SINGULAR, UINT64, creation_timestamp, 1) \
+X(a, CALLBACK, SINGULAR, BYTES, uuid, 2) \
+X(a, STATIC, SINGULAR, UINT32, pid, 3) \
+X(a, STATIC, OPTIONAL, MESSAGE, binary, 4) \
+X(a, STATIC, SINGULAR, UINT32, parent_pid, 5) \
+X(a, CALLBACK, SINGULAR, BYTES, parent_uuid, 6) \
+X(a, STATIC, SINGULAR, UINT64, container_id, 7) \
+X(a, STATIC, SINGULAR, UINT32, container_pid, 8) \
+X(a, STATIC, SINGULAR, UINT32, container_parent_pid, 9) \
+X(a, STATIC, OPTIONAL, MESSAGE, args, 10) \
+X(a, STATIC, OPTIONAL, MESSAGE, streams, 11) \
+X(a, STATIC, SINGULAR, UINT64, exec_session_id, 12)
+#define schema_Process_CALLBACK pb_default_field_callback
+#define schema_Process_DEFAULT NULL
+#define schema_Process_binary_MSGTYPE schema_File
+#define schema_Process_args_MSGTYPE schema_ProcessArguments
+#define schema_Process_streams_MSGTYPE schema_Streams
+
+#define schema_Container_FIELDLIST(X, a) \
+X(a, STATIC, SINGULAR, UINT64, creation_timestamp, 1) \
+X(a, CALLBACK, SINGULAR, BYTES, pod_namespace, 2) \
+X(a, CALLBACK, SINGULAR, BYTES, pod_name, 3) \
+X(a, STATIC, SINGULAR, UINT64, container_id, 4) \
+X(a, CALLBACK, SINGULAR, BYTES, container_name, 5) \
+X(a, CALLBACK, SINGULAR, BYTES, container_image_uri, 6) \
+X(a, CALLBACK, REPEATED, BYTES, labels, 7) \
+X(a, CALLBACK, SINGULAR, BYTES, init_uuid, 8) \
+X(a, CALLBACK, SINGULAR, BYTES, container_image_id, 9)
+#define schema_Container_CALLBACK pb_default_field_callback
+#define schema_Container_DEFAULT NULL
+
+#define schema_ExecuteEvent_FIELDLIST(X, a) \
+X(a, STATIC, OPTIONAL, MESSAGE, proc, 1)
+#define schema_ExecuteEvent_CALLBACK NULL
+#define schema_ExecuteEvent_DEFAULT NULL
+#define schema_ExecuteEvent_proc_MSGTYPE schema_Process
+
+#define schema_CloneEvent_FIELDLIST(X, a) \
+X(a, STATIC, OPTIONAL, MESSAGE, proc, 1)
+#define schema_CloneEvent_CALLBACK NULL
+#define schema_CloneEvent_DEFAULT NULL
+#define schema_CloneEvent_proc_MSGTYPE schema_Process
+
+#define schema_EnumerateProcessEvent_FIELDLIST(X, a) \
+X(a, STATIC, OPTIONAL, MESSAGE, proc, 1)
+#define schema_EnumerateProcessEvent_CALLBACK NULL
+#define schema_EnumerateProcessEvent_DEFAULT NULL
+#define schema_EnumerateProcessEvent_proc_MSGTYPE schema_Process
+
+#define schema_MemoryExecEvent_FIELDLIST(X, a) \
+X(a, STATIC, OPTIONAL, MESSAGE, proc, 1) \
+X(a, STATIC, SINGULAR, UINT64, prot_exec_timestamp, 2) \
+X(a, STATIC, SINGULAR, UINT64, new_flags, 3) \
+X(a, STATIC, SINGULAR, UINT64, req_flags, 4) \
+X(a, STATIC, SINGULAR, UINT64, old_vm_flags, 5) \
+X(a, STATIC, SINGULAR, UINT64, mmap_flags, 6) \
+X(a, STATIC, OPTIONAL, MESSAGE, mapped_file, 7) \
+X(a, STATIC, SINGULAR, UENUM, action, 8) \
+X(a, STATIC, SINGULAR, UINT64, start_addr, 9) \
+X(a, STATIC, SINGULAR, UINT64, end_addr, 10) \
+X(a, STATIC, SINGULAR, BOOL, is_initial_mmap, 11)
+#define schema_MemoryExecEvent_CALLBACK NULL
+#define schema_MemoryExecEvent_DEFAULT NULL
+#define schema_MemoryExecEvent_proc_MSGTYPE schema_Process
+#define schema_MemoryExecEvent_mapped_file_MSGTYPE schema_File
+
+#define schema_ContainerInfoEvent_FIELDLIST(X, a) \
+X(a, STATIC, OPTIONAL, MESSAGE, container, 1)
+#define schema_ContainerInfoEvent_CALLBACK NULL
+#define schema_ContainerInfoEvent_DEFAULT NULL
+#define schema_ContainerInfoEvent_container_MSGTYPE schema_Container
+
+#define schema_ExitEvent_FIELDLIST(X, a) \
+X(a, CALLBACK, SINGULAR, BYTES, process_uuid, 1)
+#define schema_ExitEvent_CALLBACK pb_default_field_callback
+#define schema_ExitEvent_DEFAULT NULL
+
+#define schema_Event_FIELDLIST(X, a) \
+X(a, STATIC, ONEOF, MESSAGE, (event,execute,event.execute), 1) \
+X(a, STATIC, ONEOF, MESSAGE, (event,container,event.container), 2) \
+X(a, STATIC, ONEOF, MESSAGE, (event,exit,event.exit), 3) \
+X(a, STATIC, ONEOF, MESSAGE, (event,memexec,event.memexec), 4) \
+X(a, STATIC, ONEOF, MESSAGE, (event,clone,event.clone), 5) \
+X(a, STATIC, SINGULAR, UINT64, timestamp, 6) \
+X(a, STATIC, ONEOF, MESSAGE, (event,enumproc,event.enumproc), 7)
+#define schema_Event_CALLBACK NULL
+#define schema_Event_DEFAULT NULL
+#define schema_Event_event_execute_MSGTYPE schema_ExecuteEvent
+#define schema_Event_event_container_MSGTYPE schema_ContainerInfoEvent
+#define schema_Event_event_exit_MSGTYPE schema_ExitEvent
+#define schema_Event_event_memexec_MSGTYPE schema_MemoryExecEvent
+#define schema_Event_event_clone_MSGTYPE schema_CloneEvent
+#define schema_Event_event_enumproc_MSGTYPE schema_EnumerateProcessEvent
+
+#define schema_ContainerReport_FIELDLIST(X, a) \
+X(a, STATIC, SINGULAR, UINT32, pid, 1) \
+X(a, STATIC, OPTIONAL, MESSAGE, container, 2)
+#define schema_ContainerReport_CALLBACK NULL
+#define schema_ContainerReport_DEFAULT NULL
+#define schema_ContainerReport_container_MSGTYPE schema_Container
+
+extern const pb_msgdesc_t schema_SocketIp_msg;
+extern const pb_msgdesc_t schema_Socket_msg;
+extern const pb_msgdesc_t schema_Overlay_msg;
+extern const pb_msgdesc_t schema_File_msg;
+extern const pb_msgdesc_t schema_ProcessArguments_msg;
+extern const pb_msgdesc_t schema_Descriptor_msg;
+extern const pb_msgdesc_t schema_Streams_msg;
+extern const pb_msgdesc_t schema_Process_msg;
+extern const pb_msgdesc_t schema_Container_msg;
+extern const pb_msgdesc_t schema_ExecuteEvent_msg;
+extern const pb_msgdesc_t schema_CloneEvent_msg;
+extern const pb_msgdesc_t schema_EnumerateProcessEvent_msg;
+extern const pb_msgdesc_t schema_MemoryExecEvent_msg;
+extern const pb_msgdesc_t schema_ContainerInfoEvent_msg;
+extern const pb_msgdesc_t schema_ExitEvent_msg;
+extern const pb_msgdesc_t schema_Event_msg;
+extern const pb_msgdesc_t schema_ContainerReport_msg;
+
+/* Defines for backwards compatibility with code written before nanopb-0.4.0 */
+#define schema_SocketIp_fields &schema_SocketIp_msg
+#define schema_Socket_fields &schema_Socket_msg
+#define schema_Overlay_fields &schema_Overlay_msg
+#define schema_File_fields &schema_File_msg
+#define schema_ProcessArguments_fields &schema_ProcessArguments_msg
+#define schema_Descriptor_fields &schema_Descriptor_msg
+#define schema_Streams_fields &schema_Streams_msg
+#define schema_Process_fields &schema_Process_msg
+#define schema_Container_fields &schema_Container_msg
+#define schema_ExecuteEvent_fields &schema_ExecuteEvent_msg
+#define schema_CloneEvent_fields &schema_CloneEvent_msg
+#define schema_EnumerateProcessEvent_fields &schema_EnumerateProcessEvent_msg
+#define schema_MemoryExecEvent_fields &schema_MemoryExecEvent_msg
+#define schema_ContainerInfoEvent_fields &schema_ContainerInfoEvent_msg
+#define schema_ExitEvent_fields &schema_ExitEvent_msg
+#define schema_Event_fields &schema_Event_msg
+#define schema_ContainerReport_fields &schema_ContainerReport_msg
+
+/* Maximum encoded size of messages (where known) */
+/* schema_SocketIp_size depends on runtime parameters */
+/* schema_Socket_size depends on runtime parameters */
+/* schema_Overlay_size depends on runtime parameters */
+/* schema_File_size depends on runtime parameters */
+/* schema_ProcessArguments_size depends on runtime parameters */
+/* schema_Descriptor_size depends on runtime parameters */
+/* schema_Streams_size depends on runtime parameters */
+/* schema_Process_size depends on runtime parameters */
+/* schema_Container_size depends on runtime parameters */
+/* schema_ExecuteEvent_size depends on runtime parameters */
+/* schema_CloneEvent_size depends on runtime parameters */
+/* schema_EnumerateProcessEvent_size depends on runtime parameters */
+/* schema_MemoryExecEvent_size depends on runtime parameters */
+/* schema_ContainerInfoEvent_size depends on runtime parameters */
+/* schema_ExitEvent_size depends on runtime parameters */
+/* schema_Event_size depends on runtime parameters */
+/* schema_ContainerReport_size depends on runtime parameters */
+
+#ifdef __cplusplus
+} /* extern "C" */
+#endif
+
+#endif
diff --git a/security/container/protos/event.proto b/security/container/protos/event.proto
new file mode 100644
index 0000000..dfe483f
--- /dev/null
+++ b/security/container/protos/event.proto
@@ -0,0 +1,151 @@
+syntax = "proto3";
+
+package schema;
+
+message SocketIp {
+ uint32 family = 1; // AF_* for socket type.
+ bytes ip = 2; // ip4 or ip6 address.
+ uint32 port = 3; // port bind or connected.
+}
+
+message Socket {
+ SocketIp local = 1;
+ SocketIp remote = 2; // unset if not connected.
+}
+
+message Overlay {
+ bool lower_layer = 1;
+ bool upper_layer = 2;
+  bytes modified_uuid = 3; // The process that first modified the file.
+}
+
+message File {
+ bytes fullpath = 1;
+ uint32 ino = 3; // inode number.
+ oneof filesystem {
+ Overlay overlayfs = 2;
+ Socket socket = 4;
+ }
+}
+
+message ProcessArguments {
+ repeated bytes argv = 1; // process arguments
+ uint32 argv_truncated = 2; // number of characters truncated from argv
+ repeated bytes envp = 3; // process environment variables
+ uint32 envp_truncated = 4; // number of characters truncated from envp
+}
+
+message Descriptor {
+ uint32 mode = 1; // file mode (stat st_mode)
+ File file = 2;
+}
+
+message Streams {
+ Descriptor stdin = 1;
+ Descriptor stdout = 2;
+ Descriptor stderr = 3;
+}
+
+message Process {
+ uint64 creation_timestamp = 1; // Only populated in ExecuteEvent, in ns.
+ bytes uuid = 2;
+ uint32 pid = 3;
+ File binary = 4; // Only populated in ExecuteEvent.
+ uint32 parent_pid = 5;
+ bytes parent_uuid = 6;
+ uint64 container_id = 7; // unique id of process's container
+ uint32 container_pid = 8; // pid inside the container's pid namespace
+ uint32 container_parent_pid = 9; // optional
+ ProcessArguments args = 10; // Only populated in ExecuteEvent.
+ Streams streams = 11; // Only populated in ExecuteEvent.
+ uint64 exec_session_id = 12; // identifier set for kubectl exec sessions.
+}
+
+message Container {
+ uint64 creation_timestamp = 1; // container create time in ns
+ bytes pod_namespace = 2;
+ bytes pod_name = 3;
+ uint64 container_id = 4; // unique across lifetime of Node
+ bytes container_name = 5;
+ bytes container_image_uri = 6;
+ repeated bytes labels = 7;
+ bytes init_uuid = 8;
+ bytes container_image_id = 9;
+}
+
+// A binary being executed.
+// e.g., execve()
+message ExecuteEvent {
+ Process proc = 1;
+}
+
+// A process clone is being created. This message means that a cloning
+// operation is being attempted; it may be sent even if the fork fails.
+message CloneEvent {
+ Process proc = 1;
+}
+
+// Processes that are enumerated at startup will be sent with this event. There
+// is no distinction from events we would have seen from fork or exec.
+message EnumerateProcessEvent {
+ Process proc = 1;
+}
+
+// Collect information about mmap/mprotect calls with the PROT_EXEC flag set.
+message MemoryExecEvent {
+ Process proc = 1; // The origin process
+ // The timestamp in ns when the memory was set executable
+ uint64 prot_exec_timestamp = 2;
+ // The prot flags granted by the kernel for the operation
+ uint64 new_flags = 3;
+ // The prot flags requested for the mprotect/mmap operation
+ uint64 req_flags = 4;
+ // The vm_flags prior to the mprotect operation, if relevant
+ uint64 old_vm_flags = 5;
+ // The operational flags for the mmap operation, if relevant
+ uint64 mmap_flags = 6;
+ // Derived from the file struct describing the fd being mapped
+ File mapped_file = 7;
+ enum Action {
+ UNDEFINED = 0;
+ MPROTECT = 1;
+ MMAP_FILE = 2;
+ }
+ Action action = 8;
+
+ uint64 start_addr = 9; // The executable memory region start addr
+ uint64 end_addr = 10; // The executable memory region end addr
+ // True if this event is an mmap of the process's binary
+ bool is_initial_mmap = 11;
+}
+
+// Associate the following container information with all processes
+// that have the indicated container_id.
+message ContainerInfoEvent {
+ Container container = 1;
+}
+
+// The process with the indicated pid has exited.
+message ExitEvent {
+ bytes process_uuid = 1;
+}
+
+// Next ID: 8
+message Event {
+ oneof event {
+ ExecuteEvent execute = 1;
+ ContainerInfoEvent container = 2;
+ ExitEvent exit = 3;
+ MemoryExecEvent memexec = 4;
+ CloneEvent clone = 5;
+ EnumerateProcessEvent enumproc = 7;
+ }
+
+ uint64 timestamp = 6; // In nanoseconds
+}
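+
+// Illustrative example (editor's note, not part of the schema): an execve
+// would typically be reported as an Event wrapping an ExecuteEvent, e.g.
+// in text format:
+//
+//   timestamp: 1630000000000000000
+//   execute {
+//     proc {
+//       pid: 1234
+//       binary { fullpath: "/usr/bin/sleep" }
+//     }
+//   }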
+
+// Message sent by the daemonset to the LSM for container enlightenment.
+message ContainerReport {
+ uint32 pid = 1; // Top pid of the running container.
+ Container container = 2; // Information collected about the container.
+}
diff --git a/security/container/protos/nanopb/LICENSE b/security/container/protos/nanopb/LICENSE
new file mode 100644
index 0000000..a83630a
--- /dev/null
+++ b/security/container/protos/nanopb/LICENSE
@@ -0,0 +1,20 @@
+Copyright (c) 2011 Petteri Aimonen <jpa at nanopb.mail.kapsi.fi>
+
+This software is provided 'as-is', without any express or
+implied warranty. In no event will the authors be held liable
+for any damages arising from the use of this software.
+
+Permission is granted to anyone to use this software for any
+purpose, including commercial applications, and to alter it and
+redistribute it freely, subject to the following restrictions:
+
+1. The origin of this software must not be misrepresented; you
+ must not claim that you wrote the original software. If you use
+ this software in a product, an acknowledgment in the product
+ documentation would be appreciated but is not required.
+
+2. Altered source versions must be plainly marked as such, and
+ must not be misrepresented as being the original software.
+
+3. This notice may not be removed or altered from any source
+ distribution.
diff --git a/security/container/protos/nanopb/METADATA b/security/container/protos/nanopb/METADATA
new file mode 100644
index 0000000..6b85630
--- /dev/null
+++ b/security/container/protos/nanopb/METADATA
@@ -0,0 +1,23 @@
+name: "nanopb"
+description: "Nanopb is a C library for encoding and decoding protocol buffers."
+
+third_party {
+ url {
+ type: GIT
+ value: "https://github.com/nanopb/nanopb/"
+ }
+ version: "0.4.5"
+ last_upgrade_date: {
+ year: 2021
+ month: 8
+ day: 12
+ }
+ license_type: NOTICE
+ security {
+ category: REVIEWED_AND_SECURE
+ note: "https://buganizer.corp.google.com/u/0/issues/19409596, https://buganizer.corp.google.com/u/0/issues/120506242"
+ tag: "NVD-CPE2.3:cpe:/a:nanopb_project:nanopb"
+ tag: "vuln_reporting:buganizer_component:588910"
+ tag: "vuln_reporting:contact_emails:" # Blunderbuss will assign bugs.
+ }
+}
diff --git a/security/container/protos/nanopb/Makefile b/security/container/protos/nanopb/Makefile
new file mode 100644
index 0000000..b7e15f8
--- /dev/null
+++ b/security/container/protos/nanopb/Makefile
@@ -0,0 +1,7 @@
+obj-$(CONFIG_SECURITY_CONTAINER_MONITOR) += nanopb.o
+
+nanopb-y := pb_encode.o pb_decode.o pb_common.o
+
+ccflags-y := -I$(srctree)/security/container/protos \
+ -I$(srctree)/security/container/protos/nanopb \
+ $(PB_CCFLAGS)
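+
+# PB_CCFLAGS is a hook for passing nanopb compile-time options (see the
+# option list at the top of nanopb/pb.h), e.g. PB_CCFLAGS="-DPB_FIELD_32BIT=1"
+# (editor's example).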
diff --git a/security/container/protos/nanopb/pb.h b/security/container/protos/nanopb/pb.h
new file mode 100644
index 0000000..be7c067
--- /dev/null
+++ b/security/container/protos/nanopb/pb.h
@@ -0,0 +1,875 @@
+/* Common parts of the nanopb library. Most of this is quite low-level
+ * stuff. For the high-level interface, see pb_encode.h and pb_decode.h.
+ */
+
+#ifndef PB_H_INCLUDED
+#define PB_H_INCLUDED
+
+/*****************************************************************
+ * Nanopb compilation time options. You can change these here by *
+ * uncommenting the lines, or on the compiler command line. *
+ *****************************************************************/
+
+/* Enable support for dynamically allocated fields */
+/* #define PB_ENABLE_MALLOC 1 */
+
+/* Define this if your CPU / compiler combination does not support
+ * unaligned memory access to packed structures. */
+/* #define PB_NO_PACKED_STRUCTS 1 */
+
+/* Increase the number of required fields that are tracked.
+ * A compiler warning will tell if you need this. */
+/* #define PB_MAX_REQUIRED_FIELDS 256 */
+
+/* Add support for tag numbers > 65536 and fields larger than 65536 bytes. */
+/* #define PB_FIELD_32BIT 1 */
+
+/* Disable support for error messages in order to save some code space. */
+/* #define PB_NO_ERRMSG 1 */
+
+/* Disable support for custom streams (support only memory buffers). */
+/* #define PB_BUFFER_ONLY 1 */
+
+/* Disable support for 64-bit datatypes, for compilers without int64_t
+ or to save some code space. */
+/* #define PB_WITHOUT_64BIT 1 */
+
+/* Don't encode scalar arrays as packed. This is only to be used when
+ * the decoder on the receiving side cannot process packed scalar arrays.
+ * One such example is older protobuf.js. */
+/* #define PB_ENCODE_ARRAYS_UNPACKED 1 */
+
+/* Enable conversion of doubles to floats for platforms that do not
+ * support 64-bit doubles. Most commonly AVR. */
+/* #define PB_CONVERT_DOUBLE_FLOAT 1 */
+
+/* Check whether incoming strings are valid UTF-8 sequences. Slows down
+ * the string processing slightly and slightly increases code size. */
+/* #define PB_VALIDATE_UTF8 1 */
+
+/******************************************************************
+ * You usually don't need to change anything below this line. *
+ * Feel free to look around and use the defined macros, though. *
+ ******************************************************************/
+
+
+/* Version of the nanopb library. Just in case you want to check it in
+ * your own program. */
+#define NANOPB_VERSION nanopb-0.4.5
+
+/* Include all the system headers needed by nanopb. You will need the
+ * definitions of the following:
+ * - strlen, memcpy, memset functions
+ * - [u]int_least8_t, uint_fast8_t, [u]int_least16_t, [u]int32_t, [u]int64_t
+ * - size_t
+ * - bool
+ *
+ * If you don't have the standard header files, you can instead provide
+ * a custom header that defines or includes all this. In that case,
+ * define PB_SYSTEM_HEADER to the path of this file.
+ */
+#ifdef PB_SYSTEM_HEADER
+#include PB_SYSTEM_HEADER
+#else
+#include <stdint.h>
+#include <stddef.h>
+#include <stdbool.h>
+#include <string.h>
+#include <limits.h>
+
+#ifdef PB_ENABLE_MALLOC
+#include <stdlib.h>
+#endif
+#endif
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/* Macro for defining packed structures (compiler dependent).
+ * This just reduces memory requirements, but is not required.
+ */
+#if defined(PB_NO_PACKED_STRUCTS)
+ /* Disable struct packing */
+# define PB_PACKED_STRUCT_START
+# define PB_PACKED_STRUCT_END
+# define pb_packed
+#elif defined(__GNUC__) || defined(__clang__)
+ /* For GCC and clang */
+# define PB_PACKED_STRUCT_START
+# define PB_PACKED_STRUCT_END
+# define pb_packed __attribute__((packed))
+#elif defined(__ICCARM__) || defined(__CC_ARM)
+ /* For IAR ARM and Keil MDK-ARM compilers */
+# define PB_PACKED_STRUCT_START _Pragma("pack(push, 1)")
+# define PB_PACKED_STRUCT_END _Pragma("pack(pop)")
+# define pb_packed
+#elif defined(_MSC_VER) && (_MSC_VER >= 1500)
+ /* For Microsoft Visual C++ */
+# define PB_PACKED_STRUCT_START __pragma(pack(push, 1))
+# define PB_PACKED_STRUCT_END __pragma(pack(pop))
+# define pb_packed
+#else
+ /* Unknown compiler */
+# define PB_PACKED_STRUCT_START
+# define PB_PACKED_STRUCT_END
+# define pb_packed
+#endif
+
+/* Handy macro for suppressing unreferenced-parameter compiler warnings. */
+#ifndef PB_UNUSED
+#define PB_UNUSED(x) (void)(x)
+#endif
+
+/* Harvard-architecture processors may need special attributes for storing
+ * field information in program memory. */
+#ifndef PB_PROGMEM
+#ifdef __AVR__
+#include <avr/pgmspace.h>
+#define PB_PROGMEM PROGMEM
+#define PB_PROGMEM_READU32(x) pgm_read_dword(&x)
+#else
+#define PB_PROGMEM
+#define PB_PROGMEM_READU32(x) (x)
+#endif
+#endif
+
+/* Compile-time assertion, used for checking compatible compilation options.
+ * If this does not work properly on your compiler, use
+ * #define PB_NO_STATIC_ASSERT to disable it.
+ *
+ * But before doing that, check carefully the error message and the place it
+ * comes from to see if the error has a real cause. Unfortunately the error
+ * message is not always very clear to read, but you can see the reason better
+ * at the place where the PB_STATIC_ASSERT macro was called.
+ */
+#ifndef PB_NO_STATIC_ASSERT
+# ifndef PB_STATIC_ASSERT
+# if defined(__STDC_VERSION__) && __STDC_VERSION__ >= 201112L
+ /* C11 standard _Static_assert mechanism */
+# define PB_STATIC_ASSERT(COND,MSG) _Static_assert(COND,#MSG);
+# else
+ /* Classic negative-size-array static assert mechanism */
+# define PB_STATIC_ASSERT(COND,MSG) typedef char PB_STATIC_ASSERT_MSG(MSG, __LINE__, __COUNTER__)[(COND)?1:-1];
+# define PB_STATIC_ASSERT_MSG(MSG, LINE, COUNTER) PB_STATIC_ASSERT_MSG_(MSG, LINE, COUNTER)
+# define PB_STATIC_ASSERT_MSG_(MSG, LINE, COUNTER) pb_static_assertion_##MSG##_##LINE##_##COUNTER
+# endif
+# endif
+#else
+ /* Static asserts disabled by PB_NO_STATIC_ASSERT */
+# define PB_STATIC_ASSERT(COND,MSG)
+#endif
+
+/* Number of required fields to keep track of. */
+#ifndef PB_MAX_REQUIRED_FIELDS
+#define PB_MAX_REQUIRED_FIELDS 64
+#endif
+
+#if PB_MAX_REQUIRED_FIELDS < 64
+#error You should not lower PB_MAX_REQUIRED_FIELDS from the default value (64).
+#endif
+
+#ifdef PB_WITHOUT_64BIT
+#ifdef PB_CONVERT_DOUBLE_FLOAT
+/* Cannot use doubles without 64-bit types */
+#undef PB_CONVERT_DOUBLE_FLOAT
+#endif
+#endif
+
+/* List of possible field types. These are used in the autogenerated code.
+ * Least-significant 4 bits tell the scalar type
+ * Most-significant 4 bits specify repeated/required/packed etc.
+ */
+
+typedef uint_least8_t pb_type_t;
+
+/**** Field data types ****/
+
+/* Numeric types */
+#define PB_LTYPE_BOOL 0x00U /* bool */
+#define PB_LTYPE_VARINT 0x01U /* int32, int64, enum, bool */
+#define PB_LTYPE_UVARINT 0x02U /* uint32, uint64 */
+#define PB_LTYPE_SVARINT 0x03U /* sint32, sint64 */
+#define PB_LTYPE_FIXED32 0x04U /* fixed32, sfixed32, float */
+#define PB_LTYPE_FIXED64 0x05U /* fixed64, sfixed64, double */
+
+/* Marker for last packable field type. */
+#define PB_LTYPE_LAST_PACKABLE 0x05U
+
+/* Byte array with pre-allocated buffer.
+ * data_size is the length of the allocated PB_BYTES_ARRAY structure. */
+#define PB_LTYPE_BYTES 0x06U
+
+/* String with pre-allocated buffer.
+ * data_size is the maximum length. */
+#define PB_LTYPE_STRING 0x07U
+
+/* Submessage
+ * submsg_fields is pointer to field descriptions */
+#define PB_LTYPE_SUBMESSAGE 0x08U
+
+/* Submessage with pre-decoding callback
+ * The pre-decoding callback is stored as pb_callback_t right before pSize.
+ * submsg_fields is pointer to field descriptions */
+#define PB_LTYPE_SUBMSG_W_CB 0x09U
+
+/* Extension pseudo-field
+ * The field contains a pointer to pb_extension_t */
+#define PB_LTYPE_EXTENSION 0x0AU
+
+/* Byte array with inline, pre-allocated buffer.
+ * data_size is the length of the inline, allocated buffer.
+ * This differs from PB_LTYPE_BYTES by defining the element as
+ * pb_byte_t[data_size] rather than pb_bytes_array_t. */
+#define PB_LTYPE_FIXED_LENGTH_BYTES 0x0BU
+
+/* Number of declared LTYPES */
+#define PB_LTYPES_COUNT 0x0CU
+#define PB_LTYPE_MASK 0x0FU
+
+/**** Field repetition rules ****/
+
+#define PB_HTYPE_REQUIRED 0x00U
+#define PB_HTYPE_OPTIONAL 0x10U
+#define PB_HTYPE_SINGULAR 0x10U
+#define PB_HTYPE_REPEATED 0x20U
+#define PB_HTYPE_FIXARRAY 0x20U
+#define PB_HTYPE_ONEOF 0x30U
+#define PB_HTYPE_MASK 0x30U
+
+/**** Field allocation types ****/
+
+#define PB_ATYPE_STATIC 0x00U
+#define PB_ATYPE_POINTER 0x80U
+#define PB_ATYPE_CALLBACK 0x40U
+#define PB_ATYPE_MASK 0xC0U
+
+#define PB_ATYPE(x) ((x) & PB_ATYPE_MASK)
+#define PB_HTYPE(x) ((x) & PB_HTYPE_MASK)
+#define PB_LTYPE(x) ((x) & PB_LTYPE_MASK)
+#define PB_LTYPE_IS_SUBMSG(x) (PB_LTYPE(x) == PB_LTYPE_SUBMESSAGE || \
+ PB_LTYPE(x) == PB_LTYPE_SUBMSG_W_CB)
+
+/* Data type used for storing sizes of struct fields
+ * and array counts.
+ */
+#if defined(PB_FIELD_32BIT)
+ typedef uint32_t pb_size_t;
+ typedef int32_t pb_ssize_t;
+#else
+ typedef uint_least16_t pb_size_t;
+ typedef int_least16_t pb_ssize_t;
+#endif
+#define PB_SIZE_MAX ((pb_size_t)-1)
+
+/* Data type for storing encoded data and other byte streams.
+ * This typedef exists to support platforms where uint8_t does not exist.
+ * You can regard it as equivalent to uint8_t on other platforms.
+ */
+typedef uint_least8_t pb_byte_t;
+
+/* Forward declaration of struct types */
+typedef struct pb_istream_s pb_istream_t;
+typedef struct pb_ostream_s pb_ostream_t;
+typedef struct pb_field_iter_s pb_field_iter_t;
+
+/* This structure is used in auto-generated constants
+ * to specify struct fields.
+ */
+typedef struct pb_msgdesc_s pb_msgdesc_t;
+struct pb_msgdesc_s {
+ const uint32_t *field_info;
+ const pb_msgdesc_t * const * submsg_info;
+ const pb_byte_t *default_value;
+
+ bool (*field_callback)(pb_istream_t *istream, pb_ostream_t *ostream, const pb_field_iter_t *field);
+
+ pb_size_t field_count;
+ pb_size_t required_field_count;
+ pb_size_t largest_tag;
+};
+
+/* Iterator for message descriptor */
+struct pb_field_iter_s {
+ const pb_msgdesc_t *descriptor; /* Pointer to message descriptor constant */
+ void *message; /* Pointer to start of the structure */
+
+ pb_size_t index; /* Index of the field */
+ pb_size_t field_info_index; /* Index to descriptor->field_info array */
+ pb_size_t required_field_index; /* Index that counts only the required fields */
+ pb_size_t submessage_index; /* Index that counts only submessages */
+
+ pb_size_t tag; /* Tag of current field */
+ pb_size_t data_size; /* sizeof() of a single item */
+ pb_size_t array_size; /* Number of array entries */
+ pb_type_t type; /* Type of current field */
+
+ void *pField; /* Pointer to current field in struct */
+ void *pData; /* Pointer to current data contents. Different than pField for arrays and pointers. */
+ void *pSize; /* Pointer to count/has field */
+
+ const pb_msgdesc_t *submsg_desc; /* For submessage fields, pointer to field descriptor for the submessage. */
+};
+
+/* For compatibility with legacy code */
+typedef pb_field_iter_t pb_field_t;
+
+/* Make sure that the standard integer types are of the expected sizes.
+ * Otherwise fixed32/fixed64 fields can break.
+ *
+ * If you get errors here, it probably means that your stdint.h is not
+ * correct for your platform.
+ */
+#ifndef PB_WITHOUT_64BIT
+PB_STATIC_ASSERT(sizeof(int64_t) == 2 * sizeof(int32_t), INT64_T_WRONG_SIZE)
+PB_STATIC_ASSERT(sizeof(uint64_t) == 2 * sizeof(uint32_t), UINT64_T_WRONG_SIZE)
+#endif
+
+/* This structure is used for 'bytes' arrays.
+ * It has the number of bytes at the beginning, followed by the byte array.
+ * Note that the actual structs used will have a different bytes array length.
+ */
+#define PB_BYTES_ARRAY_T(n) struct { pb_size_t size; pb_byte_t bytes[n]; }
+#define PB_BYTES_ARRAY_T_ALLOCSIZE(n) ((size_t)n + offsetof(pb_bytes_array_t, bytes))
+
+struct pb_bytes_array_s {
+ pb_size_t size;
+ pb_byte_t bytes[1];
+};
+typedef struct pb_bytes_array_s pb_bytes_array_t;
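+
+/* For example (editor's illustration), a generated struct member for a
+ * "bytes" field with max_size 16 is declared roughly as:
+ *
+ *   PB_BYTES_ARRAY_T(16) uuid;  // i.e. { pb_size_t size; pb_byte_t bytes[16]; }
+ */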
+
+/* This structure is used for giving the callback function.
+ * It is stored in the message structure and filled in by the method that
+ * calls pb_decode.
+ *
+ * The decoding callback will be given a limited-length stream.
+ * If the wire type was string, the length is the length of the string.
+ * If the wire type was a varint/fixed32/fixed64, the length is the length
+ * of the actual value.
+ * The function may be called multiple times (especially for repeated types,
+ * but also otherwise if the message happens to contain the field multiple
+ * times.)
+ *
+ * The encoding callback will receive the actual output stream.
+ * It should write all the data in one call, including the field tag and
+ * wire type. It can write multiple fields.
+ *
+ * The callback can be null if you want to skip a field.
+ */
+typedef struct pb_callback_s pb_callback_t;
+struct pb_callback_s {
+ /* Callback functions receive a pointer to the arg field.
+ * You can access the value of the field as *arg, and modify it if needed.
+ */
+ union {
+ bool (*decode)(pb_istream_t *stream, const pb_field_t *field, void **arg);
+ bool (*encode)(pb_ostream_t *stream, const pb_field_t *field, void * const *arg);
+ } funcs;
+
+ /* Free arg for use by callback */
+ void *arg;
+};
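+
+/* A minimal decode callback sketch (editor's example; the buffer size and
+ * the names read_bytes_cb, msg and buf are hypothetical, not part of the
+ * nanopb API):
+ *
+ *   static bool read_bytes_cb(pb_istream_t *stream, const pb_field_t *field,
+ *                             void **arg)
+ *   {
+ *       pb_byte_t *buf = (pb_byte_t*)*arg;
+ *       if (stream->bytes_left > 64)
+ *           return false;  // refuse oversized payloads
+ *       return pb_read(stream, buf, stream->bytes_left);
+ *   }
+ *
+ * It is attached by setting msg.field.funcs.decode = &read_bytes_cb and
+ * msg.field.arg = buf before calling pb_decode().
+ */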
+
+extern bool pb_default_field_callback(pb_istream_t *istream, pb_ostream_t *ostream, const pb_field_t *field);
+
+/* Wire types. Library user needs these only in encoder callbacks. */
+typedef enum {
+ PB_WT_VARINT = 0,
+ PB_WT_64BIT = 1,
+ PB_WT_STRING = 2,
+ PB_WT_32BIT = 5
+} pb_wire_type_t;
+
+/* Structure for defining the handling of unknown/extension fields.
+ * Usually the pb_extension_type_t structure is automatically generated,
+ * while the pb_extension_t structure is created by the user. However,
+ * if you want to catch all unknown fields, you can also create a custom
+ * pb_extension_type_t with your own callback.
+ */
+typedef struct pb_extension_type_s pb_extension_type_t;
+typedef struct pb_extension_s pb_extension_t;
+struct pb_extension_type_s {
+ /* Called for each unknown field in the message.
+ * If you handle the field, read off all of its data and return true.
+ * If you do not handle the field, do not read anything and return true.
+ * If you run into an error, return false.
+ * Set to NULL for default handler.
+ */
+ bool (*decode)(pb_istream_t *stream, pb_extension_t *extension,
+ uint32_t tag, pb_wire_type_t wire_type);
+
+ /* Called once after all regular fields have been encoded.
+ * If you have something to write, do so and return true.
+ * If you do not have anything to write, just return true.
+ * If you run into an error, return false.
+ * Set to NULL for default handler.
+ */
+ bool (*encode)(pb_ostream_t *stream, const pb_extension_t *extension);
+
+ /* Free field for use by the callback. */
+ const void *arg;
+};
+
+struct pb_extension_s {
+ /* Type describing the extension field. Usually you'll initialize
+ * this to a pointer to the automatically generated structure. */
+ const pb_extension_type_t *type;
+
+ /* Destination for the decoded data. This must match the datatype
+ * of the extension field. */
+ void *dest;
+
+ /* Pointer to the next extension handler, or NULL.
+ * If this extension does not match a field, the next handler is
+ * automatically called. */
+ pb_extension_t *next;
+
+ /* The decoder sets this to true if the extension was found.
+ * Ignored for encoding. */
+ bool found;
+};
+
+#define pb_extension_init_zero {NULL,NULL,NULL,false}
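+
+/* Usage sketch (editor's example; my_ext_type, my_value and msg are
+ * hypothetical names): decoding a message that has an extension range.
+ *
+ *   pb_extension_t ext = pb_extension_init_zero;
+ *   ext.type = &my_ext_type;   // generated pb_extension_type_t
+ *   ext.dest = &my_value;      // must match the extension field's datatype
+ *   msg.extensions = &ext;     // extension list head in the message struct
+ *   if (pb_decode(&stream, MyMessage_fields, &msg) && ext.found)
+ *       ... use my_value ...
+ */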
+
+/* Memory allocation functions to use. You can define pb_realloc and
+ * pb_free to custom functions if you want. */
+#ifdef PB_ENABLE_MALLOC
+# ifndef pb_realloc
+# define pb_realloc(ptr, size) realloc(ptr, size)
+# endif
+# ifndef pb_free
+# define pb_free(ptr) free(ptr)
+# endif
+#endif
+
+/* This is used to inform about need to regenerate .pb.h/.pb.c files. */
+#define PB_PROTO_HEADER_VERSION 40
+
+/* These macros are used to declare pb_field_t's in the constant array. */
+/* Size of a structure member, in bytes. */
+#define pb_membersize(st, m) (sizeof ((st*)0)->m)
+/* Number of entries in an array. */
+#define pb_arraysize(st, m) (pb_membersize(st, m) / pb_membersize(st, m[0]))
+/* Delta from start of one member to the start of another member. */
+#define pb_delta(st, m1, m2) ((int)offsetof(st, m1) - (int)offsetof(st, m2))
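+
+/* For instance (editor's illustration), given
+ *   struct foo { pb_size_t bar_count; int32_t bar[4]; };
+ * pb_membersize(foo, bar) is sizeof(int32_t[4]), pb_arraysize(foo, bar) is 4,
+ * and pb_delta(foo, bar, bar_count) is the distance used to locate the count
+ * field from the data field.
+ */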
+
+/* Force expansion of macro value */
+#define PB_EXPAND(x) x
+
+/* Binding of a message field set into a specific structure */
+#define PB_BIND(msgname, structname, width) \
+ const uint32_t structname ## _field_info[] PB_PROGMEM = \
+ { \
+ msgname ## _FIELDLIST(PB_GEN_FIELD_INFO_ ## width, structname) \
+ 0 \
+ }; \
+ const pb_msgdesc_t* const structname ## _submsg_info[] = \
+ { \
+ msgname ## _FIELDLIST(PB_GEN_SUBMSG_INFO, structname) \
+ NULL \
+ }; \
+ const pb_msgdesc_t structname ## _msg = \
+ { \
+ structname ## _field_info, \
+ structname ## _submsg_info, \
+ msgname ## _DEFAULT, \
+ msgname ## _CALLBACK, \
+ 0 msgname ## _FIELDLIST(PB_GEN_FIELD_COUNT, structname), \
+ 0 msgname ## _FIELDLIST(PB_GEN_REQ_FIELD_COUNT, structname), \
+ 0 msgname ## _FIELDLIST(PB_GEN_LARGEST_TAG, structname), \
+ }; \
+ msgname ## _FIELDLIST(PB_GEN_FIELD_INFO_ASSERT_ ## width, structname)
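+
+/* Generated .pb.c files invoke this macro once per message, e.g.
+ * (illustrative, using a message declared elsewhere in this patch):
+ *
+ *   PB_BIND(schema_ExitEvent, schema_ExitEvent, AUTO)
+ *
+ * which emits schema_ExitEvent_field_info[], schema_ExitEvent_submsg_info[]
+ * and the schema_ExitEvent_msg descriptor declared in the generated header.
+ */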
+
+#define PB_GEN_FIELD_COUNT(structname, atype, htype, ltype, fieldname, tag) +1
+#define PB_GEN_REQ_FIELD_COUNT(structname, atype, htype, ltype, fieldname, tag) \
+ + (PB_HTYPE_ ## htype == PB_HTYPE_REQUIRED)
+#define PB_GEN_LARGEST_TAG(structname, atype, htype, ltype, fieldname, tag) \
+ * 0 + tag
+
+/* X-macro for generating the entries in struct_field_info[] array. */
+#define PB_GEN_FIELD_INFO_1(structname, atype, htype, ltype, fieldname, tag) \
+ PB_FIELDINFO_1(tag, PB_ATYPE_ ## atype | PB_HTYPE_ ## htype | PB_LTYPE_MAP_ ## ltype, \
+ PB_DATA_OFFSET_ ## atype(_PB_HTYPE_ ## htype, structname, fieldname), \
+ PB_DATA_SIZE_ ## atype(_PB_HTYPE_ ## htype, structname, fieldname), \
+ PB_SIZE_OFFSET_ ## atype(_PB_HTYPE_ ## htype, structname, fieldname), \
+ PB_ARRAY_SIZE_ ## atype(_PB_HTYPE_ ## htype, structname, fieldname))
+
+#define PB_GEN_FIELD_INFO_2(structname, atype, htype, ltype, fieldname, tag) \
+ PB_FIELDINFO_2(tag, PB_ATYPE_ ## atype | PB_HTYPE_ ## htype | PB_LTYPE_MAP_ ## ltype, \
+ PB_DATA_OFFSET_ ## atype(_PB_HTYPE_ ## htype, structname, fieldname), \
+ PB_DATA_SIZE_ ## atype(_PB_HTYPE_ ## htype, structname, fieldname), \
+ PB_SIZE_OFFSET_ ## atype(_PB_HTYPE_ ## htype, structname, fieldname), \
+ PB_ARRAY_SIZE_ ## atype(_PB_HTYPE_ ## htype, structname, fieldname))
+
+#define PB_GEN_FIELD_INFO_4(structname, atype, htype, ltype, fieldname, tag) \
+ PB_FIELDINFO_4(tag, PB_ATYPE_ ## atype | PB_HTYPE_ ## htype | PB_LTYPE_MAP_ ## ltype, \
+ PB_DATA_OFFSET_ ## atype(_PB_HTYPE_ ## htype, structname, fieldname), \
+ PB_DATA_SIZE_ ## atype(_PB_HTYPE_ ## htype, structname, fieldname), \
+ PB_SIZE_OFFSET_ ## atype(_PB_HTYPE_ ## htype, structname, fieldname), \
+ PB_ARRAY_SIZE_ ## atype(_PB_HTYPE_ ## htype, structname, fieldname))
+
+#define PB_GEN_FIELD_INFO_8(structname, atype, htype, ltype, fieldname, tag) \
+ PB_FIELDINFO_8(tag, PB_ATYPE_ ## atype | PB_HTYPE_ ## htype | PB_LTYPE_MAP_ ## ltype, \
+ PB_DATA_OFFSET_ ## atype(_PB_HTYPE_ ## htype, structname, fieldname), \
+ PB_DATA_SIZE_ ## atype(_PB_HTYPE_ ## htype, structname, fieldname), \
+ PB_SIZE_OFFSET_ ## atype(_PB_HTYPE_ ## htype, structname, fieldname), \
+ PB_ARRAY_SIZE_ ## atype(_PB_HTYPE_ ## htype, structname, fieldname))
+
+#define PB_GEN_FIELD_INFO_AUTO(structname, atype, htype, ltype, fieldname, tag) \
+ PB_FIELDINFO_AUTO2(PB_FIELDINFO_WIDTH_AUTO(_PB_ATYPE_ ## atype, _PB_HTYPE_ ## htype, _PB_LTYPE_ ## ltype), \
+ tag, PB_ATYPE_ ## atype | PB_HTYPE_ ## htype | PB_LTYPE_MAP_ ## ltype, \
+ PB_DATA_OFFSET_ ## atype(_PB_HTYPE_ ## htype, structname, fieldname), \
+ PB_DATA_SIZE_ ## atype(_PB_HTYPE_ ## htype, structname, fieldname), \
+ PB_SIZE_OFFSET_ ## atype(_PB_HTYPE_ ## htype, structname, fieldname), \
+ PB_ARRAY_SIZE_ ## atype(_PB_HTYPE_ ## htype, structname, fieldname))
+
+#define PB_FIELDINFO_AUTO2(width, tag, type, data_offset, data_size, size_offset, array_size) \
+ PB_FIELDINFO_AUTO3(width, tag, type, data_offset, data_size, size_offset, array_size)
+
+#define PB_FIELDINFO_AUTO3(width, tag, type, data_offset, data_size, size_offset, array_size) \
+ PB_FIELDINFO_ ## width(tag, type, data_offset, data_size, size_offset, array_size)
+
+/* X-macro for generating asserts that entries fit in struct_field_info[] array.
+ * The structure of macros here must match the structure above in PB_GEN_FIELD_INFO_x(),
+ * but it is not easily reused because of how macro substitutions work. */
+#define PB_GEN_FIELD_INFO_ASSERT_1(structname, atype, htype, ltype, fieldname, tag) \
+ PB_FIELDINFO_ASSERT_1(tag, PB_ATYPE_ ## atype | PB_HTYPE_ ## htype | PB_LTYPE_MAP_ ## ltype, \
+ PB_DATA_OFFSET_ ## atype(_PB_HTYPE_ ## htype, structname, fieldname), \
+ PB_DATA_SIZE_ ## atype(_PB_HTYPE_ ## htype, structname, fieldname), \
+ PB_SIZE_OFFSET_ ## atype(_PB_HTYPE_ ## htype, structname, fieldname), \
+ PB_ARRAY_SIZE_ ## atype(_PB_HTYPE_ ## htype, structname, fieldname))
+
+#define PB_GEN_FIELD_INFO_ASSERT_2(structname, atype, htype, ltype, fieldname, tag) \
+ PB_FIELDINFO_ASSERT_2(tag, PB_ATYPE_ ## atype | PB_HTYPE_ ## htype | PB_LTYPE_MAP_ ## ltype, \
+ PB_DATA_OFFSET_ ## atype(_PB_HTYPE_ ## htype, structname, fieldname), \
+ PB_DATA_SIZE_ ## atype(_PB_HTYPE_ ## htype, structname, fieldname), \
+ PB_SIZE_OFFSET_ ## atype(_PB_HTYPE_ ## htype, structname, fieldname), \
+ PB_ARRAY_SIZE_ ## atype(_PB_HTYPE_ ## htype, structname, fieldname))
+
+#define PB_GEN_FIELD_INFO_ASSERT_4(structname, atype, htype, ltype, fieldname, tag) \
+ PB_FIELDINFO_ASSERT_4(tag, PB_ATYPE_ ## atype | PB_HTYPE_ ## htype | PB_LTYPE_MAP_ ## ltype, \
+ PB_DATA_OFFSET_ ## atype(_PB_HTYPE_ ## htype, structname, fieldname), \
+ PB_DATA_SIZE_ ## atype(_PB_HTYPE_ ## htype, structname, fieldname), \
+ PB_SIZE_OFFSET_ ## atype(_PB_HTYPE_ ## htype, structname, fieldname), \
+ PB_ARRAY_SIZE_ ## atype(_PB_HTYPE_ ## htype, structname, fieldname))
+
+#define PB_GEN_FIELD_INFO_ASSERT_8(structname, atype, htype, ltype, fieldname, tag) \
+ PB_FIELDINFO_ASSERT_8(tag, PB_ATYPE_ ## atype | PB_HTYPE_ ## htype | PB_LTYPE_MAP_ ## ltype, \
+ PB_DATA_OFFSET_ ## atype(_PB_HTYPE_ ## htype, structname, fieldname), \
+ PB_DATA_SIZE_ ## atype(_PB_HTYPE_ ## htype, structname, fieldname), \
+ PB_SIZE_OFFSET_ ## atype(_PB_HTYPE_ ## htype, structname, fieldname), \
+ PB_ARRAY_SIZE_ ## atype(_PB_HTYPE_ ## htype, structname, fieldname))
+
+#define PB_GEN_FIELD_INFO_ASSERT_AUTO(structname, atype, htype, ltype, fieldname, tag) \
+ PB_FIELDINFO_ASSERT_AUTO2(PB_FIELDINFO_WIDTH_AUTO(_PB_ATYPE_ ## atype, _PB_HTYPE_ ## htype, _PB_LTYPE_ ## ltype), \
+ tag, PB_ATYPE_ ## atype | PB_HTYPE_ ## htype | PB_LTYPE_MAP_ ## ltype, \
+ PB_DATA_OFFSET_ ## atype(_PB_HTYPE_ ## htype, structname, fieldname), \
+ PB_DATA_SIZE_ ## atype(_PB_HTYPE_ ## htype, structname, fieldname), \
+ PB_SIZE_OFFSET_ ## atype(_PB_HTYPE_ ## htype, structname, fieldname), \
+ PB_ARRAY_SIZE_ ## atype(_PB_HTYPE_ ## htype, structname, fieldname))
+
+#define PB_FIELDINFO_ASSERT_AUTO2(width, tag, type, data_offset, data_size, size_offset, array_size) \
+ PB_FIELDINFO_ASSERT_AUTO3(width, tag, type, data_offset, data_size, size_offset, array_size)
+
+#define PB_FIELDINFO_ASSERT_AUTO3(width, tag, type, data_offset, data_size, size_offset, array_size) \
+ PB_FIELDINFO_ASSERT_ ## width(tag, type, data_offset, data_size, size_offset, array_size)
+
+#define PB_DATA_OFFSET_STATIC(htype, structname, fieldname) PB_DO ## htype(structname, fieldname)
+#define PB_DATA_OFFSET_POINTER(htype, structname, fieldname) PB_DO ## htype(structname, fieldname)
+#define PB_DATA_OFFSET_CALLBACK(htype, structname, fieldname) PB_DO ## htype(structname, fieldname)
+#define PB_DO_PB_HTYPE_REQUIRED(structname, fieldname) offsetof(structname, fieldname)
+#define PB_DO_PB_HTYPE_SINGULAR(structname, fieldname) offsetof(structname, fieldname)
+#define PB_DO_PB_HTYPE_ONEOF(structname, fieldname) offsetof(structname, PB_ONEOF_NAME(FULL, fieldname))
+#define PB_DO_PB_HTYPE_OPTIONAL(structname, fieldname) offsetof(structname, fieldname)
+#define PB_DO_PB_HTYPE_REPEATED(structname, fieldname) offsetof(structname, fieldname)
+#define PB_DO_PB_HTYPE_FIXARRAY(structname, fieldname) offsetof(structname, fieldname)
+
+#define PB_SIZE_OFFSET_STATIC(htype, structname, fieldname) PB_SO ## htype(structname, fieldname)
+#define PB_SIZE_OFFSET_POINTER(htype, structname, fieldname) PB_SO_PTR ## htype(structname, fieldname)
+#define PB_SIZE_OFFSET_CALLBACK(htype, structname, fieldname) PB_SO_CB ## htype(structname, fieldname)
+#define PB_SO_PB_HTYPE_REQUIRED(structname, fieldname) 0
+#define PB_SO_PB_HTYPE_SINGULAR(structname, fieldname) 0
+#define PB_SO_PB_HTYPE_ONEOF(structname, fieldname) PB_SO_PB_HTYPE_ONEOF2(structname, PB_ONEOF_NAME(FULL, fieldname), PB_ONEOF_NAME(UNION, fieldname))
+#define PB_SO_PB_HTYPE_ONEOF2(structname, fullname, unionname) PB_SO_PB_HTYPE_ONEOF3(structname, fullname, unionname)
+#define PB_SO_PB_HTYPE_ONEOF3(structname, fullname, unionname) pb_delta(structname, fullname, which_ ## unionname)
+#define PB_SO_PB_HTYPE_OPTIONAL(structname, fieldname) pb_delta(structname, fieldname, has_ ## fieldname)
+#define PB_SO_PB_HTYPE_REPEATED(structname, fieldname) pb_delta(structname, fieldname, fieldname ## _count)
+#define PB_SO_PB_HTYPE_FIXARRAY(structname, fieldname) 0
+#define PB_SO_PTR_PB_HTYPE_REQUIRED(structname, fieldname) 0
+#define PB_SO_PTR_PB_HTYPE_SINGULAR(structname, fieldname) 0
+#define PB_SO_PTR_PB_HTYPE_ONEOF(structname, fieldname) PB_SO_PB_HTYPE_ONEOF(structname, fieldname)
+#define PB_SO_PTR_PB_HTYPE_OPTIONAL(structname, fieldname) 0
+#define PB_SO_PTR_PB_HTYPE_REPEATED(structname, fieldname) PB_SO_PB_HTYPE_REPEATED(structname, fieldname)
+#define PB_SO_PTR_PB_HTYPE_FIXARRAY(structname, fieldname) 0
+#define PB_SO_CB_PB_HTYPE_REQUIRED(structname, fieldname) 0
+#define PB_SO_CB_PB_HTYPE_SINGULAR(structname, fieldname) 0
+#define PB_SO_CB_PB_HTYPE_ONEOF(structname, fieldname) PB_SO_PB_HTYPE_ONEOF(structname, fieldname)
+#define PB_SO_CB_PB_HTYPE_OPTIONAL(structname, fieldname) 0
+#define PB_SO_CB_PB_HTYPE_REPEATED(structname, fieldname) 0
+#define PB_SO_CB_PB_HTYPE_FIXARRAY(structname, fieldname) 0
+
+#define PB_ARRAY_SIZE_STATIC(htype, structname, fieldname) PB_AS ## htype(structname, fieldname)
+#define PB_ARRAY_SIZE_POINTER(htype, structname, fieldname) PB_AS_PTR ## htype(structname, fieldname)
+#define PB_ARRAY_SIZE_CALLBACK(htype, structname, fieldname) 1
+#define PB_AS_PB_HTYPE_REQUIRED(structname, fieldname) 1
+#define PB_AS_PB_HTYPE_SINGULAR(structname, fieldname) 1
+#define PB_AS_PB_HTYPE_OPTIONAL(structname, fieldname) 1
+#define PB_AS_PB_HTYPE_ONEOF(structname, fieldname) 1
+#define PB_AS_PB_HTYPE_REPEATED(structname, fieldname) pb_arraysize(structname, fieldname)
+#define PB_AS_PB_HTYPE_FIXARRAY(structname, fieldname) pb_arraysize(structname, fieldname)
+#define PB_AS_PTR_PB_HTYPE_REQUIRED(structname, fieldname) 1
+#define PB_AS_PTR_PB_HTYPE_SINGULAR(structname, fieldname) 1
+#define PB_AS_PTR_PB_HTYPE_OPTIONAL(structname, fieldname) 1
+#define PB_AS_PTR_PB_HTYPE_ONEOF(structname, fieldname) 1
+#define PB_AS_PTR_PB_HTYPE_REPEATED(structname, fieldname) 1
+#define PB_AS_PTR_PB_HTYPE_FIXARRAY(structname, fieldname) pb_arraysize(structname, fieldname[0])
+
+#define PB_DATA_SIZE_STATIC(htype, structname, fieldname) PB_DS ## htype(structname, fieldname)
+#define PB_DATA_SIZE_POINTER(htype, structname, fieldname) PB_DS_PTR ## htype(structname, fieldname)
+#define PB_DATA_SIZE_CALLBACK(htype, structname, fieldname) PB_DS_CB ## htype(structname, fieldname)
+#define PB_DS_PB_HTYPE_REQUIRED(structname, fieldname) pb_membersize(structname, fieldname)
+#define PB_DS_PB_HTYPE_SINGULAR(structname, fieldname) pb_membersize(structname, fieldname)
+#define PB_DS_PB_HTYPE_OPTIONAL(structname, fieldname) pb_membersize(structname, fieldname)
+#define PB_DS_PB_HTYPE_ONEOF(structname, fieldname) pb_membersize(structname, PB_ONEOF_NAME(FULL, fieldname))
+#define PB_DS_PB_HTYPE_REPEATED(structname, fieldname) pb_membersize(structname, fieldname[0])
+#define PB_DS_PB_HTYPE_FIXARRAY(structname, fieldname) pb_membersize(structname, fieldname[0])
+#define PB_DS_PTR_PB_HTYPE_REQUIRED(structname, fieldname) pb_membersize(structname, fieldname[0])
+#define PB_DS_PTR_PB_HTYPE_SINGULAR(structname, fieldname) pb_membersize(structname, fieldname[0])
+#define PB_DS_PTR_PB_HTYPE_OPTIONAL(structname, fieldname) pb_membersize(structname, fieldname[0])
+#define PB_DS_PTR_PB_HTYPE_ONEOF(structname, fieldname) pb_membersize(structname, PB_ONEOF_NAME(FULL, fieldname)[0])
+#define PB_DS_PTR_PB_HTYPE_REPEATED(structname, fieldname) pb_membersize(structname, fieldname[0])
+#define PB_DS_PTR_PB_HTYPE_FIXARRAY(structname, fieldname) pb_membersize(structname, fieldname[0][0])
+#define PB_DS_CB_PB_HTYPE_REQUIRED(structname, fieldname) pb_membersize(structname, fieldname)
+#define PB_DS_CB_PB_HTYPE_SINGULAR(structname, fieldname) pb_membersize(structname, fieldname)
+#define PB_DS_CB_PB_HTYPE_OPTIONAL(structname, fieldname) pb_membersize(structname, fieldname)
+#define PB_DS_CB_PB_HTYPE_ONEOF(structname, fieldname) pb_membersize(structname, PB_ONEOF_NAME(FULL, fieldname))
+#define PB_DS_CB_PB_HTYPE_REPEATED(structname, fieldname) pb_membersize(structname, fieldname)
+#define PB_DS_CB_PB_HTYPE_FIXARRAY(structname, fieldname) pb_membersize(structname, fieldname)
+
+#define PB_ONEOF_NAME(type, tuple) PB_EXPAND(PB_ONEOF_NAME_ ## type tuple)
+#define PB_ONEOF_NAME_UNION(unionname,membername,fullname) unionname
+#define PB_ONEOF_NAME_MEMBER(unionname,membername,fullname) membername
+#define PB_ONEOF_NAME_FULL(unionname,membername,fullname) fullname
+
+#define PB_GEN_SUBMSG_INFO(structname, atype, htype, ltype, fieldname, tag) \
+ PB_SUBMSG_INFO_ ## htype(_PB_LTYPE_ ## ltype, structname, fieldname)
+
+#define PB_SUBMSG_INFO_REQUIRED(ltype, structname, fieldname) PB_SI ## ltype(structname ## _ ## fieldname ## _MSGTYPE)
+#define PB_SUBMSG_INFO_SINGULAR(ltype, structname, fieldname) PB_SI ## ltype(structname ## _ ## fieldname ## _MSGTYPE)
+#define PB_SUBMSG_INFO_OPTIONAL(ltype, structname, fieldname) PB_SI ## ltype(structname ## _ ## fieldname ## _MSGTYPE)
+#define PB_SUBMSG_INFO_ONEOF(ltype, structname, fieldname) PB_SUBMSG_INFO_ONEOF2(ltype, structname, PB_ONEOF_NAME(UNION, fieldname), PB_ONEOF_NAME(MEMBER, fieldname))
+#define PB_SUBMSG_INFO_ONEOF2(ltype, structname, unionname, membername) PB_SUBMSG_INFO_ONEOF3(ltype, structname, unionname, membername)
+#define PB_SUBMSG_INFO_ONEOF3(ltype, structname, unionname, membername) PB_SI ## ltype(structname ## _ ## unionname ## _ ## membername ## _MSGTYPE)
+#define PB_SUBMSG_INFO_REPEATED(ltype, structname, fieldname) PB_SI ## ltype(structname ## _ ## fieldname ## _MSGTYPE)
+#define PB_SUBMSG_INFO_FIXARRAY(ltype, structname, fieldname) PB_SI ## ltype(structname ## _ ## fieldname ## _MSGTYPE)
+#define PB_SI_PB_LTYPE_BOOL(t)
+#define PB_SI_PB_LTYPE_BYTES(t)
+#define PB_SI_PB_LTYPE_DOUBLE(t)
+#define PB_SI_PB_LTYPE_ENUM(t)
+#define PB_SI_PB_LTYPE_UENUM(t)
+#define PB_SI_PB_LTYPE_FIXED32(t)
+#define PB_SI_PB_LTYPE_FIXED64(t)
+#define PB_SI_PB_LTYPE_FLOAT(t)
+#define PB_SI_PB_LTYPE_INT32(t)
+#define PB_SI_PB_LTYPE_INT64(t)
+#define PB_SI_PB_LTYPE_MESSAGE(t) PB_SUBMSG_DESCRIPTOR(t)
+#define PB_SI_PB_LTYPE_MSG_W_CB(t) PB_SUBMSG_DESCRIPTOR(t)
+#define PB_SI_PB_LTYPE_SFIXED32(t)
+#define PB_SI_PB_LTYPE_SFIXED64(t)
+#define PB_SI_PB_LTYPE_SINT32(t)
+#define PB_SI_PB_LTYPE_SINT64(t)
+#define PB_SI_PB_LTYPE_STRING(t)
+#define PB_SI_PB_LTYPE_UINT32(t)
+#define PB_SI_PB_LTYPE_UINT64(t)
+#define PB_SI_PB_LTYPE_EXTENSION(t)
+#define PB_SI_PB_LTYPE_FIXED_LENGTH_BYTES(t)
+#define PB_SUBMSG_DESCRIPTOR(t) &(t ## _msg),
+
+/* The field descriptors use a variable width format, with width of either
+ * 1, 2, 4 or 8 32-bit words. The two lowest bits of the first word encode
+ * the descriptor size, the next 6 bits the lowest bits of the field tag
+ * number, and the next 8 bits the field type.
+ *
+ * Descriptor size is encoded as 0 = 1 word, 1 = 2 words, 2 = 4 words, 3 = 8 words.
+ *
+ * Formats, listed starting with the least significant bit of the first word.
+ * 1 word: [2-bit len] [6-bit tag] [8-bit type] [8-bit data_offset] [4-bit size_offset] [4-bit data_size]
+ *
+ * 2 words: [2-bit len] [6-bit tag] [8-bit type] [12-bit array_size] [4-bit size_offset]
+ * [16-bit data_offset] [12-bit data_size] [4-bit tag>>6]
+ *
+ * 4 words: [2-bit len] [6-bit tag] [8-bit type] [16-bit array_size]
+ * [8-bit size_offset] [24-bit tag>>6]
+ * [32-bit data_offset]
+ * [32-bit data_size]
+ *
+ * 8 words: [2-bit len] [6-bit tag] [8-bit type] [16-bit reserved]
+ * [8-bit size_offset] [24-bit tag>>6]
+ * [32-bit data_offset]
+ * [32-bit data_size]
+ * [32-bit array_size]
+ * [32-bit reserved]
+ * [32-bit reserved]
+ * [32-bit reserved]
+ */
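+
+/* Worked example (editor's note): PB_FIELDINFO_1(2, type, 8, 4, 0, 1) below
+ * packs len=0 (1 word) in bits 0..1, tag=2 in bits 2..7, the type in bits
+ * 8..15, data_offset=8 in bits 16..23, size_offset=0 in bits 24..27 and
+ * data_size=4 in bits 28..31; load_descriptor_values() in pb_common.c
+ * decodes exactly this layout.
+ */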
+
+#define PB_FIELDINFO_1(tag, type, data_offset, data_size, size_offset, array_size) \
+ (0 | (((tag) << 2) & 0xFF) | ((type) << 8) | (((uint32_t)(data_offset) & 0xFF) << 16) | \
+ (((uint32_t)(size_offset) & 0x0F) << 24) | (((uint32_t)(data_size) & 0x0F) << 28)),
+
+#define PB_FIELDINFO_2(tag, type, data_offset, data_size, size_offset, array_size) \
+ (1 | (((tag) << 2) & 0xFF) | ((type) << 8) | (((uint32_t)(array_size) & 0xFFF) << 16) | (((uint32_t)(size_offset) & 0x0F) << 28)), \
+ (((uint32_t)(data_offset) & 0xFFFF) | (((uint32_t)(data_size) & 0xFFF) << 16) | (((uint32_t)(tag) & 0x3c0) << 22)),
+
+#define PB_FIELDINFO_4(tag, type, data_offset, data_size, size_offset, array_size) \
+ (2 | (((tag) << 2) & 0xFF) | ((type) << 8) | (((uint32_t)(array_size) & 0xFFFF) << 16)), \
+ ((uint32_t)(int_least8_t)(size_offset) | (((uint32_t)(tag) << 2) & 0xFFFFFF00)), \
+ (data_offset), (data_size),
+
+#define PB_FIELDINFO_8(tag, type, data_offset, data_size, size_offset, array_size) \
+ (3 | (((tag) << 2) & 0xFF) | ((type) << 8)), \
+ ((uint32_t)(int_least8_t)(size_offset) | (((uint32_t)(tag) << 2) & 0xFFFFFF00)), \
+ (data_offset), (data_size), (array_size), 0, 0, 0,
+
+/* These assertions verify that the field information fits in the allocated space.
+ * The generator tries to automatically determine the correct width that can fit all
+ * data associated with a message. These asserts will fail only if there has been a
+ * problem in the automatic logic - this may be worth reporting as a bug. As a workaround,
+ * you can increase the descriptor width by defining PB_FIELDINFO_WIDTH or by setting
+ * descriptorsize option in .options file.
+ */
+#define PB_FITS(value,bits) ((uint32_t)(value) < ((uint32_t)1<<bits))
+#define PB_FIELDINFO_ASSERT_1(tag, type, data_offset, data_size, size_offset, array_size) \
+ PB_STATIC_ASSERT(PB_FITS(tag,6) && PB_FITS(data_offset,8) && PB_FITS(size_offset,4) && PB_FITS(data_size,4) && PB_FITS(array_size,1), FIELDINFO_DOES_NOT_FIT_width1_field ## tag)
+
+#define PB_FIELDINFO_ASSERT_2(tag, type, data_offset, data_size, size_offset, array_size) \
+ PB_STATIC_ASSERT(PB_FITS(tag,10) && PB_FITS(data_offset,16) && PB_FITS(size_offset,4) && PB_FITS(data_size,12) && PB_FITS(array_size,12), FIELDINFO_DOES_NOT_FIT_width2_field ## tag)
+
+#ifndef PB_FIELD_32BIT
+/* Maximum field sizes are still 16-bit if pb_size_t is 16-bit */
+#define PB_FIELDINFO_ASSERT_4(tag, type, data_offset, data_size, size_offset, array_size) \
+ PB_STATIC_ASSERT(PB_FITS(tag,16) && PB_FITS(data_offset,16) && PB_FITS((int_least8_t)size_offset,8) && PB_FITS(data_size,16) && PB_FITS(array_size,16), FIELDINFO_DOES_NOT_FIT_width4_field ## tag)
+
+#define PB_FIELDINFO_ASSERT_8(tag, type, data_offset, data_size, size_offset, array_size) \
+ PB_STATIC_ASSERT(PB_FITS(tag,16) && PB_FITS(data_offset,16) && PB_FITS((int_least8_t)size_offset,8) && PB_FITS(data_size,16) && PB_FITS(array_size,16), FIELDINFO_DOES_NOT_FIT_width8_field ## tag)
+#else
+/* Up to 32-bit fields supported.
+ * Note that the checks are against 31 bits to avoid compiler warnings about shift wider than type in the test.
+ * I expect that there is no reasonable use for >2GB messages with nanopb anyway.
+ */
+#define PB_FIELDINFO_ASSERT_4(tag, type, data_offset, data_size, size_offset, array_size) \
+ PB_STATIC_ASSERT(PB_FITS(tag,30) && PB_FITS(data_offset,31) && PB_FITS(size_offset,8) && PB_FITS(data_size,31) && PB_FITS(array_size,16), FIELDINFO_DOES_NOT_FIT_width4_field ## tag)
+
+#define PB_FIELDINFO_ASSERT_8(tag, type, data_offset, data_size, size_offset, array_size) \
+ PB_STATIC_ASSERT(PB_FITS(tag,30) && PB_FITS(data_offset,31) && PB_FITS(size_offset,8) && PB_FITS(data_size,31) && PB_FITS(array_size,31), FIELDINFO_DOES_NOT_FIT_width8_field ## tag)
+#endif
+
+
+/* Automatic picking of FIELDINFO width:
+ * Uses width 1 when possible, otherwise resorts to width 2.
+ * This is used when PB_BIND() is called with "AUTO" as the argument.
+ * The generator will give explicit size argument when it knows that a message
+ * structure grows beyond 1-word format limits.
+ */
+#define PB_FIELDINFO_WIDTH_AUTO(atype, htype, ltype) PB_FI_WIDTH ## atype(htype, ltype)
+#define PB_FI_WIDTH_PB_ATYPE_STATIC(htype, ltype) PB_FI_WIDTH ## htype(ltype)
+#define PB_FI_WIDTH_PB_ATYPE_POINTER(htype, ltype) PB_FI_WIDTH ## htype(ltype)
+#define PB_FI_WIDTH_PB_ATYPE_CALLBACK(htype, ltype) 2
+#define PB_FI_WIDTH_PB_HTYPE_REQUIRED(ltype) PB_FI_WIDTH ## ltype
+#define PB_FI_WIDTH_PB_HTYPE_SINGULAR(ltype) PB_FI_WIDTH ## ltype
+#define PB_FI_WIDTH_PB_HTYPE_OPTIONAL(ltype) PB_FI_WIDTH ## ltype
+#define PB_FI_WIDTH_PB_HTYPE_ONEOF(ltype) PB_FI_WIDTH ## ltype
+#define PB_FI_WIDTH_PB_HTYPE_REPEATED(ltype) 2
+#define PB_FI_WIDTH_PB_HTYPE_FIXARRAY(ltype) 2
+#define PB_FI_WIDTH_PB_LTYPE_BOOL 1
+#define PB_FI_WIDTH_PB_LTYPE_BYTES 2
+#define PB_FI_WIDTH_PB_LTYPE_DOUBLE 1
+#define PB_FI_WIDTH_PB_LTYPE_ENUM 1
+#define PB_FI_WIDTH_PB_LTYPE_UENUM 1
+#define PB_FI_WIDTH_PB_LTYPE_FIXED32 1
+#define PB_FI_WIDTH_PB_LTYPE_FIXED64 1
+#define PB_FI_WIDTH_PB_LTYPE_FLOAT 1
+#define PB_FI_WIDTH_PB_LTYPE_INT32 1
+#define PB_FI_WIDTH_PB_LTYPE_INT64 1
+#define PB_FI_WIDTH_PB_LTYPE_MESSAGE 2
+#define PB_FI_WIDTH_PB_LTYPE_MSG_W_CB 2
+#define PB_FI_WIDTH_PB_LTYPE_SFIXED32 1
+#define PB_FI_WIDTH_PB_LTYPE_SFIXED64 1
+#define PB_FI_WIDTH_PB_LTYPE_SINT32 1
+#define PB_FI_WIDTH_PB_LTYPE_SINT64 1
+#define PB_FI_WIDTH_PB_LTYPE_STRING 2
+#define PB_FI_WIDTH_PB_LTYPE_UINT32 1
+#define PB_FI_WIDTH_PB_LTYPE_UINT64 1
+#define PB_FI_WIDTH_PB_LTYPE_EXTENSION 1
+#define PB_FI_WIDTH_PB_LTYPE_FIXED_LENGTH_BYTES 2
+
+/* The mapping from protobuf types to LTYPEs is done using these macros. */
+#define PB_LTYPE_MAP_BOOL PB_LTYPE_BOOL
+#define PB_LTYPE_MAP_BYTES PB_LTYPE_BYTES
+#define PB_LTYPE_MAP_DOUBLE PB_LTYPE_FIXED64
+#define PB_LTYPE_MAP_ENUM PB_LTYPE_VARINT
+#define PB_LTYPE_MAP_UENUM PB_LTYPE_UVARINT
+#define PB_LTYPE_MAP_FIXED32 PB_LTYPE_FIXED32
+#define PB_LTYPE_MAP_FIXED64 PB_LTYPE_FIXED64
+#define PB_LTYPE_MAP_FLOAT PB_LTYPE_FIXED32
+#define PB_LTYPE_MAP_INT32 PB_LTYPE_VARINT
+#define PB_LTYPE_MAP_INT64 PB_LTYPE_VARINT
+#define PB_LTYPE_MAP_MESSAGE PB_LTYPE_SUBMESSAGE
+#define PB_LTYPE_MAP_MSG_W_CB PB_LTYPE_SUBMSG_W_CB
+#define PB_LTYPE_MAP_SFIXED32 PB_LTYPE_FIXED32
+#define PB_LTYPE_MAP_SFIXED64 PB_LTYPE_FIXED64
+#define PB_LTYPE_MAP_SINT32 PB_LTYPE_SVARINT
+#define PB_LTYPE_MAP_SINT64 PB_LTYPE_SVARINT
+#define PB_LTYPE_MAP_STRING PB_LTYPE_STRING
+#define PB_LTYPE_MAP_UINT32 PB_LTYPE_UVARINT
+#define PB_LTYPE_MAP_UINT64 PB_LTYPE_UVARINT
+#define PB_LTYPE_MAP_EXTENSION PB_LTYPE_EXTENSION
+#define PB_LTYPE_MAP_FIXED_LENGTH_BYTES PB_LTYPE_FIXED_LENGTH_BYTES
+
+/* These macros are used for giving out error messages.
+ * They are mostly a debugging aid; the main error information
+ * is the true/false return value from functions.
+ * Some code space can be saved by disabling the error
+ * messages if not used.
+ *
+ * PB_SET_ERROR() sets the error message if none has been set yet.
+ * msg must be a constant string literal.
+ * PB_GET_ERROR() always returns a pointer to a string.
+ * PB_RETURN_ERROR() sets the error and returns false from current
+ * function.
+ */
+#ifdef PB_NO_ERRMSG
+#define PB_SET_ERROR(stream, msg) PB_UNUSED(stream)
+#define PB_GET_ERROR(stream) "(errmsg disabled)"
+#else
+#define PB_SET_ERROR(stream, msg) (stream->errmsg = (stream)->errmsg ? (stream)->errmsg : (msg))
+#define PB_GET_ERROR(stream) ((stream)->errmsg ? (stream)->errmsg : "(none)")
+#endif
+
+#define PB_RETURN_ERROR(stream, msg) return PB_SET_ERROR(stream, msg), false
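+
+/* Typical usage inside a decoder helper (editor's illustration):
+ *
+ *   if (stream->bytes_left < size)
+ *       PB_RETURN_ERROR(stream, "end-of-stream");
+ */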
+
+#ifdef __cplusplus
+} /* extern "C" */
+#endif
+
+#ifdef __cplusplus
+#if __cplusplus >= 201103L
+#define PB_CONSTEXPR constexpr
+#else // __cplusplus >= 201103L
+#define PB_CONSTEXPR
+#endif // __cplusplus >= 201103L
+
+#if __cplusplus >= 201703L
+#define PB_INLINE_CONSTEXPR inline constexpr
+#else // __cplusplus >= 201703L
+#define PB_INLINE_CONSTEXPR PB_CONSTEXPR
+#endif // __cplusplus >= 201703L
+
+namespace nanopb {
+// Each type will be partially specialized by the generator.
+template <typename GenMessageT> struct MessageDescriptor;
+} // namespace nanopb
+#endif /* __cplusplus */
+
+#endif
+
diff --git a/security/container/protos/nanopb/pb_common.c b/security/container/protos/nanopb/pb_common.c
new file mode 100644
index 0000000..6aee76b
--- /dev/null
+++ b/security/container/protos/nanopb/pb_common.c
@@ -0,0 +1,388 @@
+/* pb_common.c: Common support functions for pb_encode.c and pb_decode.c.
+ *
+ * 2014 Petteri Aimonen <jpa@kapsi.fi>
+ */
+
+#include "pb_common.h"
+
+static bool load_descriptor_values(pb_field_iter_t *iter)
+{
+ uint32_t word0;
+ uint32_t data_offset;
+ int_least8_t size_offset;
+
+ if (iter->index >= iter->descriptor->field_count)
+ return false;
+
+ word0 = PB_PROGMEM_READU32(iter->descriptor->field_info[iter->field_info_index]);
+ iter->type = (pb_type_t)((word0 >> 8) & 0xFF);
+
+ switch(word0 & 3)
+ {
+ case 0: {
+ /* 1-word format */
+ iter->array_size = 1;
+ iter->tag = (pb_size_t)((word0 >> 2) & 0x3F);
+ size_offset = (int_least8_t)((word0 >> 24) & 0x0F);
+ data_offset = (word0 >> 16) & 0xFF;
+ iter->data_size = (pb_size_t)((word0 >> 28) & 0x0F);
+ break;
+ }
+
+ case 1: {
+ /* 2-word format */
+ uint32_t word1 = PB_PROGMEM_READU32(iter->descriptor->field_info[iter->field_info_index + 1]);
+
+ iter->array_size = (pb_size_t)((word0 >> 16) & 0x0FFF);
+ iter->tag = (pb_size_t)(((word0 >> 2) & 0x3F) | ((word1 >> 28) << 6));
+ size_offset = (int_least8_t)((word0 >> 28) & 0x0F);
+ data_offset = word1 & 0xFFFF;
+ iter->data_size = (pb_size_t)((word1 >> 16) & 0x0FFF);
+ break;
+ }
+
+ case 2: {
+ /* 4-word format */
+ uint32_t word1 = PB_PROGMEM_READU32(iter->descriptor->field_info[iter->field_info_index + 1]);
+ uint32_t word2 = PB_PROGMEM_READU32(iter->descriptor->field_info[iter->field_info_index + 2]);
+ uint32_t word3 = PB_PROGMEM_READU32(iter->descriptor->field_info[iter->field_info_index + 3]);
+
+ iter->array_size = (pb_size_t)(word0 >> 16);
+ iter->tag = (pb_size_t)(((word0 >> 2) & 0x3F) | ((word1 >> 8) << 6));
+ size_offset = (int_least8_t)(word1 & 0xFF);
+ data_offset = word2;
+ iter->data_size = (pb_size_t)word3;
+ break;
+ }
+
+ default: {
+ /* 8-word format */
+ uint32_t word1 = PB_PROGMEM_READU32(iter->descriptor->field_info[iter->field_info_index + 1]);
+ uint32_t word2 = PB_PROGMEM_READU32(iter->descriptor->field_info[iter->field_info_index + 2]);
+ uint32_t word3 = PB_PROGMEM_READU32(iter->descriptor->field_info[iter->field_info_index + 3]);
+ uint32_t word4 = PB_PROGMEM_READU32(iter->descriptor->field_info[iter->field_info_index + 4]);
+
+ iter->array_size = (pb_size_t)word4;
+ iter->tag = (pb_size_t)(((word0 >> 2) & 0x3F) | ((word1 >> 8) << 6));
+ size_offset = (int_least8_t)(word1 & 0xFF);
+ data_offset = word2;
+ iter->data_size = (pb_size_t)word3;
+ break;
+ }
+ }
+
+ if (!iter->message)
+ {
+ /* Avoid doing arithmetic on null pointers; it is undefined behavior */
+ iter->pField = NULL;
+ iter->pSize = NULL;
+ }
+ else
+ {
+ iter->pField = (char*)iter->message + data_offset;
+
+ if (size_offset)
+ {
+ iter->pSize = (char*)iter->pField - size_offset;
+ }
+ else if (PB_HTYPE(iter->type) == PB_HTYPE_REPEATED &&
+ (PB_ATYPE(iter->type) == PB_ATYPE_STATIC ||
+ PB_ATYPE(iter->type) == PB_ATYPE_POINTER))
+ {
+ /* Fixed count array */
+ iter->pSize = &iter->array_size;
+ }
+ else
+ {
+ iter->pSize = NULL;
+ }
+
+ if (PB_ATYPE(iter->type) == PB_ATYPE_POINTER && iter->pField != NULL)
+ {
+ iter->pData = *(void**)iter->pField;
+ }
+ else
+ {
+ iter->pData = iter->pField;
+ }
+ }
+
+ if (PB_LTYPE_IS_SUBMSG(iter->type))
+ {
+ iter->submsg_desc = iter->descriptor->submsg_info[iter->submessage_index];
+ }
+ else
+ {
+ iter->submsg_desc = NULL;
+ }
+
+ return true;
+}
+
+static void advance_iterator(pb_field_iter_t *iter)
+{
+ iter->index++;
+
+ if (iter->index >= iter->descriptor->field_count)
+ {
+ /* Restart */
+ iter->index = 0;
+ iter->field_info_index = 0;
+ iter->submessage_index = 0;
+ iter->required_field_index = 0;
+ }
+ else
+ {
+ /* Increment indexes based on previous field type.
+ * All field info formats have the following fields:
+ * - lowest 2 bits give the number of words in the descriptor (2^n words)
+ * - bits 2..7 give the lowest bits of tag number.
+ * - bits 8..15 give the field type.
+ */
+ uint32_t prev_descriptor = PB_PROGMEM_READU32(iter->descriptor->field_info[iter->field_info_index]);
+ pb_type_t prev_type = (prev_descriptor >> 8) & 0xFF;
+ pb_size_t descriptor_len = (pb_size_t)(1 << (prev_descriptor & 3));
+
+ /* Advance the indexes.
+ * The cast to pb_size_t is needed to avoid a -Wconversion warning.
+ * Because the data consists of constants from the generator, there is no
+ * danger of overflow.
+ */
+ iter->field_info_index = (pb_size_t)(iter->field_info_index + descriptor_len);
+ iter->required_field_index = (pb_size_t)(iter->required_field_index + (PB_HTYPE(prev_type) == PB_HTYPE_REQUIRED));
+ iter->submessage_index = (pb_size_t)(iter->submessage_index + PB_LTYPE_IS_SUBMSG(prev_type));
+ }
+}
+
+bool pb_field_iter_begin(pb_field_iter_t *iter, const pb_msgdesc_t *desc, void *message)
+{
+ memset(iter, 0, sizeof(*iter));
+
+ iter->descriptor = desc;
+ iter->message = message;
+
+ return load_descriptor_values(iter);
+}
+
+bool pb_field_iter_begin_extension(pb_field_iter_t *iter, pb_extension_t *extension)
+{
+ const pb_msgdesc_t *msg = (const pb_msgdesc_t*)extension->type->arg;
+ bool status;
+
+ uint32_t word0 = PB_PROGMEM_READU32(msg->field_info[0]);
+ if (PB_ATYPE(word0 >> 8) == PB_ATYPE_POINTER)
+ {
+ /* For pointer extensions, the pointer is stored directly
+ * in the extension structure. This avoids having an extra
+ * indirection. */
+ status = pb_field_iter_begin(iter, msg, &extension->dest);
+ }
+ else
+ {
+ status = pb_field_iter_begin(iter, msg, extension->dest);
+ }
+
+ iter->pSize = &extension->found;
+ return status;
+}
+
+bool pb_field_iter_next(pb_field_iter_t *iter)
+{
+ advance_iterator(iter);
+ (void)load_descriptor_values(iter);
+ return iter->index != 0;
+}
+
+bool pb_field_iter_find(pb_field_iter_t *iter, uint32_t tag)
+{
+ if (iter->tag == tag)
+ {
+ return true; /* Nothing to do, correct field already. */
+ }
+ else if (tag > iter->descriptor->largest_tag)
+ {
+ return false;
+ }
+ else
+ {
+ pb_size_t start = iter->index;
+ uint32_t fieldinfo;
+
+ if (tag < iter->tag)
+ {
+ /* Fields are in tag number order, so we know that tag is between
+ * 0 and our start position. Setting the index to the end forces the
+ * advance_iterator() call below to restart from the beginning. */
+ iter->index = iter->descriptor->field_count;
+ }
+
+ do
+ {
+ /* Advance iterator but don't load values yet */
+ advance_iterator(iter);
+
+ /* Do fast check for tag number match */
+ fieldinfo = PB_PROGMEM_READU32(iter->descriptor->field_info[iter->field_info_index]);
+
+ if (((fieldinfo >> 2) & 0x3F) == (tag & 0x3F))
+ {
+ /* Good candidate, check further */
+ (void)load_descriptor_values(iter);
+
+ if (iter->tag == tag &&
+ PB_LTYPE(iter->type) != PB_LTYPE_EXTENSION)
+ {
+ /* Found it */
+ return true;
+ }
+ }
+ } while (iter->index != start);
+
+ /* Searched all the way back to start, and found nothing. */
+ (void)load_descriptor_values(iter);
+ return false;
+ }
+}
+
+bool pb_field_iter_find_extension(pb_field_iter_t *iter)
+{
+ if (PB_LTYPE(iter->type) == PB_LTYPE_EXTENSION)
+ {
+ return true;
+ }
+ else
+ {
+ pb_size_t start = iter->index;
+ uint32_t fieldinfo;
+
+ do
+ {
+ /* Advance iterator but don't load values yet */
+ advance_iterator(iter);
+
+ /* Do fast check for field type */
+ fieldinfo = PB_PROGMEM_READU32(iter->descriptor->field_info[iter->field_info_index]);
+
+ if (PB_LTYPE((fieldinfo >> 8) & 0xFF) == PB_LTYPE_EXTENSION)
+ {
+ return load_descriptor_values(iter);
+ }
+ } while (iter->index != start);
+
+ /* Searched all the way back to start, and found nothing. */
+ (void)load_descriptor_values(iter);
+ return false;
+ }
+}
+
+static void *pb_const_cast(const void *p)
+{
+ /* Note: this casts away const, in order to use the common field iterator
+ * logic for both encoding and decoding. The cast is done using union
+ * to avoid spurious compiler warnings. */
+ union {
+ void *p1;
+ const void *p2;
+ } t;
+ t.p2 = p;
+ return t.p1;
+}
+
+bool pb_field_iter_begin_const(pb_field_iter_t *iter, const pb_msgdesc_t *desc, const void *message)
+{
+ return pb_field_iter_begin(iter, desc, pb_const_cast(message));
+}
+
+bool pb_field_iter_begin_extension_const(pb_field_iter_t *iter, const pb_extension_t *extension)
+{
+ return pb_field_iter_begin_extension(iter, (pb_extension_t*)pb_const_cast(extension));
+}
+
+bool pb_default_field_callback(pb_istream_t *istream, pb_ostream_t *ostream, const pb_field_t *field)
+{
+ if (field->data_size == sizeof(pb_callback_t))
+ {
+ pb_callback_t *pCallback = (pb_callback_t*)field->pData;
+
+ if (pCallback != NULL)
+ {
+ if (istream != NULL && pCallback->funcs.decode != NULL)
+ {
+ return pCallback->funcs.decode(istream, field, &pCallback->arg);
+ }
+
+ if (ostream != NULL && pCallback->funcs.encode != NULL)
+ {
+ return pCallback->funcs.encode(ostream, field, &pCallback->arg);
+ }
+ }
+ }
+
+ return true; /* Success, but didn't do anything */
+
+}
+
+#ifdef PB_VALIDATE_UTF8
+
+/* This function checks whether a string is valid UTF-8 text.
+ *
+ * Algorithm is adapted from https://www.cl.cam.ac.uk/~mgk25/ucs/utf8_check.c
+ * Original copyright: Markus Kuhn <http://www.cl.cam.ac.uk/~mgk25/> 2005-03-30
+ * Licensed under "Short code license", which allows use under MIT license or
+ * any compatible with it.
+ */
+
+bool pb_validate_utf8(const char *str)
+{
+ const pb_byte_t *s = (const pb_byte_t*)str;
+ while (*s)
+ {
+ if (*s < 0x80)
+ {
+ /* 0xxxxxxx */
+ s++;
+ }
+ else if ((s[0] & 0xe0) == 0xc0)
+ {
+ /* 110XXXXx 10xxxxxx */
+ if ((s[1] & 0xc0) != 0x80 ||
+ (s[0] & 0xfe) == 0xc0) /* overlong? */
+ return false;
+ else
+ s += 2;
+ }
+ else if ((s[0] & 0xf0) == 0xe0)
+ {
+ /* 1110XXXX 10Xxxxxx 10xxxxxx */
+ if ((s[1] & 0xc0) != 0x80 ||
+ (s[2] & 0xc0) != 0x80 ||
+ (s[0] == 0xe0 && (s[1] & 0xe0) == 0x80) || /* overlong? */
+ (s[0] == 0xed && (s[1] & 0xe0) == 0xa0) || /* surrogate? */
+ (s[0] == 0xef && s[1] == 0xbf &&
+ (s[2] & 0xfe) == 0xbe)) /* U+FFFE or U+FFFF? */
+ return false;
+ else
+ s += 3;
+ }
+ else if ((s[0] & 0xf8) == 0xf0)
+ {
+ /* 11110XXX 10XXxxxx 10xxxxxx 10xxxxxx */
+ if ((s[1] & 0xc0) != 0x80 ||
+ (s[2] & 0xc0) != 0x80 ||
+ (s[3] & 0xc0) != 0x80 ||
+ (s[0] == 0xf0 && (s[1] & 0xf0) == 0x80) || /* overlong? */
+ (s[0] == 0xf4 && s[1] > 0x8f) || s[0] > 0xf4) /* > U+10FFFF? */
+ return false;
+ else
+ s += 4;
+ }
+ else
+ {
+ return false;
+ }
+ }
+
+ return true;
+}
+
+#endif
+
diff --git a/security/container/protos/nanopb/pb_common.h b/security/container/protos/nanopb/pb_common.h
new file mode 100644
index 0000000..58aa90f
--- /dev/null
+++ b/security/container/protos/nanopb/pb_common.h
@@ -0,0 +1,49 @@
+/* pb_common.h: Common support functions for pb_encode.c and pb_decode.c.
+ * These functions are rarely needed by applications directly.
+ */
+
+#ifndef PB_COMMON_H_INCLUDED
+#define PB_COMMON_H_INCLUDED
+
+#include "pb.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/* Initialize the field iterator structure to the beginning of the message.
+ * Returns false if the message type is empty. */
+bool pb_field_iter_begin(pb_field_iter_t *iter, const pb_msgdesc_t *desc, void *message);
+
+/* Get a field iterator for extension field. */
+bool pb_field_iter_begin_extension(pb_field_iter_t *iter, pb_extension_t *extension);
+
+/* Same as pb_field_iter_begin(), but for const message pointer.
+ * Note that the pointers in pb_field_iter_t will be non-const but shouldn't
+ * be written to when using these functions. */
+bool pb_field_iter_begin_const(pb_field_iter_t *iter, const pb_msgdesc_t *desc, const void *message);
+bool pb_field_iter_begin_extension_const(pb_field_iter_t *iter, const pb_extension_t *extension);
+
+/* Advance the iterator to the next field.
+ * Returns false when the iterator wraps back to the first field. */
+bool pb_field_iter_next(pb_field_iter_t *iter);
+
+/* Advance the iterator until it points at a field with the given tag.
+ * Returns false if no such field exists. */
+bool pb_field_iter_find(pb_field_iter_t *iter, uint32_t tag);
+
+/* Find a field with type PB_LTYPE_EXTENSION, or return false if not found.
+ * There can be only one extension range field per message. */
+bool pb_field_iter_find_extension(pb_field_iter_t *iter);
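+
+/* Typical iteration pattern (informative sketch; MyMessage and
+ * MyMessage_fields are hypothetical generated names):
+ *
+ *    MyMessage msg;
+ *    pb_field_iter_t iter;
+ *    if (pb_field_iter_begin(&iter, MyMessage_fields, &msg))
+ *    {
+ *        do
+ *        {
+ *            (iter.tag, iter.type and iter.pData describe the current field)
+ *        } while (pb_field_iter_next(&iter));
+ *    }
+ */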
+
+#ifdef PB_VALIDATE_UTF8
+/* Validate UTF-8 text string */
+bool pb_validate_utf8(const char *s);
+#endif
+
+#ifdef __cplusplus
+} /* extern "C" */
+#endif
+
+#endif
+
diff --git a/security/container/protos/nanopb/pb_decode.c b/security/container/protos/nanopb/pb_decode.c
new file mode 100644
index 0000000..b194825
--- /dev/null
+++ b/security/container/protos/nanopb/pb_decode.c
@@ -0,0 +1,1709 @@
+/* pb_decode.c -- decode a protobuf using minimal resources
+ *
+ * 2011 Petteri Aimonen <jpa@kapsi.fi>
+ */
+
+/* Use the GCC warn_unused_result attribute to check that all return values
+ * are propagated correctly. On other compilers, and on gcc before 3.4.0,
+ * the annotation is simply ignored.
+ */
+#if !defined(__GNUC__) || ( __GNUC__ < 3) || (__GNUC__ == 3 && __GNUC_MINOR__ < 4)
+ #define checkreturn
+#else
+ #define checkreturn __attribute__((warn_unused_result))
+#endif
+
+#include "pb.h"
+#include "pb_decode.h"
+#include "pb_common.h"
+
+/**************************************
+ * Declarations internal to this file *
+ **************************************/
+
+static bool checkreturn buf_read(pb_istream_t *stream, pb_byte_t *buf, size_t count);
+static bool checkreturn pb_decode_varint32_eof(pb_istream_t *stream, uint32_t *dest, bool *eof);
+static bool checkreturn read_raw_value(pb_istream_t *stream, pb_wire_type_t wire_type, pb_byte_t *buf, size_t *size);
+static bool checkreturn decode_basic_field(pb_istream_t *stream, pb_wire_type_t wire_type, pb_field_iter_t *field);
+static bool checkreturn decode_static_field(pb_istream_t *stream, pb_wire_type_t wire_type, pb_field_iter_t *field);
+static bool checkreturn decode_pointer_field(pb_istream_t *stream, pb_wire_type_t wire_type, pb_field_iter_t *field);
+static bool checkreturn decode_callback_field(pb_istream_t *stream, pb_wire_type_t wire_type, pb_field_iter_t *field);
+static bool checkreturn decode_field(pb_istream_t *stream, pb_wire_type_t wire_type, pb_field_iter_t *field);
+static bool checkreturn default_extension_decoder(pb_istream_t *stream, pb_extension_t *extension, uint32_t tag, pb_wire_type_t wire_type);
+static bool checkreturn decode_extension(pb_istream_t *stream, uint32_t tag, pb_wire_type_t wire_type, pb_extension_t *extension);
+static bool pb_field_set_to_default(pb_field_iter_t *field);
+static bool pb_message_set_to_defaults(pb_field_iter_t *iter);
+static bool checkreturn pb_dec_bool(pb_istream_t *stream, const pb_field_iter_t *field);
+static bool checkreturn pb_dec_varint(pb_istream_t *stream, const pb_field_iter_t *field);
+static bool checkreturn pb_dec_bytes(pb_istream_t *stream, const pb_field_iter_t *field);
+static bool checkreturn pb_dec_string(pb_istream_t *stream, const pb_field_iter_t *field);
+static bool checkreturn pb_dec_submessage(pb_istream_t *stream, const pb_field_iter_t *field);
+static bool checkreturn pb_dec_fixed_length_bytes(pb_istream_t *stream, const pb_field_iter_t *field);
+static bool checkreturn pb_skip_varint(pb_istream_t *stream);
+static bool checkreturn pb_skip_string(pb_istream_t *stream);
+
+#ifdef PB_ENABLE_MALLOC
+static bool checkreturn allocate_field(pb_istream_t *stream, void *pData, size_t data_size, size_t array_size);
+static void initialize_pointer_field(void *pItem, pb_field_iter_t *field);
+static bool checkreturn pb_release_union_field(pb_istream_t *stream, pb_field_iter_t *field);
+static void pb_release_single_field(pb_field_iter_t *field);
+#endif
+
+#ifdef PB_WITHOUT_64BIT
+#define pb_int64_t int32_t
+#define pb_uint64_t uint32_t
+#else
+#define pb_int64_t int64_t
+#define pb_uint64_t uint64_t
+#endif
+
+#define PB_WT_PACKED ((pb_wire_type_t)0xFF)
+
+typedef struct {
+ uint32_t bitfield[(PB_MAX_REQUIRED_FIELDS + 31) / 32];
+} pb_fields_seen_t;
+
+/*******************************
+ * pb_istream_t implementation *
+ *******************************/
+
+static bool checkreturn buf_read(pb_istream_t *stream, pb_byte_t *buf, size_t count)
+{
+ size_t i;
+ const pb_byte_t *source = (const pb_byte_t*)stream->state;
+ stream->state = (pb_byte_t*)stream->state + count;
+
+ if (buf != NULL)
+ {
+ for (i = 0; i < count; i++)
+ buf[i] = source[i];
+ }
+
+ return true;
+}
+
+bool checkreturn pb_read(pb_istream_t *stream, pb_byte_t *buf, size_t count)
+{
+ if (count == 0)
+ return true;
+
+#ifndef PB_BUFFER_ONLY
+ if (buf == NULL && stream->callback != buf_read)
+ {
+ /* Skip input bytes */
+ pb_byte_t tmp[16];
+ while (count > 16)
+ {
+ if (!pb_read(stream, tmp, 16))
+ return false;
+
+ count -= 16;
+ }
+
+ return pb_read(stream, tmp, count);
+ }
+#endif
+
+ if (stream->bytes_left < count)
+ PB_RETURN_ERROR(stream, "end-of-stream");
+
+#ifndef PB_BUFFER_ONLY
+ if (!stream->callback(stream, buf, count))
+ PB_RETURN_ERROR(stream, "io error");
+#else
+ if (!buf_read(stream, buf, count))
+ return false;
+#endif
+
+ stream->bytes_left -= count;
+ return true;
+}
+
+/* Read a single byte from input stream. buf may not be NULL.
+ * This is an optimization for the varint decoding. */
+static bool checkreturn pb_readbyte(pb_istream_t *stream, pb_byte_t *buf)
+{
+ if (stream->bytes_left == 0)
+ PB_RETURN_ERROR(stream, "end-of-stream");
+
+#ifndef PB_BUFFER_ONLY
+ if (!stream->callback(stream, buf, 1))
+ PB_RETURN_ERROR(stream, "io error");
+#else
+ *buf = *(const pb_byte_t*)stream->state;
+ stream->state = (pb_byte_t*)stream->state + 1;
+#endif
+
+ stream->bytes_left--;
+
+ return true;
+}
+
+pb_istream_t pb_istream_from_buffer(const pb_byte_t *buf, size_t msglen)
+{
+ pb_istream_t stream;
+ /* Cast away the const from buf without a compiler error. We are
+ * careful to use it only in a const manner in the callbacks.
+ */
+ union {
+ void *state;
+ const void *c_state;
+ } state;
+#ifdef PB_BUFFER_ONLY
+ stream.callback = NULL;
+#else
+ stream.callback = &buf_read;
+#endif
+ state.c_state = buf;
+ stream.state = state.state;
+ stream.bytes_left = msglen;
+#ifndef PB_NO_ERRMSG
+ stream.errmsg = NULL;
+#endif
+ return stream;
+}
+
+/********************
+ * Helper functions *
+ ********************/
+
+static bool checkreturn pb_decode_varint32_eof(pb_istream_t *stream, uint32_t *dest, bool *eof)
+{
+ pb_byte_t byte;
+ uint32_t result;
+
+ if (!pb_readbyte(stream, &byte))
+ {
+ if (stream->bytes_left == 0)
+ {
+ if (eof)
+ {
+ *eof = true;
+ }
+ }
+
+ return false;
+ }
+
+ if ((byte & 0x80) == 0)
+ {
+ /* Quick case, 1 byte value */
+ result = byte;
+ }
+ else
+ {
+ /* Multibyte case */
+ uint_fast8_t bitpos = 7;
+ result = byte & 0x7F;
+
+ do
+ {
+ if (!pb_readbyte(stream, &byte))
+ return false;
+
+ if (bitpos >= 32)
+ {
+ /* Note: The varint could have trailing 0x80 bytes, or 0xFF for negative. */
+ pb_byte_t sign_extension = (bitpos < 63) ? 0xFF : 0x01;
+ bool valid_extension = ((byte & 0x7F) == 0x00 ||
+ ((result >> 31) != 0 && byte == sign_extension));
+
+ if (bitpos >= 64 || !valid_extension)
+ {
+ PB_RETURN_ERROR(stream, "varint overflow");
+ }
+ }
+ else
+ {
+ result |= (uint32_t)(byte & 0x7F) << bitpos;
+ }
+ bitpos = (uint_fast8_t)(bitpos + 7);
+ } while (byte & 0x80);
+
+ if (bitpos == 35 && (byte & 0x70) != 0)
+ {
+ /* The last byte was at bitpos=28, so only bottom 4 bits fit. */
+ PB_RETURN_ERROR(stream, "varint overflow");
+ }
+ }
+
+ *dest = result;
+ return true;
+}
+
+bool checkreturn pb_decode_varint32(pb_istream_t *stream, uint32_t *dest)
+{
+ return pb_decode_varint32_eof(stream, dest, NULL);
+}
+
+#ifndef PB_WITHOUT_64BIT
+bool checkreturn pb_decode_varint(pb_istream_t *stream, uint64_t *dest)
+{
+ pb_byte_t byte;
+ uint_fast8_t bitpos = 0;
+ uint64_t result = 0;
+
+ do
+ {
+ if (bitpos >= 64)
+ PB_RETURN_ERROR(stream, "varint overflow");
+
+ if (!pb_readbyte(stream, &byte))
+ return false;
+
+ result |= (uint64_t)(byte & 0x7F) << bitpos;
+ bitpos = (uint_fast8_t)(bitpos + 7);
+ } while (byte & 0x80);
+
+ *dest = result;
+ return true;
+}
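+
+/* Worked example (informative): the varint bytes 0xAC 0x02 decode to
+ * (0xAC & 0x7F) | (0x02 << 7) = 44 + 256 = 300. */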
+#endif
+
+bool checkreturn pb_skip_varint(pb_istream_t *stream)
+{
+ pb_byte_t byte;
+ do
+ {
+ if (!pb_read(stream, &byte, 1))
+ return false;
+ } while (byte & 0x80);
+ return true;
+}
+
+bool checkreturn pb_skip_string(pb_istream_t *stream)
+{
+ uint32_t length;
+ if (!pb_decode_varint32(stream, &length))
+ return false;
+
+ if ((size_t)length != length)
+ {
+ PB_RETURN_ERROR(stream, "size too large");
+ }
+
+ return pb_read(stream, NULL, (size_t)length);
+}
+
+bool checkreturn pb_decode_tag(pb_istream_t *stream, pb_wire_type_t *wire_type, uint32_t *tag, bool *eof)
+{
+ uint32_t temp;
+ *eof = false;
+ *wire_type = (pb_wire_type_t) 0;
+ *tag = 0;
+
+ if (!pb_decode_varint32_eof(stream, &temp, eof))
+ {
+ return false;
+ }
+
+ *tag = temp >> 3;
+ *wire_type = (pb_wire_type_t)(temp & 7);
+ return true;
+}
+
+bool checkreturn pb_skip_field(pb_istream_t *stream, pb_wire_type_t wire_type)
+{
+ switch (wire_type)
+ {
+ case PB_WT_VARINT: return pb_skip_varint(stream);
+ case PB_WT_64BIT: return pb_read(stream, NULL, 8);
+ case PB_WT_STRING: return pb_skip_string(stream);
+ case PB_WT_32BIT: return pb_read(stream, NULL, 4);
+ default: PB_RETURN_ERROR(stream, "invalid wire_type");
+ }
+}
+
+/* Read a raw value to buffer, for the purpose of passing it to callback as
+ * a substream. Size is maximum size on call, and actual size on return.
+ */
+static bool checkreturn read_raw_value(pb_istream_t *stream, pb_wire_type_t wire_type, pb_byte_t *buf, size_t *size)
+{
+ size_t max_size = *size;
+ switch (wire_type)
+ {
+ case PB_WT_VARINT:
+ *size = 0;
+ do
+ {
+ (*size)++;
+ if (*size > max_size)
+ PB_RETURN_ERROR(stream, "varint overflow");
+
+ if (!pb_read(stream, buf, 1))
+ return false;
+ } while (*buf++ & 0x80);
+ return true;
+
+ case PB_WT_64BIT:
+ *size = 8;
+ return pb_read(stream, buf, 8);
+
+ case PB_WT_32BIT:
+ *size = 4;
+ return pb_read(stream, buf, 4);
+
+ case PB_WT_STRING:
+ /* Calling read_raw_value with a PB_WT_STRING is an error.
+ * Explicitly handle this case and fallthrough to default to avoid
+ * compiler warnings.
+ */
+
+ default: PB_RETURN_ERROR(stream, "invalid wire_type");
+ }
+}
+
+/* Decode string length from stream and return a substream with limited length.
+ * Remember to close the substream using pb_close_string_substream().
+ */
+bool checkreturn pb_make_string_substream(pb_istream_t *stream, pb_istream_t *substream)
+{
+ uint32_t size;
+ if (!pb_decode_varint32(stream, &size))
+ return false;
+
+ *substream = *stream;
+ if (substream->bytes_left < size)
+ PB_RETURN_ERROR(stream, "parent stream too short");
+
+ substream->bytes_left = (size_t)size;
+ stream->bytes_left -= (size_t)size;
+ return true;
+}
+
+bool checkreturn pb_close_string_substream(pb_istream_t *stream, pb_istream_t *substream)
+{
+ if (substream->bytes_left) {
+ if (!pb_read(substream, NULL, substream->bytes_left))
+ return false;
+ }
+
+ stream->state = substream->state;
+
+#ifndef PB_NO_ERRMSG
+ stream->errmsg = substream->errmsg;
+#endif
+ return true;
+}
+
+/*************************
+ * Decode a single field *
+ *************************/
+
+static bool checkreturn decode_basic_field(pb_istream_t *stream, pb_wire_type_t wire_type, pb_field_iter_t *field)
+{
+ switch (PB_LTYPE(field->type))
+ {
+ case PB_LTYPE_BOOL:
+ if (wire_type != PB_WT_VARINT && wire_type != PB_WT_PACKED)
+ PB_RETURN_ERROR(stream, "wrong wire type");
+
+ return pb_dec_bool(stream, field);
+
+ case PB_LTYPE_VARINT:
+ case PB_LTYPE_UVARINT:
+ case PB_LTYPE_SVARINT:
+ if (wire_type != PB_WT_VARINT && wire_type != PB_WT_PACKED)
+ PB_RETURN_ERROR(stream, "wrong wire type");
+
+ return pb_dec_varint(stream, field);
+
+ case PB_LTYPE_FIXED32:
+ if (wire_type != PB_WT_32BIT && wire_type != PB_WT_PACKED)
+ PB_RETURN_ERROR(stream, "wrong wire type");
+
+ return pb_decode_fixed32(stream, field->pData);
+
+ case PB_LTYPE_FIXED64:
+ if (wire_type != PB_WT_64BIT && wire_type != PB_WT_PACKED)
+ PB_RETURN_ERROR(stream, "wrong wire type");
+
+#ifdef PB_CONVERT_DOUBLE_FLOAT
+ if (field->data_size == sizeof(float))
+ {
+ return pb_decode_double_as_float(stream, (float*)field->pData);
+ }
+#endif
+
+#ifdef PB_WITHOUT_64BIT
+ PB_RETURN_ERROR(stream, "invalid data_size");
+#else
+ return pb_decode_fixed64(stream, field->pData);
+#endif
+
+ case PB_LTYPE_BYTES:
+ if (wire_type != PB_WT_STRING)
+ PB_RETURN_ERROR(stream, "wrong wire type");
+
+ return pb_dec_bytes(stream, field);
+
+ case PB_LTYPE_STRING:
+ if (wire_type != PB_WT_STRING)
+ PB_RETURN_ERROR(stream, "wrong wire type");
+
+ return pb_dec_string(stream, field);
+
+ case PB_LTYPE_SUBMESSAGE:
+ case PB_LTYPE_SUBMSG_W_CB:
+ if (wire_type != PB_WT_STRING)
+ PB_RETURN_ERROR(stream, "wrong wire type");
+
+ return pb_dec_submessage(stream, field);
+
+ case PB_LTYPE_FIXED_LENGTH_BYTES:
+ if (wire_type != PB_WT_STRING)
+ PB_RETURN_ERROR(stream, "wrong wire type");
+
+ return pb_dec_fixed_length_bytes(stream, field);
+
+ default:
+ PB_RETURN_ERROR(stream, "invalid field type");
+ }
+}
+
+static bool checkreturn decode_static_field(pb_istream_t *stream, pb_wire_type_t wire_type, pb_field_iter_t *field)
+{
+ switch (PB_HTYPE(field->type))
+ {
+ case PB_HTYPE_REQUIRED:
+ return decode_basic_field(stream, wire_type, field);
+
+ case PB_HTYPE_OPTIONAL:
+ if (field->pSize != NULL)
+ *(bool*)field->pSize = true;
+ return decode_basic_field(stream, wire_type, field);
+
+ case PB_HTYPE_REPEATED:
+ if (wire_type == PB_WT_STRING
+ && PB_LTYPE(field->type) <= PB_LTYPE_LAST_PACKABLE)
+ {
+ /* Packed array */
+ bool status = true;
+ pb_istream_t substream;
+ pb_size_t *size = (pb_size_t*)field->pSize;
+ field->pData = (char*)field->pField + field->data_size * (*size);
+
+ if (!pb_make_string_substream(stream, &substream))
+ return false;
+
+ while (substream.bytes_left > 0 && *size < field->array_size)
+ {
+ if (!decode_basic_field(&substream, PB_WT_PACKED, field))
+ {
+ status = false;
+ break;
+ }
+ (*size)++;
+ field->pData = (char*)field->pData + field->data_size;
+ }
+
+ if (substream.bytes_left != 0)
+ PB_RETURN_ERROR(stream, "array overflow");
+ if (!pb_close_string_substream(stream, &substream))
+ return false;
+
+ return status;
+ }
+ else
+ {
+ /* Repeated field */
+ pb_size_t *size = (pb_size_t*)field->pSize;
+ field->pData = (char*)field->pField + field->data_size * (*size);
+
+ if ((*size)++ >= field->array_size)
+ PB_RETURN_ERROR(stream, "array overflow");
+
+ return decode_basic_field(stream, wire_type, field);
+ }
+
+ case PB_HTYPE_ONEOF:
+ if (PB_LTYPE_IS_SUBMSG(field->type) &&
+ *(pb_size_t*)field->pSize != field->tag)
+ {
+ /* We memset to zero so that any callbacks are set to NULL.
+ * This is because the callbacks might otherwise have values
+ * from some other union field.
+             * If callbacks are needed inside a oneof field, use the .proto
+             * option submsg_callback to have a separate callback function
+             * that can set the fields before the submessage is decoded.
+ * pb_dec_submessage() will set any default values. */
+ memset(field->pData, 0, (size_t)field->data_size);
+
+ /* Set default values for the submessage fields. */
+ if (field->submsg_desc->default_value != NULL ||
+ field->submsg_desc->field_callback != NULL ||
+ field->submsg_desc->submsg_info[0] != NULL)
+ {
+ pb_field_iter_t submsg_iter;
+ if (pb_field_iter_begin(&submsg_iter, field->submsg_desc, field->pData))
+ {
+ if (!pb_message_set_to_defaults(&submsg_iter))
+ PB_RETURN_ERROR(stream, "failed to set defaults");
+ }
+ }
+ }
+ *(pb_size_t*)field->pSize = field->tag;
+
+ return decode_basic_field(stream, wire_type, field);
+
+ default:
+ PB_RETURN_ERROR(stream, "invalid field type");
+ }
+}
+
+#ifdef PB_ENABLE_MALLOC
+/* Allocate storage for the field and store the pointer at iter->pData.
+ * array_size is the number of entries to reserve in an array.
+ * Zero size is not allowed, use pb_free() for releasing.
+ */
+static bool checkreturn allocate_field(pb_istream_t *stream, void *pData, size_t data_size, size_t array_size)
+{
+ void *ptr = *(void**)pData;
+
+ if (data_size == 0 || array_size == 0)
+ PB_RETURN_ERROR(stream, "invalid size");
+
+#ifdef __AVR__
+ /* Workaround for AVR libc bug 53284: http://savannah.nongnu.org/bugs/?53284
+ * Realloc to size of 1 byte can cause corruption of the malloc structures.
+ */
+ if (data_size == 1 && array_size == 1)
+ {
+ data_size = 2;
+ }
+#endif
+
+ /* Check for multiplication overflows.
+ * This code avoids the costly division if the sizes are small enough.
+ * Multiplication is safe as long as only half of bits are set
+ * in either multiplicand.
+ */
+ {
+ const size_t check_limit = (size_t)1 << (sizeof(size_t) * 4);
+ if (data_size >= check_limit || array_size >= check_limit)
+ {
+ const size_t size_max = (size_t)-1;
+ if (size_max / array_size < data_size)
+ {
+ PB_RETURN_ERROR(stream, "size too large");
+ }
+ }
+ }
+
+ /* Allocate new or expand previous allocation */
+ /* Note: on failure the old pointer will remain in the structure,
+ * the message must be freed by caller also on error return. */
+ ptr = pb_realloc(ptr, array_size * data_size);
+ if (ptr == NULL)
+ PB_RETURN_ERROR(stream, "realloc failed");
+
+ *(void**)pData = ptr;
+ return true;
+}
+
+/* Clear a newly allocated item in case it contains a pointer, or is a submessage. */
+static void initialize_pointer_field(void *pItem, pb_field_iter_t *field)
+{
+ if (PB_LTYPE(field->type) == PB_LTYPE_STRING ||
+ PB_LTYPE(field->type) == PB_LTYPE_BYTES)
+ {
+ *(void**)pItem = NULL;
+ }
+ else if (PB_LTYPE_IS_SUBMSG(field->type))
+ {
+ /* We memset to zero so that any callbacks are set to NULL.
+ * Default values will be set by pb_dec_submessage(). */
+ memset(pItem, 0, field->data_size);
+ }
+}
+#endif
+
+static bool checkreturn decode_pointer_field(pb_istream_t *stream, pb_wire_type_t wire_type, pb_field_iter_t *field)
+{
+#ifndef PB_ENABLE_MALLOC
+ PB_UNUSED(wire_type);
+ PB_UNUSED(field);
+ PB_RETURN_ERROR(stream, "no malloc support");
+#else
+ switch (PB_HTYPE(field->type))
+ {
+ case PB_HTYPE_REQUIRED:
+ case PB_HTYPE_OPTIONAL:
+ case PB_HTYPE_ONEOF:
+ if (PB_LTYPE_IS_SUBMSG(field->type) && *(void**)field->pField != NULL)
+ {
+ /* Duplicate field, have to release the old allocation first. */
+ /* FIXME: Does this work correctly for oneofs? */
+ pb_release_single_field(field);
+ }
+
+ if (PB_HTYPE(field->type) == PB_HTYPE_ONEOF)
+ {
+ *(pb_size_t*)field->pSize = field->tag;
+ }
+
+ if (PB_LTYPE(field->type) == PB_LTYPE_STRING ||
+ PB_LTYPE(field->type) == PB_LTYPE_BYTES)
+ {
+ /* pb_dec_string and pb_dec_bytes handle allocation themselves */
+ field->pData = field->pField;
+ return decode_basic_field(stream, wire_type, field);
+ }
+ else
+ {
+ if (!allocate_field(stream, field->pField, field->data_size, 1))
+ return false;
+
+ field->pData = *(void**)field->pField;
+ initialize_pointer_field(field->pData, field);
+ return decode_basic_field(stream, wire_type, field);
+ }
+
+ case PB_HTYPE_REPEATED:
+ if (wire_type == PB_WT_STRING
+ && PB_LTYPE(field->type) <= PB_LTYPE_LAST_PACKABLE)
+ {
+ /* Packed array, multiple items come in at once. */
+ bool status = true;
+ pb_size_t *size = (pb_size_t*)field->pSize;
+ size_t allocated_size = *size;
+ pb_istream_t substream;
+
+ if (!pb_make_string_substream(stream, &substream))
+ return false;
+
+ while (substream.bytes_left)
+ {
+ if (*size == PB_SIZE_MAX)
+ {
+#ifndef PB_NO_ERRMSG
+ stream->errmsg = "too many array entries";
+#endif
+ status = false;
+ break;
+ }
+
+ if ((size_t)*size + 1 > allocated_size)
+ {
+ /* Allocate more storage. This tries to guess the
+ * number of remaining entries. Round the division
+ * upwards. */
+ size_t remain = (substream.bytes_left - 1) / field->data_size + 1;
+ if (remain < PB_SIZE_MAX - allocated_size)
+ allocated_size += remain;
+ else
+ allocated_size += 1;
+
+ if (!allocate_field(&substream, field->pField, field->data_size, allocated_size))
+ {
+ status = false;
+ break;
+ }
+ }
+
+ /* Decode the array entry */
+ field->pData = *(char**)field->pField + field->data_size * (*size);
+ initialize_pointer_field(field->pData, field);
+ if (!decode_basic_field(&substream, PB_WT_PACKED, field))
+ {
+ status = false;
+ break;
+ }
+
+ (*size)++;
+ }
+ if (!pb_close_string_substream(stream, &substream))
+ return false;
+
+ return status;
+ }
+ else
+ {
+ /* Normal repeated field, i.e. only one item at a time. */
+ pb_size_t *size = (pb_size_t*)field->pSize;
+
+ if (*size == PB_SIZE_MAX)
+ PB_RETURN_ERROR(stream, "too many array entries");
+
+ if (!allocate_field(stream, field->pField, field->data_size, (size_t)(*size + 1)))
+ return false;
+
+ field->pData = *(char**)field->pField + field->data_size * (*size);
+ (*size)++;
+ initialize_pointer_field(field->pData, field);
+ return decode_basic_field(stream, wire_type, field);
+ }
+
+ default:
+ PB_RETURN_ERROR(stream, "invalid field type");
+ }
+#endif
+}
+
+static bool checkreturn decode_callback_field(pb_istream_t *stream, pb_wire_type_t wire_type, pb_field_iter_t *field)
+{
+ if (!field->descriptor->field_callback)
+ return pb_skip_field(stream, wire_type);
+
+ if (wire_type == PB_WT_STRING)
+ {
+ pb_istream_t substream;
+ size_t prev_bytes_left;
+
+ if (!pb_make_string_substream(stream, &substream))
+ return false;
+
+ do
+ {
+ prev_bytes_left = substream.bytes_left;
+ if (!field->descriptor->field_callback(&substream, NULL, field))
+ PB_RETURN_ERROR(stream, "callback failed");
+ } while (substream.bytes_left > 0 && substream.bytes_left < prev_bytes_left);
+
+ if (!pb_close_string_substream(stream, &substream))
+ return false;
+
+ return true;
+ }
+ else
+ {
+ /* Copy the single scalar value to stack.
+ * This is required so that we can limit the stream length,
+         * which in turn allows using the same callback for packed and
+         * non-packed fields. */
+ pb_istream_t substream;
+ pb_byte_t buffer[10];
+ size_t size = sizeof(buffer);
+
+ if (!read_raw_value(stream, wire_type, buffer, &size))
+ return false;
+ substream = pb_istream_from_buffer(buffer, size);
+
+ return field->descriptor->field_callback(&substream, NULL, field);
+ }
+}
+
+static bool checkreturn decode_field(pb_istream_t *stream, pb_wire_type_t wire_type, pb_field_iter_t *field)
+{
+#ifdef PB_ENABLE_MALLOC
+    /* When decoding a oneof field, check if there is old data that must be
+ * released first. */
+ if (PB_HTYPE(field->type) == PB_HTYPE_ONEOF)
+ {
+ if (!pb_release_union_field(stream, field))
+ return false;
+ }
+#endif
+
+ switch (PB_ATYPE(field->type))
+ {
+ case PB_ATYPE_STATIC:
+ return decode_static_field(stream, wire_type, field);
+
+ case PB_ATYPE_POINTER:
+ return decode_pointer_field(stream, wire_type, field);
+
+ case PB_ATYPE_CALLBACK:
+ return decode_callback_field(stream, wire_type, field);
+
+ default:
+ PB_RETURN_ERROR(stream, "invalid field type");
+ }
+}
+
+/* Default handler for extension fields. Expects to have a pb_msgdesc_t
+ * pointer in the extension->type->arg field, pointing to a message with
+ * only one field in it. */
+static bool checkreturn default_extension_decoder(pb_istream_t *stream,
+ pb_extension_t *extension, uint32_t tag, pb_wire_type_t wire_type)
+{
+ pb_field_iter_t iter;
+
+ if (!pb_field_iter_begin_extension(&iter, extension))
+ PB_RETURN_ERROR(stream, "invalid extension");
+
+ if (iter.tag != tag || !iter.message)
+ return true;
+
+ extension->found = true;
+ return decode_field(stream, wire_type, &iter);
+}
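+
+/* Informative sketch of wiring an extension before decoding; MyExtension,
+ * my_value and the generated 'extensions' struct member are illustrative
+ * assumptions:
+ *
+ *    pb_extension_t ext;
+ *    ext.type = &MyExtension;   (generated pb_extension_type_t)
+ *    ext.dest = &my_value;      (storage for the decoded field)
+ *    ext.next = NULL;
+ *    ext.found = false;
+ *    msg.extensions = &ext;     (decode_extension() walks this list)
+ */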
+
+/* Try to decode an unknown field as an extension field. Tries each extension
+ * decoder in turn, until one of them handles the field or loop ends. */
+static bool checkreturn decode_extension(pb_istream_t *stream,
+ uint32_t tag, pb_wire_type_t wire_type, pb_extension_t *extension)
+{
+ size_t pos = stream->bytes_left;
+
+ while (extension != NULL && pos == stream->bytes_left)
+ {
+ bool status;
+ if (extension->type->decode)
+ status = extension->type->decode(stream, extension, tag, wire_type);
+ else
+ status = default_extension_decoder(stream, extension, tag, wire_type);
+
+ if (!status)
+ return false;
+
+ extension = extension->next;
+ }
+
+ return true;
+}
+
+/* Initialize message fields to default values, recursively */
+static bool pb_field_set_to_default(pb_field_iter_t *field)
+{
+ pb_type_t type;
+ type = field->type;
+
+ if (PB_LTYPE(type) == PB_LTYPE_EXTENSION)
+ {
+ pb_extension_t *ext = *(pb_extension_t* const *)field->pData;
+ while (ext != NULL)
+ {
+ pb_field_iter_t ext_iter;
+ if (pb_field_iter_begin_extension(&ext_iter, ext))
+ {
+ ext->found = false;
+ if (!pb_message_set_to_defaults(&ext_iter))
+ return false;
+ }
+ ext = ext->next;
+ }
+ }
+ else if (PB_ATYPE(type) == PB_ATYPE_STATIC)
+ {
+ bool init_data = true;
+ if (PB_HTYPE(type) == PB_HTYPE_OPTIONAL && field->pSize != NULL)
+ {
+ /* Set has_field to false. Still initialize the optional field
+ * itself also. */
+ *(bool*)field->pSize = false;
+ }
+ else if (PB_HTYPE(type) == PB_HTYPE_REPEATED ||
+ PB_HTYPE(type) == PB_HTYPE_ONEOF)
+ {
+ /* REPEATED: Set array count to 0, no need to initialize contents.
+ ONEOF: Set which_field to 0. */
+ *(pb_size_t*)field->pSize = 0;
+ init_data = false;
+ }
+
+ if (init_data)
+ {
+ if (PB_LTYPE_IS_SUBMSG(field->type) &&
+ (field->submsg_desc->default_value != NULL ||
+ field->submsg_desc->field_callback != NULL ||
+ field->submsg_desc->submsg_info[0] != NULL))
+ {
+ /* Initialize submessage to defaults.
+ * Only needed if it has default values
+ * or callback/submessage fields. */
+ pb_field_iter_t submsg_iter;
+ if (pb_field_iter_begin(&submsg_iter, field->submsg_desc, field->pData))
+ {
+ if (!pb_message_set_to_defaults(&submsg_iter))
+ return false;
+ }
+ }
+ else
+ {
+ /* Initialize to zeros */
+ memset(field->pData, 0, (size_t)field->data_size);
+ }
+ }
+ }
+ else if (PB_ATYPE(type) == PB_ATYPE_POINTER)
+ {
+ /* Initialize the pointer to NULL. */
+ *(void**)field->pField = NULL;
+
+ /* Initialize array count to 0. */
+ if (PB_HTYPE(type) == PB_HTYPE_REPEATED ||
+ PB_HTYPE(type) == PB_HTYPE_ONEOF)
+ {
+ *(pb_size_t*)field->pSize = 0;
+ }
+ }
+ else if (PB_ATYPE(type) == PB_ATYPE_CALLBACK)
+ {
+ /* Don't overwrite callback */
+ }
+
+ return true;
+}
+
+static bool pb_message_set_to_defaults(pb_field_iter_t *iter)
+{
+ pb_istream_t defstream = PB_ISTREAM_EMPTY;
+ uint32_t tag = 0;
+ pb_wire_type_t wire_type = PB_WT_VARINT;
+ bool eof;
+
+ if (iter->descriptor->default_value)
+ {
+ defstream = pb_istream_from_buffer(iter->descriptor->default_value, (size_t)-1);
+ if (!pb_decode_tag(&defstream, &wire_type, &tag, &eof))
+ return false;
+ }
+
+ do
+ {
+ if (!pb_field_set_to_default(iter))
+ return false;
+
+ if (tag != 0 && iter->tag == tag)
+ {
+ /* We have a default value for this field in the defstream */
+ if (!decode_field(&defstream, wire_type, iter))
+ return false;
+ if (!pb_decode_tag(&defstream, &wire_type, &tag, &eof))
+ return false;
+
+ if (iter->pSize)
+ *(bool*)iter->pSize = false;
+ }
+ } while (pb_field_iter_next(iter));
+
+ return true;
+}
+
+/*********************
+ * Decode all fields *
+ *********************/
+
+static bool checkreturn pb_decode_inner(pb_istream_t *stream, const pb_msgdesc_t *fields, void *dest_struct, unsigned int flags)
+{
+ uint32_t extension_range_start = 0;
+ pb_extension_t *extensions = NULL;
+
+    /* 'fixed_count_field' and 'fixed_count_size' track the position of a
+     * repeated fixed count field. This can only handle _one_ such field,
+     * arriving unpacked and interleaved among other (non fixed count) fields.
+     */
+ pb_size_t fixed_count_field = PB_SIZE_MAX;
+ pb_size_t fixed_count_size = 0;
+ pb_size_t fixed_count_total_size = 0;
+
+ pb_fields_seen_t fields_seen = {{0, 0}};
+ const uint32_t allbits = ~(uint32_t)0;
+ pb_field_iter_t iter;
+
+ if (pb_field_iter_begin(&iter, fields, dest_struct))
+ {
+ if ((flags & PB_DECODE_NOINIT) == 0)
+ {
+ if (!pb_message_set_to_defaults(&iter))
+ PB_RETURN_ERROR(stream, "failed to set defaults");
+ }
+ }
+
+ while (stream->bytes_left)
+ {
+ uint32_t tag;
+ pb_wire_type_t wire_type;
+ bool eof;
+
+ if (!pb_decode_tag(stream, &wire_type, &tag, &eof))
+ {
+ if (eof)
+ break;
+ else
+ return false;
+ }
+
+ if (tag == 0)
+ {
+ if (flags & PB_DECODE_NULLTERMINATED)
+ {
+ break;
+ }
+ else
+ {
+ PB_RETURN_ERROR(stream, "zero tag");
+ }
+ }
+
+ if (!pb_field_iter_find(&iter, tag) || PB_LTYPE(iter.type) == PB_LTYPE_EXTENSION)
+ {
+ /* No match found, check if it matches an extension. */
+ if (extension_range_start == 0)
+ {
+ if (pb_field_iter_find_extension(&iter))
+ {
+ extensions = *(pb_extension_t* const *)iter.pData;
+ extension_range_start = iter.tag;
+ }
+
+ if (!extensions)
+ {
+ extension_range_start = (uint32_t)-1;
+ }
+ }
+
+ if (tag >= extension_range_start)
+ {
+ size_t pos = stream->bytes_left;
+
+ if (!decode_extension(stream, tag, wire_type, extensions))
+ return false;
+
+ if (pos != stream->bytes_left)
+ {
+ /* The field was handled */
+ continue;
+ }
+ }
+
+ /* No match found, skip data */
+ if (!pb_skip_field(stream, wire_type))
+ return false;
+ continue;
+ }
+
+ /* If a repeated fixed count field was found, get size from
+ * 'fixed_count_field' as there is no counter contained in the struct.
+ */
+ if (PB_HTYPE(iter.type) == PB_HTYPE_REPEATED && iter.pSize == &iter.array_size)
+ {
+ if (fixed_count_field != iter.index) {
+ /* If the new fixed count field does not match the previous one,
+ * check that the previous one is NULL or that it finished
+ * receiving all the expected data.
+ */
+ if (fixed_count_field != PB_SIZE_MAX &&
+ fixed_count_size != fixed_count_total_size)
+ {
+ PB_RETURN_ERROR(stream, "wrong size for fixed count field");
+ }
+
+ fixed_count_field = iter.index;
+ fixed_count_size = 0;
+ fixed_count_total_size = iter.array_size;
+ }
+
+ iter.pSize = &fixed_count_size;
+ }
+
+ if (PB_HTYPE(iter.type) == PB_HTYPE_REQUIRED
+ && iter.required_field_index < PB_MAX_REQUIRED_FIELDS)
+ {
+ uint32_t tmp = ((uint32_t)1 << (iter.required_field_index & 31));
+ fields_seen.bitfield[iter.required_field_index >> 5] |= tmp;
+ }
+
+ if (!decode_field(stream, wire_type, &iter))
+ return false;
+ }
+
+ /* Check that all elements of the last decoded fixed count field were present. */
+ if (fixed_count_field != PB_SIZE_MAX &&
+ fixed_count_size != fixed_count_total_size)
+ {
+ PB_RETURN_ERROR(stream, "wrong size for fixed count field");
+ }
+
+ /* Check that all required fields were present. */
+ {
+ pb_size_t req_field_count = iter.descriptor->required_field_count;
+
+ if (req_field_count > 0)
+ {
+ pb_size_t i;
+
+ if (req_field_count > PB_MAX_REQUIRED_FIELDS)
+ req_field_count = PB_MAX_REQUIRED_FIELDS;
+
+ /* Check the whole words */
+ for (i = 0; i < (req_field_count >> 5); i++)
+ {
+ if (fields_seen.bitfield[i] != allbits)
+ PB_RETURN_ERROR(stream, "missing required field");
+ }
+
+ /* Check the remaining bits (if any) */
+ if ((req_field_count & 31) != 0)
+ {
+ if (fields_seen.bitfield[req_field_count >> 5] !=
+ (allbits >> (uint_least8_t)(32 - (req_field_count & 31))))
+ {
+ PB_RETURN_ERROR(stream, "missing required field");
+ }
+ }
+ }
+ }
+
+ return true;
+}
+
+bool checkreturn pb_decode_ex(pb_istream_t *stream, const pb_msgdesc_t *fields, void *dest_struct, unsigned int flags)
+{
+ bool status;
+
+ if ((flags & PB_DECODE_DELIMITED) == 0)
+ {
+ status = pb_decode_inner(stream, fields, dest_struct, flags);
+ }
+ else
+ {
+ pb_istream_t substream;
+ if (!pb_make_string_substream(stream, &substream))
+ return false;
+
+ status = pb_decode_inner(&substream, fields, dest_struct, flags);
+
+ if (!pb_close_string_substream(stream, &substream))
+ return false;
+ }
+
+#ifdef PB_ENABLE_MALLOC
+ if (!status)
+ pb_release(fields, dest_struct);
+#endif
+
+ return status;
+}
+
+bool checkreturn pb_decode(pb_istream_t *stream, const pb_msgdesc_t *fields, void *dest_struct)
+{
+ bool status;
+
+ status = pb_decode_inner(stream, fields, dest_struct, 0);
+
+#ifdef PB_ENABLE_MALLOC
+ if (!status)
+ pb_release(fields, dest_struct);
+#endif
+
+ return status;
+}
+
+#ifdef PB_ENABLE_MALLOC
+/* Given a oneof field, if there has already been a field inside this oneof,
+ * release it before overwriting with a different one. */
+static bool pb_release_union_field(pb_istream_t *stream, pb_field_iter_t *field)
+{
+ pb_field_iter_t old_field = *field;
+ pb_size_t old_tag = *(pb_size_t*)field->pSize; /* Previous which_ value */
+ pb_size_t new_tag = field->tag; /* New which_ value */
+
+ if (old_tag == 0)
+ return true; /* Ok, no old data in union */
+
+ if (old_tag == new_tag)
+ return true; /* Ok, old data is of same type => merge */
+
+ /* Release old data. The find can fail if the message struct contains
+ * invalid data. */
+ if (!pb_field_iter_find(&old_field, old_tag))
+ PB_RETURN_ERROR(stream, "invalid union tag");
+
+ pb_release_single_field(&old_field);
+
+ if (PB_ATYPE(field->type) == PB_ATYPE_POINTER)
+ {
+ /* Initialize the pointer to NULL to make sure it is valid
+ * even in case of error return. */
+ *(void**)field->pField = NULL;
+ field->pData = NULL;
+ }
+
+ return true;
+}
+
+static void pb_release_single_field(pb_field_iter_t *field)
+{
+ pb_type_t type;
+ type = field->type;
+
+ if (PB_HTYPE(type) == PB_HTYPE_ONEOF)
+ {
+ if (*(pb_size_t*)field->pSize != field->tag)
+ return; /* This is not the current field in the union */
+ }
+
+ /* Release anything contained inside an extension or submsg.
+ * This has to be done even if the submsg itself is statically
+ * allocated. */
+ if (PB_LTYPE(type) == PB_LTYPE_EXTENSION)
+ {
+ /* Release fields from all extensions in the linked list */
+ pb_extension_t *ext = *(pb_extension_t**)field->pData;
+ while (ext != NULL)
+ {
+ pb_field_iter_t ext_iter;
+ if (pb_field_iter_begin_extension(&ext_iter, ext))
+ {
+ pb_release_single_field(&ext_iter);
+ }
+ ext = ext->next;
+ }
+ }
+ else if (PB_LTYPE_IS_SUBMSG(type) && PB_ATYPE(type) != PB_ATYPE_CALLBACK)
+ {
+ /* Release fields in submessage or submsg array */
+ pb_size_t count = 1;
+
+ if (PB_ATYPE(type) == PB_ATYPE_POINTER)
+ {
+ field->pData = *(void**)field->pField;
+ }
+ else
+ {
+ field->pData = field->pField;
+ }
+
+ if (PB_HTYPE(type) == PB_HTYPE_REPEATED)
+ {
+ count = *(pb_size_t*)field->pSize;
+
+ if (PB_ATYPE(type) == PB_ATYPE_STATIC && count > field->array_size)
+ {
+ /* Protect against corrupted _count fields */
+ count = field->array_size;
+ }
+ }
+
+ if (field->pData)
+ {
+ for (; count > 0; count--)
+ {
+ pb_release(field->submsg_desc, field->pData);
+ field->pData = (char*)field->pData + field->data_size;
+ }
+ }
+ }
+
+ if (PB_ATYPE(type) == PB_ATYPE_POINTER)
+ {
+ if (PB_HTYPE(type) == PB_HTYPE_REPEATED &&
+ (PB_LTYPE(type) == PB_LTYPE_STRING ||
+ PB_LTYPE(type) == PB_LTYPE_BYTES))
+ {
+ /* Release entries in repeated string or bytes array */
+ void **pItem = *(void***)field->pField;
+ pb_size_t count = *(pb_size_t*)field->pSize;
+ for (; count > 0; count--)
+ {
+ pb_free(*pItem);
+ *pItem++ = NULL;
+ }
+ }
+
+ if (PB_HTYPE(type) == PB_HTYPE_REPEATED)
+ {
+ /* We are going to release the array, so set the size to 0 */
+ *(pb_size_t*)field->pSize = 0;
+ }
+
+ /* Release main pointer */
+ pb_free(*(void**)field->pField);
+ *(void**)field->pField = NULL;
+ }
+}
+
+void pb_release(const pb_msgdesc_t *fields, void *dest_struct)
+{
+ pb_field_iter_t iter;
+
+ if (!dest_struct)
+ return; /* Ignore NULL pointers, similar to free() */
+
+ if (!pb_field_iter_begin(&iter, fields, dest_struct))
+ return; /* Empty message type */
+
+ do
+ {
+ pb_release_single_field(&iter);
+ } while (pb_field_iter_next(&iter));
+}
+#endif
+
+/* Field decoders */
+
+bool pb_decode_bool(pb_istream_t *stream, bool *dest)
+{
+ uint32_t value;
+ if (!pb_decode_varint32(stream, &value))
+ return false;
+
+ *(bool*)dest = (value != 0);
+ return true;
+}
+
+bool pb_decode_svarint(pb_istream_t *stream, pb_int64_t *dest)
+{
+ pb_uint64_t value;
+ if (!pb_decode_varint(stream, &value))
+ return false;
+
+ if (value & 1)
+ *dest = (pb_int64_t)(~(value >> 1));
+ else
+ *dest = (pb_int64_t)(value >> 1);
+
+ return true;
+}
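+
+/* Worked example (informative): zig-zag encoded value 4 decodes to 2;
+ * value 3 decodes to ~(3 >> 1) = -2. */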
+
+bool pb_decode_fixed32(pb_istream_t *stream, void *dest)
+{
+ union {
+ uint32_t fixed32;
+ pb_byte_t bytes[4];
+ } u;
+
+ if (!pb_read(stream, u.bytes, 4))
+ return false;
+
+#if defined(__BYTE_ORDER) && __BYTE_ORDER == __LITTLE_ENDIAN && CHAR_BIT == 8
+ /* fast path - if we know that we're on little endian, assign directly */
+ *(uint32_t*)dest = u.fixed32;
+#else
+ *(uint32_t*)dest = ((uint32_t)u.bytes[0] << 0) |
+ ((uint32_t)u.bytes[1] << 8) |
+ ((uint32_t)u.bytes[2] << 16) |
+ ((uint32_t)u.bytes[3] << 24);
+#endif
+ return true;
+}
+
+#ifndef PB_WITHOUT_64BIT
+bool pb_decode_fixed64(pb_istream_t *stream, void *dest)
+{
+ union {
+ uint64_t fixed64;
+ pb_byte_t bytes[8];
+ } u;
+
+ if (!pb_read(stream, u.bytes, 8))
+ return false;
+
+#if defined(__BYTE_ORDER) && __BYTE_ORDER == __LITTLE_ENDIAN && CHAR_BIT == 8
+ /* fast path - if we know that we're on little endian, assign directly */
+ *(uint64_t*)dest = u.fixed64;
+#else
+ *(uint64_t*)dest = ((uint64_t)u.bytes[0] << 0) |
+ ((uint64_t)u.bytes[1] << 8) |
+ ((uint64_t)u.bytes[2] << 16) |
+ ((uint64_t)u.bytes[3] << 24) |
+ ((uint64_t)u.bytes[4] << 32) |
+ ((uint64_t)u.bytes[5] << 40) |
+ ((uint64_t)u.bytes[6] << 48) |
+ ((uint64_t)u.bytes[7] << 56);
+#endif
+ return true;
+}
+#endif
+
+static bool checkreturn pb_dec_bool(pb_istream_t *stream, const pb_field_iter_t *field)
+{
+ return pb_decode_bool(stream, (bool*)field->pData);
+}
+
+static bool checkreturn pb_dec_varint(pb_istream_t *stream, const pb_field_iter_t *field)
+{
+ if (PB_LTYPE(field->type) == PB_LTYPE_UVARINT)
+ {
+ pb_uint64_t value, clamped;
+ if (!pb_decode_varint(stream, &value))
+ return false;
+
+ /* Cast to the proper field size, while checking for overflows */
+ if (field->data_size == sizeof(pb_uint64_t))
+ clamped = *(pb_uint64_t*)field->pData = value;
+ else if (field->data_size == sizeof(uint32_t))
+ clamped = *(uint32_t*)field->pData = (uint32_t)value;
+ else if (field->data_size == sizeof(uint_least16_t))
+ clamped = *(uint_least16_t*)field->pData = (uint_least16_t)value;
+ else if (field->data_size == sizeof(uint_least8_t))
+ clamped = *(uint_least8_t*)field->pData = (uint_least8_t)value;
+ else
+ PB_RETURN_ERROR(stream, "invalid data_size");
+
+ if (clamped != value)
+ PB_RETURN_ERROR(stream, "integer too large");
+
+ return true;
+ }
+ else
+ {
+ pb_uint64_t value;
+ pb_int64_t svalue;
+ pb_int64_t clamped;
+
+ if (PB_LTYPE(field->type) == PB_LTYPE_SVARINT)
+ {
+ if (!pb_decode_svarint(stream, &svalue))
+ return false;
+ }
+ else
+ {
+ if (!pb_decode_varint(stream, &value))
+ return false;
+
+ /* See issue 97: Google's C++ protobuf allows negative varint values to
+ * be cast as int32_t, instead of the int64_t that should be used when
+ * encoding. Nanopb versions before 0.2.5 had a bug in encoding. In order to
+ * not break decoding of such messages, we cast <=32 bit fields to
+ * int32_t first to get the sign correct.
+ */
+ if (field->data_size == sizeof(pb_int64_t))
+ svalue = (pb_int64_t)value;
+ else
+ svalue = (int32_t)value;
+ }
+
+ /* Cast to the proper field size, while checking for overflows */
+ if (field->data_size == sizeof(pb_int64_t))
+ clamped = *(pb_int64_t*)field->pData = svalue;
+ else if (field->data_size == sizeof(int32_t))
+ clamped = *(int32_t*)field->pData = (int32_t)svalue;
+ else if (field->data_size == sizeof(int_least16_t))
+ clamped = *(int_least16_t*)field->pData = (int_least16_t)svalue;
+ else if (field->data_size == sizeof(int_least8_t))
+ clamped = *(int_least8_t*)field->pData = (int_least8_t)svalue;
+ else
+ PB_RETURN_ERROR(stream, "invalid data_size");
+
+ if (clamped != svalue)
+ PB_RETURN_ERROR(stream, "integer too large");
+
+ return true;
+ }
+}
+
+static bool checkreturn pb_dec_bytes(pb_istream_t *stream, const pb_field_iter_t *field)
+{
+ uint32_t size;
+ size_t alloc_size;
+ pb_bytes_array_t *dest;
+
+ if (!pb_decode_varint32(stream, &size))
+ return false;
+
+ if (size > PB_SIZE_MAX)
+ PB_RETURN_ERROR(stream, "bytes overflow");
+
+ alloc_size = PB_BYTES_ARRAY_T_ALLOCSIZE(size);
+ if (size > alloc_size)
+ PB_RETURN_ERROR(stream, "size too large");
+
+ if (PB_ATYPE(field->type) == PB_ATYPE_POINTER)
+ {
+#ifndef PB_ENABLE_MALLOC
+ PB_RETURN_ERROR(stream, "no malloc support");
+#else
+ if (stream->bytes_left < size)
+ PB_RETURN_ERROR(stream, "end-of-stream");
+
+ if (!allocate_field(stream, field->pData, alloc_size, 1))
+ return false;
+ dest = *(pb_bytes_array_t**)field->pData;
+#endif
+ }
+ else
+ {
+ if (alloc_size > field->data_size)
+ PB_RETURN_ERROR(stream, "bytes overflow");
+ dest = (pb_bytes_array_t*)field->pData;
+ }
+
+ dest->size = (pb_size_t)size;
+ return pb_read(stream, dest->bytes, (size_t)size);
+}
+
+static bool checkreturn pb_dec_string(pb_istream_t *stream, const pb_field_iter_t *field)
+{
+ uint32_t size;
+ size_t alloc_size;
+ pb_byte_t *dest = (pb_byte_t*)field->pData;
+
+ if (!pb_decode_varint32(stream, &size))
+ return false;
+
+ if (size == (uint32_t)-1)
+ PB_RETURN_ERROR(stream, "size too large");
+
+ /* Space for null terminator */
+ alloc_size = (size_t)(size + 1);
+
+ if (alloc_size < size)
+ PB_RETURN_ERROR(stream, "size too large");
+
+ if (PB_ATYPE(field->type) == PB_ATYPE_POINTER)
+ {
+#ifndef PB_ENABLE_MALLOC
+ PB_RETURN_ERROR(stream, "no malloc support");
+#else
+ if (stream->bytes_left < size)
+ PB_RETURN_ERROR(stream, "end-of-stream");
+
+ if (!allocate_field(stream, field->pData, alloc_size, 1))
+ return false;
+ dest = *(pb_byte_t**)field->pData;
+#endif
+ }
+ else
+ {
+ if (alloc_size > field->data_size)
+ PB_RETURN_ERROR(stream, "string overflow");
+ }
+
+ dest[size] = 0;
+
+ if (!pb_read(stream, dest, (size_t)size))
+ return false;
+
+#ifdef PB_VALIDATE_UTF8
+ if (!pb_validate_utf8((const char*)dest))
+ PB_RETURN_ERROR(stream, "invalid utf8");
+#endif
+
+ return true;
+}
+
+static bool checkreturn pb_dec_submessage(pb_istream_t *stream, const pb_field_iter_t *field)
+{
+ bool status = true;
+ bool submsg_consumed = false;
+ pb_istream_t substream;
+
+ if (!pb_make_string_substream(stream, &substream))
+ return false;
+
+ if (field->submsg_desc == NULL)
+ PB_RETURN_ERROR(stream, "invalid field descriptor");
+
+ /* Submessages can have a separate message-level callback that is called
+ * before decoding the message. Typically it is used to set callback fields
+ * inside oneofs. */
+ if (PB_LTYPE(field->type) == PB_LTYPE_SUBMSG_W_CB && field->pSize != NULL)
+ {
+ /* Message callback is stored right before pSize. */
+ pb_callback_t *callback = (pb_callback_t*)field->pSize - 1;
+ if (callback->funcs.decode)
+ {
+ status = callback->funcs.decode(&substream, field, &callback->arg);
+
+ if (substream.bytes_left == 0)
+ {
+ submsg_consumed = true;
+ }
+ }
+ }
+
+ /* Now decode the submessage contents */
+ if (status && !submsg_consumed)
+ {
+ unsigned int flags = 0;
+
+ /* Static required/optional fields are already initialized by top-level
+ * pb_decode(), no need to initialize them again. */
+ if (PB_ATYPE(field->type) == PB_ATYPE_STATIC &&
+ PB_HTYPE(field->type) != PB_HTYPE_REPEATED)
+ {
+ flags = PB_DECODE_NOINIT;
+ }
+
+ status = pb_decode_inner(&substream, field->submsg_desc, field->pData, flags);
+ }
+
+ if (!pb_close_string_substream(stream, &substream))
+ return false;
+
+ return status;
+}
+
+static bool checkreturn pb_dec_fixed_length_bytes(pb_istream_t *stream, const pb_field_iter_t *field)
+{
+ uint32_t size;
+
+ if (!pb_decode_varint32(stream, &size))
+ return false;
+
+ if (size > PB_SIZE_MAX)
+ PB_RETURN_ERROR(stream, "bytes overflow");
+
+ if (size == 0)
+ {
+ /* As a special case, treat empty bytes string as all zeros for fixed_length_bytes. */
+ memset(field->pData, 0, (size_t)field->data_size);
+ return true;
+ }
+
+ if (size != field->data_size)
+ PB_RETURN_ERROR(stream, "incorrect fixed length bytes size");
+
+ return pb_read(stream, (pb_byte_t*)field->pData, (size_t)field->data_size);
+}
+
+#ifdef PB_CONVERT_DOUBLE_FLOAT
+bool pb_decode_double_as_float(pb_istream_t *stream, float *dest)
+{
+ uint_least8_t sign;
+ int exponent;
+ uint32_t mantissa;
+ uint64_t value;
+ union { float f; uint32_t i; } out;
+
+ if (!pb_decode_fixed64(stream, &value))
+ return false;
+
+ /* Decompose input value */
+ sign = (uint_least8_t)((value >> 63) & 1);
+ exponent = (int)((value >> 52) & 0x7FF) - 1023;
+ mantissa = (value >> 28) & 0xFFFFFF; /* Highest 24 bits */
+
+ /* Figure if value is in range representable by floats. */
+ if (exponent == 1024)
+ {
+ /* Special value */
+ exponent = 128;
+ mantissa >>= 1;
+ }
+ else
+ {
+ if (exponent > 127)
+ {
+ /* Too large, convert to infinity */
+ exponent = 128;
+ mantissa = 0;
+ }
+ else if (exponent < -150)
+ {
+ /* Too small, convert to zero */
+ exponent = -127;
+ mantissa = 0;
+ }
+ else if (exponent < -126)
+ {
+ /* Denormalized */
+ mantissa |= 0x1000000;
+ mantissa >>= (-126 - exponent);
+ exponent = -127;
+ }
+
+ /* Round off mantissa */
+ mantissa = (mantissa + 1) >> 1;
+
+ /* Check if mantissa went over 2.0 */
+ if (mantissa & 0x800000)
+ {
+ exponent += 1;
+ mantissa &= 0x7FFFFF;
+ mantissa >>= 1;
+ }
+ }
+
+ /* Combine fields */
+ out.i = mantissa;
+ out.i |= (uint32_t)(exponent + 127) << 23;
+ out.i |= (uint32_t)sign << 31;
+
+ *dest = out.f;
+ return true;
+}
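+
+/* Worked example (informative): decoding double 1.0, i.e. the fixed64 value
+ * 0x3FF0000000000000:
+ *   sign     = 0
+ *   exponent = 0x3FF - 1023 = 0
+ *   mantissa = top 24 fraction bits = 0, rounded: (0 + 1) >> 1 = 0
+ *   out.i    = 0 | (0 + 127) << 23 | 0 = 0x3F800000, i.e. 1.0f
+ */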
+#endif
diff --git a/security/container/protos/nanopb/pb_decode.h b/security/container/protos/nanopb/pb_decode.h
new file mode 100644
index 0000000..824acd4
--- /dev/null
+++ b/security/container/protos/nanopb/pb_decode.h
@@ -0,0 +1,199 @@
+/* pb_decode.h: Functions to decode protocol buffers. Depends on pb_decode.c.
+ * The main function is pb_decode. You also need an input stream, and the
+ * field descriptions created by nanopb_generator.py.
+ */
+
+#ifndef PB_DECODE_H_INCLUDED
+#define PB_DECODE_H_INCLUDED
+
+#include "pb.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/* Structure for defining custom input streams. You will need to provide
+ * a callback function to read the bytes from your storage, which can be
+ * for example a file or a network socket.
+ *
+ * The callback must conform to these rules:
+ *
+ * 1) Return false on IO errors. This will cause decoding to abort.
+ * 2) You can use state to store your own data (e.g. buffer pointer),
+ *    and rely on pb_read to verify that nobody reads past bytes_left.
+ * 3) Your callback may be used with substreams, in which case bytes_left
+ *    differs from that of the main stream. Don't use bytes_left to compute
+ * any pointers.
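+ *
+ * A minimal sketch of such a callback (informative; my_fd_read, storing the
+ * file descriptor in stream->state, and read(2) from <unistd.h> are
+ * illustrative assumptions, not nanopb API):
+ *
+ *    static bool my_fd_read(pb_istream_t *stream, pb_byte_t *buf, size_t count)
+ *    {
+ *        int fd = (int)(intptr_t)stream->state;
+ *        while (count > 0)
+ *        {
+ *            ssize_t r = read(fd, buf, count);
+ *            if (r <= 0)
+ *                return false;   (rule 1: abort decoding on IO error)
+ *            buf += r;
+ *            count -= (size_t)r;
+ *        }
+ *        return true;
+ *    }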
+ */
+struct pb_istream_s
+{
+#ifdef PB_BUFFER_ONLY
+ /* Callback pointer is not used in buffer-only configuration.
+ * Having an int pointer here allows binary compatibility but
+ * gives an error if someone tries to assign callback function.
+ */
+ int *callback;
+#else
+ bool (*callback)(pb_istream_t *stream, pb_byte_t *buf, size_t count);
+#endif
+
+ void *state; /* Free field for use by callback implementation */
+ size_t bytes_left;
+
+#ifndef PB_NO_ERRMSG
+ const char *errmsg;
+#endif
+};
+
+#ifndef PB_NO_ERRMSG
+#define PB_ISTREAM_EMPTY {0,0,0,0}
+#else
+#define PB_ISTREAM_EMPTY {0,0,0}
+#endif
+
+/***************************
+ * Main decoding functions *
+ ***************************/
+
+/* Decode a single protocol buffers message from input stream into a C structure.
+ * Returns true on success, false on any failure.
+ * The actual struct pointed to by dest must match the description in fields.
+ * Callback fields of the destination structure must be initialized by caller.
+ * All other fields will be initialized by this function.
+ *
+ * Example usage:
+ * MyMessage msg = {};
+ * uint8_t buffer[64];
+ * pb_istream_t stream;
+ *
+ * // ... read some data into buffer ...
+ *
+ * stream = pb_istream_from_buffer(buffer, count);
+ * pb_decode(&stream, MyMessage_fields, &msg);
+ */
+bool pb_decode(pb_istream_t *stream, const pb_msgdesc_t *fields, void *dest_struct);
+
+/* Extended version of pb_decode, with several options to control
+ * the decoding process:
+ *
+ * PB_DECODE_NOINIT: Do not initialize the fields to default values.
+ * This is slightly faster if you do not need the default
+ * values and instead initialize the structure to 0 using
+ * e.g. memset(). This can also be used for merging two
+ * messages, i.e. combine already existing data with new
+ * values.
+ *
+ * PB_DECODE_DELIMITED: Input message starts with the message size as varint.
+ * Corresponds to parseDelimitedFrom() in Google's
+ * protobuf API.
+ *
+ * PB_DECODE_NULLTERMINATED: Stop reading when field tag is read as 0. This allows
+ * reading null terminated messages.
+ *                           NOTE: Until nanopb-0.4.0, pb_decode() also allowed
+ * null-termination. This behaviour is not supported in
+ * most other protobuf implementations, so PB_DECODE_DELIMITED
+ * is a better option for compatibility.
+ *
+ * Multiple flags can be combined with bitwise or (| operator)
+ */
+#define PB_DECODE_NOINIT 0x01U
+#define PB_DECODE_DELIMITED 0x02U
+#define PB_DECODE_NULLTERMINATED 0x04U
+bool pb_decode_ex(pb_istream_t *stream, const pb_msgdesc_t *fields, void *dest_struct, unsigned int flags);
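+
+/* Example (informative), combining the flags described above:
+ *
+ *    status = pb_decode_ex(&stream, MyMessage_fields, &msg,
+ *                          PB_DECODE_DELIMITED | PB_DECODE_NOINIT);
+ */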
+
+/* Defines for backwards compatibility with code written before nanopb-0.4.0 */
+#define pb_decode_noinit(s,f,d) pb_decode_ex(s,f,d, PB_DECODE_NOINIT)
+#define pb_decode_delimited(s,f,d) pb_decode_ex(s,f,d, PB_DECODE_DELIMITED)
+#define pb_decode_delimited_noinit(s,f,d) pb_decode_ex(s,f,d, PB_DECODE_DELIMITED | PB_DECODE_NOINIT)
+#define pb_decode_nullterminated(s,f,d) pb_decode_ex(s,f,d, PB_DECODE_NULLTERMINATED)
+
+#ifdef PB_ENABLE_MALLOC
+/* Release any allocated pointer fields. If you use dynamic allocation, you should
+ * call this for any successfully decoded message when you are done with it. If
+ * pb_decode() returns with an error, the message is already released.
+ */
+void pb_release(const pb_msgdesc_t *fields, void *dest_struct);
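+
+/* Typical lifecycle (informative sketch; use_message is hypothetical):
+ *
+ *    if (pb_decode(&stream, MyMessage_fields, &msg))
+ *    {
+ *        use_message(&msg);
+ *        pb_release(MyMessage_fields, &msg);
+ *    }
+ *    (on failure, pb_decode() has already released the message)
+ */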
+#else
+/* Allocation is not supported, so release is no-op */
+#define pb_release(fields, dest_struct) PB_UNUSED(fields); PB_UNUSED(dest_struct);
+#endif
+
+
+/**************************************
+ * Functions for manipulating streams *
+ **************************************/
+
+/* Create an input stream for reading from a memory buffer.
+ *
+ * msglen should be the actual length of the message, not the full size of
+ * allocated buffer.
+ *
+ * Alternatively, you can use a custom stream that reads directly from e.g.
+ * a file or a network socket.
+ */
+pb_istream_t pb_istream_from_buffer(const pb_byte_t *buf, size_t msglen);
+
+/* Function to read from a pb_istream_t. You can use this if you need to
+ * read some custom header data, or to read data in field callbacks.
+ */
+bool pb_read(pb_istream_t *stream, pb_byte_t *buf, size_t count);
+
+
+/************************************************
+ * Helper functions for writing field callbacks *
+ ************************************************/
+
+/* Decode the tag for the next field in the stream. Gives the wire type and
+ * field tag. At end of the message, returns false and sets eof to true. */
+bool pb_decode_tag(pb_istream_t *stream, pb_wire_type_t *wire_type, uint32_t *tag, bool *eof);
+
+/* Skip the field payload data, given the wire type. */
+bool pb_skip_field(pb_istream_t *stream, pb_wire_type_t wire_type);
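+
+/* Informative sketch of a manual tag-reading loop, e.g. in a custom parser
+ * that skips every field:
+ *
+ *    pb_wire_type_t wire_type;
+ *    uint32_t tag;
+ *    bool eof;
+ *    while (pb_decode_tag(stream, &wire_type, &tag, &eof))
+ *    {
+ *        if (!pb_skip_field(stream, wire_type))
+ *            break;
+ *    }
+ */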
+
+/* Decode an integer in the varint format. This works for enum, int32,
+ * int64, uint32 and uint64 field types. */
+#ifndef PB_WITHOUT_64BIT
+bool pb_decode_varint(pb_istream_t *stream, uint64_t *dest);
+#else
+#define pb_decode_varint pb_decode_varint32
+#endif
+
+/* Decode an integer in the varint format. This works for enum, int32,
+ * and uint32 field types. */
+bool pb_decode_varint32(pb_istream_t *stream, uint32_t *dest);
+
+/* Decode a bool value in varint format. */
+bool pb_decode_bool(pb_istream_t *stream, bool *dest);
+
+/* Decode an integer in the zig-zagged svarint format. This works for sint32
+ * and sint64. */
+#ifndef PB_WITHOUT_64BIT
+bool pb_decode_svarint(pb_istream_t *stream, int64_t *dest);
+#else
+bool pb_decode_svarint(pb_istream_t *stream, int32_t *dest);
+#endif
+
+/* Decode a fixed32, sfixed32 or float value. You need to pass a pointer to
+ * a 4-byte wide C variable. */
+bool pb_decode_fixed32(pb_istream_t *stream, void *dest);
+
+#ifndef PB_WITHOUT_64BIT
+/* Decode a fixed64, sfixed64 or double value. You need to pass a pointer to
+ * an 8-byte wide C variable. */
+bool pb_decode_fixed64(pb_istream_t *stream, void *dest);
+#endif
+
+#ifdef PB_CONVERT_DOUBLE_FLOAT
+/* Decode a double value into float variable. */
+bool pb_decode_double_as_float(pb_istream_t *stream, float *dest);
+#endif
+
+/* Make a limited-length substream for reading a PB_WT_STRING field. */
+bool pb_make_string_substream(pb_istream_t *stream, pb_istream_t *substream);
+bool pb_close_string_substream(pb_istream_t *stream, pb_istream_t *substream);
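+
+/* Informative sketch, e.g. when manually parsing a length-prefixed payload
+ * after its tag has been read:
+ *
+ *    pb_istream_t substream;
+ *    if (!pb_make_string_substream(stream, &substream))
+ *        return false;
+ *    (read up to substream.bytes_left bytes from substream here)
+ *    if (!pb_close_string_substream(stream, &substream))
+ *        return false;
+ */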
+
+#ifdef __cplusplus
+} /* extern "C" */
+#endif
+
+#endif
diff --git a/security/container/protos/nanopb/pb_encode.c b/security/container/protos/nanopb/pb_encode.c
new file mode 100644
index 0000000..de716f7
--- /dev/null
+++ b/security/container/protos/nanopb/pb_encode.c
@@ -0,0 +1,987 @@
+/* pb_encode.c -- encode a protobuf using minimal resources
+ *
+ * 2011 Petteri Aimonen <jpa@kapsi.fi>
+ */
+
+#include "pb.h"
+#include "pb_encode.h"
+#include "pb_common.h"
+
+/* Use the GCC warn_unused_result attribute to check that all return values
+ * are propagated correctly. On other compilers, and on gcc before 3.4.0,
+ * the annotation is simply ignored.
+ */
+#if !defined(__GNUC__) || ( __GNUC__ < 3) || (__GNUC__ == 3 && __GNUC_MINOR__ < 4)
+ #define checkreturn
+#else
+ #define checkreturn __attribute__((warn_unused_result))
+#endif
+
+/**************************************
+ * Declarations internal to this file *
+ **************************************/
+static bool checkreturn buf_write(pb_ostream_t *stream, const pb_byte_t *buf, size_t count);
+static bool checkreturn encode_array(pb_ostream_t *stream, pb_field_iter_t *field);
+static bool checkreturn pb_check_proto3_default_value(const pb_field_iter_t *field);
+static bool checkreturn encode_basic_field(pb_ostream_t *stream, const pb_field_iter_t *field);
+static bool checkreturn encode_callback_field(pb_ostream_t *stream, const pb_field_iter_t *field);
+static bool checkreturn encode_field(pb_ostream_t *stream, pb_field_iter_t *field);
+static bool checkreturn encode_extension_field(pb_ostream_t *stream, const pb_field_iter_t *field);
+static bool checkreturn default_extension_encoder(pb_ostream_t *stream, const pb_extension_t *extension);
+static bool checkreturn pb_encode_varint_32(pb_ostream_t *stream, uint32_t low, uint32_t high);
+static bool checkreturn pb_enc_bool(pb_ostream_t *stream, const pb_field_iter_t *field);
+static bool checkreturn pb_enc_varint(pb_ostream_t *stream, const pb_field_iter_t *field);
+static bool checkreturn pb_enc_fixed(pb_ostream_t *stream, const pb_field_iter_t *field);
+static bool checkreturn pb_enc_bytes(pb_ostream_t *stream, const pb_field_iter_t *field);
+static bool checkreturn pb_enc_string(pb_ostream_t *stream, const pb_field_iter_t *field);
+static bool checkreturn pb_enc_submessage(pb_ostream_t *stream, const pb_field_iter_t *field);
+static bool checkreturn pb_enc_fixed_length_bytes(pb_ostream_t *stream, const pb_field_iter_t *field);
+
+#ifdef PB_WITHOUT_64BIT
+#define pb_int64_t int32_t
+#define pb_uint64_t uint32_t
+#else
+#define pb_int64_t int64_t
+#define pb_uint64_t uint64_t
+#endif
+
+/*******************************
+ * pb_ostream_t implementation *
+ *******************************/
+
+static bool checkreturn buf_write(pb_ostream_t *stream, const pb_byte_t *buf, size_t count)
+{
+ size_t i;
+ pb_byte_t *dest = (pb_byte_t*)stream->state;
+ stream->state = dest + count;
+
+ for (i = 0; i < count; i++)
+ dest[i] = buf[i];
+
+ return true;
+}
+
+pb_ostream_t pb_ostream_from_buffer(pb_byte_t *buf, size_t bufsize)
+{
+ pb_ostream_t stream;
+#ifdef PB_BUFFER_ONLY
+ stream.callback = (void*)1; /* Just a marker value */
+#else
+ stream.callback = &buf_write;
+#endif
+ stream.state = buf;
+ stream.max_size = bufsize;
+ stream.bytes_written = 0;
+#ifndef PB_NO_ERRMSG
+ stream.errmsg = NULL;
+#endif
+ return stream;
+}
+
+bool checkreturn pb_write(pb_ostream_t *stream, const pb_byte_t *buf, size_t count)
+{
+ if (count > 0 && stream->callback != NULL)
+ {
+ if (stream->bytes_written + count < stream->bytes_written ||
+ stream->bytes_written + count > stream->max_size)
+ {
+ PB_RETURN_ERROR(stream, "stream full");
+ }
+
+#ifdef PB_BUFFER_ONLY
+ if (!buf_write(stream, buf, count))
+ PB_RETURN_ERROR(stream, "io error");
+#else
+ if (!stream->callback(stream, buf, count))
+ PB_RETURN_ERROR(stream, "io error");
+#endif
+ }
+
+ stream->bytes_written += count;
+ return true;
+}
+
+/*************************
+ * Encode a single field *
+ *************************/
+
+/* Read a bool value without causing undefined behavior even if the value
+ * is invalid. See issue #434 and
+ * https://stackoverflow.com/questions/27661768/weird-results-for-conditional
+ */
+static bool safe_read_bool(const void *pSize)
+{
+ const char *p = (const char *)pSize;
+ size_t i;
+ for (i = 0; i < sizeof(bool); i++)
+ {
+ if (p[i] != 0)
+ return true;
+ }
+ return false;
+}
+
+/* Encode a static array. Handles the size calculations and possible packing. */
+static bool checkreturn encode_array(pb_ostream_t *stream, pb_field_iter_t *field)
+{
+ pb_size_t i;
+ pb_size_t count;
+#ifndef PB_ENCODE_ARRAYS_UNPACKED
+ size_t size;
+#endif
+
+ count = *(pb_size_t*)field->pSize;
+
+ if (count == 0)
+ return true;
+
+ if (PB_ATYPE(field->type) != PB_ATYPE_POINTER && count > field->array_size)
+ PB_RETURN_ERROR(stream, "array max size exceeded");
+
+#ifndef PB_ENCODE_ARRAYS_UNPACKED
+ /* We always pack arrays if the datatype allows it. */
+ if (PB_LTYPE(field->type) <= PB_LTYPE_LAST_PACKABLE)
+ {
+ if (!pb_encode_tag(stream, PB_WT_STRING, field->tag))
+ return false;
+
+ /* Determine the total size of packed array. */
+ if (PB_LTYPE(field->type) == PB_LTYPE_FIXED32)
+ {
+ size = 4 * (size_t)count;
+ }
+ else if (PB_LTYPE(field->type) == PB_LTYPE_FIXED64)
+ {
+ size = 8 * (size_t)count;
+ }
+ else
+ {
+ pb_ostream_t sizestream = PB_OSTREAM_SIZING;
+ void *pData_orig = field->pData;
+ for (i = 0; i < count; i++)
+ {
+ if (!pb_enc_varint(&sizestream, field))
+ PB_RETURN_ERROR(stream, PB_GET_ERROR(&sizestream));
+ field->pData = (char*)field->pData + field->data_size;
+ }
+ field->pData = pData_orig;
+ size = sizestream.bytes_written;
+ }
+
+ if (!pb_encode_varint(stream, (pb_uint64_t)size))
+ return false;
+
+ if (stream->callback == NULL)
+ return pb_write(stream, NULL, size); /* Just sizing */
+
+ /* Write the data */
+ for (i = 0; i < count; i++)
+ {
+ if (PB_LTYPE(field->type) == PB_LTYPE_FIXED32 || PB_LTYPE(field->type) == PB_LTYPE_FIXED64)
+ {
+ if (!pb_enc_fixed(stream, field))
+ return false;
+ }
+ else
+ {
+ if (!pb_enc_varint(stream, field))
+ return false;
+ }
+
+ field->pData = (char*)field->pData + field->data_size;
+ }
+ }
+ else /* Unpacked fields */
+#endif
+ {
+ for (i = 0; i < count; i++)
+ {
+ /* Normally the data is stored directly in the array entries, but
+ * for pointer-type string and bytes fields, the array entries are
+ * themselves pointers, so we have to dereference once more to get
+ * to the actual data. */
+ if (PB_ATYPE(field->type) == PB_ATYPE_POINTER &&
+ (PB_LTYPE(field->type) == PB_LTYPE_STRING ||
+ PB_LTYPE(field->type) == PB_LTYPE_BYTES))
+ {
+ bool status;
+ void *pData_orig = field->pData;
+ field->pData = *(void* const*)field->pData;
+
+ if (!field->pData)
+ {
+ /* Null pointer in array is treated as empty string / bytes */
+ status = pb_encode_tag_for_field(stream, field) &&
+ pb_encode_varint(stream, 0);
+ }
+ else
+ {
+ status = encode_basic_field(stream, field);
+ }
+
+ field->pData = pData_orig;
+
+ if (!status)
+ return false;
+ }
+ else
+ {
+ if (!encode_basic_field(stream, field))
+ return false;
+ }
+ field->pData = (char*)field->pData + field->data_size;
+ }
+ }
+
+ return true;
+}
+
+/* In proto3, all fields are optional and are only encoded if their value is "non-zero".
+ * This function implements the check for the zero value. */
+static bool checkreturn pb_check_proto3_default_value(const pb_field_iter_t *field)
+{
+ pb_type_t type = field->type;
+
+ if (PB_ATYPE(type) == PB_ATYPE_STATIC)
+ {
+ if (PB_HTYPE(type) == PB_HTYPE_REQUIRED)
+ {
+ /* Required proto2 fields inside proto3 submessage, pretty rare case */
+ return false;
+ }
+ else if (PB_HTYPE(type) == PB_HTYPE_REPEATED)
+ {
+ /* Repeated fields inside proto3 submessage: present if count != 0 */
+ return *(const pb_size_t*)field->pSize == 0;
+ }
+ else if (PB_HTYPE(type) == PB_HTYPE_ONEOF)
+ {
+ /* Oneof fields */
+ return *(const pb_size_t*)field->pSize == 0;
+ }
+ else if (PB_HTYPE(type) == PB_HTYPE_OPTIONAL && field->pSize != NULL)
+ {
+ /* Proto2 optional fields inside proto3 message, or proto3
+ * submessage fields. */
+ return safe_read_bool(field->pSize) == false;
+ }
+ else if (field->descriptor->default_value)
+ {
+ /* Proto3 messages do not have default values, but proto2 messages
+ * can contain optional fields without has_fields (generator option 'proto3').
+ * In this case they must always be encoded, to make sure that the
+ * non-zero default value is overwritten.
+ */
+ return false;
+ }
+
+ /* Rest is proto3 singular fields */
+ if (PB_LTYPE(type) <= PB_LTYPE_LAST_PACKABLE)
+ {
+ /* Simple integer / float fields */
+ pb_size_t i;
+ const char *p = (const char*)field->pData;
+ for (i = 0; i < field->data_size; i++)
+ {
+ if (p[i] != 0)
+ {
+ return false;
+ }
+ }
+
+ return true;
+ }
+ else if (PB_LTYPE(type) == PB_LTYPE_BYTES)
+ {
+ const pb_bytes_array_t *bytes = (const pb_bytes_array_t*)field->pData;
+ return bytes->size == 0;
+ }
+ else if (PB_LTYPE(type) == PB_LTYPE_STRING)
+ {
+ return *(const char*)field->pData == '\0';
+ }
+ else if (PB_LTYPE(type) == PB_LTYPE_FIXED_LENGTH_BYTES)
+ {
+ /* A fixed-length bytes field is only empty if its length is
+ * fixed at 0, which would be unusual, but we can check it
+ * anyway. */
+ return field->data_size == 0;
+ }
+ else if (PB_LTYPE_IS_SUBMSG(type))
+ {
+ /* Check all fields in the submessage to find if any of them
+ * are non-zero. The comparison cannot be done byte-per-byte
+ * because the C struct may contain padding bytes that must
+ * be skipped. Note that usually proto3 submessages have
+ * a separate has_field that is checked earlier in this if.
+ */
+ pb_field_iter_t iter;
+ if (pb_field_iter_begin(&iter, field->submsg_desc, field->pData))
+ {
+ do
+ {
+ if (!pb_check_proto3_default_value(&iter))
+ {
+ return false;
+ }
+ } while (pb_field_iter_next(&iter));
+ }
+ return true;
+ }
+ }
+ else if (PB_ATYPE(type) == PB_ATYPE_POINTER)
+ {
+ return field->pData == NULL;
+ }
+ else if (PB_ATYPE(type) == PB_ATYPE_CALLBACK)
+ {
+ if (PB_LTYPE(type) == PB_LTYPE_EXTENSION)
+ {
+ const pb_extension_t *extension = *(const pb_extension_t* const *)field->pData;
+ return extension == NULL;
+ }
+ else if (field->descriptor->field_callback == pb_default_field_callback)
+ {
+ pb_callback_t *pCallback = (pb_callback_t*)field->pData;
+ return pCallback->funcs.encode == NULL;
+ }
+ else
+ {
+ return field->descriptor->field_callback == NULL;
+ }
+ }
+
+ return false; /* Not typically reached, safe default for weird special cases. */
+}
+
+/* Encode a field with static or pointer allocation, i.e. one whose data
+ * is available to the encoder directly. */
+static bool checkreturn encode_basic_field(pb_ostream_t *stream, const pb_field_iter_t *field)
+{
+ if (!field->pData)
+ {
+ /* Missing pointer field */
+ return true;
+ }
+
+ if (!pb_encode_tag_for_field(stream, field))
+ return false;
+
+ switch (PB_LTYPE(field->type))
+ {
+ case PB_LTYPE_BOOL:
+ return pb_enc_bool(stream, field);
+
+ case PB_LTYPE_VARINT:
+ case PB_LTYPE_UVARINT:
+ case PB_LTYPE_SVARINT:
+ return pb_enc_varint(stream, field);
+
+ case PB_LTYPE_FIXED32:
+ case PB_LTYPE_FIXED64:
+ return pb_enc_fixed(stream, field);
+
+ case PB_LTYPE_BYTES:
+ return pb_enc_bytes(stream, field);
+
+ case PB_LTYPE_STRING:
+ return pb_enc_string(stream, field);
+
+ case PB_LTYPE_SUBMESSAGE:
+ case PB_LTYPE_SUBMSG_W_CB:
+ return pb_enc_submessage(stream, field);
+
+ case PB_LTYPE_FIXED_LENGTH_BYTES:
+ return pb_enc_fixed_length_bytes(stream, field);
+
+ default:
+ PB_RETURN_ERROR(stream, "invalid field type");
+ }
+}
+
+/* Encode a field with callback semantics. This means that a user function is
+ * called to provide and encode the actual data. */
+static bool checkreturn encode_callback_field(pb_ostream_t *stream, const pb_field_iter_t *field)
+{
+ if (field->descriptor->field_callback != NULL)
+ {
+ if (!field->descriptor->field_callback(NULL, stream, field))
+ PB_RETURN_ERROR(stream, "callback error");
+ }
+ return true;
+}
+
+/* Encode a single field of any callback, pointer or static type. */
+static bool checkreturn encode_field(pb_ostream_t *stream, pb_field_iter_t *field)
+{
+ /* Check field presence */
+ if (PB_HTYPE(field->type) == PB_HTYPE_ONEOF)
+ {
+ if (*(const pb_size_t*)field->pSize != field->tag)
+ {
+ /* Different type oneof field */
+ return true;
+ }
+ }
+ else if (PB_HTYPE(field->type) == PB_HTYPE_OPTIONAL)
+ {
+ if (field->pSize)
+ {
+ if (safe_read_bool(field->pSize) == false)
+ {
+ /* Missing optional field */
+ return true;
+ }
+ }
+ else if (PB_ATYPE(field->type) == PB_ATYPE_STATIC)
+ {
+ /* Proto3 singular field */
+ if (pb_check_proto3_default_value(field))
+ return true;
+ }
+ }
+
+ if (!field->pData)
+ {
+ if (PB_HTYPE(field->type) == PB_HTYPE_REQUIRED)
+ PB_RETURN_ERROR(stream, "missing required field");
+
+ /* Pointer field set to NULL */
+ return true;
+ }
+
+ /* Then encode field contents */
+ if (PB_ATYPE(field->type) == PB_ATYPE_CALLBACK)
+ {
+ return encode_callback_field(stream, field);
+ }
+ else if (PB_HTYPE(field->type) == PB_HTYPE_REPEATED)
+ {
+ return encode_array(stream, field);
+ }
+ else
+ {
+ return encode_basic_field(stream, field);
+ }
+}
+
+/* Default handler for extension fields. Expects to have a pb_msgdesc_t
+ * pointer in the extension->type->arg field, pointing to a message with
+ * only one field in it. */
+static bool checkreturn default_extension_encoder(pb_ostream_t *stream, const pb_extension_t *extension)
+{
+ pb_field_iter_t iter;
+
+ if (!pb_field_iter_begin_extension_const(&iter, extension))
+ PB_RETURN_ERROR(stream, "invalid extension");
+
+ return encode_field(stream, &iter);
+}
+
+
+/* Walk through all the registered extensions and give them a chance
+ * to encode themselves. */
+static bool checkreturn encode_extension_field(pb_ostream_t *stream, const pb_field_iter_t *field)
+{
+ const pb_extension_t *extension = *(const pb_extension_t* const *)field->pData;
+
+ while (extension)
+ {
+ bool status;
+ if (extension->type->encode)
+ status = extension->type->encode(stream, extension);
+ else
+ status = default_extension_encoder(stream, extension);
+
+ if (!status)
+ return false;
+
+ extension = extension->next;
+ }
+
+ return true;
+}
+
+/*********************
+ * Encode all fields *
+ *********************/
+
+bool checkreturn pb_encode(pb_ostream_t *stream, const pb_msgdesc_t *fields, const void *src_struct)
+{
+ pb_field_iter_t iter;
+ if (!pb_field_iter_begin_const(&iter, fields, src_struct))
+ return true; /* Empty message type */
+
+ do {
+ if (PB_LTYPE(iter.type) == PB_LTYPE_EXTENSION)
+ {
+ /* Special case for the extension field placeholder */
+ if (!encode_extension_field(stream, &iter))
+ return false;
+ }
+ else
+ {
+ /* Regular field */
+ if (!encode_field(stream, &iter))
+ return false;
+ }
+ } while (pb_field_iter_next(&iter));
+
+ return true;
+}
+
+bool checkreturn pb_encode_ex(pb_ostream_t *stream, const pb_msgdesc_t *fields, const void *src_struct, unsigned int flags)
+{
+ if ((flags & PB_ENCODE_DELIMITED) != 0)
+ {
+ return pb_encode_submessage(stream, fields, src_struct);
+ }
+ else if ((flags & PB_ENCODE_NULLTERMINATED) != 0)
+ {
+ const pb_byte_t zero = 0;
+
+ if (!pb_encode(stream, fields, src_struct))
+ return false;
+
+ return pb_write(stream, &zero, 1);
+ }
+ else
+ {
+ return pb_encode(stream, fields, src_struct);
+ }
+}
+
+bool pb_get_encoded_size(size_t *size, const pb_msgdesc_t *fields, const void *src_struct)
+{
+ pb_ostream_t stream = PB_OSTREAM_SIZING;
+
+ if (!pb_encode(&stream, fields, src_struct))
+ return false;
+
+ *size = stream.bytes_written;
+ return true;
+}
+
+/********************
+ * Helper functions *
+ ********************/
+
+/* This function avoids 64-bit shifts as they are quite slow on many platforms. */
+static bool checkreturn pb_encode_varint_32(pb_ostream_t *stream, uint32_t low, uint32_t high)
+{
+ size_t i = 0;
+ pb_byte_t buffer[10];
+ pb_byte_t byte = (pb_byte_t)(low & 0x7F);
+ low >>= 7;
+
+ while (i < 4 && (low != 0 || high != 0))
+ {
+ byte |= 0x80;
+ buffer[i++] = byte;
+ byte = (pb_byte_t)(low & 0x7F);
+ low >>= 7;
+ }
+
+ if (high)
+ {
+ byte = (pb_byte_t)(byte | ((high & 0x07) << 4));
+ high >>= 3;
+
+ while (high)
+ {
+ byte |= 0x80;
+ buffer[i++] = byte;
+ byte = (pb_byte_t)(high & 0x7F);
+ high >>= 7;
+ }
+ }
+
+ buffer[i++] = byte;
+
+ return pb_write(stream, buffer, i);
+}
+
+bool checkreturn pb_encode_varint(pb_ostream_t *stream, pb_uint64_t value)
+{
+ if (value <= 0x7F)
+ {
+ /* Fast path: single byte */
+ pb_byte_t byte = (pb_byte_t)value;
+ return pb_write(stream, &byte, 1);
+ }
+ else
+ {
+#ifdef PB_WITHOUT_64BIT
+ return pb_encode_varint_32(stream, value, 0);
+#else
+ return pb_encode_varint_32(stream, (uint32_t)value, (uint32_t)(value >> 32));
+#endif
+ }
+}
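+
+/* Worked example of the varint wire format: the value 300 (binary
+ * 100101100) is split into seven-bit groups from the least significant
+ * end and emitted as 0xAC 0x02, with the high bit set on every byte
+ * except the last. */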
+
+bool checkreturn pb_encode_svarint(pb_ostream_t *stream, pb_int64_t value)
+{
+ pb_uint64_t zigzagged;
+ if (value < 0)
+ zigzagged = ~((pb_uint64_t)value << 1);
+ else
+ zigzagged = (pb_uint64_t)value << 1;
+
+ return pb_encode_varint(stream, zigzagged);
+}
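+
+/* The zig-zag mapping interleaves signed values into unsigned ones
+ * (0 -> 0, -1 -> 1, 1 -> 2, -2 -> 3, ...) so that small magnitudes of
+ * either sign encode into few varint bytes. */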
+
+bool checkreturn pb_encode_fixed32(pb_ostream_t *stream, const void *value)
+{
+ uint32_t val = *(const uint32_t*)value;
+ pb_byte_t bytes[4];
+ bytes[0] = (pb_byte_t)(val & 0xFF);
+ bytes[1] = (pb_byte_t)((val >> 8) & 0xFF);
+ bytes[2] = (pb_byte_t)((val >> 16) & 0xFF);
+ bytes[3] = (pb_byte_t)((val >> 24) & 0xFF);
+ return pb_write(stream, bytes, 4);
+}
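+
+/* Example: the value 0x12345678 is written as the bytes 78 56 34 12,
+ * i.e. little-endian regardless of host byte order. */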
+
+#ifndef PB_WITHOUT_64BIT
+bool checkreturn pb_encode_fixed64(pb_ostream_t *stream, const void *value)
+{
+ uint64_t val = *(const uint64_t*)value;
+ pb_byte_t bytes[8];
+ bytes[0] = (pb_byte_t)(val & 0xFF);
+ bytes[1] = (pb_byte_t)((val >> 8) & 0xFF);
+ bytes[2] = (pb_byte_t)((val >> 16) & 0xFF);
+ bytes[3] = (pb_byte_t)((val >> 24) & 0xFF);
+ bytes[4] = (pb_byte_t)((val >> 32) & 0xFF);
+ bytes[5] = (pb_byte_t)((val >> 40) & 0xFF);
+ bytes[6] = (pb_byte_t)((val >> 48) & 0xFF);
+ bytes[7] = (pb_byte_t)((val >> 56) & 0xFF);
+ return pb_write(stream, bytes, 8);
+}
+#endif
+
+bool checkreturn pb_encode_tag(pb_ostream_t *stream, pb_wire_type_t wiretype, uint32_t field_number)
+{
+ pb_uint64_t tag = ((pb_uint64_t)field_number << 3) | wiretype;
+ return pb_encode_varint(stream, tag);
+}
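+
+/* Example: field number 1 with wire type PB_WT_STRING (2) produces the
+ * key (1 << 3) | 2 = 0x0A, a familiar first byte in encoded messages. */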
+
+bool checkreturn pb_encode_tag_for_field(pb_ostream_t *stream, const pb_field_iter_t *field)
+{
+ pb_wire_type_t wiretype;
+ switch (PB_LTYPE(field->type))
+ {
+ case PB_LTYPE_BOOL:
+ case PB_LTYPE_VARINT:
+ case PB_LTYPE_UVARINT:
+ case PB_LTYPE_SVARINT:
+ wiretype = PB_WT_VARINT;
+ break;
+
+ case PB_LTYPE_FIXED32:
+ wiretype = PB_WT_32BIT;
+ break;
+
+ case PB_LTYPE_FIXED64:
+ wiretype = PB_WT_64BIT;
+ break;
+
+ case PB_LTYPE_BYTES:
+ case PB_LTYPE_STRING:
+ case PB_LTYPE_SUBMESSAGE:
+ case PB_LTYPE_SUBMSG_W_CB:
+ case PB_LTYPE_FIXED_LENGTH_BYTES:
+ wiretype = PB_WT_STRING;
+ break;
+
+ default:
+ PB_RETURN_ERROR(stream, "invalid field type");
+ }
+
+ return pb_encode_tag(stream, wiretype, field->tag);
+}
+
+bool checkreturn pb_encode_string(pb_ostream_t *stream, const pb_byte_t *buffer, size_t size)
+{
+ if (!pb_encode_varint(stream, (pb_uint64_t)size))
+ return false;
+
+ return pb_write(stream, buffer, size);
+}
+
+bool checkreturn pb_encode_submessage(pb_ostream_t *stream, const pb_msgdesc_t *fields, const void *src_struct)
+{
+ /* First calculate the message size using a non-writing substream. */
+ pb_ostream_t substream = PB_OSTREAM_SIZING;
+ size_t size;
+ bool status;
+
+ if (!pb_encode(&substream, fields, src_struct))
+ {
+#ifndef PB_NO_ERRMSG
+ stream->errmsg = substream.errmsg;
+#endif
+ return false;
+ }
+
+ size = substream.bytes_written;
+
+ if (!pb_encode_varint(stream, (pb_uint64_t)size))
+ return false;
+
+ if (stream->callback == NULL)
+ return pb_write(stream, NULL, size); /* Just sizing */
+
+ if (stream->bytes_written + size > stream->max_size)
+ PB_RETURN_ERROR(stream, "stream full");
+
+ /* Use a substream to verify that a callback doesn't write more than
+ * what it did the first time. */
+ substream.callback = stream->callback;
+ substream.state = stream->state;
+ substream.max_size = size;
+ substream.bytes_written = 0;
+#ifndef PB_NO_ERRMSG
+ substream.errmsg = NULL;
+#endif
+
+ status = pb_encode(&substream, fields, src_struct);
+
+ stream->bytes_written += substream.bytes_written;
+ stream->state = substream.state;
+#ifndef PB_NO_ERRMSG
+ stream->errmsg = substream.errmsg;
+#endif
+
+ if (substream.bytes_written != size)
+ PB_RETURN_ERROR(stream, "submsg size changed");
+
+ return status;
+}
+
+/* Field encoders */
+
+static bool checkreturn pb_enc_bool(pb_ostream_t *stream, const pb_field_iter_t *field)
+{
+ uint32_t value = safe_read_bool(field->pData) ? 1 : 0;
+ PB_UNUSED(field);
+ return pb_encode_varint(stream, value);
+}
+
+static bool checkreturn pb_enc_varint(pb_ostream_t *stream, const pb_field_iter_t *field)
+{
+ if (PB_LTYPE(field->type) == PB_LTYPE_UVARINT)
+ {
+ /* Perform unsigned integer extension */
+ pb_uint64_t value = 0;
+
+ if (field->data_size == sizeof(uint_least8_t))
+ value = *(const uint_least8_t*)field->pData;
+ else if (field->data_size == sizeof(uint_least16_t))
+ value = *(const uint_least16_t*)field->pData;
+ else if (field->data_size == sizeof(uint32_t))
+ value = *(const uint32_t*)field->pData;
+ else if (field->data_size == sizeof(pb_uint64_t))
+ value = *(const pb_uint64_t*)field->pData;
+ else
+ PB_RETURN_ERROR(stream, "invalid data_size");
+
+ return pb_encode_varint(stream, value);
+ }
+ else
+ {
+ /* Perform signed integer extension */
+ pb_int64_t value = 0;
+
+ if (field->data_size == sizeof(int_least8_t))
+ value = *(const int_least8_t*)field->pData;
+ else if (field->data_size == sizeof(int_least16_t))
+ value = *(const int_least16_t*)field->pData;
+ else if (field->data_size == sizeof(int32_t))
+ value = *(const int32_t*)field->pData;
+ else if (field->data_size == sizeof(pb_int64_t))
+ value = *(const pb_int64_t*)field->pData;
+ else
+ PB_RETURN_ERROR(stream, "invalid data_size");
+
+ if (PB_LTYPE(field->type) == PB_LTYPE_SVARINT)
+ return pb_encode_svarint(stream, value);
+#ifdef PB_WITHOUT_64BIT
+ else if (value < 0)
+ return pb_encode_varint_32(stream, (uint32_t)value, (uint32_t)-1);
+#endif
+ else
+ return pb_encode_varint(stream, (pb_uint64_t)value);
+
+ }
+}
+
+static bool checkreturn pb_enc_fixed(pb_ostream_t *stream, const pb_field_iter_t *field)
+{
+#ifdef PB_CONVERT_DOUBLE_FLOAT
+ if (field->data_size == sizeof(float) && PB_LTYPE(field->type) == PB_LTYPE_FIXED64)
+ {
+ return pb_encode_float_as_double(stream, *(float*)field->pData);
+ }
+#endif
+
+ if (field->data_size == sizeof(uint32_t))
+ {
+ return pb_encode_fixed32(stream, field->pData);
+ }
+#ifndef PB_WITHOUT_64BIT
+ else if (field->data_size == sizeof(uint64_t))
+ {
+ return pb_encode_fixed64(stream, field->pData);
+ }
+#endif
+ else
+ {
+ PB_RETURN_ERROR(stream, "invalid data_size");
+ }
+}
+
+static bool checkreturn pb_enc_bytes(pb_ostream_t *stream, const pb_field_iter_t *field)
+{
+ const pb_bytes_array_t *bytes = NULL;
+
+ bytes = (const pb_bytes_array_t*)field->pData;
+
+ if (bytes == NULL)
+ {
+ /* Treat null pointer as an empty bytes field */
+ return pb_encode_string(stream, NULL, 0);
+ }
+
+ if (PB_ATYPE(field->type) == PB_ATYPE_STATIC &&
+ bytes->size > field->data_size - offsetof(pb_bytes_array_t, bytes))
+ {
+ PB_RETURN_ERROR(stream, "bytes size exceeded");
+ }
+
+ return pb_encode_string(stream, bytes->bytes, (size_t)bytes->size);
+}
+
+static bool checkreturn pb_enc_string(pb_ostream_t *stream, const pb_field_iter_t *field)
+{
+ size_t size = 0;
+ size_t max_size = (size_t)field->data_size;
+ const char *str = (const char*)field->pData;
+
+ if (PB_ATYPE(field->type) == PB_ATYPE_POINTER)
+ {
+ max_size = (size_t)-1;
+ }
+ else
+ {
+ /* pb_dec_string() assumes string fields end with a null
+ * terminator when the type isn't PB_ATYPE_POINTER, so we
+ * shouldn't allow more than max-1 bytes to be written to
+ * allow space for the null terminator.
+ */
+ if (max_size == 0)
+ PB_RETURN_ERROR(stream, "zero-length string");
+
+ max_size -= 1;
+ }
+
+ if (str == NULL)
+ {
+ size = 0; /* Treat null pointer as an empty string */
+ }
+ else
+ {
+ const char *p = str;
+
+ /* strnlen() is not always available, so just use a loop */
+ while (size < max_size && *p != '\0')
+ {
+ size++;
+ p++;
+ }
+
+ if (*p != '\0')
+ {
+ PB_RETURN_ERROR(stream, "unterminated string");
+ }
+ }
+
+#ifdef PB_VALIDATE_UTF8
+ if (!pb_validate_utf8(str))
+ PB_RETURN_ERROR(stream, "invalid utf8");
+#endif
+
+ return pb_encode_string(stream, (const pb_byte_t*)str, size);
+}
+
+static bool checkreturn pb_enc_submessage(pb_ostream_t *stream, const pb_field_iter_t *field)
+{
+ if (field->submsg_desc == NULL)
+ PB_RETURN_ERROR(stream, "invalid field descriptor");
+
+ if (PB_LTYPE(field->type) == PB_LTYPE_SUBMSG_W_CB && field->pSize != NULL)
+ {
+ /* Message callback is stored right before pSize. */
+ pb_callback_t *callback = (pb_callback_t*)field->pSize - 1;
+ if (callback->funcs.encode)
+ {
+ if (!callback->funcs.encode(stream, field, &callback->arg))
+ return false;
+ }
+ }
+
+ return pb_encode_submessage(stream, field->submsg_desc, field->pData);
+}
+
+static bool checkreturn pb_enc_fixed_length_bytes(pb_ostream_t *stream, const pb_field_iter_t *field)
+{
+ return pb_encode_string(stream, (const pb_byte_t*)field->pData, (size_t)field->data_size);
+}
+
+#ifdef PB_CONVERT_DOUBLE_FLOAT
+bool pb_encode_float_as_double(pb_ostream_t *stream, float value)
+{
+ union { float f; uint32_t i; } in;
+ uint_least8_t sign;
+ int exponent;
+ uint64_t mantissa;
+
+ in.f = value;
+
+ /* Decompose input value */
+ sign = (uint_least8_t)((in.i >> 31) & 1);
+ exponent = (int)((in.i >> 23) & 0xFF) - 127;
+ mantissa = in.i & 0x7FFFFF;
+
+ if (exponent == 128)
+ {
+ /* Special value (NaN etc.) */
+ exponent = 1024;
+ }
+ else if (exponent == -127)
+ {
+ if (!mantissa)
+ {
+ /* Zero */
+ exponent = -1023;
+ }
+ else
+ {
+ /* Denormalized */
+ mantissa <<= 1;
+ while (!(mantissa & 0x800000))
+ {
+ mantissa <<= 1;
+ exponent--;
+ }
+ mantissa &= 0x7FFFFF;
+ }
+ }
+
+ /* Combine fields */
+ mantissa <<= 29;
+ mantissa |= (uint64_t)(exponent + 1023) << 52;
+ mantissa |= (uint64_t)sign << 63;
+
+ return pb_encode_fixed64(stream, &mantissa);
+}
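+
+/* Worked example: 1.0f has the bit pattern 0x3F800000 (sign 0, biased
+ * exponent 127, mantissa 0). Re-biasing the exponent to 1023 and widening
+ * the mantissa by 29 bits yields 0x3FF0000000000000, the double 1.0. */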
+#endif
diff --git a/security/container/protos/nanopb/pb_encode.h b/security/container/protos/nanopb/pb_encode.h
new file mode 100644
index 0000000..9cff22a
--- /dev/null
+++ b/security/container/protos/nanopb/pb_encode.h
@@ -0,0 +1,185 @@
+/* pb_encode.h: Functions to encode protocol buffers. Depends on pb_encode.c.
+ * The main function is pb_encode. You also need an output stream, and the
+ * field descriptions created by nanopb_generator.py.
+ */
+
+#ifndef PB_ENCODE_H_INCLUDED
+#define PB_ENCODE_H_INCLUDED
+
+#include "pb.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/* Structure for defining custom output streams. You will need to provide
+ * a callback function to write the bytes to your storage, which can be
+ * for example a file or a network socket.
+ *
+ * The callback must conform to these rules:
+ *
+ * 1) Return false on IO errors. This will cause encoding to abort.
+ * 2) You can use state to store your own data (e.g. buffer pointer).
+ * 3) pb_write will update bytes_written after your callback runs.
+ * 4) Substreams will modify max_size and bytes_written. Don't use them
+ * to calculate any pointers.
+ */
+struct pb_ostream_s
+{
+#ifdef PB_BUFFER_ONLY
+ /* Callback pointer is not used in buffer-only configuration.
+ * Having an int pointer here allows binary compatibility but
+ * gives an error if someone tries to assign a callback function.
+ * Also, NULL pointer marks a 'sizing stream' that does not
+ * write anything.
+ */
+ int *callback;
+#else
+ bool (*callback)(pb_ostream_t *stream, const pb_byte_t *buf, size_t count);
+#endif
+ void *state; /* Free field for use by callback implementation. */
+ size_t max_size; /* Limit number of output bytes written (or use SIZE_MAX). */
+ size_t bytes_written; /* Number of bytes written so far. */
+
+#ifndef PB_NO_ERRMSG
+ const char *errmsg;
+#endif
+};
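+
+/* A minimal sketch of a custom output stream (illustrative; the
+ * write_to_socket() transport and `fd` handle are assumptions, not part
+ * of nanopb):
+ *
+ *   static bool my_write(pb_ostream_t *stream, const pb_byte_t *buf,
+ *                        size_t count)
+ *   {
+ *       int fd = (intptr_t)stream->state;
+ *       return write_to_socket(fd, buf, count);
+ *   }
+ *
+ *   pb_ostream_t stream = {&my_write, (void*)(intptr_t)fd, SIZE_MAX, 0};
+ */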
+
+/***************************
+ * Main encoding functions *
+ ***************************/
+
+/* Encode a single protocol buffers message from C structure into a stream.
+ * Returns true on success, false on any failure.
+ * The actual struct pointed to by src_struct must match the description in fields.
+ * All required fields in the struct are assumed to have been filled in.
+ *
+ * Example usage:
+ * MyMessage msg = {};
+ * uint8_t buffer[64];
+ * pb_ostream_t stream;
+ *
+ * msg.field1 = 42;
+ * stream = pb_ostream_from_buffer(buffer, sizeof(buffer));
+ * pb_encode(&stream, MyMessage_fields, &msg);
+ */
+bool pb_encode(pb_ostream_t *stream, const pb_msgdesc_t *fields, const void *src_struct);
+
+/* Extended version of pb_encode, with several options to control the
+ * encoding process:
+ *
+ * PB_ENCODE_DELIMITED: Prepend the length of message as a varint.
+ * Corresponds to writeDelimitedTo() in Google's
+ * protobuf API.
+ *
+ * PB_ENCODE_NULLTERMINATED: Append a null byte to the message for termination.
+ * NOTE: This behaviour is not supported in most other
+ * protobuf implementations, so PB_ENCODE_DELIMITED
+ * is a better option for compatibility.
+ */
+#define PB_ENCODE_DELIMITED 0x02U
+#define PB_ENCODE_NULLTERMINATED 0x04U
+bool pb_encode_ex(pb_ostream_t *stream, const pb_msgdesc_t *fields, const void *src_struct, unsigned int flags);
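+
+/* Illustrative sketch: length-delimited framing for a byte stream, which
+ * a receiver using Google's parseDelimitedFrom() can consume:
+ *
+ *   if (!pb_encode_ex(&stream, MyMessage_fields, &msg, PB_ENCODE_DELIMITED))
+ *       return false;
+ */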
+
+/* Defines for backwards compatibility with code written before nanopb-0.4.0 */
+#define pb_encode_delimited(s,f,d) pb_encode_ex(s,f,d, PB_ENCODE_DELIMITED)
+#define pb_encode_nullterminated(s,f,d) pb_encode_ex(s,f,d, PB_ENCODE_NULLTERMINATED)
+
+/* Encode the message to get the size of the encoded data, but do not store
+ * the data. */
+bool pb_get_encoded_size(size_t *size, const pb_msgdesc_t *fields, const void *src_struct);
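+
+/* Illustrative sketch: size the message before choosing a buffer (buffer
+ * management is left to the caller):
+ *
+ *   size_t size;
+ *   if (!pb_get_encoded_size(&size, MyMessage_fields, &msg))
+ *       return false;
+ *   ... size now equals what pb_encode() would write for msg ...
+ */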
+
+/**************************************
+ * Functions for manipulating streams *
+ **************************************/
+
+/* Create an output stream for writing into a memory buffer.
+ * The number of bytes written can be found in stream.bytes_written after
+ * encoding the message.
+ *
+ * Alternatively, you can use a custom stream that writes directly to e.g.
+ * a file or a network socket.
+ */
+pb_ostream_t pb_ostream_from_buffer(pb_byte_t *buf, size_t bufsize);
+
+/* Pseudo-stream for measuring the size of a message without actually storing
+ * the encoded data.
+ *
+ * Example usage:
+ * MyMessage msg = {};
+ * pb_ostream_t stream = PB_OSTREAM_SIZING;
+ * pb_encode(&stream, MyMessage_fields, &msg);
+ * printf("Message size is %d\n", stream.bytes_written);
+ */
+#ifndef PB_NO_ERRMSG
+#define PB_OSTREAM_SIZING {0,0,0,0,0}
+#else
+#define PB_OSTREAM_SIZING {0,0,0,0}
+#endif
+
+/* Function to write into a pb_ostream_t stream. You can use this if you need
+ * to append or prepend some custom headers to the message.
+ */
+bool pb_write(pb_ostream_t *stream, const pb_byte_t *buf, size_t count);
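+
+/* Illustrative sketch: prepending a custom 4-byte header before the
+ * encoded message (the header contents are the caller's choice):
+ *
+ *   const pb_byte_t header[4] = {'M', 'S', 'G', '1'};
+ *   if (!pb_write(&stream, header, sizeof(header)))
+ *       return false;
+ *   if (!pb_encode(&stream, MyMessage_fields, &msg))
+ *       return false;
+ */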
+
+
+/************************************************
+ * Helper functions for writing field callbacks *
+ ************************************************/
+
+/* Encode field header based on type and field number defined in the field
+ * structure. Call this from the callback before writing out field contents. */
+bool pb_encode_tag_for_field(pb_ostream_t *stream, const pb_field_iter_t *field);
+
+/* Encode field header by manually specifying wire type. You need to use this
+ * if you want to write out packed arrays from a callback field. */
+bool pb_encode_tag(pb_ostream_t *stream, pb_wire_type_t wiretype, uint32_t field_number);
+
+/* Encode an integer in the varint format.
+ * This works for bool, enum, int32, int64, uint32 and uint64 field types. */
+#ifndef PB_WITHOUT_64BIT
+bool pb_encode_varint(pb_ostream_t *stream, uint64_t value);
+#else
+bool pb_encode_varint(pb_ostream_t *stream, uint32_t value);
+#endif
+
+/* Encode an integer in the zig-zagged svarint format.
+ * This works for sint32 and sint64. */
+#ifndef PB_WITHOUT_64BIT
+bool pb_encode_svarint(pb_ostream_t *stream, int64_t value);
+#else
+bool pb_encode_svarint(pb_ostream_t *stream, int32_t value);
+#endif
+
+/* Encode a string or bytes type field. For strings, pass strlen(s) as size. */
+bool pb_encode_string(pb_ostream_t *stream, const pb_byte_t *buffer, size_t size);
+
+/* Encode a fixed32, sfixed32 or float value.
+ * You need to pass a pointer to a 4-byte wide C variable. */
+bool pb_encode_fixed32(pb_ostream_t *stream, const void *value);
+
+#ifndef PB_WITHOUT_64BIT
+/* Encode a fixed64, sfixed64 or double value.
+ * You need to pass a pointer to an 8-byte wide C variable. */
+bool pb_encode_fixed64(pb_ostream_t *stream, const void *value);
+#endif
+
+#ifdef PB_CONVERT_DOUBLE_FLOAT
+/* Encode a float value so that it appears like a double in the encoded
+ * message. */
+bool pb_encode_float_as_double(pb_ostream_t *stream, float value);
+#endif
+
+/* Encode a submessage field.
+ * You need to pass the pb_msgdesc_t descriptor and a pointer to the struct, just like
+ * with pb_encode(). This internally encodes the submessage twice, first to
+ * calculate message size and then to actually write it out.
+ */
+bool pb_encode_submessage(pb_ostream_t *stream, const pb_msgdesc_t *fields, const void *src_struct);
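+
+/* Illustrative sketch of use from an encode callback (SubMessage is a
+ * hypothetical nanopb-generated type):
+ *
+ *   if (!pb_encode_tag_for_field(stream, field))
+ *       return false;
+ *   return pb_encode_submessage(stream, SubMessage_fields, &submsg);
+ */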
+
+#ifdef __cplusplus
+} /* extern "C" */
+#endif
+
+#endif
diff --git a/security/container/protos/pbsystem.h b/security/container/protos/pbsystem.h
new file mode 100644
index 0000000..f2308f8
--- /dev/null
+++ b/security/container/protos/pbsystem.h
@@ -0,0 +1,32 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/* Header and types for nanopb to work with the Linux kernel */
+#include <linux/kernel.h>
+#include <linux/string.h>
+
+/* Small types. */
+
+/* Signed. */
+typedef signed char int_least8_t;
+typedef short int int_least16_t;
+typedef int int_least32_t;
+typedef long int int_least64_t;
+
+/* Unsigned. */
+typedef unsigned char uint_least8_t;
+typedef unsigned short int uint_least16_t;
+typedef unsigned int uint_least32_t;
+typedef unsigned long int uint_least64_t;
+
+/* Fast types. */
+
+/* Signed. */
+typedef signed char int_fast8_t;
+typedef long int int_fast16_t;
+typedef long int int_fast32_t;
+typedef long int int_fast64_t;
+
+/* Unsigned. */
+typedef unsigned char uint_fast8_t;
+typedef unsigned long int uint_fast16_t;
+typedef unsigned long int uint_fast32_t;
+typedef unsigned long int uint_fast64_t;
diff --git a/security/loadpin/loadpin.c b/security/loadpin/loadpin.c
index ee5cb94..df744a9 100644
--- a/security/loadpin/loadpin.c
+++ b/security/loadpin/loadpin.c
@@ -237,7 +237,7 @@
};
/* Should not be mutable after boot, so not listed in sysfs (perm == 0). */
-module_param(enforce, int, 0);
-MODULE_PARM_DESC(enforce, "Enforce module/firmware pinning");
+module_param_named(enabled, enforce, int, 0);
+MODULE_PARM_DESC(enabled, "Enforce module/firmware pinning");
module_param_array_named(exclude, exclude_read_files, charp, NULL, 0);
MODULE_PARM_DESC(exclude, "Exclude pinning specific read file types");
diff --git a/security/security.c b/security/security.c
index f633717..5cc58ad 100644
--- a/security/security.c
+++ b/security/security.c
@@ -1417,6 +1417,11 @@
}
}
+void security_file_pre_free(struct file *file)
+{
+ call_void_hook(file_pre_free_security, file);
+}
+
int security_file_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
{
return call_int_hook(file_ioctl, 0, file, cmd, arg);
@@ -1526,6 +1531,11 @@
return rc;
}
+void security_task_post_alloc(struct task_struct *task)
+{
+ call_void_hook(task_post_alloc, task);
+}
+
void security_task_free(struct task_struct *task)
{
call_void_hook(task_free, task);
@@ -1717,6 +1727,11 @@
return call_int_hook(task_kill, 0, p, info, sig, cred);
}
+void security_task_exit(struct task_struct *p)
+{
+ call_void_hook(task_exit, p);
+}
+
int security_task_prctl(int option, unsigned long arg2, unsigned long arg3,
unsigned long arg4, unsigned long arg5)
{
@@ -2417,3 +2432,30 @@
return call_int_hook(locked_down, 0, what);
}
EXPORT_SYMBOL(security_locked_down);
+
+#ifdef CONFIG_PERF_EVENTS
+int security_perf_event_open(struct perf_event_attr *attr, int type)
+{
+ return call_int_hook(perf_event_open, 0, attr, type);
+}
+
+int security_perf_event_alloc(struct perf_event *event)
+{
+ return call_int_hook(perf_event_alloc, 0, event);
+}
+
+void security_perf_event_free(struct perf_event *event)
+{
+ call_void_hook(perf_event_free, event);
+}
+
+int security_perf_event_read(struct perf_event *event)
+{
+ return call_int_hook(perf_event_read, 0, event);
+}
+
+int security_perf_event_write(struct perf_event *event)
+{
+ return call_int_hook(perf_event_write, 0, event);
+}
+#endif /* CONFIG_PERF_EVENTS */
diff --git a/security/selinux/hooks.c b/security/selinux/hooks.c
index d9f15c8..4a5fa39 100644
--- a/security/selinux/hooks.c
+++ b/security/selinux/hooks.c
@@ -6848,6 +6848,67 @@
.lbs_msg_msg = sizeof(struct msg_security_struct),
};
+#ifdef CONFIG_PERF_EVENTS
+static int selinux_perf_event_open(struct perf_event_attr *attr, int type)
+{
+ u32 requested, sid = current_sid();
+
+ if (type == PERF_SECURITY_OPEN)
+ requested = PERF_EVENT__OPEN;
+ else if (type == PERF_SECURITY_CPU)
+ requested = PERF_EVENT__CPU;
+ else if (type == PERF_SECURITY_KERNEL)
+ requested = PERF_EVENT__KERNEL;
+ else if (type == PERF_SECURITY_TRACEPOINT)
+ requested = PERF_EVENT__TRACEPOINT;
+ else
+ return -EINVAL;
+
+ return avc_has_perm(&selinux_state, sid, sid, SECCLASS_PERF_EVENT,
+ requested, NULL);
+}
+
+static int selinux_perf_event_alloc(struct perf_event *event)
+{
+ struct perf_event_security_struct *perfsec;
+
+ perfsec = kzalloc(sizeof(*perfsec), GFP_KERNEL);
+ if (!perfsec)
+ return -ENOMEM;
+
+ perfsec->sid = current_sid();
+ event->security = perfsec;
+
+ return 0;
+}
+
+static void selinux_perf_event_free(struct perf_event *event)
+{
+ struct perf_event_security_struct *perfsec = event->security;
+
+ event->security = NULL;
+ kfree(perfsec);
+}
+
+static int selinux_perf_event_read(struct perf_event *event)
+{
+ struct perf_event_security_struct *perfsec = event->security;
+ u32 sid = current_sid();
+
+ return avc_has_perm(&selinux_state, sid, perfsec->sid,
+ SECCLASS_PERF_EVENT, PERF_EVENT__READ, NULL);
+}
+
+static int selinux_perf_event_write(struct perf_event *event)
+{
+ struct perf_event_security_struct *perfsec = event->security;
+ u32 sid = current_sid();
+
+ return avc_has_perm(&selinux_state, sid, perfsec->sid,
+ SECCLASS_PERF_EVENT, PERF_EVENT__WRITE, NULL);
+}
+#endif
+
static struct security_hook_list selinux_hooks[] __lsm_ro_after_init = {
LSM_HOOK_INIT(binder_set_context_mgr, selinux_binder_set_context_mgr),
LSM_HOOK_INIT(binder_transaction, selinux_binder_transaction),
@@ -7085,6 +7146,14 @@
LSM_HOOK_INIT(bpf_map_free_security, selinux_bpf_map_free),
LSM_HOOK_INIT(bpf_prog_free_security, selinux_bpf_prog_free),
#endif
+
+#ifdef CONFIG_PERF_EVENTS
+ LSM_HOOK_INIT(perf_event_open, selinux_perf_event_open),
+ LSM_HOOK_INIT(perf_event_alloc, selinux_perf_event_alloc),
+ LSM_HOOK_INIT(perf_event_free, selinux_perf_event_free),
+ LSM_HOOK_INIT(perf_event_read, selinux_perf_event_read),
+ LSM_HOOK_INIT(perf_event_write, selinux_perf_event_write),
+#endif
};
static __init int selinux_init(void)
diff --git a/security/selinux/include/classmap.h b/security/selinux/include/classmap.h
index 32e9b03..7db2485 100644
--- a/security/selinux/include/classmap.h
+++ b/security/selinux/include/classmap.h
@@ -244,6 +244,8 @@
{"map_create", "map_read", "map_write", "prog_load", "prog_run"} },
{ "xdp_socket",
{ COMMON_SOCK_PERMS, NULL } },
+ { "perf_event",
+ {"open", "cpu", "kernel", "tracepoint", "read", "write"} },
{ NULL }
};
diff --git a/security/selinux/include/objsec.h b/security/selinux/include/objsec.h
index 586b7ab..a4a86cb 100644
--- a/security/selinux/include/objsec.h
+++ b/security/selinux/include/objsec.h
@@ -141,7 +141,11 @@
};
struct bpf_security_struct {
- u32 sid; /*SID of bpf obj creater*/
+ u32 sid; /* SID of bpf obj creator */
+};
+
+struct perf_event_security_struct {
+ u32 sid; /* SID of perf_event obj creator */
};
extern struct lsm_blob_sizes selinux_blob_sizes;
diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
index cb06310..edaf3fe7 100644
--- a/tools/include/uapi/linux/bpf.h
+++ b/tools/include/uapi/linux/bpf.h
@@ -199,6 +199,16 @@
BPF_CGROUP_UDP6_RECVMSG,
BPF_CGROUP_GETSOCKOPT,
BPF_CGROUP_SETSOCKOPT,
+ BPF_TRACE_RAW_TP,
+ BPF_TRACE_FENTRY,
+ BPF_TRACE_FEXIT,
+ BPF_MODIFY_RETURN,
+ BPF_LSM_MAC,
+ BPF_TRACE_ITER,
+ BPF_CGROUP_INET4_GETPEERNAME,
+ BPF_CGROUP_INET6_GETPEERNAME,
+ BPF_CGROUP_INET4_GETSOCKNAME,
+ BPF_CGROUP_INET6_GETSOCKNAME,
__MAX_BPF_ATTACH_TYPE
};
@@ -2750,6 +2760,231 @@
* **-EOPNOTSUPP** kernel configuration does not enable SYN cookies
*
* **-EPROTONOSUPPORT** IP packet version is not 4 or 6
+ *
+ * int bpf_skb_output(void *ctx, struct bpf_map *map, u64 flags, void *data, u64 size)
+ * Description
+ * Write raw *data* blob into a special BPF perf event held by
+ * *map* of type **BPF_MAP_TYPE_PERF_EVENT_ARRAY**. This perf
+ * event must have the following attributes: **PERF_SAMPLE_RAW**
+ * as **sample_type**, **PERF_TYPE_SOFTWARE** as **type**, and
+ * **PERF_COUNT_SW_BPF_OUTPUT** as **config**.
+ *
+ * The *flags* are used to indicate the index in *map* for which
+ * the value must be put, masked with **BPF_F_INDEX_MASK**.
+ * Alternatively, *flags* can be set to **BPF_F_CURRENT_CPU**
+ * to indicate that the index of the current CPU core should be
+ * used.
+ *
+ * The value to write, of *size*, is passed through the eBPF stack
+ * and pointed to by *data*.
+ *
+ * *ctx* is a pointer to in-kernel struct sk_buff.
+ *
+ * This helper is similar to **bpf_perf_event_output**\ () but
+ * restricted to raw_tracepoint bpf programs.
+ * Return
+ * 0 on success, or a negative error in case of failure.
+ *
+ * int bpf_probe_read_user(void *dst, u32 size, const void *unsafe_ptr)
+ * Description
+ * Safely attempt to read *size* bytes from user space address
+ * *unsafe_ptr* and store the data in *dst*.
+ * Return
+ * 0 on success, or a negative error in case of failure.
+ *
+ * int bpf_probe_read_kernel(void *dst, u32 size, const void *unsafe_ptr)
+ * Description
+ * Safely attempt to read *size* bytes from kernel space address
+ * *unsafe_ptr* and store the data in *dst*.
+ * Return
+ * 0 on success, or a negative error in case of failure.
+ *
+ * int bpf_probe_read_user_str(void *dst, u32 size, const void *unsafe_ptr)
+ * Description
+ * Copy a NUL terminated string from an unsafe user address
+ * *unsafe_ptr* to *dst*. The *size* should include the
+ * terminating NUL byte. In case the string length is smaller than
+ * *size*, the target is not padded with further NUL bytes. If the
+ * string length is larger than *size*, just *size*-1 bytes are
+ * copied and the last byte is set to NUL.
+ *
+ * On success, the length of the copied string is returned. This
+ * makes this helper useful in tracing programs for reading
+ * strings, and more importantly to get its length at runtime. See
+ * the following snippet:
+ *
+ * ::
+ *
+ * SEC("kprobe/sys_open")
+ * void bpf_sys_open(struct pt_regs *ctx)
+ * {
+ * char buf[PATHLEN]; // PATHLEN is defined to 256
+ * int res = bpf_probe_read_user_str(buf, sizeof(buf),
+ * ctx->di);
+ *
+ * // Consume buf, for example push it to
+ * // userspace via bpf_perf_event_output(); we
+ * // can use res (the string length) as event
+ * // size, after checking its boundaries.
+ * }
+ *
+ * In comparison, using the **bpf_probe_read_user**\ () helper here
+ * to read the string would require estimating its length at compile
+ * time, and would often result in copying more memory than
+ * necessary.
+ *
+ * Another useful use case is when parsing individual process
+ * arguments or individual environment variables navigating
+ * *current*\ **->mm->arg_start** and *current*\
+ * **->mm->env_start**: using this helper and the return value,
+ * one can quickly iterate at the right offset of the memory area.
+ * Return
+ * On success, the strictly positive length of the string,
+ * including the trailing NUL character. On error, a negative
+ * value.
+ *
+ * int bpf_probe_read_kernel_str(void *dst, u32 size, const void *unsafe_ptr)
+ * Description
+ * Copy a NUL terminated string from an unsafe kernel address *unsafe_ptr*
+ * to *dst*. Same semantics as with bpf_probe_read_user_str() apply.
+ * Return
+ * On success, the strictly positive length of the string, including
+ * the trailing NUL character. On error, a negative value.
+ *
+ * int bpf_tcp_send_ack(void *tp, u32 rcv_nxt)
+ * Description
+ * Send out a tcp-ack. *tp* is the in-kernel struct tcp_sock.
+ * *rcv_nxt* is the ack_seq to be sent out.
+ * Return
+ * 0 on success, or a negative error in case of failure.
+ *
+ * int bpf_send_signal_thread(u32 sig)
+ * Description
+ * Send signal *sig* to the thread corresponding to the current task.
+ * Return
+ * 0 on success, or if the signal was successfully queued.
+ *
+ * **-EBUSY** if work queue under nmi is full.
+ *
+ * **-EINVAL** if *sig* is invalid.
+ *
+ * **-EPERM** if no permission to send the *sig*.
+ *
+ * **-EAGAIN** if the bpf program can try again.
+ *
+ * u64 bpf_jiffies64(void)
+ * Description
+ * Obtain the 64-bit jiffies value.
+ * Return
+ * The 64-bit jiffies value.
+ *
+ * int bpf_read_branch_records(struct bpf_perf_event_data *ctx, void *buf, u32 size, u64 flags)
+ * Description
+ * For an eBPF program attached to a perf event, retrieve the
+ * branch records (struct perf_branch_entry) associated to *ctx*
+ * and store it in the buffer pointed by *buf* up to size
+ * *size* bytes.
+ * Return
+ * On success, number of bytes written to *buf*. On error, a
+ * negative value.
+ *
+ * The *flags* can be set to **BPF_F_GET_BRANCH_RECORDS_SIZE** to
+ * instead return the number of bytes required to store all the
+ * branch entries. If this flag is set, *buf* may be NULL.
+ *
+ * **-EINVAL** if arguments invalid or **size** not a multiple
+ * of sizeof(struct perf_branch_entry).
+ *
+ * **-ENOENT** if architecture does not support branch records.
+ *
+ * int bpf_get_ns_current_pid_tgid(u64 dev, u64 ino, struct bpf_pidns_info *nsdata, u32 size)
+ * Description
+ * Returns 0 on success; the values for *pid* and *tgid* as seen from
+ * the current *namespace* are returned in *nsdata*.
+ *
+ * On failure, the returned value is one of the following:
+ *
+ * **-EINVAL** if the supplied *dev* and *inum* don't match the dev_t
+ * and inode number associated with the nsfs of the current task, or
+ * if the conversion of *dev* to dev_t lost high bits.
+ *
+ * **-ENOENT** if the pidns does not exist for the current task.
+ *
+ * int bpf_xdp_output(void *ctx, struct bpf_map *map, u64 flags, void *data, u64 size)
+ * Description
+ * Write raw *data* blob into a special BPF perf event held by
+ * *map* of type **BPF_MAP_TYPE_PERF_EVENT_ARRAY**. This perf
+ * event must have the following attributes: **PERF_SAMPLE_RAW**
+ * as **sample_type**, **PERF_TYPE_SOFTWARE** as **type**, and
+ * **PERF_COUNT_SW_BPF_OUTPUT** as **config**.
+ *
+ * The *flags* are used to indicate the index in *map* for which
+ * the value must be put, masked with **BPF_F_INDEX_MASK**.
+ * Alternatively, *flags* can be set to **BPF_F_CURRENT_CPU**
+ * to indicate that the index of the current CPU core should be
+ * used.
+ *
+ * The value to write, of *size*, is passed through the eBPF stack
+ * and pointed to by *data*.
+ *
+ * *ctx* is a pointer to in-kernel struct xdp_buff.
+ *
+ * This helper is similar to **bpf_perf_event_output**\ () but
+ * restricted to raw_tracepoint bpf programs.
+ * Return
+ * 0 on success, or a negative error in case of failure.
+ *
+ * u64 bpf_get_netns_cookie(void *ctx)
+ * Description
+ * Retrieve the cookie (generated by the kernel) of the network
+ * namespace the input *ctx* is associated with. The network
+ * namespace cookie remains stable for its lifetime and provides
+ * a global identifier that can be assumed unique. If *ctx* is
+ * NULL, then the helper returns the cookie for the initial
+ * network namespace. The cookie itself is very similar to that
+ * of bpf_get_socket_cookie() helper, but for network namespaces
+ * instead of sockets.
+ * Return
+ * An 8-byte long opaque number.
+ *
+ * u64 bpf_get_current_ancestor_cgroup_id(int ancestor_level)
+ * Description
+ * Return the id of the cgroup v2 ancestor, at *ancestor_level*, of
+ * the cgroup associated with the current task. The root cgroup is at
+ * *ancestor_level* zero and each step down the hierarchy increments
+ * the level. If *ancestor_level* equals the level of the cgroup
+ * associated with the current task, the return value is the same as
+ * that of **bpf_get_current_cgroup_id**\ ().
+ *
+ * This helper is useful for implementing policies based on cgroups
+ * that are higher in the hierarchy than the immediate cgroup
+ * associated with the current task.
+ *
+ * The format of the returned id and the helper's limitations are the
+ * same as for **bpf_get_current_cgroup_id**\ ().
+ * Return
+ * The id is returned, or 0 if the id could not be retrieved.
+ *
+ * int bpf_sk_assign(struct sk_buff *skb, struct bpf_sock *sk, u64 flags)
+ * Description
+ * Assign the *sk* to the *skb*. When combined with appropriate
+ * routing configuration to receive the packet towards the socket,
+ * this will cause *skb* to be delivered to the specified socket.
+ * Subsequent redirection of *skb* via **bpf_redirect**\ (),
+ * **bpf_clone_redirect**\ () or other methods outside of BPF may
+ * interfere with successful delivery to the socket.
+ *
+ * This operation is only valid from TC ingress path.
+ *
+ * The *flags* argument must be zero.
+ * Return
+ * 0 on success, or a negative errno in case of failure.
+ *
+ * * **-EINVAL** Unsupported flags specified.
+ * * **-ENOENT** Socket is unavailable for assignment.
+ * * **-ENETUNREACH** Socket is unreachable (wrong netns).
+ * * **-EOPNOTSUPP** Unsupported operation, for example a
+ * call from outside of TC ingress.
+ * * **-ESOCKTNOSUPPORT** Socket type not supported (reuseport).
*/
#define __BPF_FUNC_MAPPER(FN) \
FN(unspec), \
@@ -2862,7 +3097,21 @@
FN(sk_storage_get), \
FN(sk_storage_delete), \
FN(send_signal), \
- FN(tcp_gen_syncookie),
+ FN(tcp_gen_syncookie), \
+ FN(skb_output), \
+ FN(probe_read_user), \
+ FN(probe_read_kernel), \
+ FN(probe_read_user_str), \
+ FN(probe_read_kernel_str), \
+ FN(tcp_send_ack), \
+ FN(send_signal_thread), \
+ FN(jiffies64), \
+ FN(read_branch_records), \
+ FN(get_ns_current_pid_tgid), \
+ FN(xdp_output), \
+ FN(get_netns_cookie), \
+ FN(get_current_ancestor_cgroup_id), \
+ FN(sk_assign),
/* integer value in 'imm' field of BPF_CALL instruction selects which helper
* function eBPF program intends to call