// SPDX-License-Identifier: GPL-2.0
/*
 * Container Security Monitor module
 *
 * Copyright (c) 2018 Google, Inc
 */

#include "monitor.h"

#include <linux/atomic.h>
#include <linux/audit.h>
#include <linux/file.h>
#include <linux/highmem.h>
#include <linux/mempool.h>
#include <linux/mm.h>
#include <linux/mount.h>
#include <linux/notifier.h>
#include <linux/net.h>
#include <linux/path.h>
#include <linux/pid.h>
#include <linux/pid_namespace.h>
#include <linux/random.h>
#include <linux/rcupdate.h>
#include <linux/sched.h>
#include <linux/sched/signal.h>
#include <linux/sched/task.h>
#include <linux/slab.h>
#include <linux/socket.h>
#include <linux/timekeeping.h>
#include <linux/vmalloc.h>
#include <linux/workqueue.h>
#include <linux/xattr.h>
#include <net/ipv6.h>
#include <net/sock.h>
#include <net/tcp.h>
#include <overlayfs/overlayfs.h>
#include <uapi/linux/magic.h>
#include <uapi/asm/mman.h>

/*
 * Configuration options for execute collector.
 * The argv/envp limits are read through get_config_limit(), which asserts
 * that csm_rwsem_config is held for reading.
 */
struct execute_config csm_execute_config;

/*
 * Unique atomic value for the machine boot instance.
 * Zero means "not generated yet"; get_machine_id() fills it in lazily.
 */
static atomic_t machine_rand = ATOMIC_INIT(0);

/* Sequential container identifier, incremented by csm_set_contid(). */
static atomic_t contid = ATOMIC_INIT(0);

/*
 * Generation id for each enumeration invocation.
 * csm_enumerate_processes() bumps it; a running enumeration worker stops
 * as soon as its own snapshot no longer matches this value.
 */
static atomic_t enumeration_count = ATOMIC_INIT(0);

/*
 * Identity of the process recorded as the writer of a file.
 *
 * csm_file_pre_free() stores this structure as the raw value of the
 * XATTR_SECURITY_CSM extended attribute and get_process_uuid_from_xattr()
 * reads it back, rejecting any value whose length is not sizeof() of this
 * structure. Changing the layout therefore changes the stored format.
 */
struct file_provenance {
	/* pid of the process doing the first write. */
	pid_t tgid;
	/* start_time of the process to uniquely identify it. */
	u64 start_time;
};

/*
 * Work item queued by csm_enumerate_processes() and consumed (and freed)
 * by delayed_enumerate_processes().
 */
struct csm_enumerate_processes_work_data {
	/* Embedded work item; container_of() recovers this structure. */
	struct work_struct work;
	/* Value of the global enumeration_count when the work was queued. */
	int enumeration_count;
};

/*
 * Make the argument stack of @bprm readable through a kernel address.
 *
 * When the stack VMA spans a single page, that page is pinned and mapped
 * with kmap(); the pinned page is handed back through @ctx. Otherwise every
 * page is copied into one vmalloc() buffer so that strings can be parsed
 * across page boundaries, and @ctx is set to @bprm.
 *
 * Returns the mapped/copied stack, or NULL on failure. Must be released with
 * kunmap_argument_stack().
 */
static void *kmap_argument_stack(struct linux_binprm *bprm, void **ctx)
{
	/* vma_pages() returns the number of pages reserved for the stack */
	unsigned long nr_pages = vma_pages(bprm->vma);
	unsigned long idx, uaddr;
	struct page *page;
	char *copy;
	void *src;
	int pinned;

	if (likely(nr_pages == 1)) {
		pinned = get_user_pages_remote(current, bprm->mm, bprm->p, 1,
					       FOLL_FORCE, &page, NULL, NULL);
		if (pinned != 1)
			return NULL;

		*ctx = page;
		return kmap(page);
	}

	/*
	 * Several pages: kmap() would give unrelated addresses per page,
	 * so gather them in a single contiguous buffer instead.
	 */
	copy = vmalloc(nr_pages * PAGE_SIZE);
	if (!copy)
		return NULL;

	for (idx = 0; idx < nr_pages; idx++) {
		uaddr = ALIGN_DOWN(bprm->p, PAGE_SIZE) + idx * PAGE_SIZE;
		pinned = get_user_pages_remote(current, bprm->mm, uaddr, 1,
					       FOLL_FORCE, &page, NULL,
					       NULL);
		if (pinned <= 0) {
			vfree(copy);
			return NULL;
		}

		/* Copy the page out and drop the pin right away. */
		src = kmap(page);
		memcpy(copy + idx * PAGE_SIZE, src, PAGE_SIZE);
		kunmap(page);
		put_page(page);
	}

	*ctx = bprm;
	return copy;
}

/*
 * Release what kmap_argument_stack() returned.
 *
 * @addr is the returned address (NULL is accepted and ignored) and @ctx the
 * context it produced: the pinned page in the single-page case.
 */
static void kunmap_argument_stack(struct linux_binprm *bprm, void *addr,
				  void *ctx)
{
	struct page *page;

	if (!addr)
		return;

	/* Multi-page stacks were copied into a vmalloc() buffer. */
	if (unlikely(vma_pages(bprm->vma) != 1)) {
		vfree(addr);
		return;
	}

	/* Single page: undo the kmap() and drop the pin. */
	page = (struct page *)ctx;
	kunmap(page);
	put_page(page);
}

/*
 * Return the NUL-terminated string starting at @array + *@offset, provided
 * its terminator lies before @end. On success *@offset is moved just past
 * the terminator so the next call yields the following string. Returns NULL
 * (leaving *@offset untouched) when the start is out of bounds or no
 * terminator is found before @end.
 */
static char *find_array_next_entry(char *array, unsigned long *offset,
				   unsigned long end)
{
	unsigned long cursor;

	for (cursor = *offset; cursor < end; cursor++) {
		if (array[cursor] == '\0') {
			char *entry = array + *offset;

			/* Step over the NUL byte for the next iteration. */
			*offset = cursor + 1;
			return entry;
		}
	}

	return NULL;
}

/*
 * Argument handed to the argv/envp protobuf encode callbacks
 * (encode_current_argv() and encode_current_envp()).
 */
struct string_arr_ctx {
	/* Exec context; provides p, vma, argc and envc to the encoders. */
	struct linux_binprm *bprm;
	/* Kernel-addressable stack returned by kmap_argument_stack(). */
	void *stack;
};

/*
 * Read a size limit from the execute configuration.
 *
 * The caller must hold csm_rwsem_config for reading. When the execute
 * collector is disabled the limit is reported as 0 so that no argument is
 * captured; the vsock packet won't be sent anyway.
 */
static size_t get_config_limit(size_t *config_ptr)
{
	lockdep_assert_held_read(&csm_rwsem_config);

	return csm_execute_enabled ? *config_ptr : 0;
}

/*
 * Protobuf encode callback emitting the argv strings of the exec described
 * by the struct string_arr_ctx in *@arg.
 *
 * At most csm_execute_config.argv_limit bytes (as accounted by
 * pb_encode_string_field_limit()) are emitted; a limit of 0 emits nothing.
 * Returns false when an argument cannot be located in the stack or encoding
 * fails, true otherwise.
 */
static bool encode_current_argv(pb_ostream_t *stream, const pb_field_t *field,
				void * const *arg)
{
	struct string_arr_ctx *sctx = (struct string_arr_ctx *)*arg;
	struct linux_binprm *bprm = sctx->bprm;
	char *stack = sctx->stack;
	/* The strings start at bprm->p within the first mapped page. */
	unsigned long cursor = bprm->p % PAGE_SIZE;
	unsigned long stack_len = vma_pages(bprm->vma) * PAGE_SIZE;
	size_t budget, written = 0;
	int remaining = bprm->argc;

	budget = get_config_limit(&csm_execute_config.argv_limit);
	if (budget == 0)
		return true;

	while (remaining-- > 0) {
		char *entry = find_array_next_entry(stack, &cursor, stack_len);
		ssize_t n;

		if (!entry)
			return false;

		n = pb_encode_string_field_limit(stream, field,
						 (void * const *)&entry,
						 budget - written);
		if (n < 0)
			return false;

		written += n;

		/* Budget exhausted: silently drop the remaining arguments. */
		if (written >= budget)
			break;
	}

	return true;
}

/*
 * Decide whether the environment entry @envp ("KEY=value", or a bare key)
 * may be reported.
 *
 * Nothing is allowed while the execute collector is disabled; everything is
 * allowed when no allowlist is configured. Otherwise the key must equal one
 * of the consecutive NUL-terminated strings in
 * csm_execute_config.envp_allowlist; the walk stops at the first string
 * that begins with a NUL byte.
 */
static bool check_envp_allowlist(char *envp)
{
	char *pattern, *sep;
	size_t key_len, pattern_len;

	/* If execute is not enabled, skip all. */
	if (!csm_execute_enabled)
		return false;

	/* No filter, allow all. */
	pattern = csm_execute_config.envp_allowlist;
	if (!pattern)
		return true;

	/* The key ends at '=' or, failing that, at the end of the string. */
	sep = strchr(envp, '=');
	key_len = sep ? (sep - envp) : strlen(envp);

	do {
		pattern_len = strlen(pattern);

		/* Only a pattern of the exact key length can match. */
		if (pattern_len == key_len &&
		    !strncmp(pattern, envp, pattern_len))
			return true;

		pattern += pattern_len + 1;
	} while (*pattern != 0);

	/* Default to skip if no match found. */
	return false;
}

/*
 * Protobuf encode callback emitting the allow-listed environment strings of
 * the exec described by the struct string_arr_ctx in *@arg.
 *
 * The environment follows the bprm->argc argument strings in the stack, so
 * those are skipped first. At most csm_execute_config.envp_limit bytes are
 * emitted; a limit of 0 emits nothing. Returns false when a string cannot
 * be located or encoding fails, true otherwise.
 */
static bool encode_current_envp(pb_ostream_t *stream, const pb_field_t *field,
				void * const *arg)
{
	struct string_arr_ctx *sctx = (struct string_arr_ctx *)*arg;
	struct linux_binprm *bprm = sctx->bprm;
	char *stack = sctx->stack;
	unsigned long cursor = bprm->p % PAGE_SIZE;
	unsigned long stack_len = vma_pages(bprm->vma) * PAGE_SIZE;
	size_t budget, written = 0;
	int remaining;

	budget = get_config_limit(&csm_execute_config.envp_limit);
	if (budget == 0)
		return true;

	/* Skip arguments */
	for (remaining = bprm->argc; remaining > 0; remaining--) {
		if (!find_array_next_entry(stack, &cursor, stack_len))
			return false;
	}

	for (remaining = bprm->envc; remaining > 0; remaining--) {
		char *entry = find_array_next_entry(stack, &cursor, stack_len);
		ssize_t n;

		if (!entry)
			return false;

		/* Entries outside the allowlist are not reported. */
		if (!check_envp_allowlist(entry))
			continue;

		n = pb_encode_string_field_limit(stream, field,
						 (void * const *)&entry,
						 budget - written);
		if (n < 0)
			return false;

		written += n;

		if (written >= budget)
			break;
	}

	return true;
}

/*
 * Tell whether @file lives on a mount whose superblock carries the overlayfs
 * magic. A missing mount or superblock counts as "not overlayfs".
 */
static bool is_overlayfs_mounted(struct file *file)
{
	struct vfsmount *mnt = file->f_path.mnt;
	struct super_block *sb;

	if (!mnt)
		return false;

	sb = mnt->mnt_sb;

	return sb && sb->s_magic == OVERLAYFS_SUPER_MAGIC;
}

/*
 * Heuristic used before the process starts: the task is considered to be a
 * possible container when it runs in a pid namespace other than the initial
 * one and the target file sits on an overlayfs mount. This check is valid
 * for COS and GKE but not all existing containers.
 */
static bool is_possible_container(struct task_struct *task,
				  struct file *file)
{
	bool in_init_ns = task_active_pid_ns(task) == &init_pid_ns;

	return !in_init_ns && is_overlayfs_mounted(file);
}

/*
 * Generates a random identifier for this boot instance.
 * This identifier is generated only when needed to increase the entropy
 * available compared to doing it at early boot.
 */
static u32 get_machine_id(void)
{
	int machineid, old;

	machineid = atomic_read(&machine_rand);

	if (unlikely(machineid == 0)) {
		machineid = (int)get_random_int();
		if (machineid == 0)
			machineid = 1;
		old = atomic_cmpxchg(&machine_rand, 0, machineid);

		/* If someone beat us, use their value. */
		if (old != 0)
			machineid = old;
	}

	return (u32)machineid;
}

/*
 * Generate a 128-bit unique identifier for the process of @task, made of:
 *  - the machine identifier, unique per boot;
 *  - the start time of the thread group leader;
 *  - the tgid shared by the threads of the process.
 *
 * @buffer is always cleared over @size bytes. Returns 0, or -EINVAL (with a
 * warning) when @size is smaller than PROCESS_UUID_SIZE.
 */
static int get_process_uuid(struct task_struct *task, char *buffer, size_t size)
{
	union process_uuid *uuid;

	memset(buffer, 0, size);

	if (WARN_ON(size < PROCESS_UUID_SIZE))
		return -EINVAL;

	uuid = (union process_uuid *)buffer;
	uuid->machineid = get_machine_id();
	uuid->start_time = ktime_mono_to_real(task->group_leader->start_time);
	uuid->tgid = task_tgid_nr(task);

	return 0;
}

/*
 * Generate the process UUID of the task whose pid in the initial pid
 * namespace is @pid_nr. The lookup and the UUID generation both happen
 * under rcu_read_lock().
 *
 * Returns 0 on success, -ENOENT when no such task exists, or the error of
 * get_process_uuid().
 */
int get_process_uuid_by_pid(pid_t pid_nr, char *buffer, size_t size)
{
	struct task_struct *task;
	int err = -ENOENT;

	rcu_read_lock();
	task = find_task_by_pid_ns(pid_nr, &init_pid_ns);
	if (task)
		err = get_process_uuid(task, buffer, size);
	rcu_read_unlock();

	return err;
}

/*
 * Build a process UUID from the provenance record stored in the
 * XATTR_SECURITY_CSM extended attribute of the overlayfs upper dentry
 * of @file.
 *
 * @buffer is always cleared over @size bytes.
 *
 * Returns 0 on success, -EINVAL (with a warning) when @size is smaller than
 * PROCESS_UUID_SIZE, -ENODATA when the file is not on overlayfs, has no
 * upper dentry, has no such xattr or the xattr has an unexpected length, or
 * any other negative error reported by __vfs_getxattr().
 */
static int get_process_uuid_from_xattr(struct file *file, char *buffer,
				       size_t size)
{
	struct dentry *dentry;
	int err;
	struct file_provenance prov;
	union process_uuid *id = (union process_uuid *)buffer;

	memset(buffer, 0, size);

	if (WARN_ON(size < PROCESS_UUID_SIZE))
		return -EINVAL;

	/* The file is part of overlayfs on the upper layer. */
	if (!is_overlayfs_mounted(file))
		return -ENODATA;

	dentry = ovl_dentry_upper(file->f_path.dentry);
	if (!dentry)
		return -ENODATA;

	err = __vfs_getxattr(dentry, dentry->d_inode,
			     XATTR_SECURITY_CSM, &prov, sizeof(prov));
	/* returns -ENODATA if the xattr does not exist. */
	if (err < 0)
		return err;
	if (err != sizeof(prov)) {
		/* Report expected vs. actual xattr length. */
		pr_err("unexpected size for xattr: %zu -> %d\n",
		       sizeof(prov), err);
		return -ENODATA;
	}

	/* The machine id is the current boot's one, not a stored value. */
	id->machineid = get_machine_id();
	id->start_time = prov.start_time;
	id->tgid = prov.tgid;
	return 0;
}

/*
 * Allocate the next sequential container id and assign it to @task.
 *
 * The id is also stored in the task's active pid namespace, but only if the
 * namespace has none yet. Returns the new id, or AUDIT_CID_UNSET (with a
 * warning) when the task has no audit context or no active pid namespace.
 */
u64 csm_set_contid(struct task_struct *task)
{
	struct pid_namespace *ns = task_active_pid_ns(task);
	u64 new_cid;

	if (WARN_ON(!task->audit))
		return AUDIT_CID_UNSET;
	if (WARN_ON(!ns))
		return AUDIT_CID_UNSET;

	new_cid = atomic_inc_return(&contid);
	task->audit->contid = new_cid;

	/*
	 * If the namespace container-id is not set, use the one assigned
	 * to the first process created.
	 */
	cmpxchg(&ns->cid, 0, new_cid);

	return new_cid;
}

/*
 * Return the container id recorded on pid namespace @ns, or AUDIT_CID_UNSET
 * when @ns is NULL or carries no id.
 */
u64 csm_get_ns_contid(struct pid_namespace *ns)
{
	if (ns && ns->cid)
		return ns->cid;

	return AUDIT_CID_UNSET;
}

/*
 * Storage for one socket address, IPv4 or IPv6.
 * fill_socket_description() copies the address here and points the protobuf
 * encoder argument at the matching member.
 */
union ip_data {
	struct in_addr ip4;
	struct in6_addr ip6;
};

/*
 * Backing storage for the pointers placed in a schema_File message by
 * fill_file_description(); it must stay valid until the message has been
 * encoded and is released with free_file_data().
 */
struct file_data {
	/* Page holding the resolved path, or NULL; freed by free_file_data(). */
	void *allocated;
	/* Local socket address, filled for socket files. */
	union ip_data local;
	/* Remote socket address, filled for socket files when available. */
	union ip_data remote;
	/* UUID of the recorded writer of an overlayfs file, when available. */
	char modified_uuid[PROCESS_UUID_SIZE];
};

/*
 * Release the page owned by @fdata, if any, and forget it so that calling
 * this function again is harmless.
 */
static void free_file_data(struct file_data *fdata)
{
	unsigned long page_addr = (unsigned long)fdata->allocated;

	fdata->allocated = NULL;
	free_page(page_addr);
}

/*
 * Describe the socket address @saddr in @schema_socketip.
 *
 * The family is always recorded. For AF_INET and AF_INET6 the port is
 * converted to host order and the address is copied into @idata, which the
 * encoder callback will read later; other families get no port or address.
 */
static void fill_socket_description(struct sockaddr_storage *saddr,
				   union ip_data *idata,
				   schema_SocketIp *schema_socketip)
{
	schema_socketip->family = saddr->ss_family;

	if (saddr->ss_family == AF_INET) {
		struct sockaddr_in *v4 = (struct sockaddr_in *)saddr;

		schema_socketip->port = ntohs(v4->sin_port);
		idata->ip4 = v4->sin_addr;
		schema_socketip->ip.funcs.encode = pb_encode_ip4;
		schema_socketip->ip.arg = &idata->ip4;
	} else if (saddr->ss_family == AF_INET6) {
		struct sockaddr_in6 *v6 = (struct sockaddr_in6 *)saddr;

		schema_socketip->port = ntohs(v6->sin6_port);
		idata->ip6 = v6->sin6_addr;
		schema_socketip->ip.funcs.encode = pb_encode_ip6;
		schema_socketip->ip.arg = &idata->ip6;
	}
}

/*
 * Add the overlayfs details of @file to @schema_file.
 *
 * Nothing is done for files that are not on an overlayfs mount. Otherwise
 * the lower/upper layer fields are set from the dentry and, when the
 * provenance xattr exists, the UUID of the recorded writer is stored in
 * @fdata and attached to the message.
 *
 * Returns 0 on success (including when there is no xattr) or a negative
 * error from get_process_uuid_from_xattr().
 */
static int fill_file_overlayfs(struct file *file, schema_File *schema_file,
			       struct file_data *fdata)
{
	schema_Overlay *ovl;
	struct dentry *dentry;
	int err;

	/* If not an overlayfs superblock, done. */
	if (!is_overlayfs_mounted(file))
		return 0;

	schema_file->which_filesystem = schema_File_overlayfs_tag;
	ovl = &schema_file->filesystem.overlayfs;

	dentry = file->f_path.dentry;
	ovl->lower_layer = ovl_dentry_lower(dentry);
	ovl->upper_layer = ovl_dentry_upper(dentry);

	err = get_process_uuid_from_xattr(file, fdata->modified_uuid,
					  sizeof(fdata->modified_uuid));
	/* If there is no xattr, just skip the modified_uuid field. */
	if (err < 0)
		return err == -ENODATA ? 0 : err;

	ovl->modified_uuid.funcs.encode = pb_encode_uuid_field;
	ovl->modified_uuid.arg = fdata->modified_uuid;

	return 0;
}

/*
 * Describe @file in @schema_file, using @fdata as backing storage.
 *
 * @fdata is always reset first. A NULL @file yields an empty description.
 * The inode number is always recorded; pipes get nothing more, sockets get
 * their local and (if available) remote addresses, and every other file
 * gets its full path plus overlayfs details.
 *
 * On success the caller must eventually call free_file_data() on @fdata;
 * on failure the page has already been released.
 *
 * Returns 0 or a negative error (-ENOMEM, or the error from d_path() or
 * fill_file_overlayfs()).
 */
static int fill_file_description(struct file *file, schema_File *schema_file,
				 struct file_data *fdata)
{
	struct sockaddr_storage addr;
	schema_Socket *sock_desc;
	struct socket *sock;
	struct inode *inode;
	char *page_buf, *resolved;
	u32 mode;
	int err;

	memset(fdata, 0, sizeof(*fdata));

	if (!file)
		return 0;

	inode = file_inode(file);
	schema_file->ino = inode->i_ino;
	mode = inode->i_mode;

	/* For pipes, no need to resolve the path. */
	if (S_ISFIFO(mode))
		return 0;

	if (S_ISSOCK(mode)) {
		sock = (struct socket *)file->private_data;
		sock_desc = &schema_file->filesystem.socket;

		/* Local socket */
		if (kernel_getsockname(sock, (struct sockaddr *)&addr) >= 0)
			fill_socket_description(&addr, &fdata->local,
					       &sock_desc->local);

		/* Remote socket, might not be connected. */
		if (kernel_getpeername(sock, (struct sockaddr *)&addr) >= 0)
			fill_socket_description(&addr, &fdata->remote,
					       &sock_desc->remote);

		schema_file->which_filesystem = schema_File_socket_tag;
		return 0;
	}

	/*
	 * Every remaining file type has a path worth reporting; resolve it
	 * into a page owned by fdata.
	 */
	page_buf = (char *)__get_free_page(GFP_KERNEL);
	if (!page_buf)
		return -ENOMEM;

	fdata->allocated = page_buf;

	resolved = d_path(&file->f_path, page_buf, PAGE_SIZE);
	if (IS_ERR(resolved)) {
		err = PTR_ERR(resolved);
		goto fail;
	}

	/* The path points into page_buf, freed in free_file_data(). */
	schema_file->fullpath.funcs.encode = pb_encode_string_field;
	schema_file->fullpath.arg = resolved;

	err = fill_file_overlayfs(file, schema_file, fdata);
	if (err)
		goto fail;

	return 0;

fail:
	free_file_data(fdata);
	return err;
}

/*
 * Describe the file behind descriptor @fd of the current task in @desc.
 *
 * When the descriptor cannot be resolved, @fdata is simply cleared and 0 is
 * returned. Otherwise the inode mode is recorded and the file is described
 * through fill_file_description(), whose result is returned.
 */
static int fill_stream_description(schema_Descriptor *desc, int fd,
				   struct file_data *fdata)
{
	struct fd sfd = fdget(fd);
	int err = 0;

	if (sfd.file) {
		desc->mode = file_inode(sfd.file)->i_mode;
		err = fill_file_description(sfd.file, &desc->file, fdata);
	} else {
		memset(fdata, 0, sizeof(*fdata));
	}

	fdput(sfd);

	return err;
}

/*
 * Set the uuid and, when possible, the parent_uuid fields of @proc.
 *
 * @uuid and @parent_uuid are caller-provided buffers that must outlive the
 * encoding of @proc. The parent is only looked up, under rcu_read_lock(),
 * while pid_alive() holds for @task.
 *
 * Returns 0 or the error of get_process_uuid().
 */
static int populate_proc_uuid_common(schema_Process *proc, char *uuid,
				     size_t uuid_size, char *parent_uuid,
				     size_t parent_uuid_size,
				     struct task_struct *task)
{
	struct task_struct *parent;
	int err;

	/* Unique identifier of the process itself. */
	err = get_process_uuid(task, uuid, uuid_size);
	if (err)
		return err;

	proc->uuid.funcs.encode = pb_encode_uuid_field;
	proc->uuid.arg = uuid;

	rcu_read_lock();

	if (pid_alive(task)) {
		/*
		 * Plain rcu_dereference() is believed to be enough here
		 * (rather than task_rcu_dereference) since real_parent is
		 * only supposed to be accessed using RCU.
		 */
		parent = rcu_dereference(task->real_parent);
		if (parent) {
			err = get_process_uuid(parent, parent_uuid,
					       parent_uuid_size);
			if (!err) {
				proc->parent_uuid.funcs.encode =
					pb_encode_uuid_field;
				proc->parent_uuid.arg = parent_uuid;
			}
		}
	}

	rcu_read_unlock();

	return err;
}

/*
 * Populate the fields that we always want to set in Process messages:
 * container and session identifiers, pids as seen from the initial and the
 * task's own pid namespace, and the process/parent UUIDs.
 *
 * Returns the result of populate_proc_uuid_common().
 */
static int populate_proc_common(schema_Process *proc, char *uuid,
				size_t uuid_size, char *parent_uuid,
				size_t parent_uuid_size,
				struct task_struct *task)
{
	struct pid_namespace *ns = task_active_pid_ns(task);
	u64 task_cid;

	/* Container identifier for the current namespace. */
	proc->container_id = csm_get_ns_contid(ns);

	/*
	 * A task container-id that differs from the namespace one means the
	 * process tree belongs to a separate session within the namespace
	 * (kubectl/docker exec, liveness probe or others).
	 */
	task_cid = audit_get_contid(task);
	if (proc->container_id != task_cid)
		proc->exec_session_id = task_cid;

	/* Pids as seen from the initial namespace... */
	proc->pid = task_pid_nr(task);
	proc->parent_pid = task_ppid_nr(task);
	/* ...and from the namespace of the task itself. */
	proc->container_pid = task_pid_nr_ns(task, ns);
	proc->container_parent_pid = task_ppid_nr_ns(task, ns);

	return populate_proc_uuid_common(proc, uuid, uuid_size, parent_uuid,
					 parent_uuid_size, task);
}

/*
 * Report an execute event for the binary described by @bprm.
 * NOTE(review): presumably wired as the bprm_check_security LSM hook;
 * the registration is not visible in this file.
 *
 * A container id is assigned to the current task first when it has none and
 * looks containerized; this happens even while the execute collector is
 * disabled. The event itself (process identity, binary, stdin/stdout/stderr
 * descriptions, argv and envp) is only built and sent when the collector is
 * enabled and the binary is not a "#!" script.
 *
 * Returns 0, or a negative error when gathering the event data failed while
 * the execute collector is enabled. A failure to send the event is logged
 * and not returned.
 */
int csm_bprm_check_security(struct linux_binprm *bprm)
{
	char uuid[PROCESS_UUID_SIZE];
	char parent_uuid[PROCESS_UUID_SIZE];
	int err;
	schema_Event event = schema_Event_init_zero;
	schema_Process *proc;
	struct string_arr_ctx argv_ctx;
	void *stack = NULL, *ctx = NULL;
	u64 cid;
	/* Zero-initialized so free_file_data() is safe on every exit path. */
	struct file_data path_data = {};
	struct file_data stdin_data = {};
	struct file_data stdout_data = {};
	struct file_data stderr_data = {};

	/*
	 * Always create a container-id for containerized processes.
	 * If the LSM is enabled later, we can track existing containers.
	 */
	cid = audit_get_contid(current);

	if (cid == AUDIT_CID_UNSET) {
		if (!is_possible_container(current, bprm->file))
			return 0;

		cid = csm_set_contid(current);

		if (cid == AUDIT_CID_UNSET)
			return 0;
	}

	if (!csm_execute_enabled)
		return 0;

	/* The interpreter will call us again with more context. */
	if (bprm->buf[0] == '#' && bprm->buf[1] == '!')
		return 0;

	proc = &event.event.execute.proc;
	err = populate_proc_common(proc, uuid, sizeof(uuid), parent_uuid,
				   sizeof(parent_uuid), current);
	if (err)
		goto out_free_buf;

	proc->creation_timestamp = ktime_get_real_ns();

	/* Provide information about the launched binary. */
	err = fill_file_description(bprm->file, &proc->binary, &path_data);
	if (err)
		goto out_free_buf;

	/* Information about streams */
	err = fill_stream_description(&proc->streams.stdin, STDIN_FILENO,
				      &stdin_data);
	if (err)
		goto out_free_buf;

	err = fill_stream_description(&proc->streams.stdout, STDOUT_FILENO,
				      &stdout_data);
	if (err)
		goto out_free_buf;

	err = fill_stream_description(&proc->streams.stderr, STDERR_FILENO,
				      &stderr_data);
	if (err)
		goto out_free_buf;

	/* Map (or copy) the argument stack so the encoders can parse it. */
	stack = kmap_argument_stack(bprm, &ctx);
	if (!stack) {
		err = -EFAULT;
		goto out_free_buf;
	}

	/* Capture process argument */
	argv_ctx.bprm = bprm;
	argv_ctx.stack = stack;
	proc->args.argv.funcs.encode = encode_current_argv;
	proc->args.argv.arg = &argv_ctx;

	/* Capture process environment variables */
	proc->args.envp.funcs.encode = encode_current_envp;
	proc->args.envp.arg = &argv_ctx;

	event.which_event = schema_Event_execute_tag;

	/*
	 * Configurations options are checked when computing the serialized
	 * protobufs.
	 */
	down_read(&csm_rwsem_config);
	err = csm_sendeventproto(schema_Event_fields, &event);
	up_read(&csm_rwsem_config);

	/* A send failure is only logged; it never blocks the exec. */
	if (err)
		pr_err("csm_sendeventproto returned %d on execve\n", err);
	err = 0;

out_free_buf:
	/* All of these tolerate resources that were never acquired. */
	kunmap_argument_stack(bprm, stack, ctx);
	free_file_data(&path_data);
	free_file_data(&stdin_data);
	free_file_data(&stdout_data);
	free_file_data(&stderr_data);

	/*
	 * On failure, enforce it only if the execute config is enabled.
	 * If the collector was disabled, prefer to succeed to not impact the
	 * system.
	 */
	if (unlikely(err < 0 && !csm_execute_enabled))
		err = 0;

	return err;
}

/*
 * Create a clone event when a new task leader is created.
 *
 * Nothing is sent when the execute collector is disabled, when @task has no
 * container id or when @task is not a thread group leader. Failures are
 * logged only; the function has no way to report them to its caller.
 */
void csm_task_post_alloc(struct task_struct *task)
{
	int err;
	char uuid[PROCESS_UUID_SIZE];
	char parent_uuid[PROCESS_UUID_SIZE];
	schema_Event event = schema_Event_init_zero;
	schema_Process *proc;

	if (!csm_execute_enabled ||
	    audit_get_contid(task) == AUDIT_CID_UNSET ||
	    !thread_group_leader(task))
		return;

	proc = &event.event.clone.proc;

	/* Without identifiers the event is useless: log and give up. */
	err = populate_proc_uuid_common(proc, uuid, sizeof(uuid), parent_uuid,
					sizeof(parent_uuid), task);
	if (err) {
		pr_err("failed to get process uuid on clone: %d\n", err);
		return;
	}

	event.which_event = schema_Event_clone_tag;
	err = csm_sendeventproto(schema_Event_fields, &event);
	if (err)
		pr_err("csm_sendeventproto returned %d on clone\n", err);
}

/*
 * Send an exit event carrying the UUID of the process of @task.
 *
 * This LSM hook callback doesn't exist upstream and is called only when the
 * last thread of a thread group exit. Nothing is sent when the execute
 * collector is disabled or the task has no container id; failures are only
 * logged.
 */
void csm_task_exit(struct task_struct *task)
{
	schema_Event event = schema_Event_init_zero;
	schema_ExitEvent *exit_ev = &event.event.exit;
	char uuid[PROCESS_UUID_SIZE];
	int err;

	if (!csm_execute_enabled)
		return;
	if (audit_get_contid(task) == AUDIT_CID_UNSET)
		return;

	/* Fetch the unique identifier for this process */
	err = get_process_uuid(task, uuid, sizeof(uuid));
	if (err) {
		pr_err("failed to get process uuid on exit\n");
		return;
	}

	exit_ev->process_uuid.funcs.encode = pb_encode_uuid_field;
	exit_ev->process_uuid.arg = uuid;
	event.which_event = schema_Event_exit_tag;

	err = csm_sendeventproto(schema_Event_fields, &event);
	if (err)
		pr_err("csm_sendeventproto returned %d on exit\n", err);
}

/*
 * Report a memexec event when a file-backed mapping is given PROT_EXEC
 * through mprotect.
 *
 * @vma:     mapping whose protection is being changed.
 * @reqprot: protection requested by the caller (reported as req_flags).
 * @prot:    protection that will be applied (reported as new_flags).
 *
 * Nothing happens when the memexec collector is disabled, @prot lacks
 * PROT_EXEC, the mapping has no file or the current task has no container
 * id.
 *
 * Returns 0, or a negative error when gathering the event data failed while
 * the memexec collector is enabled. A failure to send the event is logged
 * and not returned.
 */
int csm_mprotect(struct vm_area_struct *vma, unsigned long reqprot,
		unsigned long prot)
{
	char uuid[PROCESS_UUID_SIZE];
	char parent_uuid[PROCESS_UUID_SIZE];
	int err;
	schema_Event event = schema_Event_init_zero;
	schema_MemoryExecEvent *memexec;
	u64 cid;
	/* Zero-initialized so free_file_data() is safe on every exit path. */
	struct file_data path_data = {};

	cid = audit_get_contid(current);

	if (!csm_memexec_enabled ||
	    !(prot & PROT_EXEC) ||
	    vma->vm_file == NULL ||
	    cid == AUDIT_CID_UNSET)
		return 0;

	memexec = &event.event.memexec;

	err = fill_file_description(vma->vm_file, &memexec->mapped_file,
				    &path_data);
	if (err)
		goto out;

	err = populate_proc_common(&memexec->proc, uuid, sizeof(uuid),
				   parent_uuid, sizeof(parent_uuid), current);
	if (err)
		goto out;

	memexec->prot_exec_timestamp = ktime_get_real_ns();
	memexec->new_flags = prot;
	memexec->req_flags = reqprot;
	memexec->old_vm_flags = vma->vm_flags;

	memexec->action = schema_MemoryExecEvent_Action_MPROTECT;
	memexec->start_addr = vma->vm_start;
	memexec->end_addr = vma->vm_end;

	event.which_event = schema_Event_memexec_tag;

	/* A send failure is only logged; it never blocks the mprotect. */
	err = csm_sendeventproto(schema_Event_fields, &event);
	if (err)
		pr_err("csm_sendeventproto returned %d on mprotect\n", err);
	err = 0;

out:
	free_file_data(&path_data);

	/*
	 * On failure, enforce it only if the memexec collector is still
	 * enabled. If it was disabled meanwhile, prefer to succeed to not
	 * impact the system.
	 */
	if (unlikely(err < 0 && !csm_memexec_enabled))
		err = 0;

	return err;
}

/*
 * Report a memexec event when a file is mapped with PROT_EXEC.
 *
 * @file:    file being mapped (may be NULL, in which case nothing is done).
 * @reqprot: protection requested by the caller (reported as req_flags).
 * @prot:    protection that will be applied (reported as new_flags).
 * @flags:   mmap flags (reported as mmap_flags).
 *
 * Nothing happens when the memexec collector is disabled, @prot lacks
 * PROT_EXEC or the current task has no container id. The event is flagged
 * as the initial mmap when @file has the same path as the executable of the
 * current mm.
 *
 * Returns 0, or a negative error when gathering the event data failed while
 * the memexec collector is enabled. A failure to send the event is logged
 * and not returned.
 */
int csm_mmap_file(struct file *file, unsigned long reqprot,
		unsigned long prot, unsigned long flags)
{
	char uuid[PROCESS_UUID_SIZE];
	char parent_uuid[PROCESS_UUID_SIZE];
	int err;
	schema_Event event = schema_Event_init_zero;
	schema_MemoryExecEvent *memexec;
	struct file *exe_file;
	u64 cid;
	/* Zero-initialized so free_file_data() is safe on every exit path. */
	struct file_data path_data = {};

	cid = audit_get_contid(current);

	if (!csm_memexec_enabled ||
	    !(prot & PROT_EXEC) ||
	    file == NULL ||
	    cid == AUDIT_CID_UNSET)
		return 0;

	memexec = &event.event.memexec;
	err = fill_file_description(file, &memexec->mapped_file,
				    &path_data);
	if (err)
		goto out;

	err = populate_proc_common(&memexec->proc, uuid, sizeof(uuid),
				   parent_uuid, sizeof(parent_uuid), current);
	if (err)
		goto out;

	/* get_mm_exe_file does its own locking on mm_sem. */
	exe_file = get_mm_exe_file(current->mm);
	if (exe_file) {
		if (path_equal(&file->f_path, &exe_file->f_path))
			memexec->is_initial_mmap = 1;
		fput(exe_file);
	}

	memexec->prot_exec_timestamp = ktime_get_real_ns();
	memexec->new_flags = prot;
	memexec->req_flags = reqprot;
	memexec->mmap_flags = flags;
	memexec->action = schema_MemoryExecEvent_Action_MMAP_FILE;
	event.which_event = schema_Event_memexec_tag;

	/* A send failure is only logged; it never blocks the mmap. */
	err = csm_sendeventproto(schema_Event_fields, &event);
	if (err)
		pr_err("csm_sendeventproto returned %d on mmap_file\n", err);
	err = 0;

out:
	free_file_data(&path_data);

	/*
	 * On failure, enforce it only if the memexec collector is still
	 * enabled. If it was disabled meanwhile, prefer to succeed to not
	 * impact the system.
	 */
	if (unlikely(err < 0 && !csm_memexec_enabled))
		err = 0;

	return err;
}

/*
 * Tag the overlayfs upper dentry of @file with the identity of the current
 * process, unless it is already tagged.
 *
 * Only files opened for writing by a containerized task, while the LSM is
 * enabled, are considered. The identity (tgid and group leader start time)
 * is stored in the XATTR_SECURITY_CSM extended attribute. Failures are only
 * logged.
 */
void csm_file_pre_free(struct file *file)
{
	struct file_provenance prov;
	struct dentry *upper;
	int err;

	/* The file was opened to be modified and the LSM is enabled */
	if (!(file->f_mode & FMODE_WRITE))
		return;
	if (!csm_enabled)
		return;

	/* The current process is containerized. */
	if (audit_get_contid(current) == AUDIT_CID_UNSET)
		return;

	/* The file is part of overlayfs on the upper layer. */
	if (!is_overlayfs_mounted(file))
		return;

	upper = ovl_dentry_upper(file->f_path.dentry);
	if (!upper)
		return;

	/* Size-only query: we just want to know whether the xattr exists. */
	err = __vfs_getxattr(upper, upper->d_inode, XATTR_SECURITY_CSM,
			     NULL, 0);
	if (err >= 0)
		return;
	if (err != -ENODATA) {
		pr_err("failed to get security attribute: %d\n", err);
		return;
	}

	prov.tgid = task_tgid_nr(current);
	prov.start_time = ktime_mono_to_real(current->group_leader->start_time);

	err = __vfs_setxattr(upper, upper->d_inode, XATTR_SECURITY_CSM, &prov,
			     sizeof(prov), 0);
	if (err < 0)
		pr_err("failed to set security attribute: %d\n", err);
}

/*
 * Based off of fs/proc/base.c:next_tgid
 *
 * next_thread_group_leader returns the task_struct of the next task with a pid
 * greater than or equal to *tgid that passes has_group_leader_pid() and has a
 * container id set. The reference count is increased so that the task stays
 * valid after rcu_read_unlock; the caller must release it with
 * put_task_struct().
 *
 * On return *tgid has been advanced past the returned task, so passing the
 * same variable again continues the walk. Returns NULL when no further pid
 * exists in the initial pid namespace.
 */
static struct task_struct *next_thread_group_leader(pid_t *tgid)
{
	struct pid *pid;
	struct task_struct *task;

	/* Give the scheduler a chance between two lookups. */
	cond_resched();
	rcu_read_lock();
retry:
	task = NULL;
	pid = find_ge_pid(*tgid, &init_pid_ns);
	if (pid) {
		*tgid = pid_nr_ns(pid, &init_pid_ns);
		task = pid_task(pid, PIDTYPE_PID);
		/* Skip pids that are not a containerized group leader. */
		if (!task || !has_group_leader_pid(task) ||
		    audit_get_contid(task) == AUDIT_CID_UNSET) {
			(*tgid) += 1;
			goto retry;
		}

		/*
		 * Increment the reference count on the task before leaving
		 * the RCU read-side critical section.
		 */
		get_task_struct(task);
		(*tgid) += 1;
	}

	rcu_read_unlock();
	return task;
}

/*
 * Work handler queued by csm_enumerate_processes(): walk the containerized
 * thread group leaders and send one enumproc event for each of them.
 *
 * The work item is freed here. The walk stops as soon as a newer
 * enumeration has been requested (the global enumeration_count no longer
 * matches the value captured in the work item), the execute collector is
 * disabled, or no task is left. Per-task failures are logged and the walk
 * continues with the next task.
 */
void delayed_enumerate_processes(struct work_struct *work)
{
	pid_t tgid = 0;
	struct task_struct *task;
	struct csm_enumerate_processes_work_data *wd = container_of(
		work, struct csm_enumerate_processes_work_data, work);
	int wd_enumeration_count = wd->enumeration_count;

	/* The only field needed was copied above; release the work item. */
	kfree(wd);
	wd = NULL;
	work = NULL;

	/*
	 * Try for only a single enumeration routine at a time, as long as the
	 * execute collector is enabled.
	 */
	while ((wd_enumeration_count == atomic_read(&enumeration_count)) &&
	       READ_ONCE(csm_execute_enabled) &&
	       (task = next_thread_group_leader(&tgid))) {
		int err;
		char uuid[PROCESS_UUID_SIZE];
		char parent_uuid[PROCESS_UUID_SIZE];
		struct file *exe_file = NULL;
		struct file_data path_data = {};
		schema_Event event = schema_Event_init_zero;
		schema_Process *proc = &event.event.enumproc.proc;

		exe_file = get_task_exe_file(task);
		if (!exe_file) {
			pr_err("failed to get enumerated process executable, pid: %u\n",
			       task_pid_nr(task));
			goto next;
		}

		err = fill_file_description(exe_file, &proc->binary,
					    &path_data);
		if (err) {
			pr_err("failed to fill enumerated process %u executable description: %d\n",
			       task_pid_nr(task), err);
			goto next;
		}

		err = populate_proc_common(proc, uuid, sizeof(uuid),
					   parent_uuid, sizeof(parent_uuid),
					   task);
		if (err) {
			pr_err("failed to set pid %u common fields: %d\n",
			       task_pid_nr(task), err);
			goto next;
		}

		/* Tasks that are exiting are not reported. */
		if (task->flags & PF_EXITING)
			goto next;

		event.which_event = schema_Event_enumproc_tag;
		err = csm_sendeventproto(schema_Event_fields,
					 &event);
		if (err) {
			pr_err("failed to send pid %u enumerated process: %d\n",
			       task_pid_nr(task), err);
			goto next;
		}
next:
		/* Release everything taken for this task, on every path. */
		free_file_data(&path_data);
		if (exe_file)
			fput(exe_file);

		/* Drop the reference taken by next_thread_group_leader(). */
		put_task_struct(task);
	}
}

/*
 * Request an asynchronous enumeration of the containerized processes.
 *
 * A work item tagged with a fresh enumeration generation is queued; bumping
 * the generation makes any enumeration already running stop. The request
 * is silently dropped when the work item cannot be allocated.
 * @config_version is currently unused.
 */
void csm_enumerate_processes(unsigned long const config_version)
{
	struct csm_enumerate_processes_work_data *req;

	req = kmalloc(sizeof(*req), GFP_KERNEL);
	if (!req)
		return;

	req->enumeration_count = atomic_inc_return(&enumeration_count);
	INIT_WORK(&req->work, delayed_enumerate_processes);
	schedule_work(&req->work);
}
