blob: 8d1710239bb009679df879cdbdfee43f0ad81a6b [file] [log] [blame]
// SPDX-License-Identifier: GPL-2.0
/*
* Container Security Monitor module
*
* Copyright (c) 2018 Google, Inc
*/
#include "monitor.h"
#include <net/net_namespace.h>
#include <net/vsock_addr.h>
#include <net/sock.h>
#include <linux/socket.h>
#include <linux/workqueue.h>
#include <linux/jiffies.h>
#include <linux/mutex.h>
#include <linux/version.h>
#include <linux/kthread.h>
#include <linux/printk.h>
#include <linux/delay.h>
#include <linux/timekeeping.h>
/*
* virtio vsocket over which to send events to the host.
* NULL if monitoring is disabled, or if the socket was disconnected and we're
* trying to reconnect to the host.
*/
static struct socket *csm_vsocket;
/* reconnect delay */
#define CSM_RECONNECT_FREQ_MSEC 5000
/* config pull delay */
#define CSM_CONFIG_FREQ_MSEC 1000
/* vsock receive attempts and delay until giving up */
#define CSM_RECV_ATTEMPTS 2
#define CSM_RECV_DELAY_MSEC 100
/* heartbeat work */
#define CSM_HEARTBEAT_FREQ msecs_to_jiffies(5000)
static void csm_heartbeat(struct work_struct *work);
static DECLARE_DELAYED_WORK(csm_heartbeat_work, csm_heartbeat);
/* csm protobuf work */
static void csm_sendmsg_pipe_handler(struct work_struct *work);
/* csm message work container*/
struct msg_work_data {
struct work_struct msg_work;
size_t pos_bytes_written;
char msg[];
};
/* size used for the config error message. */
#define CSM_ERROR_BUF_SIZE 40
/* Running thread to manage vsock connections. */
static struct task_struct *socket_thread;
/* Mutex to ensure sequential dumping of protos */
static DEFINE_MUTEX(protodump);
static struct socket *csm_create_socket(void)
{
int err;
struct sockaddr_vm host_addr;
struct socket *sock;
err = sock_create_kern(&init_net, AF_VSOCK, SOCK_STREAM, 0,
&sock);
if (err) {
pr_debug("error creating AF_VSOCK socket: %d\n", err);
return ERR_PTR(err);
}
vsock_addr_init(&host_addr, VMADDR_CID_HYPERVISOR, CSM_HOST_PORT);
err = kernel_connect(sock, (struct sockaddr *)&host_addr,
sizeof(host_addr), 0);
if (err) {
if (err != -ECONNRESET) {
pr_debug("error connecting AF_VSOCK socket to host port %u: %d\n",
CSM_HOST_PORT, err);
}
goto error_release;
}
return sock;
error_release:
sock_release(sock);
return ERR_PTR(err);
}
/*
 * Tear down the host vsock connection, if any.
 *
 * Holds csm_rwsem_vsocket for writing so no concurrent sender or the
 * config thread can use the socket while it is being released; clears
 * csm_vsocket so they observe the disconnect afterwards.
 */
static void csm_destroy_socket(void)
{
	down_write(&csm_rwsem_vsocket);
	if (csm_vsocket) {
		sock_release(csm_vsocket);
		csm_vsocket = NULL;
	}
	up_write(&csm_rwsem_vsocket);
}
/*
 * Send an event (as a kvec array) over the host vsock connection.
 *
 * Returns 0 on success or when vsock reporting is disabled on the
 * kernel command line, -EPIPE when no connection is currently
 * established, or a negative error from kernel_sendmsg().
 */
static int csm_vsock_sendmsg(struct kvec *vecs, size_t vecs_size,
			     size_t total_length)
{
	struct msghdr hdr = { };
	int ret = -EPIPE;

	if (!cmdline_boot_vsock_enabled)
		return 0;

	down_read(&csm_rwsem_vsocket);
	if (csm_vsocket) {
		ret = kernel_sendmsg(csm_vsocket, &hdr, vecs, vecs_size,
				     total_length);
		/* Callers only care about failure; collapse byte counts. */
		if (ret > 0)
			ret = 0;
	}
	up_read(&csm_rwsem_vsocket);

	return ret;
}
/*
 * Mirror an event (as a kvec array) into the user-space pipe, if one
 * is configured.
 *
 * Returns 0 when no pipe is set up or when nobody is reading from it,
 * the byte count from vfs_iter_write() on success, or a negative error.
 */
static ssize_t csm_user_pipe_write(struct kvec *vecs, size_t vecs_size,
				   size_t total_length)
{
	ssize_t perr = 0;
	struct iov_iter io = { };
	loff_t pos = 0;
	struct pipe_inode_info *pipe;
	unsigned int readers;

	/* Unlocked fast-path check; re-checked under the lock below. */
	if (!csm_user_write_pipe)
		return 0;

	down_read(&csm_rwsem_pipe);

	if (csm_user_write_pipe == NULL)
		goto end;

	/* The pipe info is the same for reader and write files. */
	pipe = get_pipe_info(csm_user_write_pipe);

	/* If nobody is listening, don't write events. */
	readers = READ_ONCE(pipe->readers);
	if (readers <= 1) {
		/*
		 * NOTE(review): one reader reference is presumably held by
		 * the monitor itself, so 0 would indicate a refcount bug —
		 * confirm against the pipe setup code.
		 */
		WARN_ON(readers == 0);
		goto end;
	}

	iov_iter_kvec(&io, ITER_KVEC|WRITE, vecs, vecs_size,
		      total_length);

	file_start_write(csm_user_write_pipe);
	perr = vfs_iter_write(csm_user_write_pipe, &io, &pos, 0);
	file_end_write(csm_user_write_pipe);
end:
	up_read(&csm_rwsem_pipe);
	return perr;
}
/*
 * Frame a message with a csm_msg_hdr (little-endian type and total
 * length, header included) and fan it out to both the host vsock
 * connection and the user-space pipe.
 *
 * Returns the vsock send result; a pipe failure is only logged and
 * counted. Either failure bumps event_writing_failed exactly once.
 */
static int csm_sendmsg(int type, const void *buf, size_t len)
{
	struct csm_msg_hdr hdr = {
		.msg_type = cpu_to_le32(type),
		.msg_length = cpu_to_le32(sizeof(hdr) + len),
	};
	struct kvec vecs[] = {
		{ .iov_base = &hdr, .iov_len = sizeof(hdr) },
		{ .iov_base = (void *)buf, .iov_len = len },
	};
	u32 total = le32_to_cpu(hdr.msg_length);
	ssize_t pipe_err;
	int sock_err;

	sock_err = csm_vsock_sendmsg(vecs, ARRAY_SIZE(vecs), total);
	if (sock_err < 0) {
		pr_warn_ratelimited("sendmsg error (msg_type=%d, msg_length=%u): %d\n",
				    type, total, sock_err);
	}

	pipe_err = csm_user_pipe_write(vecs, ARRAY_SIZE(vecs), total);
	if (pipe_err < 0) {
		pr_warn_ratelimited("vfs_iter_write error (msg_type=%d, msg_length=%u): %zd\n",
				    type, total, pipe_err);
	}

	/* If one of them failed, increase the stats once. */
	if (sock_err < 0 || pipe_err < 0)
		csm_stats.event_writing_failed++;

	return sock_err;
}
/*
 * Best-effort guess of the encoded size of a proto, to avoid the full
 * pb_get_encoded_size() pre-pass for the common event types.
 *
 * The fixed sizes cover above 99% of events seen across 100 containers
 * running k8s; anything else falls back to the exact nanopb
 * computation. Returns false only when that fallback fails.
 */
static bool csm_get_expected_size(size_t *size, const pb_field_t fields[],
				  const void *src_struct)
{
	if (fields == schema_Event_fields) {
		const schema_Event *event = src_struct;

		switch (event->which_event) {
		case schema_Event_execute_tag:
			*size = 3344;
			return true;
		case schema_Event_memexec_tag:
			*size = 176;
			return true;
		case schema_Event_clone_tag:
			*size = 50;
			return true;
		case schema_Event_exit_tag:
			*size = 30;
			return true;
		}
	}

	/* If unknown, do the pre-computation. */
	return pb_get_encoded_size(size, fields, src_struct);
}
/*
 * Allocate a msg_work_data large enough for a "size"-byte payload and
 * encode the proto into its trailing flexible-array buffer.
 *
 * Returns the initialized work item (freed by the work handler once
 * scheduled, otherwise by the caller), ERR_PTR(-EINVAL) when the size
 * computation overflows or the proto does not fit in "size" bytes, or
 * ERR_PTR(-ENOMEM) on allocation failure.
 */
static struct msg_work_data *csm_encodeproto(size_t size,
					     const pb_field_t fields[],
					     const void *src_struct)
{
	struct msg_work_data *wd;
	pb_ostream_t stream;
	size_t alloc_size = size + sizeof(*wd);

	/* Unsigned wrap-around means "size" was close to SIZE_MAX. */
	if (alloc_size < size)
		return ERR_PTR(-EINVAL);

	wd = kmalloc(alloc_size, GFP_KERNEL);
	if (wd == NULL)
		return ERR_PTR(-ENOMEM);

	stream = pb_ostream_from_buffer(wd->msg, size);
	if (!pb_encode(&stream, fields, src_struct)) {
		kfree(wd);
		return ERR_PTR(-EINVAL);
	}

	INIT_WORK(&wd->msg_work, csm_sendmsg_pipe_handler);
	wd->pos_bytes_written = stream.bytes_written;
	return wd;
}
/*
 * Encode a proto and queue it for asynchronous delivery to the host
 * and the user pipe.
 *
 * First tries the pre-computed size from csm_get_expected_size(); if
 * encoding into that buffer fails, retries once with the exact encoded
 * size from pb_get_encoded_size().
 *
 * Returns 0 when the work item was handed off (or assumed dropped on a
 * workqueue failure, which is logged and counted), or a negative error
 * when the proto could not be sized or encoded.
 */
static int csm_sendproto(int type, const pb_field_t fields[],
			 const void *src_struct)
{
	int err = 0;
	size_t size, previous_size;
	struct msg_work_data *wd;

	/* Use the expected size first. */
	if (!csm_get_expected_size(&size, fields, src_struct))
		return -EINVAL;

	wd = csm_encodeproto(size, fields, src_struct);
	if (unlikely(IS_ERR(wd))) {
		/* If it failed, retry with the exact size. */
		csm_stats.size_picking_failed++;
		previous_size = size;

		if (!pb_get_encoded_size(&size, fields, src_struct))
			return -EINVAL;

		wd = csm_encodeproto(size, fields, src_struct);
		if (IS_ERR(wd)) {
			csm_stats.proto_encoding_failed++;
			return PTR_ERR(wd);
		}
		/* %zu: both values are size_t; %lu is wrong on 32-bit. */
		pr_debug("size picking failed %zu vs %zu\n", previous_size,
			 size);
	}

	/* The work handler takes care of cleanup, if successfully scheduled. */
	if (likely(schedule_work(&wd->msg_work)))
		return 0;

	csm_stats.workqueue_failed++;
	pr_err_ratelimited("Sent msg to workqueue unsuccessfully (assume dropped).\n");
	kfree(wd);
	return err;
}
/*
 * Workqueue handler: deliver a previously encoded event proto and
 * release the work item that carried it.
 */
static void csm_sendmsg_pipe_handler(struct work_struct *work)
{
	int type = CSM_MSG_EVENT_PROTO;
	struct msg_work_data *wd;

	wd = container_of(work, struct msg_work_data, msg_work);

	if (csm_sendmsg(type, wd->msg, wd->pos_bytes_written))
		pr_err_ratelimited("csm_sendmsg failed in work handler %s\n",
				   __func__);

	kfree(wd);
}
/*
 * Timestamp an event with the current wall-clock time in nanoseconds
 * and hand it to csm_sendproto() as a CSM_MSG_EVENT_PROTO message.
 *
 * Returns -ENOTSUPP when the monitor is disabled, otherwise the
 * csm_sendproto() result.
 */
int csm_sendeventproto(const pb_field_t fields[], schema_Event *event)
{
	/* Last check before generating and sending an event. */
	if (!csm_enabled)
		return -ENOTSUPP;

	event->timestamp = ktime_get_real_ns();
	return csm_sendproto(CSM_MSG_EVENT_PROTO, fields, event);
}
/*
 * Send a ConfigurationResponse proto to the host as a
 * CSM_MSG_CONFIG_RESPONSE_PROTO message; thin wrapper around
 * csm_sendproto().
 */
int csm_sendconfigrespproto(const pb_field_t fields[],
			    schema_ConfigurationResponse *resp)
{
	return csm_sendproto(CSM_MSG_CONFIG_RESPONSE_PROTO, fields, resp);
}
/*
 * Periodic heartbeat: send an empty CSM_MSG_TYPE_HEARTBEAT message to
 * the host and re-arm the delayed work to fire again after
 * CSM_HEARTBEAT_FREQ jiffies. Send failures are intentionally ignored
 * here; csm_sendmsg() already logs and counts them.
 */
static void csm_heartbeat(struct work_struct *work)
{
	csm_sendmsg(CSM_MSG_TYPE_HEARTBEAT, NULL, 0);
	schedule_delayed_work(&csm_heartbeat_work, CSM_HEARTBEAT_FREQ);
}
static int config_send_response(int err)
{
char buf[CSM_ERROR_BUF_SIZE] = {};
schema_ConfigurationResponse resp =
schema_ConfigurationResponse_init_zero;
resp.error = schema_ConfigurationResponse_ErrorCode_NO_ERROR;
resp.version = CSM_VERSION;
resp.kernel_version = LINUX_VERSION_CODE;
if (err) {
resp.error = schema_ConfigurationResponse_ErrorCode_UNKNOWN;
snprintf(buf, sizeof(buf) - 1, "error code: %d", err);
resp.msg.funcs.encode = pb_encode_string_field;
resp.msg.arg = buf;
}
return csm_sendconfigrespproto(schema_ConfigurationResponse_fields,
&resp);
}
/*
 * Receive exactly "len" bytes from the host vsock connection into buf,
 * retrying short non-blocking reads.
 *
 * @expected: true when data must arrive (e.g. a payload following a
 *            header); false allows an immediate -EAGAIN when nothing
 *            is pending at all.
 *
 * Returns the last positive read size once the buffer is full,
 * -EAGAIN when nothing was pending and nothing was expected, -EPIPE
 * after CSM_RECV_ATTEMPTS empty polls mid-message, -ENOTCONN on a peer
 * reset, -ENOENT if the socket vanished, or any kernel_recvmsg() error.
 */
static int csm_recvmsg(void *buf, size_t len, bool expected)
{
	int err = 0;
	struct msghdr msg = {};
	struct kvec vecs;
	size_t pos = 0;
	size_t attempts = 0;

	while (pos < len) {
		vecs.iov_base = (char *)buf + pos;
		vecs.iov_len = len - pos;

		down_read(&csm_rwsem_vsocket);
		if (csm_vsocket) {
			/*
			 * Request only the remaining bytes. Passing the
			 * full "len" here let the iov_iter length exceed
			 * the single kvec segment after a partial read.
			 */
			err = kernel_recvmsg(csm_vsocket, &msg, &vecs, 1,
					     len - pos, MSG_DONTWAIT);
		} else {
			pr_err("csm_vsocket was unset while the config thread was running\n");
			err = -ENOENT;
		}
		up_read(&csm_rwsem_vsocket);

		/* A 0-byte read means the peer closed the connection. */
		if (err == 0) {
			err = -ENOTCONN;
			pr_warn_ratelimited("vsock connection was reset\n");
			break;
		}

		if (err == -EAGAIN) {
			/*
			 * If nothing is received and nothing was expected
			 * just bail.
			 */
			if (!expected && pos == 0) {
				err = -EAGAIN;
				break;
			}

			/*
			 * If we are still missing data after multiple
			 * attempts, reset the connection.
			 */
			if (++attempts >= CSM_RECV_ATTEMPTS) {
				err = -EPIPE;
				break;
			}
			msleep(CSM_RECV_DELAY_MSEC);
			continue;
		}

		if (err < 0) {
			pr_err_ratelimited("kernel_recvmsg failed with %d\n",
					   err);
			break;
		}

		pos += err;
	}
	return err;
}
/*
 * Listen for configuration messages until the connection is closed or
 * desynchronizes. If something goes wrong while parsing the packet
 * buffer in a way that may desynchronize the thread from the backend,
 * the loop exits so the caller resets the connection.
 *
 * @buf: scratch buffer of at least PAGE_SIZE bytes for the payload.
 */
static void listen_configuration(void *buf)
{
	int err;
	struct csm_msg_hdr hdr = {};
	uint32_t msg_type, msg_length;

	pr_debug("listening for configuration messages\n");

	while (true) {
		err = csm_recvmsg(&hdr, sizeof(hdr), false);

		/* Nothing available, wait and try again. */
		if (err == -EAGAIN) {
			msleep(CSM_CONFIG_FREQ_MSEC);
			continue;
		}

		if (err < 0)
			break;

		msg_type = le32_to_cpu(hdr.msg_type);
		if (msg_type != CSM_MSG_CONFIG_REQUEST_PROTO) {
			/* %u: msg_type is an unsigned 32-bit value. */
			pr_warn_ratelimited("unexpected message type: %u\n",
					    msg_type);
			break;
		}

		msg_length = le32_to_cpu(hdr.msg_length);
		if (msg_length <= sizeof(hdr) || msg_length > PAGE_SIZE) {
			pr_warn_ratelimited("unexpected message length: %u\n",
					    msg_length);
			break;
		}

		/* The message length includes the size of the header. */
		msg_length -= sizeof(hdr);

		err = csm_recvmsg(buf, msg_length, true);
		if (err < 0) {
			pr_warn_ratelimited("failed to gather configuration: %d\n",
					    err);
			break;
		}

		err = csm_update_config_from_buffer(buf, msg_length);
		if (err < 0) {
			/*
			 * Warn of the error but continue listening for
			 * configuration changes.
			 */
			pr_warn_ratelimited("config update failed: %d\n", err);
		} else {
			pr_debug("config received and applied\n");
		}

		err = config_send_response(err);
		if (err < 0) {
			pr_err_ratelimited("config response failed: %d\n", err);
			break;
		}
		pr_debug("config response sent\n");
	}
}
/*
 * Thread managing the connection and listening for new configurations.
 *
 * Each iteration connects to the host, starts the heartbeat, and blocks
 * in listen_configuration() until the connection drops, then tears
 * everything down and reconnects after CSM_RECONNECT_FREQ_MSEC.
 *
 * The loop checks kthread_should_stop() between connection cycles so
 * that kthread_stop() (see vsock_destroy) can complete; the previous
 * unconditional while (true) made kthread_stop() block forever. Note
 * that stopping may still take until the current listen loop returns.
 * The scratch page is released on exit instead of being leaked.
 */
static int socket_thread_fn(void *unused)
{
	void *buf;
	struct socket *sock;

	/* One page should be enough for current configurations. */
	buf = (void *)__get_free_page(GFP_KERNEL);
	if (!buf)
		return -ENOMEM;

	while (!kthread_should_stop()) {
		sock = csm_create_socket();
		if (IS_ERR(sock)) {
			pr_debug("unable to connect to host (port %u), will retry in %u ms\n",
				 CSM_HOST_PORT, CSM_RECONNECT_FREQ_MSEC);
			msleep(CSM_RECONNECT_FREQ_MSEC);
			continue;
		}

		down_write(&csm_rwsem_vsocket);
		csm_vsocket = sock;
		up_write(&csm_rwsem_vsocket);

		schedule_delayed_work(&csm_heartbeat_work, 0);
		listen_configuration(buf);
		pr_warn("vsock state incorrect, disconnecting. Messages will be lost.\n");
		cancel_delayed_work_sync(&csm_heartbeat_work);
		csm_destroy_socket();
	}

	free_page((unsigned long)buf);
	return 0;
}
/*
 * Stop the vsock connection-manager thread, if it was started.
 *
 * NOTE(review): kthread_stop() sets the should-stop flag and blocks
 * until the thread exits — it hangs if socket_thread_fn never checks
 * kthread_should_stop(); confirm the thread loop does. Also note the
 * __init annotation: this function is discarded after boot, so it is
 * only safe to call from the init error path (as vsock_initialize
 * does), not from module exit — verify against the cleanup code.
 */
void __init vsock_destroy(void)
{
	if (socket_thread) {
		kthread_stop(socket_thread);
		socket_thread = NULL;
	}
}
/*
 * Start the vsock connection-manager thread when vsock reporting was
 * enabled on the kernel command line.
 *
 * Returns 0 on success (or when vsock is disabled), or the kthread
 * creation error after cleaning up via vsock_destroy().
 */
int __init vsock_initialize(void)
{
	struct task_struct *task;

	if (!cmdline_boot_vsock_enabled)
		return 0;

	task = kthread_run(socket_thread_fn, NULL, "csm-vsock-thread");
	if (IS_ERR(task)) {
		pr_err("failed to create socket thread: %ld\n", PTR_ERR(task));
		vsock_destroy();
		return PTR_ERR(task);
	}

	socket_thread = task;
	return 0;
}