blob: 5c5bcf1a4ed377adcfdf2dc6a0997ad0abdc11d4 [file] [log] [blame] [edit]
# Copyright 2022 The Chromium OS Authors. All rights reserved.
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.
"""Parse auditd logs.
Parse the connect() and send/sendto/sendmsg/sendmmsg() syscall usages on a
per-process basis. Print the frequency histograms to stdout.
Example:
python3 parser.py data/sample.txt
"""
import sys
# Separator between the groups of log entries that belong to different events.
DELIM = '----'

# Log entry types looked up with parse_type().
TYPE_SYSCALL = 'SYSCALL'
TYPE_SOCKADDR = 'SOCKADDR'

# Keys of the per-event dictionary ("data point") built by parse_event().
# NOTE: TYPE is not referenced anywhere else in the visible code.
TYPE = 'type'
EVENTID = 'event_id'
SYSCALL = 'syscall'

# Syscall names that main() passes to visualize_syscalls(); they are compared
# against the value stored under the SYSCALL key.
SYSCALL_CONN = 'connect'
SYSCALL_SEND = 'sendto'
SYSCALL_SENDMSG = 'sendmsg'
SYSCALL_SENDMMSG = 'sendmmsg'

# Data point keys filled in from the SOCKADDR log entry.
FAM = 'fam'
LADDR = 'laddr'
LPORT = 'lport'

# Data point keys filled in from the SYSCALL log entry.
PID = 'pid'
PPID = 'ppid'
UID = 'uid'
COMM = 'comm'
EXE = 'exe'

# Addresses whose entries visualize_syscalls() leaves out of the histogram
# (IPv4/IPv6 loopback and the unspecified/wildcard addresses).
IGNORE_ADDR_LIST = ['127.0.0.1', '0.0.0.0', '::', '::1']
def main(argv):
    """Parse the log file and show frequency histograms.

    Each output line has the form '<number> <process_name>', meaning that
    <process_name> invoked the syscall(s) <number> times. Two histograms are
    printed: one for connect() and a joint one for
    sendto/sendmsg/sendmmsg().

    Args:
        argv: Command-line arguments without the program name. The first
            element is the log file path, e.g. ["data/sample.txt"].

    Raises:
        SystemExit: If no log file path was supplied.
    """
    # Fail with a readable usage message instead of a bare IndexError when
    # the script is started without arguments.
    if not argv:
        sys.exit('usage: parser.py <audit_log_file>')

    data = parse_file(argv[0])
    # Show histogram for the connect syscall.
    visualize_syscalls([SYSCALL_CONN], data)
    # Show joint histogram for the sendto, sendmsg and sendmmsg syscalls.
    visualize_syscalls([SYSCALL_SEND, SYSCALL_SENDMSG, SYSCALL_SENDMMSG], data)
def visualize_syscalls(syscalls, data):
    """Print a per-process frequency histogram for the given syscalls.

    To focus on the information sent off the device, entries are ignored
    when:
      * the socket family is neither 'inet' nor 'inet6'
      * the address is listed in IGNORE_ADDR_LIST

    Args:
        syscalls: A list of syscall names of interest. A log entry is counted
            if its syscall matches any name in this list.
        data: The list of per-event dictionaries produced by parse_file().
    """
    inet_families = ('inet', 'inet6')
    counts = {}
    for entry in data:
        # Entries lacking the process name or the socket family carry no
        # usable syscall information.
        if COMM not in entry or FAM not in entry:
            continue
        name = entry[SYSCALL]
        process = entry[COMM]
        family = entry[FAM]
        address = entry.get(LADDR)
        is_relevant = (name in syscalls
                       and family in inet_families
                       and address not in IGNORE_ADDR_LIST)
        if is_relevant:
            counts[process] = counts.get(process, 0) + 1

    print(f'\nShowing stats for syscall={syscalls}:')
    # Most frequent process first; ties keep first-seen order.
    ranked = sorted(counts.items(), key=lambda pair: pair[1], reverse=True)
    for process, count in ranked:
        print(count, '\t', process)
def parse_file(file_name):
    """Parse a log file and return its events as a list of dictionaries.

    The file content is split on the DELIM separator and every non-blank
    chunk is handed to parse_event().

    Args:
        file_name: Path to the log file.

    Returns:
        A list with one dictionary per event, holding event-related data
        such as the event id, syscall, etc.
    """
    with open(file_name, 'r') as file:
        chunks = file.read().split(DELIM)
    # Drop chunks that are empty or contain only whitespace (e.g. the text
    # before the first delimiter or a lone newline after the last one);
    # they hold no log entries and cannot be parsed as an event.
    return [parse_event(chunk) for chunk in chunks if chunk.strip()]
def parse_event(event):
    """Turn the log entries of one event into a single data point.

    An event consists of several log entries; in the log file, events are
    separated from each other by the '----' delimiter. This function
    receives all entries of one event and collects the fields of interest
    into a dictionary.

    Args:
        event: String containing all the log entries of a single event.

    Returns:
        A dictionary with the relevant event information.
    """
    record = {}
    # Each parser adds its own fields to the shared dictionary.
    for extract in (parse_eventid, parse_syscall_bits, parse_sockaddr_bits):
        extract(event, record)
    return record
def parse_type(event, log_type):
    """Return the log entry with the desired log type.

    Args:
        event: String of all the log entries associated with a single event.
        log_type: Type of the log entry we're searching for, e.g. 'SYSCALL'.

    Returns:
        The first log entry of the given type (there should be EXACTLY one),
        with trailing whitespace removed. Returns an empty string if the
        type is not found.
    """
    for line in event.split('\n'):
        entry = line.rstrip()
        # Skip empty and whitespace-only lines: they have no first token to
        # inspect.
        if not entry:
            continue
        cur_type = entry.split()[0].replace('type=', '')
        if cur_type == log_type:
            return entry
    return ''
def parse_eventid(event, data_point):
    """Store the numeric event id under data_point[EVENTID].

    The id is read from the third whitespace-separated token of the event:
    the text before the first ')' and after the last ':' of that token.
    NOTE(review): presumably this matches a 'msg=audit(<date> <time>:<id>)'
    header -- confirm against the log format in use.
    """
    third_token = event.split()[2].rstrip()
    before_paren, _, _ = third_token.partition(')')
    _, _, serial = before_paren.rpartition(':')
    data_point[EVENTID] = int(serial)
def parse_syscall_bits(event, data_point):
    """Copy the fields of the type=SYSCALL entry into the data point.

    Does nothing when the event has no SYSCALL entry. Each value is the text
    found between the end of its own ' key=' marker and the start of the
    marker of the field expected to follow it.
    """
    record = parse_type(event, TYPE_SYSCALL)
    if not record:
        return

    def between(start_key, end_key):
        # Text between the end of start_key and the start of end_key.
        begin = record.find(start_key) + len(start_key)
        return record[begin:record.find(end_key)]

    data_point[SYSCALL] = between(' syscall=', ' success=')
    data_point[PPID] = int(between(' ppid=', ' pid='))
    data_point[PID] = int(between(' pid=', ' auid='))
    data_point[UID] = between(' uid=', ' gid=')
    data_point[COMM] = between(' comm=', ' exe=')
    data_point[EXE] = between(' exe=', ' subj=')
def _sockaddr_field(entry, key):
    """Return the value that follows `key` in `entry`, up to the next space.

    Returns None when `key` does not occur in `entry`.
    """
    start = entry.find(key)
    if start == -1:
        return None
    start += len(key)
    end = entry.find(' ', start)
    # The value may be the last thing on the line, with no space after it.
    return entry[start:] if end == -1 else entry[start:end]


def parse_sockaddr_bits(event, data_point):
    """Parse the type=SOCKADDR-related bits into the data point.

    Stores the socket family under FAM. For 'inet' and 'inet6' sockets the
    address and port are stored under LADDR and LPORT as well. Does nothing
    when the event has no SOCKADDR entry or the entry has no ' fam=' field.

    Args:
        event: String of all the log entries associated with a single event.
        data_point: Dictionary that receives the parsed fields.
    """
    sockaddr_entry = parse_type(event, TYPE_SOCKADDR)
    if not sockaddr_entry:
        return

    # Read each value up to the next space rather than up to the following
    # field's marker: non-inet entries have no ' laddr=' field, so slicing up
    # to it produced a garbage family string.
    family = _sockaddr_field(sockaddr_entry, ' fam=')
    if family is None:
        return
    data_point[FAM] = family

    if family not in ('inet', 'inet6'):
        # TODO(zauri): do we need non-inet[6] packets?
        return

    address = _sockaddr_field(sockaddr_entry, ' laddr=')
    if address is not None:
        data_point[LADDR] = address
    port = _sockaddr_field(sockaddr_entry, ' lport=')
    if port is not None:
        data_point[LPORT] = int(port)
# Run the parser only when executed as a script; the program name is dropped
# so that main() receives just the user-supplied arguments.
if __name__ == '__main__':
    main(sys.argv[1:])