blob: 723081ea1547d39708d358eb429d90cd5a38cd02 [file] [log] [blame] [edit]
#!/usr/bin/env python3
# Copyright 2022 The ChromiumOS Authors
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.
"""A tool to generate simulated fingerprint study results."""
from __future__ import annotations
import argparse
import pathlib
import sys
import time
from typing import Optional
from experiment import Experiment
import numpy as np
import pandas as pd
def GenerateFARResults(
    num_users: int = 72,
    num_fingers: int = 6,
    num_verify_samples: int = 80,
    user_groups: Optional[list[str]] = ["A", "B", "C", "D", "E", "F"],
    prob: float = 1 / 100000.0,
    verbose: bool = False,
) -> pd.DataFrame:
    """Generate a simulated table of false-accept (FAR) match decisions.

    Every enrolled (user, finger) template is paired with every verification
    sample of every *other* (user, finger); each pairing receives an
    independent random accept/reject decision.

    Args:
        num_users: Number of simulated participants. User IDs start at 10001.
        num_fingers: Number of fingers per participant (IDs start at 0).
        num_verify_samples: Number of verification samples per finger.
        user_groups: Group labels to distribute over the users in contiguous
            blocks. Pass None (or an empty list) to skip group assignment.
            The default list is only read, never mutated.
        prob: Probability that any single match attempt is an Accept.
        verbose: Print progress messages and the total generation time.

    Returns:
        A DataFrame with one row per cross-match attempt. When groups are
        requested, the table is passed through Experiment.add_groups() and
        the value of Experiment.far_decisions() is returned.
    """
    time_start = time.perf_counter()

    col_enroll_user = Experiment.TableCol.Enroll_User.value
    col_enroll_finger = Experiment.TableCol.Enroll_Finger.value
    col_verify_user = Experiment.TableCol.Verify_User.value
    col_verify_finger = Experiment.TableCol.Verify_Finger.value
    col_verify_sample = Experiment.TableCol.Verify_Sample.value
    col_decision = Experiment.TableCol.Decision.value

    users = np.arange(10001, 10001 + num_users)
    fingers = np.arange(num_fingers)
    samples = np.arange(num_verify_samples)

    num_templates = num_users * num_fingers
    # This includes match attempts between the same finger for enrollment and
    # verification. It is the size of the table before the invalid matches
    # are pruned.
    num_rows_naive = (num_templates * num_verify_samples) * num_templates
    # This is the correct number of total FAR matches.
    num_rows = (num_templates * num_verify_samples) * (num_templates - 1)

    # Generate all cross combinations.
    if verbose:
        print("Generating User x Finger x Samples Matrix")
    cross_mat = np.array(
        np.meshgrid(users, fingers, users, fingers, samples)
    ).T.reshape(-1, 5)
    # The resultant cross_mat contains a row entry for all combinations of
    # user x finger x user x finger x sample (5 columns).
    assert cross_mat.shape == (num_rows_naive, 5)

    if verbose:
        print("Loading Matrix Into Data Frame")
    # Build the frame straight from the matrix instead of allocating an empty
    # object-typed frame and overwriting it afterwards.
    df = pd.DataFrame(
        cross_mat,
        columns=[
            col_enroll_user,
            col_enroll_finger,
            col_verify_user,
            col_verify_finger,
            col_verify_sample,
        ],
    )

    # Since we don't try to match a verification sample with its own template,
    # drop these attempts.
    if verbose:
        print("Dropping Invalid Matching Combinations")
    same_template = (df[col_verify_user] == df[col_enroll_user]) & (
        df[col_verify_finger] == df[col_enroll_finger]
    )
    # Dropping in place keeps the original row labels of the surviving rows.
    df.drop(index=df.index[same_template.to_numpy()], inplace=True)
    # Check that we now have the correct number of FAR entries.
    assert df.shape[0] == num_rows

    if verbose:
        print("Generate Decisions")
    rng = np.random.default_rng()
    df[col_decision] = rng.choice(
        [Experiment.Decision.Reject.value, Experiment.Decision.Accept.value],
        size=num_rows,
        p=[1 - prob, prob],
    )

    # An empty label list is treated like None: there is nothing to assign.
    if user_groups is not None and len(user_groups) > 0:
        if verbose:
            print("Adding Group Associations")
        num_groups = len(user_groups)
        base_size, remainder = divmod(num_users, num_groups)
        if remainder != 0:
            print(
                "Warning - The number users cannot be evenly split into"
                f" {len(user_groups)} groups."
            )
        # np.repeat needs integer repeat counts. The first `remainder` groups
        # take one extra user so the sizes always sum to num_users, which
        # keeps the two table columns the same length.
        group_sizes = [
            base_size + 1 if i < remainder else base_size
            for i in range(num_groups)
        ]
        user_groups_tbl = pd.DataFrame(
            {
                Experiment.TableCol.User.value: users,
                Experiment.TableCol.Group.value: np.repeat(
                    user_groups, repeats=group_sizes
                ),
            }
        )
        e = Experiment(
            num_verification=num_verify_samples,
            num_fingers=num_fingers,
            num_users=num_users,
            far_decisions=df,
            fa_list=None,
        )
        e.add_groups(user_groups_tbl)
        df = e.far_decisions()

    time_end = time.perf_counter()
    if verbose:
        print(f"Generation took {time_end-time_start:.5}s.")

    # TODO: Add options to allow forcing specific anomalous users.
    # One way to do this might be to choose a few users and redraw the
    # decisions from a new, more frequent distribution (like 1/20k).
    return df
# TODO: Add a function for generating fake collection directories to test
# collection directory group discovery.
def main(argv: Optional[list[str]] = None) -> Optional[int]:
    """Parse the command line, simulate FAR decisions, and write them as CSV.

    Args:
        argv: Command-line arguments without the program name. When None,
            argparse reads them from sys.argv.

    Returns:
        0 on success. Invalid arguments are reported through parser.error(),
        which prints a usage message and raises SystemExit.
    """
    parser = argparse.ArgumentParser(
        description="Generate simulated fpstudy data."
    )
    parser.add_argument(
        "output_csv_file",
        type=str,
        help="file path to write the simulated csv data to",
    )
    parser.add_argument(
        "--users",
        default=72,
        type=int,
        help="number of participants (default: 72)",
    )
    parser.add_argument(
        "--fingers",
        default=6,
        type=int,
        help="number of fingers per participants (default: 6)",
    )
    parser.add_argument(
        "--samples",
        default=80,
        type=int,
        help="number of verification samples per participants" " (default: 80)",
    )
    parser.add_argument(
        "--groups",
        default=6,
        type=int,
        help="number of groups to split participants into"
        " -- "
        "set to 0 to disable groups columns (default: 6)",
    )
    parser.add_argument(
        "--prob_div",
        default=100000,
        type=int,
        help="divisor of the Probability of False Positive"
        " (p = 1/prob_div)"
        " (default: 100000)",
    )
    args = parser.parse_args(argv)

    # Reject values that would otherwise surface as an internal assertion
    # failure or a ZeroDivisionError traceback further down.
    for flag, value in (
        ("--users", args.users),
        ("--fingers", args.fingers),
        ("--samples", args.samples),
    ):
        if value < 0:
            parser.error(f"{flag} must not be negative (got {value}).")
    if args.prob_div < 1:
        # p = 1/prob_div must be a valid probability in (0, 1].
        parser.error(f"--prob_div must be at least 1 (got {args.prob_div}).")

    csv_path = pathlib.Path(args.output_csv_file)
    # Path.parent is defined for every path, whereas parents[0] raises
    # IndexError for paths such as "." or "/".
    out_dir = csv_path.parent
    if not out_dir.is_dir():
        parser.error(f'The directory "{out_dir}" doesn\'t exist.')
    if csv_path.is_dir():
        parser.error(f'The output file "{csv_path}" is a directory.')

    user_groups = None
    if args.groups > 0:
        # Groups are labelled with consecutive characters starting at "A".
        user_groups = [chr(a) for a in range(ord("A"), ord("A") + args.groups)]
        print(f"Using generated groups {user_groups}.")
        if args.users % len(user_groups) != 0:
            parser.error(
                f"User ({args.users}) is not divisible by number "
                f"of groups ({len(user_groups)})."
            )
    else:
        print("Disabling groups.")

    prob = 1.0 / args.prob_div
    print(f"Using probability of false positive {prob}.")

    df = GenerateFARResults(
        num_users=args.users,
        num_fingers=args.fingers,
        num_verify_samples=args.samples,
        user_groups=user_groups,
        prob=prob,
        verbose=True,
    )
    print(f"Generated {df.shape[0]} cross matches.")
    print(df)
    print(f"Writing output csv to {csv_path}")
    df.to_csv(csv_path, index=False)
    return 0
if __name__ == "__main__":
    # Run the CLI and hand main()'s return value to the interpreter as the
    # process exit status.
    exit_status = main(sys.argv[1:])
    sys.exit(exit_status)