blob: 4a28c90b4cdf9c44238f2f91baa55e6fb75238dc [file] [log] [blame]
// Copyright 2020 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
// Datatypes and interfaces of speech recognition API.
// NOTE: This mojom exists in two places and must be kept in sync:
// Chromium: //chromeos/services/machine_learning/public/mojom/
// Chrome OS: src/platform2/ml/mojom/
// Note: Other repos downstream of Chromium might also use this mojom.
// Example: A backwards-compatible mojom change (and corresponding
// implementation change) can be made in Chrome OS first, then replicated to the
// clients (Chromium, other downstream repos) later.
// Use //chromeos/services/machine_learning/public/mojom/ to help
// replicate Chrome OS-side changes over to Chromium.
module chromeos.machine_learning.mojom;
import "ml/mojom/time.mojom";
// Configuration used to load the SODA recognizer.
struct SodaConfig {
  // Number of channels in the audio that will be sent to the SODA recognizer.
  uint32 channel_count;

  // Sample rate of the audio that will be sent to the SODA recognizer.
  uint32 sample_rate;

  // API key for the SODA library.
  string api_key;

  // Path to the already-installed SODA library.
  string library_dlc_path;

  // Path to the already-installed SODA language pack to use.
  string language_dlc_path;
};
// From the endpointer: what kind of endpointer event to record.
// NOTE(review): each of the four comments inside the body describes one
// enumerator, but no enumerator declarations and no closing "};" are visible
// in this view of the file. Confirm against the Chrome OS copy of this mojom
// before relying on this definition.
[Stable, Extensible]
enum EndpointerType {
// Speech detected.
// End of speech detected, but audio continues.
// Audio is terminated.
// Query is terminated.
// Common information about the timing of reported SODA events.
struct TimingInfo {
  // Epoch time of the first audio buffer of the main query that is fed into
  // ASR. This is the wall time read from the system clock when the first audio
  // buffer is received by the terse processor.
  mojo_base.mojom.Time audio_start_epoch;

  // Start time in audio time from the start of the SODA session.
  // This time measures the amount of audio input into SODA.
  mojo_base.mojom.TimeDelta audio_start_time;

  // Elapsed wall time since the first frame.
  mojo_base.mojom.TimeDelta elapsed_wall_time;

  // Elapsed processed audio time from the first frame after the preamble.
  mojo_base.mojom.TimeDelta event_end_time;

  // On-device benchmark latency as defined in go/asr-latency-metrics.
  mojo_base.mojom.TimeDelta latency;

  // On-device counterpart of E2E normalized latency as defined in
  // go/asr-latency-metrics. This metric is mainly for non-continuous
  // conversation.
  float normalized_latency;

  // Timing for each word as an offset from audio_start_time.
  array<mojo_base.mojom.TimeDelta> word_alignments;
};
// Start/end events.
struct EndpointerEvent {
  // Which kind of endpointer event this is.
  EndpointerType endpointer_type;

  // Timing of the event. Nullable, so it may be absent.
  TimingInfo? timing_event;
};
// A result _during_ a recognition. Could change at any time with the
// next partial or the final recognition for this chunk.
struct PartialResult {
  // Hypotheses so far. The first is the most likely, followed by others.
  // Note: the relationship from the first to the other hypotheses is not
  // guaranteed in any way.
  array<string> partial_text;

  // Timing of this partial result. Nullable, so it may be absent.
  TimingInfo? timing_event;
};
// Reason attached to a final result (see FinalResult.endpoint_reason).
// NOTE(review): each comment inside the body describes one enumerator, but no
// enumerator declarations and no closing "};" are visible in this view of the
// file. Confirm against the Chrome OS copy of this mojom.
[Stable, Extensible]
enum EndpointReason {
// Default value, unknown reason.
// Due to end_of_speech detection by endpointer.
// Due to end_of_utterance detection by endpointer.
// Due to the end of mic audio. This could be due to a mic event or SODA
// being stopped.
// The final result of a recognition: the hypotheses, the reason the
// recognition was endpointed, and optional timing.
struct FinalResult {
  // Sorted in decreasing order of probability.
  array<string> final_hypotheses;

  // Why this result was finalized.
  EndpointReason endpoint_reason;

  // Timing of this final result. Nullable, so it may be absent.
  TimingInfo? timing_event;
};
// Frequent event from the recognizer, emitted for almost every frame. Gives an
// indication of speechiness and audio level.
struct AudioLevelEvent {
  // RMS audio level, from PowerEvaluator. Score is in [0, 1).
  float rms;

  // Speech likelihood score, from TerseProcessor. Score is in [0, 1).
  float audio_level;
};
// This essentially mirrors the subset of SODA's SodaEvent proto we will
// support. Being a union, each value carries exactly one of the members below.
union SpeechRecognizerEvent {
  AudioLevelEvent audio_event;
  PartialResult partial_result;
  EndpointerEvent endpointer_event;
  FinalResult final_result;
};
// This interface is called upon by the SodaRecognizer. It is implemented by
// the client; SODA then calls these methods as 'events' with appropriate
// details when recognition occurs.
// NOTE(review): only ordinal 2 is declared in this view even though the next
// free ordinal is 3, and the start / warm-up / stop comment in the body has no
// method under it. Comments elsewhere in this file refer to
// SodaClient::OnStart and SodaClient::OnStop, so those declarations (and the
// closing "};") appear to be missing here. Confirm against the Chrome OS copy
// of this mojom.
// Next ordinal: 3
interface SodaClient {
// After SODA successfully starts / warms up / stops, in case the client
// cares:
// This is how the client receives actual recognized text as well as other
// conclusions from the SODA model like "speech ended".
OnSpeechRecognizerEvent@2(SpeechRecognizerEvent event);
// The mojom interface for performing speech recognition with SODA.
// Next ordinal: 4
interface SodaRecognizer {
// Add Audio for speech recognition.
AddAudio@0(array<uint8> audio);
// Instruct SODA to stop processing immediately. Stopping is
// confirmed when SodaClient::OnStop is called back.
// Instruct SODA to start processing. No-op if already
// processing. When stopped, causes a SodaClient::OnStart callback.
// Instruct SODA to stop processing after all queued audio is
// processed. Will eventually result in a SodaClient::OnStop, but only
// after all audio currently in queue is decoded.