// Copyright 2020 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
// Datatypes and interfaces of the speech recognition API.
// NOTE: This mojom exists in two places and must be kept in sync:
// Chromium: //chromeos/services/machine_learning/public/mojom/
// Chrome OS: src/platform2/ml/mojom/
// Note: Other repos downstream of Chromium might also use this mojom.
// Example: A backwards-compatible mojom change (and corresponding
// implementation change) can be made in Chrome OS first, then replicated to the
// clients (Chromium, other downstream repos) later.
// Use //chromeos/services/machine_learning/public/mojom/roll_mojom.sh to help
// replicate Chrome OS-side changes over to Chromium.
module chromeos.machine_learning.mojom;
import "ml/mojom/time.mojom";
// The configuration used to load the SODA recognizer.
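// For example (illustrative values, not requirements), a client streaming
// mono 16 kHz audio would set channel_count = 1 and sample_rate = 16000.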
struct SodaConfig {
// Number of channels of the audio that will be sent to the SODA recognizer.
uint32 channel_count;
// Sample rate of the audio that will be sent to the SODA recognizer.
uint32 sample_rate;
// The API key for the SODA library.
string api_key;
// Path from which to load the SODA content.
string load_path;
};
// The kind of endpointer event being reported by the endpointer.
enum EndpointerType {
// Speech detected.
START_OF_SPEECH,
// End of speech detected, but audio continues.
END_OF_SPEECH,
// Audio is terminated.
END_OF_AUDIO,
// Query is terminated.
END_OF_UTTERANCE
};
// Common information about the timing of reported SODA events.
struct TimingInfo {
// Epoch time of the first audio buffer of the main query that is fed into
// ASR. This is the wall time read from the system clock when the first audio
// buffer is received by the terse processor.
mojo_base.mojom.Time audio_start_epoch;
// Start time in audio time from the start of the SODA session.
// This time measures the amount of audio input into SODA.
mojo_base.mojom.TimeDelta audio_start_time;
// Elapsed wall time since the first frame.
mojo_base.mojom.TimeDelta elapsed_wall_time;
// Elapsed processed audio time from the first frame after the preamble.
mojo_base.mojom.TimeDelta event_end_time;
// On-device benchmark latency as defined in go/asr-latency-metrics.
mojo_base.mojom.TimeDelta latency;
// On-device counterpart of the E2E normalized latency as defined in
// go/asr-latency-metrics. This metric is mainly for non-continuous
// conversation.
float normalized_latency;
// Timing for each word, as an offset from audio_start_time.
array<mojo_base.mojom.TimeDelta> word_alignments;
};
// Start/end events reported by the endpointer.
struct EndpointerEvent {
EndpointerType endpointer_type;
TimingInfo? timing_event;
};
// A result produced _during_ recognition. It could change at any time with
// the next partial result or the final recognition for this chunk.
struct PartialResult {
// The most likely hypotheses so far. The first entry is the most likely,
// followed by the others. Note: the ordering among the remaining hypotheses
// is not guaranteed in any way.
array<string> partial_text;
TimingInfo? timing_event;
};
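// The reason recognition of the current query was endpointed (ended).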
enum EndpointReason {
// Default value, unknown reason.
ENDPOINT_UNKNOWN,
// Due to end_of_speech detection by endpointer.
ENDPOINT_END_OF_SPEECH,
// Due to end_of_utterance detection by endpointer.
ENDPOINT_END_OF_UTTERANCE,
// Due to the end of mic audio. This could be due to a mic event or SODA
// being stopped.
ENDPOINT_END_OF_AUDIO,
};
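// The final recognition result for a query, produced once recognition of
// that query ends.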
struct FinalResult {
// Sorted in decreasing order of probability.
array<string> final_hypotheses;
EndpointReason endpoint_reason;
TimingInfo? timing_event;
};
// Frequent event from the recognizer, emitted for almost every frame. Gives
// an indication of speechiness and audio level.
struct AudioLevelEvent {
// RMS audio level, from the PowerEvaluator. Score is in [0, 1).
float rms;
// Speech likelihood score, from the TerseProcessor. Score is in [0, 1).
float audio_level;
};
// This essentially mirrors the subset of SODA's SodaEvent proto we will
// support.
union SpeechRecognizerEvent {
AudioLevelEvent audio_event;
PartialResult partial_result;
EndpointerEvent endpointer_event;
FinalResult final_result;
};
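// Implemented by the client of the recognizer; receives status callbacks and
// recognition events from SODA.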
interface SodaClient {
// Called after SODA successfully starts / warms up (OnStart) or stops
// (OnStop), in case the client cares.
OnStart();
OnStop();
// This is how the client receives actual recognized text as well as other
// conclusions from the SODA model like "speech ended".
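// An illustrative handling sketch (non-normative, assuming the generated C++
// bindings and their union accessors; HandlePartial/HandleFinal are
// hypothetical helpers):
//
//   void OnSpeechRecognizerEvent(SpeechRecognizerEventPtr event) override {
//     if (event->is_partial_result())
//       HandlePartial(event->get_partial_result()->partial_text);
//     else if (event->is_final_result())
//       HandleFinal(event->get_final_result()->final_hypotheses);
//     // audio_event and endpointer_event can be inspected the same way.
//   }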
OnSpeechRecognizerEvent(SpeechRecognizerEvent event);
};
// The mojom interface for performing speech recognition.
interface SodaRecognizer {
// Add audio for speech recognition.
AddAudio@0(array<uint8> audio);
// Stop speech recognition.
Stop@1();
// Start speech recognition.
Start@2();
// Mark the audio stream as complete; no more audio will be added.
MarkDone@3();
};
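//
// An illustrative driving sequence (a non-normative sketch assuming the
// generated C++ bindings; the semantics of Start/Stop/MarkDone beyond their
// names, and the `recognizer` remote and `chunk` buffer, are assumptions):
//
//   mojo::Remote<SodaRecognizer> recognizer;  // bound to the service elsewhere
//   recognizer->Start();                      // begin recognition
//   recognizer->AddAudio(chunk);              // repeat for each raw audio chunk
//   recognizer->MarkDone();                   // signal that no more audio follows
//
// Recognition events are then delivered to the bound SodaClient via
// SodaClient::OnSpeechRecognizerEvent().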