| // Copyright 2020 The Chromium Authors. All rights reserved. |
| // Use of this source code is governed by a BSD-style license that can be |
| // found in the LICENSE file. |
| |
| // Datatypes and interfaces of speech recognition API. |
| |
| // NOTE: This mojom exists in two places and must be kept in sync: |
| // Chromium: //chromeos/services/machine_learning/public/mojom/ |
| // Chrome OS: src/platform2/ml/mojom/ |
| // Note: Other repos downstream of Chromium might also use this mojom. |
| // Example: A backwards-compatible mojom change (and corresponding |
| // implementation change) can be made in Chrome OS first, then replicated to the |
| // clients (Chromium, other downstream repos) later. |
| // Use //chromeos/services/machine_learning/public/mojom/roll_mojom.sh to help |
| // replicate Chrome OS-side changes over to Chromium. |
| |
| module chromeos.machine_learning.mojom; |
| |
| import "ml/mojom/time.mojom"; |
| |
| // The configuration used to load the SODA recognizer. |
| struct SodaConfig { |
| // Number of channels of the audio that will be sent to the SODA recognizer. |
| uint32 channel_count; |
| // Sample rate of the audio that will be sent to the SODA recognizer. |
| // NOTE(review): unit is presumably Hz -- confirm against the SODA library. |
| uint32 sample_rate; |
| // The API key for the SODA library. |
| string api_key; |
| // Load path to find the SODA content. |
| // NOTE(review): presumably a filesystem path readable by the ML service -- |
| // confirm with the implementation. |
| string load_path; |
| }; |
| |
| // The kind of endpointer event being recorded. |
| // Values have no explicit ordinals, so they are numbered by declaration |
| // order; add new values at the end to keep the enum backwards-compatible. |
| enum EndpointerType { |
| // Speech detected. |
| START_OF_SPEECH, |
| // End of speech detected, but audio continues. |
| END_OF_SPEECH, |
| // Audio is terminated. |
| END_OF_AUDIO, |
| // Query is terminated. |
| END_OF_UTTERANCE |
| }; |
| |
| // Common information about the timing of reported SODA events. |
| // Durations are carried as mojo_base.mojom.TimeDelta, which has its own |
| // unit; the "usec" wording in the field comments below presumably reflects |
| // the units of the mirrored SODA fields -- TODO confirm. |
| struct TimingInfo { |
| // Epoch time of the first audio buffer of the main query that is fed into |
| // ASR. This is the wall time read from the system clock when the first audio |
| // buffer is received by the terse processor. |
| mojo_base.mojom.Time audio_start_epoch; |
| |
| // Start time in audio time from the start of the SODA session. |
| // This time measures the amount of audio input into SODA. |
| mojo_base.mojom.TimeDelta audio_start_time; |
| |
| // Elapsed wall time usec since the first frame. |
| mojo_base.mojom.TimeDelta elapsed_wall_time; |
| |
| // Elapsed processed audio usec from first frame after preamble. |
| mojo_base.mojom.TimeDelta event_end_time; |
| |
| // On device benchmark latency as defined in go/asr-latency-metrics. |
| mojo_base.mojom.TimeDelta latency; |
| |
| // On device counter part of E2E normalized latency as defined in |
| // go/asr-latency-metrics. This metric is mainly for non-continuous |
| // conversation. |
| float normalized_latency; |
| |
| // Timing for each word as an offset from audio_start_time. |
| // NOTE(review): this comment previously referred to |
| // "audio_start_time_usec", which is not a field of this struct; it is |
| // presumably the name of the corresponding SODA field -- confirm. |
| array<mojo_base.mojom.TimeDelta> word_alignments; |
| }; |
| |
| // A start/end event reported by the endpointer. |
| struct EndpointerEvent { |
| // Which kind of endpointer event this is. |
| EndpointerType endpointer_type; |
| // Timing information for this event; nullable, so it may be absent. |
| TimingInfo? timing_event; |
| }; |
| |
| // A result _during_ a recognition. Could change at any time with the |
| // next partial or the final recognition for this chunk. |
| struct PartialResult { |
| // Most likely hypothesis so far. First is the most likely, followed by others. |
| // Note: the relationship from first to other hypotheses is not guaranteed in |
| // any way. |
| array<string> partial_text; |
| // Timing information for this result; nullable, so it may be absent. |
| TimingInfo? timing_event; |
| }; |
| |
| // The reason an endpoint was reached; carried by |
| // FinalResult.endpoint_reason. |
| // Values have no explicit ordinals, so they are numbered by declaration |
| // order; add new values at the end to keep the enum backwards-compatible. |
| enum EndpointReason { |
| // Default value, unknown reason. |
| ENDPOINT_UNKNOWN, |
| // Due to end_of_speech detection by endpointer. |
| ENDPOINT_END_OF_SPEECH, |
| // Due to end_of_utterance detection by endpointer. |
| ENDPOINT_END_OF_UTTERANCE, |
| // Due to the end of the mic's audio. This could be due to a mic event or |
| // SODA being stopped. |
| ENDPOINT_END_OF_AUDIO, |
| }; |
| |
| // The final recognition result for a chunk of audio. Unlike PartialResult, |
| // NOTE(review): this is presumably not revised by later events -- confirm. |
| struct FinalResult { |
| // Sorted in decreasing order of probability. |
| array<string> final_hypotheses; |
| // Why the recognizer endpointed and produced this result. |
| EndpointReason endpoint_reason; |
| // Timing information for this result; nullable, so it may be absent. |
| TimingInfo? timing_event; |
| }; |
| |
| // Frequent event from recognizer, almost from every frame. Gives an |
| // indication of speechiness and audio level. |
| struct AudioLevelEvent { |
| // RMS audio level, from PowerEvaluator. Score is [0, 1) |
| float rms; |
| // Speech likelihood score, from TerseProcessor. Score is [0, 1) |
| // NOTE(review): despite its name, this field is documented as a speech |
| // likelihood rather than a loudness level -- confirm the naming is intended. |
| float audio_level; |
| }; |
| |
| // This essentially mirrors the subset of SODA's SodaEvent proto we will |
| // support. Being a union, exactly one of the members below is set. |
| union SpeechRecognizerEvent { |
| // Per-frame audio level / speechiness indication. |
| AudioLevelEvent audio_event; |
| // In-progress recognition result that may still change. |
| PartialResult partial_result; |
| // Start/end event from the endpointer. |
| EndpointerEvent endpointer_event; |
| // Final recognition result. |
| FinalResult final_result; |
| }; |
| |
| // Interface through which SODA reports lifecycle notifications and |
| // recognition events back to the client. |
| // NOTE(review): presumably implemented by the caller that loads the |
| // recognizer -- confirm. |
| interface SodaClient { |
| // After SODA successfully starts / warms up / stops, in case the client |
| // cares: |
| OnStart(); |
| OnStop(); |
| |
| // This is how the client receives actual recognized text as well as other |
| // conclusions from the SODA model like "speech ended". |
| OnSpeechRecognizerEvent(SpeechRecognizerEvent event); |
| }; |
| |
| // The mojom interface for performing speech recognition with SODA. |
| // Methods carry explicit ordinals (@N); keep them stable so that the two |
| // copies of this mojom remain wire-compatible. |
| interface SodaRecognizer { |
| // Add Audio for speech recognition. |
| // NOTE(review): the byte layout of |audio| is not specified here; it |
| // presumably matches SodaConfig's channel_count / sample_rate -- confirm. |
| AddAudio@0(array<uint8> audio); |
| // Stops recognition. |
| // NOTE(review): presumably followed by SodaClient.OnStop() -- confirm. |
| Stop@1(); |
| // Starts recognition. |
| // NOTE(review): presumably followed by SodaClient.OnStart() -- confirm. |
| Start@2(); |
| // NOTE(review): presumably signals that no more audio will be added for |
| // the current input so that pending results can be finalized -- confirm. |
| MarkDone@3(); |
| }; |