ml/mojom/soda.mojom - mirrors/cros/chromiumos/platform2 - Git at Google

 // Copyright 2020 The Chromium Authors. All rights reserved.
 // Use of this source code is governed by a BSD-style license that can be
 // found in the LICENSE file.

 // Datatypes and interfaces of speech recognition API.

 // NOTE: This mojom exists in two places and must be kept in sync:
 //       Chromium:  //chromeos/services/machine_learning/public/mojom/
 //       Chrome OS: src/platform2/ml/mojom/
 //       Note: Other repos downstream of Chromium might also use this mojom.
 // Example: A backwards-compatible mojom change (and corresponding
 // implementation change) can be made in Chrome OS first, then replicated to the
 // clients (Chromium, other downstream repos) later.
 // Use //chromeos/services/machine_learning/public/mojom/roll_mojom.sh to help
 // replicate Chrome OS-side changes over to Chromium.

 module chromeos.machine_learning.mojom;

 import "ml/mojom/time.mojom";

 // Configuration used to load the SODA recognizer.
 struct SodaConfig {
   // Number of channels in the audio that will be sent to the SODA recognizer.
   uint32 channel_count;
   // Sample rate of the audio that will be sent to the SODA recognizer.
   // NOTE(review): the unit is presumably Hz -- confirm with the implementation.
   uint32 sample_rate;
   // API key for the SODA library.
   string api_key;
   // Path from which the SODA content is loaded.
   string load_path;
 };

 // The kind of event reported by the endpointer.
 // Values carry no explicit numbers, so they are numbered by declaration
 // order; append new values at the end to keep existing ones stable.
 enum EndpointerType {
   // Speech detected.
   START_OF_SPEECH,
   // End of speech detected, but audio continues.
   END_OF_SPEECH,
   // Audio is terminated.
   END_OF_AUDIO,
   // Query is terminated.
   END_OF_UTTERANCE
 };

 // Common information about the timing of reported SODA events.
 struct TimingInfo {
   // Epoch time of the first audio buffer of the main query that is fed into
   // ASR. This is the wall time read from the system clock when the first audio
   // buffer is received by the terse processor.
   mojo_base.mojom.Time audio_start_epoch;

   // Start time, in audio time, measured from the start of the SODA session.
   // This time measures the amount of audio input into SODA.
   mojo_base.mojom.TimeDelta audio_start_time;

   // Elapsed wall time since the first frame.
   mojo_base.mojom.TimeDelta elapsed_wall_time;

   // Elapsed processed audio time from the first frame after the preamble.
   mojo_base.mojom.TimeDelta event_end_time;

   // On-device benchmark latency as defined in go/asr-latency-metrics.
   mojo_base.mojom.TimeDelta latency;

   // On-device counterpart of the E2E normalized latency as defined in
   // go/asr-latency-metrics. This metric is mainly for non-continuous
   // conversation.
   float normalized_latency;

   // Timing for each word, as an offset from |audio_start_time|.
   array<mojo_base.mojom.TimeDelta> word_alignments;
 };

 // Start/end events reported by the endpointer.
 struct EndpointerEvent {
   // Which endpointer event this is.
   EndpointerType endpointer_type;
   // Timing information for this event. Nullable: may be absent.
   TimingInfo? timing_event;
 };

 // A result _during_ a recognition. Could change at any time with the
 // next partial or the final recognition for this chunk.
 struct PartialResult {
   // Hypotheses so far. The first is the most likely, followed by others.
   // Note: the relationship from the first to the other hypotheses is not
   // guaranteed in any way.
   array<string> partial_text;
   // Timing information for this result. Nullable: may be absent.
   TimingInfo? timing_event;
 };

 // Why a result was endpointed; carried in FinalResult.endpoint_reason.
 // Values carry no explicit numbers, so they are numbered by declaration
 // order; append new values at the end to keep existing ones stable.
 enum EndpointReason {
   // Default value, unknown reason.
   ENDPOINT_UNKNOWN,
   // Due to end_of_speech detection by endpointer.
   ENDPOINT_END_OF_SPEECH,
   // Due to end_of_utterance detection by endpointer.
   ENDPOINT_END_OF_UTTERANCE,
   // Due to the end of mic audio. This could be due to a mic event or SODA
   // being stopped.
   ENDPOINT_END_OF_AUDIO,
 };

 // The final recognition result for a chunk of audio.
 struct FinalResult {
   // Recognition hypotheses, sorted in decreasing order of probability.
   array<string> final_hypotheses;
   // Why this result was finalized.
   EndpointReason endpoint_reason;
   // Timing information for this result. Nullable: may be absent.
   TimingInfo? timing_event;
 };

 // Frequent event from the recognizer, emitted for almost every frame. Gives
 // an indication of speechiness and audio level.
 struct AudioLevelEvent {
   // RMS audio level, from PowerEvaluator. Range is [0, 1).
   float rms;
   // Speech likelihood score, from TerseProcessor. Range is [0, 1).
   // NOTE(review): despite its name, this field is documented as a speech
   // likelihood rather than an audio level -- confirm which is intended.
   float audio_level;
 };

 // This essentially mirrors the subset of SODA's SodaEvent proto we will
 // support. As a union, each event carries exactly one of the members below.
 union SpeechRecognizerEvent {
   // Audio level / speech likelihood update.
   AudioLevelEvent audio_event;
   // Intermediate recognition result that may still change.
   PartialResult partial_result;
   // Start/end event from the endpointer.
   EndpointerEvent endpointer_event;
   // Final recognition result.
   FinalResult final_result;
 };

 // Interface through which the client is notified of SODA state changes and
 // recognition events.
 interface SodaClient {
   // Called after SODA successfully starts / warms up / stops, in case the
   // client cares.
   OnStart();
   OnStop();

   // This is how the client receives actual recognized text as well as other
   // conclusions from the SODA model, like "speech ended".
   OnSpeechRecognizerEvent(SpeechRecognizerEvent event);
 };

 // The mojom interface for performing speech recognition on audio.
 interface SodaRecognizer {
   // Adds audio for speech recognition. |audio| carries the audio as bytes.
   // NOTE(review): the sample format is not specified here; presumably it must
   // match the SodaConfig used to load the recognizer -- confirm.
   AddAudio@0(array<uint8> audio);
   // NOTE(review): undocumented in the original. Presumably requests that
   // recognition stop (cf. SodaClient::OnStop) -- confirm.
   Stop@1();
   // NOTE(review): undocumented in the original. Presumably requests that
   // recognition start (cf. SodaClient::OnStart) -- confirm.
   Start@2();
   // NOTE(review): undocumented in the original. Presumably signals that no
   // more audio will be added -- confirm.
   MarkDone@3();
 };
	// Copyright 2020 The Chromium Authors. All rights reserved.
	// Use of this source code is governed by a BSD-style license that can be
	// found in the LICENSE file.

	// Datatypes and interfaces of speech recognition API.

	// NOTE: This mojom exists in two places and must be kept in sync:
	// Chromium: //chromeos/services/machine_learning/public/mojom/
	// Chrome OS: src/platform2/ml/mojom/
	// Note: Other repos downstream of Chromium might also use this mojom.
	// Example: A backwards-compatible mojom change (and corresponding
	// implementation change) can be made in Chrome OS first, then replicated to the
	// clients (Chromium, other downstream repos) later.
	// Use //chromeos/services/machine_learning/public/mojom/roll_mojom.sh to help
	// replicate Chrome OS-side changes over to Chromium.

	module chromeos.machine_learning.mojom;

	import "ml/mojom/time.mojom";

	// Configuration used to load the SODA recognizer.
	struct SodaConfig {
	// Number of channels in the audio that will be sent to the SODA recognizer.
	uint32 channel_count;
	// Sample rate of the audio that will be sent to the SODA recognizer.
	// NOTE(review): the unit is presumably Hz -- confirm with the implementation.
	uint32 sample_rate;
	// API key for the SODA library.
	string api_key;
	// Path from which the SODA content is loaded.
	string load_path;
	};

	// The kind of event reported by the endpointer.
	// Values carry no explicit numbers, so they are numbered by declaration
	// order; append new values at the end to keep existing ones stable.
	enum EndpointerType {
	// Speech detected.
	START_OF_SPEECH,
	// End of speech detected, but audio continues.
	END_OF_SPEECH,
	// Audio is terminated.
	END_OF_AUDIO,
	// Query is terminated.
	END_OF_UTTERANCE
	};

	// Common information about the timing of reported SODA events.
	struct TimingInfo {
	// Epoch time of the first audio buffer of the main query that is fed into
	// ASR. This is the wall time read from the system clock when the first audio
	// buffer is received by the terse processor.
	mojo_base.mojom.Time audio_start_epoch;

	// Start time, in audio time, measured from the start of the SODA session.
	// This time measures the amount of audio input into SODA.
	mojo_base.mojom.TimeDelta audio_start_time;

	// Elapsed wall time since the first frame.
	mojo_base.mojom.TimeDelta elapsed_wall_time;

	// Elapsed processed audio time from the first frame after the preamble.
	mojo_base.mojom.TimeDelta event_end_time;

	// On-device benchmark latency as defined in go/asr-latency-metrics.
	mojo_base.mojom.TimeDelta latency;

	// On-device counterpart of the E2E normalized latency as defined in
	// go/asr-latency-metrics. This metric is mainly for non-continuous
	// conversation.
	float normalized_latency;

	// Timing for each word, as an offset from |audio_start_time|.
	array<mojo_base.mojom.TimeDelta> word_alignments;
	};

	// Start/end events reported by the endpointer.
	struct EndpointerEvent {
	// Which endpointer event this is.
	EndpointerType endpointer_type;
	// Timing information for this event. Nullable: may be absent.
	TimingInfo? timing_event;
	};

	// A result _during_ a recognition. Could change at any time with the
	// next partial or the final recognition for this chunk.
	struct PartialResult {
	// Hypotheses so far. The first is the most likely, followed by others.
	// Note: the relationship from the first to the other hypotheses is not
	// guaranteed in any way.
	array<string> partial_text;
	// Timing information for this result. Nullable: may be absent.
	TimingInfo? timing_event;
	};

	// Why a result was endpointed; carried in FinalResult.endpoint_reason.
	// Values carry no explicit numbers, so they are numbered by declaration
	// order; append new values at the end to keep existing ones stable.
	enum EndpointReason {
	// Default value, unknown reason.
	ENDPOINT_UNKNOWN,
	// Due to end_of_speech detection by endpointer.
	ENDPOINT_END_OF_SPEECH,
	// Due to end_of_utterance detection by endpointer.
	ENDPOINT_END_OF_UTTERANCE,
	// Due to the end of mic audio. This could be due to a mic event or SODA
	// being stopped.
	ENDPOINT_END_OF_AUDIO,
	};

	// The final recognition result for a chunk of audio.
	struct FinalResult {
	// Recognition hypotheses, sorted in decreasing order of probability.
	array<string> final_hypotheses;
	// Why this result was finalized.
	EndpointReason endpoint_reason;
	// Timing information for this result. Nullable: may be absent.
	TimingInfo? timing_event;
	};

	// Frequent event from the recognizer, emitted for almost every frame. Gives
	// an indication of speechiness and audio level.
	struct AudioLevelEvent {
	// RMS audio level, from PowerEvaluator. Range is [0, 1).
	float rms;
	// Speech likelihood score, from TerseProcessor. Range is [0, 1).
	// NOTE(review): despite its name, this field is documented as a speech
	// likelihood rather than an audio level -- confirm which is intended.
	float audio_level;
	};

	// This essentially mirrors the subset of SODA's SodaEvent proto we will
	// support. As a union, each event carries exactly one of the members below.
	union SpeechRecognizerEvent {
	// Audio level / speech likelihood update.
	AudioLevelEvent audio_event;
	// Intermediate recognition result that may still change.
	PartialResult partial_result;
	// Start/end event from the endpointer.
	EndpointerEvent endpointer_event;
	// Final recognition result.
	FinalResult final_result;
	};

	// Interface through which the client is notified of SODA state changes and
	// recognition events.
	interface SodaClient {
	// Called after SODA successfully starts / warms up / stops, in case the
	// client cares.
	OnStart();
	OnStop();

	// This is how the client receives actual recognized text as well as other
	// conclusions from the SODA model, like "speech ended".
	OnSpeechRecognizerEvent(SpeechRecognizerEvent event);
	};

	// The mojom interface for performing speech recognition on audio.
	interface SodaRecognizer {
	// Adds audio for speech recognition. |audio| carries the audio as bytes.
	// NOTE(review): the sample format is not specified here; presumably it must
	// match the SodaConfig used to load the recognizer -- confirm.
	AddAudio@0(array<uint8> audio);
	// NOTE(review): undocumented in the original. Presumably requests that
	// recognition stop (cf. SodaClient::OnStop) -- confirm.
	Stop@1();
	// NOTE(review): undocumented in the original. Presumably requests that
	// recognition start (cf. SodaClient::OnStart) -- confirm.
	Start@2();
	// NOTE(review): undocumented in the original. Presumably signals that no
	// more audio will be added -- confirm.
	MarkDone@3();
	};