// ml/mojom/text_classifier.mojom - mirrors/cros/chromiumos/platform2 - Git at Google

 // Copyright 2020 The Chromium Authors. All rights reserved.
 // Use of this source code is governed by a BSD-style license that can be
 // found in the LICENSE file.

 // API of the text classifier (libtextclassifier).

 // NOTE: This mojom exists in two places and must be kept in sync:
 //       Chromium:  //chromeos/services/machine_learning/public/mojom/
 //       Chrome OS: src/platform2/ml/mojom/
 //       Note: Other repos downstream of Chromium might also use this mojom.
 // Example: A backwards-compatible mojom change (and corresponding
 // implementation change) can be made in Chrome OS first, then replicated to the
 // clients (Chromium, other downstream repos) later.
 // Use //chromeos/services/machine_learning/public/mojom/roll_mojom.sh to help
 // replicate Chrome OS-side changes over to Chromium.

 module chromeos.machine_learning.mojom;

 // NOTE: The base directory for 'import' statements is expected to differ
 //       between Chromium and Chrome OS versions of this file.
 //       And for "time.mojom", on the chromium side, we have to use the version
 //       under mojo folder, that is, "mojo/public/mojom/base/time.mojom".
 import "ml/mojom/time.mojom";

 // Selects the use case that annotation results are tuned for.
 // Must be consistent with `AnnotationUsecase` in model.fb in libtextclassifier.
 // Used by the `annotation_usecase` field of `TextAnnotationRequest` and
 // `TextSuggestSelectionRequest`, both of which default to
 // `ANNOTATION_USECASE_SMART`.
 [Stable, Extensible]
 enum AnnotationUsecase {
   // Results are optimized for Smart{Select,Share,Linkify}.
   ANNOTATION_USECASE_SMART = 0,
   // Results are optimized for using TextClassifier as an infrastructure that
   // annotates as much as possible.
   ANNOTATION_USECASE_RAW = 1,
 };

 // Stores the data extracted from a single text entity. Exactly one member of
 // the union is set.
 // Currently, only for the "number" type is the real number value stored, in
 // `numeric_value`. For all other types, the annotated substring is stored in
 // `string_value`.
 // The values come from the class `ClassificationResult` of tclib. (See
 // "tclib/annotator/types.h").
 // TODO(honglinyu): add data extraction for more types when needed and
 // available. For example, when "date" data is needed, probably we should add
 // a new struct "Date" and add a new member "Date date_value" to the following
 // union.
 [Stable]
 union TextEntityData {
   // A numeric value.
   //  - For "number", it is the parsed value,
   //    e.g. 34.3 for the input string "34.3".
   double numeric_value@0;
   // The annotated substring for non-numeric entity types (could be "url",
   // "address" etc.).
   string string_value@1;
 };

 // One candidate entity type for a piece of text (can be phone numbers,
 // addresses, emails and urls etc.), with its confidence and extracted data.
 // This struct is a distillation of the `ClassificationResult` of tclib. (See
 // "tclib/annotator/types.h").
 [Stable]
 struct TextEntity {
   // The name of the type (e.g. "phone", "address", "email" and "url" etc.).
   string name@0;
   // The confidence score of the entity annotation; the range is 0-1.
   float confidence_score@1;
   // Additional data extracted from the text: a number for the "number" type,
   // otherwise the annotated substring (see `TextEntityData`).
   TextEntityData data@2;
 };

 // A substring of the annotated text and its possible associated entities.
 // This struct is a simplification of the `AnnotatedSpan` class of tclib. (See
 // "tclib/annotator/types.h").
 // NOTE(review): this file does not state whether the offsets count codepoints
 // or bytes, nor whether `end_offset` is inclusive or exclusive — confirm
 // against tclib before relying on either.
 [Stable]
 struct TextAnnotation {
   // The offset of the first character of the annotation.
   uint32 start_offset@0;
   // The offset of the last character of the annotation.
   uint32 end_offset@1;
   // The set of entity types associated with the substring.
   array<TextEntity> entities@2;
 };

 // Contains the input text and the parameters used to annotate it; this is the
 // argument of `TextClassifier.Annotate`.
 // This is a combination of string and `AnnotationOptions` in tclib (see
 // "tclib/annotator/types.h").
 // Only `text` is mandatory; fields declared with `?` are nullable.
 [Stable]
 struct TextAnnotationRequest {
   // The text to be annotated.
   string text@0;
   // Comma-delimited locales (e.g., "en", "en,es").
   string? default_locales@1;
   // Comma-separated list of language tags (presumably BCP 47, as documented
   // for the same field of `TextSuggestSelectionRequest` — confirm).
   string? detected_text_language_tags@2;
   // Tailors the output annotations according to the specified use-case.
   // Defaults to `ANNOTATION_USECASE_SMART`.
   AnnotationUsecase annotation_usecase@3 = ANNOTATION_USECASE_SMART;
   // For parsing relative datetimes, the reference "now" time against which
   // the relative datetimes get resolved.
   mojo_base.mojom.Time? reference_time@4;
   // Timezone in which the input text was written (format as accepted by ICU).
   // If empty (default), will use the system's timezone.
   string? reference_timezone@5;
   // Enabled entities. If empty (default), all types of entities will be
   // enabled.
   array<string>? enabled_entities@6;
 };

 // Marks a span in a sequence of codepoints. Used both as the user's selection
 // in `TextSuggestSelectionRequest` and as the result of
 // `TextClassifier.SuggestSelection`.
 // This struct is consistent with the type `CodepointSpan` of tclib. (See
 // "tclib/annotator/types.h").
 // NOTE(review): this file does not state whether `end_offset` is inclusive
 // or exclusive — confirm against tclib.
 [Stable]
 struct CodepointSpan {
   // The offset of the first character of the span.
   uint32 start_offset@0;
   // The offset of the last character of the span.
   uint32 end_offset@1;
 };

 // Contains the input text and the parameters used to suggest a selection;
 // this is the argument of `TextClassifier.SuggestSelection`.
 // This is a combination of the inputs of the `SuggestSelection` function
 // of tclib. (See "tclib/annotator/annotate.h").
 // `text` and `user_selection` are mandatory; fields declared with `?` are
 // nullable.
 [Stable]
 struct TextSuggestSelectionRequest {
   // The candidate text.
   string text@0;
   // The span the user selected within `text`.
   CodepointSpan user_selection@1;
   // Comma-delimited locales (e.g., "en", "en,es").
   string? default_locales@2;
   // Comma-separated list of BCP 47 language tags.
   string? detected_text_language_tags@3;
   // Tailors the output annotations according to the specified use-case.
   // Defaults to `ANNOTATION_USECASE_SMART`.
   AnnotationUsecase annotation_usecase@4 = ANNOTATION_USECASE_SMART;
 };

 // Represents a single language detection result, as returned by
 // `TextClassifier.FindLanguages`.
 // Field ordinals are spelled out explicitly, like every other [Stable] struct
 // in this file. They equal the ordinals that were previously assigned
 // implicitly by declaration order, so the wire format is unchanged.
 [Stable]
 struct TextLanguage {
   // The BCP-47 language code like "en", "fr", "zh" etc.
   string locale@0;
   // The confidence score of the language detected (range: 0~1).
   float confidence@1;
 };

 // Used to annotate entities within text strings, to suggest selections and to
 // detect the languages of a text.
 // Next ordinal: 3
 [Stable]
 interface TextClassifier {
   // Annotates a text string and returns the detected substrings and their
   // possible entities.
   Annotate@0(TextAnnotationRequest request) =>
       (array<TextAnnotation> outputs);
   // Suggests a selection based on the user's selection.
   // If the inputs are invalid (e.g., the user selection's start point is
   // behind the end point), the input user selection will be returned.
   // NOTE: The selection indices are passed in and returned in terms of
   // UTF8 codepoints (not bytes).
   SuggestSelection@1(TextSuggestSelectionRequest request) =>
       (CodepointSpan outputs);
   // Identifies the languages the text is possibly written in.
   // The returned results are sorted according to the confidence score, from the
   // highest to the lowest.
   // The maximum number of results returned is determined internally.
   // Will return an empty array if the language cannot be determined.
   FindLanguages@2(string text) => (array<TextLanguage> outputs);
 };
	// Copyright 2020 The Chromium Authors. All rights reserved.
	// Use of this source code is governed by a BSD-style license that can be
	// found in the LICENSE file.

	// API of the text classifier (libtextclassifier).

	// NOTE: This mojom exists in two places and must be kept in sync:
	// Chromium: //chromeos/services/machine_learning/public/mojom/
	// Chrome OS: src/platform2/ml/mojom/
	// Note: Other repos downstream of Chromium might also use this mojom.
	// Example: A backwards-compatible mojom change (and corresponding
	// implementation change) can be made in Chrome OS first, then replicated to the
	// clients (Chromium, other downstream repos) later.
	// Use //chromeos/services/machine_learning/public/mojom/roll_mojom.sh to help
	// replicate Chrome OS-side changes over to Chromium.

	module chromeos.machine_learning.mojom;

	// NOTE: The base directory for 'import' statements is expected to differ
	// between Chromium and Chrome OS versions of this file.
	// And for "time.mojom", on the chromium side, we have to use the version
	// under mojo folder, that is, "mojo/public/mojom/base/time.mojom".
	import "ml/mojom/time.mojom";

	// Selects the use case that annotation results are tuned for.
	// Must be consistent with `AnnotationUsecase` in model.fb in libtextclassifier.
	// Used by the `annotation_usecase` field of `TextAnnotationRequest` and
	// `TextSuggestSelectionRequest`, both of which default to
	// `ANNOTATION_USECASE_SMART`.
	[Stable, Extensible]
	enum AnnotationUsecase {
	// Results are optimized for Smart{Select,Share,Linkify}.
	ANNOTATION_USECASE_SMART = 0,
	// Results are optimized for using TextClassifier as an infrastructure that
	// annotates as much as possible.
	ANNOTATION_USECASE_RAW = 1,
	};

	// Stores the data extracted from a single text entity. Exactly one member of
	// the union is set.
	// Currently, only for the "number" type is the real number value stored, in
	// `numeric_value`. For all other types, the annotated substring is stored in
	// `string_value`.
	// The values come from the class `ClassificationResult` of tclib. (See
	// "tclib/annotator/types.h").
	// TODO(honglinyu): add data extraction for more types when needed and
	// available. For example, when "date" data is needed, probably we should add
	// a new struct "Date" and add a new member "Date date_value" to the following
	// union.
	[Stable]
	union TextEntityData {
	// A numeric value.
	// - For "number", it is the parsed value,
	// e.g. 34.3 for the input string "34.3".
	double numeric_value@0;
	// The annotated substring for non-numeric entity types (could be "url",
	// "address" etc.).
	string string_value@1;
	};

	// One candidate entity type for a piece of text (can be phone numbers,
	// addresses, emails and urls etc.), with its confidence and extracted data.
	// This struct is a distillation of the `ClassificationResult` of tclib. (See
	// "tclib/annotator/types.h").
	[Stable]
	struct TextEntity {
	// The name of the type (e.g. "phone", "address", "email" and "url" etc.).
	string name@0;
	// The confidence score of the entity annotation; the range is 0-1.
	float confidence_score@1;
	// Additional data extracted from the text: a number for the "number" type,
	// otherwise the annotated substring (see `TextEntityData`).
	TextEntityData data@2;
	};

	// A substring of the annotated text and its possible associated entities.
	// This struct is a simplification of the `AnnotatedSpan` class of tclib. (See
	// "tclib/annotator/types.h").
	// NOTE(review): this file does not state whether the offsets count codepoints
	// or bytes, nor whether `end_offset` is inclusive or exclusive — confirm
	// against tclib before relying on either.
	[Stable]
	struct TextAnnotation {
	// The offset of the first character of the annotation.
	uint32 start_offset@0;
	// The offset of the last character of the annotation.
	uint32 end_offset@1;
	// The set of entity types associated with the substring.
	array<TextEntity> entities@2;
	};

	// Contains the input text and the parameters used to annotate it; this is the
	// argument of `TextClassifier.Annotate`.
	// This is a combination of string and `AnnotationOptions` in tclib (see
	// "tclib/annotator/types.h").
	// Only `text` is mandatory; fields declared with `?` are nullable.
	[Stable]
	struct TextAnnotationRequest {
	// The text to be annotated.
	string text@0;
	// Comma-delimited locales (e.g., "en", "en,es").
	string? default_locales@1;
	// Comma-separated list of language tags (presumably BCP 47, as documented
	// for the same field of `TextSuggestSelectionRequest` — confirm).
	string? detected_text_language_tags@2;
	// Tailors the output annotations according to the specified use-case.
	// Defaults to `ANNOTATION_USECASE_SMART`.
	AnnotationUsecase annotation_usecase@3 = ANNOTATION_USECASE_SMART;
	// For parsing relative datetimes, the reference "now" time against which
	// the relative datetimes get resolved.
	mojo_base.mojom.Time? reference_time@4;
	// Timezone in which the input text was written (format as accepted by ICU).
	// If empty (default), will use the system's timezone.
	string? reference_timezone@5;
	// Enabled entities. If empty (default), all types of entities will be
	// enabled.
	array<string>? enabled_entities@6;
	};

	// Marks a span in a sequence of codepoints. Used both as the user's selection
	// in `TextSuggestSelectionRequest` and as the result of
	// `TextClassifier.SuggestSelection`.
	// This struct is consistent with the type `CodepointSpan` of tclib. (See
	// "tclib/annotator/types.h").
	// NOTE(review): this file does not state whether `end_offset` is inclusive
	// or exclusive — confirm against tclib.
	[Stable]
	struct CodepointSpan {
	// The offset of the first character of the span.
	uint32 start_offset@0;
	// The offset of the last character of the span.
	uint32 end_offset@1;
	};

	// Contains the input text and the parameters used to suggest a selection;
	// this is the argument of `TextClassifier.SuggestSelection`.
	// This is a combination of the inputs of the `SuggestSelection` function
	// of tclib. (See "tclib/annotator/annotate.h").
	// `text` and `user_selection` are mandatory; fields declared with `?` are
	// nullable.
	[Stable]
	struct TextSuggestSelectionRequest {
	// The candidate text.
	string text@0;
	// The span the user selected within `text`.
	CodepointSpan user_selection@1;
	// Comma-delimited locales (e.g., "en", "en,es").
	string? default_locales@2;
	// Comma-separated list of BCP 47 language tags.
	string? detected_text_language_tags@3;
	// Tailors the output annotations according to the specified use-case.
	// Defaults to `ANNOTATION_USECASE_SMART`.
	AnnotationUsecase annotation_usecase@4 = ANNOTATION_USECASE_SMART;
	};

	// Represents a single language detection result, as returned by
	// `TextClassifier.FindLanguages`.
	// Field ordinals are spelled out explicitly, like every other [Stable] struct
	// in this file. They equal the ordinals that were previously assigned
	// implicitly by declaration order, so the wire format is unchanged.
	[Stable]
	struct TextLanguage {
	// The BCP-47 language code like "en", "fr", "zh" etc.
	string locale@0;
	// The confidence score of the language detected (range: 0~1).
	float confidence@1;
	};

	// Used to annotate entities within text strings, to suggest selections and to
	// detect the languages of a text.
	// Next ordinal: 3
	[Stable]
	interface TextClassifier {
	// Annotates a text string and returns the detected substrings and their
	// possible entities.
	Annotate@0(TextAnnotationRequest request) =>
	(array<TextAnnotation> outputs);
	// Suggests a selection based on the user's selection.
	// If the inputs are invalid (e.g., the user selection's start point is
	// behind the end point), the input user selection will be returned.
	// NOTE: The selection indices are passed in and returned in terms of
	// UTF8 codepoints (not bytes).
	SuggestSelection@1(TextSuggestSelectionRequest request) =>
	(CodepointSpan outputs);
	// Identifies the languages the text is possibly written in.
	// The returned results are sorted according to the confidence score, from the
	// highest to the lowest.
	// The maximum number of results returned is determined internally.
	// Will return an empty array if the language cannot be determined.
	FindLanguages@2(string text) => (array<TextLanguage> outputs);
	};