| // Copyright 2020 The Chromium Authors. All rights reserved. |
| // Use of this source code is governed by a BSD-style license that can be |
| // found in the LICENSE file. |
| |
| // API of the text classifier (libtextclassifier). |
| |
| // NOTE: This mojom exists in two places and must be kept in sync: |
| // Chromium: //chromeos/services/machine_learning/public/mojom/ |
| // Chrome OS: src/platform2/ml/mojom/ |
| // Note: Other repos downstream of Chromium might also use this mojom. |
| // Example: A backwards-compatible mojom change (and corresponding |
| // implementation change) can be made in Chrome OS first, then replicated to the |
| // clients (Chromium, other downstream repos) later. |
| // Use //chromeos/services/machine_learning/public/mojom/roll_mojom.sh to help |
| // replicate Chrome OS-side changes over to Chromium. |
| |
| module chromeos.machine_learning.mojom; |
| |
| // NOTE: The base directory for 'import' statements is expected to differ |
| // between Chromium and Chrome OS versions of this file. |
| // In particular, the Chromium version must import the "time.mojom" that |
| // lives under the mojo folder, i.e. "mojo/public/mojom/base/time.mojom". |
| import "ml/mojom/time.mojom"; |
| |
| // Selects the use case that the annotation results are optimized for. |
| // Must be kept consistent with `AnnotationUsecase` in model.fb in libtextclassifier. |
| [Stable, Extensible] |
| enum AnnotationUsecase { |
| // Results are optimized for Smart{Select,Share,Linkify}. |
| ANNOTATION_USECASE_SMART = 0, |
| // Results are optimized for using TextClassifier as infrastructure that |
| // annotates as much as possible. |
| ANNOTATION_USECASE_RAW = 1, |
| }; |
| |
| // Holds the data extracted from a single text entity. |
| // Currently only the "number" type carries a parsed value, which is stored in |
| // `numeric_value`. For every other type, the annotated substring is stored |
| // in `string_value`. |
| // The values come from the class `ClassificationResult` of tclib. (See |
| // "tclib/annotator/types.h"). |
| // TODO(honglinyu): add data extraction for more types when needed and |
| // available. For example, when "date" data is needed, we should probably add |
| // a new struct "Date" and a new member "Date date_value" to the following |
| // union. |
| [Stable] |
| union TextEntityData { |
| // The parsed numeric value of a "number" entity, |
| // e.g. 34.3 for the input string "34.3". |
| // Not used for any other entity type. |
| double numeric_value@0; |
| // The annotated substring, for entity types such as "url", "address" etc. |
| string string_value@1; |
| }; |
| |
| // An entity type detected for a piece of text (e.g. a phone number, address, |
| // email or url). This struct is a distillation of the `ClassificationResult` |
| // of tclib. (See "tclib/annotator/types.h"). |
| [Stable] |
| struct TextEntity { |
| // The name of the entity type (e.g. "phone", "address", "email", "url"). |
| string name@0; |
| // The confidence score of the entity annotation, in the range 0-1. |
| float confidence_score@1; |
| // Additional data extracted from the text (see `TextEntityData`). |
| TextEntityData data@2; |
| }; |
| |
| // A substring of the annotated text and its possible entities; a simplified |
| // `AnnotatedSpan` of tclib (see "tclib/annotator/types.h"). NOTE(review): the |
| // offsets' unit (bytes vs. codepoints) is not stated here - TODO confirm. |
| [Stable] |
| struct TextAnnotation { |
| // The offset of the first character of the annotation. |
| uint32 start_offset@0; |
| // The offset of the last character of the annotation. |
| uint32 end_offset@1; |
| // The entity types associated with the substring. |
| array<TextEntity> entities@2; |
| }; |
| |
| // The input text together with the parameters used to annotate it. |
| // This is a combination of string and `AnnotationOptions` in tclib (see |
| // "tclib/annotator/types.h"). |
| [Stable] |
| struct TextAnnotationRequest { |
| // The text to be annotated. |
| string text@0; |
| // Comma-delimited locales (e.g., "en", "en,es"). |
| string? default_locales@1; |
| // Comma-separated language tags (presumably BCP 47 - TODO confirm). |
| string? detected_text_language_tags@2; |
| // Tailors the output annotations to the given use case; defaults to SMART. |
| AnnotationUsecase annotation_usecase@3 = ANNOTATION_USECASE_SMART; |
| // The reference ("now") time against which relative datetimes in the text |
| // get resolved. |
| mojo_base.mojom.Time? reference_time@4; |
| // Timezone in which the input text was written (format as accepted by ICU). |
| // If empty (default), the system's timezone is used. |
| string? reference_timezone@5; |
| // Enabled entities. If empty (default), all types of entities will be |
| // enabled. |
| array<string>? enabled_entities@6; |
| }; |
| |
| // Marks a span in a sequence of codepoints. |
| // This struct is consistent with the type `CodepointSpan` of tclib. (See |
| // "tclib/annotator/types.h"). |
| [Stable] |
| struct CodepointSpan { |
| // The codepoint offset of the first character of the span. |
| uint32 start_offset@0; |
| // The codepoint offset of the last character of the span. |
| uint32 end_offset@1; |
| }; |
| |
| // The input text together with the parameters used to suggest a selection. |
| // This is a combination of the inputs of the `SuggestSelection` function |
| // of tclib. (See "tclib/annotator/annotate.h"). |
| [Stable] |
| struct TextSuggestSelectionRequest { |
| // The candidate text. |
| string text@0; |
| // The span of `text` that the user selected. |
| CodepointSpan user_selection@1; |
| // Comma-delimited locales (e.g., "en", "en,es"). |
| string? default_locales@2; |
| // Comma-separated list of BCP 47 language tags. |
| string? detected_text_language_tags@3; |
| // Tailors the output annotations to the given use case; defaults to SMART. |
| AnnotationUsecase annotation_usecase@4 = ANNOTATION_USECASE_SMART; |
| }; |
| |
| // Represents a single language detection result. |
| // Field ordinals are explicit, like every other [Stable] struct in this file. |
| [Stable] |
| struct TextLanguage { |
| // The BCP-47 language code like "en", "fr", "zh" etc. |
| string locale@0; |
| // The confidence score of the language detected (range: 0~1). |
| float confidence@1; |
| }; |
| |
| // Annotates entities in text, suggests selections and detects languages. |
| // Next ordinal: 3 |
| [Stable] |
| interface TextClassifier { |
| // Annotates a text string and returns the detected substrings and their |
| // possible entities. |
| Annotate@0(TextAnnotationRequest request) => |
| (array<TextAnnotation> outputs); |
| // Suggests a selection based on the user's selection. |
| // If the inputs are invalid (e.g., the user selection's start point is behind |
| // the end point), the input user selection will be returned. |
| // NOTE: The selection indices are passed in and returned in terms of |
| // UTF8 codepoints (not bytes). |
| SuggestSelection@1(TextSuggestSelectionRequest request) => |
| (CodepointSpan outputs); |
| // Identifies the languages the text is possibly written in. |
| // The returned results are sorted by confidence score, from the |
| // highest to the lowest. |
| // The maximum number of results returned is determined internally. |
| // Returns an empty array if the language cannot be determined. |
| FindLanguages@2(string text) => (array<TextLanguage> outputs); |
| }; |