blob: 4fe8113844d0cd2f53ffc4f486117885aa833221 [file] [log] [blame]
// Copyright 2020 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
// API of the text classifier (libtextclassifier).
// NOTE: This mojom exists in two places and must be kept in sync:
// Chromium: //chromeos/services/machine_learning/public/mojom/
// Chrome OS: src/platform2/ml/mojom/
// Note: Other repos downstream of Chromium might also use this mojom.
// Example: A backwards-compatible mojom change (and corresponding
// implementation change) can be made in Chrome OS first, then replicated to the
// clients (Chromium, other downstream repos) later.
// Use //chromeos/services/machine_learning/public/mojom/roll_mojom.sh to help
// replicate Chrome OS-side changes over to Chromium.
module chromeos.machine_learning.mojom;
// NOTE: The base directory for 'import' statements is expected to differ
// between Chromium and Chrome OS versions of this file.
// For "time.mojom" in particular, the Chromium side must import the version
// under the mojo folder, i.e. "mojo/public/mojom/base/time.mojom".
import "ml/mojom/time.mojom";
// Selects the use case that annotation results should be tailored to.
// Values must stay consistent with `AnnotationUsecase` in model.fb in
// libtextclassifier.
enum AnnotationUsecase {
  // Results are tuned for Smart{Select,Share,Linkify}.
  ANNOTATION_USECASE_SMART = 0,

  // Results are tuned for callers that use TextClassifier as an
  // infrastructure and want as much as possible to be annotated.
  ANNOTATION_USECASE_RAW = 1,
};
// Holds the data extracted from a single text entity.
//
// At present only the "number" type carries a parsed value, which is stored
// in `numeric_value`. Every other type stores the annotated substring in
// `string_value`.
//
// The values originate from the `ClassificationResult` class of tclib (see
// "tclib/annotator/types.h").
//
// TODO(honglinyu): add data extraction for more types when needed and
// available. For example, when "date" data is needed, we should probably
// introduce a new struct "Date" and add a new member "Date date_value" to
// this union.
union TextEntityData {
  // A numeric value.
  //  - For "number", this is the parsed value; e.g. it is "34.3" for the
  //    input string "34.3".
  double numeric_value@0;

  // The annotated substring; could be for "url", "address" etc.
  string string_value@1;
};
// One type of text entity (phone number, address, email, url etc.) together
// with its score and extracted data.
// This struct is a distilled form of tclib's `ClassificationResult` (see
// "tclib/annotator/types.h").
struct TextEntity {
  // Name of the entity type (e.g. "phone", "address", "email", "url" etc.).
  string name@0;

  // Confidence score of this entity annotation, in the range 0-1.
  float confidence_score@1;

  // Additional data extracted from the text.
  TextEntityData data@2;
};
// A substring of the annotated text, plus the entities it may represent.
// This struct is a simplified form of tclib's `AnnotatedSpan` class (see
// "tclib/annotator/types.h").
struct TextAnnotation {
  // Offset of the first character of the annotation.
  uint32 start_offset@0;

  // Offset of the last character of the annotation.
  // NOTE(review): confirm against tclib whether this offset is inclusive or
  // one-past-the-end.
  uint32 end_offset@1;

  // The entity types associated with the substring.
  array<TextEntity> entities@2;
};
// Input text and parameters for an annotation call.
// Combines the input string with tclib's `AnnotationOptions` (see
// "tclib/annotator/types.h").
struct TextAnnotationRequest {
  // The text to annotate.
  string text@0;

  // Locales, comma-delimited (e.g., "en", "en,es").
  string? default_locales@1;

  // Language tags, as a comma-separated list.
  string? detected_text_language_tags@2;

  // Use case that the output annotations are tailored to.
  AnnotationUsecase annotation_usecase@3 = ANNOTATION_USECASE_SMART;

  // The reference "now" time against which relative datetimes in the text
  // get resolved when they are parsed.
  mojo_base.mojom.Time? reference_time@4;

  // Timezone in which the input text was written (in a format accepted by
  // ICU). If empty (default), the system's timezone is used.
  string? reference_timezone@5;

  // Entities that are enabled. If empty (default), all entity types are
  // enabled.
  array<string>? enabled_entities@6;
};
// A span within a sequence of codepoints.
// Consistent with the `CodepointSpan` type of tclib (see
// "tclib/annotator/types.h").
struct CodepointSpan {
  // Offset of the first character of the span.
  uint32 start_offset@0;

  // Offset of the last character of the span.
  // NOTE(review): confirm against tclib's `CodepointSpan` whether this
  // offset is inclusive or one-past-the-end.
  uint32 end_offset@1;
};
// Input text and parameters for a selection suggestion call.
// Combines the inputs of tclib's `SuggestSelection` function (see
// "tclib/annotator/annotate.h").
struct TextSuggestSelectionRequest {
  // The candidate text.
  string text@0;

  // The span the user selected.
  CodepointSpan user_selection@1;

  // Locales, comma-delimited (e.g., "en", "en,es").
  string? default_locales@2;

  // BCP 47 language tags, as a comma-separated list.
  string? detected_text_language_tags@3;

  // Use case that the output annotations are tailored to.
  AnnotationUsecase annotation_usecase@4 = ANNOTATION_USECASE_SMART;
};
// Represents a single language detection result.
//
// Field ordinals are pinned explicitly, like every other struct in this
// file. They match the implicit declaration-order assignment, so the wire
// format is unchanged.
struct TextLanguage {
  // The BCP-47 language code like "en", "fr", "zh" etc.
  string locale@0;

  // The confidence score of the language detected (range: 0~1).
  float confidence@1;
};
// Annotates entities within text strings.
interface TextClassifier {
  // Annotates a text string, returning the detected substrings along with
  // their possible entities.
  Annotate@0(TextAnnotationRequest request)
      => (array<TextAnnotation> outputs);

  // Suggests a selection based on the user's selection.
  // If the inputs are invalid (e.g., the user selection's start point is
  // behind its end point), the input user selection is returned.
  // NOTE: Selection indices are passed in and returned in terms of UTF8
  // codepoints (not bytes).
  SuggestSelection@1(TextSuggestSelectionRequest request)
      => (CodepointSpan outputs);

  // Identifies the languages the text is possibly written in.
  // Results are sorted by confidence score, from the highest to the lowest.
  // The maximum number of results returned is determined internally.
  // Returns an empty array if the language cannot be determined.
  FindLanguages@2(string text) => (array<TextLanguage> outputs);
};