// blob: 72d948d0a0015ef58cdeebebfdcc39d7c7346745 [file] [log] [blame]
// Copyright 2020 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
// API of the text classifier (libtextclassifier).
// NOTE: This mojom exists in two places and must be kept in sync:
// Chromium: //chromeos/services/machine_learning/public/mojom/
// Chrome OS: src/platform2/ml/mojom/
// Example: A backwards-compatible mojom change (and corresponding
// implementation change) can be made in Chrome OS first, then replicated to the
// client (Chromium) later.
module chromeos.machine_learning.mojom;
// NOTE: The base directory for 'import' statements is expected to differ
// between Chromium and Chrome OS versions of this file.
// In particular, the Chromium side must import "time.mojom" from the mojo
// folder, i.e. "mojo/public/mojom/base/time.mojom".
import "ml/mojom/time.mojom";
// This enum is persisted to logs: existing entries must keep their numeric
// values, and a numeric value must never be reused for a new entry.
enum TextAnnotationResult {
  OK = 0,
  ERROR = 1,
};
// This enum is persisted to logs: existing entries must keep their numeric
// values, and a numeric value must never be reused for a new entry.
enum SuggestSelectionResult {
  OK = 0,
  ERROR = 1,
};
// Selects the use case that annotation results are tailored to.
// Keep consistent with |AnnotationUsecase| in model.fb in libtextclassifier.
enum AnnotationUsecase {
  // Results are optimized for Smart{Select,Share,Linkify}.
  ANNOTATION_USECASE_SMART = 0,
  // Results are optimized for using TextClassifier as an infrastructure that
  // annotates as much as possible.
  ANNOTATION_USECASE_RAW = 1,
};
// Data extracted from a single text entity.
// At present the real number value is stored in |numeric_value| only for the
// "number" type; for every other type the annotated substring is stored in
// |string_value|.
// The values originate from tclib's |ClassificationResult| class (see
// "tclib/annotator/types.h").
// TODO(honglinyu): add data extraction for more types once needed and
// available. E.g. when "date" data is needed, we should probably add a new
// struct "Date" and a new member "Date date_value" to this union.
union TextEntityData {
  // A numeric value.
  // - For "number", this is the value itself,
  //   e.g. "34.3" for the input string "34.3".
  double numeric_value@0;
  // A string value; could be "url", "address" etc.
  string string_value@1;
};
// A type of text (phone number, address, email, url etc.) recognized in the
// input.
// Distilled from tclib's |ClassificationResult| (see
// "tclib/annotator/types.h").
struct TextEntity {
  // Name of the entity type (e.g. "phone", "address", "email", "url" etc.).
  string name@0;
  // Confidence score of the entity annotation, in the range 0-1.
  float confidence_score@1;
  // Additional data extracted from the text.
  TextEntityData data@2;
};
// One substring of the annotated text plus the entities it may represent.
// Simplified from tclib's |AnnotatedSpan| class (see
// "tclib/annotator/types.h").
struct TextAnnotation {
  // Offset of the first character of the annotation.
  uint32 start_offset@0;
  // Offset of the last character of the annotation.
  // NOTE(review): confirm whether this offset is inclusive or exclusive, and
  // whether offsets count codepoints or bytes.
  uint32 end_offset@1;
  // The set of entity types associated with the substring.
  array<TextEntity> entities@2;
};
// Input text and parameters for an annotation request.
// Combines a string with tclib's |AnnotationOptions| (see
// "tclib/annotator/types.h").
struct TextAnnotationRequest {
  // The text to annotate.
  string text@0;
  // Comma-delimited locales (e.g., "en", "en,es").
  string? default_locales@1;
  // Comma-separated list of language tags.
  // NOTE(review): presumably BCP 47 tags, as stated for the same-named field
  // of |TextSuggestSelectionRequest| -- confirm.
  string? detected_text_language_tags@2;
  // Tailors the output annotations to the specified use case.
  AnnotationUsecase annotation_usecase@3 = ANNOTATION_USECASE_SMART;
  // Reference "now" time against which relative datetimes are resolved when
  // parsing them.
  mojo_base.mojom.Time? reference_time@4;
  // Timezone in which the input text was written (format as accepted by ICU).
  // If empty (default), the system's timezone is used.
  string? reference_timezone@5;
  // Enabled entities. If empty (default), all types of entities are enabled.
  array<string>? enabled_entities@6;
};
// A span within a sequence of codepoints.
// Consistent with tclib's |CodepointSpan| type (see
// "tclib/annotator/types.h").
struct CodepointSpan {
  // Offset of the first character of the span.
  uint32 start_offset@0;
  // Offset of the last character of the span.
  // NOTE(review): confirm whether this offset is inclusive or exclusive.
  uint32 end_offset@1;
};
// Input text and parameters for a suggest-selection request.
// Combines the inputs of tclib's |SuggestSelection| function (see
// "tclib/annotator/annotate.h").
struct TextSuggestSelectionRequest {
  // The candidate text.
  string text@0;
  // The span the user selected.
  CodepointSpan user_selection@1;
  // Comma-delimited locales (e.g., "en", "en,es").
  string? default_locales@2;
  // Comma-separated list of BCP 47 language tags.
  string? detected_text_language_tags@3;
  // Tailors the output annotations to the specified use case.
  AnnotationUsecase annotation_usecase@4 = ANNOTATION_USECASE_SMART;
};
// Annotates entities within text strings.
interface TextClassifier {
  // Annotates a text string, returning the detected substrings together with
  // their possible entities.
  Annotate@0(TextAnnotationRequest request) => (array<TextAnnotation> outputs);
  // Suggests a selection based on the user's selection.
  // If the inputs are invalid (e.g., the user selection's start point is
  // behind the end point), the input user selection is returned.
  // NOTE: Selection indices are passed in and returned in terms of UTF8
  // codepoints (not bytes).
  SuggestSelection@1(TextSuggestSelectionRequest request) =>
      (CodepointSpan outputs);
};