// Copyright (C) 2017 The Android Open Source Project
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//      http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

// Text classification model configuration.

syntax = "proto2";

// Generate code against the lite protobuf runtime.
option optimize_for = LITE_RUNTIME;

import "external/libtextclassifier/common/embedding-network.proto";
import "external/libtextclassifier/smartselect/tokenizer.proto";

package libtextclassifier;
|
|
|
|
// Generic options of a model; none of them is specific to the selection or
// the sharing model.
message ModelOptions {
  // If true, the model uses the embeddings of a different model. This is
  // mainly useful for the Sharing model, which can reuse the embeddings of
  // the Selection model.
  optional bool use_shared_embeddings = 1;

  // Language of the model.
  optional string language = 2;

  // Version of the model.
  optional int32 version = 3;
}
|
|
|
|
// Options that apply to the selection model only.
message SelectionModelOptions {
  // Unicode codepoints to strip from predicted selections.
  repeated int32 punctuation_to_strip = 1;

  // Whether to strip punctuation after the selection is made.
  optional bool strip_punctuation = 2;

  // Whether to enforce symmetrical selections.
  optional bool enforce_symmetry = 3;

  // Number of inferences made around the click position (to one side) when
  // enforcing symmetry.
  optional int32 symmetry_context_size = 4;
}
|
|
|
|
// Options that apply to the sharing model only.
message SharingModelOptions {
  // If true, "url" is always returned when the url hint is passed in.
  optional bool always_accept_url_hint = 1;

  // If true, "email" is always returned when the e-mail hint is passed in.
  optional bool always_accept_email_hint = 2;

  // Limits for phone numbers: minimum and maximum number of digits.
  optional int32 phone_min_num_digits = 3 [default = 7];
  optional int32 phone_max_num_digits = 4 [default = 15];
}
|
|
|
|
// Configuration of feature processing: tokenization, per-token feature
// extraction, and how selection spans and collections are handled.
//
// Field numbers are part of the wire format. Never renumber a field, and
// never reuse a number listed in the `reserved` statement at the bottom.
message FeatureProcessorOptions {
  // Number of buckets used for hashing charactergrams.
  optional int32 num_buckets = 1 [default = -1];

  // Number of words to the left and to the right of the selected word that
  // are used as context. For example, with a context size of N, the N words
  // to the left and the N words to the right of the selected word form its
  // context.
  optional int32 context_size = 2 [default = -1];

  // Maximum number of words of the context to select in total.
  optional int32 max_selection_span = 3 [default = -1];

  // Orders of charactergrams to extract. E.g., 2 means character bigrams, 3
  // means character trigrams, etc.
  repeated int32 chargram_orders = 4;

  // Maximum length of a word, in codepoints.
  optional int32 max_word_length = 21 [default = 20];

  // If true, the unicode-aware functionality is used for extracting features.
  optional bool unicode_aware_features = 19 [default = false];

  // Whether to extract the token case feature.
  optional bool extract_case_feature = 5 [default = false];

  // Whether to extract the selection mask feature.
  optional bool extract_selection_mask_feature = 6 [default = false];

  // Regexps to run over each token. For each regexp, a dense feature of 1.0
  // is emitted if it matches; otherwise -1.0 is used.
  repeated string regexp_feature = 22;

  // Whether to remap all digits to a single number.
  optional bool remap_digits = 20 [default = false];

  // Whether to lower-case each token before generating hashgrams.
  optional bool lowercase_tokens = 33;

  // If true, the selection classifier output contains only the selections
  // that are feasible (e.g., those that are shorter than max_selection_span).
  // If false, the output is the complete cross-product of possible selections
  // to the left and possible selections to the right, including the
  // infeasible ones.
  // NOTE: Exists mainly for compatibility with older models that were trained
  // with the non-reduced output space.
  optional bool selection_reduced_output_space = 8 [default = true];

  // Collection names.
  repeated string collections = 9;

  // Index into `collections` of the collection to use when a collection name
  // can't be mapped to an id.
  optional int32 default_collection = 10 [default = -1];

  // If true, the input is split by lines and only the line that contains the
  // clicked token is used.
  optional bool only_use_line_with_click = 13 [default = false];

  // If true, tokens that contain the selection boundary are split at the
  // position of the boundary.
  // E.g. "foo{bar}@google.com" -> "foo", "bar", "@google.com"
  optional bool split_tokens_on_selection_boundaries = 14 [default = false];

  // Codepoint ranges that determine how different codepoints are tokenized.
  // The ranges must not overlap.
  repeated TokenizationCodepointRange tokenization_codepoint_config = 15;

  // Method for selecting the center token.
  enum CenterTokenSelectionMethod {
    // Invalid option. As the first declared value, this is also what an
    // unset field of this enum type reads as.
    DEFAULT_CENTER_TOKEN_METHOD = 0;

    // Use click indices to determine the center token.
    CENTER_TOKEN_FROM_CLICK = 1;

    // Use selection indices to get a token range, and select the middle of it
    // as the center token.
    CENTER_TOKEN_MIDDLE_OF_SELECTION = 2;
  }
  optional CenterTokenSelectionMethod center_token_selection_method = 16;

  // If true, span boundaries are snapped to the containing tokens and are not
  // required to exactly match token boundaries.
  optional bool snap_label_span_boundaries_to_containing_tokens = 18;

  // Range of codepoints [start, end): `end` is exclusive.
  message CodepointRange {
    optional int32 start = 1;
    optional int32 end = 2;
  }

  // A set of codepoint ranges supported by the model.
  repeated CodepointRange supported_codepoint_ranges = 23;

  // A set of codepoint ranges used in the mixed tokenization mode to identify
  // stretches of tokens to re-tokenize using the internal tokenizer.
  repeated CodepointRange internal_tokenizer_codepoint_ranges = 34;

  // Minimum ratio of supported codepoints in the input context. If the ratio
  // is lower than this, the feature computation will fail.
  optional float min_supported_codepoint_ratio = 24 [default = 0.0];

  // Used for versioning the format of features the model expects.
  //  - feature_version == 0:
  //      For each token the features consist of:
  //       - chargram embeddings
  //       - dense features
  //      Chargram embeddings for tokens are concatenated first together,
  //      and at the end, the dense features for the tokens are concatenated
  //      to it. So the resulting feature vector has two regions.
  optional int32 feature_version = 25 [default = 0];

  // Controls the type of tokenization the model will use for the input text.
  enum TokenizationType {
    INVALID_TOKENIZATION_TYPE = 0;

    // Use the internal tokenizer for tokenization.
    INTERNAL_TOKENIZER = 1;

    // Use ICU for tokenization.
    ICU = 2;

    // First apply ICU tokenization. Then identify stretches of tokens
    // consisting only of codepoints in internal_tokenizer_codepoint_ranges
    // and re-tokenize them using the internal tokenizer.
    MIXED = 3;
  }
  optional TokenizationType tokenization_type = 30
      [default = INTERNAL_TOKENIZER];

  // NOTE(review): undocumented in the original. The name suggests that
  // whitespace tokens are kept when tokenizing with ICU - confirm against the
  // tokenizer implementation.
  optional bool icu_preserve_whitespace_tokens = 31 [default = false];

  // Field numbers that must not be assigned to new fields.
  reserved 7, 11, 12, 17, 26, 27, 28, 29, 32;
}
|
|
|
|
// Registers the option messages defined in this file as extensions of
// nlp_core.EmbeddingNetworkProto, so that each of them can be carried inside
// an EmbeddingNetworkProto message.
//
// The extension numbers identify the fields on the wire; do not change them.
extend nlp_core.EmbeddingNetworkProto {
  // Generic model options.
  optional ModelOptions model_options_in_embedding_network_proto = 150063045;

  // Feature processing configuration.
  optional FeatureProcessorOptions
      feature_processor_options_in_embedding_network_proto = 146230910;

  // Options of the selection model.
  optional SelectionModelOptions
      selection_model_options_in_embedding_network_proto = 148190899;

  // Options of the sharing model.
  optional SharingModelOptions
      sharing_model_options_in_embedding_network_proto = 151445439;
}
|