614 lines
24 KiB
C++
614 lines
24 KiB
C++
/*
 * Copyright (C) 2017 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
|
|
|
|
#include "smartselect/feature-processor.h"
|
|
|
|
#include "gmock/gmock.h"
|
|
#include "gtest/gtest.h"
|
|
|
|
namespace libtextclassifier {
|
|
namespace {
|
|
|
|
using testing::ElementsAreArray;
|
|
using testing::FloatEq;
|
|
|
|
// Both selection boundaries fall strictly inside the middle token, so that
// token is expected to be cut into three pieces.
TEST(FeatureProcessorTest, SplitTokensOnSelectionBoundariesMiddle) {
  const CodepointSpan selection = {9, 12};
  std::vector<Token> tokens = {
      Token("Hělló", 0, 5),
      Token("fěěbař@google.com", 6, 23),
      Token("heře!", 24, 29),
  };

  internal::SplitTokensOnSelectionBoundaries(selection, &tokens);

  // clang-format off
  const std::vector<Token> expected = {
      Token("Hělló", 0, 5),
      Token("fěě", 6, 9),
      Token("bař", 9, 12),
      Token("@google.com", 12, 23),
      Token("heře!", 24, 29),
  };
  // clang-format on
  EXPECT_THAT(tokens, ElementsAreArray(expected));
}
|
|
|
|
// The selection starts exactly at the beginning of the middle token, so only
// the selection end is expected to introduce a split.
TEST(FeatureProcessorTest, SplitTokensOnSelectionBoundariesBegin) {
  const CodepointSpan selection = {6, 12};
  std::vector<Token> tokens = {
      Token("Hělló", 0, 5),
      Token("fěěbař@google.com", 6, 23),
      Token("heře!", 24, 29),
  };

  internal::SplitTokensOnSelectionBoundaries(selection, &tokens);

  // clang-format off
  const std::vector<Token> expected = {
      Token("Hělló", 0, 5),
      Token("fěěbař", 6, 12),
      Token("@google.com", 12, 23),
      Token("heře!", 24, 29),
  };
  // clang-format on
  EXPECT_THAT(tokens, ElementsAreArray(expected));
}
|
|
|
|
// The selection ends exactly at the end of the middle token, so only the
// selection start is expected to introduce a split.
TEST(FeatureProcessorTest, SplitTokensOnSelectionBoundariesEnd) {
  const CodepointSpan selection = {9, 23};
  std::vector<Token> tokens = {
      Token("Hělló", 0, 5),
      Token("fěěbař@google.com", 6, 23),
      Token("heře!", 24, 29),
  };

  internal::SplitTokensOnSelectionBoundaries(selection, &tokens);

  // clang-format off
  const std::vector<Token> expected = {
      Token("Hělló", 0, 5),
      Token("fěě", 6, 9),
      Token("bař@google.com", 9, 23),
      Token("heře!", 24, 29),
  };
  // clang-format on
  EXPECT_THAT(tokens, ElementsAreArray(expected));
}
|
|
|
|
// The selection coincides with the middle token's boundaries, so the token
// list is expected to come back unchanged.
TEST(FeatureProcessorTest, SplitTokensOnSelectionBoundariesWhole) {
  const CodepointSpan selection = {6, 23};
  std::vector<Token> tokens = {
      Token("Hělló", 0, 5),
      Token("fěěbař@google.com", 6, 23),
      Token("heře!", 24, 29),
  };
  // Snapshot of the input; the expected output is the very same sequence.
  const std::vector<Token> unchanged = tokens;

  internal::SplitTokensOnSelectionBoundaries(selection, &tokens);

  EXPECT_THAT(tokens, ElementsAreArray(unchanged));
}
|
|
|
|
// The selection starts inside the first token and ends inside the second one,
// so each of those two tokens is expected to be split once.
TEST(FeatureProcessorTest, SplitTokensOnSelectionBoundariesCrossToken) {
  const CodepointSpan selection = {2, 9};
  std::vector<Token> tokens = {
      Token("Hělló", 0, 5),
      Token("fěěbař@google.com", 6, 23),
      Token("heře!", 24, 29),
  };

  internal::SplitTokensOnSelectionBoundaries(selection, &tokens);

  // clang-format off
  const std::vector<Token> expected = {
      Token("Hě", 0, 2),
      Token("lló", 2, 5),
      Token("fěě", 6, 9),
      Token("bař@google.com", 9, 23),
      Token("heře!", 24, 29),
  };
  // clang-format on
  EXPECT_THAT(tokens, ElementsAreArray(expected));
}
|
|
|
|
// A click on the first line should leave only the first line's tokens.
TEST(FeatureProcessorTest, KeepLineWithClickFirst) {
  const std::string text = "Fiřst Lině\nSěcond Lině\nThiřd Lině";
  const CodepointSpan click = {0, 5};
  // clang-format off
  std::vector<Token> tokens = {
      Token("Fiřst", 0, 5),
      Token("Lině", 6, 10),
      Token("Sěcond", 11, 17),
      Token("Lině", 18, 22),
      Token("Thiřd", 23, 28),
      Token("Lině", 29, 33),
  };
  // clang-format on

  internal::StripTokensFromOtherLines(text, click, &tokens);

  const std::vector<Token> first_line = {Token("Fiřst", 0, 5),
                                         Token("Lině", 6, 10)};
  EXPECT_THAT(tokens, ElementsAreArray(first_line));
}
|
|
|
|
// A click on the second line should leave only the second line's tokens.
TEST(FeatureProcessorTest, KeepLineWithClickSecond) {
  const std::string text = "Fiřst Lině\nSěcond Lině\nThiřd Lině";
  const CodepointSpan click = {18, 22};
  // clang-format off
  std::vector<Token> tokens = {
      Token("Fiřst", 0, 5),
      Token("Lině", 6, 10),
      Token("Sěcond", 11, 17),
      Token("Lině", 18, 22),
      Token("Thiřd", 23, 28),
      Token("Lině", 29, 33),
  };
  // clang-format on

  internal::StripTokensFromOtherLines(text, click, &tokens);

  const std::vector<Token> second_line = {Token("Sěcond", 11, 17),
                                          Token("Lině", 18, 22)};
  EXPECT_THAT(tokens, ElementsAreArray(second_line));
}
|
|
|
|
// A click on the third line should leave only the third line's tokens.
TEST(FeatureProcessorTest, KeepLineWithClickThird) {
  const std::string text = "Fiřst Lině\nSěcond Lině\nThiřd Lině";
  const CodepointSpan click = {24, 33};
  // clang-format off
  std::vector<Token> tokens = {
      Token("Fiřst", 0, 5),
      Token("Lině", 6, 10),
      Token("Sěcond", 11, 17),
      Token("Lině", 18, 22),
      Token("Thiřd", 23, 28),
      Token("Lině", 29, 33),
  };
  // clang-format on

  internal::StripTokensFromOtherLines(text, click, &tokens);

  const std::vector<Token> third_line = {Token("Thiřd", 23, 28),
                                         Token("Lině", 29, 33)};
  EXPECT_THAT(tokens, ElementsAreArray(third_line));
}
|
|
|
|
// Same as the second-line case, but the first two segments are separated by
// '|' instead of '\n'; the expectation below treats '|' as a line break too.
TEST(FeatureProcessorTest, KeepLineWithClickSecondWithPipe) {
  const std::string text = "Fiřst Lině|Sěcond Lině\nThiřd Lině";
  const CodepointSpan click = {18, 22};
  // clang-format off
  std::vector<Token> tokens = {
      Token("Fiřst", 0, 5),
      Token("Lině", 6, 10),
      Token("Sěcond", 11, 17),
      Token("Lině", 18, 22),
      Token("Thiřd", 23, 28),
      Token("Lině", 29, 33),
  };
  // clang-format on

  internal::StripTokensFromOtherLines(text, click, &tokens);

  const std::vector<Token> second_segment = {Token("Sěcond", 11, 17),
                                             Token("Lině", 18, 22)};
  EXPECT_THAT(tokens, ElementsAreArray(second_segment));
}
|
|
|
|
// A selection that spans more than one line should not strip anything: every
// input token is expected to survive.
TEST(FeatureProcessorTest, KeepLineWithCrosslineClick) {
  const std::string text = "Fiřst Lině\nSěcond Lině\nThiřd Lině";
  const CodepointSpan click = {5, 23};
  // NOTE(review): the offsets of the third and fourth tokens (18-23, 19-23)
  // differ from the 11-17 / 18-22 used by the sibling tests for the same
  // text; this looks like a typo but is kept as-is. Confirm before changing.
  // clang-format off
  std::vector<Token> tokens = {
      Token("Fiřst", 0, 5),
      Token("Lině", 6, 10),
      Token("Sěcond", 18, 23),
      Token("Lině", 19, 23),
      Token("Thiřd", 23, 28),
      Token("Lině", 29, 33),
  };
  // clang-format on
  // Snapshot of the input; the expected output is the very same sequence.
  const std::vector<Token> untouched = tokens;

  internal::StripTokensFromOtherLines(text, click, &tokens);

  EXPECT_THAT(tokens, ElementsAreArray(untouched));
}
|
|
|
|
class TestingFeatureProcessor : public FeatureProcessor {
|
|
public:
|
|
using FeatureProcessor::FeatureProcessor;
|
|
using FeatureProcessor::SpanToLabel;
|
|
using FeatureProcessor::SupportedCodepointsRatio;
|
|
using FeatureProcessor::IsCodepointInRanges;
|
|
using FeatureProcessor::ICUTokenize;
|
|
using FeatureProcessor::supported_codepoint_ranges_;
|
|
};
|
|
|
|
// Checks SpanToLabel / LabelToTokenSpan with and without snapping of span
// boundaries to the containing tokens, for single- and multi-token spans.
TEST(FeatureProcessorTest, SpanToLabel) {
  FeatureProcessorOptions options;
  options.set_context_size(1);
  options.set_max_selection_span(1);
  options.set_snap_label_span_boundaries_to_containing_tokens(false);

  // Tokenize on the codepoint range starting at 32 (ASCII space).
  TokenizationCodepointRange* space_config =
      options.add_tokenization_codepoint_config();
  space_config->set_start(32);
  space_config->set_end(33);
  space_config->set_role(TokenizationCodepointRange::WHITESPACE_SEPARATOR);

  // --- Snapping disabled ---------------------------------------------------
  TestingFeatureProcessor exact_processor(options);
  std::vector<Token> tokens = exact_processor.Tokenize("one, two, three");
  ASSERT_EQ(3, tokens.size());

  int exact_label;
  // {5, 8} stops short of the token end, so no valid label is produced.
  ASSERT_TRUE(exact_processor.SpanToLabel({5, 8}, tokens, &exact_label));
  EXPECT_EQ(kInvalidLabel, exact_label);
  // {5, 9} yields a valid label.
  ASSERT_TRUE(exact_processor.SpanToLabel({5, 9}, tokens, &exact_label));
  EXPECT_NE(kInvalidLabel, exact_label);

  TokenSpan token_span;
  exact_processor.LabelToTokenSpan(exact_label, &token_span);
  EXPECT_EQ(0, token_span.first);
  EXPECT_EQ(0, token_span.second);

  // --- Snapping enabled ----------------------------------------------------
  options.set_snap_label_span_boundaries_to_containing_tokens(true);
  TestingFeatureProcessor snapping_processor(options);

  // Spans inside the token all map to the label obtained above.
  int candidate;
  ASSERT_TRUE(snapping_processor.SpanToLabel({5, 8}, tokens, &candidate));
  EXPECT_EQ(exact_label, candidate);
  ASSERT_TRUE(snapping_processor.SpanToLabel({6, 9}, tokens, &candidate));
  EXPECT_EQ(exact_label, candidate);
  ASSERT_TRUE(snapping_processor.SpanToLabel({5, 9}, tokens, &candidate));
  EXPECT_EQ(exact_label, candidate);

  // Spans that cross a token boundary are rejected even with snapping.
  ASSERT_TRUE(snapping_processor.SpanToLabel({4, 9}, tokens, &candidate));
  EXPECT_EQ(kInvalidLabel, candidate);
  ASSERT_TRUE(snapping_processor.SpanToLabel({5, 10}, tokens, &candidate));
  EXPECT_EQ(kInvalidLabel, candidate);

  // --- Multi-token spans ---------------------------------------------------
  options.set_context_size(2);
  options.set_max_selection_span(2);
  TestingFeatureProcessor wide_processor(options);
  tokens = wide_processor.Tokenize("zero, one, two, three, four");

  ASSERT_TRUE(wide_processor.SpanToLabel({6, 15}, tokens, &candidate));
  EXPECT_NE(kInvalidLabel, candidate);
  wide_processor.LabelToTokenSpan(candidate, &token_span);
  EXPECT_EQ(1, token_span.first);
  EXPECT_EQ(0, token_span.second);

  // Slightly narrower spans are expected to snap to the same label.
  int narrowed_label;
  ASSERT_TRUE(wide_processor.SpanToLabel({6, 14}, tokens, &narrowed_label));
  EXPECT_EQ(candidate, narrowed_label);
  ASSERT_TRUE(wide_processor.SpanToLabel({6, 13}, tokens, &narrowed_label));
  EXPECT_EQ(candidate, narrowed_label);
  ASSERT_TRUE(wide_processor.SpanToLabel({7, 13}, tokens, &narrowed_label));
  EXPECT_EQ(candidate, narrowed_label);
}
|
|
|
|
// Checks which token index CenterTokenFromClick reports for clicks that match
// a token exactly, lie inside a token, or cover more than one token.
TEST(FeatureProcessorTest, CenterTokenFromClick) {
  // Every case below runs against the same three tokens.
  const std::vector<Token> tokens = {
      Token("Hělló", 0, 5), Token("world", 6, 11), Token("heře!", 12, 17)};

  // Click boundaries coincide with the second token.
  EXPECT_EQ(internal::CenterTokenFromClick({6, 11}, tokens), 1);

  // Click lies inside the third token.
  EXPECT_EQ(internal::CenterTokenFromClick({13, 17}, tokens), 2);

  // Click covers two tokens, so no single center token exists.
  EXPECT_EQ(internal::CenterTokenFromClick({6, 17}, tokens), kInvalidIndex);
}
|
|
|
|
// Checks which token index CenterTokenFromMiddleOfSelection reports for
// selections aligned to token boundaries, selections with sub-token
// boundaries, and an empty token list.
TEST(FeatureProcessorTest, CenterTokenFromMiddleOfSelection) {
  // Every non-empty case below runs against the same five tokens.
  const std::vector<Token> tokens = {
      Token("Token1", 0, 6), Token("Token2", 7, 13), Token("Token3", 14, 20),
      Token("Token4", 21, 27), Token("Token5", 28, 34)};

  // Three whole tokens selected (Token2..Token4), boundaries aligned.
  const int three_tokens =
      internal::CenterTokenFromMiddleOfSelection({7, 27}, tokens);
  EXPECT_EQ(three_tokens, 2);

  // One whole token selected (Token4), boundaries aligned.
  const int one_token =
      internal::CenterTokenFromMiddleOfSelection({21, 27}, tokens);
  EXPECT_EQ(one_token, 3);

  // Selection strictly inside Token5: it contains no complete token.
  const int inside_token =
      internal::CenterTokenFromMiddleOfSelection({29, 33}, tokens);
  EXPECT_EQ(inside_token, kInvalidIndex);

  // Selection starts inside Token1 and ends inside Token4.
  const int partial_both_ends =
      internal::CenterTokenFromMiddleOfSelection({3, 25}, tokens);
  EXPECT_EQ(partial_both_ends, 1);

  // Selection starts inside Token4 and ends at the end of Token5.
  const int partial_start =
      internal::CenterTokenFromMiddleOfSelection({22, 34}, tokens);
  EXPECT_EQ(partial_start, 4);

  // No tokens at all.
  // NOTE(review): the sub-token case above compares against kInvalidIndex
  // while this one uses a literal -1; presumably the same value - confirm.
  const int no_tokens = internal::CenterTokenFromMiddleOfSelection({7, 27}, {});
  EXPECT_EQ(no_tokens, -1);
}
|
|
|
|
// Exercises three related pieces of FeatureProcessor:
//   1. SupportedCodepointsRatio on tokenized text,
//   2. IsCodepointInRanges against the configured supported ranges,
//   3. ExtractFeatures succeeding or failing depending on
//      min_supported_codepoint_ratio.
TEST(FeatureProcessorTest, SupportedCodepointsRatio) {
  FeatureProcessorOptions options;
  options.set_context_size(2);
  options.set_max_selection_span(2);
  options.set_snap_label_span_boundaries_to_containing_tokens(false);

  // Tokenize on the codepoint range starting at 32 (ASCII space).
  TokenizationCodepointRange* config =
      options.add_tokenization_codepoint_config();
  config->set_start(32);
  config->set_end(33);
  config->set_role(TokenizationCodepointRange::WHITESPACE_SEPARATOR);

  // Supported codepoint ranges as {start, end}. The membership checks below
  // (127 in, 128 out; 10000 in, 10001 out) show that `end` is exclusive.
  const int kSupportedRanges[][2] = {{0, 128}, {10000, 10001}, {20000, 30000}};
  for (const auto& bounds : kSupportedRanges) {
    FeatureProcessorOptions::CodepointRange* range =
        options.add_supported_codepoint_ranges();
    range->set_start(bounds[0]);
    range->set_end(bounds[1]);
  }

  TestingFeatureProcessor feature_processor(options);

  // 1. Ratio of supported tokens: all, two thirds, none.
  EXPECT_THAT(feature_processor.SupportedCodepointsRatio(
                  1, feature_processor.Tokenize("aaa bbb ccc")),
              FloatEq(1.0));
  EXPECT_THAT(feature_processor.SupportedCodepointsRatio(
                  1, feature_processor.Tokenize("aaa bbb ěěě")),
              FloatEq(2.0 / 3));
  EXPECT_THAT(feature_processor.SupportedCodepointsRatio(
                  1, feature_processor.Tokenize("ěěě řřř ěěě")),
              FloatEq(0.0));

  // 2. Range membership at and around each boundary.
  const auto& ranges = feature_processor.supported_codepoint_ranges_;
  EXPECT_FALSE(feature_processor.IsCodepointInRanges(-1, ranges));
  EXPECT_TRUE(feature_processor.IsCodepointInRanges(0, ranges));
  EXPECT_TRUE(feature_processor.IsCodepointInRanges(10, ranges));
  EXPECT_TRUE(feature_processor.IsCodepointInRanges(127, ranges));
  EXPECT_FALSE(feature_processor.IsCodepointInRanges(128, ranges));
  EXPECT_FALSE(feature_processor.IsCodepointInRanges(9999, ranges));
  EXPECT_TRUE(feature_processor.IsCodepointInRanges(10000, ranges));
  EXPECT_FALSE(feature_processor.IsCodepointInRanges(10001, ranges));
  EXPECT_TRUE(feature_processor.IsCodepointInRanges(25000, ranges));

  // 3. ExtractFeatures gated by min_supported_codepoint_ratio.
  std::vector<Token> tokens;
  int click_pos = 0;  // Output argument of ExtractFeatures.
  std::unique_ptr<CachedFeatures> cached_features;

  // Stub feature function: ignores all of its arguments and reports success.
  auto feature_fn = [](const std::vector<int>& /*sparse_features*/,
                       const std::vector<float>& /*dense_features*/,
                       float* /*embedding*/) { return true; };

  // A minimum ratio of 0.0 accepts the input.
  options.set_min_supported_codepoint_ratio(0.0);
  TestingFeatureProcessor feature_processor2(options);
  EXPECT_TRUE(feature_processor2.ExtractFeatures("ěěě řřř eee", {4, 7}, {0, 0},
                                                 feature_fn, 2, &tokens,
                                                 &click_pos, &cached_features));

  // A minimum ratio of 0.2 still accepts the input.
  // NOTE(review): presumably because one of the three tokens ("eee") is
  // supported, giving a ratio of about 1/3 - confirm.
  options.set_min_supported_codepoint_ratio(0.2);
  TestingFeatureProcessor feature_processor3(options);
  EXPECT_TRUE(feature_processor3.ExtractFeatures("ěěě řřř eee", {4, 7}, {0, 0},
                                                 feature_fn, 2, &tokens,
                                                 &click_pos, &cached_features));

  // A minimum ratio of 0.5 rejects the same input.
  options.set_min_supported_codepoint_ratio(0.5);
  TestingFeatureProcessor feature_processor4(options);
  EXPECT_FALSE(feature_processor4.ExtractFeatures(
      "ěěě řřř eee", {4, 7}, {0, 0}, feature_fn, 2, &tokens, &click_pos,
      &cached_features));
}
|
|
|
|
// StripOrPadTokens with a {0, 0} relative click span and context size 2: the
// result should always be five tokens with the click at index 2, using
// default-constructed Token() entries as padding where input runs out.
TEST(FeatureProcessorTest, StripUnusedTokensWithNoRelativeClick) {
  const std::vector<Token> tokens_orig = {
      Token("0", 0, 0), Token("1", 0, 0), Token("2", 0, 0), Token("3", 0, 0),
      Token("4", 0, 0), Token("5", 0, 0), Token("6", 0, 0), Token("7", 0, 0),
      Token("8", 0, 0), Token("9", 0, 0), Token("10", 0, 0), Token("11", 0, 0),
      Token("12", 0, 0)};

  // Click on the very first token: two padding tokens appear on the left.
  {
    std::vector<Token> tokens = tokens_orig;
    int click_index = 0;
    internal::StripOrPadTokens({0, 0}, 2, &tokens, &click_index);
    // clang-format off
    const std::vector<Token> expected = {Token(),
                                         Token(),
                                         Token("0", 0, 0),
                                         Token("1", 0, 0),
                                         Token("2", 0, 0)};
    // clang-format on
    EXPECT_EQ(tokens, expected);
    EXPECT_EQ(click_index, 2);
  }

  // Click on index 2: enough context on both sides, so nothing is padded.
  {
    std::vector<Token> tokens = tokens_orig;
    int click_index = 2;
    internal::StripOrPadTokens({0, 0}, 2, &tokens, &click_index);
    // clang-format off
    const std::vector<Token> expected = {Token("0", 0, 0),
                                         Token("1", 0, 0),
                                         Token("2", 0, 0),
                                         Token("3", 0, 0),
                                         Token("4", 0, 0)};
    // clang-format on
    EXPECT_EQ(tokens, expected);
    EXPECT_EQ(click_index, 2);
  }

  // Click on the last token: two padding tokens appear on the right.
  {
    std::vector<Token> tokens = tokens_orig;
    int click_index = 12;
    internal::StripOrPadTokens({0, 0}, 2, &tokens, &click_index);
    // clang-format off
    const std::vector<Token> expected = {Token("10", 0, 0),
                                         Token("11", 0, 0),
                                         Token("12", 0, 0),
                                         Token(),
                                         Token()};
    // clang-format on
    EXPECT_EQ(tokens, expected);
    EXPECT_EQ(click_index, 2);
  }
}
|
|
|
|
// StripOrPadTokens with a non-zero relative click span and context size 2:
// the kept window grows by the relative span on each side, and
// default-constructed Token() entries pad whichever side runs out of input.
TEST(FeatureProcessorTest, StripUnusedTokensWithRelativeClick) {
  const std::vector<Token> tokens_orig = {
      Token("0", 0, 0), Token("1", 0, 0), Token("2", 0, 0), Token("3", 0, 0),
      Token("4", 0, 0), Token("5", 0, 0), Token("6", 0, 0), Token("7", 0, 0),
      Token("8", 0, 0), Token("9", 0, 0), Token("10", 0, 0), Token("11", 0, 0),
      Token("12", 0, 0)};

  // Click on the first token with relative span {2, 3}: two padding tokens
  // appear on the left.
  {
    std::vector<Token> tokens = tokens_orig;
    int click_index = 0;
    internal::StripOrPadTokens({2, 3}, 2, &tokens, &click_index);
    // clang-format off
    const std::vector<Token> expected = {Token(),
                                         Token(),
                                         Token("0", 0, 0),
                                         Token("1", 0, 0),
                                         Token("2", 0, 0),
                                         Token("3", 0, 0),
                                         Token("4", 0, 0),
                                         Token("5", 0, 0)};
    // clang-format on
    EXPECT_EQ(tokens, expected);
    EXPECT_EQ(click_index, 2);
  }

  // Click in the middle with relative span {3, 1}: there is enough input on
  // both sides, so no padding is produced.
  {
    std::vector<Token> tokens = tokens_orig;
    int click_index = 6;
    internal::StripOrPadTokens({3, 1}, 2, &tokens, &click_index);
    // clang-format off
    const std::vector<Token> expected = {Token("1", 0, 0),
                                         Token("2", 0, 0),
                                         Token("3", 0, 0),
                                         Token("4", 0, 0),
                                         Token("5", 0, 0),
                                         Token("6", 0, 0),
                                         Token("7", 0, 0),
                                         Token("8", 0, 0),
                                         Token("9", 0, 0)};
    // clang-format on
    EXPECT_EQ(tokens, expected);
    EXPECT_EQ(click_index, 5);
  }

  // Click near the end with relative span {3, 1}: two padding tokens appear
  // on the right.
  {
    std::vector<Token> tokens = tokens_orig;
    int click_index = 11;
    internal::StripOrPadTokens({3, 1}, 2, &tokens, &click_index);
    // clang-format off
    const std::vector<Token> expected = {Token("6", 0, 0),
                                         Token("7", 0, 0),
                                         Token("8", 0, 0),
                                         Token("9", 0, 0),
                                         Token("10", 0, 0),
                                         Token("11", 0, 0),
                                         Token("12", 0, 0),
                                         Token(),
                                         Token()};
    // clang-format on
    EXPECT_EQ(tokens, expected);
    EXPECT_EQ(click_index, 5);
  }
}
|
|
|
|
// With the ICU tokenization type, an unspaced Thai string is expected to be
// broken into the five tokens listed below.
TEST(FeatureProcessorTest, ICUTokenize) {
  FeatureProcessorOptions options;
  options.set_tokenization_type(
      libtextclassifier::FeatureProcessorOptions::ICU);
  TestingFeatureProcessor feature_processor(options);

  // clang-format off
  const std::vector<Token> expected = {Token("พระบาท", 0, 6),
                                       Token("สมเด็จ", 6, 12),
                                       Token("พระ", 12, 15),
                                       Token("ปร", 15, 17),
                                       Token("มิ", 17, 19)};
  // clang-format on

  const std::vector<Token> tokens =
      feature_processor.Tokenize("พระบาทสมเด็จพระปรมิ");
  ASSERT_EQ(tokens, expected);
}
|
|
|
|
// With the ICU tokenization type and icu_preserve_whitespace_tokens enabled,
// each space in the input is expected to come back as its own token.
TEST(FeatureProcessorTest, ICUTokenizeWithWhitespaces) {
  FeatureProcessorOptions options;
  options.set_tokenization_type(
      libtextclassifier::FeatureProcessorOptions::ICU);
  options.set_icu_preserve_whitespace_tokens(true);
  TestingFeatureProcessor feature_processor(options);

  // clang-format off
  const std::vector<Token> expected = {Token("พระบาท", 0, 6),
                                       Token(" ", 6, 7),
                                       Token("สมเด็จ", 7, 13),
                                       Token(" ", 13, 14),
                                       Token("พระ", 14, 17),
                                       Token(" ", 17, 18),
                                       Token("ปร", 18, 20),
                                       Token(" ", 20, 21),
                                       Token("มิ", 21, 23)};
  // clang-format on

  const std::vector<Token> tokens =
      feature_processor.Tokenize("พระบาท สมเด็จ พระ ปร มิ");
  ASSERT_EQ(tokens, expected);
}
|
|
|
|
// With the MIXED tokenization type, codepoints in the configured
// internal-tokenizer ranges are split on whitespace, while the Japanese runs
// are expected to come back as separate tokens.
// NOTE(review): presumably text outside those ranges is handled by the ICU
// tokenizer - confirm against FeatureProcessor::Tokenize.
TEST(FeatureProcessorTest, MixedTokenize) {
  FeatureProcessorOptions options;
  options.set_tokenization_type(
      libtextclassifier::FeatureProcessorOptions::MIXED);

  // Tokenize on the codepoint range starting at 32 (ASCII space).
  TokenizationCodepointRange* space_config =
      options.add_tokenization_codepoint_config();
  space_config->set_start(32);
  space_config->set_end(33);
  space_config->set_role(TokenizationCodepointRange::WHITESPACE_SEPARATOR);

  // Four back-to-back internal-tokenizer ranges: each one starts where the
  // previous one ends (0-128, 128-256, 256-384, 384-592).
  const int kRangeBounds[] = {0, 128, 256, 384, 592};
  const int kNumRanges = 4;
  for (int i = 0; i < kNumRanges; ++i) {
    FeatureProcessorOptions::CodepointRange* range =
        options.add_internal_tokenizer_codepoint_ranges();
    range->set_start(kRangeBounds[i]);
    range->set_end(kRangeBounds[i + 1]);
  }

  TestingFeatureProcessor feature_processor(options);

  // clang-format off
  const std::vector<Token> expected = {
      Token("こんにちは", 0, 5),
      Token("Japanese-ląnguagę", 5, 22),
      Token("text", 23, 27),
      Token("世界", 28, 30),
      Token("http://www.google.com/", 31, 53)};
  // clang-format on

  const std::vector<Token> tokens = feature_processor.Tokenize(
      "こんにちはJapanese-ląnguagę text 世界 http://www.google.com/");
  ASSERT_EQ(tokens, expected);
}
|
|
|
|
} // namespace
|
|
} // namespace libtextclassifier
|