allwinner_a64/android/external/libtextclassifier/lang_id/custom-tokenizer.cc

/*
 * Copyright (C) 2017 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#include "lang_id/custom-tokenizer.h"

#include <ctype.h>

#include <string>

#include "util/strings/utf8.h"

namespace libtextclassifier {
namespace nlp_core {
namespace lang_id {

namespace {
// Returns true if the UTF8 character that starts at |curr| and occupies
// |num_bytes| bytes is a token separator.
//
// Only single-byte characters can be separators: a single byte is a separator
// unless isalpha() classifies it as a letter.  Any character reported as
// longer than one byte is never a separator.
inline bool IsTokenSeparator(int num_bytes, const char *curr) {
  if (num_bytes != 1) {
    return false;
  }
  // isalpha() requires its argument to be representable as unsigned char (or
  // to be EOF).  Plain char may be signed, in which case a byte >= 0x80 would
  // be passed as a negative value, which is undefined behavior.  Convert
  // through unsigned char first.
  return !isalpha(static_cast<unsigned char>(*curr));
}
}  // namespace

// Returns a pointer to the end of the "safe" prefix of the |size| bytes that
// start at |data|: the longest prefix that can be walked one UTF8 character at
// a time without ever stepping past data + size.
//
// The walk stops early in two situations:
//   * GetNumBytesForUTF8Char() reports 0 bytes for the current position
//     (presumably a '\0' byte -- confirm against util/strings/utf8.h); the
//     returned pointer then points at that position.
//   * the character starting at the current position would extend beyond
//     data + size (e.g., a truncated multi-byte sequence); the returned
//     pointer then points at the start of that incomplete character.
// Otherwise, the result is data + size.
const char *GetSafeEndOfString(const char *data, size_t size) {
  const char *const hard_end = data + size;
  const char *curr = data;
  while (curr < hard_end) {
    const int num_bytes = GetNumBytesForUTF8Char(curr);
    if (num_bytes == 0) {
      break;
    }
    // Compare against the number of bytes left instead of computing
    // curr + num_bytes first: forming a pointer more than one element past the
    // end of the buffer is undefined behavior, even if it is never
    // dereferenced.
    const auto remaining = hard_end - curr;
    if (num_bytes > remaining) {
      return curr;
    }
    curr += num_bytes;
  }
  return curr;
}

// Splits |text| into tokens and adds each of them as a new word of |sentence|.
//
// Only the part of |text| before GetSafeEndOfString() is examined.  A token is
// a maximal run of UTF8 characters for which IsTokenSeparator() is false; the
// separators themselves are dropped.  Each emitted word consists of the marker
// '^', the bytes of the token, and the marker '$'.  If the safe part of the
// text is empty or contains only separators, no word is added.
void TokenizeTextForLangId(const std::string &text, LightSentence *sentence) {
  const char *pos = text.data();
  const char *const limit = GetSafeEndOfString(pos, text.size());

  // Each iteration either skips one separator or consumes one whole token.
  // Both paths advance pos by at least one character, so the scan terminates.
  while (pos < limit) {
    // Length in bytes of the UTF8 character that starts at pos.
    int char_len = GetNumBytesForNonZeroUTF8Char(pos);

    if (IsTokenSeparator(char_len, pos)) {
      // Separators never become part of a word.
      pos += char_len;
      continue;
    }

    // pos is at the first character of a non-empty token.
    std::string *token = sentence->add_word();

    // Special token-start character.
    token->push_back('^');

    // Copy characters until the safe text ends or a separator shows up.  The
    // separator (if any) is left in place for the next outer iteration.
    do {
      token->append(pos, char_len);
      pos += char_len;
      if (pos >= limit) {
        break;
      }
      char_len = GetNumBytesForNonZeroUTF8Char(pos);
    } while (!IsTokenSeparator(char_len, pos));

    // Special token-end character.
    token->push_back('$');

    // Note: token start/end offsets are intentionally not recorded, as those
    // fields are not used by the langid model.
  }
}

}  // namespace lang_id
}  // namespace nlp_core
}  // namespace libtextclassifier