96 lines
3.3 KiB
C++
96 lines
3.3 KiB
C++
/*
|
|
* Copyright (C) 2015 The Android Open Source Project
|
|
*
|
|
* Licensed under the Apache License, Version 2.0 (the "License");
|
|
* you may not use this file except in compliance with the License.
|
|
* You may obtain a copy of the License at
|
|
*
|
|
* http://www.apache.org/licenses/LICENSE-2.0
|
|
*
|
|
* Unless required by applicable law or agreed to in writing, software
|
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
* See the License for the specific language governing permissions and
|
|
* limitations under the License.
|
|
*/
|
|
|
|
#include <gtest/gtest.h>
|
|
#include <unicode/utf.h>
|
|
#include <cstdlib>
|
|
|
|
// src is of the form "U+1F431 | 'h' 'i'". Position of "|" gets saved to offset if non-null.
|
|
// Size is returned in an out parameter because gtest needs a void return for ASSERT to work.
|
|
void ParseUnicode(uint16_t* buf, size_t buf_size, const char* src, size_t* result_size,
|
|
size_t* offset) {
|
|
size_t input_ix = 0;
|
|
size_t output_ix = 0;
|
|
bool seen_offset = false;
|
|
|
|
while (src[input_ix] != 0) {
|
|
switch (src[input_ix]) {
|
|
case '\'':
|
|
// single ASCII char
|
|
ASSERT_LT(src[input_ix], 0x80);
|
|
input_ix++;
|
|
ASSERT_NE(src[input_ix], 0);
|
|
ASSERT_LT(output_ix, buf_size);
|
|
buf[output_ix++] = (uint16_t)src[input_ix++];
|
|
ASSERT_EQ(src[input_ix], '\'');
|
|
input_ix++;
|
|
break;
|
|
case 'u':
|
|
case 'U': {
|
|
// Unicode codepoint in hex syntax
|
|
input_ix++;
|
|
ASSERT_EQ(src[input_ix], '+');
|
|
input_ix++;
|
|
char* endptr = (char*)src + input_ix;
|
|
unsigned long int codepoint = strtoul(src + input_ix, &endptr, 16);
|
|
size_t num_hex_digits = endptr - (src + input_ix);
|
|
ASSERT_GE(num_hex_digits, 4u); // also triggers on invalid number syntax, digits = 0
|
|
ASSERT_LE(num_hex_digits, 6u);
|
|
ASSERT_LE(codepoint, 0x10FFFFu);
|
|
input_ix += num_hex_digits;
|
|
if (U16_LENGTH(codepoint) == 1) {
|
|
ASSERT_LE(output_ix + 1, buf_size);
|
|
buf[output_ix++] = codepoint;
|
|
} else {
|
|
// UTF-16 encoding
|
|
ASSERT_LE(output_ix + 2, buf_size);
|
|
buf[output_ix++] = U16_LEAD(codepoint);
|
|
buf[output_ix++] = U16_TRAIL(codepoint);
|
|
}
|
|
break;
|
|
}
|
|
case ' ':
|
|
input_ix++;
|
|
break;
|
|
case '|':
|
|
ASSERT_FALSE(seen_offset);
|
|
ASSERT_NE(offset, nullptr);
|
|
*offset = output_ix;
|
|
seen_offset = true;
|
|
input_ix++;
|
|
break;
|
|
default:
|
|
FAIL(); // unexpected character
|
|
}
|
|
}
|
|
ASSERT_NE(result_size, nullptr);
|
|
*result_size = output_ix;
|
|
ASSERT_TRUE(seen_offset || offset == nullptr);
|
|
}
|
|
|
|
TEST(UnicodeUtils, parse) {
|
|
const size_t BUF_SIZE = 256;
|
|
uint16_t buf[BUF_SIZE];
|
|
size_t offset;
|
|
size_t size;
|
|
ParseUnicode(buf, BUF_SIZE, "U+000D U+1F431 | 'a'", &size, &offset);
|
|
EXPECT_EQ(size, 4u);
|
|
EXPECT_EQ(offset, 3u);
|
|
EXPECT_EQ(buf[0], 0x000D);
|
|
EXPECT_EQ(buf[1], 0xD83D);
|
|
EXPECT_EQ(buf[2], 0xDC31);
|
|
EXPECT_EQ(buf[3], 'a');
|
|
}
|