304 lines
9.2 KiB
C++
304 lines
9.2 KiB
C++
/*
 * Copyright (C) 2011 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
|
|
|
|
/*
 * Validate and manipulate MUTF-8 encoded string data.
 */
|
|
|
|
#include "DexUtf.h"
|
|
|
|
/* Compare two '\0'-terminated modified UTF-8 strings, using Unicode
|
|
* code point values for comparison. This treats different encodings
|
|
* for the same code point as equivalent, except that only a real '\0'
|
|
* byte is considered the string terminator. The return value is as
|
|
* for strcmp(). */
|
|
int dexUtf8Cmp(const char* s1, const char* s2) {
|
|
for (;;) {
|
|
if (*s1 == '\0') {
|
|
if (*s2 == '\0') {
|
|
return 0;
|
|
}
|
|
return -1;
|
|
} else if (*s2 == '\0') {
|
|
return 1;
|
|
}
|
|
|
|
int utf1 = dexGetUtf16FromUtf8(&s1);
|
|
int utf2 = dexGetUtf16FromUtf8(&s2);
|
|
int diff = utf1 - utf2;
|
|
|
|
if (diff != 0) {
|
|
return diff;
|
|
}
|
|
}
|
|
}
|
|
|
|
/* for dexIsValidMemberNameUtf8(), a bit vector indicating valid low ascii */
|
|
u4 DEX_MEMBER_VALID_LOW_ASCII[4] = {
|
|
0x00000000, // 00..1f low control characters; nothing valid
|
|
0x03ff2010, // 20..3f digits and symbols; valid: '0'..'9', '$', '-'
|
|
0x87fffffe, // 40..5f uppercase etc.; valid: 'A'..'Z', '_'
|
|
0x07fffffe // 60..7f lowercase etc.; valid: 'a'..'z'
|
|
};
|
|
|
|
/* Helper for dexIsValidMemberNameUtf8(); do not call directly. */
|
|
bool dexIsValidMemberNameUtf8_0(const char** pUtf8Ptr) {
|
|
/*
|
|
* It's a multibyte encoded character. Decode it and analyze. We
|
|
* accept anything that isn't (a) an improperly encoded low value,
|
|
* (b) an improper surrogate pair, (c) an encoded '\0', (d) a high
|
|
* control character, or (e) a high space, layout, or special
|
|
* character (U+00a0, U+2000..U+200f, U+2028..U+202f,
|
|
* U+fff0..U+ffff). This is all specified in the dex format
|
|
* document.
|
|
*/
|
|
|
|
u2 utf16 = dexGetUtf16FromUtf8(pUtf8Ptr);
|
|
|
|
// Perform follow-up tests based on the high 8 bits.
|
|
switch (utf16 >> 8) {
|
|
case 0x00: {
|
|
// It's only valid if it's above the ISO-8859-1 high space (0xa0).
|
|
return (utf16 > 0x00a0);
|
|
}
|
|
case 0xd8:
|
|
case 0xd9:
|
|
case 0xda:
|
|
case 0xdb: {
|
|
/*
|
|
* It's a leading surrogate. Check to see that a trailing
|
|
* surrogate follows.
|
|
*/
|
|
utf16 = dexGetUtf16FromUtf8(pUtf8Ptr);
|
|
return (utf16 >= 0xdc00) && (utf16 <= 0xdfff);
|
|
}
|
|
case 0xdc:
|
|
case 0xdd:
|
|
case 0xde:
|
|
case 0xdf: {
|
|
// It's a trailing surrogate, which is not valid at this point.
|
|
return false;
|
|
}
|
|
case 0x20:
|
|
case 0xff: {
|
|
// It's in the range that has spaces, controls, and specials.
|
|
switch (utf16 & 0xfff8) {
|
|
case 0x2000:
|
|
case 0x2008:
|
|
case 0x2028:
|
|
case 0xfff0:
|
|
case 0xfff8: {
|
|
return false;
|
|
}
|
|
}
|
|
break;
|
|
}
|
|
}
|
|
|
|
return true;
|
|
}
|
|
|
|
/* Return whether the given string is a valid field or method name. */
|
|
bool dexIsValidMemberName(const char* s) {
|
|
bool angleName = false;
|
|
|
|
switch (*s) {
|
|
case '\0': {
|
|
// The empty string is not a valid name.
|
|
return false;
|
|
}
|
|
case '<': {
|
|
/*
|
|
* '<' is allowed only at the start of a name, and if present,
|
|
* means that the name must end with '>'.
|
|
*/
|
|
angleName = true;
|
|
s++;
|
|
break;
|
|
}
|
|
}
|
|
|
|
for (;;) {
|
|
switch (*s) {
|
|
case '\0': {
|
|
return !angleName;
|
|
}
|
|
case '>': {
|
|
return angleName && s[1] == '\0';
|
|
}
|
|
}
|
|
if (!dexIsValidMemberNameUtf8(&s)) {
|
|
return false;
|
|
}
|
|
}
|
|
}
|
|
|
|
/* Helper for validating type descriptors and class names, which is parametric
|
|
* with respect to type vs. class and dot vs. slash. */
|
|
static bool isValidTypeDescriptorOrClassName(const char* s, bool isClassName,
|
|
bool dotSeparator) {
|
|
int arrayCount = 0;
|
|
|
|
while (*s == '[') {
|
|
arrayCount++;
|
|
s++;
|
|
}
|
|
|
|
if (arrayCount > 255) {
|
|
// Arrays may have no more than 255 dimensions.
|
|
return false;
|
|
}
|
|
|
|
if (arrayCount != 0) {
|
|
/*
|
|
* If we're looking at an array of some sort, then it doesn't
|
|
* matter if what is being asked for is a class name; the
|
|
* format looks the same as a type descriptor in that case, so
|
|
* treat it as such.
|
|
*/
|
|
isClassName = false;
|
|
}
|
|
|
|
if (!isClassName) {
|
|
/*
|
|
* We are looking for a descriptor. Either validate it as a
|
|
* single-character primitive type, or continue on to check the
|
|
* embedded class name (bracketed by "L" and ";").
|
|
*/
|
|
switch (*(s++)) {
|
|
case 'B':
|
|
case 'C':
|
|
case 'D':
|
|
case 'F':
|
|
case 'I':
|
|
case 'J':
|
|
case 'S':
|
|
case 'Z': {
|
|
// These are all single-character descriptors for primitive types.
|
|
return (*s == '\0');
|
|
}
|
|
case 'V': {
|
|
// Non-array void is valid, but you can't have an array of void.
|
|
return (arrayCount == 0) && (*s == '\0');
|
|
}
|
|
case 'L': {
|
|
// Class name: Break out and continue below.
|
|
break;
|
|
}
|
|
default: {
|
|
// Oddball descriptor character.
|
|
return false;
|
|
}
|
|
}
|
|
}
|
|
|
|
/*
|
|
* We just consumed the 'L' that introduces a class name as part
|
|
* of a type descriptor, or we are looking for an unadorned class
|
|
* name.
|
|
*/
|
|
|
|
bool sepOrFirst = true; // first character or just encountered a separator.
|
|
for (;;) {
|
|
u1 c = (u1) *s;
|
|
switch (c) {
|
|
case '\0': {
|
|
/*
|
|
* Premature end for a type descriptor, but valid for
|
|
* a class name as long as we haven't encountered an
|
|
* empty component (including the degenerate case of
|
|
* the empty string "").
|
|
*/
|
|
return isClassName && !sepOrFirst;
|
|
}
|
|
case ';': {
|
|
/*
|
|
* Invalid character for a class name, but the
|
|
* legitimate end of a type descriptor. In the latter
|
|
* case, make sure that this is the end of the string
|
|
* and that it doesn't end with an empty component
|
|
* (including the degenerate case of "L;").
|
|
*/
|
|
return !isClassName && !sepOrFirst && (s[1] == '\0');
|
|
}
|
|
case '/':
|
|
case '.': {
|
|
if (dotSeparator != (c == '.')) {
|
|
// The wrong separator character.
|
|
return false;
|
|
}
|
|
if (sepOrFirst) {
|
|
// Separator at start or two separators in a row.
|
|
return false;
|
|
}
|
|
sepOrFirst = true;
|
|
s++;
|
|
break;
|
|
}
|
|
default: {
|
|
if (!dexIsValidMemberNameUtf8(&s)) {
|
|
return false;
|
|
}
|
|
sepOrFirst = false;
|
|
break;
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
/* Return whether the given string is a valid type descriptor. */
|
|
bool dexIsValidTypeDescriptor(const char* s) {
|
|
return isValidTypeDescriptorOrClassName(s, false, false);
|
|
}
|
|
|
|
/* (documented in header) */
|
|
bool dexIsValidClassName(const char* s, bool dotSeparator) {
|
|
return isValidTypeDescriptorOrClassName(s, true, dotSeparator);
|
|
}
|
|
|
|
/* Return whether the given string is a valid reference descriptor. This
|
|
* is true if dexIsValidTypeDescriptor() returns true and the descriptor
|
|
* is for a class or array and not a primitive type. */
|
|
bool dexIsReferenceDescriptor(const char* s) {
|
|
if (!dexIsValidTypeDescriptor(s)) {
|
|
return false;
|
|
}
|
|
|
|
return (s[0] == 'L') || (s[0] == '[');
|
|
}
|
|
|
|
/* Return whether the given string is a valid class descriptor. This
|
|
* is true if dexIsValidTypeDescriptor() returns true and the descriptor
|
|
* is for a class and not an array or primitive type. */
|
|
bool dexIsClassDescriptor(const char* s) {
|
|
if (!dexIsValidTypeDescriptor(s)) {
|
|
return false;
|
|
}
|
|
|
|
return s[0] == 'L';
|
|
}
|
|
|
|
/* Return whether the given string is a valid field type descriptor. This
|
|
* is true if dexIsValidTypeDescriptor() returns true and the descriptor
|
|
* is for anything but "void". */
|
|
bool dexIsFieldDescriptor(const char* s) {
|
|
if (!dexIsValidTypeDescriptor(s)) {
|
|
return false;
|
|
}
|
|
|
|
return s[0] != 'V';
|
|
}
|
|
|