327 lines
12 KiB
Awk
327 lines
12 KiB
Awk
# Copyright (C) 2010 The Android Open Source Project
|
|
#
|
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
# you may not use this file except in compliance with the License.
|
|
# You may obtain a copy of the License at
|
|
#
|
|
# http://www.apache.org/licenses/LICENSE-2.0
|
|
#
|
|
# Unless required by applicable law or agreed to in writing, software
|
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
# See the License for the specific language governing permissions and
|
|
# limitations under the License.
|
|
#
|
|
|
|
# Tiny XML parser implementation in awk.
|
|
#
|
|
# This file is not meant to be used directly, instead copy the
|
|
# functions it defines here into your own script then specialize
|
|
# it appropriately.
|
|
#
|
|
|
|
# See further below for usage instructions and implementation details.
|
|
#
|
|
|
|
# ---------------------------- cut here ---------------------------
|
|
|
|
# xml_event() -- parse the next XML tag from the current input stream.
#
# Returns 1 when an event was produced and 0 when input is exhausted (or
# when a record contains no '<' at all).  Malformed input is reported via
# _xml_panic(), which terminates the script.
#
# On success the following globals are set:
#   XML_TYPE   "BEGIN" for an opening tag, "END" for a closing one.
#   XML_TAG    tag name, uppercased.
#   XML_ATTR   attribute map (lowercased names, values not unescaped);
#              it is cleared on every call and only filled for "BEGIN".
#   XML_RPATH  reversed element path, maintained by _xml_enter/_xml_exit.
#
# A self-closing tag (<foo/>, <foo />, <foo a="b"/>) yields a "BEGIN"
# event first; the matching "END" is delivered by the next call through
# the _xml_closing variable.
#
# Limitation: records are split on '>', so an attribute value containing
# a literal '>' is not supported.
function xml_event () {
    RS = ">";
    XML_TAG = XML_TYPE = "";
    split("", XML_ATTR);                      # clear the attribute map
    while ( 1 ) {
        if (_xml_closing) {                   # delayed direct tag closure
            XML_TAG = _xml_closing;
            XML_TYPE = "END";
            _xml_closing = "";
            _xml_exit(XML_TAG);
            return 1;
        }
        if (getline <= 0) return 0;           # read new input record
        _xml_p = index($0, "<");              # get start marker
        if (_xml_p == 0) return 0;            # end of file (or malformed input)
        $0 = substr($0, _xml_p);              # remove anything before '<'

        # ignore CData / Comments / Processing instructions / Declarations
        if (_xml_in_section("<!\\[[Cc][Dd][Aa][Tt][Aa]\\[", "]]") ||
            _xml_in_section("<!--", "--") ||
            _xml_in_section("<\\?", "\\?") ||
            _xml_in_section("<!", "")) {
            continue;
        }

        if (substr($0, 1, 2) == "</") {       # is it a closing tag ?
            XML_TYPE = "END";
            $0 = substr($0, 3);
        } else {                              # nope, it's an opening one
            XML_TYPE = "BEGIN";
            $0 = substr($0, 2);
        }

        XML_TAG = $0;
        sub("[ \r\n\t/].*$", "", XML_TAG);    # extract tag name
        XML_TAG = toupper(XML_TAG);           # uppercase it
        if ( XML_TAG !~ /^[A-Z][-+_.:0-9A-Z]*$/ )  # validate it
            _xml_panic("Invalid tag name: " XML_TAG);

        if (XML_TYPE == "BEGIN") {            # update reverse path
            _xml_enter(XML_TAG);
        } else {
            _xml_exit(XML_TAG);
        }

        # Get rid of the tag name and the spaces after it.  The '/' is
        # excluded from the name so that "<foo/>" (no space, no
        # attributes) leaves a lone "/" behind for the loop below;
        # otherwise the direct closure would be silently dropped.
        sub("^[^ \r\n\t/]*[ \r\n\t]*", "", $0);

        while ($0) {                          # process attributes
            if ($0 == "/") {                  # direct closing tag, e.g. <foo/>
                _xml_closing = XML_TAG;       # record delayed tag closure.
                break;
            }
            _xml_attrib = $0;
            sub(/=.*$/, "", _xml_attrib);     # extract attribute name
            sub(/^[^=]*/, "", $0);            # remove it from record
            _xml_attrib = tolower(_xml_attrib);
            if ( _xml_attrib !~ /^[a-z][-+_0-9a-z:]*$/ )  # validate it
                _xml_panic("Invalid attribute name: " _xml_attrib);

            if (substr($0, 1, 2) == "=\"") {  # value is ="something"
                _xml_value = substr($0, 3);
                sub(/".*$/, "", _xml_value);
                sub(/^="[^"]*"/, "", $0);
            } else if (substr($0, 1, 2) == "='") {  # value is ='something'
                _xml_value = substr($0, 3);
                sub(/'.*$/, "", _xml_value);
                sub(/^='[^']*'/, "", $0);
            } else {
                _xml_panic("Invalid attribute value syntax for " _xml_attrib ": " $0);
            }
            XML_ATTR[_xml_attrib] = _xml_value;  # store attribute name/value
            sub(/^[ \t\r\n]*/, "", $0);       # get rid of remaining leading spaces
        }
        return 1;   # now return, XML_TYPE/TAG/ATTR/RPATH are set
    }
}
|
|
|
|
# _xml_panic(msg) -- report a fatal parsing error.
#
# Writes msg to /dev/stderr and terminates the script with exit status 1.
function _xml_panic (msg) {
    print msg > "/dev/stderr";
    exit 1;
}
|
|
|
|
# _xml_in_section(sec_begin, sec_end) -- skip an ignorable section.
#
#   sec_begin  regex matched against the start of the current record.
#   sec_end    regex matched against the end of a record (the trailing
#              '>' is the record separator and is never in the record).
#
# Returns 0 if the current record does not start with sec_begin.
# Otherwise keeps reading records until one ends with sec_end and
# returns 1.  Calls _xml_panic() if the input ends first.
#
# begin_len is a local variable (extra awk parameter); callers must not
# pass it.  It holds the length of the opening marker so that, in the
# first record, the end pattern is not satisfied by characters belonging
# to the opening marker itself (e.g. the record "<!--" produced by the
# comment "<!--> ... -->" must not count as an already-closed comment).
function _xml_in_section (sec_begin, sec_end,    begin_len) {
    if (!match($0, "^" sec_begin)) return 0;
    begin_len = RLENGTH;
    while (!(match($0, sec_end "$") && RSTART > begin_len)) {
        if (getline <= 0) _xml_panic("Unexpected EOF: " ERRNO);
        begin_len = 0;    # later records have no opening marker to skip
    }
    return 1;
}
|
|
|
|
# _xml_enter(tag) -- record that element 'tag' was opened.
#
# Prepends "tag/" to the reversed element path XML_RPATH.
function _xml_enter (tag) {
    XML_RPATH = sprintf("%s/%s", tag, XML_RPATH);
}
|
|
|
|
# _xml_exit(tag) -- record that element 'tag' was closed.
#
# Checks that 'tag' is the innermost open element (the first component
# of XML_RPATH) and removes it from XML_RPATH.  Calls _xml_panic() when
# the close tag does not match, including when no element is open.
function _xml_exit (tag) {
    _xml_p = index(XML_RPATH, "/");
    _xml_expected = substr(XML_RPATH, 1, _xml_p-1);
    # Compare against the argument rather than the XML_TAG global, so the
    # function honours its own parameter.
    if (_xml_expected != tag)
        _xml_panic("Unexpected close tag: " tag ", expecting " _xml_expected);
    XML_RPATH = substr(XML_RPATH, _xml_p+1);
}
|
|
|
|
# ---------------------------- cut here ---------------------------
|
|
|
|
# USAGE:
|
|
#
|
|
# The functions provided here are used to extract the tags and attributes of a
|
|
# given XML file. They do not support extraction of data, CDATA, comments,
|
|
# processing instructions and declarations at all.
|
|
#
|
|
# You should use this from the BEGIN {} action of your awk script (it will
|
|
# not work from an END {} action).
|
|
#
|
|
# Call xml_event() in a while loop. This function returns 1 for each XML
|
|
# 'event' encountered, or 0 when the end of input is reached. Note that in
|
|
# case of malformed input, an error will be printed and the script will
|
|
# force an exit(1)
|
|
#
|
|
# After each successful xml_event() call, the following variables will be set:
|
|
#
|
|
# XML_TYPE: type of event: "BEGIN" -> mean an opening tag, "END" a
|
|
# closing one.
|
|
#
|
|
# XML_TAG: name of the tag, always in UPPERCASE!
|
|
#
|
|
# XML_ATTR: a map of attributes for the type. Only set for "BEGIN" types.
|
|
# all attribute names are in lowercase.
|
|
#
|
|
# beware: values are *not* unescaped !
|
|
#
|
|
# XML_RPATH: the _reversed_ element path, using "/" as a separator.
|
|
# if you are within the <manifest><application> tag, then
|
|
# it will be set to "APPLICATION/MANIFEST/"
|
|
# (note the trailing slash).
|
|
#
|
|
|
|
# This is a simple example that dumps the output of the parsing.
|
|
#
|
|
# Example driver: print one line per XML event, followed by the
# attributes of every opening tag.
BEGIN {
    while (xml_event()) {
        # Build the whole output line first, then emit it in one go.
        dump = sprintf("XML_TYPE=%s XML_TAG=%s XML_RPATH=%s", XML_TYPE, XML_TAG, XML_RPATH);
        if (XML_TYPE == "BEGIN")
            for (attr in XML_ATTR)
                dump = dump sprintf(" %s='%s'", attr, XML_ATTR[attr]);
        printf "%s\n", dump;
    }
}
|
|
|
|
# IMPLEMENTATION DETAILS:
|
|
#
|
|
# 1. '>' as the record separator:
|
|
#
|
|
# RS is set to '>' to use this character as the record separator, instead of
|
|
# the default '\n'. This means that something like the following:
|
|
#
|
|
# <foo><bar attrib="value">stuff</bar></foo>
|
|
#
|
|
# will be translated into the following successive 'records':
|
|
#
|
|
# <foo
|
|
# <bar attrib="value"
|
|
# stuff</bar
|
|
# </foo
|
|
#
|
|
# Note that the '>' is never part of the records and thus will not be matched.
|
|
# If the record does not contain a single '<', the input is either
|
|
# malformed XML, or we reached the end of file with data after the last
|
|
# '>'.
|
|
#
|
|
# Newlines in the original input are kept in the records as-is.
|
|
#
|
|
# 2. Getting rid of unwanted stuff:
|
|
#
|
|
# We don't need any of the data within elements, so we get rid of them by
|
|
# simply ignoring anything before the '<' in the current record. This is
|
|
# done with code like this:
|
|
#
|
|
# p = index($0, "<"); # get index of '<'
|
|
# if (p == 0) -> return 0; # malformed input or end of file
|
|
# $0 = substr($0, p); # remove anything before the '<' in record
|
|
#
|
|
# We also want to ignore certain sections like CDATA, comments, declarations,
|
|
# etc.. These begin with a certain pattern and end with another one, e.g.
|
|
# "<!--" and "-->" for comments. This is handled by the _xml_in_section()
|
|
# function that accepts two patterns as input:
|
|
#
|
|
# sec_begin: is the pattern for the start of the record.
|
|
# sec_end: is the pattern for the end of the record (minus trailing '>').
|
|
#
|
|
# The function deals with the fact that these section can embed a valid '>'
|
|
# and will then span multiple records, i.e. something like:
|
|
#
|
|
# <!-- A comment with an embedded > right here ! -->
|
|
#
|
|
# will be decomposed into two records:
|
|
#
|
|
# "<!-- A comment with an embedded "
|
|
# " right here ! --"
|
|
#
|
|
# The function deals with this case, and exits when such a section is not
|
|
# properly terminated in the input.
|
|
#
|
|
# _xml_in_section() returns 1 if an ignorable section was found, or 0 otherwise.
|
|
#
|
|
# 3. Extracting the tag name:
|
|
#
|
|
# </foo> is a closing tag, and <foo> an opening tag, this is handled
|
|
# by the following code:
|
|
#
|
|
# if (substr($0, 1, 2) == "</") {
|
|
# XML_TYPE = "END";
|
|
# $0 = substr($0, 3);
|
|
# } else {
|
|
# XML_TYPE = "BEGIN";
|
|
# $0 = substr($0, 2);
|
|
# }
|
|
#
|
|
# which defines XML_TYPE, and removes the leading "</" or "<" from the record.
|
|
# The tag is later extracted and converted to uppercase with:
|
|
#
|
|
# XML_TAG = $0 # copy record
|
|
# sub("[ \r\n\t/].*$", "", XML_TAG); # remove anything after tag name
|
|
# XML_TAG = toupper(XML_TAG); # convert to uppercase
|
|
# # validate tag
|
|
# if ( XML_TAG !~ /^[A-Z][-+_.:0-9A-Z]*$/ ) -> panic
|
|
#
|
|
# Then the record is purged from the tag name and the spaces after it:
|
|
#
|
|
# # get rid of tag and spaces after it in $0
|
|
# sub("[^ \r\n\t]*[ \r\n\t]*", "", $0);
|
|
#
|
|
# 4. Maintaining XML_RPATH:
|
|
#
|
|
# The _xml_enter() and _xml_exit() functions are called to maintain the
|
|
# XML_RPATH variable when entering and exiting specific tags. _xml_exit()
|
|
# will also validate the input, checking proper tag enclosure (or exit(1)
|
|
# in case of error).
|
|
#
|
|
# if (XML_TYPE == "BEGIN") {
|
|
# _xml_enter(XML_TAG);
|
|
# } else {
|
|
# _xml_exit(XML_TAG);
|
|
# }
|
|
#
|
|
# 5. Extracting attributes:
|
|
#
|
|
# A loop is implemented to parse attributes, the idea is to get the attribute
|
|
# name, which is always followed by a '=' character:
|
|
#
|
|
# _xml_attrib = $0; # copy record.
|
|
# sub(/=.*$/,"",_xml_attrib); # get rid of '=' and anything after.
|
|
# sub(/^[^=]*/,"",$0); # remove attribute name from $0
|
|
# _xml_attrib = tolower(_xml_attrib);
|
|
# if ( _xml_attrib !~ /^[a-z][-+_0-9a-z:]*$/ )
|
|
# _xml_panic("Invalid attribute name: " _xml_attrib);
|
|
#
|
|
# Now get the value, which is enclosed by either (") or (')
|
|
#
|
|
# if (substr($0,1,2) == "=\"") { # if $0 begins with ="
|
|
# _xml_value = substr($0,3); # extract value
|
|
# sub(/".*$/,"",_xml_value);
|
|
# sub(/^="[^"]*"/,"",$0); # remove it from $0
|
|
# } else if (substr($0,1,2) == "='") { # if $0 begins with ='
|
|
# _xml_value = substr($0,3); # extract value
|
|
# sub(/'.*$/,"",_xml_value);
|
|
# sub(/^='[^']*'/,"",$0); # remove it from $0
|
|
# } else {
|
|
# -> panic (malformed input)
|
|
# }
|
|
#
|
|
# After that, we simply store the value into the XML_ATTR associative
|
|
# array, and cleanup $0 from leading spaces:
|
|
#
|
|
# XML_ATTR[_xml_attrib] = _xml_value;
|
|
# sub(/^[ \t\r\n]*/,"",$0);
|
|
#
|
|
#
|
|
# 6. Handling direct tag closure:
|
|
#
|
|
# When a tag is closed directly (as in <foo/>), A single '/' will be
|
|
# parsed in the attribute parsing loop. We need to record this for the
|
|
# next call to xml_event(), since the current one should return a "BEGIN"
|
|
# for the "FOO" tag instead.
|
|
#
|
|
# We do this by setting the special _xml_closing variable, as in:
|
|
#
|
|
# if ($0 == "/") {
|
|
# # record a delayed tag closure for the next call
|
|
# _xml_closing = XML_TAG;
|
|
# break
|
|
# }
|
|
#
|
|
# This variable is checked at the start of xml_event() like this:
|
|
#
|
|
# # delayed tag closure - see below
|
|
# if (_xml_closing) {
|
|
# XML_TAG = _xml_closing;
|
|
# XML_TYPE = "END";
|
|
# _xml_closing = "";
|
|
# _xml_exit(XML_TAG);
|
|
# return 1;
|
|
# }
|
|
#
|
|
# Note the call to _xml_exit() to update XML_RPATH here.
|
|
#
|