189 lines
6.8 KiB
Python
Executable file
189 lines
6.8 KiB
Python
Executable file
#!/usr/bin/python
|
|
# Copyright 2015 The Chromium OS Authors. All rights reserved.
|
|
# Use of this source code is governed by a BSD-style license that can be
|
|
# found in the LICENSE file.
|
|
|
|
"""Module for parsing TCG TPM2 library specification in HTML format.
|
|
|
|
This module processes parts 2 and 3 of the specification, extracting
|
|
information related to tables defined in the documents, feeding the
|
|
information into the Table object for further processing and creating the
|
|
appropriate TPM2 objects.
|
|
"""
|
|
|
|
from __future__ import print_function
|
|
|
|
import HTMLParser
|
|
import os
|
|
import re
|
|
import sys
|
|
|
|
import tpm_table
|
|
|
|
table_name = re.compile(r'^\s*Table\s+[0-9]+')
|
|
|
|
|
|
class SpecParser(HTMLParser.HTMLParser):
|
|
"""A class for parsing TCG specifications in html format."""
|
|
|
|
# The state machine of the parser could be in one of the following states.
|
|
ANCHOR = 0 # Look for table title anchor
|
|
TABLE_NAME = 1 # Look for table title in the data stream
|
|
TABLE_BODY = 2 # Scraping the actual table body
|
|
MAYBE_DONE = 3 # Could be over, unless a single spec table is split in
|
|
# multiple HTML tables (to continue on the next page)
|
|
SKIP_HEADER = 4 # Ignore the header of the split tables
|
|
|
|
def __init__(self):
|
|
"""Initialize a parser object to default state."""
|
|
HTMLParser.HTMLParser.__init__(self)
|
|
self._state = self.ANCHOR
|
|
self._title = ''
|
|
self._table = tpm_table.Table()
|
|
self._previous_table_number = 0 # Used to check if there are skipped tables
|
|
|
|
def _Normalize(self, data):
|
|
"""Normalize HTML data.
|
|
|
|
HTML files generated from TCG specifications sometimes include utf8
|
|
characters (like long dashes), which appear only in comments/table titles
|
|
and can be safely ignored.
|
|
|
|
Args:
|
|
data: a string representing portion of data from the HTML being parsed.
|
|
|
|
Returns:
|
|
a string, the input data with characters above ASCII printable range
|
|
excluded.
|
|
"""
|
|
return ' ' + ''.join(x for x in self.unescape(data) if ord(x) < 128)
|
|
|
|
def GetTable(self):
|
|
"""Return the Table object containing all information parsed so far."""
|
|
return self._table
|
|
|
|
def _SetState(self, new_state):
|
|
if self._state != new_state:
|
|
self._state = new_state
|
|
if new_state == self.TABLE_NAME:
|
|
self._title = ''
|
|
|
|
def handle_starttag(self, tag, attrs):
|
|
"""Invoked each time a new HTML tag is opened.
|
|
|
|
This method drives changes in the parser FSM states, its heuristics are
|
|
derived from the format of the HTML files the TCG specs get converted to.
|
|
|
|
Each specification table is preceded with a tittle. The title is wrapped
|
|
in an anchor tag with a property 'name' set to 'bookmark#xxx. The title
|
|
text starts with ' Table [0-9]+ '. Once the table title is detected,
|
|
the state machine switches to looking for the actual HTML table, i.e. tags
|
|
'table', 'tr' and 'td' (the generated specs do not use the 'th' tags).
|
|
|
|
Large specification tables can be split into multiple HTML tables (so that
|
|
they fit in a page). This is why the presence of the closing 'table' tag
|
|
is not enough to close the parsing of the current specification table.
|
|
|
|
In some cases the next table is defined in the spec immediately after the
|
|
current one - this is when the new anchor tag is used as a signal that the
|
|
previous table has been completely consumed.
|
|
|
|
Args:
|
|
tag: a string, the HTML tag
|
|
attrs: a tuple of zero or more two-string tuples, the first element -
|
|
the HTML tag's attribute, the second element - the attribute
|
|
value.
|
|
"""
|
|
if tag == 'a':
|
|
if [x for x in attrs if x[0] == 'name' and x[1].startswith('bookmark')]:
|
|
if self._state == self.ANCHOR:
|
|
self._SetState(self.TABLE_NAME)
|
|
elif self._state == self.MAYBE_DONE:
|
|
# Done indeed
|
|
self._table.ProcessTable()
|
|
self._table.Init()
|
|
self._SetState(self.TABLE_NAME)
|
|
elif self._state == self.TABLE_NAME:
|
|
self._title = ''
|
|
elif tag == 'p' and self._state == self.TABLE_NAME and not self._title:
|
|
# This was not a valid table start, back to looking for the right anchor.
|
|
self._SetState(self.ANCHOR)
|
|
elif self._state == self.TABLE_NAME and tag == 'table':
|
|
if not table_name.search(self._title):
|
|
# Table title does not match the expected format - back to square one.
|
|
self._SetState(self.ANCHOR)
|
|
return # will have to start over
|
|
table_number = int(self._title.split()[1])
|
|
self._previous_table_number += 1
|
|
if table_number > self._previous_table_number:
|
|
print('Table(s) %s missing' % ' '.join(
|
|
'%d' % x for x in
|
|
range(self._previous_table_number, table_number)), file=sys.stderr)
|
|
self._previous_table_number = table_number
|
|
self._table.Init(self._title)
|
|
self._SetState(self.TABLE_BODY)
|
|
elif self._state == self.MAYBE_DONE and tag == 'tr':
|
|
self._SetState(self.SKIP_HEADER)
|
|
elif self._state == self.SKIP_HEADER and tag == 'tr':
|
|
self._SetState(self.TABLE_BODY)
|
|
self._table.NewRow()
|
|
elif self._state == self.TABLE_BODY:
|
|
if tag == 'tr':
|
|
self._table.NewRow()
|
|
elif tag == 'td':
|
|
self._table.NewCell()
|
|
|
|
def handle_endtag(self, tag):
|
|
"""Invoked each time an HTML tag is closed."""
|
|
if tag == 'table' and self._table.InProgress():
|
|
self._SetState(self.MAYBE_DONE)
|
|
|
|
def handle_data(self, data):
|
|
"""Process data outside HTML tags."""
|
|
if self._state == self.TABLE_NAME:
|
|
self._title += ' %s' % self._Normalize(data)
|
|
elif self._state == self.TABLE_BODY:
|
|
self._table.AddData(self._Normalize(data))
|
|
elif self._state == self.MAYBE_DONE:
|
|
# Done indeed
|
|
self._table.ProcessTable()
|
|
self._table.Init()
|
|
self._SetState(self.ANCHOR)
|
|
|
|
def close(self):
|
|
"""Finish processing of the HTML buffer."""
|
|
if self._state in (self.TABLE_BODY, self.MAYBE_DONE):
|
|
self._table.ProcessTable()
|
|
self._state = self.ANCHOR
|
|
|
|
def handle_entityref(self, name):
|
|
"""Process HTML escape sequence."""
|
|
entmap = {
|
|
'amp': '&',
|
|
'gt': '>',
|
|
'lt': '<',
|
|
'quot': '"',
|
|
}
|
|
if name in entmap:
|
|
if self._state == self.TABLE_BODY:
|
|
self._table.AddData(entmap[name])
|
|
elif self._state == self.TABLE_NAME:
|
|
self._title += entmap[name]
|
|
|
|
|
|
def main(structs_html_file_name):
|
|
"""When invoked standalone - dump .h file on the console."""
|
|
parser = SpecParser()
|
|
with open(structs_html_file_name) as input_file:
|
|
html_content = input_file.read()
|
|
parser.feed(html_content)
|
|
parser.close()
|
|
print(parser.GetTable().GetHFile())
|
|
|
|
if __name__ == '__main__':
|
|
if len(sys.argv) != 2:
|
|
print('%s: One parameter is required, the name of the html file '
|
|
'which is the TPM2 library Part 2 specification' %
|
|
os.path.basename(sys.argv[0]), file=sys.stderr)
|
|
sys.exit(1)
|
|
main(sys.argv[1])
|