572 lines
18 KiB
C
572 lines
18 KiB
C
/*
|
|
* Copyright (C) 2008-2009 SVOX AG, Baslerstr. 30, 8048 Zuerich, Switzerland
|
|
*
|
|
* Licensed under the Apache License, Version 2.0 (the "License");
|
|
* you may not use this file except in compliance with the License.
|
|
* You may obtain a copy of the License at
|
|
*
|
|
* http://www.apache.org/licenses/LICENSE-2.0
|
|
*
|
|
* Unless required by applicable law or agreed to in writing, software
|
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
* See the License for the specific language governing permissions and
|
|
* limitations under the License.
|
|
*/
|
|
/**
|
|
* @file picoklex.c
|
|
*
|
|
* Copyright (C) 2008-2009 SVOX AG, Baslerstr. 30, 8048 Zuerich, Switzerland
|
|
* All rights reserved.
|
|
*
|
|
* History:
|
|
* - 2009-04-20 -- initial version
|
|
*
|
|
*/
|
|
#include "picoos.h"
|
|
#include "picodbg.h"
|
|
#include "picodata.h"
|
|
#include "picoknow.h"
|
|
#include "picoklex.h"
|
|
|
|
#ifdef __cplusplus
|
|
extern "C" {
|
|
#endif
|
|
#if 0
|
|
}
|
|
#endif
|
|
|
|
/* ************************************************************/
|
|
/* lexicon */
|
|
/* ************************************************************/
|
|
|
|
/**
|
|
* @addtogroup picolex
|
|
*
|
|
overview:
|
|
- lex consists of optional searchindex and a non-empty list of lexblocks
|
|
- lexblocks are fixed size, at the start of a block there is also the
|
|
start of an entry
|
|
- using the searchindex an unambiguous lexblock can be determined which
|
|
contains the entry (or there is no entry)
|
|
- one lex entry has POS GRAPH PHON, all mandatory, but
|
|
- PHON can be empty string -> no pronunciation in the resulting TTS output
|
|
- PHON can be :G2P -> use G2P later to add pronunciation
|
|
- (POS,GRAPH) is a uniq key (only one entry allowed)
|
|
- (GRAPH) is almost a uniq key (2-4 entries with the same GRAPH, and
|
|
differing POS and differing PHON possible)
|
|
- for one graph we can have two or three solutions from the lex
|
|
which all need to be passed on to the next PU
|
|
- in this case GRAPH, POS, and PHON all must be available in lex
|
|
|
|
sizing:
|
|
- 3 bytes entry index -> 16MB addressable
|
|
- 2 bytes searchindex nr -> 64K blocks possible
|
|
- 5 bytes per searchindex entry
|
|
- 3 bytes for graph-prefix
|
|
- 2 bytes blockadr in searchindex -> 64K blocks possible
|
|
- lexblock size 512B:
|
|
- 32M possible
|
|
- with ~20 bytes per entry
|
|
-> max. average of ~26 entries to be searched per lookup
|
|
- overhead of ~10 bytes per block to sync with
|
|
block boundaries
|
|
- examples:
|
|
- 500KB lex -> 1000 blocks,
|
|
1000 entries in searchindex, ~25.6K lex-entries,
|
|
- ~5KB searchindex
|
|
~10KB overhead for block sync
|
|
- 100KB lex -> 200 blocks,
|
|
200 entries in searchindex, ~5.1K lex-entries,
|
|
- ~1KB searchindex
|
|
~2KB overhead for block sync
|
|
|
|
pil-file: lexicon knowledge base in binary form
|
|
|
|
lex-kb = content
|
|
|
|
content = searchindex {lexblock}1:NRBLOCKS2
|
|
|
|
lexblock = {lexentry}1: (lexblock size is fixed 512Bytes)
|
|
|
|
searchindex = NRBLOCKS2 {GRAPH1 GRAPH1 GRAPH1 LEXBLOCKIND2}=NRBLOCKS2
|
|
|
|
lexentry = LENGRAPH1 {GRAPH1}=LENGRAPH1-1
|
|
LENPOSPHON1 POS1 {PHON1}=LENPOSPHON1-2
|
|
|
|
- special cases:
|
|
- PHON is empty string (no pronunciation in the resulting TTS output):
|
|
lexentry = LENGRAPH1 {GRAPH1}=LENGRAPH1-1 2 POS1
|
|
- PHON can be :G2P -> use G2P later to add pronunciation:
|
|
lexentry = LENGRAPH1 {GRAPH1}=LENGRAPH1-1 3 POS1 <reserved-phon-val=5>
|
|
- multi-byte values always little endian
|
|
*/
|
|
|
|
|
|
/* ************************************************************/
|
|
/* lexicon data defines */
|
|
/* may not be changed with current implementation */
|
|
/* ************************************************************/
|
|
|
|
/* nr bytes of the nrblocks info at the start of the lex-kb (NRBLOCKS2) */
#define PICOKLEX_LEX_NRBLOCKS_SIZE 2

/* search index entry (SIE):
   - NRGRAPHS: nr of grapheme bytes forming the graph-prefix
   - INDSIZE:  nr of bytes of the block index
   - SIZE:     nr of bytes per entry, NRGRAPHS + INDSIZE */
#define PICOKLEX_LEX_SIE_NRGRAPHS 3
#define PICOKLEX_LEX_SIE_INDSIZE 2
#define PICOKLEX_LEX_SIE_SIZE (PICOKLEX_LEX_SIE_NRGRAPHS + PICOKLEX_LEX_SIE_INDSIZE)

/* nr of bytes per lexblock */
#define PICOKLEX_LEXBLOCK_SIZE 512


/* reserved phon value in klex indicating that :G2P is needed for a
   lexentry */
#define PICOKLEX_NEEDS_G2P 5
|
|
|
|
|
|
/* ************************************************************/
|
|
/* lexicon type and loading */
|
|
/* ************************************************************/
|
|
|
|
/** object : LexKnowledgeBase
|
|
* shortcut : klex
|
|
* derived from : picoknow_KnowledgeBase
|
|
*/
|
|
|
|
typedef struct klex_subobj *klex_SubObj;
|
|
|
|
typedef struct klex_subobj
|
|
{
|
|
picoos_uint16 nrblocks; /* nr lexblocks = nr eles in searchind */
|
|
picoos_uint8 *searchind;
|
|
picoos_uint8 *lexblocks;
|
|
} klex_subobj_t;
|
|
|
|
|
|
static pico_status_t klexInitialize(register picoknow_KnowledgeBase this,
|
|
picoos_Common common)
|
|
{
|
|
picoos_uint32 curpos = 0;
|
|
klex_subobj_t *klex;
|
|
|
|
PICODBG_DEBUG(("start"));
|
|
|
|
/* check whether (this->size != 0) done before calling this function */
|
|
|
|
if (NULL == this || NULL == this->subObj) {
|
|
return picoos_emRaiseException(common->em, PICO_EXC_KB_MISSING,
|
|
NULL, NULL);
|
|
}
|
|
klex = (klex_subobj_t *) this->subObj;
|
|
|
|
if (PICO_OK == picoos_read_mem_pi_uint16(this->base, &curpos,
|
|
&(klex->nrblocks))) {
|
|
if (klex->nrblocks > 0) {
|
|
PICODBG_DEBUG(("nr blocks: %i, curpos: %i", klex->nrblocks,curpos));
|
|
klex->searchind = this->base + curpos;
|
|
} else {
|
|
klex->searchind = NULL;
|
|
}
|
|
klex->lexblocks = this->base + PICOKLEX_LEX_NRBLOCKS_SIZE +
|
|
(klex->nrblocks * (PICOKLEX_LEX_SIE_SIZE));
|
|
return PICO_OK;
|
|
} else {
|
|
return picoos_emRaiseException(common->em, PICO_EXC_FILE_CORRUPT,
|
|
NULL, NULL);
|
|
}
|
|
}
|
|
|
|
|
|
static pico_status_t klexSubObjDeallocate(register picoknow_KnowledgeBase this,
|
|
picoos_MemoryManager mm)
|
|
{
|
|
if (NULL != this) {
|
|
picoos_deallocate(mm, (void *) &this->subObj);
|
|
}
|
|
return PICO_OK;
|
|
}
|
|
|
|
|
|
/* we don't offer a specialized constructor for a LexKnowledgeBase but
|
|
* instead a "specializer" of an already existing generic
|
|
* picoknow_KnowledgeBase */
|
|
|
|
pico_status_t picoklex_specializeLexKnowledgeBase(picoknow_KnowledgeBase this,
|
|
picoos_Common common)
|
|
{
|
|
if (NULL == this) {
|
|
return picoos_emRaiseException(common->em, PICO_EXC_KB_MISSING,
|
|
NULL, NULL);
|
|
}
|
|
if (this->size > 0) {
|
|
this->subDeallocate = klexSubObjDeallocate;
|
|
this->subObj = picoos_allocate(common->mm, sizeof(klex_subobj_t));
|
|
if (NULL == this->subObj) {
|
|
return picoos_emRaiseException(common->em, PICO_EXC_OUT_OF_MEM,
|
|
NULL, NULL);
|
|
}
|
|
return klexInitialize(this, common);
|
|
} else {
|
|
/* some dummy klex */
|
|
return PICO_OK;
|
|
}
|
|
}
|
|
|
|
/* for now we don't need to do anything special for the main lex */
|
|
/*
|
|
pico_status_t picoklex_specializeMainLexKnowledgeBase(
|
|
picoknow_KnowledgeBase this,
|
|
picoos_Common common)
|
|
{
|
|
return picoklex_specializeLexKnowledgeBase(this,common);
|
|
}
|
|
*/
|
|
|
|
|
|
/* ************************************************************/
|
|
/* lexicon getLex */
|
|
/* ************************************************************/
|
|
|
|
/**
 * Returns the lexicon view of a knowledge base.
 *
 * @param this  knowledge base, may be NULL
 * @return the subObj of the knowledge base cast to picoklex_Lex, or NULL
 *         if 'this' is NULL
 */
picoklex_Lex picoklex_getLex(picoknow_KnowledgeBase this)
{
    return (NULL == this) ? NULL : (picoklex_Lex) this->subObj;
}
|
|
|
|
|
|
/* ************************************************************/
|
|
/* functions on searchindex */
|
|
/* ************************************************************/
|
|
|
|
|
|
static picoos_uint32 klex_getSearchIndexVal(const klex_SubObj this,
|
|
picoos_uint16 index)
|
|
{
|
|
picoos_uint32 pos, val;
|
|
pos = index * PICOKLEX_LEX_SIE_SIZE;
|
|
val = this->searchind[pos];
|
|
val = (val << 8) + this->searchind[pos + 1];
|
|
val = (val << 8) + this->searchind[pos + 2];
|
|
return val;
|
|
}
|
|
|
|
|
|
/* Determine first lexblock containing entries for specified
|
|
grapheme. */
|
|
|
|
static picoos_uint16 klex_getLexblockNr(const klex_SubObj this,
|
|
const picoos_uint8 *graphsi) {
|
|
/* graphsi is of len PICOKLEX_LEX_SI_NGRAPHS */
|
|
picoos_int32 low, mid, high;
|
|
picoos_uint32 searchval, indval;
|
|
|
|
/* PICOKLEX_LEX_SIE_NRGRAPHS */
|
|
|
|
/* convert graph-prefix to number with 'lexicographic' ordering */
|
|
searchval = graphsi[0];
|
|
searchval = (searchval << 8) + graphsi[1];
|
|
searchval = (searchval << 8) + graphsi[2];
|
|
|
|
low = 0;
|
|
high = this->nrblocks;
|
|
|
|
/* do binary search */
|
|
while (low < high) {
|
|
mid = (low + high) / 2;
|
|
indval = klex_getSearchIndexVal(this, mid);
|
|
if (indval < searchval) {
|
|
low = mid + 1;
|
|
} else {
|
|
high = mid;
|
|
}
|
|
}
|
|
PICODBG_ASSERT(high == low);
|
|
/* low points to the first entry greater than or equal to searchval */
|
|
|
|
if (low < this->nrblocks) {
|
|
indval = klex_getSearchIndexVal(this, low);
|
|
if (indval > searchval) {
|
|
low--;
|
|
/* if there are identical elements in the search index we have
|
|
to move to the first one */
|
|
if (low > 0) {
|
|
indval = klex_getSearchIndexVal(this, low);
|
|
while (indval == klex_getSearchIndexVal(this, low-1)) {
|
|
low--;
|
|
}
|
|
}
|
|
}
|
|
} else {
|
|
low = this->nrblocks - 1;
|
|
}
|
|
|
|
#if defined(PICO_DEBUG)
|
|
{
|
|
picoos_uint32 pos = low * PICOKLEX_LEX_SIE_SIZE;
|
|
PICODBG_DEBUG(("binary search result is %c%c%c (%d)",
|
|
this->searchind[pos], this->searchind[pos + 1],
|
|
this->searchind[pos + 2], low));
|
|
}
|
|
#endif
|
|
|
|
return (picoos_uint16) low;
|
|
}
|
|
|
|
|
|
/* Determine number of adjacent lexblocks containing entries for
|
|
the same grapheme search prefix (identified by search index). */
|
|
|
|
static picoos_uint16 klex_getLexblockRange(const klex_SubObj this,
|
|
picoos_uint16 index)
|
|
{
|
|
picoos_uint16 count;
|
|
picoos_uint32 sval1, sval2;
|
|
|
|
sval1 = klex_getSearchIndexVal(this, index);
|
|
|
|
#if defined(PICO_DEBUG)
|
|
/* 'index' must point to first lexblock of its kind */
|
|
if (index > 0) {
|
|
sval2 = klex_getSearchIndexVal(this, index - 1);
|
|
PICODBG_ASSERT(sval1 != sval2);
|
|
}
|
|
#endif
|
|
|
|
index++;
|
|
sval2 = klex_getSearchIndexVal(this, index);
|
|
|
|
count = 1;
|
|
while (sval1 == sval2) {
|
|
count++;
|
|
index++;
|
|
sval2 = klex_getSearchIndexVal(this, index);
|
|
}
|
|
|
|
return count;
|
|
}
|
|
|
|
|
|
/* ************************************************************/
|
|
/* functions on single lexblock */
|
|
/* ************************************************************/
|
|
|
|
static picoos_int8 klex_lexMatch(picoos_uint8 *lexentry,
|
|
const picoos_uint8 *graph,
|
|
const picoos_uint16 graphlen) {
|
|
picoos_uint8 i;
|
|
picoos_uint8 lexlen;
|
|
picoos_uint8 *lexgraph;
|
|
|
|
lexlen = lexentry[0] - 1;
|
|
lexgraph = &(lexentry[1]);
|
|
for (i=0; (i<graphlen) && (i<lexlen); i++) {
|
|
PICODBG_TRACE(("%d|%d graph|lex: %c|%c", graphlen, lexlen,
|
|
graph[i], lexgraph[i]));
|
|
if (lexgraph[i] < graph[i]) {
|
|
return -1;
|
|
} else if (lexgraph[i] > graph[i]) {
|
|
return 1;
|
|
}
|
|
}
|
|
if (graphlen == lexlen) {
|
|
return 0;
|
|
} else if (lexlen < graphlen) {
|
|
return -1;
|
|
} else {
|
|
return 1;
|
|
}
|
|
}
|
|
|
|
|
|
static void klex_setLexResult(const picoos_uint8 *lexentry,
|
|
const picoos_uint32 lexpos,
|
|
picoklex_lexl_result_t *lexres) {
|
|
picoos_uint8 i;
|
|
|
|
/* check if :G2P */
|
|
if ((2 < (lexentry[lexentry[0]])) && ((lexentry[lexentry[0] + 2]) == PICOKLEX_NEEDS_G2P)) {
|
|
/* set pos */
|
|
lexres->posind[0] = lexentry[lexentry[0] + 1];
|
|
/* set rest */
|
|
lexres->phonfound = FALSE;
|
|
lexres->posindlen = 1;
|
|
lexres->nrres = 1;
|
|
PICODBG_DEBUG(("result %d :G2P", lexres->nrres));
|
|
} else {
|
|
i = lexres->nrres * (PICOKLEX_POSIND_SIZE);
|
|
lexres->posindlen += PICOKLEX_POSIND_SIZE;
|
|
lexres->phonfound = TRUE;
|
|
/* set pos */
|
|
lexres->posind[i++] = lexentry[lexentry[0] + 1];
|
|
/* set ind, PICOKLEX_IND_SIZE */
|
|
lexres->posind[i++] = 0x000000ff & (lexpos);
|
|
lexres->posind[i++] = 0x000000ff & (lexpos >> 8);
|
|
lexres->posind[i] = 0x000000ff & (lexpos >> 16);
|
|
lexres->nrres++;
|
|
PICODBG_DEBUG(("result %d", lexres->nrres));
|
|
}
|
|
}
|
|
|
|
|
|
static void klex_lexblockLookup(klex_SubObj this,
|
|
const picoos_uint32 lexposStart,
|
|
const picoos_uint32 lexposEnd,
|
|
const picoos_uint8 *graph,
|
|
const picoos_uint16 graphlen,
|
|
picoklex_lexl_result_t *lexres) {
|
|
picoos_uint32 lexpos;
|
|
picoos_int8 rv;
|
|
|
|
lexres->nrres = 0;
|
|
|
|
lexpos = lexposStart;
|
|
rv = -1;
|
|
while ((rv < 0) && (lexpos < lexposEnd)) {
|
|
|
|
rv = klex_lexMatch(&(this->lexblocks[lexpos]), graph, graphlen);
|
|
|
|
if (rv == 0) { /* found */
|
|
klex_setLexResult(&(this->lexblocks[lexpos]), lexpos, lexres);
|
|
if (lexres->phonfound) {
|
|
/* look for more results, up to MAX_NRRES, don't even
|
|
check if more results would be available */
|
|
while ((lexres->nrres < PICOKLEX_MAX_NRRES) &&
|
|
(lexpos < lexposEnd)) {
|
|
lexpos += this->lexblocks[lexpos];
|
|
lexpos += this->lexblocks[lexpos];
|
|
/* if there are no more entries in this block, advance
|
|
to next block by skipping all zeros */
|
|
while ((this->lexblocks[lexpos] == 0) &&
|
|
(lexpos < lexposEnd)) {
|
|
lexpos++;
|
|
}
|
|
if (lexpos < lexposEnd) {
|
|
if (klex_lexMatch(&(this->lexblocks[lexpos]), graph,
|
|
graphlen) == 0) {
|
|
klex_setLexResult(&(this->lexblocks[lexpos]),
|
|
lexpos, lexres);
|
|
} else {
|
|
/* no more results, quit loop */
|
|
lexpos = lexposEnd;
|
|
}
|
|
}
|
|
}
|
|
} else {
|
|
/* :G2P mark */
|
|
}
|
|
} else if (rv < 0) {
|
|
/* not found, goto next entry */
|
|
lexpos += this->lexblocks[lexpos];
|
|
lexpos += this->lexblocks[lexpos];
|
|
/* if there are no more entries in this block, advance
|
|
to next block by skipping all zeros */
|
|
while ((this->lexblocks[lexpos] == 0) && (lexpos < lexposEnd)) {
|
|
lexpos++;
|
|
}
|
|
} else {
|
|
/* rv > 0, not found, won't show up later in block */
|
|
}
|
|
}
|
|
}
|
|
|
|
|
|
/* ************************************************************/
|
|
/* lexicon lookup functions */
|
|
/* ************************************************************/
|
|
|
|
picoos_uint8 picoklex_lexLookup(const picoklex_Lex this,
|
|
const picoos_uint8 *graph,
|
|
const picoos_uint16 graphlen,
|
|
picoklex_lexl_result_t *lexres) {
|
|
picoos_uint16 lbnr, lbc;
|
|
picoos_uint32 lexposStart, lexposEnd;
|
|
picoos_uint8 i;
|
|
picoos_uint8 tgraph[PICOKLEX_LEX_SIE_NRGRAPHS];
|
|
klex_SubObj klex = (klex_SubObj) this;
|
|
|
|
if (NULL == klex) {
|
|
PICODBG_ERROR(("no lexicon loaded"));
|
|
/* no exception here needed, already checked at initialization */
|
|
return FALSE;
|
|
}
|
|
|
|
lexres->nrres = 0;
|
|
lexres->posindlen = 0;
|
|
lexres->phonfound = FALSE;
|
|
|
|
for (i = 0; i<PICOKLEX_LEX_SIE_NRGRAPHS; i++) {
|
|
if (i < graphlen) {
|
|
tgraph[i] = graph[i];
|
|
} else {
|
|
tgraph[i] = '\0';
|
|
}
|
|
}
|
|
PICODBG_DEBUG(("tgraph: %c%c%c", tgraph[0],tgraph[1],tgraph[2]));
|
|
|
|
if ((klex->nrblocks) == 0) {
|
|
/* no searchindex, no lexblock */
|
|
PICODBG_WARN(("no searchindex, no lexblock"));
|
|
return FALSE;
|
|
} else {
|
|
lbnr = klex_getLexblockNr(klex, tgraph);
|
|
PICODBG_ASSERT(lbnr < klex->nrblocks);
|
|
lbc = klex_getLexblockRange(klex, lbnr);
|
|
PICODBG_ASSERT((lbc >= 1) && (lbc <= klex->nrblocks));
|
|
}
|
|
PICODBG_DEBUG(("lexblock nr: %d (#%d)", lbnr, lbc));
|
|
|
|
lexposStart = lbnr * PICOKLEX_LEXBLOCK_SIZE;
|
|
lexposEnd = lexposStart + lbc * PICOKLEX_LEXBLOCK_SIZE;
|
|
|
|
PICODBG_DEBUG(("lookup start, lexpos range %d..%d", lexposStart,lexposEnd));
|
|
klex_lexblockLookup(klex, lexposStart, lexposEnd, graph, graphlen, lexres);
|
|
PICODBG_DEBUG(("lookup done, %d found", lexres->nrres));
|
|
|
|
return (lexres->nrres > 0);
|
|
}
|
|
|
|
|
|
/**
 * Returns POS and pronunciation of the lexicon entry at a given index.
 *
 * 'ind' is the three byte position (least significant byte first) of a
 * lexentry within the lexblock byte stream, as stored in a lookup result.
 *
 * @param this     lexicon as returned by picoklex_getLex, may be NULL
 * @param ind      entry index, PICOKLEX_IND_SIZE bytes
 * @param indlen   nr of bytes in 'ind', must be PICOKLEX_IND_SIZE
 * @param pos      receives the POS of the entry
 * @param phon     receives a pointer to the phonemes inside the lexicon
 *                 data (not copied)
 * @param phonlen  receives the nr of phoneme bytes (LENPOSPHON1 - 2)
 * @return TRUE on success; FALSE if no lexicon is loaded, 'indlen' is
 *         wrong, or the entry (index, pos/phon length byte or phonemes)
 *         does not lie within the lexblock byte stream. The output
 *         parameters are left untouched when FALSE is returned.
 */
picoos_uint8 picoklex_lexIndLookup(const picoklex_Lex this,
                                   const picoos_uint8 *ind,
                                   const picoos_uint8 indlen,
                                   picoos_uint8 *pos,
                                   picoos_uint8 **phon,
                                   picoos_uint8 *phonlen) {
    picoos_uint32 pentry;
    picoos_uint32 lexsize;
    picoos_uint8 lenposphon;
    klex_SubObj klex = (klex_SubObj) this;

    if (NULL == klex) {
        PICODBG_ERROR(("no lexicon loaded"));
        return FALSE;
    }

    /* check indlen */
    if (indlen != PICOKLEX_IND_SIZE) {
        return FALSE;
    }

    /* PICOKLEX_IND_SIZE */
    pentry = 0x000000ff & (ind[0]);
    pentry |= ((picoos_uint32)(ind[1]) << 8);
    pentry |= ((picoos_uint32)(ind[2]) << 16);

    /* check ind if it is within lexblocks byte stream, if not, return FALSE */
    lexsize = (picoos_uint32)klex->nrblocks * PICOKLEX_LEXBLOCK_SIZE;
    if (pentry >= lexsize) {
        return FALSE;
    }

    /* skip LENGRAPH1 and the graphemes */
    pentry += (klex->lexblocks[pentry]);

    /* LENPOSPHON1 and POS1 must still be inside the byte stream */
    if ((pentry + 2) > lexsize) {
        return FALSE;
    }
    lenposphon = klex->lexblocks[pentry];
    /* LENPOSPHON1 counts itself and POS1, so values below 2 are invalid
       (and would wrap around in the phonlen computation); the phonemes
       must not reach beyond the byte stream either */
    if ((lenposphon < 2) || ((pentry + lenposphon) > lexsize)) {
        return FALSE;
    }

    pentry++;
    *phonlen = lenposphon - 2;
    *pos = klex->lexblocks[pentry++];
    *phon = &(klex->lexblocks[pentry]);

    PICODBG_DEBUG(("pentry: %d, phonlen: %d", pentry, *phonlen));
    return TRUE;
}
|
|
|
|
#ifdef __cplusplus
|
|
}
|
|
#endif
|
|
|
|
|
|
/* end */
|