438 lines
14 KiB
C
438 lines
14 KiB
C
/*
 * Copyright (C) 2008-2009 SVOX AG, Baslerstr. 30, 8048 Zuerich, Switzerland
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
|
|
/**
 * @file picokfst.c
 *
 * FST knowledge loading and access
 *
 * Copyright (C) 2008-2009 SVOX AG, Baslerstr. 30, 8048 Zuerich, Switzerland
 * All rights reserved.
 *
 * History:
 * - 2009-04-20 -- initial version
 *
 */
|
|
#include "picoos.h"
|
|
#include "picodbg.h"
|
|
#include "picoknow.h"
|
|
#include "picokfst.h"
|
|
|
|
#ifdef __cplusplus
|
|
extern "C" {
|
|
#endif
|
|
#if 0
|
|
}
|
|
#endif
|
|
|
|
|
|
#define FileHdrSize 4 /* size in bytes of the FST file header; kfstInitialize starts parsing the FST fields at this offset */
|
|
|
|
|
|
|
|
/* ************************************************************/
|
|
/* function to create specialized kb, */
|
|
/* to be used by picorsrc only */
|
|
/* ************************************************************/
|
|
|
|
/** object : FSTKnowledgeBase
|
|
* shortcut : kfst
|
|
* derived from : picoknow_KnowledgeBase
|
|
*/
|
|
|
|
typedef struct kfst_subobj * kfst_SubObj;
|
|
|
|
typedef struct kfst_subobj{
|
|
picoos_uint8 * fstStream; /* the byte stream base address */
|
|
picoos_int32 hdrLen; /* length of file header */
|
|
picoos_int32 transductionMode; /* transduction mode to be used for FST */
|
|
picoos_int32 nrClasses; /* nr of pair/transition classes in FST; class is in [1..nrClasses] */
|
|
picoos_int32 nrStates; /* nr of states in FST; state is in [1..nrState] */
|
|
picoos_int32 termClass; /* pair class of terminator symbol pair; probably obsolete */
|
|
picoos_int32 alphaHashTabSize; /* size of pair alphabet hash table */
|
|
picoos_int32 alphaHashTabPos; /* absolute address of the start of the pair alphabet */
|
|
picoos_int32 transTabEntrySize; /* size in bytes of each transition table entry */
|
|
picoos_int32 transTabPos; /* absolute address of the start of the transition table */
|
|
picoos_int32 inEpsStateTabPos; /* absolute address of the start of the input epsilon transition table */
|
|
picoos_int32 accStateTabPos; /* absolute address of the table of accepting states */
|
|
} kfst_subobj_t;
|
|
|
|
|
|
|
|
/* ************************************************************/
|
|
/* primitives for reading from byte stream */
|
|
/* ************************************************************/
|
|
|
|
/* Converts 'nrBytes' bytes starting at position '*pos' in byte stream 'stream' into unsigned number 'num'.
|
|
'*pos' is modified to the position right after the number */
|
|
static void FixedBytesToUnsignedNum (picoos_uint8 * stream, picoos_uint8 nrBytes, picoos_uint32 * pos, picoos_uint32 * num)
|
|
{
|
|
picoos_int32 i;
|
|
|
|
(*num) = 0;
|
|
for (i = 0; i < nrBytes; i++) {
|
|
(*num) = ((*num) << 8) + (picoos_uint32)stream[*pos];
|
|
(*pos)++;
|
|
}
|
|
}
|
|
|
|
|
|
/* Converts 'nrBytes' bytes starting at position '*pos' in byte stream 'stream' into signed number 'num'.
|
|
'*pos' is modified to the position right after the number */
|
|
static void FixedBytesToSignedNum (picoos_uint8 * stream, picoos_uint8 nrBytes, picoos_uint32 * pos, picoos_int32 * num)
|
|
{
|
|
picoos_int32 i;
|
|
picoos_uint32 val;
|
|
|
|
val = 0;
|
|
for (i = 0; i < nrBytes; i++) {
|
|
val = (val << 8) + (picoos_uint32)stream[*pos];
|
|
(*pos)++;
|
|
}
|
|
if (val % 2 == 1) {
|
|
/* negative number */
|
|
(*num) = -((picoos_int32)((val - 1) / 2)) - 1;
|
|
} else {
|
|
/* positive number */
|
|
(*num) = val / 2;
|
|
}
|
|
}
|
|
|
|
|
|
/* Converts varying-sized sequence of bytes starting at position '*pos' in byte stream 'stream'
|
|
into (signed) number 'num'. '*pos' is modified to the position right after the number. */
|
|
static void BytesToNum (picoos_uint8 * stream, picoos_uint32 * pos, picoos_int32 * num)
|
|
{
|
|
picoos_uint32 val;
|
|
picoos_uint32 b;
|
|
|
|
val = 0;
|
|
b = (picoos_uint32)stream[*pos];
|
|
(*pos)++;
|
|
while (b < 128) {
|
|
val = (val << 7) + b;
|
|
b = (picoos_uint32)stream[*pos];
|
|
(*pos)++;
|
|
}
|
|
val = (val << 7) + (b - 128);
|
|
if (val % 2 == 1) {
|
|
/* negative number */
|
|
(*num) = -((picoos_int32)((val - 1) / 2)) - 1;
|
|
} else {
|
|
/* positive number */
|
|
(*num) = val / 2;
|
|
}
|
|
}
|
|
|
|
|
|
/* ************************************************************/
|
|
/* setting up FST from byte stream */
|
|
/* ************************************************************/
|
|
|
|
static pico_status_t kfstInitialize(register picoknow_KnowledgeBase this,
|
|
picoos_Common common)
|
|
{
|
|
picoos_uint32 curpos;
|
|
picoos_int32 offs;
|
|
kfst_subobj_t * kfst;
|
|
|
|
PICODBG_DEBUG(("kfstInitialize -- start\n"));
|
|
|
|
if (NULL == this || NULL == this->subObj) {
|
|
return picoos_emRaiseException(common->em, PICO_EXC_KB_MISSING, NULL,
|
|
NULL);
|
|
}
|
|
kfst = (kfst_subobj_t *) this->subObj;
|
|
|
|
/* +CT+ */
|
|
kfst->fstStream = this->base;
|
|
PICODBG_TRACE(("base: %d\n",this->base));
|
|
kfst->hdrLen = FileHdrSize;
|
|
curpos = kfst->hdrLen;
|
|
BytesToNum(kfst->fstStream,& curpos,& kfst->transductionMode);
|
|
BytesToNum(kfst->fstStream,& curpos,& kfst->nrClasses);
|
|
BytesToNum(kfst->fstStream,& curpos,& kfst->nrStates);
|
|
BytesToNum(kfst->fstStream,& curpos,& kfst->termClass);
|
|
BytesToNum(kfst->fstStream,& curpos,& kfst->alphaHashTabSize);
|
|
BytesToNum(kfst->fstStream,& curpos,& offs);
|
|
kfst->alphaHashTabPos = kfst->hdrLen + offs;
|
|
BytesToNum(kfst->fstStream,& curpos,& kfst->transTabEntrySize);
|
|
BytesToNum(kfst->fstStream,& curpos,& offs);
|
|
kfst->transTabPos = kfst->hdrLen + offs;
|
|
BytesToNum(kfst->fstStream,& curpos,& offs);
|
|
kfst->inEpsStateTabPos = kfst->hdrLen + offs;
|
|
BytesToNum(kfst->fstStream,& curpos,& offs);
|
|
kfst->accStateTabPos = kfst->hdrLen + offs;
|
|
/* -CT- */
|
|
|
|
return PICO_OK;
|
|
}
|
|
|
|
|
|
static pico_status_t kfstSubObjDeallocate(register picoknow_KnowledgeBase this,
|
|
picoos_MemoryManager mm)
|
|
{
|
|
if (NULL != this) {
|
|
picoos_deallocate(mm, (void *) &this->subObj);
|
|
}
|
|
return PICO_OK;
|
|
}
|
|
|
|
|
|
/* calculates a small number of data (e.g. addresses) from kb for fast access.
|
|
* This data is encapsulated in a picokfst_FST that can later be retrieved
|
|
* with picokfst_getFST. */
|
|
pico_status_t picokfst_specializeFSTKnowledgeBase(picoknow_KnowledgeBase this,
|
|
picoos_Common common)
|
|
{
|
|
pico_status_t status;
|
|
|
|
if (NULL == this) {
|
|
return picoos_emRaiseException(common->em, PICO_EXC_KB_MISSING, NULL, NULL);
|
|
}
|
|
if (0 < this->size) {
|
|
/* not a dummy kb */
|
|
this->subDeallocate = kfstSubObjDeallocate;
|
|
|
|
this->subObj = picoos_allocate(common->mm, sizeof(kfst_subobj_t));
|
|
|
|
if (NULL == this->subObj) {
|
|
return picoos_emRaiseException(common->em, PICO_EXC_OUT_OF_MEM, NULL, NULL);
|
|
}
|
|
status = kfstInitialize(this, common);
|
|
if (PICO_OK != status) {
|
|
picoos_deallocate(common->mm,(void **)&this->subObj);
|
|
}
|
|
}
|
|
return PICO_OK;
|
|
}
|
|
|
|
|
|
/* ************************************************************/
|
|
/* FST type and getFST function */
|
|
/* ************************************************************/
|
|
|
|
|
|
|
|
/* return kb FST for usage in PU */
|
|
picokfst_FST picokfst_getFST(picoknow_KnowledgeBase this)
|
|
{
|
|
if (NULL == this) {
|
|
return NULL;
|
|
} else {
|
|
return (picokfst_FST) this->subObj;
|
|
}
|
|
}
|
|
|
|
|
|
|
|
/* ************************************************************/
|
|
/* FST access methods */
|
|
/* ************************************************************/
|
|
|
|
|
|
/* see description in header file */
|
|
extern picoos_uint8 picokfst_kfstGetTransductionMode(picokfst_FST this)
|
|
{
|
|
kfst_SubObj fst = (kfst_SubObj) this;
|
|
if (fst != NULL) {
|
|
return fst->transductionMode;
|
|
} else {
|
|
return 0;
|
|
}
|
|
}
|
|
|
|
|
|
/* see description in header file */
|
|
extern void picokfst_kfstGetFSTSizes (picokfst_FST this, picoos_int32 *nrStates, picoos_int32 *nrClasses)
|
|
{
|
|
kfst_SubObj fst = (kfst_SubObj) this;
|
|
if (fst != NULL) {
|
|
*nrStates = fst->nrStates;
|
|
*nrClasses = fst->nrClasses;
|
|
} else {
|
|
*nrStates = 0;
|
|
*nrClasses = 0;
|
|
}
|
|
}
|
|
|
|
/* see description in header file */
|
|
extern void picokfst_kfstStartPairSearch (picokfst_FST this, picokfst_symid_t inSym,
|
|
picoos_bool * inSymFound, picoos_int32 * searchState)
|
|
{
|
|
picoos_uint32 pos;
|
|
picoos_int32 offs;
|
|
picoos_int32 h;
|
|
picoos_int32 inSymCellPos;
|
|
picoos_int32 inSymX;
|
|
picoos_int32 nextSameHashInSymOffs;
|
|
|
|
kfst_SubObj fst = (kfst_SubObj) this;
|
|
(*searchState) = -1;
|
|
(*inSymFound) = 0;
|
|
h = inSym % fst->alphaHashTabSize;
|
|
pos = fst->alphaHashTabPos + (h * 4);
|
|
FixedBytesToSignedNum(fst->fstStream,4,& pos,& offs);
|
|
if (offs > 0) {
|
|
inSymCellPos = fst->alphaHashTabPos + offs;
|
|
pos = inSymCellPos;
|
|
BytesToNum(fst->fstStream,& pos,& inSymX);
|
|
BytesToNum(fst->fstStream,& pos,& nextSameHashInSymOffs);
|
|
while ((inSymX != inSym) && (nextSameHashInSymOffs > 0)) {
|
|
inSymCellPos = inSymCellPos + nextSameHashInSymOffs;
|
|
pos = inSymCellPos;
|
|
BytesToNum(fst->fstStream,& pos,& inSymX);
|
|
BytesToNum(fst->fstStream,& pos,& nextSameHashInSymOffs);
|
|
}
|
|
if (inSymX == inSym) {
|
|
/* input symbol found; state is set to position after symbol cell */
|
|
(*searchState) = pos;
|
|
(*inSymFound) = 1;
|
|
}
|
|
}
|
|
}
|
|
|
|
|
|
/* see description in header file */
|
|
extern void picokfst_kfstGetNextPair (picokfst_FST this, picoos_int32 * searchState,
|
|
picoos_bool * pairFound,
|
|
picokfst_symid_t * outSym, picokfst_class_t * pairClass)
|
|
{
|
|
picoos_uint32 pos;
|
|
picoos_int32 val;
|
|
|
|
kfst_SubObj fst = (kfst_SubObj) this;
|
|
if ((*searchState) < 0) {
|
|
(*pairFound) = 0;
|
|
(*outSym) = PICOKFST_SYMID_ILLEG;
|
|
(*pairClass) = -1;
|
|
} else {
|
|
pos = (*searchState);
|
|
BytesToNum(fst->fstStream,& pos,& val);
|
|
*outSym = (picokfst_symid_t)val;
|
|
if ((*outSym) != PICOKFST_SYMID_ILLEG) {
|
|
BytesToNum(fst->fstStream,& pos,& val);
|
|
*pairClass = (picokfst_class_t)val;
|
|
(*pairFound) = 1;
|
|
(*searchState) = pos;
|
|
} else {
|
|
(*pairFound) = 0;
|
|
(*outSym) = PICOKFST_SYMID_ILLEG;
|
|
(*pairClass) = -1;
|
|
(*searchState) = -1;
|
|
}
|
|
}
|
|
}
|
|
|
|
|
|
|
|
/* see description in header file */
|
|
extern void picokfst_kfstGetTrans (picokfst_FST this, picokfst_state_t startState, picokfst_class_t transClass,
|
|
picokfst_state_t * endState)
|
|
{
|
|
|
|
picoos_uint32 pos;
|
|
picoos_int32 index;
|
|
picoos_uint32 endStateX;
|
|
|
|
kfst_SubObj fst = (kfst_SubObj) this;
|
|
if ((startState < 1) || (startState > fst->nrStates) || (transClass < 1) || (transClass > fst->nrClasses)) {
|
|
(*endState) = 0;
|
|
} else {
|
|
index = (startState - 1) * fst->nrClasses + transClass - 1;
|
|
pos = fst->transTabPos + (index * fst->transTabEntrySize);
|
|
FixedBytesToUnsignedNum(fst->fstStream,fst->transTabEntrySize,& pos,& endStateX);
|
|
(*endState) = endStateX;
|
|
}
|
|
}
|
|
|
|
|
|
/* see description in header file */
|
|
extern void picokfst_kfstStartInEpsTransSearch (picokfst_FST this, picokfst_state_t startState,
|
|
picoos_bool * inEpsTransFound, picoos_int32 * searchState)
|
|
{
|
|
|
|
picoos_int32 offs;
|
|
picoos_uint32 pos;
|
|
|
|
kfst_SubObj fst = (kfst_SubObj) this;
|
|
(*searchState) = -1;
|
|
(*inEpsTransFound) = 0;
|
|
if ((startState > 0) && (startState <= fst->nrStates)) {
|
|
pos = fst->inEpsStateTabPos + (startState - 1) * 4;
|
|
FixedBytesToSignedNum(fst->fstStream,4,& pos,& offs);
|
|
if (offs > 0) {
|
|
(*searchState) = fst->inEpsStateTabPos + offs;
|
|
(*inEpsTransFound) = 1;
|
|
}
|
|
}
|
|
}
|
|
|
|
|
|
|
|
/* see description in header file */
|
|
extern void picokfst_kfstGetNextInEpsTrans (picokfst_FST this, picoos_int32 * searchState,
|
|
picoos_bool * inEpsTransFound,
|
|
picokfst_symid_t * outSym, picokfst_state_t * endState)
|
|
{
|
|
picoos_uint32 pos;
|
|
picoos_int32 val;
|
|
|
|
kfst_SubObj fst = (kfst_SubObj) this;
|
|
if ((*searchState) < 0) {
|
|
(*inEpsTransFound) = 0;
|
|
(*outSym) = PICOKFST_SYMID_ILLEG;
|
|
(*endState) = 0;
|
|
} else {
|
|
pos = (*searchState);
|
|
BytesToNum(fst->fstStream,& pos,& val);
|
|
*outSym = (picokfst_symid_t)val;
|
|
if ((*outSym) != PICOKFST_SYMID_ILLEG) {
|
|
BytesToNum(fst->fstStream,& pos,& val);
|
|
*endState = (picokfst_state_t)val;
|
|
(*inEpsTransFound) = 1;
|
|
(*searchState) = pos;
|
|
} else {
|
|
(*inEpsTransFound) = 0;
|
|
(*outSym) = PICOKFST_SYMID_ILLEG;
|
|
(*endState) = 0;
|
|
(*searchState) = -1;
|
|
}
|
|
}
|
|
}
|
|
|
|
|
|
/* see description in header file */
|
|
extern picoos_bool picokfst_kfstIsAcceptingState (picokfst_FST this, picokfst_state_t state)
|
|
{
|
|
|
|
picoos_uint32 pos;
|
|
picoos_uint32 val;
|
|
|
|
kfst_SubObj fst = (kfst_SubObj) this;
|
|
if ((state > 0) && (state <= fst->nrStates)) {
|
|
pos = fst->accStateTabPos + (state - 1);
|
|
FixedBytesToUnsignedNum(fst->fstStream,1,& pos,& val);
|
|
return (val == 1);
|
|
} else {
|
|
return 0;
|
|
}
|
|
}
|
|
|
|
#ifdef __cplusplus
|
|
}
|
|
#endif
|
|
|
|
/* End picokfst.c */
|