1236 lines
31 KiB
C++
1236 lines
31 KiB
C++
/*******************************************************************************
* LtsCart.cpp *
*-------------*
*
*   ** WARNING **
*   CART code for LTS. This code was created in MS Research and LiJ owns
*   the algorithm. YunusM eliminated the private heap used by this code
*   and used the new and delete operators instead.
*
*   Created By: LIJ (MS Research)                          Date: 06/18/99
*   Current Owner: Fil
*
*   Copyright (C) 1999 Microsoft Corporation. All Rights Reserved
*******************************************************************************/
|
|
|
|
//--- Includes --------------------------------------------------------------
|
|
|
|
#include "StdAfx.h"
|
|
#include "LtsCart.h"
|
|
|
|
#pragma warning(disable : 4100)
|
|
|
|
/*
 * The tables below cover the exceptional cases: spelled-out single letters
 * and the fallback pronunciation used when no real output is available
 * (LtscartGetPron uses the fallback for an empty word or when nothing was
 * produced).  The numeric suffix is the LCID the data is selected for in
 * LtscartReadData.
 */
static const char *bogus_pron_1033 =
    "B OW G AH S P R AH N AH N S IY EY SH AH N";

static const char *bogus_pron_1041 =
    "N A N I"; // what?
|
|
|
|
/*
 * Letter-name pronunciations selected for LCID 1033.
 *
 * Slots 0..25 hold the names of the letters 'a'..'z' (indexed by c - 'a').
 * Slots 26..51 hold the plural spellings of the same letters;
 * assign_a_spelling_pron reaches them by adding an offset of 26.
 */
static const char *single_letter_pron_1033[52] =
{
    /* a */ "EY",
    /* b */ "B IY",
    /* c */ "S IY",
    /* d */ "D IY",
    /* e */ "IY",
    /* f */ "EH F",
    /* g */ "JH IY",
    /* h */ "EY CH",
    /* i */ "AY",
    /* j */ "JH EY",
    /* k */ "K EY",
    /* l */ "EH L",
    /* m */ "EH M",
    /* n */ "EH N",
    /* o */ "OW",
    /* p */ "P IY",
    /* q */ "K Y UW",
    /* r */ "AA R",
    /* s */ "EH S",
    /* t */ "T IY",
    /* u */ "Y UW",
    /* v */ "V IY",
    /* w */ "D AH B AX L Y UW",
    /* x */ "EH K S",
    /* y */ "W AY",
    /* z */ "Z IY",

    /* ---- plural spellings ---- */
    /* a's */ "EY Z",
    /* b's */ "B IY Z",
    /* c's */ "S IY Z",
    /* d's */ "D IY Z",
    /* e's */ "IY Z",
    /* f's */ "EH F S",
    /* g's */ "JH IY Z",
    /* h's */ "EY CH AX Z",
    /* i's */ "AY Z",
    /* j's */ "JH EY Z",
    /* k's */ "K EY Z",
    /* l's */ "EH L Z",
    /* m's */ "EH M Z",
    /* n's */ "EH N Z",
    /* o's */ "OW Z",
    /* p's */ "P IY Z",
    /* q's */ "K Y UW Z",
    /* r's */ "AA R Z",
    /* s's */ "EH S AX Z",
    /* t's */ "T IY Z",
    /* u's */ "Y UW Z",
    /* v's */ "V IY Z",
    /* w's */ "D AH B AX L Y UW Z",
    /* x's */ "EH K S AX Z",
    /* y's */ "W AY Z",
    /* z's */ "Z IY Z"
};
|
|
/*
 * Letter-name pronunciations selected for LCID 1041.
 *
 * Same layout as the 1033 table: slots 0..25 are the letters 'a'..'z',
 * slots 26..51 are the plural spellings (each singular form plus " Z U").
 */
static const char *single_letter_pron_1041[52] =
{
    /* a */ "EE",
    /* b */ "B II",
    /* c */ "SH II",
    /* d */ "D II",
    /* e */ "II",
    /* f */ "E H U",
    /* g */ "J II",
    /* h */ "EE CH I",
    /* i */ "A I",
    /* j */ "J EE",
    /* k */ "K EE",
    /* l */ "E R U",
    /* m */ "E M U",
    /* n */ "E N U",
    /* o */ "OO",
    /* p */ "P II",
    /* q */ "K Y UU",
    /* r */ "AA R U",
    /* s */ "E S U",
    /* t */ "T II",
    /* u */ "Y UU",
    /* v */ "B U I",
    /* w */ "D A B U R Y UU",
    /* x */ "E STOP K U S U",
    /* y */ "W A I",
    /* z */ "Z E STOP T O",

    /* ---- plural spellings ---- */
    /* a's */ "EE Z U",
    /* b's */ "B II Z U",
    /* c's */ "SH II Z U",
    /* d's */ "D II Z U",
    /* e's */ "II Z U",
    /* f's */ "E H U Z U",
    /* g's */ "J II Z U",
    /* h's */ "EE CH I Z U",
    /* i's */ "A I Z U",
    /* j's */ "J EE Z U",
    /* k's */ "K EE Z U",
    /* l's */ "E R U Z U",
    /* m's */ "E M U Z U",
    /* n's */ "E N U Z U",
    /* o's */ "OO Z U",
    /* p's */ "P II Z U",
    /* q's */ "K Y UU Z U",
    /* r's */ "AA R U Z U",
    /* s's */ "E S U Z U",
    /* t's */ "T II Z U",
    /* u's */ "Y UU Z U",
    /* v's */ "B U I Z U",
    /* w's */ "D A B U R Y UU Z U",
    /* x's */ "E STOP K U S U Z U",
    /* y's */ "W A I Z U",
    /* z's */ "Z E STOP T O Z U"
};
|
|
|
|
|
|
/*
|
|
* not worthwhile to use binary search with only about 30 entries
|
|
*/
|
|
static int symbol_to_id(LTS_SYMTAB *tab, char *sym)
|
|
{
|
|
USES_CONVERSION;
|
|
SPDBG_FUNC("symbol_to_id");
|
|
|
|
int i;
|
|
for (i = 0; i < tab->n_symbols; i++)
|
|
{
|
|
if (CSTR_EQUAL == CompareString(MAKELCID(MAKELANGID(LANG_ENGLISH, SUBLANG_ENGLISH_US), SORT_DEFAULT), NORM_IGNORECASE,
|
|
A2T(tab->storage + tab->sym_idx[i]), -1, A2T(sym), -1))
|
|
{
|
|
return i;
|
|
}
|
|
}
|
|
return NO_SYMBOL;
|
|
} // static int symbol_to_id(LTS_SYMTAB *tab, char *sym)
|
|
|
|
|
|
/*
 * Map a symbol id back to its string.
 *
 * Returns a pointer into tab->storage for a valid id, or NULL when the id is
 * out of range.  The index array is laid out with n_symbols entries (see
 * LtscartReadData), so the valid ids are 0 .. n_symbols - 1; id == n_symbols
 * must be rejected as well.
 */
static char *id_to_symbol(LTS_SYMTAB *tab, int id)
{
    SPDBG_FUNC("id_to_symbol");

    if (id < 0 || id >= tab->n_symbols)
    {
        return NULL;
    }

    return tab->storage + tab->sym_idx[id];
} /* id_to_symbol */
|
|
|
|
|
|
/*
 * printf-style wrapper around OutputDebugStringA.
 *
 * Formats into a fixed 2048-byte buffer; longer messages are truncated.
 * Compiles to an empty function unless _DEBUG is defined.
 */
__inline void ODS (const char *format, ...)
{
#ifdef _DEBUG
    SPDBG_FUNC("ODS");

    char buf[2048];
    va_list arglist;

    va_start (arglist, format);
    // _vsnprintf writes no terminator when the output fills the buffer, so
    // keep the last byte out of its reach and terminate explicitly.
    _vsnprintf(buf, sizeof(buf) - 1, format, arglist);
    buf[sizeof(buf) - 1] = 0;
    va_end (arglist);

    OutputDebugStringA(buf);
#endif
}
|
|
|
|
/*
 * Answer one simple question about a sample.
 *
 * Picks the bit set feat[question.questype].feature[question.feature],
 * fetches a symbol from the sample through SAMPLE_GET_CONTEXT (using the
 * question's questype, context and offset), and reports whether that
 * symbol's bit is set.  Returns TRUE or FALSE.
 */
__inline int ans_simp_question (LTS_FEATURE *feat, SIMPLE_QUESTION question,
                                LTS_SAMPLE *sample)
{
    SPDBG_FUNC("ans_simp_question");

    SYMBOL context_sym;
    int *feature_bits = feat[question.questype].feature[question.feature];

    SAMPLE_GET_CONTEXT(sample, question.questype, question.context,
                       question.offset, context_sym);

    if (TST_BIT(feature_bits, context_sym))
    {
        return TRUE;
    }
    return FALSE;
} /* ans_simp_question */
|
|
|
|
|
|
/*
 * Evaluate one product term given as text.
 *
 * The term is a sequence of decimal question codes joined by '&', each
 * optionally prefixed with '~' for negation, e.g. "12&~7".  Every code is
 * decoded with QUES_DECODE and answered with ans_simp_question.
 *
 * Returns TRUE when every (possibly negated) question holds, and FALSE as
 * soon as one fails or the text is malformed (a non-digit where a code is
 * expected, or a separator other than '&').
 */
static int product_eval (LTS_FEATURE *feat, char *term, LTS_SAMPLE *sample)
{
    SPDBG_FUNC("product_eval");

    int negate, result;
    SIMPLE_QUESTION ques;
    char *cptr = term;

    for (;;)
    {
        /* optional negation sign */
        negate = FALSE;
        if (*cptr == '~')
        {
            negate = TRUE;
            cptr++;
        }

        // <ctype.h> functions are only defined for unsigned char values
        // (or EOF), so convert before classifying.
        if (!isdigit((unsigned char)*cptr))
        {
            /* invalid product */
            return FALSE;
        }

        /* decimal question code */
        result = *cptr++ - '0';
        while (isdigit((unsigned char)*cptr))
        {
            result = result * 10 + (*cptr - '0');
            cptr++;
        }

        QUES_DECODE(result, ques.questype, ques.context, ques.offset,
                    ques.feature);
        if ((negate ^ ans_simp_question (feat, ques, sample)) == FALSE)
        {
            return FALSE;
        }

        if (*cptr == '\0')
        {
            break;
        }
        if (*cptr++ != '&')
        {
            /* syntax error in product term */
            return FALSE;
        }
    }

    return TRUE;
} /* product_eval */
|
|
|
|
|
|
/*
 * Answer a composite question given as text.
 *
 * prod is a '|'-separated list of product terms.  The question holds when
 * at least one product evaluates to TRUE in product_eval.
 *
 * Returns FALSE when no product holds, and also when prod is NULL, does not
 * fit the LONGEST_STR working buffer, or contains more than MAX_PRODUCTS
 * products.
 */
static int ans_comp_question(LTS_FEATURE *feat, char *prod,
                             LTS_SAMPLE *sample)
{
    SPDBG_FUNC("ans_comp_question");

    int i, num_products;
    char *cptr, string[LONGEST_STR], *products[MAX_PRODUCTS];

    // The text is split in place, so work on a private copy; refuse input
    // that would overflow it.
    if (prod == NULL || strlen(prod) >= LONGEST_STR)
    {
        return FALSE;
    }
    strcpy(string, prod);

    for (cptr = string, num_products = 1; *cptr != '\0'; cptr++)
    {
        if (*cptr == '|')
        {
            num_products++;
        }
    }

    if (num_products > MAX_PRODUCTS)
    {
        /* MAX_PRODUCTS would have to be raised to num_products at least */
        return FALSE;
    }

    // Cut the copy at every '|'.  The scan starts at the first character of
    // each product, so an empty product ("a||b") yields an empty string
    // (which product_eval rejects) rather than a scan past the terminator.
    cptr = string;
    for (i = 0; i < num_products; i++)
    {
        products[i] = cptr;
        while (*cptr != '|' && *cptr != '\0')
        {
            cptr++;
        }
        if (*cptr == '|')
        {
            *cptr++ = '\0';
        }
    }

    for (i = 0; i < num_products; i++)
    {
        if (product_eval (feat, products[i], sample) == TRUE)
        {
            return TRUE;
        }
    }

    return FALSE;
} /* ans_comp_question */
|
|
|
|
|
|
/*
 * Walk a T_NODE tree down to the leaf selected by the sample.
 *
 * A node without a yes_child is treated as a leaf and returned.  Otherwise
 * the node's question text (root->prod) is answered with ans_comp_question
 * and the walk continues in yes_child or no_child accordingly.
 */
static T_NODE *find_leaf(LTS_FEATURE *feat, T_NODE *root, LTS_SAMPLE *sample)
{
    SPDBG_FUNC("find_leaf");

    if (!root->yes_child)
    {
        return root;
    }

    T_NODE *branch = ans_comp_question(feat, root->prod, sample)
                         ? root->yes_child
                         : root->no_child;

    return find_leaf(feat, branch, sample);
} /* find_leaf */
|
|
|
|
|
|
/*
 * Evaluate one product term stored as a sequence of LTS_PROD codes.
 *
 * Each code is a question code, with the PROD_NEG bit marking negation.
 * The product ends at PROD_TERM (another product follows) or QUES_TERM
 * (end of the whole question).
 *
 * Returns TRUE when every question in the product holds, FALSE at the first
 * one that fails.  In both cases *next is set to the code after the
 * product's PROD_TERM, or to NULL when the product ended at QUES_TERM.
 */
static int lts_product_eval (LTS_FEATURE *feat, LTS_PROD *term,
                             LTS_SAMPLE *sample, LTS_PROD **next)
{
    SPDBG_FUNC("lts_product_eval");

    SIMPLE_QUESTION ques;
    LTS_PROD *cur = term;
    int outcome = TRUE;

    for (;;)
    {
        int negate = FALSE;
        int code = (*cur);

        if ((*cur) & PROD_NEG)
        {
            negate = TRUE;
            code = (*cur) ^ PROD_NEG;
        }

        QUES_DECODE(code, ques.questype, ques.context, ques.offset,
                    ques.feature);

        if ((negate ^ ans_simp_question (feat, ques, sample)) == FALSE)
        {
            /* product failed: skip ahead to its terminator */
            outcome = FALSE;
            while (*cur != PROD_TERM && *cur != QUES_TERM)
            {
                cur++;
            }
            break;
        }

        cur++;
        if (*cur == QUES_TERM || *cur == PROD_TERM)
        {
            break;
        }
    }

    /* cur now rests on the terminator that closed this product */
    if (*cur == QUES_TERM)
    {
        *next = NULL;
    }
    else
    {
        *next = cur + 1;
    }

    return outcome;
} /* lts_product_eval */
|
|
|
|
|
|
/*
 * Answer the composite question stored idx bytes into tree->p_prod.
 *
 * Products are tried one after another with lts_product_eval.  Returns TRUE
 * at the first product that holds, FALSE once the last product has failed.
 */
static int lts_ans_comp_question(LTS_TREE UNALIGNED *tree, LTS_FEATURE *feat,
                                 int idx, LTS_SAMPLE *sample)
{
    SPDBG_FUNC("lts_ans_comp_question");

    LTS_PROD *following;
    LTS_PROD *product = (LTS_PROD *) ((char *) tree->p_prod + idx);

    do
    {
        if (lts_product_eval (feat, product, sample, &following) == TRUE)
        {
            return TRUE;
        }
        product = following;
    } while (product != NULL);

    return FALSE;
} /* lts_ans_comp_question */
|
|
|
|
|
|
/*
 * Walk the packed node array of an LTS tree down to a leaf.
 *
 * Returns root itself when IS_LEAF_NODE(root) holds.  Otherwise the node's
 * question (root->idx) is answered with lts_ans_comp_question.  The walk
 * then continues at root + yes when the answer is true, and at
 * root + yes + 1 when it is false.
 */
static LTS_NODE *lts_find_leaf(LTS_TREE UNALIGNED *tree, LTS_FEATURE *feat,
                               LTS_NODE *root, LTS_SAMPLE *sample)
{
    SPDBG_FUNC("lts_find_leaf");

    if (IS_LEAF_NODE(root))
    {
        return root;
    }

    /* node fields are read through an UNALIGNED view */
    LTS_NODE UNALIGNED *node = (LTS_NODE UNALIGNED *)root;

    if (lts_ans_comp_question(tree, feat, node->idx, sample))
    {
        return lts_find_leaf(tree, feat, root + node->yes, sample);
    }

    return lts_find_leaf(tree, feat, root + node->yes + 1, sample);
} /* lts_find_leaf */
|
|
|
|
|
|
/*
 * Find the distribution stored at the leaf reached for one input symbol.
 *
 * The tree is chosen by the input symbol (*pIn).  pIn and pOut are wrapped
 * in an LTS_SAMPLE so the question-answering code can be shared with
 * training.  *pOut is overwritten with NULL_SYMBOL_ID + 1 before the
 * lookup, because it cannot be NULL_SYMBOL_ID.
 *
 * Returns the LTS_DIST located leaf->idx bytes into tree->p_dist.
 */
static LTS_DIST *lts_find_leaf_count(LTS_FOREST *l_forest, SYMBOL *pIn,
                                     SYMBOL *pOut)
{
    SPDBG_FUNC("lts_find_leaf_count");

    LTS_TREE UNALIGNED *tree = l_forest->tree[*pIn];
    LTS_SAMPLE sample;
    LTS_NODE UNALIGNED *leaf;

    sample.pIn = pIn;
    sample.pOut = pOut;

    /* placeholder: *pOut cannot be NULL_SYMBOL_ID */
    *pOut = NULL_SYMBOL_ID + 1;

    leaf = lts_find_leaf(tree, l_forest->features, &(tree->nodes[0]), &sample);

    char *dist_base = (char *)tree->p_dist;
    return (LTS_DIST *) (dist_base + leaf->idx);
} /* lts_find_leaf_count */
|
|
|
|
/*
 * Create an empty result set.
 *
 * Allocates the LTS_OUT_RESULT and an array of MAX_ALT_STRINGS string
 * pointers with new.  Returns NULL when either allocation yields NULL.
 * l_forest is not used.
 */
static LTS_OUT_RESULT *allocate_out_result(LTS_FOREST *l_forest)
{
    SPDBG_FUNC("allocate_out_result");

    LTS_OUT_RESULT *res = new LTS_OUT_RESULT;
    if (!res)
    {
        return res;
    }

    res->out_strings = new LTS_OUT_STRING *[MAX_ALT_STRINGS];
    if (!res->out_strings)
    {
        delete res;
        return NULL;
    }

    res->num_allocated_strings = MAX_ALT_STRINGS;
    res->num_strings = 0;
    return res;
} /* allocate_out_result */
|
|
|
|
/*
 * Release a result set together with all the strings it holds.
 *
 * The pointer array has two possible origins: new[] in allocate_out_result
 * (capacity is still MAX_ALT_STRINGS) or calloc in reallocate_out_result
 * (capacity has grown).  The capacity is the only record of which one
 * applies, so it selects the matching release call.
 *
 * A NULL res is ignored.  l_forest is not used.
 */
static void free_out_result(LTS_FOREST *l_forest, LTS_OUT_RESULT *res)
{
    SPDBG_FUNC("free_out_result");

    if (!res)
    {
        return;
    }

    if (res->out_strings)
    {
        for (int i = 0; i < res->num_strings; i++)
        {
            delete res->out_strings[i];
        }

        if (res->num_allocated_strings == MAX_ALT_STRINGS)
        {
            // allocated with new[], so it must be released with delete[]
            delete [] res->out_strings;
        }
        else
        {
            free(res->out_strings); /* dirty */
        }
    }

    delete res;
} /* free_out_result */
|
|
|
|
|
|
/*
 * Enlarge the pointer array of a result set so it can hold at least min
 * entries.
 *
 * The capacity grows in steps of INC_ALT_STRINGS and always by at least one
 * step.  free_out_result tells a new[] array from a calloc'ed one by
 * checking whether the capacity is still MAX_ALT_STRINGS, so an array
 * replaced here must never keep that capacity.
 *
 * Returns true on success.  On allocation failure returns false and leaves
 * res exactly as it was, so the caller can still release it.
 * l_forest is not used.
 */
static bool reallocate_out_result(LTS_FOREST *l_forest, LTS_OUT_RESULT *res,
                                  int min)
{
    SPDBG_FUNC("reallocate_out_result");

    const int old_size = res->num_allocated_strings;
    int s = old_size;

    do
    {
        s += INC_ALT_STRINGS;
    } while (s < min);

    LTS_OUT_STRING **grown = (LTS_OUT_STRING **)
        calloc(s, sizeof(LTS_OUT_STRING *));
    if (!grown)
    {
        return false;
    }

    LTS_OUT_STRING **old = res->out_strings;
    memcpy(grown, old, old_size * sizeof(LTS_OUT_STRING *));

    if (old_size == MAX_ALT_STRINGS)
    {
        // still the original new[] array from allocate_out_result
        delete [] old;
    }
    else
    {
        free(old);
    }

    res->out_strings = grown;
    res->num_allocated_strings = s;
    ODS("increased out_strings to %d in order to meet %d\n", s, min);

    return true;
} /* reallocate_out_result */
|
|
|
|
|
|
/*
 * Append the strings of tmpRes to res, each prefixed with symbol i.
 *
 * Every string of tmpRes produces one new string in res: symbol i followed
 * by the source symbols up to NULL_SYMBOL_ID, with probability
 * count * inv_sum * (source probability).
 *
 * On success tmpRes is released and true is returned.  On failure false is
 * returned, res holds exactly the strings it held on entry, and tmpRes is
 * NOT released (it remains the caller's).
 */
static bool grow_out_result(LTS_FOREST *l_forest, LTS_OUT_RESULT *res,
                            SYMBOL i, int count, float inv_sum,
                            LTS_OUT_RESULT *tmpRes)
{
    SPDBG_FUNC("grow_out_result");

    const int base = res->num_strings;
    const int added = tmpRes->num_strings;
    int j;

    if (base + added >= res->num_allocated_strings)
    {
        if (!reallocate_out_result(l_forest, res, base + added))
        {
            return false;
        }
    }

    for (j = 0; j < added; j++)
    {
        LTS_OUT_STRING *dst = new LTS_OUT_STRING;
        if (!dst)
        {
            // Check the new element, not the array.  Drop what this call
            // added so far, so res stays consistent with num_strings.
            while (j-- > 0)
            {
                delete res->out_strings[base + j];
            }
            return false;
        }
        res->out_strings[base + j] = dst;

        SYMBOL *psrc = tmpRes->out_strings[j]->psym;
        SYMBOL *ptgt = dst->psym;

        *ptgt++ = i;
        while (*psrc != NULL_SYMBOL_ID)
        {
            *ptgt++ = *psrc++;
        }
        *ptgt = NULL_SYMBOL_ID;

        dst->prob = count * inv_sum * tmpRes->out_strings[j]->prob;
    }

    res->num_strings += added;
    free_out_result(l_forest, tmpRes);

    return true;
} /* grow_out_result */
|
|
|
|
|
|
static LTS_OUT_RESULT *gen_one_output(LTS_FOREST *l_forest, int len,
|
|
SYMBOL *input_id, int in_index,
|
|
SYMBOL *output_id, float cutoff)
|
|
{
|
|
SPDBG_FUNC("gen_one_output");
|
|
|
|
SYMBOL out[SP_MAX_WORD_LENGTH], *pOut;
|
|
LTS_OUT_RESULT *res = allocate_out_result(l_forest);
|
|
if (!res)
|
|
{
|
|
return NULL;
|
|
}
|
|
|
|
int sum, i, dim;
|
|
LTS_DIST UNALIGNED *pdf;
|
|
LTS_PAIR UNALIGNED *l_pair, *lp;
|
|
float cut, inv_sum;
|
|
|
|
/*
|
|
* copy output_id to local
|
|
*/
|
|
SYMBOL *psrc = output_id - 1, *ptgt = out;
|
|
while (*psrc != NULL_SYMBOL_ID) psrc--;
|
|
while (psrc != output_id)
|
|
*ptgt++ = *psrc++;
|
|
pOut = ptgt;
|
|
/* sanity check */
|
|
if (pOut - out != in_index + 1)
|
|
{
|
|
// !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
|
|
int *z=0;
|
|
z[0]=z[1];
|
|
}
|
|
|
|
if (in_index == len - 1)
|
|
{
|
|
pdf = lts_find_leaf_count(l_forest, input_id + in_index, pOut);
|
|
l_pair = &(pdf->p_pair);
|
|
dim = pdf->c_dists;
|
|
for (lp = l_pair, sum = 0, i = 0; i < dim; i++, lp++)
|
|
{
|
|
sum += lp->cnt;
|
|
}
|
|
SPDBG_ASSERT(sum > 0);
|
|
inv_sum = 1.0f / sum;
|
|
cut = cutoff * sum;
|
|
for (lp = l_pair, i = 0; i < dim; i++, lp++)
|
|
{
|
|
if ((float)(lp->cnt) > cut)
|
|
{
|
|
res->out_strings[res->num_strings] = new LTS_OUT_STRING;
|
|
if (NULL == res->out_strings[res->num_strings])
|
|
{
|
|
return NULL;
|
|
}
|
|
res->out_strings[res->num_strings]->psym[0] = (SYMBOL) lp->id;
|
|
res->out_strings[res->num_strings]->psym[1] = NULL_SYMBOL_ID;
|
|
res->out_strings[res->num_strings]->prob = lp->cnt * inv_sum;
|
|
res->num_strings++;
|
|
} /* cut */
|
|
}
|
|
}
|
|
else
|
|
{
|
|
LTS_OUT_RESULT *tmpRes;
|
|
|
|
pdf = lts_find_leaf_count(l_forest, input_id + in_index, pOut);
|
|
dim = pdf->c_dists;
|
|
l_pair = &(pdf->p_pair);
|
|
for (lp = l_pair, sum = 0, i = 0; i < dim; i++, lp++)
|
|
{
|
|
sum += lp->cnt;
|
|
}
|
|
SPDBG_ASSERT(sum > 0);
|
|
|
|
inv_sum = 1.0f / sum;
|
|
cut = cutoff * sum;
|
|
for (lp = l_pair, i = 0; i < dim; i++, lp++)
|
|
{
|
|
if ((float)(lp->cnt) > cut)
|
|
{
|
|
SYMBOL *pTmpOut = pOut + 1;
|
|
*pOut = (SYMBOL) lp->id;
|
|
tmpRes = gen_one_output(l_forest, len, input_id, in_index + 1, pTmpOut, cutoff);
|
|
if (!tmpRes)
|
|
{
|
|
return NULL;
|
|
}
|
|
|
|
if (!grow_out_result(l_forest, res, (SYMBOL)(lp->id), lp->cnt,
|
|
inv_sum, tmpRes))
|
|
{
|
|
return NULL;
|
|
}
|
|
}
|
|
} /* i */
|
|
} /* else */
|
|
|
|
return res;
|
|
} // static LTS_OUT_RESULT *gen_one_output(LTS_FOREST *l_forest, int len,
|
|
|
|
|
|
static int comp_out_result_prob(const void *vp1, const void *vp2)
|
|
{
|
|
SPDBG_FUNC("comp_out_result_prob");
|
|
|
|
LTS_OUT_STRING **p1 = (LTS_OUT_STRING **) vp1,
|
|
**p2 = (LTS_OUT_STRING **) vp2;
|
|
|
|
if ((*p1)->prob > (*p2)->prob)
|
|
{
|
|
return -1;
|
|
}
|
|
else if ((*p1)->prob < (*p2)->prob)
|
|
{
|
|
return 1;
|
|
}
|
|
else
|
|
{
|
|
return 0;
|
|
}
|
|
} // static int comp_out_result_prob(const void *vp1, const void *vp2)
|
|
|
|
|
|
/*
 * Convert a generated result set into the forest's output buffer.
 *
 * Steps:
 *   1. copy word into l_forest->out.word (empty string when word is NULL);
 *   2. normalize the probabilities and sort the strings by descending
 *      probability;
 *   3. keep at most the number of strings that still fit after the
 *      pronunciations already in the buffer, renormalizing the kept ones;
 *   4. for each kept string with prob >= MIN_OUT_PROB, build the phone
 *      text from the OUTPUT symbol table and append it to out.pron[].
 *      '#' symbols are dropped with the space after them and '_' becomes
 *      a space.  A pronunciation that ends up empty is not added.
 *
 * out is always released before returning; a NULL out is ignored.
 *
 * NOTE(review): the copy into out.word is unbounded; the size of that field
 * is not visible here -- confirm callers limit the word length.
 */
static void lts_fill_out_buffer(LTS_FOREST *l_forest, LTS_OUT_RESULT *out,
                                char *word)
{
    SPDBG_FUNC("lts_fill_out_buffer");

    int i, j, n;
    float inv_sum, sum = 0.0f;
    char phnstr[LONGEST_STR];
    char *tmp;
    LTS_SYMTAB *tab = l_forest->symbols;

    if (out == NULL)
    {
        return;
    }

    if (word)
    {
        strcpy(l_forest->out.word, word);
    }
    else
    {
        l_forest->out.word[0] = 0;
    }

    /* normalize probabilities */
    for (i = 0; i < out->num_strings; i++)
    {
        sum += out->out_strings[i]->prob;
    }
    inv_sum = 1.0f / sum;
    for (i = 0; i < out->num_strings; i++)
    {
        out->out_strings[i]->prob *= inv_sum;
    }

    /*
     * sort them according to the prob field
     */
    qsort(out->out_strings, out->num_strings, sizeof(LTS_OUT_STRING *),
          &comp_out_result_prob);

    if (out->num_strings > MAX_OUTPUT_STRINGS - l_forest->out.num_prons)
    {
        /* more candidates than free slots: keep the best, renormalize */
        n = MAX_OUTPUT_STRINGS - l_forest->out.num_prons;
        for (sum = 0.0f, i = 0; i < n; i++)
        {
            sum += out->out_strings[i]->prob;
        }
        inv_sum = 1.0f / sum;
        for (i = 0; i < n; i++)
        {
            out->out_strings[i]->prob *= inv_sum;
        }
    }
    else
    {
        n = out->num_strings;
    }

    for (j = l_forest->out.num_prons, i = 0; i < n; i++)
    {
        SYMBOL *p = out->out_strings[i]->psym;
        char *psrc, *ptgt, *pstr;
        size_t used = 0;

        if (out->out_strings[i]->prob < MIN_OUT_PROB)
        {
            continue;
        }

        phnstr[0] = 0;
        l_forest->out.pron[j].prob = out->out_strings[i]->prob;

        /* build "sym sym sym " in phnstr, never writing past its end */
        while (*p != NULL_SYMBOL_ID)
        {
            tmp = id_to_symbol(&(tab[OUTPUT]), *p++);
            SPDBG_ASSERT(tmp);
            if (tmp)
            {
                const size_t symlen = strlen(tmp);

                // room is needed for the symbol, one space and the NUL
                if (used + symlen + 1 >= sizeof(phnstr))
                {
                    break; /* keep only the whole phones that fit */
                }
                memcpy(phnstr + used, tmp, symlen);
                used += symlen;
                phnstr[used++] = ' ';
                phnstr[used] = 0;
            }
        }

        psrc = phnstr;
        pstr = l_forest->out.pron[j].pstr;
        ptgt = pstr;
        while (*psrc)
        {
            if (*psrc != '#' && *psrc != '_')
            {
                *ptgt++ = *psrc++;
            }
            else if (*psrc == '_')
            {
                *ptgt++ = ' ';
                psrc++;
            }
            else
            {
                /* skip '#' and the space after it, but never the NUL */
                psrc += psrc[1] ? 2 : 1;
            }

            /* extreme case, truncate it */
            if (ptgt - pstr >= SP_MAX_PRON_LENGTH)
            {
                /* never output partial phone: back up to the last
                   separator, but not before the start of the buffer */
                ptgt--;
                while (ptgt > pstr && !isspace((unsigned char)*ptgt))
                {
                    ptgt--;
                }
                if (isspace((unsigned char)*ptgt))
                {
                    ptgt++;
                }
                break;
            }
        }

        // output could contain only '# '
        if (ptgt > pstr && *(ptgt - 1) == ' ')
        {
            *(ptgt - 1) = 0; /* remove the last space */
        }
        else
        {
            *ptgt = 0; /* shouldn't happen unless ptgt didn't move */
        }

        if (ptgt > pstr)
        {
            j++;
        }
    } /* i */

    if (j <= MAX_OUTPUT_STRINGS)
    {
        l_forest->out.num_prons = j;
    }
    else
    {
        l_forest->out.num_prons = MAX_OUTPUT_STRINGS; // should never happen
    }

    free_out_result(l_forest, out);
} /* lts_fill_out_buffer */
|
|
|
|
|
|
/*
 * Fill out with a single, fixed pronunciation of probability 1.
 *
 * word is copied into out->word.  A pron shorter than SP_MAX_PRON_LENGTH is
 * copied whole.  A longer one is cut at the last whitespace inside the first
 * SP_MAX_PRON_LENGTH characters, so no partial phoneme is left behind.  If
 * that span holds no whitespace at all the pronunciation becomes empty.
 *
 * NOTE(review): the copy into out->word is unbounded; the size of that field
 * is not visible here -- confirm callers limit the word length.
 */
void assign_a_fixed_pron(LTS_OUTPUT *out, const char *pron, char *word)
{
    SPDBG_FUNC("assign_a_fixed_pron");

    out->num_prons = 1;
    strcpy(out->word, word);
    out->pron[0].prob = 1.0f;

    if (strlen(pron) < SP_MAX_PRON_LENGTH)
    {
        strcpy(out->pron[0].pstr, pron);
    }
    else
    {
        char *start = out->pron[0].pstr;
        char *p;

        strncpy(start, pron, SP_MAX_PRON_LENGTH);
        p = start + (SP_MAX_PRON_LENGTH - 1);

        /* truncate the last partial phoneme; stop at the buffer start so
           the scan cannot walk off the front when there is no whitespace */
        while (p > start && !isspace((unsigned char)*p))
        {
            p--;
        }
        *p = 0;
    }
} /* assign_a_fixed_pron */
|
|
|
|
/*
 * True when chMin <= ch <= chMax, done as one unsigned comparison: a ch
 * below chMin wraps around to a large unsigned offset and fails the test.
 */
inline BOOL IsCharInRangeA(int ch, int chMin, int chMax)
{
    const unsigned offset = (unsigned)(ch - chMin);
    const unsigned span = (unsigned)(chMax - chMin);

    return offset <= span;
}
|
|
|
|
/*
 * Fill out with a letter-by-letter ("spelled") pronunciation of word.
 *
 * single_letter_pron holds 52 entries: the names of 'a'..'z' followed by
 * their plural forms.  Letters are matched case-insensitively and every
 * other character is skipped.  A letter followed by 's or 'S (or a final s
 * followed by a closing apostrophe) takes the plural form, and the 's is
 * consumed with it so the s is not spelled again as a separate letter.
 *
 * Letter names are joined with single spaces.  Spelling stops at the first
 * letter whose name no longer fits in SP_MAX_PRON_LENGTH.  The result is one
 * pronunciation with probability 1.
 *
 * NOTE(review): the copy into out->word is unbounded; the size of that field
 * is not visible here -- confirm callers limit the word length.
 */
void assign_a_spelling_pron(LTS_OUTPUT *out, const char * single_letter_pron[52], char *word)
{
    SPDBG_FUNC("assign_a_spelling_pron");

    char *p;
    int cchPron = 0;

    strcpy(out->word, word);

    // <ctype.h> functions are only defined for unsigned char values
    p = ispunct((unsigned char)*word) ? word + 1 : word;

    out->num_prons = 1;
    out->pron[0].prob = 1.0f;
    out->pron[0].pstr[0] = 0;

    char * pchPron = out->pron[0].pstr;

    while (*p)
    {
        int cPOffset = 0; // 0 for single letter, 26 for plurals
        int c = *p++;

        // Lowercaseify, and skip over non-letters
        if (IsCharInRangeA(c, 'A', 'Z'))
        {
            c += 'a' - 'A';
        }
        else if (!IsCharInRangeA(c, 'a', 'z'))
        {
            continue;
        }

        // Check if the next two characters are 'S (apostrophe S). Include the following cases: words ending in 's 'S s' S'
        // If they are, use the plural pronunciation for the letter and skip over the 'S
        if ((p[0] == '\'') && ((0 == p[1] && 's' == c) || 's' == p[1] || 'S' == p[1]))
        {
            cPOffset = 26;
            // skip the apostrophe and, when present, the S after it
            p += p[1] ? 2 : 1;
        }

        // Make sure the string isn't too long accounting for the new phone and separator
        const char * const pchPronT = single_letter_pron[cPOffset + c - 'a'];
        const int cchPronT = (int)strlen(pchPronT);

        if ((cchPron + 1 + cchPronT) < (SP_MAX_PRON_LENGTH - 1)) // +1 for separating space, -1 for terminating NUL
        {
            strcpy(pchPron + cchPron, pchPronT);
            cchPron += cchPronT;
            pchPron[cchPron++] = ' ';
        }
        else
        {
            break;
        }
    }

    if (cchPron)
    {
        pchPron[cchPron - 1] = 0; // trim trailing space char
    }
} /* assign_a_spelling_pron */
|
|
|
|
|
|
/*
 * Produce pronunciations for one word.
 *
 * word is tokenized in place with strtok (only the first whitespace-
 * delimited token is used).  Leading punctuation is skipped, and the
 * remaining characters are lower-cased and mapped to INPUT symbol ids;
 * characters with no symbol are skipped.
 *
 *   - no symbols, or too many:        fallback ("bogus") pronunciation
 *   - exactly one symbol (a letter):  that letter's name
 *   - no vowel (a e i o u y):         spelled letter by letter
 *   - otherwise:                      generated from the trees; an
 *                                     all-capital word also gets its
 *                                     spelled form first
 *
 * On return *ppLtsOutput points at l_forest->out.
 * Returns S_OK normally, S_FALSE when the fallback pronunciation had to be
 * used, and E_OUTOFMEMORY when generation failed (in which case
 * *ppLtsOutput is not set).
 */
HRESULT LtscartGetPron(LTS_FOREST *l_forest, char *word, LTS_OUTPUT **ppLtsOutput)
{
    SPDBG_FUNC("LtscartGetPron");

    HRESULT hr = S_OK;
    LTS_OUT_RESULT *pres = NULL;
    char *p, *base;
    SYMBOL buffer[LONGEST_STR], *pbuf = buffer + 1;
    int len = 0, id, hasvowel = 0, allcapital = 1;
    bool toolong = false;

    l_forest->out.num_prons = 0;
    /* gen_one_output scans backwards for this marker */
    buffer[0] = NULL_SYMBOL_ID;

    if (word == NULL || (base = strtok(word, " \t\n")) == NULL)
    {
        char nul_word[] = "NUL"; // writable copy: the callee takes char *

        assign_a_fixed_pron(&(l_forest->out), l_forest->bogus_pron, nul_word);
        *ppLtsOutput = &(l_forest->out);
        return S_FALSE;
    }

    /* skip leading punctuation */
    for (p = base; *p && ispunct((unsigned char)*p); p++) {};

    char ach[2];
    ach[1] = 0;

    while (*p)
    {
        // <ctype.h> functions are only defined for unsigned char values
        const int d = (unsigned char)*p++;
        const int c = tolower(d);

        if (!hasvowel && (c == 'a' || c == 'e' || c == 'i' || c == 'o' || c == 'u' || c == 'y'))
        {
            hasvowel = 1;
        }
        if (allcapital && d == c)
        {
            allcapital = 0;
        }

        ach[0] = (char)c;

        if ((id = symbol_to_id (&(l_forest->symbols[INPUT]), ach)) == NO_SYMBOL || id == NULL_SYMBOL_ID)
        {
            ODS("cannot find the symbol %c, skip!\n", c);
            continue;
        }

        // pbuf is buffer + 1 and one more slot is needed for the closing
        // NULL_SYMBOL_ID, so at most LONGEST_STR - 2 symbols fit.
        if (len >= LONGEST_STR - 2)
        {
            toolong = true;
            break;
        }

        pbuf[len++] = (SYMBOL) id;
    }

    pbuf[len] = NULL_SYMBOL_ID;
    if (toolong || len >= SP_MAX_WORD_LENGTH || len <= 0)
    {
        // fill in bogus pron below
    }
    else if (len == 1)
    {
        LTS_SYMTAB *tab = l_forest->symbols;
        char *sym = id_to_symbol(&(tab[INPUT]), pbuf[0]);
        int c = sym ? tolower((unsigned char)sym[0]) : 0;

        if (c >= 'a' && c <= 'z')
        {
            assign_a_fixed_pron(&(l_forest->out), l_forest->single_letter_pron[c - 'a'], word);
        }
    }
    else if (!hasvowel)
    {
        assign_a_spelling_pron(&(l_forest->out), l_forest->single_letter_pron, word);
    }
    else
    {
        if (allcapital)
        {
            assign_a_spelling_pron(&(l_forest->out), l_forest->single_letter_pron, word);
        }

        pres = gen_one_output(l_forest, len, pbuf, 0, pbuf, DEFAULT_PRUNE);
        if (!pres)
        {
            return E_OUTOFMEMORY;
        }

        /* appends to whatever is already in the buffer and releases pres */
        lts_fill_out_buffer(l_forest, pres, word);
    }

    if (l_forest->out.num_prons == 0)
    {
        hr = S_FALSE;

        assign_a_fixed_pron(&(l_forest->out), l_forest->bogus_pron, word);
    }

    *ppLtsOutput = &(l_forest->out);

    SPDBG_RETURN(hr);
} /* LtscartGetPron */
|
|
|
|
|
|
LTS_FOREST *LtscartReadData (LCID lcid, PBYTE map_addr)
|
|
{
|
|
SPDBG_FUNC("LtscartReadData");
|
|
|
|
int i;
|
|
LTS_FOREST *l_forest;
|
|
LTS_SYMTAB *tab;
|
|
LTS_FEATURE *feat;
|
|
int output = 0;
|
|
|
|
l_forest = (LTS_FOREST *) calloc(1, sizeof(LTS_FOREST));
|
|
if (!l_forest)
|
|
{
|
|
return NULL;
|
|
}
|
|
|
|
if (lcid == 1033)
|
|
{
|
|
l_forest->bogus_pron = bogus_pron_1033;
|
|
l_forest->single_letter_pron = single_letter_pron_1033;
|
|
}
|
|
else if (lcid == 1041)
|
|
{
|
|
l_forest->bogus_pron = bogus_pron_1041;
|
|
l_forest->single_letter_pron = single_letter_pron_1041;
|
|
}
|
|
else
|
|
{
|
|
return NULL;
|
|
}
|
|
|
|
//read in the symbol table
|
|
l_forest->symbols = (LTS_SYMTAB *) calloc(2, sizeof(LTS_SYMTAB));
|
|
if (!l_forest->symbols)
|
|
{
|
|
return NULL;
|
|
}
|
|
|
|
tab = &(l_forest->symbols[INPUT]);
|
|
CopyMemory(&(tab->n_symbols), map_addr + output, sizeof(int));
|
|
output += sizeof(int);
|
|
|
|
tab->sym_idx = (int *)(map_addr + output);
|
|
output += tab->n_symbols * sizeof(int);
|
|
|
|
CopyMemory(&(tab->n_bytes), map_addr + output, sizeof(int));
|
|
output += sizeof(int);
|
|
|
|
tab->storage = (char*)(map_addr + output);
|
|
output += tab->n_bytes * sizeof(char);
|
|
|
|
tab = &(l_forest->symbols[OUTPUT]);
|
|
CopyMemory(&(tab->n_symbols), map_addr + output, sizeof(int));
|
|
output += sizeof(int);
|
|
|
|
tab->sym_idx = (int*)(map_addr + output);
|
|
output += tab->n_symbols * sizeof(int);
|
|
CopyMemory(&(tab->n_bytes), map_addr + output, sizeof(int));
|
|
output += sizeof(int);
|
|
|
|
tab->storage = (char*)(map_addr + output);
|
|
output += tab->n_bytes * sizeof(char);
|
|
|
|
// read in the feature vector
|
|
l_forest->features = (LTS_FEATURE *) calloc(2, sizeof(LTS_FEATURE));
|
|
if (!l_forest->features)
|
|
{
|
|
return NULL;
|
|
}
|
|
|
|
feat = &(l_forest->features[INPUT]);
|
|
|
|
CopyMemory(&(feat->n_feat), map_addr + output, sizeof(int));
|
|
output += sizeof(int);
|
|
|
|
CopyMemory(&(feat->dim), map_addr + output, sizeof(int));
|
|
output += sizeof(int);
|
|
|
|
feat->feature = (int **) calloc(feat->n_feat, sizeof(int *));
|
|
if (!feat->feature)
|
|
{
|
|
return NULL;
|
|
}
|
|
|
|
for (i = 0; i < feat->n_feat; i++)
|
|
{
|
|
feat->feature[i] = (int*)(map_addr + output);
|
|
output += feat->dim * sizeof(int);
|
|
}
|
|
|
|
feat = &(l_forest->features[OUTPUT]);
|
|
CopyMemory(&(feat->n_feat), map_addr + output, sizeof(int));
|
|
output += sizeof(int);
|
|
|
|
CopyMemory(&(feat->dim), map_addr + output, sizeof(int));
|
|
output += sizeof(int);
|
|
|
|
feat->feature = (int **) calloc(feat->n_feat, sizeof(int *));
|
|
if (!feat->feature)
|
|
{
|
|
return NULL;
|
|
}
|
|
|
|
for (i = 0; i < feat->n_feat; i++)
|
|
{
|
|
feat->feature[i] = (int*)(map_addr + output);
|
|
output += feat->dim * sizeof(int);
|
|
}
|
|
|
|
/*
|
|
* read in the tree
|
|
*/
|
|
l_forest->tree = (LTS_TREE **) calloc(l_forest->symbols[INPUT].n_symbols,
|
|
sizeof(LTS_TREE *));
|
|
if (!l_forest->tree)
|
|
{
|
|
return NULL;
|
|
}
|
|
|
|
for (i = 1; i < l_forest->symbols[INPUT].n_symbols; i++)
|
|
{
|
|
LTS_TREE *l_root;
|
|
l_forest->tree[i] = l_root = (LTS_TREE *) calloc(1, sizeof(LTS_TREE));
|
|
if (!l_root)
|
|
{
|
|
return NULL;
|
|
}
|
|
|
|
CopyMemory(&(l_root->n_nodes), map_addr + output, sizeof(int));
|
|
output += sizeof(int);
|
|
|
|
l_root->nodes = (LTS_NODE*)(map_addr + output);
|
|
output += l_root->n_nodes * sizeof(LTS_NODE);
|
|
|
|
CopyMemory(&(l_root->size_dist), map_addr + output, sizeof(int));
|
|
output += sizeof(int);
|
|
|
|
l_root->p_dist = (LTS_DIST*)(map_addr + output);
|
|
output += l_root->size_dist * sizeof(char);
|
|
|
|
CopyMemory(&(l_root->size_prod), map_addr + output, sizeof(int));
|
|
output += sizeof(int);
|
|
|
|
if (l_root->size_prod > 0)
|
|
{
|
|
l_root->p_prod = (LTS_PROD*)(map_addr + output);
|
|
output += l_root->size_prod * sizeof(char);
|
|
}
|
|
}
|
|
|
|
return l_forest;
|
|
} // LTS_FOREST *LtscartReadData(char *forest_image, HANDLE *hFile1,
|
|
|
|
|
|
/*
 * Release a forest created by LtscartReadData.
 *
 * Frees the per-symbol tree headers (index 1 and up), the tree table, both
 * feature row tables, the feature and symbol arrays, and the forest itself.
 * The data image the forest points into is not touched.
 *
 * A NULL forest is ignored, and members that were never allocated are
 * skipped, so a partially built forest can be released as well.
 */
void LtscartFreeData(LTS_FOREST *l_forest)
{
    SPDBG_FUNC("LtscartFreeData");

    if (!l_forest)
    {
        return;
    }

    // the tree count lives in the INPUT symbol table, so both are needed
    if (l_forest->tree && l_forest->symbols)
    {
        for (int i = 1; i < l_forest->symbols[INPUT].n_symbols; i++)
        {
            free(l_forest->tree[i]);
        }
    }
    free(l_forest->tree);

    if (l_forest->features)
    {
        free(l_forest->features[INPUT].feature);
        free(l_forest->features[OUTPUT].feature);
    }
    free(l_forest->features);

    free(l_forest->symbols);

    free(l_forest);
} /* LtscartFreeData */
|