1236 lines
31 KiB
C++
Raw Permalink Normal View History

2001-01-01 00:00:00 +01:00
/*******************************************************************************
* LtsCart.cpp *
*----------*
*
* ** WARNING **
* CART code for LTS. This code was created in MS Research and LiJ owns
* the algorithm. YunusM eliminated the private heap used by this code
* and used the new and delete operators instead.
*
* Created By: LIJ (MS Research) Date: 06/18/99
* Current Owner: Fil
*
* Copyright (C) 1999 Microsoft Corporation. All Rights Reserved
*******************************************************************************/
//--- Includes --------------------------------------------------------------
#include "StdAfx.h"
#include "LtsCart.h"
#pragma warning(disable : 4100)
/*
 * Fallback ("bogus") pronunciations, one per supported locale id.
 * LtscartReadData stores the one matching the requested LCID in
 * LTS_FOREST::bogus_pron, and LtscartGetPron assigns it when the input word
 * is NULL/empty or when no pronunciation could be produced.
 */
static const char *bogus_pron_1033 = "B OW G AH S P R AH N AH N S IY EY SH AH N";
static const char *bogus_pron_1041 = "N A N I"; // fallback used for LCID 1041
// Spelled-out letter pronunciations selected for LCID 1033.
// Entries 0..25 are indexed by (letter - 'a'); entries 26..51 are the
// plural forms, indexed by 26 + (letter - 'a') (see assign_a_spelling_pron).
static const char *single_letter_pron_1033[52] =
{
"EY", // a
"B IY", // b
"S IY", // c
"D IY", // d
"IY", // e
"EH F", // f
"JH IY", // g
"EY CH", // h
"AY", // i
"JH EY", // j
"K EY", // k
"EH L", // l
"EH M", // m
"EH N", // n
"OW", // o
"P IY", // p
"K Y UW", // q
"AA R", // r
"EH S", // s
"T IY", // t
"Y UW", // u
"V IY", // v
"D AH B AX L Y UW", // w
"EH K S", // x
"W AY", // y
"Z IY", // z
//
// PLURAL SPELLINGS (index 26 + letter offset)
//
"EY Z", // a's
"B IY Z", // b's
"S IY Z", // c's
"D IY Z", // d's
"IY Z", // e's
"EH F S", // f's
"JH IY Z", // g's
"EY CH AX Z", // h's
"AY Z", // i's
"JH EY Z", // j's
"K EY Z", // k's
"EH L Z", // l's
"EH M Z", // m's
"EH N Z", // n's
"OW Z", // o's
"P IY Z", // p's
"K Y UW Z", // q's
"AA R Z", // r's
"EH S AX Z", // s's
"T IY Z", // t's
"Y UW Z", // u's
"V IY Z", // v's
"D AH B AX L Y UW Z", // w's
"EH K S AX Z", // x's
"W AY Z", // y's
"Z IY Z" // z's
};
// Spelled-out letter pronunciations selected for LCID 1041.
// Same layout as the 1033 table: entries 0..25 are indexed by (letter - 'a'),
// entries 26..51 are the plural forms at 26 + (letter - 'a').
static const char *single_letter_pron_1041[52] =
{
"EE", // a
"B II", // b
"SH II", // c
"D II", // d
"II", // e
"E H U", // f
"J II", // g
"EE CH I", // h
"A I", // i
"J EE", // j
"K EE", // k
"E R U", // l
"E M U", // m
"E N U", // n
"OO", // o
"P II", // p
"K Y UU", // q
"AA R U", // r
"E S U", // s
"T II", // t
"Y UU", // u
"B U I", // v
"D A B U R Y UU", // w
"E STOP K U S U", // x
"W A I", // y
"Z E STOP T O", // z
//
// PLURAL SPELLINGS (index 26 + letter offset)
//
"EE Z U", // a's
"B II Z U", // b's
"SH II Z U", // c's
"D II Z U", // d's
"II Z U", // e's
"E H U Z U", // f's
"J II Z U", // g's
"EE CH I Z U", // h's
"A I Z U", // i's
"J EE Z U", // j's
"K EE Z U", // k's
"E R U Z U", // l's
"E M U Z U", // m's
"E N U Z U", // n's
"OO Z U", // o's
"P II Z U", // p's
"K Y UU Z U", // q's
"AA R U Z U", // r's
"E S U Z U", // s's
"T II Z U", // t's
"Y UU Z U", // u's
"B U I Z U", // v's
"D A B U R Y UU Z U", // w's
"E STOP K U S U Z U", // x's
"W A I Z U", // y's
"Z E STOP T O Z U" // z's
};
/*
* not worthwhile to use binary search with only about 30 entries
*/
/*
 * symbol_to_id
 *
 * Linear, case-insensitive (US English locale) lookup of `sym` in `tab`.
 *
 * Returns the index of the first matching symbol, or NO_SYMBOL when none of
 * the tab->n_symbols entries compares equal.
 */
static int symbol_to_id(LTS_SYMTAB *tab, char *sym)
{
    USES_CONVERSION;
    SPDBG_FUNC("symbol_to_id");

    const LCID lcid = MAKELCID(MAKELANGID(LANG_ENGLISH, SUBLANG_ENGLISH_US), SORT_DEFAULT);

    // A2T() may allocate its result on the stack each time it is evaluated;
    // the search key is loop-invariant, so convert it exactly once instead of
    // once per table entry.
    LPCTSTR key = A2T(sym);

    for (int i = 0; i < tab->n_symbols; i++)
    {
        if (CSTR_EQUAL == CompareString(lcid, NORM_IGNORECASE,
                                        A2T(tab->storage + tab->sym_idx[i]), -1,
                                        key, -1))
        {
            return i;
        }
    }
    return NO_SYMBOL;
} // static int symbol_to_id(LTS_SYMTAB *tab, char *sym)
/*
 * id_to_symbol
 *
 * Maps a symbol id back to its text inside tab->storage.
 *
 * Returns NULL when `id` is outside [0, tab->n_symbols). The previous check
 * used `id > n_symbols`, which let id == n_symbols through and read one entry
 * past the end of tab->sym_idx.
 */
static char *id_to_symbol(LTS_SYMTAB *tab, int id)
{
    SPDBG_FUNC("id_to_symbol");

    if (id < 0 || id >= tab->n_symbols)
    {
        return NULL;
    }
    return tab->storage + tab->sym_idx[id];
} // static char *id_to_symbol(LTS_SYMTAB *tab, int id)
/*
 * ODS
 *
 * printf-style wrapper around OutputDebugStringA. Compiles to an empty
 * function unless _DEBUG is defined. Messages longer than the local buffer
 * are truncated.
 */
__inline void ODS (const char *format, ...)
{
#ifdef _DEBUG
    SPDBG_FUNC("ODS");
    char buf[2048];
    va_list arglist;

    buf[0] = 0; // stays a valid (empty) string if formatting fails
    va_start (arglist, format);
    // _vsnprintf does not write a terminator when the output fills the
    // buffer, so reserve the last byte and terminate explicitly.
    _vsnprintf(buf, sizeof(buf) - 1, format, arglist);
    va_end (arglist);
    buf[sizeof(buf) - 1] = 0;

    OutputDebugStringA(buf);
#endif
}
/*
 * ans_simp_question
 *
 * Answers one simple question about `sample`: fetches the context symbol
 * selected by the question (questype/context/offset) and tests whether that
 * symbol's bit is set in the feature bit-vector the question refers to.
 *
 * Returns TRUE or FALSE.
 */
__inline int ans_simp_question (LTS_FEATURE *feat, SIMPLE_QUESTION question,
                                LTS_SAMPLE *sample)
{
    SPDBG_FUNC("ans_simp_question");
    SYMBOL ctx_symbol;
    int *member_bits = feat[question.questype].feature[question.feature];

    SAMPLE_GET_CONTEXT(sample, question.questype, question.context,
                       question.offset, ctx_symbol);

    if (TST_BIT(member_bits, ctx_symbol))
    {
        return TRUE;
    }
    return FALSE;
} // __inline int ans_simp_question (LTS_FEATURE *feat, SIMPLE_QUESTION question,
/*
 * product_eval
 *
 * Evaluates a textual product term: decimal question codes joined by '&',
 * each optionally prefixed with '~' for negation (e.g. "12&~7").
 *
 * Returns TRUE only when every factor holds for `sample`; returns FALSE as
 * soon as one factor fails or when the term is syntactically malformed.
 */
static int product_eval (LTS_FEATURE *feat, char *term, LTS_SAMPLE *sample)
{
    SPDBG_FUNC("product_eval");
    SIMPLE_QUESTION factor;
    char *cur = term;

    for (;;)
    {
        /* optional negation sign */
        int inverted = FALSE;
        if (*cur == '~')
        {
            inverted = TRUE;
            cur++;
        }

        /* a factor must start with a digit */
        if (!isdigit(*cur))
        {
            return FALSE;
        }

        /* accumulate the decimal question code */
        int code = 0;
        do
        {
            code = code * 10 + (*cur - '0');
            cur++;
        } while (isdigit (*cur));

        QUES_DECODE(code, factor.questype, factor.context, factor.offset,
                    factor.feature);

        if ((inverted ^ ans_simp_question (feat, factor, sample)) == FALSE)
        {
            return FALSE;
        }

        /* end of term: every factor was satisfied */
        if (*cur == '\0')
        {
            return TRUE;
        }

        /* anything other than '&' between factors is a syntax error */
        if (*cur++ != '&')
        {
            return FALSE;
        }
    }
} // static int product_eval (LTS_FEATURE *feat, char *term, LTS_SAMPLE *sample)
/*
 * ans_comp_question
 *
 * Evaluates a compound question given as text: product terms separated by
 * '|' (a sum of products). The question holds when at least one product
 * term evaluates to TRUE via product_eval.
 *
 * Returns FALSE when `prod` is NULL, does not fit the local working buffer,
 * contains more than MAX_PRODUCTS terms, or when no term is satisfied.
 */
static int ans_comp_question(LTS_FEATURE *feat, char *prod,
                             LTS_SAMPLE *sample)
{
    SPDBG_FUNC("ans_comp_question");
    char string[LONGEST_STR];
    char *products[MAX_PRODUCTS];
    int num_products = 0;

    // The input is split in place on a private copy; refuse anything that
    // would overflow that copy instead of writing past the buffer.
    if (prod == NULL || strlen(prod) >= sizeof(string))
    {
        return FALSE;
    }
    strcpy(string, prod);

    // Split on '|'. Every separator starts a new (possibly empty) term; an
    // empty term is simply rejected by product_eval. The scan is bounded by
    // the string terminator, so malformed input cannot run off the end.
    products[num_products++] = string;
    for (char *cptr = string; *cptr != '\0'; cptr++)
    {
        if (*cptr != '|')
        {
            continue;
        }
        if (num_products >= MAX_PRODUCTS)
        {
            // more terms than the table can hold
            return FALSE;
        }
        *cptr = '\0';
        products[num_products++] = cptr + 1;
    }

    for (int i = 0; i < num_products; i++)
    {
        if (product_eval (feat, products[i], sample) == TRUE)
        {
            return TRUE;
        }
    }
    return FALSE;
} // static int ans_comp_question(LTS_FEATURE *feat, char *prod,
/*
 * find_leaf
 *
 * Walks the tree from `root`, answering each node's compound question for
 * `sample` and following yes_child / no_child accordingly, until it reaches
 * a node without a yes_child. That node is returned.
 */
static T_NODE *find_leaf(LTS_FEATURE *feat, T_NODE *root, LTS_SAMPLE *sample)
{
    SPDBG_FUNC("find_leaf");
    T_NODE *node = root;

    while (node->yes_child)
    {
        if (ans_comp_question(feat, node->prod, sample))
        {
            node = node->yes_child;
        }
        else
        {
            node = node->no_child;
        }
    }
    return node;
} // static T_NODE *find_leaf(LTS_FEATURE *feat, T_NODE *root, LTS_SAMPLE *sample)
/*
 * lts_product_eval
 *
 * Evaluates one product term in the packed LTS_PROD encoding. Each element
 * is a question code, optionally flagged with PROD_NEG for negation; the
 * term ends at PROD_TERM (another term follows) or QUES_TERM (last term).
 *
 * Returns TRUE when every factor holds, FALSE at the first factor that
 * fails. In both cases *next is set to the first element of the following
 * term, or to NULL when this was the last term of the question.
 */
static int lts_product_eval (LTS_FEATURE *feat, LTS_PROD *term,
                             LTS_SAMPLE *sample, LTS_PROD **next)
{
    SPDBG_FUNC("lts_product_eval");
    SIMPLE_QUESTION factor;
    LTS_PROD *cur = term;

    for (;;)
    {
        int inverted = FALSE;
        int code = (*cur);
        if ((*cur) & PROD_NEG)
        {
            inverted = TRUE;
            code = (*cur) ^ PROD_NEG; /* strip the negation flag */
        }

        QUES_DECODE(code, factor.questype, factor.context, factor.offset,
                    factor.feature);

        if ((inverted ^ ans_simp_question (feat, factor, sample)) == FALSE)
        {
            /* factor failed: skip the remainder of this term */
            while (*cur != PROD_TERM && *cur != QUES_TERM)
            {
                cur++;
            }
            if (*cur == QUES_TERM)
            {
                *next = NULL;
            }
            else
            {
                *next = cur + 1;
            }
            return FALSE;
        }

        cur++;
        if (*cur == QUES_TERM)
        {
            *next = NULL;
            return TRUE;
        }
        if (*cur == PROD_TERM)
        {
            *next = cur + 1;
            return TRUE;
        }
    }
} // static int lts_product_eval (LTS_FEATURE *feat, LTS_PROD *term,
/*
 * lts_ans_comp_question
 *
 * Evaluates the packed compound question located `idx` bytes into
 * tree->p_prod. Product terms are tried one after another; the question
 * holds as soon as one term evaluates to TRUE.
 *
 * Returns TRUE or FALSE.
 */
static int lts_ans_comp_question(LTS_TREE UNALIGNED *tree, LTS_FEATURE *feat,
                                 int idx, LTS_SAMPLE *sample)
{
    SPDBG_FUNC("lts_ans_comp_question");
    LTS_PROD *term = (LTS_PROD *) ((char *) tree->p_prod + idx);
    LTS_PROD *following;

    do
    {
        if (lts_product_eval (feat, term, sample, &following) == TRUE)
        {
            return TRUE;
        }
        term = following;
    } while (term != NULL);

    return FALSE;
} // static int lts_ans_comp_question(LTS_TREE *tree, LTS_FEATURE *feat,
/*
 * lts_find_leaf
 *
 * Descends the packed tree from `root` until IS_LEAF_NODE reports a leaf.
 * At each internal node the compound question at node->idx is answered for
 * `sample`; the "yes" child lives `yes` nodes after the current node and the
 * "no" child immediately after the "yes" child.
 *
 * Returns the leaf that was reached.
 */
static LTS_NODE *lts_find_leaf(LTS_TREE UNALIGNED *tree, LTS_FEATURE *feat,
                               LTS_NODE *root, LTS_SAMPLE *sample)
{
    SPDBG_FUNC("lts_find_leaf");
    LTS_NODE *node = root;

    while (!IS_LEAF_NODE(node))
    {
        if (lts_ans_comp_question(tree, feat, ((LTS_NODE UNALIGNED *)node)->idx, sample))
        {
            node = node + ((LTS_NODE UNALIGNED *)node)->yes;
        }
        else
        {
            node = node + ((LTS_NODE UNALIGNED *)node)->yes + 1;
        }
    }
    return node;
} // static LTS_NODE *lts_find_leaf(LTS_TREE *tree, LTS_FEATURE *feat,
/*
 * lts_find_leaf_count
 *
 * Selects the tree for the input symbol *pIn, walks it for the context
 * described by pIn/pOut and returns the distribution stored at the leaf
 * (located leaf->idx bytes into tree->p_dist).
 *
 * Side effect: *pOut is overwritten with NULL_SYMBOL_ID + 1 before the walk.
 */
static LTS_DIST *lts_find_leaf_count(LTS_FOREST *l_forest, SYMBOL *pIn,
                                     SYMBOL *pOut)
{
    SPDBG_FUNC("lts_find_leaf_count");
    LTS_TREE UNALIGNED *tree = l_forest->tree[*pIn];
    LTS_SAMPLE probe;

    /* build a sample so the tree-walking code can be shared with training */
    probe.pIn = pIn;
    probe.pOut = pOut;

    /* the current output slot must not hold NULL_SYMBOL_ID */
    *pOut = NULL_SYMBOL_ID + 1;

    LTS_NODE UNALIGNED *leaf = lts_find_leaf(tree, l_forest->features,
                                             &(tree->nodes[0]), &probe);

    return (LTS_DIST *) ((char *)tree->p_dist + leaf->idx);
} // static LTS_DIST *lts_find_leaf_count(LTS_FOREST *l_forest, SYMBOL *pIn,
/*
 * allocate_out_result
 *
 * Creates an empty result set with room for MAX_ALT_STRINGS string pointers.
 *
 * Returns the new result, or NULL when either allocation fails (nothing is
 * left allocated in that case).
 */
static LTS_OUT_RESULT *allocate_out_result(LTS_FOREST *l_forest)
{
    SPDBG_FUNC("allocate_out_result");

    LTS_OUT_RESULT *fresh = new LTS_OUT_RESULT;
    if (!fresh)
    {
        return NULL;
    }

    fresh->out_strings = new LTS_OUT_STRING *[MAX_ALT_STRINGS];
    if (!fresh->out_strings)
    {
        delete fresh;
        return NULL;
    }

    fresh->num_allocated_strings = MAX_ALT_STRINGS;
    fresh->num_strings = 0;
    return fresh;
} // static LTS_OUT_RESULT *allocate_out_result(LTS_FOREST *l_forest)
/*
 * free_out_result
 *
 * Releases a result set: every stored LTS_OUT_STRING, the pointer array and
 * the LTS_OUT_RESULT itself. Passing NULL is a no-op.
 *
 * The pointer array has two possible origins, told apart by its size: an
 * array of exactly MAX_ALT_STRINGS entries was created with new[] (see
 * allocate_out_result); any other size was obtained from calloc when the
 * array was grown.
 */
static void free_out_result(LTS_FOREST *l_forest, LTS_OUT_RESULT *res)
{
    SPDBG_FUNC("free_out_result");

    if (!res)
    {
        return;
    }

    if (res->out_strings)
    {
        for (int i = 0; i < res->num_strings; i++)
        {
            delete res->out_strings[i];
        }
    }

    if (res->num_allocated_strings == MAX_ALT_STRINGS)
    {
        // allocated with new[] - must be released with delete[], not delete
        delete [] res->out_strings;
    }
    else
    {
        free(res->out_strings); /* dirty: grown arrays come from calloc */
    }
    delete res;
} // static void free_out_result(LTS_FOREST *l_forest, LTS_OUT_RESULT *res)
/*
 * reallocate_out_result
 *
 * Ensures res->out_strings can hold at least `min` pointers, growing the
 * capacity in steps of INC_ALT_STRINGS and preserving existing entries.
 *
 * Returns true on success (including when the array is already large
 * enough). Returns false when memory cannot be obtained; in that case `res`
 * is left completely unchanged and remains valid.
 */
static bool reallocate_out_result(LTS_FOREST *l_forest, LTS_OUT_RESULT *res,
                                  int min)
{
    SPDBG_FUNC("reallocate_out_result");
    const int old_size = res->num_allocated_strings;
    int s = old_size;

    while (s < min)
    {
        s += INC_ALT_STRINGS;
    }

    // Nothing to grow. Returning here also keeps the "size ==
    // MAX_ALT_STRINGS means new[]" convention intact: swapping in a calloc'ed
    // array of the very same size would later be released with the wrong
    // deallocator.
    if (s == old_size)
    {
        return true;
    }

    LTS_OUT_STRING **grown = (LTS_OUT_STRING **)
        calloc(s, sizeof(LTS_OUT_STRING *));
    if (!grown)
    {
        // Keep the old array attached so the caller can still free `res`.
        return false;
    }

    LTS_OUT_STRING **old = res->out_strings;
    memcpy(grown, old, old_size * sizeof(LTS_OUT_STRING *));

    if (old_size == MAX_ALT_STRINGS)
    {
        delete [] old;  // the initial array comes from new[]
    }
    else
    {
        free(old);      // previously grown arrays come from calloc
    }

    res->out_strings = grown;
    res->num_allocated_strings = s;
    ODS("increased out_strings to %d in order to meet %d\n", s, min);
    return true;
} // static bool reallocate_out_result(LTS_FOREST *l_forest, LTS_OUT_RESULT *res,
/*
 * grow_out_result
 *
 * Appends to `res` one string per entry of `tmpRes`: symbol `i` followed by
 * the entry's symbols, with probability count * inv_sum * entry probability.
 *
 * On success returns true and frees `tmpRes`.
 * On failure returns false, leaves res->num_strings unchanged and does NOT
 * free `tmpRes` - the caller still owns it.
 */
static bool grow_out_result(LTS_FOREST *l_forest, LTS_OUT_RESULT *res,
                            SYMBOL i, int count, float inv_sum,
                            LTS_OUT_RESULT *tmpRes)
{
    SPDBG_FUNC("grow_out_result");
    const int first = res->num_strings;

    if (first + tmpRes->num_strings >= res->num_allocated_strings)
    {
        if (!reallocate_out_result(l_forest, res, first + tmpRes->num_strings))
        {
            return false;
        }
    }

    for (int j = 0; j < tmpRes->num_strings; j++)
    {
        // Check the element that was just allocated (the old code tested the
        // array pointer, which can never be NULL here).
        LTS_OUT_STRING *entry = new LTS_OUT_STRING;
        if (!entry)
        {
            // Undo the strings created by this call so nothing leaks and
            // `res` looks exactly as it did on entry.
            while (j-- > 0)
            {
                delete res->out_strings[first + j];
                res->out_strings[first + j] = NULL;
            }
            return false;
        }
        res->out_strings[first + j] = entry;

        SYMBOL *psrc = tmpRes->out_strings[j]->psym;
        SYMBOL *ptgt = entry->psym;

        *ptgt++ = i;
        while (*psrc != NULL_SYMBOL_ID)
        {
            *ptgt++ = *psrc++;
        }
        *ptgt++ = NULL_SYMBOL_ID;

        entry->prob = count * inv_sum * tmpRes->out_strings[j]->prob;
    }

    res->num_strings += tmpRes->num_strings;
    free_out_result(l_forest, tmpRes);
    return true;
} // static bool grow_out_result(LTS_FOREST *l_forest, LTS_OUT_RESULT *res,
static LTS_OUT_RESULT *gen_one_output(LTS_FOREST *l_forest, int len,
SYMBOL *input_id, int in_index,
SYMBOL *output_id, float cutoff)
{
SPDBG_FUNC("gen_one_output");
SYMBOL out[SP_MAX_WORD_LENGTH], *pOut;
LTS_OUT_RESULT *res = allocate_out_result(l_forest);
if (!res)
{
return NULL;
}
int sum, i, dim;
LTS_DIST UNALIGNED *pdf;
LTS_PAIR UNALIGNED *l_pair, *lp;
float cut, inv_sum;
/*
* copy output_id to local
*/
SYMBOL *psrc = output_id - 1, *ptgt = out;
while (*psrc != NULL_SYMBOL_ID) psrc--;
while (psrc != output_id)
*ptgt++ = *psrc++;
pOut = ptgt;
/* sanity check */
if (pOut - out != in_index + 1)
{
// !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
int *z=0;
z[0]=z[1];
}
if (in_index == len - 1)
{
pdf = lts_find_leaf_count(l_forest, input_id + in_index, pOut);
l_pair = &(pdf->p_pair);
dim = pdf->c_dists;
for (lp = l_pair, sum = 0, i = 0; i < dim; i++, lp++)
{
sum += lp->cnt;
}
SPDBG_ASSERT(sum > 0);
inv_sum = 1.0f / sum;
cut = cutoff * sum;
for (lp = l_pair, i = 0; i < dim; i++, lp++)
{
if ((float)(lp->cnt) > cut)
{
res->out_strings[res->num_strings] = new LTS_OUT_STRING;
if (NULL == res->out_strings[res->num_strings])
{
return NULL;
}
res->out_strings[res->num_strings]->psym[0] = (SYMBOL) lp->id;
res->out_strings[res->num_strings]->psym[1] = NULL_SYMBOL_ID;
res->out_strings[res->num_strings]->prob = lp->cnt * inv_sum;
res->num_strings++;
} /* cut */
}
}
else
{
LTS_OUT_RESULT *tmpRes;
pdf = lts_find_leaf_count(l_forest, input_id + in_index, pOut);
dim = pdf->c_dists;
l_pair = &(pdf->p_pair);
for (lp = l_pair, sum = 0, i = 0; i < dim; i++, lp++)
{
sum += lp->cnt;
}
SPDBG_ASSERT(sum > 0);
inv_sum = 1.0f / sum;
cut = cutoff * sum;
for (lp = l_pair, i = 0; i < dim; i++, lp++)
{
if ((float)(lp->cnt) > cut)
{
SYMBOL *pTmpOut = pOut + 1;
*pOut = (SYMBOL) lp->id;
tmpRes = gen_one_output(l_forest, len, input_id, in_index + 1, pTmpOut, cutoff);
if (!tmpRes)
{
return NULL;
}
if (!grow_out_result(l_forest, res, (SYMBOL)(lp->id), lp->cnt,
inv_sum, tmpRes))
{
return NULL;
}
}
} /* i */
} /* else */
return res;
} // static LTS_OUT_RESULT *gen_one_output(LTS_FOREST *l_forest, int len,
static int comp_out_result_prob(const void *vp1, const void *vp2)
{
SPDBG_FUNC("comp_out_result_prob");
LTS_OUT_STRING **p1 = (LTS_OUT_STRING **) vp1,
**p2 = (LTS_OUT_STRING **) vp2;
if ((*p1)->prob > (*p2)->prob)
{
return -1;
}
else if ((*p1)->prob < (*p2)->prob)
{
return 1;
}
else
{
return 0;
}
} // static int comp_out_result_prob(const void *vp1, const void *vp2)
/*
 * lts_fill_out_buffer
 *
 * Converts the candidate symbol strings in `out` into phone strings and
 * appends them to l_forest->out.pron, starting at the current
 * l_forest->out.num_prons. Steps: copy `word` into l_forest->out.word,
 * normalize the probabilities, sort by descending probability, keep at most
 * (MAX_OUTPUT_STRINGS - num_prons) candidates (renormalizing the kept ones),
 * skip candidates below MIN_OUT_PROB, translate each symbol id through the
 * OUTPUT symbol table, drop '#' markers and turn '_' into a space.
 *
 * `out` is freed before returning (nothing happens when it is NULL).
 */
static void lts_fill_out_buffer(LTS_FOREST *l_forest, LTS_OUT_RESULT *out,
char *word)
{
SPDBG_FUNC("lts_fill_out_buffer");
int i, j, n;
float inv_sum, sum = 0.0f;
char phnstr[LONGEST_STR];
char *tmp;
LTS_SYMTAB *tab = l_forest->symbols;
if (out == NULL)
{
return;
}
// NOTE(review): unbounded copy - assumes out.word is large enough for `word`; confirm.
if (word)
{
strcpy(l_forest->out.word, word);
}
else
{
l_forest->out.word[0] = 0;
}
/* normalize probabilities */
for (i = 0; i < out->num_strings; i++)
{
sum += out->out_strings[i]->prob;
}
// NOTE(review): sum is 0 when there are no strings (or all probs are 0); inv_sum is then not finite.
inv_sum = 1.0f / sum;
for (i = 0; i < out->num_strings; i++)
{
out->out_strings[i]->prob *= inv_sum;
}
/*
* sort them according to the prob field
*/
qsort(out->out_strings, out->num_strings, sizeof(LTS_OUT_STRING *),
&comp_out_result_prob);
// Keep only as many candidates as there are free pron slots, and
// renormalize the kept ones so their probabilities sum to 1 again.
if (out->num_strings > MAX_OUTPUT_STRINGS - l_forest->out.num_prons)
{
n = MAX_OUTPUT_STRINGS - l_forest->out.num_prons;
for (sum = 0.0f, i = 0; i < n; i++)
{
sum += out->out_strings[i]->prob;
}
inv_sum = 1.0f / sum;
for (i = 0; i < n; i++)
{
out->out_strings[i]->prob *= inv_sum;
}
}
else
{
n = out->num_strings;
}
// j is the next pron slot to fill; it only advances when a non-empty
// pronunciation was written.
for (j = l_forest->out.num_prons, i = 0; i < n; i++)
{
SYMBOL *p = out->out_strings[i]->psym;
char *psrc, *ptgt;
if (out->out_strings[i]->prob < MIN_OUT_PROB)
{
continue;
}
phnstr[0] = 0;
l_forest->out.pron[j].prob = out->out_strings[i]->prob;
// Build "sym sym sym " in phnstr: every symbol is followed by one space.
// NOTE(review): strcat is unbounded - assumes the result fits in LONGEST_STR; confirm.
while (*p != NULL_SYMBOL_ID)
{
tmp = id_to_symbol(&(tab[OUTPUT]), *p++);
SPDBG_ASSERT(tmp);
if (tmp)
{
strcat(phnstr, tmp);
strcat(phnstr, " ");
}
}
psrc = phnstr;
ptgt = l_forest->out.pron[j].pstr;
// Copy phnstr to the pron slot: '_' becomes a space, '#' is dropped
// together with the character that follows it.
while (*psrc)
{
if (*psrc != '#' && *psrc != '_')
{
*ptgt++ = *psrc++;
}
else if (*psrc == '_')
{
*ptgt++ = ' ';
psrc++;
}
else
{
psrc += 2; /* skip an extra space */
}
/* extreme case, truncate it */
if (ptgt - l_forest->out.pron[j].pstr >= SP_MAX_PRON_LENGTH)
{
// NOTE(review): backs up to the previous whitespace; assumes one exists in the slot.
for (ptgt--; !isspace(*ptgt); ptgt--) {}; /* never output partial phone */
ptgt++;
break;
}
}
// output could contain only '# '
if (ptgt > l_forest->out.pron[j].pstr && *(ptgt - 1) == ' ')
{
*(ptgt - 1) = 0; /* remove the last space */
}
else
{
*ptgt = 0; /* shouldn't happen unless ptgt didn't move */
}
if (ptgt > l_forest->out.pron[j].pstr)
{
j++;
}
} /* i */
if (j <= MAX_OUTPUT_STRINGS)
{
l_forest->out.num_prons = j;
}
else
{
l_forest->out.num_prons = MAX_OUTPUT_STRINGS; // should never happen
}
free_out_result(l_forest, out);
} // static void lts_fill_out_buffer(LTS_FOREST *l_forest, LTS_OUT_RESULT *out,
/*
 * assign_a_fixed_pron
 *
 * Makes `pron` the single pronunciation (probability 1.0) of `word` in
 * `out`. A pronunciation of SP_MAX_PRON_LENGTH characters or more is
 * truncated at the last whitespace inside the limit, so that no partial
 * phoneme is emitted; if there is no whitespace at all the pronunciation
 * becomes empty.
 *
 * NOTE(review): `word` is copied with strcpy; this assumes out->word is
 * large enough for it - confirm against the LTS_OUTPUT declaration.
 */
void assign_a_fixed_pron(LTS_OUTPUT *out, const char *pron, char *word)
{
    SPDBG_FUNC("assign_a_fixed_pron");

    out->num_prons = 1;
    strcpy(out->word, word);
    out->pron[0].prob = 1.0f;

    if (strlen(pron) < SP_MAX_PRON_LENGTH)
    {
        strcpy(out->pron[0].pstr, pron);
    }
    else
    {
        char * const first = out->pron[0].pstr;
        char *p;

        strncpy(first, pron, SP_MAX_PRON_LENGTH);
        p = first + (SP_MAX_PRON_LENGTH - 1);

        // Walk back to the last separator, but never before the start of
        // the buffer. isspace() needs a non-negative argument.
        while (p > first && !isspace((unsigned char)*p))
        {
            p--; /* truncate the last partial phoneme */
        }
        *p = 0;
    }
} // void assign_a_fixed_pron(LTS_OUTPUT *out, const char *pron, char *word)
// Returns non-zero when chMin <= ch <= chMax. Both bounds are checked with a
// single unsigned comparison: a value below chMin wraps to a large unsigned
// number and therefore fails the test.
inline BOOL IsCharInRangeA(int ch, int chMin, int chMax)
{
    const unsigned distance = (unsigned)(ch - chMin);
    const unsigned span = (unsigned)(chMax - chMin);
    return distance <= span;
}
/*
 * assign_a_spelling_pron
 *
 * Produces a "spelled out" pronunciation of `word` as its single
 * pronunciation (probability 1.0): each ASCII letter contributes its entry
 * from `single_letter_pron` (0..25 = letters, 26..51 = plural forms);
 * non-letters are skipped. A letter followed by 's / 'S, or a final s
 * followed by a trailing apostrophe, uses the plural entry and the suffix is
 * consumed. Spelling stops silently once the next letter would no longer fit
 * within SP_MAX_PRON_LENGTH.
 *
 * NOTE(review): `word` is copied with strcpy; this assumes out->word is
 * large enough for it - confirm against the LTS_OUTPUT declaration.
 */
void assign_a_spelling_pron(LTS_OUTPUT *out, const char * single_letter_pron[52], char *word)
{
    SPDBG_FUNC("assign_a_spelling_pron");
    char *p;
    int cchPron = 0;

    strcpy(out->word, word);

    // skip one leading punctuation character (ispunct needs a non-negative value)
    if (ispunct((unsigned char)*word))
    {
        p = word + 1;
    }
    else
    {
        p = word;
    }

    out->num_prons = 1;
    out->pron[0].prob = 1.0f;
    out->pron[0].pstr[0] = 0;
    char * pchPron = out->pron[0].pstr;

    while (*p)
    {
        int cPOffset = 0; // 0 for single letter, 26 for plurals
        int c = *p++;

        // Lowercase the letter, and skip over non-letters
        if (IsCharInRangeA(c, 'A', 'Z'))
        {
            c += 'a' - 'A';
        }
        else if (!IsCharInRangeA(c, 'a', 'z'))
        {
            continue;
        }

        // Check whether the letter is followed by 'S (apostrophe S). Covers
        // words ending in 's 'S s' S'. If so, use the plural pronunciation
        // of the letter and skip over the whole suffix. The suffix is
        // apostrophe + S (two characters), or just the apostrophe when it is
        // the last character; skipping only the apostrophe would spell the
        // S a second time.
        if ((p[0] == '\'') && ((0 == p[1] && 's' == c) || 's' == p[1] || 'S' == p[1]))
        {
            cPOffset = 26;
            p += p[1] ? 2 : 1; // skip 'S
        }

        // Make sure the string isn't too long, accounting for the new phone and separator
        const char * const pchPronT = single_letter_pron[cPOffset + c - 'a'];
        const int cchPronT = (int)strlen(pchPronT);
        if ((cchPron + 1 + cchPronT) < (SP_MAX_PRON_LENGTH - 1)) // +1 for separating space, -1 for terminating NUL
        {
            strcpy(pchPron + cchPron, pchPronT);
            cchPron += cchPronT;
            pchPron[cchPron++] = ' ';
        }
        else
        {
            break;
        }
    }

    if (cchPron)
    {
        pchPron[cchPron - 1] = 0; // trim trailing space char
    }
}
/*
 * LtscartGetPron
 *
 * Generates pronunciations for the first whitespace-delimited token of
 * `word` and returns them through *ppLtsOutput (which points at
 * l_forest->out, owned by the forest).
 *
 * Note: `word` is tokenized in place with strtok, so it is modified.
 *
 * Strategy, based on the letters that map to input symbols:
 *   - no usable letters, or too many   -> bogus pronunciation
 *   - exactly one letter               -> fixed single-letter pronunciation
 *   - no vowel                         -> spelled out letter by letter
 *   - otherwise                        -> CART-generated pronunciations,
 *                                         preceded by the spelled-out form
 *                                         when the word is all capitals
 *
 * Returns S_OK on success, S_FALSE when the bogus pronunciation had to be
 * used, E_OUTOFMEMORY when generation failed (in that case *ppLtsOutput is
 * not written).
 */
HRESULT LtscartGetPron(LTS_FOREST *l_forest, char *word, LTS_OUTPUT **ppLtsOutput)
{
    SPDBG_FUNC("LtscartGetPron");
    HRESULT hr = S_OK;
    LTS_OUT_RESULT *pres = NULL;
    char *p, *base;
    SYMBOL buffer[LONGEST_STR], *pbuf = buffer + 1;
    int len, id, hasvowel = 0, allcapital = 1;
    bool toolong = false;
    // buffer[0] is the leading sentinel and one slot is needed for the
    // trailing terminator, so at most this many symbols fit.
    const int maxsyms = (int)(sizeof(buffer) / sizeof(buffer[0])) - 2;

    l_forest->out.num_prons = 0;
    buffer[0] = NULL_SYMBOL_ID;
    len = 0;

    if (word == NULL || (base = strtok(word, " \t\n")) == NULL)
    {
        assign_a_fixed_pron(&(l_forest->out), l_forest->bogus_pron, "NUL");
        *ppLtsOutput = &(l_forest->out);
        return S_FALSE;
    }

    // skip leading punctuation of the token
    for (p = base; *p && ispunct((unsigned char)*p); p++) {};

    char ach[2];
    ach[1] = 0;
    while (*p)
    {
        // <ctype.h> functions require a non-negative character value
        const int d = (unsigned char)*p++;
        const int c = tolower(d);

        if (!hasvowel && (c == 'a' || c == 'e' || c == 'i' || c == 'o' || c == 'u' || c == 'y'))
        {
            hasvowel = 1;
        }
        if (allcapital && d == c)
        {
            allcapital = 0;
        }

        ach[0] = (char)c;
        if ((id = symbol_to_id (&(l_forest->symbols[INPUT]), ach)) == NO_SYMBOL || id == NULL_SYMBOL_ID)
        {
            ODS("cannot find the symbol %c, skip!\n", c);
            continue;
        }

        // Never write past the symbol buffer: a word this long gets the
        // bogus pronunciation anyway, so stop collecting symbols.
        if (len >= maxsyms || len >= SP_MAX_WORD_LENGTH)
        {
            toolong = true;
            break;
        }
        pbuf[len++] = (SYMBOL) id;
    }
    pbuf[len] = NULL_SYMBOL_ID;

    if (toolong || len >= SP_MAX_WORD_LENGTH || len <= 0)
    {
        // fill in bogus pron below
    }
    else if (len == 1)
    {
        LTS_SYMTAB *tab = l_forest->symbols;
        char *sym = id_to_symbol(&(tab[INPUT]), pbuf[0]);
        if (sym)
        {
            int c = tolower((unsigned char)sym[0]);
            if (c >= 'a' && c <= 'z')
            {
                assign_a_fixed_pron(&(l_forest->out), l_forest->single_letter_pron[c - 'a'], word);
            }
        }
    }
    else if (!hasvowel)
    {
        assign_a_spelling_pron(&(l_forest->out), l_forest->single_letter_pron, word);
    }
    else
    {
        if (allcapital)
        {
            // offer the spelled-out form first; generated forms are appended
            assign_a_spelling_pron(&(l_forest->out), l_forest->single_letter_pron, word);
        }
        pres = gen_one_output(l_forest, len, pbuf, 0, pbuf, DEFAULT_PRUNE);
        if (!pres)
        {
            return E_OUTOFMEMORY;
        }
        lts_fill_out_buffer(l_forest, pres, word);
    }

    if (l_forest->out.num_prons == 0)
    {
        hr = S_FALSE;
        assign_a_fixed_pron(&(l_forest->out), l_forest->bogus_pron, word);
    }

    *ppLtsOutput = &(l_forest->out);
    SPDBG_RETURN(hr);
} /* LtscartGetPron */
LTS_FOREST *LtscartReadData (LCID lcid, PBYTE map_addr)
{
SPDBG_FUNC("LtscartReadData");
int i;
LTS_FOREST *l_forest;
LTS_SYMTAB *tab;
LTS_FEATURE *feat;
int output = 0;
l_forest = (LTS_FOREST *) calloc(1, sizeof(LTS_FOREST));
if (!l_forest)
{
return NULL;
}
if (lcid == 1033)
{
l_forest->bogus_pron = bogus_pron_1033;
l_forest->single_letter_pron = single_letter_pron_1033;
}
else if (lcid == 1041)
{
l_forest->bogus_pron = bogus_pron_1041;
l_forest->single_letter_pron = single_letter_pron_1041;
}
else
{
return NULL;
}
//read in the symbol table
l_forest->symbols = (LTS_SYMTAB *) calloc(2, sizeof(LTS_SYMTAB));
if (!l_forest->symbols)
{
return NULL;
}
tab = &(l_forest->symbols[INPUT]);
CopyMemory(&(tab->n_symbols), map_addr + output, sizeof(int));
output += sizeof(int);
tab->sym_idx = (int *)(map_addr + output);
output += tab->n_symbols * sizeof(int);
CopyMemory(&(tab->n_bytes), map_addr + output, sizeof(int));
output += sizeof(int);
tab->storage = (char*)(map_addr + output);
output += tab->n_bytes * sizeof(char);
tab = &(l_forest->symbols[OUTPUT]);
CopyMemory(&(tab->n_symbols), map_addr + output, sizeof(int));
output += sizeof(int);
tab->sym_idx = (int*)(map_addr + output);
output += tab->n_symbols * sizeof(int);
CopyMemory(&(tab->n_bytes), map_addr + output, sizeof(int));
output += sizeof(int);
tab->storage = (char*)(map_addr + output);
output += tab->n_bytes * sizeof(char);
// read in the feature vector
l_forest->features = (LTS_FEATURE *) calloc(2, sizeof(LTS_FEATURE));
if (!l_forest->features)
{
return NULL;
}
feat = &(l_forest->features[INPUT]);
CopyMemory(&(feat->n_feat), map_addr + output, sizeof(int));
output += sizeof(int);
CopyMemory(&(feat->dim), map_addr + output, sizeof(int));
output += sizeof(int);
feat->feature = (int **) calloc(feat->n_feat, sizeof(int *));
if (!feat->feature)
{
return NULL;
}
for (i = 0; i < feat->n_feat; i++)
{
feat->feature[i] = (int*)(map_addr + output);
output += feat->dim * sizeof(int);
}
feat = &(l_forest->features[OUTPUT]);
CopyMemory(&(feat->n_feat), map_addr + output, sizeof(int));
output += sizeof(int);
CopyMemory(&(feat->dim), map_addr + output, sizeof(int));
output += sizeof(int);
feat->feature = (int **) calloc(feat->n_feat, sizeof(int *));
if (!feat->feature)
{
return NULL;
}
for (i = 0; i < feat->n_feat; i++)
{
feat->feature[i] = (int*)(map_addr + output);
output += feat->dim * sizeof(int);
}
/*
* read in the tree
*/
l_forest->tree = (LTS_TREE **) calloc(l_forest->symbols[INPUT].n_symbols,
sizeof(LTS_TREE *));
if (!l_forest->tree)
{
return NULL;
}
for (i = 1; i < l_forest->symbols[INPUT].n_symbols; i++)
{
LTS_TREE *l_root;
l_forest->tree[i] = l_root = (LTS_TREE *) calloc(1, sizeof(LTS_TREE));
if (!l_root)
{
return NULL;
}
CopyMemory(&(l_root->n_nodes), map_addr + output, sizeof(int));
output += sizeof(int);
l_root->nodes = (LTS_NODE*)(map_addr + output);
output += l_root->n_nodes * sizeof(LTS_NODE);
CopyMemory(&(l_root->size_dist), map_addr + output, sizeof(int));
output += sizeof(int);
l_root->p_dist = (LTS_DIST*)(map_addr + output);
output += l_root->size_dist * sizeof(char);
CopyMemory(&(l_root->size_prod), map_addr + output, sizeof(int));
output += sizeof(int);
if (l_root->size_prod > 0)
{
l_root->p_prod = (LTS_PROD*)(map_addr + output);
output += l_root->size_prod * sizeof(char);
}
}
return l_forest;
} // LTS_FOREST *LtscartReadData(char *forest_image, HANDLE *hFile1,
/*
 * LtscartFreeData
 *
 * Releases a forest created by LtscartReadData. Only the bookkeeping
 * structures are freed; the data image the forest points into is not
 * touched. Passing NULL, or a forest whose members were only partially
 * allocated, is safe.
 */
void LtscartFreeData(LTS_FOREST *l_forest)
{
    SPDBG_FUNC("LtscartFreeData");

    if (!l_forest)
    {
        return;
    }

    // tree[0] is never allocated, so start at 1
    if (l_forest->tree && l_forest->symbols)
    {
        for (int i = 1; i < l_forest->symbols[INPUT].n_symbols; i++)
        {
            free(l_forest->tree[i]);
        }
    }
    free(l_forest->tree);

    if (l_forest->features)
    {
        free(l_forest->features[INPUT].feature);
        free(l_forest->features[OUTPUT].feature);
    }
    free(l_forest->features);
    free(l_forest->symbols);
    free(l_forest);
} // void LtscartFreeData(LTS_FOREST *l_forest)