Windows2000/private/shell/ext/mlang/lcdetect.cpp
2020-09-30 17:12:32 +02:00

1279 lines
39 KiB
C++

/*
* Automatic language and codepage detector
* Copyright (C) 1996, 1997, Microsoft Corp. All rights reserved.
* History: 1-Feb-97 BobP Created
* 5-Aug-97 BobP Added Unicode support and rewrote
* scoring to use vector math.
* This is the runtime detector.
* See the comments in lcdcomp.cpp for a description of the compilation
* process and training data format.
* See design.txt for a description of the detection and scoring algorithm.
* Performance note: 60-80% of execution time in this code is AddVector(),
* which is probably memory-cycle bound by its random data access, but is
* still a candidate for further optimizing with an intrinsic vector operator,
* should one become available.
* to-do (as needed):
* - Adjust 7-bit and 8-bit scores to make them more comparable
* - detect UTF-8 in the SBCS entry point, via heuristic and via
* subdetection as 7-bit lang and as Unicode.
*/
#include "private.h"
// Process-wide detector state.
// g_pLCDetect is NULL until a detector instance has been installed; the
// exported LCD_* entry points in this file test it for NULL before use.
// Per the original note it is assigned once at DLL process init and treated
// as read-only afterwards.
LCDetect *g_pLCDetect = NULL;
#if defined(DEBUG_LCDETECT)
// Debug flag, written by LCD_SetDebug(). Exists only in debug builds.
int g_fDebug = 0;
#endif
/**/
static inline unsigned int
FindHighIdx (const int *pn, unsigned int n)
// Return the index of the largest value in pn[0..n-1].
// The running maximum starts at zero, so only strictly positive values can
// be selected: when no element is greater than zero (or n is 0) the result
// is index 0.  The comparison is strict, so ties go to the lowest index.
{
    unsigned int iBest = 0;     // index of the best value seen so far
    int nBest = 0;              // best value seen so far
    unsigned int iCur = 0;

    while (iCur < n)
    {
        const int nCur = pn[iCur];
        if (nCur > nBest)
        {
            nBest = nCur;
            iBest = iCur;
        }
        ++iCur;
    }
    return iBest;
}
/**/
void
CScores::SelectCodePages (void)
// Collapse the score array from one entry per { language, code page } to
// exactly one entry per language.
//
// For each language the surviving entry is the slot of its highest-scoring
// code page, with its score and char count overwritten by the SUM of the
// scores and char counts of ALL entries for that language.
// For example, if the input contains:
//      Lang     Codepage  Score  Char count
//      Russian  1251      42     200
//      Russian  20866     69     300
// then on output the array holds a single Russian entry:
//      Russian  20866     111    500
//
// The array is rewritten in place and m_nUsed is set to the number of
// surviving entries.  Zero-score slots are removed first.
//
// Preconditions / caveats:
//  - Entries must already be grouped by language; no sorting is done here.
//    The same language appearing in two separate runs (e.g. as both a 7-bit
//    and an 8-bit language) is NOT merged.
//  - After return the array must NOT be indexed via ScoreIdx(), because the
//    entries have moved.
{
    // The score indices no longer matter, remove slots that scored zero.
    RemoveZeroScores ();
    if (m_nUsed == 0)
        return;

    // State for the language run currently being scanned.  All three values
    // are seeded from entry 0.  Seeding maxscore with 0 instead would let any
    // later entry of the first language displace entry 0 even when entry 0
    // scored higher, selecting the wrong code page for that language.
    int maxscore   = m_p[0].GetScore();         // highest score in the run
    int totalscore = m_p[0].GetScore();         // sum of scores in the run
    int totalchars = m_p[0].GetCharCount();     // sum of char counts in the run
    unsigned int maxscoreidx = 0;               // slot holding maxscore
    int nReturned = 0;                          // next output slot / result count

    for (unsigned int i = 1; i < m_nUsed; i++)
    {
        if (m_p[i-1].GetLang() != m_p[i].GetLang())
        {
            // [i] starts a new language.  Emit the finished run: take the
            // slot of its top-scoring code page, overwrite score and char
            // count with the run totals, and copy it to the output position.
            // nReturned never exceeds the first index of the finished run,
            // so this cannot clobber an entry that has not been read yet.
            m_p[maxscoreidx].SetScore(totalscore);
            m_p[maxscoreidx].SetCharCount(totalchars);
            m_p[nReturned++] = m_p[maxscoreidx];

            // Start tracking the new language.
            maxscore    = m_p[i].GetScore();
            totalscore  = m_p[i].GetScore();
            totalchars  = m_p[i].GetCharCount();
            maxscoreidx = i;
        }
        else
        {
            // Another code page of the same language: accumulate.
            if (m_p[i].GetScore() > maxscore)
            {
                maxscore    = m_p[i].GetScore();
                maxscoreidx = i;
            }
            totalscore += m_p[i].GetScore();
            totalchars += m_p[i].GetCharCount();
        }
    }

    // Emit the last language.  m_nUsed is known to be non-zero here, so
    // there is always a pending run.
    m_p[maxscoreidx].SetScore(totalscore);
    m_p[maxscoreidx].SetCharCount(totalchars);
    m_p[nReturned++] = m_p[maxscoreidx];

    m_nUsed = nReturned;
}
/**/
static void __fastcall
AddVector (int *pS, const PHElt *pH, int idx, unsigned int nScores)
// Add the score vector for a single n-gram to the running sum score
// vector at pS.
//   pS      - running sums, one int per score; pS[k] is incremented.
//   pH      - array of nScores entries; entry k is indexed with [idx] to
//             obtain the value added to pS[k].
//   idx     - histogram slot of the n-gram being scored.  Not range-checked
//             here; the caller must supply a valid slot.
//   nScores - number of entries in both pS and pH.
// On return, pS[0..nScores-1] is filled with the sum scores for each
// language.
// **** PERFORMANCE NOTE ****
// This is the critical inner-loop of the entire subsystem.
// Code generation and performance have been checked for various code
// organization. Ironically, making AddVector() a true function is
// FASTER than inlining it because when inlined, the registers are used
// for the OUTER loop variables and the inner loop here does approximately
// twice as many memory references per pass.
// On x86, all four loop variables are registered, and each pass makes only
// three memory references, which is optimal for the given representation.
// Future note: the histogram tables could be pivoted to collect all the
// scores for each n-gram in a block; that would eliminate the double
// indirection through ph and reduce the memory refs to two per pass.
{
// Pre-increment so the loop can use a pre-decrement-and-test, which runs
// the body exactly nScores times (zero times when nScores is 0).
nScores++; // makes faster end-test
while (--nScores != 0)
// Walk both arrays in step: pS[k] += pH[k][idx].
*pS++ += (*pH++)[idx];
}
static inline void
ScoreUnigramVector (LPCSTR pcsz, int nCh, PHistogram pH,
                    int *paS, const PHElt *paH, unsigned int nScores)
// Unigram scoring of byte text.  Every byte of pcsz[0..nCh-1] is translated
// through the histogram's character map and the resulting slot is added
// into paS[0..nScores-1] for all score vectors in paH.
// Does nothing when nCh < 1.
{
    if (nCh < 1)
        return;

    const PHIdx pMap = pH->GetMap();
    // Index the text as unsigned bytes so values >= 0x80 are not treated as
    // negative map indices.
    const unsigned char *pbText = (const unsigned char *)pcsz;

    for (int iCh = 0; iCh < nCh; iCh++)
        AddVector (paS, paH, pMap[pbText[iCh]], nScores);
}
static inline void
ScoreUnigramVectorW (LPCWSTR pcwsz, int nCh, PHistogram pH,
                     int *paS, const PHElt *paH, unsigned int nScores)
// WCHAR flavour of unigram scoring.  Identical to the byte version except
// that the map is indexed directly by WCHAR value, so the map must cover
// the whole WCHAR range.
// Does nothing when nCh < 1.
{
    if (nCh < 1)
        return;

    const PHIdx pMap = pH->GetMap();

    for (int iCh = 0; iCh < nCh; iCh++)
        AddVector (paS, paH, pMap[pcwsz[iCh]], nScores);
}
static inline void
ScoreDigramVector (LPCSTR pcsz, int nCh, PHistogram pH,
                   int *paS, const PHElt *paH, unsigned int nScores)
// Digram scoring of byte text.  Each adjacent pair of bytes is mapped
// through the histogram's character map; the two mapped values are combined
// as (first * EdgeSize + second) to form the histogram slot, and that slot
// is added into paS for every score vector in paH.
// nCh characters produce nCh-1 digrams; does nothing when nCh < 2.
{
    if (nCh < 2)
        return;

    const PHIdx pMap = pH->GetMap();
    const unsigned char *pbText = (const unsigned char *)pcsz;

    // Prime with the first character, then slide one character at a time.
    unsigned char chPrev = pMap[*pbText++];
    for (int nPairs = nCh - 1; nPairs > 0; nPairs--)
    {
        const unsigned char chCur = pMap[*pbText++];
        AddVector (paS, paH, chPrev * pH->EdgeSize() + chCur, nScores);
        chPrev = chCur;
    }
}
static inline void
ScoreTrigramVector (LPCSTR pcsz, int nCh, PHistogram pH,
                    int *paS, const PHElt *paH, unsigned int nScores)
// Trigram scoring of byte text.  Each run of three adjacent bytes is mapped
// through the histogram's character map and combined as
// ((c1 * EdgeSize) + c2) * EdgeSize + c3 to form the histogram slot, which
// is added into paS for every score vector in paH.
// nCh characters produce nCh-2 trigrams; does nothing when nCh < 3.
// In debug builds each trigram and its per-vector scores are printed.
{
    if (nCh < 3)
        return;

    const PHIdx pMap = pH->GetMap();
    const unsigned char *pbText = (const unsigned char *)pcsz;

    // Prime the window with the first two characters.
    unsigned char ch1 = pMap[*pbText++];
    unsigned char ch2 = pMap[*pbText++];

    for (int nTrigrams = nCh - 2; nTrigrams > 0; nTrigrams--)
    {
        const unsigned char ch3 = pMap[*pbText++];
        debug(printf(" '%c%c%c':",unmapch(ch1),unmapch(ch2),unmapch(ch3)));

        const int idx = ((ch1 * pH->EdgeSize()) + ch2) * pH->EdgeSize() + ch3;
        AddVector (paS, paH, idx, nScores);
        debug(for (UINT i = 0; i < nScores; i++) printf(" %3d", paH[i][idx]));
        debug(printf("\n"));

        // Slide the window forward by one character.
        ch1 = ch2;
        ch2 = ch3;
    }
}
static inline void
ScoreTrigramVectorW (LPCWSTR pcwsz, int nCh, PHistogram pH,
                     int *paS, const PHElt *paH, unsigned int nScores)
// WCHAR flavour of trigram scoring.  Each WCHAR is translated through the
// histogram's map (indexed by WCHAR value) and three consecutive mapped
// values are combined as ((c1 * EdgeSize) + c2) * EdgeSize + c3.
// nCh characters produce nCh-2 trigrams; does nothing when nCh < 3.
{
    if (nCh < 3)
        return;

    const PHIdx pMap = pH->GetMap();

    // Prime the window with the first two characters.
    unsigned char ch1 = pMap[*pcwsz++];
    unsigned char ch2 = pMap[*pcwsz++];

    for (int nTrigrams = nCh - 2; nTrigrams > 0; nTrigrams--)
    {
        const unsigned char ch3 = pMap[*pcwsz++];
        const int idx = ((ch1 * pH->EdgeSize()) + ch2) * pH->EdgeSize() + ch3;
        AddVector (paS, paH, idx, nScores);

        // Slide the window forward by one character.
        ch1 = ch2;
        ch2 = ch3;
    }
}
static inline void
ScoreNgramVector (LPCSTR pcsz, int nCh, PHistogram pH,
                  int *paS, const PHElt *paH, unsigned int nScores)
// Generic n-gram scoring of byte text, with N taken from the histogram's
// dimensionality.  Per the original note this path is not used with the
// current data file (dimensions 1-3 have dedicated loops); it exists so
// other dimensions can be tried without rebuilding the DLL.
//
// A rolling index is maintained: for every new character,
//   idx = (idx * EdgeSize + map[ch]) % NElts
// and the slot idx is added into paS for every score vector in paH.
// The index is pre-loaded with at most three characters, i.e. it is fully
// primed for dimensions up to 4 only.
// Does nothing when nCh is less than the dimensionality.
{
    if (nCh < pH->Dimensionality())
        return;

    const PHIdx pMap = pH->GetMap();
    const unsigned char *pbText = (const unsigned char *)pcsz;

    // Fill the pipeline: consume one leading character for each of the
    // dimensions 2, 3 and 4 that the histogram has.
    int idx = 0;
    for (int nDim = 2; nDim <= 4; nDim++)
    {
        if (pH->Dimensionality() >= nDim)
            idx = idx * pH->EdgeSize() + pMap[*pbText++];
    }

    // One n-gram per remaining character.
    unsigned int nGrams = nCh - (pH->Dimensionality() - 1);
    for (; nGrams != 0; nGrams--)
    {
        idx = (idx * pH->EdgeSize() + pMap[*pbText++]) % pH->NElts();
        AddVector (paS, paH, idx, nScores);
    }
}
static inline void
ScoreNgramVectorW (LPCWSTR pcwsz, int nCh, PHistogram pH,
                   int *paS, const PHElt *paH, unsigned int nScores)
// WCHAR flavour of generic n-gram scoring, with N taken from the
// histogram's dimensionality.  For every new character,
//   idx = (idx * EdgeSize + map[wch]) % NElts
// and the slot idx is added into paS for every score vector in paH.
// The index is pre-loaded with at most three characters, i.e. it is fully
// primed for dimensions up to 4 only.
// Does nothing when nCh is less than the dimensionality.
{
    if (nCh < pH->Dimensionality())
        return;

    const PHIdx pMap = pH->GetMap();

    // Fill the pipeline: consume one leading character for each of the
    // dimensions 2, 3 and 4 that the histogram has.
    int idx = 0;
    for (int nDim = 2; nDim <= 4; nDim++)
    {
        if (pH->Dimensionality() >= nDim)
            idx = idx * pH->EdgeSize() + pMap[*pcwsz++];
    }

    // One n-gram per remaining character.
    unsigned int nGrams = nCh - (pH->Dimensionality() - 1);
    for (; nGrams != 0; nGrams--)
    {
        idx = (idx * pH->EdgeSize() + pMap[*pcwsz++]) % pH->NElts();
        AddVector (paS, paH, idx, nScores);
    }
}
void
ScoreVector (LPCSTR pcsz, int nCh, PHistogram pH,
             int *paS, const PHElt *paH, unsigned int nScores)
// Score byte text against an array of histograms.
//   pcsz, nCh - text and its length in characters.
//   pH        - histogram supplying the character map and the n-gram
//               dimensionality used for slot computation.
//   paS       - output; paS[0..nScores-1] is zeroed and then receives the
//               summed scores.
//   paH       - the nScores score vectors that are summed.
// Dimensions 1, 2 and 3 use dedicated loops; anything else goes through the
// generic n-gram loop.
{
    memset (paS, 0, sizeof(int) * nScores);

    switch (pH->Dimensionality())
    {
    case 1:
        ScoreUnigramVector (pcsz, nCh, pH, paS, paH, nScores);
        return;

    case 2:
        ScoreDigramVector (pcsz, nCh, pH, paS, paH, nScores);
        return;

    case 3:
        ScoreTrigramVector (pcsz, nCh, pH, paS, paH, nScores);
        return;

    default:
        ScoreNgramVector (pcsz, nCh, pH, paS, paH, nScores);
        return;
    }
}
void
ScoreVectorW (LPCWSTR pcwsz, int nCh, PHistogram pH,
              int *paS, const PHElt *paH, unsigned int nScores)
// WCHAR flavour of ScoreVector: score wide-character text against an array
// of histograms.  paS[0..nScores-1] is zeroed and then receives the summed
// scores.
// Only dimensions 1 and 3 have dedicated loops; every other dimension
// (including 2) goes through the generic n-gram loop.
{
    memset (paS, 0, sizeof(int) * nScores);

    switch (pH->Dimensionality())
    {
    case 1:
        ScoreUnigramVectorW (pcwsz, nCh, pH, paS, paH, nScores);
        return;

    case 3:
        ScoreTrigramVectorW (pcwsz, nCh, pH, paS, paH, nScores);
        return;

    default:
        ScoreNgramVectorW (pcwsz, nCh, pH, paS, paH, nScores);
        return;
    }
}
void
LCDetect::Score7Bit (LPCSTR pcszText, int nChars, CScores &S) const
// 7-bit language detection.  Computes one raw score per 7-bit language and
// stores it in S at that language's base score index (GetScoreIdx()).
// Only that first slot per language is written; the code page is set to 0,
// i.e. it is not detected here.
// The histogram of 7-bit language 0 supplies the character map and
// dimensionality used for all 7-bit languages.
{
    const PHistogram pH = Get7BitLanguage(0)->GetLangHistogram();

    debug(printf(" "));
    debug(for(unsigned int x=0;x<GetN7BitLanguages();x++)printf(" %3d", Get7BitLanguage(x)->LanguageID()));
    debug(printf("\n"));

    // Raw score vector, one element per 7-bit language.
    int anRaw[MAXSCORES];
    ScoreVector (pcszText, nChars, pH, anRaw, m_paHElt7Bit, GetN7BitLanguages());

    // Transfer the raw scores into the CScores array.
    for (unsigned int iLang = 0; iLang < GetN7BitLanguages(); iLang++)
    {
        PLanguage7Bit pLang = Get7BitLanguage(iLang);
        CScore &slot = S.Ref(pLang->GetScoreIdx());

        slot.SetLang(pLang);
        slot.SetCodePage(0);
        slot.SetScore(anRaw[iLang]);
        slot.SetCharCount(nChars);
    }
}
void
LCDetect::Score8Bit (LPCSTR pcszText, int nChars, CScores &S) const
// 8-bit detection.  Computes a combined language / code page score for
// every trained { language, code page } pair of the 8-bit languages and
// stores each one in S at GetScoreIdx() + code page index.  A language may
// therefore fill several slots, one per code page.
// The raw score vector is laid out language by language, code page by code
// page, in the same order the loops below walk them.
{
    const PHistogram pH = Get8BitLanguage(0)->GetHistogram(0);

    // Raw score vector, one element per { language, code page } pair.
    int anRaw[MAXSCORES];
    ScoreVector (pcszText, nChars, pH, anRaw, m_paHElt8Bit, m_nHElt8Bit);

    // Transfer the raw scores into the CScores array.
    int iRaw = 0;       // running position in anRaw
    for (unsigned int iLang = 0; iLang < GetN8BitLanguages(); iLang++)
    {
        PLanguage8Bit pLang = Get8BitLanguage(iLang);

        for (int iCP = 0; iCP < pLang->NCodePages(); iCP++)
        {
            CScore &slot = S.Ref(pLang->GetScoreIdx() + iCP);

            slot.SetLang(pLang);
            slot.SetCodePage(pLang->GetCodePage(iCP));
            slot.SetScore( anRaw[ iRaw++ ] );
            slot.SetCharCount(nChars);
        }
    }
}
void
LCDetect::ScoreLanguageAsSBCS (LPCWSTR wcs, int nch, CScores &S) const
// Score Unicode text known to consist mostly of characters from the script
// ranges used by the 7-bit languages.
// The text is scored with m_pHU27Bit, whose map takes WCHAR values straight
// into the index space used for 7-bit detection, against the same
// language-only histograms (m_paHElt7Bit) that Score7Bit() uses.  The
// result lands in the same slots of S that Score7Bit() fills: the 7-bit
// score indices, NOT the Unicode language score indices.  The code page is
// set to 0.
{
    debug(printf(" scoring as SBCS\n"));
    debug(printf(" "));
    debug(for(unsigned int x=0;x<GetN7BitLanguages();x++)printf(" %3d", Get7BitLanguage(x)->LanguageID()));
    debug(printf("\n"));

    // Raw score vector, one element per 7-bit language, computed with the
    // histogram that is set up for the WCHAR map.
    int anRaw[MAXSCORES];
    ScoreVectorW (wcs, nch, m_pHU27Bit, anRaw, m_paHElt7Bit,GetN7BitLanguages());

    // Transfer the raw scores into the CScores array.
    for (unsigned int iLang = 0; iLang < GetN7BitLanguages(); iLang++)
    {
        PLanguage7Bit pLang = Get7BitLanguage(iLang);
        CScore &slot = S.Ref(pLang->GetScoreIdx());

        slot.SetLang(pLang);
        slot.SetCodePage(0);
        slot.SetScore(anRaw[iLang]);
        slot.SetCharCount(nch);
    }
}
void
Language::ScoreCodePage (LPCSTR /* text: unused */, int nCh, CScore &S, int &idx) const
// Default code page scorer for text whose language is already known.
// It performs no detection: the code page in S is set to 0 and idx (the
// code page index reported to the caller) is set to 0.  Neither the text
// nor nCh is examined.  Originally used only for Unicode languages.
{
    S.SetCodePage(0);
    idx = 0;
}
void
Language7Bit::ScoreCodePage (LPCSTR pStr, int nCh, CScore &S, int &idx) const
// Detect the code page of text whose language has already been detected and
// is recorded in S.
//   pStr, nCh - the text and its length in characters.
//   S         - single score entry (not an array); only its code page is
//               changed, all other fields are left alone.
//   idx       - out: index of the selected code page within this language.
//               The caller uses it to pick the correct ScoreIdx slot.
// Only the winning code page is reported; nothing is returned about the
// other candidates.
{
    if (NCodePages() == 1)
    {
        // The language was trained with a single code page, so there is
        // nothing to score: report it and return.  Without this return the
        // code would go on to score against the code page histogram even
        // though the answer is already known.
        idx = 0;
        S.SetCodePage(GetCodePage(0));
        debug(printf(" score code page: only one; cp=%d\n",GetCodePage(0)));
        return;
    }

    debug(printf("scoring 7-bit code pages: "));

    // Compute the raw score vector, one element per code page.
    int sc[MAXSUBLANG];
    ScoreVector (pStr, nCh, GetCodePageHistogram(0),
                 sc, GetPHEltArray(), NCodePages());

    // Pick the high-scoring code page.  FindHighIdx() yields 0 when no
    // score is positive, so code page 0 is the fallback.
    idx = FindHighIdx (sc, NCodePages());
    debug(printf("selecting cp=%d idx=%d\n", GetCodePage(idx), idx));
    S.SetCodePage (GetCodePage(idx));
}
void
LanguageUnicode::ScoreSublanguages (LPCWSTR wcs, int nch, CScores &S) const
// Score wcs against every sub-language of this Unicode language group and
// store the raw, not yet qualified, scores in S at each sub-language's
// score index.  The code page of each entry is set to 0.
// Only meaningful for groups that need sub-detection (initially CJK);
// returns immediately when the group has no sub-languages.
{
    if (m_nSubLangs == 0)
        return;

    debug(printf(" scoring Unicode sublanguages:\n"));

    // Raw score vector, one element per sub-language.
    int anRaw[MAXSUBLANG];
    ScoreVectorW (wcs, nch, GetHistogram(0), anRaw, m_paHElt, m_nSubLangs);

    // Transfer the raw scores into the CScores array.
    for (int iSub = 0; iSub < NSubLangs(); iSub++)
    {
        PLanguageUnicode pSub = GetSublanguage(iSub);
        CScore &slot = S.Ref(pSub->GetScoreIdx());

        slot.SetLang (pSub);
        slot.SetScore (anRaw[iSub]);
        slot.SetCharCount (nch);
        slot.SetCodePage (0);
        debug(printf(" lang=%d score=%d\n", pSub->LanguageID(), anRaw[iSub]));
    }
}
int
LCDetect::ChooseDetectionType (LPCSTR pcszText, int nChars) const
// Decide whether a block of byte text should be scored with 7-bit or 8-bit
// detection, based on how many bytes have the high bit (0x80) set.
// Returns:
//   DETECT_NOTDEFINED - fewer than 10 characters, too little to decide
//   DETECT_8BIT       - high-bit bytes number more than half the count of
//                       low bytes (nHi * 2 > nLo)
//   DETECT_7BIT       - otherwise
{
    const unsigned char *pbText = (const unsigned char *)pcszText;

    // Count bytes >= 0x80; the remainder are bytes < 0x80.
    int nHi = 0;
    for (int iCh = 0; iCh < nChars; iCh++)
    {
        if (pbText[iCh] & 0x80)
            nHi++;
    }
    const int nLo = nChars - nHi;

    // Make sure there is sufficient data to make a good choice.
    // nHi + nLo is simply nChars.
    // (Original to-do: try requiring abs(nHi - nLo) >= 10 instead.)
    if (nChars < 10)
        return DETECT_NOTDEFINED;

    return (nHi * 2 > nLo) ? DETECT_8BIT : DETECT_7BIT;
}
void
LCDetect::ScoreLanguageA (LPCSTR pStr, int nChars, CScores &S) const
// Score the byte text at pStr for every language it may contain, writing
// raw scores into S at the ScoreIdx() of each language / code page.
//
// ChooseDetectionType() makes a rough first pass over the bytes and exactly
// one scoring family is run as a result:
//   7-bit: the code page is set to 0 and each language's score goes in its
//          0'th slot; the caller scores code pages later if needed.
//   8-bit: a score is produced for each code page and all ScoreIdx() slots
//          are used.
// For any other detection type (e.g. too little text) S is left untouched.
{
    const int nType = ChooseDetectionType (pStr, nChars);

    if (nType == DETECT_7BIT)
    {
        Score7Bit (pStr, nChars, S);
    }
    else if (nType == DETECT_8BIT)
    {
        Score8Bit (pStr, nChars, S);
    }
}
void
LCDetect::ScoreLanguageW (LPCWSTR wcs, int nch, CScores &S, PCLCDConfigure pC) const
// Score the Unicode text at wcs for each language it may contain and add
// the scores to S at the ScoreIdx() of each language.
//
// Step 1 - script census.  Each WCHAR is mapped through CHARMAP_UNICODE to
// a "language group ID" and the characters per ID are counted.  Unlike all
// other n-gram scoring, NO WEIGHTS are involved: the count is the score.
// Characters mapping to HIDX_IGNORE correlate with no language and are
// discarded.
//
// Step 2 - per-ID handling, for every ID that collected at least 2
// characters:
//   - LANGID_UNKNOWN: nothing is added.
//   - IDs with sub-languages: sub-detection is run and all its raw scores
//     are added to S.
//   - LANGID_LATIN_GROUP whose count reaches the relative threshold:
//     the text is scored with the 7-bit language histograms and those raw
//     scores are added to S (at the 7-bit score indices).
//   - everything else: the range itself identifies the language; it is
//     added with score = count * UNICODE_DEFAULT_CHAR_SCORE.
// The relative threshold, 1 + nMaxScore * pC->nRelativeThreshhold / 100,
// is purely a performance gate on Latin sub-detection and is unrelated to
// the uniform threshold the caller applies later.
//
// NOTE(review): earlier commentary says a single character in a
// language-specific range is sufficient, but the code requires at least 2.
// NOTE(review): a LANGID_LATIN_GROUP entry that has >= 2 characters but is
// below the relative threshold falls into the final "range identifies
// language" branch and is added to S as if it were a specific language --
// confirm this is intended.
{
    // Raw character counts per script group.  The array indices are the raw
    // outputs of the primary Unicode char map, NOT the ScoreIdx() of each
    // language.
    // Only the first GetNUnicodeLanguages() counters are cleared; this
    // assumes the map never yields an index at or beyond that -- TODO confirm.
    const PHIdx map = GetMap (CHARMAP_UNICODE);
    int anScore[MAXSCORES];
    memset (anScore, 0, sizeof(int) * GetNUnicodeLanguages());
    for (int x = 0; x < nch; x++)
        anScore[map[wcs[x]]]++;
    debug(printf(" char_ignore score=%d\n",anScore[HIDX_IGNORE]));

    // Ignore scores for chars that correlate with no language.
    anScore[HIDX_IGNORE] = 0;

    // Shared by both loops below.  Declared once at function scope so the
    // code compiles both with conforming for-loop scoping and with the
    // legacy MSVC rule that leaked the loop variable.
    unsigned int i;

    // Find the high score to use as a relative threshold for inclusion.
    int nMaxScore = 0;
    for (i = 0; i < GetNUnicodeLanguages(); i++)
    {
        if (anScore[i] > nMaxScore)
            nMaxScore = anScore[i];
    }
    debug(printf(" unicode range max score=%d\n",nMaxScore));

    // The threshold logic differs from SBCS/DBCS detection because the mere
    // presence of characters in certain Unicode script ranges can be a
    // strong, correct indicator of a specific language.  Sub-detected
    // scores are a statistical result, so their gate is higher.
    int nRelThresh = 1 + (nMaxScore * pC->nRelativeThreshhold) / 100;

    for (i = 0; i < GetNUnicodeLanguages(); i++)
    {
        // Any range needs at least this many raw chars to be considered.
        if (anScore[i] >= 2)
        {
            PLanguageUnicode pL = GetUnicodeLanguage(i);
            debug(printf(" using lang=%d score=%d:\n", pL->LanguageID(), anScore[i]));

            if (pL->LanguageID() == LANGID_UNKNOWN)
            {
                // DO NOTHING -- text is an unknown language
                debug(printf(" lang=unknown\n"));
            }
            else if (pL->NSubLangs() > 0)
            {
                // Subdetect language within a Unicode group, and add all
                // the unqualified raw scores directly to S.
                pL->ScoreSublanguages (wcs, nch, S);
            }
            else if ( pL->LanguageID() == LANGID_LATIN_GROUP &&
                      anScore[i] >= nRelThresh )
            {
                // Subdetect Latin/Western languages, and add all the
                // unqualified raw scores to S.
                ScoreLanguageAsSBCS (wcs, nch, S);
            }
            else
            {
                debug(printf(" range identifies language\n"));

                // This range identifies a specific language; add it.
                CScore &s = S.Ref(pL->GetScoreIdx());
                s.SetLang (pL);
                s.SetScore (anScore[i] * UNICODE_DEFAULT_CHAR_SCORE);
                s.SetCharCount (nch);
                s.SetCodePage (0);
            }
        }
    }
}
/**/
DWORD
LCDetect::DetectA (LPCSTR pStr, int nInputChars,
                   PLCDScore paScores, int *pnScores,
                   PCLCDConfigure pLCDC) const
// SBCS / DBCS detection.  Detect the language and code page of pStr, fill
// paScores[] with the result and set *pnScores to the result count.
//   pStr, nInputChars - text to examine; at most MAX_INPUT chars are used.
//   paScores          - out: results, ordered by decreasing score.
//   pnScores          - in: capacity of paScores; out: entries written.
//                       If the capacity is 0 nothing is done.
//   pLCDC             - configuration, or NULL for the built-in default.
// Always returns NO_ERROR.
//
// The text is processed in chunks, typically several hundred bytes.
//
// Phase 1 (per chunk): the chunk is scored by language.  Scores are
// qualified against an absolute per-character minimum and against a
// threshold relative to the chunk's own high score; qualifying scores are
// accumulated in SAll, the rest are discarded.  If a qualifying score has no
// code page yet, the code page is detected for the chunk and the score is
// stored in the slot of that code page.  The score expresses confidence in
// the language only, NOT in the code page.
//
// Phase 2 (whole input): scores are merged to one per language, sorted, and
// only languages detected in more than nDocPctThreshhold percent of the
// input are returned.
//
// The two-step design targets input that mixes languages or contains a lot
// of whitespace / symbols that correlate with no language.  Each chunk is
// presumed to be mostly in one language, so its high score most likely
// belongs to that language whatever its absolute value.  Several languages
// may qualify in one chunk; in particular, on a tie BOTH are kept.  Phase 2
// then filters the noise admitted by phase 1.
{
    TScores<MAXSCORES> SChunk;  // Scores for one chunk at a time
    TScores<MAXSCORES> SAll;    // Qualified scores for ultimate result

    if (pLCDC == NULL)          // Use the default config if not specified
        pLCDC = &m_LCDConfigureDefault;

    if (*pnScores == 0)
        return NO_ERROR;

#define MAX_INPUT (USHRT_MAX-1)
    // CScore.NChars() is a USHORT to save space+time, so only this # of chars
    // can be accepted per call or the scoring would overflow.
    nInputChars = min (nInputChars, MAX_INPUT);
    debug(printf("LCD_Detect: detecting %d chars\n", nInputChars));

    // Phase 1: "coarse" qualification.  Detect the language of text blocks
    // small enough to typically be in *one* language and remember the
    // credible results, so the final answer can show the distribution of
    // languages in the document rather than only the dominant one.
    int nProcessed = 0;
    while (nProcessed < nInputChars)
    {
        SChunk.Reset();         // reset is cheaper than constructing

        // Process nChunkSize worth of text if that will leave at least
        // another nChunkSize piece for the final pass.  If that would
        // leave a smaller final chunk, go ahead and process the entire
        // remaining input.
        // A non-positive nChunkSize is ignored (everything is processed as
        // one chunk): honouring it would make nch <= 0, so nProcessed would
        // never advance or would move backwards.
        int nch = nInputChars - nProcessed;
        if (pLCDC->nChunkSize > 0 && nch >= pLCDC->nChunkSize * 2)
            nch = pLCDC->nChunkSize;
        debug(printf("\nStarting chunk: %d ch\n\"%.*s\"\n", nch, nch, &pStr[nProcessed]));

        ScoreLanguageA (&pStr[nProcessed], nch, SChunk);

        // Compute the threshold for inclusion of each score in the
        // overall result: the larger of the absolute per-character minimum
        // and a percentage of this chunk's high score.
        int nRelThresh = 1 + (SChunk.FindHighScore().GetScore() * pLCDC->nRelativeThreshhold) / 100;
        int nThresh7 = max (pLCDC->nMin7BitScore * nch, nRelThresh);
        int nThresh8 = max (pLCDC->nMin8BitScore * nch, nRelThresh);
        debug(printf("high score=%d min7=%d thresh7=%d thresh8=%d\n", SChunk.FindHighScore().GetScore(),pLCDC->nMin7BitScore*nch,nThresh7,nThresh8));

        // Qualify each score, remember only scores well above the noise.
        for (unsigned int i = 0; i < SChunk.NElts(); i++)
        {
            CScore &s = SChunk.Ref(i);
            PLanguage pL = s.GetLang();
            // debug(if (s.GetScore()) printf(" raw: lang=%d score=%d cp=%d\n",pL->LanguageID(),s.GetScore(),s.GetCodePage()));

            // The score is tested BEFORE pL is dereferenced; keep that
            // order.  Both thresholds are at least 1.
            if ( (s.GetScore() >= nThresh7 && pL->Type() == DETECT_7BIT) ||
                 (s.GetScore() >= nThresh8 && pL->Type() == DETECT_8BIT) )
            {
                debug(printf(" qual: lang=%d score=%d cp=%d\n",pL->LanguageID(),s.GetScore(),s.GetCodePage()));

                // If code page is not already set, detect it, and store
                // the score for this language using the scoreidx slot
                // for that code page.  Store no score in the slots for
                // other code pages for the same language.
                int idx = 0;
                if (s.GetCodePage() == 0)
                    pL->ScoreCodePage (&pStr[nProcessed], nch, s, idx);

                // Remember this score for the overall results
                SAll.Ref(i + idx) += s;
            }
        }
        nProcessed += nch;
    }

    // SAll has entries for each unique { lang ID, code page }
    // with the char count and total raw score (not normalized per char)
    // for those chunks whose score qualifies as a confident result and
    // that contributed to the entry.

    // Select the top-scoring code page for each language
    // and remove all other code page scores.
    debug(printf("Selecting top-scoring code pages\n"));
    SAll.SelectCodePages ();

    // Sort by decreasing score
    SAll.SortByScore ();

    // Build the client return structure
    //   Language ID
    //   Code page
    //   Doc percent 0-100
    //   Confidence 0-100
    int nScoresReturned = 0;
    for (unsigned i = 0; i < SAll.NElts() && nScoresReturned < *pnScores; i++)
    {
        CScore &s = SAll.Ref(i);
        LCDScore R;
        R.nLangID = s.GetLang()->LanguageID();
        R.nCodePage = s.GetCodePage();

        // Percent of doc for which this language scored above the
        // confidence threshold, even if not 1st place for that chunk.
        R.nDocPercent = (s.GetCharCount() * 100) / nProcessed;
        debug(printf("s.CharCount=%d nProcessed=%d\n", s.GetCharCount(), nProcessed));

        // Confidence is the raw score for all the chunks for which this
        // language was detected above the confidence threshold, divided
        // by the number of characters in those chunks.
        R.nConfidence = s.GetScore() / s.GetCharCount();
        debug(printf("Examining: lang=%d cp=%d docpct=%d\n", R.nLangID, R.nCodePage, R.nDocPercent));

        // Return only scores for languages detected in over a
        // minimum % of the doc.
        if (R.nDocPercent > pLCDC->nDocPctThreshhold)
        {
            debug(printf(" returning score\n"));
            paScores[nScoresReturned++] = R;
        }
    }
    debug(printf("Returning %d scores\n", nScoresReturned));
    *pnScores = nScoresReturned;
    return NO_ERROR;
}
DWORD
LCDetect::DetectW (LPCWSTR pwStr, int nInputChars,
                   PLCDScore paScores, int *pnScores, PCLCDConfigure pLCDC) const
// WCHAR (Unicode) version of detection.  Scores pwStr and fills paScores
// with one entry per detected language, ordered by decreasing score.
//   pwStr, nInputChars - text to examine; at most MAX_INPUT chars are used.
//   paScores           - out: results.
//   pnScores           - in: capacity of paScores; out: entries written.
//                        If the capacity is 0 nothing is done.
//   pLCDC              - configuration, or NULL for the built-in default.
// Always returns NO_ERROR.
//
// The input is processed chunk by chunk.  Per chunk, raw scores are
// qualified against an absolute per-character minimum and a threshold
// relative to the chunk's high score; qualifying scores accumulate in SAll.
// Afterwards only languages detected in more than nDocPctThreshhold percent
// of the input are returned.
{
    if (pLCDC == NULL)          // Use the default config if not specified
        pLCDC = &m_LCDConfigureDefault;

    if (*pnScores == 0)
        return NO_ERROR;

    // CScore.NChars() is a USHORT to save space+time, so only this # of chars
    // can be accepted per call or the scoring would overflow.
    nInputChars = min (nInputChars, MAX_INPUT);
    debug(printf("LCD_DetectW: detecting %d chars\n", nInputChars));

    // SChunk is defined outside the loop since it's cheaper to Reset() it
    // than to reconstruct it each time.
    TScores<MAXSCORES> SChunk;  // Raw score for one chunk at a time
    TScores<MAXSCORES> SAll;    // Qualifying scores for final result

    int nProcessed = 0;

    // Process one chunk of the input per loop
    while (nProcessed < nInputChars)
    {
        SChunk.Reset();

        // Process nChunkSize worth of text if that will leave at least
        // another nChunkSize piece for the final pass.  If that would
        // leave a smaller final chunk, go ahead and process the entire
        // remaining input.
        // A non-positive nChunkSize is ignored (everything is processed as
        // one chunk): honouring it would make nch <= 0, so nProcessed would
        // never advance or would move backwards.
        int nch = nInputChars - nProcessed;
        if (pLCDC->nChunkSize > 0 && nch >= pLCDC->nChunkSize * 2)
            nch = pLCDC->nChunkSize;
        debug(printf("\nStarting chunk: %d ch\n", nch));

        // Compute the raw scores for the chunk.
        // This automatically includes the sub-detected language scores
        // for the Latin/Western group and Unicode groups, when the group
        // ITSELF scores above the inclusion threshold.  The sub-detected
        // scores themselves still need to be qualified below.
        ScoreLanguageW (&pwStr[nProcessed], nch, SChunk, pLCDC);

        // Compute the threshold for inclusion of each score in the
        // overall result: the larger of the absolute per-character minimum
        // and a percentage of this chunk's high score.
        int nRelThresh = 1 + (SChunk.FindHighScore().GetScore() * pLCDC->nRelativeThreshhold) / 100;
        int nThresh7 = max (pLCDC->nMin7BitScore * nch, nRelThresh);
        int nThreshU = max (pLCDC->nMinUnicodeScore * nch, nRelThresh);
        debug(printf("scores: nElts=%d rel=%d%% high=%d min=%d min7=%d minU=%d\n", SChunk.NElts(), pLCDC->nRelativeThreshhold, SChunk.FindHighScore().GetScore(), nRelThresh,nThresh7,nThreshU));

        // Qualify each score, remember only scores well above the noise.
        for (unsigned int i = 0; i < SChunk.NElts(); i++)
        {
            CScore &s = SChunk.Ref(i);
            PLanguage pL = s.GetLang();

            // The score is tested BEFORE pL is dereferenced; keep that
            // order.  Both thresholds are at least 1.
            if ( (s.GetScore() >= nThresh7 && pL->Type() == DETECT_7BIT) ||
                 (s.GetScore() >= nThreshU && pL->Type() == DETECT_UNICODE) )
            {
                debug(printf(" using lang=%d score=%d nch=%d\n",pL->LanguageID(),s.GetScore(),s.GetCharCount()));

                // Remember this score for the overall results
                SAll.Ref(i) += s;
            }
        }
        nProcessed += nch;
    }

    // SAll has entries for each unique language with char count and total
    // raw score (not normalized per char) for those chunks whose score
    // qualifies as a confident result.
    // SAll may contain entries only for 7-bit and Unicode languages,
    // at most one entry per unique Win32 language ID
    debug(printf("Selecting scores for result:\n"));

    // Sort by decreasing score
    SAll.SortByScore ();

    // Build the client return structure
    //   Language ID
    //   Code page
    //   Doc percent 0-100
    //   Confidence 0-100
    int nScoresReturned = 0;
    for (unsigned i = 0; i < SAll.NElts() && nScoresReturned < *pnScores; i++)
    {
        CScore &s = SAll.Ref(i);
        LCDScore R;
        R.nLangID = s.GetLang()->LanguageID();
        R.nCodePage = s.GetCodePage();

        // Percent of doc for which this language scored above the
        // confidence threshold, even if not 1st place for that chunk.
        R.nDocPercent = (s.GetCharCount() * 100) / nProcessed;

        // Confidence is the raw score for all the chunks for which this
        // language was detected above the confidence threshold, divided
        // by the number of characters in those chunks.
        R.nConfidence = s.GetScore() / s.GetCharCount();
        debug(printf(" testing: lang=%d nch=%d docpct=%d\n", R.nLangID,s.GetCharCount(),R.nDocPercent));

        // Return only scores for languages detected in over a
        // minimum % of the doc.
        if (R.nDocPercent > pLCDC->nDocPctThreshhold)
        {
            debug(printf(" returning score\n"));
            paScores[nScoresReturned++] = R;
        }
    }
    debug(printf("Returning %d scores\n", nScoresReturned));
    *pnScores = nScoresReturned;
    return NO_ERROR;
}
/**/
/**/
#if 0
// Export functions
// NOTE: this DLL entry point is compiled out (#if 0).  When enabled it
// creates the process-wide LCDetect on process attach, loads its state, and
// publishes it through g_pLCDetect; on process detach it destroys it.
BOOL APIENTRY
DllMain (HANDLE hM, DWORD ul_reason, LPVOID lpReserved)
{
switch (ul_reason) {
case DLL_PROCESS_ATTACH:
{
// Thread attach/detach notifications are not needed by this module.
DisableThreadLibraryCalls( (HINSTANCE)hM );
LCDetect *pLC = new LCDetect ( (HMODULE)hM );
if (pLC == NULL)
return FALSE;
// Fail the DLL load if the detector state cannot be loaded.
if (pLC->LoadState() != NO_ERROR)
{
delete pLC;
return FALSE;
}
// Publish only a fully initialized detector.
g_pLCDetect = pLC;
}
return TRUE;
case DLL_PROCESS_DETACH:
if (g_pLCDetect != NULL)
delete (LCDetect *)g_pLCDetect;
// Clear the global so the LCD_* entry points see "no detector".
g_pLCDetect = NULL;
return TRUE;
case DLL_THREAD_ATTACH:
case DLL_THREAD_DETACH:
break;
}
return TRUE;
}
#endif
extern "C" void WINAPI
LCD_GetConfig (PLCDConfigure pLCDC)
// Copy the detector's configuration into *pLCDC.
// When no detector is installed (g_pLCDetect is NULL) *pLCDC is left
// untouched; the caller gets no indication of which case occurred.
// pLCDC itself is not checked for NULL.
{
    if (!g_pLCDetect)
        return;

    *pLCDC = g_pLCDetect->GetConfig();
}
extern "C" DWORD WINAPI
LCD_Detect (LPCSTR pStr, int nInputChars,
            PLCDScore paScores, int *pnScores,
            PCLCDConfigure pLCDC)
// Exported entry point for SBCS / DBCS detection.  Forwards to
// LCDetect::DetectA() on the process-wide detector: paScores receives one
// score per language, "qualifying" scores only, ranked by decreasing score.
// Returns ERROR_INVALID_FUNCTION when no detector is installed, otherwise
// whatever DetectA() returns.
{
    if (g_pLCDetect != NULL)
        return g_pLCDetect->DetectA(pStr, nInputChars, paScores, pnScores, pLCDC);

    return ERROR_INVALID_FUNCTION;
}
extern "C" DWORD WINAPI
LCD_DetectW (LPCWSTR wcs, int nInputChars,
             PLCDScore paScores, int *pnScores,
             PCLCDConfigure pLCDC)
// Exported entry point for WCHAR (Unicode) detection.  Forwards to
// LCDetect::DetectW() on the process-wide detector.
// Returns ERROR_INVALID_FUNCTION when no detector is installed, otherwise
// whatever DetectW() returns.
{
    if (g_pLCDetect != NULL)
        return g_pLCDetect->DetectW(wcs, nInputChars, paScores, pnScores, pLCDC);

    return ERROR_INVALID_FUNCTION;
}
extern "C" void WINAPI
LCD_SetDebug (int f)
// Set the global debug flag to f.  In builds without DEBUG_LCDETECT the
// flag does not exist and this call does nothing.
{
#if defined(DEBUG_LCDETECT)
    g_fDebug = f;
#else
    (void)f;    // no debug flag in this build
#endif
}