/*
 * Automatic language and codepage detector
 *
 * Bob Powell, 2/97
 * Copyright (C) 1996, 1997, Microsoft Corp.  All rights reserved.
 */
|
|
|
|
#ifdef __cplusplus
|
|
|
|
#include <wtypes.h>
|
|
#include <limits.h>
|
|
|
|
#include "lcdetect.h"
|
|
#include "lccommon.h"
|
|
|
|
#include "tqsort.h"
|
|
|
|
|
|
// Debug-output support.  Define DEBUG_LCDETECT (e.g. in SOURCES) to enable it.
//
//   debug(x)   -- executes statement x only when g_fDebug is non-zero;
//                 expands to nothing when DEBUG_LCDETECT is not defined.
//   unmapch(x) -- yields (x)+'a'-2 for values >= 2 and ' ' otherwise.
//                 Only available when DEBUG_LCDETECT is defined.
//
// NOTE(review): the enabled form of debug() expands to a bare { ... } block,
// so "if (c) debug(x); else ..." would not parse -- confirm no caller relies
// on that shape before using it in such a context.
#ifdef DEBUG_LCDETECT

#include <stdio.h>

extern int g_fDebug;

#define debug(x) { if (g_fDebug) { x; }}
#define unmapch(x) ((x) >= 2 ? (x)+'a'-2 : ' ')

#else   // !DEBUG_LCDETECT

#define debug(x)

#endif  // DEBUG_LCDETECT
|
|
|
|
class LCDetect;
|
|
typedef LCDetect *PLCDetect;
|
|
|
|
class Language;
|
|
class Language7Bit;
|
|
class Language8Bit;
|
|
class LanguageUnicode;
|
|
typedef Language *PLanguage;
|
|
typedef Language7Bit *PLanguage7Bit;
|
|
typedef Language8Bit *PLanguage8Bit;
|
|
typedef LanguageUnicode *PLanguageUnicode;
|
|
|
|
class CScore;
|
|
class CScores;
|
|
|
|
/****************************************************************/
|
|
|
|
#define MAXSCORES 50 // Max possible simultaneous # of scores
|
|
|
|
#define MINRAWSCORE 100 // Score threshold (weight * char count)
                        // for further processing
|
|
|
|
/****************************************************************/
|
|
|
|
// Histograms
|
|
|
|
// A histogram stores an array of n-gram occurrence counts.
|
|
// HElt stores the count, at present this is an unsigned char.
|
|
|
|
// The in-memory structure is similar to the file.
|
|
// The histogram array pointers m_panElts point into the mapped file image.
|
|
|
|
class Histogram {
|
|
|
|
public:
|
|
Histogram (const PFileHistogramSection pHS, const PHIdx pMap);
|
|
Histogram (const Histogram &H, const PHIdx pMap);
|
|
virtual ~Histogram (void);
|
|
|
|
DWORD Validate (DWORD nBytes) const;
|
|
|
|
UCHAR Dimensionality (void) { return m_nDimensionality; }
|
|
UCHAR EdgeSize (void) { return m_nEdgeSize; }
|
|
USHORT CodePage (void) { return m_nCodePage; }
|
|
USHORT GetRangeID (void) { return m_nRangeID; }
|
|
USHORT NElts (void) { return m_nElts; }
|
|
PHIdx GetMap (void) { return m_pMap; }
|
|
|
|
HElt Ref (USHORT i1) const { return m_panElts[i1]; }
|
|
HElt Ref (UCHAR i1, UCHAR i2) const {
|
|
return m_panElts[(i1 * m_nEdgeSize) + i2]; }
|
|
HElt Ref (UCHAR i1, UCHAR i2, UCHAR i3) const {
|
|
return m_panElts[((i1 * m_nEdgeSize) + i2) * m_nEdgeSize + i3]; }
|
|
|
|
HElt *Array (void) { return m_panElts; }
|
|
|
|
protected:
|
|
UCHAR m_nDimensionality; // 1=unigram, 2=digram etc.
|
|
UCHAR m_nEdgeSize; // edge size (is a function of char map)
|
|
union {
|
|
USHORT m_nCodePage; // For 7 and 8-bit, is code page
|
|
USHORT m_nRangeID; // For Unicode, is sub-language range ID
|
|
};
|
|
USHORT m_nElts; // (edge size ^ dimensionality)
|
|
PHIdx m_pMap; // char/WCHAR to histogram idx mapping
|
|
|
|
HElt *m_panElts; // array of elements / counts
|
|
};
|
|
typedef Histogram *PHistogram;
|
|
|
|
/****************************************************************/
|
|
|
|
// A Language object stores all the detection state for a given language,
|
|
// i.e. primary language ID.
|
|
|
|
class Language {
|
|
public:
|
|
// nCodePages is same as nSubLangs
|
|
Language (PLCDetect pL, int nLangID, int nCodePages, int nRangeID = 0);
|
|
virtual ~Language (void) { }
|
|
|
|
virtual DWORD AddHistogram (PFileHistogramSection pHS, DWORD nBytes, int nIdx) = 0;
|
|
|
|
// Score the code pages for this language
|
|
virtual void ScoreCodePage (LPCSTR, int nCh, CScore &S, int &idx) const;
|
|
|
|
int LanguageID (void) const { return m_nLangID; }
|
|
int NCodePages (void) const { return m_nCodePages; }
|
|
int NSubLangs (void) const { return m_nSubLangs; }
|
|
int RangeID (void) const { return m_nRangeID; }
|
|
int GetScoreIdx (void) const { return m_nScoreIdx; }
|
|
void SetScoreIdx (int nScoreIdx) { m_nScoreIdx = nScoreIdx; }
|
|
|
|
virtual int GetCodePage (int n) const { return 0; }
|
|
virtual int GetSublangRangeID (int n) const { return 0; }
|
|
virtual int GetSublangID (int n) const { return 0; }
|
|
|
|
virtual DetectionType Type (void) = 0;
|
|
virtual Language7Bit const * GetLanguage7Bit (void) const { return NULL; }
|
|
virtual Language8Bit const * GetLanguage8Bit (void) const { return NULL; }
|
|
virtual LanguageUnicode const * GetLanguageUnicode (void) const { return NULL; }
|
|
|
|
protected:
|
|
PLCDetect m_pLC;
|
|
|
|
int m_nLangID; // Win32 primary language ID
|
|
int m_nRangeID; // Unicode range ID, for Unicode langs
|
|
union {
|
|
int m_nCodePages; // # of code pages trained for this language
|
|
int m_nSubLangs;
|
|
};
|
|
int m_nScoreIdx; // Used to create a unique index into the score arrays
|
|
// for each lang + cp combination, to eliminate the
|
|
// need to search the arrays to merge scores. Add
|
|
// the code page index to this to get the array index.
|
|
};
|
|
|
|
|
|
|
|
class Language7Bit : public Language {
|
|
public:
|
|
Language7Bit (PLCDetect pL, int nLangID, int nCodePages);
|
|
~Language7Bit (void);
|
|
|
|
DWORD AddHistogram (PFileHistogramSection pHS, DWORD nBytes, int nIdx);
|
|
|
|
void ScoreCodePage (LPCSTR, int nCh, CScore &S, int &idx) const;
|
|
|
|
int GetCodePage (int n) const { return m_ppCodePageHistogram[n]->CodePage();}
|
|
virtual DetectionType Type (void) { return DETECT_7BIT; }
|
|
|
|
PHistogram GetLangHistogram (void) const { return m_pLangHistogram; }
|
|
PHistogram GetCodePageHistogram (int i) const {
|
|
return m_ppCodePageHistogram[i]; }
|
|
|
|
virtual Language7Bit const * GetLanguage7Bit (void) const { return this; }
|
|
|
|
const PHElt * GetPHEltArray (void) const { return m_paHElt; }
|
|
|
|
private:
|
|
PHistogram m_pLangHistogram;
|
|
PHistogram m_ppCodePageHistogram[MAXSUBLANG];
|
|
|
|
PHElt m_paHElt[MAXSUBLANG];
|
|
};
|
|
|
|
|
|
|
|
class Language8Bit : public Language {
|
|
public:
|
|
Language8Bit (PLCDetect pL, int nLangID, int nCodePages);
|
|
~Language8Bit (void);
|
|
|
|
DWORD AddHistogram (PFileHistogramSection pHS, DWORD nBytes, int nIdx);
|
|
|
|
int GetCodePage (int n) const { return m_ppHistogram[n]->CodePage(); }
|
|
|
|
virtual DetectionType Type (void) { return DETECT_8BIT; }
|
|
|
|
PHistogram GetHistogram (int i) const { return m_ppHistogram[i]; }
|
|
|
|
virtual Language8Bit const * GetLanguage8Bit (void) const { return this; }
|
|
|
|
private:
|
|
PHistogram m_ppHistogram[MAXSUBLANG];
|
|
};
|
|
|
|
|
|
|
|
class LanguageUnicode : public Language {
|
|
public:
|
|
LanguageUnicode (PLCDetect pL, int nLangID, int nRecordCount, int nRangeID);
|
|
~LanguageUnicode (void);
|
|
|
|
DWORD AddHistogram (PFileHistogramSection pHS, DWORD nBytes, int nIdx);
|
|
|
|
void ScoreSublanguages (LPCWSTR wcs, int nch, CScores &S) const;
|
|
|
|
int GetSublangRangeID (int i) const{return GetHistogram(i)->GetRangeID();}
|
|
PLanguageUnicode GetSublanguage (int n) const;
|
|
|
|
virtual DetectionType Type (void) { return DETECT_UNICODE; }
|
|
|
|
PHistogram GetHistogram (int i) const { return m_ppSubLangHistogram[i]; }
|
|
|
|
virtual LanguageUnicode const * GetLanguageUnicode (void) const {
|
|
return this;
|
|
}
|
|
|
|
const PHElt * GetPHEltArray (void) const { return m_paHElt; }
|
|
|
|
private:
|
|
PHistogram m_ppSubLangHistogram[MAXSUBLANG];
|
|
|
|
PHElt m_paHElt[MAXSUBLANG];
|
|
};
|
|
|
|
/****************************************************************/
|
|
|
|
class Charmap {
|
|
|
|
public:
|
|
Charmap (PFileMapSection pMS) : m_nID(pMS->m_dwID), m_nSize(pMS->m_dwSize),
|
|
m_nUnique(pMS->m_dwNUnique), m_pElts( (PHIdx) (&pMS[1]) ) { }
|
|
|
|
// int ID (void) const { return m_nID; }
|
|
int Size (void) const { return m_nSize; }
|
|
int NUnique (void) const { return m_nUnique; }
|
|
PHIdx Map (void) const { return m_pElts; }
|
|
HIdx Map (WCHAR x) const { return m_pElts[x]; }
|
|
|
|
private:
|
|
int m_nID; // ID by which hardwired code finds the table
|
|
int m_nSize; // size of table (256 or 65536)
|
|
int m_nUnique; // # of unique output values
|
|
|
|
PHIdx m_pElts;
|
|
};
|
|
typedef Charmap *PCharmap;
|
|
|
|
/****************************************************************/
|
|
|
|
// class CScore -- score for one lang and/or code page, variously used for
|
|
// individual chunks and also for an entire document.
|
|
|
|
class CScore {
|
|
public:
|
|
// Only these two slots need to be initialized
|
|
CScore (void) : m_nScore(0), m_nChars(0) {}
|
|
~CScore (void) { }
|
|
|
|
const PLanguage GetLang (void) const { return m_pLang; }
|
|
int GetScore (void) const { return m_nScore; }
|
|
unsigned short GetCodePage (void) const { return m_nCodePage; }
|
|
unsigned short GetCharCount (void) const { return m_nChars; }
|
|
|
|
void SetLang (PLanguage p) { m_pLang = p; }
|
|
void SetScore (int x) { m_nScore = x; }
|
|
void SetCharCount (unsigned x) { m_nChars = (unsigned short)x; }
|
|
void SetCodePage (unsigned x) { m_nCodePage = (unsigned short)x; }
|
|
|
|
void Add (CScore &S) {
|
|
SetLang(S.GetLang());
|
|
SetCodePage(S.GetCodePage());
|
|
SetScore(GetScore() + S.GetScore());
|
|
SetCharCount(GetCharCount() + S.GetCharCount());
|
|
}
|
|
CScore & operator += (CScore &S) { Add (S); return *this; }
|
|
|
|
int operator <= (CScore &S) {
|
|
// Special: always put 8-bit langs first since the code page
|
|
// matters more for them.
|
|
if (GetLang()->Type() != S.GetLang()->Type())
|
|
return GetLang()->Type() == DETECT_8BIT ? -1 : 1;
|
|
return GetScore() <= S.GetScore();
|
|
}
|
|
|
|
#ifdef DEBUG_LCDETECT
|
|
void Print(void) {
|
|
printf("Lang=%d CodePage=%d Score=%d NChars=%d\n",
|
|
GetLang() ? GetLang()->LanguageID() : -1,
|
|
GetCodePage(), GetScore(), GetCharCount());
|
|
}
|
|
#endif
|
|
|
|
private:
|
|
PLanguage m_pLang;
|
|
int m_nScore;
|
|
unsigned short m_nCodePage;
|
|
unsigned short m_nChars;
|
|
};
|
|
typedef CScore *PScore;
|
|
|
|
|
|
|
|
// class CScores
|
|
|
|
// For SBCS detection, the index e.g. Ref(i) is the language+codepage index,
|
|
// one of a contiguous set of values which identifies each unique supported
|
|
// language and codepage combination.
|
|
|
|
// For DBCS detection, the index is just the Unicode language group.
|
|
|
|
class CScores {
|
|
public:
|
|
CScores (int nAlloc, PScore p) : m_nAlloc(nAlloc), m_nUsed(0), m_p(p) { }
|
|
virtual ~CScores (void) { }
|
|
|
|
void Reset (void) {
|
|
memset ((void *)m_p, 0, sizeof(CScore) * m_nUsed);
|
|
m_nUsed = 0;
|
|
}
|
|
|
|
unsigned int &NElts (void) { return m_nUsed; }
|
|
CScore &Ref (unsigned int n) {
|
|
if (m_nUsed <= n)
|
|
m_nUsed = n + 1;
|
|
return m_p[n];
|
|
}
|
|
|
|
void SelectCodePages (void);
|
|
|
|
void RemoveZeroScores (void) {
|
|
for (unsigned int i = 0, j = 0; i < m_nUsed; i++)
|
|
{
|
|
if (m_p[i].GetScore() > MINRAWSCORE)
|
|
m_p[j++] = m_p[i];
|
|
}
|
|
m_nUsed = j;
|
|
}
|
|
|
|
// Sort by decreasing score.
|
|
// Instantiates template qsort using CScore::operator <=
|
|
|
|
void SortByScore (void) {
|
|
RemoveZeroScores ();
|
|
if (m_nUsed)
|
|
QSort (m_p, m_nUsed, FALSE);
|
|
}
|
|
|
|
CScore & FindHighScore (void) {
|
|
int highscore = 0;
|
|
for (unsigned int i = 0, highidx = 0; i < m_nUsed; i++) {
|
|
if (m_p[i].GetScore() > highscore)
|
|
{
|
|
highscore = m_p[i].GetScore();
|
|
highidx = i;
|
|
}
|
|
}
|
|
return m_p[highidx];
|
|
}
|
|
|
|
protected:
|
|
unsigned int m_nAlloc;
|
|
unsigned int m_nUsed; // high water mark to optimize NElts(), Reset()
|
|
PScore m_p; // score array, typically per TScores<NNN>
|
|
};
|
|
|
|
template<ULONG Size>class TScores : public CScores {
|
|
|
|
public:
|
|
TScores (void) : CScores (Size, m_S) { }
|
|
virtual ~TScores (void) { }
|
|
|
|
private:
|
|
CScore m_S[Size];
|
|
};
|
|
|
|
|
|
|
|
class LCDetect {
|
|
|
|
public:
|
|
LCDetect (HMODULE hM);
|
|
~LCDetect (void);
|
|
|
|
unsigned int GetNCharmaps() const { return m_nCharmaps; }
|
|
unsigned int GetN7BitLanguages() const { return m_n7BitLanguages; }
|
|
unsigned int GetN8BitLanguages() const { return m_n8BitLanguages; }
|
|
unsigned int GetNUnicodeLanguages() const { return m_nUnicodeLanguages; }
|
|
|
|
PLanguage7Bit Get7BitLanguage (int i) const { return m_pp7BitLanguages[i]; }
|
|
PLanguage8Bit Get8BitLanguage (int i) const { return m_pp8BitLanguages[i]; }
|
|
PLanguageUnicode GetUnicodeLanguage (int i) const { return m_ppUnicodeLanguages[i]; }
|
|
|
|
PHIdx GetMap (int i) const { return m_ppCharmaps[i]->Map(); }
|
|
|
|
const LCDConfigure &GetConfig () const { return m_LCDConfigureDefault; }
|
|
|
|
DWORD LoadState (void);
|
|
|
|
DWORD DetectA (LPCSTR pStr, int nChars, PLCDScore paScores,
|
|
int *pnScores, PCLCDConfigure pLCDC) const;
|
|
|
|
DWORD DetectW (LPCWSTR wcs, int nInputChars, PLCDScore paScores,
|
|
int *pnScores, PCLCDConfigure pLCDC) const;
|
|
|
|
private:
|
|
DWORD Initialize7BitLanguage (PFileLanguageSection pLS, PLanguage *ppL);
|
|
DWORD Initialize8BitLanguage (PFileLanguageSection pLS, Language **ppL);
|
|
DWORD InitializeUnicodeLanguage (PFileLanguageSection pLS,Language **ppL);
|
|
DWORD LoadLanguageSection (void *pv, int nSectionSize, PLanguage *ppL);
|
|
DWORD LoadHistogramSection (void *pv, int nSectionSize, Language *pL);
|
|
DWORD LoadMapSection (void *pv, int nSectionSize);
|
|
DWORD BuildState (DWORD nFileSize);
|
|
|
|
void Score7Bit (LPCSTR pcszText, int nChars, CScores &S) const;
|
|
void Score8Bit (LPCSTR pcszText, int nChars, CScores &S) const;
|
|
int ScoreCodePage (LPCSTR pStr, int nChars, CScore &S) const;
|
|
int ChooseDetectionType (LPCSTR pcszText, int nChars) const;
|
|
void ScoreLanguageA (LPCSTR pStr, int nChars, CScores &S) const;
|
|
void ScoreLanguageW (LPCWSTR wcs, int nChars, CScores &S, PCLCDConfigure) const;
|
|
void ScoreLanguageAsSBCS (LPCWSTR wcs, int nch, CScores &S) const;
|
|
void ScoreUnicodeSublanguages (PLanguageUnicode pL, LPCWSTR wcs,
|
|
int nch, CScores &S) const;
|
|
|
|
private:
|
|
// Language training info virtual-mapped in training file
|
|
|
|
unsigned int m_nCharmaps;
|
|
unsigned int m_n7BitLanguages;
|
|
unsigned int m_n8BitLanguages;
|
|
unsigned int m_nUnicodeLanguages;
|
|
|
|
PCharmap *m_ppCharmaps;
|
|
PLanguage7Bit *m_pp7BitLanguages;
|
|
PLanguage8Bit *m_pp8BitLanguages;
|
|
PLanguageUnicode *m_ppUnicodeLanguages;
|
|
|
|
// Cached information for the optimized scoring inner-loops.
|
|
|
|
PHElt m_paHElt7Bit[MAX7BITLANG];
|
|
PHElt m_paHElt8Bit[MAXSCORES];
|
|
int m_nHElt8Bit;
|
|
|
|
// Special 7-bit lang histogram for ScoreLanguageAsSBCS()
|
|
|
|
PHistogram m_pHU27Bit;
|
|
|
|
// Initialization state variables
|
|
|
|
unsigned int m_n7BitLangsRead;
|
|
unsigned int m_n8BitLangsRead;
|
|
unsigned int m_nUnicodeLangsRead;
|
|
unsigned int m_nMapsRead;
|
|
int m_nHistogramsRead;
|
|
int m_nScoreIdx;
|
|
|
|
// Default configuration to use when NULL parameter passed to detect
|
|
|
|
LCDConfigure m_LCDConfigureDefault;
|
|
|
|
// File mapping information for the training data file
|
|
|
|
HANDLE m_hf;
|
|
HANDLE m_hmap;
|
|
void *m_pv;
|
|
|
|
HMODULE m_hModule;
|
|
};
|
|
|
|
|
|
|
|
// Look up the Unicode language for sub-language n: the range ID of the n-th
// sub-language histogram is used as the index into the detector's Unicode
// language table.  Neither index is range-checked.
inline PLanguageUnicode
LanguageUnicode::GetSublanguage (int n) const
{
    const int nRangeID = GetSublangRangeID (n);

    return m_pLC->GetUnicodeLanguage (nRangeID);
}
|
|
|
|
#endif // __cplusplus
|