143 lines
3.7 KiB
C++
143 lines
3.7 KiB
C++
|
// Internet Character Set Detection: For Japanese
|
||
|
|
||
|
#include "private.h"
|
||
|
#include "detcbase.h"
|
||
|
#include "detcjpn.h"
|
||
|
#include "fechrcnv.h"
|
||
|
#include "codepage.h"
|
||
|
|
||
|
|
||
|
// Construct a detector in its initial state: every encoding score is zero
// and none of the per-encoding scanners is in the middle of a sequence.
CIncdJapanese::CIncdJapanese()
{
    // Scanner state: no escape sequence pending, the JIS and EUC scanners
    // both at REGULAR, and no Shift-JIS lead byte outstanding.
    m_fDoubleByteSJis = FALSE;
    m_nEucMode = REGULAR;
    m_nJISMode = REGULAR;
    m_nISOMode = NONE;

    // Accumulated scores for the three candidate encodings.
    m_nScoreSJis = 0;
    m_nScoreEuc = 0;
    m_nScoreJis = 0;
}
|
||
|
|
||
|
|
||
|
// Feed one byte to the ISO-2022 escape-sequence recognizer.
//
// m_nISOMode tracks progress through the three-byte sequences
// ESC '$' <final> and ESC '(' <final>. When <final> completes a recognized
// sequence, m_nJISMode is switched (DOUBLEBYTE, REGULAR or KATAKANA), the
// recognizer goes back to NONE and TRUE is returned. For every other byte
// the function returns FALSE.
//
// An ESC that arrives while a sequence is already in progress starts a new
// sequence instead of being thrown away, so input such as
// ESC ESC '$' 'B' is still recognized.
BOOL CIncdJapanese::CheckISOChar(UCHAR tc)
{
    switch (m_nISOMode) {
    case NONE:
        if (tc == ESC)
            m_nISOMode = ISO_ESC;
        break;

    case ISO_ESC:                               // seen ESC
        if (tc == ISO2022_IN_CHAR)              // '$'
            m_nISOMode = ISO_ESC_IN;
        else if (tc == ISO2022_OUT_CHAR)        // '('
            m_nISOMode = ISO_ESC_OUT;
        else if (tc == ESC)
            m_nISOMode = ISO_ESC;               // repeated ESC: keep waiting for '$' or '('
        else
            m_nISOMode = NONE;
        break;

    case ISO_ESC_IN:                            // seen ESC '$'
        m_nISOMode = NONE;
        if (tc == ISO2022_IN_JP_CHAR1 ||        // 'B'
            tc == ISO2022_IN_JP_CHAR2)          // '@'
        {
            m_nJISMode = DOUBLEBYTE;
            return TRUE;
        }
        if (tc == ESC)
            m_nISOMode = ISO_ESC;               // aborted sequence; this ESC begins a new one
        break;

    case ISO_ESC_OUT:                           // seen ESC '('
        m_nISOMode = NONE;
        if (tc == ISO2022_OUT_JP_CHAR1 ||       // 'B'
            tc == ISO2022_OUT_JP_CHAR2)         // 'J'
        {
            m_nJISMode = REGULAR;
            return TRUE;
        }
        else if (tc == ISO2022_OUT_JP_CHAR3)    // 'I'
        {
            m_nJISMode = KATAKANA;
            return TRUE;
        }
        if (tc == ESC)
            m_nISOMode = ISO_ESC;               // aborted sequence; this ESC begins a new one
        break;
    }

    return FALSE;
}
|
||
|
|
||
|
|
||
|
// Score one input byte against the three candidate encodings
// (JIS / ISO-2022-JP, EUC-J and Shift-JIS).
//
// Each encoding has its own small scanner state (m_nJISMode, m_nEucMode,
// m_fDoubleByteSJis) and its own running score. A byte that fits the
// encoding adds SCORE_MAJOR; a byte that validly completes a two-byte
// sequence adds SCORE_MAJOR * 2.
//
// Always returns FALSE.
BOOL CIncdJapanese::DetectChar(UCHAR tc)
{
    // ---- JIS ----
    // A byte that completes an escape sequence only changes the JIS mode;
    // it is not scored for any encoding.
    if (CheckISOChar(tc))
        return FALSE;

    // While shifted into double-byte or katakana JIS the data is JIS for
    // sure, so the EUC and Shift-JIS scanners are skipped entirely.
    if (m_nJISMode == DOUBLEBYTE || m_nJISMode == KATAKANA) {
        m_nScoreJis += SCORE_MAJOR;
        return FALSE;
    }
    if (m_nJISMode == REGULAR && tc < 0x80)
        m_nScoreJis += SCORE_MAJOR;

    // ---- EUC-J ----
    const BOOL fEucDoubleRange = (tc >= 0xa1 && tc <= 0xfe);

    if (m_nEucMode == REGULAR) {
        if (fEucDoubleRange)
            m_nEucMode = DOUBLEBYTE;            // first byte of a double-byte character
        else if (tc == 0x8e)
            m_nEucMode = KATAKANA;              // prefix of a single-byte katakana
        else if (tc < 0x80)
            m_nScoreEuc += SCORE_MAJOR;
    }
    else if (m_nEucMode == DOUBLEBYTE) {
        // Second byte: scored only when it is in range; the scanner
        // returns to REGULAR either way.
        if (fEucDoubleRange)
            m_nScoreEuc += SCORE_MAJOR * 2;
        m_nEucMode = REGULAR;
    }
    else if (m_nEucMode == KATAKANA) {
        // Byte following 0x8e: scored only inside the katakana range.
        if (tc >= 0xa1 && tc <= 0xdf)
            m_nScoreEuc += SCORE_MAJOR * 2;
        m_nEucMode = REGULAR;
    }

    // ---- Shift-JIS ----
    if (m_fDoubleByteSJis) {
        // Expecting a trail byte: 0x40..0xfc except 0x7f.
        if (tc >= 0x40 && tc <= 0xfc && tc != 0x7f)
            m_nScoreSJis += SCORE_MAJOR * 2;
        m_fDoubleByteSJis = FALSE;
    }
    else {
        const BOOL fSJisLead = (tc >= 0x81 && tc <= 0x9f) || (tc >= 0xe0 && tc <= 0xfc);

        if (fSJisLead)
            m_fDoubleByteSJis = TRUE;           // lead byte of a double-byte character
        else if (tc <= 0x7e || (tc >= 0xa1 && tc <= 0xdf))
            m_nScoreSJis += SCORE_MAJOR;        // single-byte character
    }

    return FALSE;
}
|
||
|
|
||
|
|
||
|
int CIncdJapanese::GetDetectedCodeSet()
|
||
|
{
|
||
|
int nMaxScore = m_nScoreSJis;
|
||
|
int nCodeSet = CP_JPN_SJ;
|
||
|
|
||
|
if (m_nScoreEuc > nMaxScore) {
|
||
|
nMaxScore = m_nScoreEuc;
|
||
|
nCodeSet = CP_EUC_JP; // EUC
|
||
|
}
|
||
|
else if (m_nScoreEuc == nMaxScore && m_nScoreEuc > MIN_JPN_DETECTLEN * SCORE_MAJOR) {
|
||
|
// If the given string is not long enough, we should rather choose SJIS
|
||
|
// This helps fix the bug when we are just given Window Title at Shell HyperText view.
|
||
|
nCodeSet = CP_EUC_JP; // EUC
|
||
|
}
|
||
|
|
||
|
if (m_nScoreJis > nMaxScore)
|
||
|
nCodeSet = CP_ISO_2022_JP; // JIS
|
||
|
else if (m_nScoreJis == nMaxScore) // Even score means all 7bits chars
|
||
|
nCodeSet = 0; // in this case, it maybe just pure ANSI data, we return it is ambiguous.
|
||
|
|
||
|
return nCodeSet;
|
||
|
}
|