WindowsXP-SP1/shell/ext/mlang/detcjpn.cpp

// =================================================================================
// Internet Character Set Detection: For Japanese
// =================================================================================

#include "private.h"
#include "detcbase.h"
#include "detcjpn.h"
#include "fechrcnv.h"
#include "codepage.h"

CIncdJapanese::CIncdJapanese(DWORD nCp)
{
    m_nScoreJis = 0;
    m_nScoreEuc = 0;
    m_nScoreSJis = 0;

    m_nISOMode = NONE ;
    m_nJISMode = REGULAR;
    m_nEucMode = REGULAR;
    m_fDoubleByteSJis = FALSE;
    // If Jpn autoselect, we'll bias to Shift-Jis like we did before
    m_nPreferredCp = (nCp == CP_JP_AUTO)? CP_JPN_SJ : nCp;
}

BOOL CIncdJapanese::CheckISOChar(UCHAR tc)
{
    switch (m_nISOMode) {
    case NONE:
        if ( tc == ESC )
            m_nISOMode = ISO_ESC ;
        break;
    case ISO_ESC:
        if ( tc == ISO2022_IN_CHAR )        // '$'
            m_nISOMode = ISO_ESC_IN ;
        else if ( tc == ISO2022_OUT_CHAR )
            m_nISOMode = ISO_ESC_OUT ;      // '('
        else
            m_nISOMode = NONE ;
        break;
    case ISO_ESC_IN:    // esc '$'
        m_nISOMode = NONE ;
        if ( tc == ISO2022_IN_JP_CHAR1 ||       // 'B'
                tc == ISO2022_IN_JP_CHAR2 )     // '@'
        {
            m_nJISMode = DOUBLEBYTE ;
            return TRUE ;
        }
        break;
    case ISO_ESC_OUT:   // esc '('
        m_nISOMode = NONE ;
        if ( tc == ISO2022_OUT_JP_CHAR1 ||      //	'B'
                tc == ISO2022_OUT_JP_CHAR2 )    //	'J'
        {
            m_nJISMode = REGULAR ;
            return TRUE ;
        }
        else if ( tc == ISO2022_OUT_JP_CHAR3 )   //	'I'
        {
            m_nJISMode = KATAKANA;
            return TRUE ;
        }
        break;
    }
    return FALSE;
}

BOOL CIncdJapanese::DetectChar(UCHAR tc)
{
	// JIS
	if ( CheckISOChar(tc) )
	    return FALSE;   // JIS mode change, don't need to check other type

	switch (m_nJISMode) {
	case REGULAR:
	    if (tc < 0x80)
	        m_nScoreJis += SCORE_MAJOR;
	    break;
	case DOUBLEBYTE:
	case KATAKANA:
	    m_nScoreJis += SCORE_MAJOR;
	    return FALSE;   // In JIS mode for sure, don't need to check other type
	}

	// EUC-J
	switch (m_nEucMode) {
	case REGULAR:
		if (tc >= 0xa1 && tc <= 0xfe) // Double Byte
			m_nEucMode = DOUBLEBYTE;
		else if (tc == 0x8e) // Single Byte Katakana
			m_nEucMode = KATAKANA;
		else if (tc < 0x80)
			m_nScoreEuc += SCORE_MAJOR;
		break;
	case DOUBLEBYTE:
		if (tc >= 0xa1 && tc <= 0xfe)
			m_nScoreEuc += SCORE_MAJOR * 2;
		m_nEucMode = REGULAR;
		break;
	case KATAKANA:
		if (tc >= 0xa1 && tc <= 0xdf) // Katakana range
			m_nScoreEuc += SCORE_MAJOR * 2;
		m_nEucMode = REGULAR;
		break;
	}

	// Shift-JIS
	if (!m_fDoubleByteSJis) {
		if ((tc >= 0x81 && tc <= 0x9f) || (tc >= 0xe0 && tc <= 0xfc)) // Double Byte
			m_fDoubleByteSJis = TRUE;
		else if (tc <= 0x7e || (tc >= 0xa1 && tc <= 0xdf))
			m_nScoreSJis += SCORE_MAJOR;
	} else {
		if (tc >= 0x40 && tc <= 0xfc && tc != 0x7f) // Trail Byte range
			m_nScoreSJis += SCORE_MAJOR * 2;
		m_fDoubleByteSJis = FALSE;
	}

	return FALSE;
}

int CIncdJapanese::GetDetectedCodeSet()
{
    int nMaxScore = m_nScoreSJis;
    int nCodeSet = CP_JPN_SJ;

    if (m_nScoreEuc > nMaxScore) {
	    nMaxScore = m_nScoreEuc;
	    nCodeSet = CP_EUC_JP ; // EUC
    } else if (m_nScoreEuc == nMaxScore) {
        if (m_nScoreEuc > MIN_JPN_DETECTLEN * SCORE_MAJOR)
            // If the given string is not long enough, we should rather choose SJIS
            // This helps fix the bug when we are just given Window Title
            // at Shell HyperText view.
	        nCodeSet = CP_EUC_JP ; // EUC
        else
            // If we can't distinguish between EUC and Shift-Jis, we use the preferred one
            nCodeSet = m_nPreferredCp;
    }

    // JIS
    if (m_nScoreJis > nMaxScore)
	    nCodeSet = CP_ISO_2022_JP ;
    // Even score means all 7bits chars
    // in this case, it maybe just pure ANSI data, we return it is ambiguous.
    else if (m_nScoreJis == nMaxScore)
	    nCodeSet = 0 ;

    return nCodeSet;
}