Windows2000/private/shell/ext/mlang/detcjpn.cpp

// Internet Character Set Detection: For Japanese

#include "private.h"
#include "detcbase.h"
#include "detcjpn.h"
#include "fechrcnv.h"
#include "codepage.h"


CIncdJapanese::CIncdJapanese()
{
    m_nScoreJis = 0;
    m_nScoreEuc = 0;
    m_nScoreSJis = 0;

    m_nISOMode = NONE;
    m_nJISMode = REGULAR;
    m_nEucMode = REGULAR;
    m_fDoubleByteSJis = FALSE;
}


BOOL CIncdJapanese::CheckISOChar(UCHAR tc)
{
    switch (m_nISOMode) {
    case NONE:
        if (tc == ESC)
            m_nISOMode = ISO_ESC;
        break;
    case ISO_ESC:
        if (tc == ISO2022_IN_CHAR)        // '$'
            m_nISOMode = ISO_ESC_IN;
        else if (tc == ISO2022_OUT_CHAR)
            m_nISOMode = ISO_ESC_OUT;      // '('
        else
            m_nISOMode = NONE;
        break;
    case ISO_ESC_IN:    // esc '$'
        m_nISOMode = NONE;
        if (tc == ISO2022_IN_JP_CHAR1 ||       // 'B'
            tc == ISO2022_IN_JP_CHAR2)     // '@'
        {
            m_nJISMode = DOUBLEBYTE;
            return TRUE;
        }
        break;
    case ISO_ESC_OUT:   // esc '('
        m_nISOMode = NONE;
        if (tc == ISO2022_OUT_JP_CHAR1 ||      //    'B'
            tc == ISO2022_OUT_JP_CHAR2)    //    'J'
        {
            m_nJISMode = REGULAR;
            return TRUE;
        }
        else if (tc == ISO2022_OUT_JP_CHAR3)   //    'I'
        {
            m_nJISMode = KATAKANA;
            return TRUE;
        }
        break;
    }

    return FALSE;
}


BOOL CIncdJapanese::DetectChar(UCHAR tc)
{
    // JIS
    if (CheckISOChar(tc))
        return FALSE;   // JIS mode change, don't need to check other type
    switch (m_nJISMode) {
    case REGULAR:
        if (tc < 0x80)
            m_nScoreJis += SCORE_MAJOR;
        break;
    case DOUBLEBYTE:
    case KATAKANA:
        m_nScoreJis += SCORE_MAJOR;
        return FALSE;   // In JIS mode for sure, don't need to check other type
    }

    // EUC-J
    switch (m_nEucMode) {
    case REGULAR:
        if (tc >= 0xa1 && tc <= 0xfe) // Double Byte
            m_nEucMode = DOUBLEBYTE;
        else if (tc == 0x8e) // Single Byte Katakana
            m_nEucMode = KATAKANA;
        else if (tc < 0x80)
            m_nScoreEuc += SCORE_MAJOR;
        break;
    case DOUBLEBYTE:
        if (tc >= 0xa1 && tc <= 0xfe)
            m_nScoreEuc += SCORE_MAJOR * 2;
        m_nEucMode = REGULAR;
        break;
    case KATAKANA:
        if (tc >= 0xa1 && tc <= 0xdf) // Katakana range
            m_nScoreEuc += SCORE_MAJOR * 2;
        m_nEucMode = REGULAR;
        break;
    }

    // Shift-JIS
    if (!m_fDoubleByteSJis) {
        if ((tc >= 0x81 && tc <= 0x9f) || (tc >= 0xe0 && tc <= 0xfc)) // Double Byte
            m_fDoubleByteSJis = TRUE;
        else if (tc <= 0x7e || (tc >= 0xa1 && tc <= 0xdf))
            m_nScoreSJis += SCORE_MAJOR;
    }
    else {
        if (tc >= 0x40 && tc <= 0xfc && tc != 0x7f) // Trail Byte range
            m_nScoreSJis += SCORE_MAJOR * 2;
        m_fDoubleByteSJis = FALSE;
    }

    return FALSE;
}


int CIncdJapanese::GetDetectedCodeSet()
{
    int nMaxScore = m_nScoreSJis;
    int nCodeSet = CP_JPN_SJ;

    if (m_nScoreEuc > nMaxScore) {
        nMaxScore = m_nScoreEuc;
        nCodeSet = CP_EUC_JP; // EUC
    }
    else if (m_nScoreEuc == nMaxScore && m_nScoreEuc > MIN_JPN_DETECTLEN * SCORE_MAJOR) {
        // If the given string is not long enough, we should rather choose SJIS
        // This helps fix the bug when we are just given Window Title at Shell HyperText view.
        nCodeSet = CP_EUC_JP; // EUC
    }

    if (m_nScoreJis > nMaxScore)
        nCodeSet = CP_ISO_2022_JP; // JIS
    else if (m_nScoreJis == nMaxScore) // Even score means all 7bits chars
        nCodeSet = 0;    // in this case, it maybe just pure ANSI data, we return it is ambiguous.

    return nCodeSet;
}