WindowsXP-SP1/shell/ext/mlang/jislex.cpp
2020-09-30 16:53:49 +02:00

486 lines
17 KiB
C++

/*----------------------------------------------------------------------------
%%File: jislex.c
%%Unit: fechmap
%%Contact: jpick
Simple converter for decoding a subset of possible ISO-2022-7 encoded
files (ISO-2022). Data is translated to and from Unicode. Converter
operates according to user options.
Module currently handles ISO-2022-JP (and JIS) and ISO-2022-KR.
Converter is set up to handle ISO-2022-TW and ISO-2022-CN, but there
are as yet no conversion tables for these.
----------------------------------------------------------------------------*/
#include <stdio.h>
#include <stddef.h>
#include "private.h"
#include "fechmap_.h"
#include "lexint_.h"
// State table for reading ISO-2022-7 encoded text
//
// Lexer recognizes the following designator sequences, used
// to select a one or two byte character set:
//
// <esc> $ @ -- JIS C 6226-1978 (synonym of <esc> $ ( @)
// <esc> $ A -- GB 2312-80 (synonym of <esc> $ ( A)
// <esc> $ B -- JIS X 0208-1983 (synonym of <esc> $ ( B)
//
// <esc> $ ( @ -- JIS C 6226-1978
// <esc> $ ( A -- GB 2312-80
// <esc> $ ( B -- JIS X 0208-1983
// <esc> $ ( C -- KS C 5601-1992
// <esc> $ ( D -- JIS X 0212-1990
// <esc> $ ( E -- ??? (ISO-IR-165:1992) ???
// <esc> $ ( G -- CNS 11643-1992 Plane 1
// <esc> $ ( H -- CNS 11643-1992 Plane 2
// <esc> $ ( I -- CNS 11643-1992 Plane 3
// <esc> $ ( J -- CNS 11643-1992 Plane 4
// <esc> $ ( K -- CNS 11643-1992 Plane 5
// <esc> $ ( L -- CNS 11643-1992 Plane 6
// <esc> $ ( M -- CNS 11643-1992 Plane 7
//
// <esc> $ ) C -- KSC 5601-1987 (Implies ISO-2022-KR ??)
//
// <esc> & @ <esc> $ B -- JIS X 0208-1990
//
// <esc> ( B -- Ascii
// <esc> ( H -- Deprecated variant of JIS-Roman
// <esc> ( I -- Half-Width Katakana
// <esc> ( J -- JIS-Roman
// <esc> ( T -- GB 1988-89 Roman
//
// Lexer recognizes the following shift sequences, used to allow
// interpretation of a given byte or bytes:
//
// <si> -- locking shift, interpret bytes as G0
// <so> -- locking shift, interpret bytes as G1
// <esc> n -- locking shift, interpret bytes as G2
// <esc> o -- locking shift, interpret bytes as G3
// <esc> N -- single shift, interpret bytes as G2
// <esc> O -- single shift, interpret bytes as G3
//
// REVIEW (jpick): don't currently need the final four shift
// sequences. If we support ISO-2022-CN, we'll need to use
// G2 and G3 and potentially, then, the last four shifts.
//
/*----------------------------------------------------------------------------
Character Classification Table
----------------------------------------------------------------------------*/
// Tokens
//
// Every input byte is classified into exactly one of these token values by
// the character classification table; eof and err are produced by the
// reader itself rather than by the table. The token is the first (row)
// index into the state-transition table, so the values must stay dense
// and nTokens must stay equal to the number of tokens defined here.
//
#define txt (JTK) 0 // 7-bit byte with no special meaning to the lexer
#define ext (JTK) 1 // extended characters that are legal under certain circumstances
#define esc (JTK) 2 // <esc> (0x1b)
#define si (JTK) 3 // <si> (0x0f), shift in
#define so (JTK) 4 // <so> (0x0e), shift out
#define dlr (JTK) 5 // '$'
#define at (JTK) 6 // '@'
#define amp (JTK) 7 // '&'
#define opr (JTK) 8 // '(' (open paren)
#define cpr (JTK) 9 // ')' (close paren)
#define tkA (JTK) 10 // 'A'
#define tkB (JTK) 11 // 'B'
#define tkC (JTK) 12 // 'C'
#define tkD (JTK) 13 // 'D'
#define tkE (JTK) 14 // 'E'
#define tkG (JTK) 15 // 'G'
#define tkH (JTK) 16 // 'H'
#define tkI (JTK) 17 // 'I'
#define tkJ (JTK) 18 // 'J'
#define tkK (JTK) 19 // 'K'
#define tkL (JTK) 20 // 'L'
#define tkM (JTK) 21 // 'M'
#define tkT (JTK) 22 // 'T'
#define unk (JTK) 23 // Unexpected character
#define eof (JTK) 24 // end-of-file
#define err (JTK) 25 // read error
#define nTokens 26 // number of token values above (rows in the state table)
// Lookup table for ISO-2022-7 encoded files
//
// Maps each possible byte value (0x00-0xff) to its token. The table is
// laid out as 16 rows of 16 entries: the row is the high nibble of the
// byte (noted at the end of each heading comment) and the column is the
// low nibble.
//
//   0x00-0x7f  txt, except for the bytes that can take part in an escape
//              or shift sequence (so, si, esc, $ & ( ) @ A-E G-M T)
//   0x80-0xa0  unk
//   0xa1-0xdf  ext
//   0xe0-0xff  unk
//
static JTK _rgjtkCharClass[256] =
// 0 1 2 3 4 5 6 7 8 9 a b c d e f
{
// nul soh stx etx eot enq ack bel bs tab lf vt np cr so si 0
txt, txt, txt, txt, txt, txt, txt, txt, txt, txt, txt, txt, txt, txt, so, si,
// dle dc1 dc2 dc3 dc4 nak syn etb can em sub esc fs gs rs us 1
// (0x1a, sub / ctrl-Z, is plain text here; it is unrelated to the eof token)
txt, txt, txt, txt, txt, txt, txt, txt, txt, txt, txt, esc, txt, txt, txt, txt,
// sp ! " # $ % & ' ( ) * + , - . / 2
txt, txt, txt, txt, dlr, txt, amp, txt, opr, cpr, txt, txt, txt, txt, txt, txt,
// 0 1 2 3 4 5 6 7 8 9 : ; < = > ? 3
txt, txt, txt, txt, txt, txt, txt, txt, txt, txt, txt, txt, txt, txt, txt, txt,
// @ A B C D E F G H I J K L M N O 4
at, tkA, tkB, tkC, tkD, tkE, txt, tkG, tkH, tkI, tkJ, tkK, tkL, tkM, txt, txt,
// P Q R S T U V W X Y Z [ \ ] ^ _ 5
txt, txt, txt, txt, tkT, txt, txt, txt, txt, txt, txt, txt, txt, txt, txt, txt,
// ` a b c d e f g h i j k l m n o 6
txt, txt, txt, txt, txt, txt, txt, txt, txt, txt, txt, txt, txt, txt, txt, txt,
// p q r s t u v w x y z { | } ~ del 7
txt, txt, txt, txt, txt, txt, txt, txt, txt, txt, txt, txt, txt, txt, txt, txt,
// 8
unk, unk, unk, unk, unk, unk, unk, unk, unk, unk, unk, unk, unk, unk, unk, unk,
// 9
unk, unk, unk, unk, unk, unk, unk, unk, unk, unk, unk, unk, unk, unk, unk, unk,
// a (0xa0 itself is unk; 0xa1 onward is ext)
unk, ext, ext, ext, ext, ext, ext, ext, ext, ext, ext, ext, ext, ext, ext, ext,
// b
ext, ext, ext, ext, ext, ext, ext, ext, ext, ext, ext, ext, ext, ext, ext, ext,
// c
ext, ext, ext, ext, ext, ext, ext, ext, ext, ext, ext, ext, ext, ext, ext, ext,
// d
ext, ext, ext, ext, ext, ext, ext, ext, ext, ext, ext, ext, ext, ext, ext, ext,
// e
unk, unk, unk, unk, unk, unk, unk, unk, unk, unk, unk, unk, unk, unk, unk, unk,
// f
unk, unk, unk, unk, unk, unk, unk, unk, unk, unk, unk, unk, unk, unk, unk, unk,
// 0 1 2 3 4 5 6 7 8 9 a b c d e f
};
/*----------------------------------------------------------------------------
State Table
----------------------------------------------------------------------------*/
// Final states have the high-bit set. States that represent the reading
// of a valid character escape sequence also encode the character set
// "name" (moniker??) -- the state with the high bit masked off.
//
// Table State
//
// A state is a single unsigned byte: values below 0x80 are intermediate
// states (usable as a column index into the transition table), values
// with bit 0x80 set are final states.
//
typedef unsigned char TST;
// Final State Mask, Related
//
// grfFinal is the bit that marks a state as final. _NEscTypeFromState
// strips that bit, leaving the low seven bits of the state as an int.
//
#define grfFinal (TST) 0x80
#define _NEscTypeFromState(nState) (int) ((nState) & 0x7f)
// Final states. Each is grfFinal or'ed with a small identifier. The
// identifiers 0x00-0x11 name the character set selected by a recognized
// escape sequence; the identifiers above cCsEsc are special, non
// character-set outcomes.
//
// ASCII Escape Sequence (Final State)
#define ASC (TST) (grfFinal | 0x00) // Ascii
// Japanese Escape Sequences (Final States)
#define JS0 (TST) (grfFinal | 0x01) // JIS-Roman
#define JS1 (TST) (grfFinal | 0x02) // Half-Width Katakana
#define JS2 (TST) (grfFinal | 0x03) // JIS C 6226-1978
#define JS3 (TST) (grfFinal | 0x04) // JIS X 0208-1983
#define JS4 (TST) (grfFinal | 0x05) // JIS X 0208-1990
#define JS5 (TST) (grfFinal | 0x06) // JIS X 0212-1990
// Chinese (PRC) Escape Sequences (Final States)
#define CS0 (TST) (grfFinal | 0x07) // GB 1988-89 Roman
#define CS1 (TST) (grfFinal | 0x08) // GB 2312-80
// Chinese (Taiwan) Escape Sequences (Final States)
#define TS0 (TST) (grfFinal | 0x09) // CNS 11643-1992 Plane 1
#define TS1 (TST) (grfFinal | 0x0a) // CNS 11643-1992 Plane 2
#define TS2 (TST) (grfFinal | 0x0b) // CNS 11643-1992 Plane 3
#define TS3 (TST) (grfFinal | 0x0c) // CNS 11643-1992 Plane 4
#define TS4 (TST) (grfFinal | 0x0d) // CNS 11643-1992 Plane 5
#define TS5 (TST) (grfFinal | 0x0e) // CNS 11643-1992 Plane 6
#define TS6 (TST) (grfFinal | 0x0f) // CNS 11643-1992 Plane 7
// Korean Escape Sequences (Final State)
#define KS0 (TST) (grfFinal | 0x10) // KS C 5601-1992
// Document "Signal" for ISO-2022-KR (Doc needs special processing)
#define KSD (TST) (grfFinal | 0x11) // ISO-2022-KR Document Signal
// Number of unique *character set* escape sequences
// (identifiers 0x00 through 0x11 above)
//
#define cCsEsc 18
// Special States (not escape sequence) (Final States)
//
// Numbered from cCsEsc + 1 upward; the identifier cCsEsc itself is not
// assigned to any state.
//
#define TXT (TST) (grfFinal | (cCsEsc + 1)) // Process Text
#define EXT (TST) (grfFinal | (cCsEsc + 2)) // Process (Possibly Illegal) Extended Chars
#define FIN (TST) (grfFinal | (cCsEsc + 3)) // Finish
#define EOI (TST) (grfFinal | (cCsEsc + 4)) // Unexpected End-Of-Input
#define UNK (TST) (grfFinal | (cCsEsc + 5)) // Unknown State (Unexpected Character)
#define ERR (TST) (grfFinal | (cCsEsc + 6)) // Read Error
// Shift Sequences (do not specify character set) (Final States)
//
#define LSO (TST) (grfFinal | (cCsEsc + 7)) // Locking shift out (g1 into GL)
#define LSI (TST) (grfFinal | (cCsEsc + 8)) // Locking shift in (g0 into GL)
// For convenience, also define constants for the sets
// that the states represent.
//
// Each csXXX is the matching final state with the grfFinal bit removed,
// as a plain int. csNIL (-1) cannot collide with any of them because the
// masked values are never negative.
//
#define csNIL (-1) // Invalid Designator
#define csASC (_NEscTypeFromState(ASC)) // Ascii
#define csJS0 (_NEscTypeFromState(JS0)) // JIS-Roman
#define csJS1 (_NEscTypeFromState(JS1)) // Half-Width Katakana
#define csJS2 (_NEscTypeFromState(JS2)) // JIS C 6226-1978
#define csJS3 (_NEscTypeFromState(JS3)) // JIS X 0208-1983
#define csJS4 (_NEscTypeFromState(JS4)) // JIS X 0208-1990
#define csJS5 (_NEscTypeFromState(JS5)) // JIS X 0212-1990
#define csCS0 (_NEscTypeFromState(CS0)) // GB 1988-89 Roman
#define csCS1 (_NEscTypeFromState(CS1)) // GB 2312-80
#define csTS0 (_NEscTypeFromState(TS0)) // CNS 11643-1992 Plane 1
#define csTS1 (_NEscTypeFromState(TS1)) // CNS 11643-1992 Plane 2
#define csTS2 (_NEscTypeFromState(TS2)) // CNS 11643-1992 Plane 3
#define csTS3 (_NEscTypeFromState(TS3)) // CNS 11643-1992 Plane 4
#define csTS4 (_NEscTypeFromState(TS4)) // CNS 11643-1992 Plane 5
#define csTS5 (_NEscTypeFromState(TS5)) // CNS 11643-1992 Plane 6
#define csTS6 (_NEscTypeFromState(TS6)) // CNS 11643-1992 Plane 7
#define csKS0 (_NEscTypeFromState(KS0)) // KS C 5601-1992 (into G0)
#define csKSD (_NEscTypeFromState(KSD)) // KS C 5601-1992 (into G1)
// Table States (Intermediate States)
//
// These are the only states that have a column in the transition table.
// ST0 is the start state; ST1-ST9 each record a partially read escape
// sequence. None of them has the grfFinal bit set.
//
#define ST0 (TST) 0
#define ST1 (TST) 1
#define ST2 (TST) 2
#define ST3 (TST) 3
#define ST4 (TST) 4
#define ST5 (TST) 5
#define ST6 (TST) 6
#define ST7 (TST) 7
#define ST8 (TST) 8
#define ST9 (TST) 9
// Number of "real" (table) states
// (ST0 through ST9 -- the column count of the transition table)
//
#define nStates 10
// Non-zero when the state has the final-state bit set, i.e. the lexer
// should stop and act on it rather than look it up in the table again.
#define IsFinal(state) ((state) & grfFinal)
// State Have Seen Looking For
// ----------------------------------------------------------
// ST0 -- Start State -- <ESC> Text
// ST1 <ESC> $ & (
// ST2 <ESC> $ ( ) @ A B (**)
// ST3 <ESC> $ ( @ A B C D E G H I J K L M
// ST4 <ESC> $ ) C
// ST5 <ESC> & @
// ST6 <ESC> & @ <ESC>
// ST7 <ESC> & @ <ESC> $
// ST8 <ESC> & @ <ESC> $ B
// ST9 <ESC> ( B H I J T
//
// (**) "<ESC> $ ID" is a synonym of "<ESC> $ ( ID" for ID=(@, A, B)
//
// Because of the large number of tokens, this table is
// inverted (tokens x states).
//
static signed char _rgchNextState[nTokens][nStates] =
{
//
// S S S S S S S S S S
// T T T T T T T T T T
// 0 1 2 3 4 5 6 7 8 9
//--------------------------------------------------------------------
//
/* txt */ TXT, UNK, UNK, UNK, UNK, UNK, UNK, UNK, UNK, UNK,
/* ext */ EXT, UNK, UNK, UNK, UNK, UNK, UNK, UNK, UNK, UNK,
/* esc */ ST1, UNK, UNK, UNK, UNK, UNK, ST7, UNK, UNK, UNK,
/* si */ LSI, UNK, UNK, UNK, UNK, UNK, UNK, UNK, UNK, UNK,
/* so */ LSO, UNK, UNK, UNK, UNK, UNK, UNK, UNK, UNK, UNK,
/* $ */ TXT, ST2, UNK, UNK, UNK, UNK, UNK, ST8, UNK, UNK,
/* @ */ TXT, UNK, JS2, JS2, UNK, ST6, UNK, UNK, UNK, UNK,
/* & */ TXT, ST5, UNK, UNK, UNK, UNK, UNK, UNK, UNK, UNK,
/* ( */ TXT, ST9, ST3, UNK, UNK, UNK, UNK, UNK, UNK, UNK,
/* ) */ TXT, UNK, ST4, UNK, UNK, UNK, UNK, UNK, UNK, UNK,
/* A */ TXT, UNK, CS1, CS1, UNK, UNK, UNK, UNK, UNK, UNK,
/* B */ TXT, UNK, JS3, JS3, UNK, UNK, UNK, UNK, JS4, ASC,
/* C */ TXT, UNK, UNK, KS0, KSD, UNK, UNK, UNK, UNK, UNK,
/* D */ TXT, UNK, UNK, JS5, UNK, UNK, UNK, UNK, UNK, UNK,
/* E */ TXT, UNK, UNK, UNK, UNK, UNK, UNK, UNK, UNK, UNK,
/* G */ TXT, UNK, UNK, TS0, UNK, UNK, UNK, UNK, UNK, UNK,
/* H */ TXT, UNK, UNK, TS1, UNK, UNK, UNK, UNK, UNK, JS0,
/* I */ TXT, UNK, UNK, TS2, UNK, UNK, UNK, UNK, UNK, JS1,
/* J */ TXT, UNK, UNK, TS3, UNK, UNK, UNK, UNK, UNK, JS0,
/* K */ TXT, UNK, UNK, TS4, UNK, UNK, UNK, UNK, UNK, UNK,
/* L */ TXT, UNK, UNK, TS5, UNK, UNK, UNK, UNK, UNK, UNK,
/* M */ TXT, UNK, UNK, TS6, UNK, UNK, UNK, UNK, UNK, UNK,
/* T */ TXT, UNK, UNK, UNK, UNK, UNK, UNK, UNK, UNK, CS0,
/* unk */ UNK, UNK, UNK, UNK, UNK, UNK, UNK, UNK, UNK, UNK,
/* eof */ FIN, EOI, EOI, EOI, EOI, EOI, EOI, EOI, EOI, EOI,
/* err */ ERR, ERR, ERR, ERR, ERR, ERR, ERR, ERR, ERR, ERR,
};
// Used for ISO-2022 output as well. For each ISO-2022 encoding there is a
// list of the non-ascii character sets that may be tried, ordered from
// most to least likely to match (for ISO-2022-JP, JS3 comes first as the
// most common set). Every list is terminated by -1; an encoding with no
// candidate sets has a list holding only the terminator.
//

// China (icetIso2022Cn)
static int _rgceCn[] = { -1 };

// Japan (icetIso2022Jp)
static int _rgceJp[] = { csJS3, csJS1, csJS5, -1 };

// Korea (icetIso2022Kr)
static int _rgceKr[] = { -1 };

// Taiwan (icetIso2022Tw)
static int _rgceTw[] = { -1 };

// Encoding type -> candidate list. Only the ISO-2022 encodings have a
// list; every other encoding type maps to a null pointer.
static int *_mpicetrgce[icetCount] =
{
    NULL,       // icetEucCn
    NULL,       // icetEucJp
    NULL,       // icetEucKr
    NULL,       // icetEucTw
    _rgceCn,    // icetIso2022Cn
    _rgceJp,    // icetIso2022Jp
    _rgceKr,    // icetIso2022Kr
    _rgceTw,    // icetIso2022Tw
    NULL,       // icetBig5
    NULL,       // icetGbk
    NULL,       // icetShiftJis
    NULL,       // icetWansung
    NULL,       // icetUtf8
};
/* _ J T K G E T N E X T */
/*----------------------------------------------------------------------------
%%Function: _JtkGetNext
%%Contact: jpick
Read one byte from the stream into *puch and classify it.

Returns:
    err -- the read failed (the stream returned a failure HRESULT)
    eof -- the read did not fail but delivered no byte
    otherwise, the token class of the byte read (from _rgjtkCharClass)

*puch is only meaningful when the result is neither err nor eof.
----------------------------------------------------------------------------*/
static JTK __inline _JtkGetNext(IStream *pstmIn, PUCHAR puch)
{
    // Start at zero so that a stream which does not fill in the count on
    // a short read is still seen as having delivered nothing.
    ULONG cbRead = 0;
    HRESULT hr = pstmIn->Read(puch, 1, &cbRead);

    // Only failure codes (negative HRESULTs) are read errors. A stream may
    // report running out of data with the success code S_FALSE; comparing
    // against S_OK alone misreported that ordinary end-of-input as a read
    // error.
    if (hr < 0)
        return err;

    if (cbRead == 0)
        return eof;

    return _rgjtkCharClass[*puch];
}
/* C C E R E A D E S C S E Q */
/*----------------------------------------------------------------------------
%%Function: CceReadEscSeq
%%Contact: jpick
Read pointer is positioned at an escape sequence; work out which escape
sequence it is.

Bytes are pulled from pstmIn and fed through the state table, starting
in ST0, until a final state is reached. That state decides the result:

    cceSuccess       -- a character set designator was recognized and
                        *lpicet has been set to the matching ISO-2022
                        flavor
    cceMayBeAscii    -- ascii designator, shift, plain or extended text,
                        or clean end of input: not enough to pick a flavor
    cceRead          -- the stream reported a read error
    cceUnknownInput  -- unexpected character or input ended mid-sequence

*lpicet is written only on cceSuccess. Debug builds additionally reject
null arguments with cceInvalidParameter.
----------------------------------------------------------------------------*/
CCE CceReadEscSeq(IStream *pstmIn, ICET *lpicet)
{
    UCHAR uchIn;
    JTK jtkIn;
    TST tstState = ST0;

#ifdef DEBUG
    TST tstBefore;

    // Sanity checks (debug builds only) ...
    //
    if (!pstmIn || !lpicet)
        return cceInvalidParameter;
#endif

    // Run the machine to the next stopping state. ST0 is not final, so
    // at least one byte is always requested from the stream.
    //
    while (!IsFinal(tstState))
    {
        // Get the next character and classify it.
        //
        jtkIn = _JtkGetNext(pstmIn, &uchIn);

#ifdef DEBUG
        // Remember where we came from; for inspection in a debugger only.
        //
        tstBefore = tstState;
#endif

        // Transition -- the table is token-major, the reverse of the
        // usual state-major layout.
        //
        tstState = _rgchNextState[jtkIn][tstState];
    }

    switch (tstState)
    {
    case JS0:   // JIS-Roman
    case JS1:   // Half-Width Katakana
    case JS2:   // JIS C 6226-1978
    case JS3:   // JIS X 0208-1983
    case JS4:   // JIS X 0208-1990
    case JS5:   // JIS X 0212-1990
        *lpicet = icetIso2022Jp;
        return cceSuccess;

    case CS0:   // GB 1988-89 Roman
    case CS1:   // GB 2312-80
        *lpicet = icetIso2022Cn;
        return cceSuccess;

    case TS0:   // CNS 11643-1992 Plane 1
    case TS1:   // CNS 11643-1992 Plane 2
    case TS2:   // CNS 11643-1992 Plane 3
    case TS3:   // CNS 11643-1992 Plane 4
    case TS4:   // CNS 11643-1992 Plane 5
    case TS5:   // CNS 11643-1992 Plane 6
    case TS6:   // CNS 11643-1992 Plane 7
        *lpicet = icetIso2022Tw;
        return cceSuccess;

    case KS0:   // KS C 5601-1992
    case KSD:   // ISO-2022-KR Document Signal
        *lpicet = icetIso2022Kr;
        return cceSuccess;

    case ASC:   // Ascii
    case LSO:
    case LSI:
    case TXT:
    case EXT:
    case FIN:
        // Insufficient information to choose a flavor ...
        return cceMayBeAscii;

    case ERR:
        return cceRead;

    default:    // UNK, EOI
        break;
    }

    return cceUnknownInput;
}