Windows2003-3790/base/win32/winnls/data/dlls/tools/gb18030/gbunicnv/conveng.cpp
2020-09-30 16:53:55 +02:00

763 lines
21 KiB
C++

#include "stdafx.h"
#include "conveng.h"
#include "convdata.tbl"
// This file contains 3 parts:
// First part: basic service functions for Ansi char format conversion,
// distance/advance calculation, and a binary search algorithm copied from STL
// Second part: Unicode to Ansi
// Third part: Ansi to Unicode
// ****************************************************************************
// First part, Ansi char convert functions
//
// This part does not use any data from the .tbl file
// ****************************************************************************
// Binary search algorithm
// Copied from STL with only minor modifications
template <class RandomAccessIterator, class T>
RandomAccessIterator __lower_bound(RandomAccessIterator first,
RandomAccessIterator last, const T& value) {
INT_PTR len = last - first;
INT_PTR half;
RandomAccessIterator middle;
while (len > 0) {
half = len / 2;
middle = first + half;
if (*middle < value) {
first = middle + 1;
len = len - half - 1;
} else {
len = half;
}
}
return first;
}
template <class RandomAccessIterator, class T>
RandomAccessIterator __upper_bound(RandomAccessIterator first,
RandomAccessIterator last, const T& value) {
DWORD len = last - first;
DWORD half;
RandomAccessIterator middle;
while (len > 0) {
half = len / 2;
middle = first + half;
if (!(value < *middle)) {
first = middle + 1;
len = len - half - 1;
} else {
len = half;
}
}
return first;
}
template<class T>
inline ValueIn(
T Value,
T Low,
T High)
{
return (Value >= Low && Value < High);
}
inline BOOL IsValidSurrogateLeadWord(
WCHAR wchUnicode)
{
return ValueIn(wchUnicode, cg_wchSurrogateLeadWordLow, cg_wchSurrogateLeadWordHigh);
}
inline BOOL IsValidSurrogateTailWord(
WCHAR wchUnicode)
{
return ValueIn(wchUnicode, cg_wchSurrogateTailWordLow, cg_wchSurrogateTailWordHigh);
}
inline BOOL IsValidQByteAnsiLeadByte(
BYTE byAnsi)
{
return ValueIn(byAnsi, cg_byQByteAnsiLeadByteLow, cg_byQByteAnsiLeadByteHigh);
}
inline BOOL IsValidQByteAnsiTailByte(
BYTE byAnsi)
{
return ValueIn(byAnsi, cg_byQByteAnsiTailByteLow, cg_byQByteAnsiTailByteHigh);
}
// Generate QByte Ansi. The Ansi char is in DWORD format,
// in another word, it's in reverse order of GB18030 standard
DWORD QByteAnsiBaseAddOffset(
DWORD dwBaseAnsi, // In reverse order
int nOffset)
{
DWORD dwAnsi = dwBaseAnsi;
PBYTE pByte = (PBYTE)&dwAnsi;
// dwOffset should less than 1M
ASSERT (nOffset < 0x100000);
nOffset += pByte[0] - 0x30;
pByte[0] = 0x30 + nOffset % 10;
nOffset /= 10;
nOffset += pByte[1] - 0x81;
pByte[1] = 0x81 + nOffset % 126;
nOffset /= 126;
nOffset += pByte[2] - 0x30;
pByte[2] = 0x30 + nOffset % 10;
nOffset /= 10;
nOffset += pByte[3] - 0x81;
pByte[3] = 0x81 + nOffset % 126;
nOffset /= 126;
ASSERT(nOffset == 0);
return dwAnsi;
}
// Get "distance" of 2 QByte Ansi
int CalcuDistanceOfQByteAnsi(
DWORD dwAnsi1, // In reverse order
DWORD dwAnsi2) // In reverse order
{
signed char* pschAnsi1 = (signed char*)&dwAnsi1;
signed char* pschAnsi2 = (signed char*)&dwAnsi2;
int nDistance = 0;
nDistance += (pschAnsi1[0] - pschAnsi2[0]);
nDistance += (pschAnsi1[1] - pschAnsi2[1])*10;
nDistance += (pschAnsi1[2] - pschAnsi2[2])*1260;
nDistance += (pschAnsi1[3] - pschAnsi2[3])*12600;
return nDistance;
}
// Reverse 4 Bytes order, from DWORD format to GB format,
// or GB to DWORD
void ReverseQBytesOrder(
PBYTE pByte)
{
BYTE by;
by = pByte[0];
pByte[0] = pByte[3];
pByte[3] = by;
by = pByte[1];
pByte[1] = pByte[2];
pByte[2] = by;
return;
}
// ****************************************************************************
// Second part, Unicode to Ansi
// ****************************************************************************
// ------------------------------------------------
// Two helper functions for UnicodeToAnsi
// Each returns an Ansi char code;
// the Ansi code is in GB standard order (not Word value order)
//
// Unicode to double bytes Ansi char
//
// Return Ansi char code, 0 means fail (internal error, etc.)
//
WORD UnicodeToDByteAnsi(
WCHAR wchUnicode)
{
char achAnsiBuf[4];
WORD wAnsi = 0;
int cLen = 0;
// Code changed from GBK to GB18030, or code not compatible
// from CP936 to CP54936
for (int i = 0; i < sizeof(asAnsiCodeChanged)/sizeof(SAnsiCodeChanged); i++) {
if (wchUnicode == asAnsiCodeChanged[i].wchUnicode) {
wAnsi = asAnsiCodeChanged[i].wchAnsiNew;
goto Exit;
}
}
// Not in Changed code list, that is same with GBK, or CP936
// (Most DByte Ansi char code should compatible from GBK to GB18030)
cLen = WideCharToMultiByte(936,
WC_COMPOSITECHECK, &wchUnicode, 1,
achAnsiBuf, sizeof(achAnsiBuf)-1, NULL, NULL);
if (cLen != 2) {
ASSERT(cLen == 2);
wAnsi = 0;
} else {
wAnsi = *(PWORD)achAnsiBuf;
}
Exit:
return wAnsi;
}
// Unicode to quad bytes Ansi char
//
// Return Ansi char code
// 0 means fail (interal error)
//
DWORD UnicodeToQByteAnsi(
int nSection,
int nOffset)
{
DWORD dwBaseAnsi;
if (nSection < 0 || nSection >= sizeof(adwAnsiQBytesAreaStartValue)/sizeof(DWORD)) {
ASSERT(FALSE);
return 0;
}
dwBaseAnsi = adwAnsiQBytesAreaStartValue[nSection];
// Check adwAnsiQByteAreaStartValue array is correctly
#ifdef _DEBUG
int ncQByteAnsiNum = 0;
for (int i = 0; i < nSection; i++) {
// Calcu QByte Ansi char numbers
ncQByteAnsiNum += awchAnsiDQByteBound[2*i+1] - awchAnsiDQByteBound[2*i];
}
ASSERT(dwBaseAnsi == QByteAnsiBaseAddOffset(cg_dwQByteAnsiStart, ncQByteAnsiNum));
#endif
DWORD dwAnsi = QByteAnsiBaseAddOffset(dwBaseAnsi, nOffset);
// Value order to standard order
ReverseQBytesOrder((PBYTE)(&dwAnsi));
return dwAnsi;
}
// ---------------------------------------------------------
// Two functions that translate 2-byte Unicode (BMP)
// and 4-byte Unicode (Surrogate) to Ansi
// 2 bytes Unicode (BMP)
// Return Ansi str len, when success, should be 2 or 4;
// return 0 means fail (internal error, etc.)
int UnicodeToAnsi(
WCHAR wchUnicode,
char* pchAnsi,
DWORD dwBufSize)
{
// Classic Unicode, not support surrogate in this function
ASSERT(!IsValidSurrogateLeadWord(wchUnicode)
&& !IsValidSurrogateTailWord(wchUnicode));
DWORD lAnsiLen = 0;
const WORD* p;
INT_PTR i;
// ASCII, 0 - 0x7f
if (wchUnicode <= 0x7f) {
*pchAnsi = (char)wchUnicode;
lAnsiLen = 1;
goto Exit;
}
// BMP, 4 byte or 2 byte
p = __lower_bound(awchAnsiDQByteBound, awchAnsiDQByteBound
+ sizeof(awchAnsiDQByteBound)/sizeof(WCHAR), wchUnicode);
if (p == awchAnsiDQByteBound
+ sizeof(awchAnsiDQByteBound)/sizeof(WCHAR)) {
p --;
} else if (wchUnicode < *p) {
p --;
} else if (wchUnicode == *p) {
} else {
ASSERT(FALSE);
}
i = p - awchAnsiDQByteBound;
ASSERT(i >= 0);
// Stop when >= *(((PWORD)asAnsi2ByteArea) + i);
if (i%2) { // Odd, in 2 bytes area
WORD wAnsi = UnicodeToDByteAnsi(wchUnicode);
if (wAnsi && dwBufSize >= 2) {
*(UNALIGNED WORD*)pchAnsi = wAnsi;
lAnsiLen = 2;
} else {
lAnsiLen = 0;
}
} else { // Duel, in 4 bytes area
DWORD dwAnsi = UnicodeToQByteAnsi
((int)i/2, wchUnicode - awchAnsiDQByteBound[i]);
if (dwAnsi && dwBufSize >= 4) {
*(UNALIGNED DWORD*)pchAnsi = dwAnsi;
lAnsiLen = 4;
} else {
lAnsiLen = 0;
}
}
Exit:
return lAnsiLen;
}
// 4 bytes Unicode (Surrogate)
// Return Ansi str length, when success, should be 4
// return 0 means fail (Buffer overflow)
int SurrogateToAnsi(
PCWCH pwchUnicode,
PCHAR pchAnsi,
DWORD dwBufSize)
{
ASSERT(IsValidSurrogateLeadWord(pwchUnicode[0]));
ASSERT(IsValidSurrogateTailWord(pwchUnicode[1]));
// dwOffset is ISO char code - 0x10000
DWORD dwOffset = ((pwchUnicode[0] - cg_wchSurrogateLeadWordLow)<<10)
+ (pwchUnicode[1] - cg_wchSurrogateTailWordLow)
+ 0x10000 - 0x10000;
if (dwBufSize < 4) {
return 0;
}
*(UNALIGNED DWORD*)pchAnsi = QByteAnsiBaseAddOffset
(cg_dwQByteAnsiToSurrogateStart, dwOffset);
ReverseQBytesOrder((PBYTE)pchAnsi);
return 4;
}
// API: high level service for Unicode to Ansi
// return result Ansi str length (in byte)
// return -1 means fail (Buffer overflow, internal error, etc.)
int UnicodeStrToAnsiStr(
PCWCH pwchUnicodeStr,
int ncUnicodeStr, // in WCHAR
PCHAR pchAnsiStrBuf,
int ncAnsiStrBufSize) // in BYTE
{
int ncAnsiStr = 0;
int ncAnsiCharSize;
for (int i = 0; i < ncUnicodeStr; i++, pwchUnicodeStr++) {
if (ncAnsiStr > (ncAnsiStrBufSize-4)) {
// Buffer overflow
break;
}
if (IsValidSurrogateLeadWord(pwchUnicodeStr[0])) {
if ((i+1 < ncUnicodeStr)
&& (IsValidSurrogateTailWord(pwchUnicodeStr[1]))) {
ncAnsiCharSize = SurrogateToAnsi(pwchUnicodeStr, pchAnsiStrBuf, 4);
ASSERT(ncAnsiCharSize == 4);
if (ncAnsiCharSize == 0) {
ASSERT(FALSE);
break;
}
ncAnsiStr += ncAnsiCharSize;
pchAnsiStrBuf += ncAnsiCharSize;
pwchUnicodeStr++;
i++;
} else {
// Invalide Uncode char, skip
}
} else if (*pwchUnicodeStr == 0) {
*pchAnsiStrBuf = 0;
pchAnsiStrBuf ++;
ncAnsiStr ++;
} else {
ncAnsiCharSize = UnicodeToAnsi(*pwchUnicodeStr, pchAnsiStrBuf, 4);
if (ncAnsiCharSize == 0) {
ASSERT(FALSE);
break;
}
pchAnsiStrBuf += ncAnsiCharSize;
ncAnsiStr += ncAnsiCharSize;
}
}
if (i < ncUnicodeStr) { return -1; }
return ncAnsiStr;
}
// ****************************************************************************
// Third part, Ansi to Unicode
// ****************************************************************************
// Return Unicode number (number always equal to 1 when success)
// return 0 if can't find corresponding Unicode
// -1 means fail (internal error, etc.)
int QByteAnsiToSingleUnicode(
DWORD dwAnsi,
PWCH pwchUnicode)
{
const DWORD* p;
INT_PTR i;
// 0x8431a439(cg_dwQByteAnsiToBMPLast) to 0x85308130 haven't Unicode corresponding
// 0x85308130 to 0x90308130(cg_dwQByteAnsiToSurrogateStart) are reserved zone,
// haven't Unicode corresponding
if (dwAnsi > cg_dwQByteAnsiToBMPLast) {
return 0;
}
// Invalid input value
if (dwAnsi < adwAnsiQBytesAreaStartValue[0]) {
return -1;
}
p = __lower_bound(adwAnsiQBytesAreaStartValue,
adwAnsiQBytesAreaStartValue + sizeof(adwAnsiQBytesAreaStartValue)/sizeof(DWORD),
dwAnsi);
if (p == adwAnsiQBytesAreaStartValue
+ sizeof(adwAnsiQBytesAreaStartValue)/sizeof(DWORD)) {
p --;
} else if (dwAnsi < *p) {
p --;
} else if (dwAnsi == *p) {
} else {
ASSERT(FALSE);
}
i = p - adwAnsiQBytesAreaStartValue;
if (i < 0) {
ASSERT(i >= 0);
return -1;
}
*pwchUnicode = awchAnsiDQByteBound[2*i] + CalcuDistanceOfQByteAnsi(dwAnsi, *p);
#ifdef _DEBUG
{
int nAnsiCharDistance = CalcuDistanceOfQByteAnsi(dwAnsi, *p);
ASSERT(nAnsiCharDistance >= 0);
WCHAR wchUnicodeDbg;
if ((p+1) < adwAnsiQBytesAreaStartValue
+ sizeof(adwAnsiQBytesAreaStartValue)/sizeof(DWORD)) {
nAnsiCharDistance = CalcuDistanceOfQByteAnsi(dwAnsi, *(p+1));
wchUnicodeDbg = awchAnsiDQByteBound[2*i+1] + nAnsiCharDistance;
} else if ((p+1) == adwAnsiQBytesAreaStartValue
+ sizeof(adwAnsiQBytesAreaStartValue)/sizeof(DWORD)) {
nAnsiCharDistance = CalcuDistanceOfQByteAnsi(dwAnsi, 0x8431A530);
wchUnicodeDbg = 0x10000 + nAnsiCharDistance;
} else {
ASSERT(FALSE);
}
ASSERT(nAnsiCharDistance < 0);
ASSERT(wchUnicodeDbg == *pwchUnicode);
}
#endif
return 1;
}
// Return Unicode number (number always 2 when success)
// return 0 if can't find corresponding Unicode
int QByteAnsiToDoubleUnicode(
DWORD dwAnsi,
PWCH pwchUnicode)
{
int nDistance = CalcuDistanceOfQByteAnsi(dwAnsi, cg_dwQByteAnsiToSurrogateStart);
ASSERT (nDistance >= 0);
if (nDistance >= 0x100000) {
return 0;
}
pwchUnicode[1] = nDistance % 0x400 + 0xDC00;
pwchUnicode[0] = nDistance / 0x400 + 0xD800;
return 2;
}
// Return Unicode number (1 or 2 when success)
// return 0 if can't find corresponding Unicode
// return -1 if fail (Buffer overflow, invalid GB char code input,
// internal error, etc.)
int QByteAnsiToUnicode(
const BYTE* pbyAnsiChar,
PWCH pwchUnicode,
DWORD dwBufLen) // In WCHAR
{
DWORD dwAnsi;
int nLen = -1;
if ( IsValidQByteAnsiLeadByte(pbyAnsiChar[0])
&& IsValidQByteAnsiTailByte(pbyAnsiChar[1])
&& IsValidQByteAnsiLeadByte(pbyAnsiChar[2])
&& IsValidQByteAnsiTailByte(pbyAnsiChar[3])) {
} else {
return -1; // Invalid char
}
dwAnsi = *(UNALIGNED DWORD*)pbyAnsiChar;
ReverseQBytesOrder((PBYTE)(&dwAnsi));
if (dwAnsi >= cg_dwQByteAnsiToSurrogateStart) {
if (dwBufLen >= 2) {
nLen = QByteAnsiToDoubleUnicode(dwAnsi, pwchUnicode);
}
} else {
if (dwBufLen >= 1) {
nLen = QByteAnsiToSingleUnicode(dwAnsi, pwchUnicode);
}
}
return nLen;
}
// Unicode to double bytes Ansi char
// Return: Unicode char code, 0 means fail (internal error, etc.)
WCHAR DByteAnsiToUnicode(
const BYTE* pbyAnsi)
{
WORD wAnsi = *(UNALIGNED WORD*)pbyAnsi;
int cLen = 1;
WCHAR wchUnicode;
// Code changed from GBK to GB18030, or code not compatible
// from CP936 to CP54936
for (int i = 0; i < sizeof(asAnsiCodeChanged)/sizeof(SAnsiCodeChanged); i++) {
if (wAnsi == asAnsiCodeChanged[i].wchAnsiNew) {
wchUnicode = asAnsiCodeChanged[i].wchUnicode;
goto Exit;
}
}
// Not in Changed code list, that is same with GBK, or CP936
// (Most DByte Ansi char code should compatible from GBK to GB18030)
cLen = MultiByteToWideChar(936, MB_PRECOMPOSED,
(PCCH)pbyAnsi, 2, &wchUnicode, 1);
if (cLen != 1) {
wchUnicode = 0;
}
Exit:
return wchUnicode;
}
// API: High level service for Ansi to Unicode
// return Unicode str length (in WCHAR)
// return -1 means fail (Buffer overflow, etc.)
int AnsiStrToUnicodeStr(
const BYTE* pbyAnsiStr,
int ncAnsiStrSize, // In char
PWCH pwchUnicodeBuf,
int ncBufLen) // In WCHAR
{
int nCharLen;
int ncUnicodeBuf = 0;
for (int i = 0; i < ncAnsiStrSize; ) {
if (ncUnicodeBuf > (ncBufLen-4)) {
// Buffer overflow
break;
}
// 1 byte Ansi char
if (*pbyAnsiStr < 0x80) {
*pwchUnicodeBuf = (WCHAR)*pbyAnsiStr;
pwchUnicodeBuf ++;
ncUnicodeBuf ++;
i++;
pbyAnsiStr++;
// 2 byte Ansi char
} else if ((i+1 < ncAnsiStrSize) && pbyAnsiStr[1] >= 0x40) {
*pwchUnicodeBuf = DByteAnsiToUnicode(pbyAnsiStr);
if (*pwchUnicodeBuf == 0) {
*pwchUnicodeBuf = '?';
}
pwchUnicodeBuf ++;
ncUnicodeBuf ++;
i += 2;
pbyAnsiStr += 2;
// 4 byte Ansi char
} else if ((i+3 < ncAnsiStrSize)
&& IsValidQByteAnsiLeadByte(pbyAnsiStr[0])
&& IsValidQByteAnsiTailByte(pbyAnsiStr[1])
&& IsValidQByteAnsiLeadByte(pbyAnsiStr[2])
&& IsValidQByteAnsiTailByte(pbyAnsiStr[3])) {
// QByte GB char
nCharLen = QByteAnsiToUnicode(pbyAnsiStr, pwchUnicodeBuf, 4);
if (nCharLen < 0) {
ASSERT(FALSE); // Invalid Ansi char input, or buffer overflow, etc.
// Should never happen but an internal error
break;
} else if (nCharLen == 0) { // hasn't corresponding Unicode Char
*pwchUnicodeBuf = '?';
pwchUnicodeBuf ++;
ncUnicodeBuf ++;
} else if (nCharLen > 0) {
ASSERT(nCharLen <= 2);
pwchUnicodeBuf += nCharLen;
ncUnicodeBuf += nCharLen;
} else {
ASSERT(FALSE);
}
i += 4;
pbyAnsiStr += 4;
// Invalid Ansi char
} else {
// Invalid
i++;
pbyAnsiStr++;
}
}
if (i < ncAnsiStrSize) { return -1; }
return ncUnicodeBuf;
}
// ******************************************************
// Testing program
// ******************************************************
/*
"\u0080", <0x81;0x30;0x81;0x30>
"\u00A3", <0x81;0x30;0x84;0x35>
"\u00A4", <0xA1;0xE8>
"\u00A5", <0x81;0x30;0x84;0x36>
"\u00A6", <0x81;0x30;0x84;0x37>
"\u00A7", <0xA1;0xEC>
"\u00A8", <0xA1;0xA7>
"\u00A9", <0x81;0x30;0x84;0x38>
"\u00AF", <0x81;0x30;0x85;0x34>
"\u00B0", <0xA1;0xE3>
"\u00B1", <0xA1;0xC0>
"\u00B2", <0x81;0x30;0x85;0x35>
{0x20AC, 0xe3a2},
{0x01f9, 0xbfa8},
{0x303e, 0x89a9},
{0x2ff0, 0x8aa9},
{0x2ff1, 0x8ba9},
50EF 836A
50F0 836B
50F1 836C
50F2 836D
*/
// Disabled (#if 0) smoke test for the two high level APIs.
// It runs UnicodeStrToAnsiStr over a table of Unicode values and
// AnsiStrToUnicodeStr over a table of GB byte sequences. The return values
// and the converted buffers are not checked; both buffers are simply freed.
// The trailing comments in the tables give the encoding expected for each entry.
#if 0
int test (void)
{
// Unicode input for UnicodeStrToAnsiStr
const WCHAR awchUnicodeStr[] = {0x01, 0x7f, 0x80, 0x81, 0x82,
0xa2,
0xa3, // 0x81;0x30;0x84;0x35
0xa4, // 0xA1;0xE8
0xa5, // 0x81;0x30;0x84;0x36
0xa6, // 0x81;0x30;0x84;0x37
0xaf, // 0x81;0x30;0x85;0x34
0xb0, // 0xA1;0xE3
0xb1, // 0xA1;0xC0
0xb6, // 0x81;0x30;0x85;0x39
0xb7, // 0xA1;0xA4
// Some normal DByte Ansi char
0x50ef, // 0x83, 0x6A
0x50f2, // 0x83, 0x6D
// Some ansi char code changed in new standard
0x20ac, // 0xa2, 0xe3
0xE76C, // not (0xa2, 0xe3), should some QByte char
0x2ff0, // 0xa9, 0x8A
0x2ff1, // 0xa9, 0x8B
0x4723, // 0xFE, 0x80
// Ansi char around DC00 to E000
0xd7ff, // 0x83, 0x36, 0xC7, 0x38
0xe76c, // 0x83, 0x36, 0xC7, 0x39
0xE76B, // 0xA2, 0xB0
0xffff, // 0x84, 0x31, 0xa4, 0x39,
0x00};
// Output buffer: 2 bytes per input byte plus 5 spare bytes
char* pchAnsiStr = new char[sizeof(awchUnicodeStr)*2+5];
UnicodeStrToAnsiStr(awchUnicodeStr, sizeof(awchUnicodeStr)/sizeof(WCHAR),
pchAnsiStr, sizeof(awchUnicodeStr)*2+5);
delete[] pchAnsiStr;
// GB byte input for AnsiStrToUnicodeStr
BYTE abyAnsiStr2[] = {
0x81, 0x30, 0x81, 0x30,
0x81, 0x30, 0x84, 0x35,
0xA1, 0xE8,
0x81, 0x30, 0x84, 0x36,
0x81, 0x30, 0x84, 0x37,
0xA1, 0xEC,
0xA1, 0xA7,
0x81, 0x30, 0x84, 0x38,
0x81, 0x30, 0x85, 0x34,
0xA1, 0xE3,
0xA1, 0xC0,
0x81, 0x30, 0x85, 0x35,
// Testing D800 to DE00
0x82, 0x35, 0x8f, 0x33, // 0x9FA6
0x83, 0x36, 0xC7, 0x38, // 0xD7FF
0xA2, 0xB0, // 0xE76B
0x83, 0x36, 0xC7, 0x39, // 0xE76C
// Testing last char in BMP
0x84, 0x31, 0xa4, 0x39, // 0xFFFF
// Some char code changed in new GB standard
0xa2, 0xe3, // 0x20AC,
0xa8, 0xbf, // 0x01f9,
0xa9, 0x89, // 0x303e,
0xa9, 0x8a, // 0x2ff0,
0xa9, 0x8b, // 0x2ff1,
0xFE, 0x9F, // 0x4dae
0x83, 0x6A, // 50EF
0x83, 0x6B, // 50F0
0x83, 0x6C, // 50F1
0x83, 0x6D // 50F2
};
// Output buffer: one WCHAR per input byte plus 3 spare WCHARs
WCHAR* pwchUnicodeStr2 = new WCHAR[sizeof(abyAnsiStr2)+3];
AnsiStrToUnicodeStr(abyAnsiStr2, sizeof(abyAnsiStr2),
pwchUnicodeStr2, sizeof(abyAnsiStr2)+3);
delete[] pwchUnicodeStr2;
return 0;
}
#endif