763 lines
21 KiB
C++
763 lines
21 KiB
C++
|
#include "stdafx.h"
|
||
|
#include "conveng.h"
|
||
|
|
||
|
#include "convdata.tbl"
|
||
|
|
||
|
// This file contains 3 parts:
//   First part:  basic service functions for Ansi char format conversion,
//                distance/advance calculation, and a binary search algorithm
//                copied from the STL
//   Second part: Unicode to Ansi
//   Third part:  Ansi to Unicode
|
||
|
|
||
|
|
||
|
// ****************************************************************************
|
||
|
// First part, Ansi char conversion functions
//
// This part does not use any of the data tables in the .tbl file
|
||
|
// ****************************************************************************
|
||
|
|
||
|
// Binary search: lower bound.
// Adapted from the STL with only minor modifications.
//
// Returns the first position in [first, last) whose element is not less
// than value, or last when every element compares less than value.
// The range is expected to be sorted ascending (binary search precondition).
template <class RandomAccessIterator, class T>
RandomAccessIterator __lower_bound(RandomAccessIterator first,
                                   RandomAccessIterator last, const T& value) {
    // Shrink [first, last) until it is empty; each step probes the element
    // at the midpoint of what is left.
    while (first < last) {
        RandomAccessIterator probe = first + (last - first) / 2;

        if (*probe < value) {
            // The probe and everything before it are too small.
            first = probe + 1;
        } else {
            // The probe may be the answer, so it becomes the new end.
            last = probe;
        }
    }
    return first;
}
|
||
|
|
||
|
// Binary search: upper bound.
//
// Returns the first position in [first, last) whose element is greater
// than value, or last when no element is greater than value.
// The range is expected to be sorted ascending (binary search precondition).
//
// The remaining range is tracked with the two iterators themselves rather
// than with a 32-bit unsigned length, so the iterator difference is never
// truncated and an empty or reversed range simply returns first.
template <class RandomAccessIterator, class T>
RandomAccessIterator __upper_bound(RandomAccessIterator first,
                                   RandomAccessIterator last, const T& value) {
    while (first < last) {
        RandomAccessIterator middle = first + (last - first) / 2;

        if (!(value < *middle)) {
            // *middle <= value: the answer lies strictly after middle.
            first = middle + 1;
        } else {
            // *middle > value: middle is a candidate, keep it as the end.
            last = middle;
        }
    }
    return first;
}
|
||
|
|
||
|
// Half-open range test.
//
// Returns true when Low <= Value < High, false otherwise.
// The return type is spelled out explicitly: the original relied on
// "implicit int", which standard C++ does not allow. The bool result
// converts implicitly to BOOL/int at the existing call sites.
template<class T>
inline bool ValueIn(
    T Value,
    T Low,
    T High)
{
    return (Value >= Low && Value < High);
}
|
||
|
|
||
|
// Reports whether wchUnicode falls in the half-open range
// [cg_wchSurrogateLeadWordLow, cg_wchSurrogateLeadWordHigh), i.e. whether
// it can be the lead (first) word of a surrogate pair.
inline BOOL IsValidSurrogateLeadWord(
    WCHAR wchUnicode)
{
    const BOOL fIsLeadWord = ValueIn(wchUnicode,
        cg_wchSurrogateLeadWordLow, cg_wchSurrogateLeadWordHigh);
    return fIsLeadWord;
}
|
||
|
|
||
|
// Reports whether wchUnicode falls in the half-open range
// [cg_wchSurrogateTailWordLow, cg_wchSurrogateTailWordHigh), i.e. whether
// it can be the tail (second) word of a surrogate pair.
inline BOOL IsValidSurrogateTailWord(
    WCHAR wchUnicode)
{
    const BOOL fIsTailWord = ValueIn(wchUnicode,
        cg_wchSurrogateTailWordLow, cg_wchSurrogateTailWordHigh);
    return fIsTailWord;
}
|
||
|
|
||
|
// Reports whether byAnsi falls in the half-open range
// [cg_byQByteAnsiLeadByteLow, cg_byQByteAnsiLeadByteHigh). Callers apply
// this test to the 1st and 3rd bytes of a four-byte Ansi char.
inline BOOL IsValidQByteAnsiLeadByte(
    BYTE byAnsi)
{
    const BOOL fIsLeadByte = ValueIn(byAnsi,
        cg_byQByteAnsiLeadByteLow, cg_byQByteAnsiLeadByteHigh);
    return fIsLeadByte;
}
|
||
|
|
||
|
// Reports whether byAnsi falls in the half-open range
// [cg_byQByteAnsiTailByteLow, cg_byQByteAnsiTailByteHigh). Callers apply
// this test to the 2nd and 4th bytes of a four-byte Ansi char.
inline BOOL IsValidQByteAnsiTailByte(
    BYTE byAnsi)
{
    const BOOL fIsTailByte = ValueIn(byAnsi,
        cg_byQByteAnsiTailByteLow, cg_byQByteAnsiTailByteHigh);
    return fIsTailByte;
}
|
||
|
|
||
|
// Generate QByte Ansi. The Ansi char is in DWORD format,
|
||
|
// in another word, it's in reverse order of GB18030 standard
|
||
|
DWORD QByteAnsiBaseAddOffset(
|
||
|
DWORD dwBaseAnsi, // In reverse order
|
||
|
int nOffset)
|
||
|
{
|
||
|
DWORD dwAnsi = dwBaseAnsi;
|
||
|
PBYTE pByte = (PBYTE)&dwAnsi;
|
||
|
|
||
|
// dwOffset should less than 1M
|
||
|
ASSERT (nOffset < 0x100000);
|
||
|
|
||
|
nOffset += pByte[0] - 0x30;
|
||
|
pByte[0] = 0x30 + nOffset % 10;
|
||
|
nOffset /= 10;
|
||
|
|
||
|
nOffset += pByte[1] - 0x81;
|
||
|
pByte[1] = 0x81 + nOffset % 126;
|
||
|
nOffset /= 126;
|
||
|
|
||
|
nOffset += pByte[2] - 0x30;
|
||
|
pByte[2] = 0x30 + nOffset % 10;
|
||
|
nOffset /= 10;
|
||
|
|
||
|
nOffset += pByte[3] - 0x81;
|
||
|
pByte[3] = 0x81 + nOffset % 126;
|
||
|
nOffset /= 126;
|
||
|
ASSERT(nOffset == 0);
|
||
|
|
||
|
return dwAnsi;
|
||
|
}
|
||
|
|
||
|
// Get "distance" of 2 QByte Ansi
|
||
|
int CalcuDistanceOfQByteAnsi(
|
||
|
DWORD dwAnsi1, // In reverse order
|
||
|
DWORD dwAnsi2) // In reverse order
|
||
|
{
|
||
|
signed char* pschAnsi1 = (signed char*)&dwAnsi1;
|
||
|
signed char* pschAnsi2 = (signed char*)&dwAnsi2;
|
||
|
|
||
|
int nDistance = 0;
|
||
|
|
||
|
nDistance += (pschAnsi1[0] - pschAnsi2[0]);
|
||
|
nDistance += (pschAnsi1[1] - pschAnsi2[1])*10;
|
||
|
nDistance += (pschAnsi1[2] - pschAnsi2[2])*1260;
|
||
|
nDistance += (pschAnsi1[3] - pschAnsi2[3])*12600;
|
||
|
|
||
|
return nDistance;
|
||
|
}
|
||
|
|
||
|
// Reverse 4 Bytes order, from DWORD format to GB format,
|
||
|
// or GB to DWORD
|
||
|
void ReverseQBytesOrder(
|
||
|
PBYTE pByte)
|
||
|
{
|
||
|
BYTE by;
|
||
|
|
||
|
by = pByte[0];
|
||
|
pByte[0] = pByte[3];
|
||
|
pByte[3] = by;
|
||
|
|
||
|
by = pByte[1];
|
||
|
pByte[1] = pByte[2];
|
||
|
pByte[2] = by;
|
||
|
|
||
|
return;
|
||
|
}
|
||
|
|
||
|
|
||
|
|
||
|
// ****************************************************************************
|
||
|
// Second part, Unicode to Ansi
|
||
|
// ****************************************************************************
|
||
|
|
||
|
// ------------------------------------------------
|
||
|
// Two helper functions for UnicodeToAnsi
// Each returns an Ansi char code
// The Ansi code is in GB standard byte order (not WORD value order)
|
||
|
//
|
||
|
|
||
|
// Unicode to double bytes Ansi char
|
||
|
//
|
||
|
// Return Ansi char code, 0 means fail (internal error, etc.)
|
||
|
//
|
||
|
WORD UnicodeToDByteAnsi(
|
||
|
WCHAR wchUnicode)
|
||
|
{
|
||
|
char achAnsiBuf[4];
|
||
|
WORD wAnsi = 0;
|
||
|
int cLen = 0;
|
||
|
|
||
|
// Code changed from GBK to GB18030, or code not compatible
|
||
|
// from CP936 to CP54936
|
||
|
for (int i = 0; i < sizeof(asAnsiCodeChanged)/sizeof(SAnsiCodeChanged); i++) {
|
||
|
if (wchUnicode == asAnsiCodeChanged[i].wchUnicode) {
|
||
|
wAnsi = asAnsiCodeChanged[i].wchAnsiNew;
|
||
|
goto Exit;
|
||
|
}
|
||
|
}
|
||
|
|
||
|
// Not in Changed code list, that is same with GBK, or CP936
|
||
|
// (Most DByte Ansi char code should compatible from GBK to GB18030)
|
||
|
cLen = WideCharToMultiByte(936,
|
||
|
WC_COMPOSITECHECK, &wchUnicode, 1,
|
||
|
achAnsiBuf, sizeof(achAnsiBuf)-1, NULL, NULL);
|
||
|
|
||
|
if (cLen != 2) {
|
||
|
ASSERT(cLen == 2);
|
||
|
wAnsi = 0;
|
||
|
} else {
|
||
|
wAnsi = *(PWORD)achAnsiBuf;
|
||
|
}
|
||
|
|
||
|
Exit:
|
||
|
return wAnsi;
|
||
|
}
|
||
|
|
||
|
// Unicode to quad bytes Ansi char
|
||
|
//
|
||
|
// Return Ansi char code
|
||
|
// 0 means fail (interal error)
|
||
|
//
|
||
|
DWORD UnicodeToQByteAnsi(
|
||
|
int nSection,
|
||
|
int nOffset)
|
||
|
{
|
||
|
DWORD dwBaseAnsi;
|
||
|
if (nSection < 0 || nSection >= sizeof(adwAnsiQBytesAreaStartValue)/sizeof(DWORD)) {
|
||
|
ASSERT(FALSE);
|
||
|
return 0;
|
||
|
}
|
||
|
dwBaseAnsi = adwAnsiQBytesAreaStartValue[nSection];
|
||
|
|
||
|
// Check adwAnsiQByteAreaStartValue array is correctly
|
||
|
#ifdef _DEBUG
|
||
|
int ncQByteAnsiNum = 0;
|
||
|
for (int i = 0; i < nSection; i++) {
|
||
|
// Calcu QByte Ansi char numbers
|
||
|
ncQByteAnsiNum += awchAnsiDQByteBound[2*i+1] - awchAnsiDQByteBound[2*i];
|
||
|
}
|
||
|
ASSERT(dwBaseAnsi == QByteAnsiBaseAddOffset(cg_dwQByteAnsiStart, ncQByteAnsiNum));
|
||
|
#endif
|
||
|
|
||
|
DWORD dwAnsi = QByteAnsiBaseAddOffset(dwBaseAnsi, nOffset);
|
||
|
// Value order to standard order
|
||
|
ReverseQBytesOrder((PBYTE)(&dwAnsi));
|
||
|
|
||
|
return dwAnsi;
|
||
|
}
|
||
|
|
||
|
|
||
|
// ---------------------------------------------------------
|
||
|
// Two functions that translate 2-byte Unicode (BMP)
// and 4-byte Unicode (surrogate pair) to Ansi
|
||
|
|
||
|
// 2 bytes Unicode (BMP)
|
||
|
|
||
|
// Return Ansi str len, when success, should be 2 or 4;
|
||
|
// return 0 means fail (internal error, etc.)
|
||
|
int UnicodeToAnsi(
|
||
|
WCHAR wchUnicode,
|
||
|
char* pchAnsi,
|
||
|
DWORD dwBufSize)
|
||
|
{
|
||
|
// Classic Unicode, not support surrogate in this function
|
||
|
ASSERT(!IsValidSurrogateLeadWord(wchUnicode)
|
||
|
&& !IsValidSurrogateTailWord(wchUnicode));
|
||
|
|
||
|
DWORD lAnsiLen = 0;
|
||
|
const WORD* p;
|
||
|
INT_PTR i;
|
||
|
|
||
|
// ASCII, 0 - 0x7f
|
||
|
if (wchUnicode <= 0x7f) {
|
||
|
*pchAnsi = (char)wchUnicode;
|
||
|
lAnsiLen = 1;
|
||
|
goto Exit;
|
||
|
}
|
||
|
|
||
|
// BMP, 4 byte or 2 byte
|
||
|
p = __lower_bound(awchAnsiDQByteBound, awchAnsiDQByteBound
|
||
|
+ sizeof(awchAnsiDQByteBound)/sizeof(WCHAR), wchUnicode);
|
||
|
|
||
|
if (p == awchAnsiDQByteBound
|
||
|
+ sizeof(awchAnsiDQByteBound)/sizeof(WCHAR)) {
|
||
|
p --;
|
||
|
} else if (wchUnicode < *p) {
|
||
|
p --;
|
||
|
} else if (wchUnicode == *p) {
|
||
|
} else {
|
||
|
ASSERT(FALSE);
|
||
|
}
|
||
|
|
||
|
i = p - awchAnsiDQByteBound;
|
||
|
ASSERT(i >= 0);
|
||
|
|
||
|
// Stop when >= *(((PWORD)asAnsi2ByteArea) + i);
|
||
|
if (i%2) { // Odd, in 2 bytes area
|
||
|
WORD wAnsi = UnicodeToDByteAnsi(wchUnicode);
|
||
|
|
||
|
if (wAnsi && dwBufSize >= 2) {
|
||
|
*(UNALIGNED WORD*)pchAnsi = wAnsi;
|
||
|
lAnsiLen = 2;
|
||
|
} else {
|
||
|
lAnsiLen = 0;
|
||
|
}
|
||
|
} else { // Duel, in 4 bytes area
|
||
|
DWORD dwAnsi = UnicodeToQByteAnsi
|
||
|
((int)i/2, wchUnicode - awchAnsiDQByteBound[i]);
|
||
|
|
||
|
if (dwAnsi && dwBufSize >= 4) {
|
||
|
*(UNALIGNED DWORD*)pchAnsi = dwAnsi;
|
||
|
lAnsiLen = 4;
|
||
|
} else {
|
||
|
lAnsiLen = 0;
|
||
|
}
|
||
|
}
|
||
|
|
||
|
Exit:
|
||
|
return lAnsiLen;
|
||
|
|
||
|
}
|
||
|
|
||
|
// 4 bytes Unicode (Surrogate)
|
||
|
|
||
|
// Return Ansi str length, when success, should be 4
|
||
|
// return 0 means fail (Buffer overflow)
|
||
|
int SurrogateToAnsi(
|
||
|
PCWCH pwchUnicode,
|
||
|
PCHAR pchAnsi,
|
||
|
DWORD dwBufSize)
|
||
|
{
|
||
|
ASSERT(IsValidSurrogateLeadWord(pwchUnicode[0]));
|
||
|
ASSERT(IsValidSurrogateTailWord(pwchUnicode[1]));
|
||
|
|
||
|
// dwOffset is ISO char code - 0x10000
|
||
|
DWORD dwOffset = ((pwchUnicode[0] - cg_wchSurrogateLeadWordLow)<<10)
|
||
|
+ (pwchUnicode[1] - cg_wchSurrogateTailWordLow)
|
||
|
+ 0x10000 - 0x10000;
|
||
|
|
||
|
if (dwBufSize < 4) {
|
||
|
return 0;
|
||
|
}
|
||
|
|
||
|
*(UNALIGNED DWORD*)pchAnsi = QByteAnsiBaseAddOffset
|
||
|
(cg_dwQByteAnsiToSurrogateStart, dwOffset);
|
||
|
ReverseQBytesOrder((PBYTE)pchAnsi);
|
||
|
|
||
|
return 4;
|
||
|
}
|
||
|
|
||
|
// API: high level service for Unicode to Ansi
|
||
|
// return result Ansi str length (in byte)
|
||
|
// return -1 means fail (Buffer overflow, internal error, etc.)
|
||
|
int UnicodeStrToAnsiStr(
|
||
|
PCWCH pwchUnicodeStr,
|
||
|
int ncUnicodeStr, // in WCHAR
|
||
|
PCHAR pchAnsiStrBuf,
|
||
|
int ncAnsiStrBufSize) // in BYTE
|
||
|
{
|
||
|
int ncAnsiStr = 0;
|
||
|
int ncAnsiCharSize;
|
||
|
|
||
|
for (int i = 0; i < ncUnicodeStr; i++, pwchUnicodeStr++) {
|
||
|
if (ncAnsiStr > (ncAnsiStrBufSize-4)) {
|
||
|
// Buffer overflow
|
||
|
break;
|
||
|
}
|
||
|
|
||
|
if (IsValidSurrogateLeadWord(pwchUnicodeStr[0])) {
|
||
|
if ((i+1 < ncUnicodeStr)
|
||
|
&& (IsValidSurrogateTailWord(pwchUnicodeStr[1]))) {
|
||
|
ncAnsiCharSize = SurrogateToAnsi(pwchUnicodeStr, pchAnsiStrBuf, 4);
|
||
|
|
||
|
ASSERT(ncAnsiCharSize == 4);
|
||
|
if (ncAnsiCharSize == 0) {
|
||
|
ASSERT(FALSE);
|
||
|
break;
|
||
|
}
|
||
|
|
||
|
ncAnsiStr += ncAnsiCharSize;
|
||
|
pchAnsiStrBuf += ncAnsiCharSize;
|
||
|
pwchUnicodeStr++;
|
||
|
i++;
|
||
|
} else {
|
||
|
// Invalide Uncode char, skip
|
||
|
}
|
||
|
} else if (*pwchUnicodeStr == 0) {
|
||
|
*pchAnsiStrBuf = 0;
|
||
|
pchAnsiStrBuf ++;
|
||
|
ncAnsiStr ++;
|
||
|
} else {
|
||
|
ncAnsiCharSize = UnicodeToAnsi(*pwchUnicodeStr, pchAnsiStrBuf, 4);
|
||
|
|
||
|
if (ncAnsiCharSize == 0) {
|
||
|
ASSERT(FALSE);
|
||
|
break;
|
||
|
}
|
||
|
|
||
|
pchAnsiStrBuf += ncAnsiCharSize;
|
||
|
ncAnsiStr += ncAnsiCharSize;
|
||
|
}
|
||
|
}
|
||
|
|
||
|
if (i < ncUnicodeStr) { return -1; }
|
||
|
return ncAnsiStr;
|
||
|
}
|
||
|
|
||
|
|
||
|
|
||
|
// ****************************************************************************
|
||
|
// Third part, Ansi to Unicode
|
||
|
// ****************************************************************************
|
||
|
|
||
|
|
||
|
// Return Unicode number (number always equal to 1 when success)
|
||
|
// return 0 if can't find corresponding Unicode
|
||
|
// -1 means fail (internal error, etc.)
|
||
|
int QByteAnsiToSingleUnicode(
|
||
|
DWORD dwAnsi,
|
||
|
PWCH pwchUnicode)
|
||
|
{
|
||
|
const DWORD* p;
|
||
|
INT_PTR i;
|
||
|
|
||
|
// 0x8431a439(cg_dwQByteAnsiToBMPLast) to 0x85308130 haven't Unicode corresponding
|
||
|
// 0x85308130 to 0x90308130(cg_dwQByteAnsiToSurrogateStart) are reserved zone,
|
||
|
// haven't Unicode corresponding
|
||
|
if (dwAnsi > cg_dwQByteAnsiToBMPLast) {
|
||
|
return 0;
|
||
|
}
|
||
|
|
||
|
// Invalid input value
|
||
|
if (dwAnsi < adwAnsiQBytesAreaStartValue[0]) {
|
||
|
return -1;
|
||
|
}
|
||
|
|
||
|
p = __lower_bound(adwAnsiQBytesAreaStartValue,
|
||
|
adwAnsiQBytesAreaStartValue + sizeof(adwAnsiQBytesAreaStartValue)/sizeof(DWORD),
|
||
|
dwAnsi);
|
||
|
|
||
|
if (p == adwAnsiQBytesAreaStartValue
|
||
|
+ sizeof(adwAnsiQBytesAreaStartValue)/sizeof(DWORD)) {
|
||
|
p --;
|
||
|
} else if (dwAnsi < *p) {
|
||
|
p --;
|
||
|
} else if (dwAnsi == *p) {
|
||
|
} else {
|
||
|
ASSERT(FALSE);
|
||
|
}
|
||
|
|
||
|
i = p - adwAnsiQBytesAreaStartValue;
|
||
|
|
||
|
if (i < 0) {
|
||
|
ASSERT(i >= 0);
|
||
|
return -1;
|
||
|
}
|
||
|
|
||
|
*pwchUnicode = awchAnsiDQByteBound[2*i] + CalcuDistanceOfQByteAnsi(dwAnsi, *p);
|
||
|
#ifdef _DEBUG
|
||
|
{
|
||
|
|
||
|
int nAnsiCharDistance = CalcuDistanceOfQByteAnsi(dwAnsi, *p);
|
||
|
ASSERT(nAnsiCharDistance >= 0);
|
||
|
|
||
|
WCHAR wchUnicodeDbg;
|
||
|
if ((p+1) < adwAnsiQBytesAreaStartValue
|
||
|
+ sizeof(adwAnsiQBytesAreaStartValue)/sizeof(DWORD)) {
|
||
|
nAnsiCharDistance = CalcuDistanceOfQByteAnsi(dwAnsi, *(p+1));
|
||
|
wchUnicodeDbg = awchAnsiDQByteBound[2*i+1] + nAnsiCharDistance;
|
||
|
} else if ((p+1) == adwAnsiQBytesAreaStartValue
|
||
|
+ sizeof(adwAnsiQBytesAreaStartValue)/sizeof(DWORD)) {
|
||
|
nAnsiCharDistance = CalcuDistanceOfQByteAnsi(dwAnsi, 0x8431A530);
|
||
|
wchUnicodeDbg = 0x10000 + nAnsiCharDistance;
|
||
|
} else {
|
||
|
ASSERT(FALSE);
|
||
|
}
|
||
|
ASSERT(nAnsiCharDistance < 0);
|
||
|
ASSERT(wchUnicodeDbg == *pwchUnicode);
|
||
|
|
||
|
}
|
||
|
#endif
|
||
|
|
||
|
return 1;
|
||
|
}
|
||
|
|
||
|
// Return Unicode number (number always 2 when success)
|
||
|
// return 0 if can't find corresponding Unicode
|
||
|
int QByteAnsiToDoubleUnicode(
|
||
|
DWORD dwAnsi,
|
||
|
PWCH pwchUnicode)
|
||
|
{
|
||
|
int nDistance = CalcuDistanceOfQByteAnsi(dwAnsi, cg_dwQByteAnsiToSurrogateStart);
|
||
|
ASSERT (nDistance >= 0);
|
||
|
|
||
|
if (nDistance >= 0x100000) {
|
||
|
return 0;
|
||
|
}
|
||
|
|
||
|
pwchUnicode[1] = nDistance % 0x400 + 0xDC00;
|
||
|
pwchUnicode[0] = nDistance / 0x400 + 0xD800;
|
||
|
|
||
|
return 2;
|
||
|
}
|
||
|
|
||
|
// Return Unicode number (1 or 2 when success)
|
||
|
// return 0 if can't find corresponding Unicode
|
||
|
// return -1 if fail (Buffer overflow, invalid GB char code input,
|
||
|
// internal error, etc.)
|
||
|
int QByteAnsiToUnicode(
|
||
|
const BYTE* pbyAnsiChar,
|
||
|
PWCH pwchUnicode,
|
||
|
DWORD dwBufLen) // In WCHAR
|
||
|
{
|
||
|
DWORD dwAnsi;
|
||
|
int nLen = -1;
|
||
|
|
||
|
if ( IsValidQByteAnsiLeadByte(pbyAnsiChar[0])
|
||
|
&& IsValidQByteAnsiTailByte(pbyAnsiChar[1])
|
||
|
&& IsValidQByteAnsiLeadByte(pbyAnsiChar[2])
|
||
|
&& IsValidQByteAnsiTailByte(pbyAnsiChar[3])) {
|
||
|
|
||
|
} else {
|
||
|
return -1; // Invalid char
|
||
|
}
|
||
|
|
||
|
dwAnsi = *(UNALIGNED DWORD*)pbyAnsiChar;
|
||
|
ReverseQBytesOrder((PBYTE)(&dwAnsi));
|
||
|
|
||
|
if (dwAnsi >= cg_dwQByteAnsiToSurrogateStart) {
|
||
|
if (dwBufLen >= 2) {
|
||
|
nLen = QByteAnsiToDoubleUnicode(dwAnsi, pwchUnicode);
|
||
|
}
|
||
|
} else {
|
||
|
if (dwBufLen >= 1) {
|
||
|
nLen = QByteAnsiToSingleUnicode(dwAnsi, pwchUnicode);
|
||
|
}
|
||
|
}
|
||
|
|
||
|
return nLen;
|
||
|
}
|
||
|
|
||
|
// Unicode to double bytes Ansi char
|
||
|
// Return: Unicode char code, 0 means fail (internal error, etc.)
|
||
|
WCHAR DByteAnsiToUnicode(
|
||
|
const BYTE* pbyAnsi)
|
||
|
{
|
||
|
WORD wAnsi = *(UNALIGNED WORD*)pbyAnsi;
|
||
|
int cLen = 1;
|
||
|
WCHAR wchUnicode;
|
||
|
|
||
|
// Code changed from GBK to GB18030, or code not compatible
|
||
|
// from CP936 to CP54936
|
||
|
for (int i = 0; i < sizeof(asAnsiCodeChanged)/sizeof(SAnsiCodeChanged); i++) {
|
||
|
if (wAnsi == asAnsiCodeChanged[i].wchAnsiNew) {
|
||
|
wchUnicode = asAnsiCodeChanged[i].wchUnicode;
|
||
|
goto Exit;
|
||
|
}
|
||
|
}
|
||
|
|
||
|
// Not in Changed code list, that is same with GBK, or CP936
|
||
|
// (Most DByte Ansi char code should compatible from GBK to GB18030)
|
||
|
cLen = MultiByteToWideChar(936, MB_PRECOMPOSED,
|
||
|
(PCCH)pbyAnsi, 2, &wchUnicode, 1);
|
||
|
|
||
|
if (cLen != 1) {
|
||
|
wchUnicode = 0;
|
||
|
}
|
||
|
|
||
|
Exit:
|
||
|
return wchUnicode;
|
||
|
}
|
||
|
|
||
|
// API: High level service for Ansi to Unicode
|
||
|
// return Unicode str length (in WCHAR)
|
||
|
// return -1 means fail (Buffer overflow, etc.)
|
||
|
int AnsiStrToUnicodeStr(
|
||
|
const BYTE* pbyAnsiStr,
|
||
|
int ncAnsiStrSize, // In char
|
||
|
PWCH pwchUnicodeBuf,
|
||
|
int ncBufLen) // In WCHAR
|
||
|
{
|
||
|
int nCharLen;
|
||
|
int ncUnicodeBuf = 0;
|
||
|
|
||
|
for (int i = 0; i < ncAnsiStrSize; ) {
|
||
|
if (ncUnicodeBuf > (ncBufLen-4)) {
|
||
|
// Buffer overflow
|
||
|
break;
|
||
|
}
|
||
|
// 1 byte Ansi char
|
||
|
if (*pbyAnsiStr < 0x80) {
|
||
|
*pwchUnicodeBuf = (WCHAR)*pbyAnsiStr;
|
||
|
pwchUnicodeBuf ++;
|
||
|
ncUnicodeBuf ++;
|
||
|
i++;
|
||
|
pbyAnsiStr++;
|
||
|
// 2 byte Ansi char
|
||
|
} else if ((i+1 < ncAnsiStrSize) && pbyAnsiStr[1] >= 0x40) {
|
||
|
*pwchUnicodeBuf = DByteAnsiToUnicode(pbyAnsiStr);
|
||
|
|
||
|
if (*pwchUnicodeBuf == 0) {
|
||
|
*pwchUnicodeBuf = '?';
|
||
|
}
|
||
|
|
||
|
pwchUnicodeBuf ++;
|
||
|
ncUnicodeBuf ++;
|
||
|
i += 2;
|
||
|
pbyAnsiStr += 2;
|
||
|
// 4 byte Ansi char
|
||
|
} else if ((i+3 < ncAnsiStrSize)
|
||
|
&& IsValidQByteAnsiLeadByte(pbyAnsiStr[0])
|
||
|
&& IsValidQByteAnsiTailByte(pbyAnsiStr[1])
|
||
|
&& IsValidQByteAnsiLeadByte(pbyAnsiStr[2])
|
||
|
&& IsValidQByteAnsiTailByte(pbyAnsiStr[3])) {
|
||
|
// QByte GB char
|
||
|
nCharLen = QByteAnsiToUnicode(pbyAnsiStr, pwchUnicodeBuf, 4);
|
||
|
if (nCharLen < 0) {
|
||
|
ASSERT(FALSE); // Invalid Ansi char input, or buffer overflow, etc.
|
||
|
// Should never happen but an internal error
|
||
|
break;
|
||
|
} else if (nCharLen == 0) { // hasn't corresponding Unicode Char
|
||
|
*pwchUnicodeBuf = '?';
|
||
|
pwchUnicodeBuf ++;
|
||
|
ncUnicodeBuf ++;
|
||
|
} else if (nCharLen > 0) {
|
||
|
ASSERT(nCharLen <= 2);
|
||
|
pwchUnicodeBuf += nCharLen;
|
||
|
ncUnicodeBuf += nCharLen;
|
||
|
} else {
|
||
|
ASSERT(FALSE);
|
||
|
}
|
||
|
i += 4;
|
||
|
pbyAnsiStr += 4;
|
||
|
// Invalid Ansi char
|
||
|
} else {
|
||
|
// Invalid
|
||
|
i++;
|
||
|
pbyAnsiStr++;
|
||
|
}
|
||
|
}
|
||
|
|
||
|
if (i < ncAnsiStrSize) { return -1; }
|
||
|
|
||
|
return ncUnicodeBuf;
|
||
|
}
|
||
|
|
||
|
|
||
|
// ******************************************************
|
||
|
// Testing program
|
||
|
// ******************************************************
|
||
|
|
||
|
/*
|
||
|
"\u0080", <0x81;0x30;0x81;0x30>
|
||
|
"\u00A3", <0x81;0x30;0x84;0x35>
|
||
|
"\u00A4", <0xA1;0xE8>
|
||
|
"\u00A5", <0x81;0x30;0x84;0x36>
|
||
|
"\u00A6", <0x81;0x30;0x84;0x37>
|
||
|
"\u00A7", <0xA1;0xEC>
|
||
|
"\u00A8", <0xA1;0xA7>
|
||
|
"\u00A9", <0x81;0x30;0x84;0x38>
|
||
|
"\u00AF", <0x81;0x30;0x85;0x34>
|
||
|
"\u00B0", <0xA1;0xE3>
|
||
|
"\u00B1", <0xA1;0xC0>
|
||
|
"\u00B2", <0x81;0x30;0x85;0x35>
|
||
|
|
||
|
{0x20AC, 0xe3a2},
|
||
|
{0x01f9, 0xbfa8},
|
||
|
{0x303e, 0x89a9},
|
||
|
{0x2ff0, 0x8aa9},
|
||
|
{0x2ff1, 0x8ba9},
|
||
|
|
||
|
50EF 836A
|
||
|
50F0 836B
|
||
|
50F1 836C
|
||
|
50F2 836D
|
||
|
|
||
|
*/
|
||
|
#if 0
|
||
|
int test (void)
|
||
|
{
|
||
|
const WCHAR awchUnicodeStr[] = {0x01, 0x7f, 0x80, 0x81, 0x82,
|
||
|
0xa2,
|
||
|
0xa3, // 0x81;0x30;0x84;0x35
|
||
|
0xa4, // 0xA1;0xE8
|
||
|
0xa5, // 0x81;0x30;0x84;0x36
|
||
|
0xa6, // 0x81;0x30;0x84;0x37
|
||
|
0xaf, // 0x81;0x30;0x85;0x34
|
||
|
0xb0, // 0xA1;0xE3
|
||
|
0xb1, // 0xA1;0xC0
|
||
|
0xb6, // 0x81;0x30;0x85;0x39
|
||
|
0xb7, // 0xA1;0xA4
|
||
|
|
||
|
// Some normal DByte Ansi char
|
||
|
0x50ef, // 0x83, 0x6A
|
||
|
0x50f2, // 0x83, 0x6D
|
||
|
|
||
|
// Some ansi char code changed in new standard
|
||
|
0x20ac, // 0xa2, 0xe3
|
||
|
0xE76C, // not (0xa2, 0xe3), should some QByte char
|
||
|
0x2ff0, // 0xa9, 0x8A
|
||
|
0x2ff1, // 0xa9, 0x8B
|
||
|
0x4723, // 0xFE, 0x80
|
||
|
|
||
|
// Ansi char arround DC00 to E000
|
||
|
0xd7ff, // 0x83, 0x36, 0xC7, 0x38
|
||
|
0xe76c, // 0x83, 0x36, 0xC7, 0x39
|
||
|
0xE76B, // 0xA2, 0xB0
|
||
|
|
||
|
0xffff, // 0x84, 0x31, 0xa4, 0x39,
|
||
|
0x00};
|
||
|
|
||
|
char* pchAnsiStr = new char[sizeof(awchUnicodeStr)*2+5];
|
||
|
|
||
|
UnicodeStrToAnsiStr(awchUnicodeStr, sizeof(awchUnicodeStr)/sizeof(WCHAR),
|
||
|
pchAnsiStr, sizeof(awchUnicodeStr)*2+5);
|
||
|
|
||
|
delete[] pchAnsiStr;
|
||
|
|
||
|
|
||
|
BYTE abyAnsiStr2[] = {
|
||
|
0x81, 0x30, 0x81, 0x30,
|
||
|
0x81, 0x30, 0x84, 0x35,
|
||
|
0xA1, 0xE8,
|
||
|
0x81, 0x30, 0x84, 0x36,
|
||
|
0x81, 0x30, 0x84, 0x37,
|
||
|
0xA1, 0xEC,
|
||
|
0xA1, 0xA7,
|
||
|
0x81, 0x30, 0x84, 0x38,
|
||
|
0x81, 0x30, 0x85, 0x34,
|
||
|
0xA1, 0xE3,
|
||
|
0xA1, 0xC0,
|
||
|
0x81, 0x30, 0x85, 0x35,
|
||
|
|
||
|
// Testing D800 to DE00
|
||
|
0x82, 0x35, 0x8f, 0x33, // 0x9FA6
|
||
|
0x83, 0x36, 0xC7, 0x38, // 0xD7FF
|
||
|
0xA2, 0xB0, // 0xE76B
|
||
|
0x83, 0x36, 0xC7, 0x39, // 0xE76C
|
||
|
|
||
|
// Testing last char in BMP
|
||
|
0x84, 0x31, 0xa4, 0x39, // 0xFFFF
|
||
|
|
||
|
// Some char code changed in new GB standard
|
||
|
0xa2, 0xe3, // 0x20AC,
|
||
|
0xa8, 0xbf, // 0x01f9,
|
||
|
0xa9, 0x89, // 0x303e,
|
||
|
0xa9, 0x8a, // 0x2ff0,
|
||
|
0xa9, 0x8b, // 0x2ff1,
|
||
|
0xFE, 0x9F, // 0x4dae
|
||
|
|
||
|
0x83, 0x6A, // 50EF
|
||
|
0x83, 0x6B, // 50F0
|
||
|
0x83, 0x6C, // 50F1
|
||
|
0x83, 0x6D // 50F2
|
||
|
};
|
||
|
|
||
|
WCHAR* pwchUnicodeStr2 = new WCHAR[sizeof(abyAnsiStr2)+3];
|
||
|
|
||
|
AnsiStrToUnicodeStr(abyAnsiStr2, sizeof(abyAnsiStr2),
|
||
|
pwchUnicodeStr2, sizeof(abyAnsiStr2)+3);
|
||
|
|
||
|
delete[] pwchUnicodeStr2;
|
||
|
|
||
|
|
||
|
return 0;
|
||
|
}
|
||
|
#endif
|