2020-09-30 16:53:55 +02:00

217 lines
4.2 KiB
C++

#include "precomp.h"
#include "utf8str.h"
// Destructor: releases whichever buffer this object allocated itself.
//
// m_eAlloc is set to ALLOC_UTF8 by EncodeUTF8() and to ALLOC_UNICODE by
// DecodeUTF8() when they allocate the converted string.  Both of those
// allocations use array new (new CHAR[] / new WCHAR[]), so they must be
// released with array delete; a scalar delete here is undefined behaviour.
CUTF8String::~CUTF8String()
{
    if (ALLOC_UNICODE & m_eAlloc)
    {
        delete [] m_pwszUnicode;
    }
    else if (ALLOC_UTF8 & m_eAlloc)
    {
        delete [] m_pszUTF8;
    }
}
// Conversion to a wide string.
//
// Returns m_pwszUnicode.  When that pointer is still NULL but a UTF-8 string
// is present, DecodeUTF8() is called first to produce the wide form; the
// outcome of that conversion is reported through m_hr by DecodeUTF8().
CUTF8String::operator LPWSTR()
{
    // Nothing to convert: either the wide string already exists or there is
    // no UTF-8 source to build it from.
    if ((NULL != m_pwszUnicode) || (NULL == m_pszUTF8))
    {
        return m_pwszUnicode;
    }

    DecodeUTF8();
    return m_pwszUnicode;
}
// Conversion to a UTF-8 string.
//
// Returns m_pszUTF8.  When that pointer is still NULL but a wide string is
// present, EncodeUTF8() is called first to produce the UTF-8 form; the
// outcome of that conversion is reported through m_hr by EncodeUTF8().
CUTF8String::operator LPSTR()
{
    // Nothing to convert: either the UTF-8 string already exists or there is
    // no wide-character source to build it from.
    if ((NULL != m_pszUTF8) || (NULL == m_pwszUnicode))
    {
        return m_pszUTF8;
    }

    EncodeUTF8();
    return m_pszUTF8;
}
// Builds the UTF-8 form of m_pwszUnicode in a newly allocated buffer.
//
// On success m_pszUTF8 points at the new NUL-terminated buffer, m_eAlloc is
// set to ALLOC_UTF8 and m_hr is S_OK.  If the allocation yields NULL, m_hr is
// set to E_OUTOFMEMORY and m_pszUTF8 remains NULL.
//
// Every WCHAR is encoded independently as one, two or three bytes:
//      < 0x80   ->  0xxxxxxx
//      < 0x800  ->  110xxxxx 10xxxxxx
//      others   ->  1110xxxx 10xxxxxx 10xxxxxx
// NOTE(review): surrogate code units fall into the three-byte case and are
// therefore encoded one by one rather than combined into a four-byte form.
VOID CUTF8String::EncodeUTF8()
{
    DebugEntry(CUTF8String::EncodeUTF8);
    m_hr = S_OK;
    ASSERT(NULL != m_pwszUnicode);

    // Pass 1: work out how many bytes the encoded string needs, starting at
    // one so that the NUL terminator is always accounted for.
    int cbNeeded = 1;
    for (LPWSTR pwszScan = m_pwszUnicode; L'\0' != *pwszScan; pwszScan++)
    {
        const WCHAR wchScan = *pwszScan;
        cbNeeded += (wchScan < 0x80) ? 1 : ((wchScan < 0x800) ? 2 : 3);
    }

    ASSERT(NULL == m_pszUTF8);
    m_pszUTF8 = new CHAR[cbNeeded];
    if (NULL == m_pszUTF8)
    {
        m_hr = E_OUTOFMEMORY;
    }
    else
    {
        ASSERT(ALLOC_NONE == m_eAlloc);
        m_eAlloc = ALLOC_UTF8;

        // Pass 2: emit the bytes front to back.  The loop runs until the
        // buffer is full, which is exactly when the terminator (encoded as a
        // single zero byte) has been written.
        LPSTR pszEnd = m_pszUTF8 + cbNeeded;
        LPSTR pszOut = m_pszUTF8;
        LPWSTR pwszIn = m_pwszUnicode;
        while (pszOut < pszEnd)
        {
            const WCHAR wchCur = *pwszIn++;
#ifdef DEBUG
            if (L'\0' == wchCur)
            {
                // The terminator must land in the very last slot.
                ASSERT(pszOut == pszEnd - 1);
            }
#endif // DEBUG
            if (wchCur < 0x80)
            {
                *pszOut++ = (CHAR) wchCur;
            }
            else if (wchCur < 0x800)
            {
                *pszOut++ = (CHAR) (0xC0 | (wchCur >> 6));
                *pszOut++ = (CHAR) (0x80 | (wchCur & 0x3F));
            }
            else
            {
                *pszOut++ = (CHAR) (0xE0 | (wchCur >> 12));
                *pszOut++ = (CHAR) (0x80 | ((wchCur >> 6) & 0x3F));
                *pszOut++ = (CHAR) (0x80 | (wchCur & 0x3F));
            }
        }
        m_hr = S_OK;
    }
    DebugExitVOID(CUTF8String::EncodeUTF8);
}
// Builds the wide-character form of the NUL-terminated UTF-8 string in
// m_pszUTF8 in a newly allocated buffer.
//
// Results:
//   - success: m_pwszUnicode points at the new NUL-terminated buffer,
//     m_eAlloc is ALLOC_UNICODE and m_hr is S_OK.
//   - allocation yields NULL: m_hr is E_OUTOFMEMORY.
//   - a continuation byte is missing or malformed: the output is terminated
//     at the point of failure and m_hr is E_FAIL.
//
// Decoding rules (one WCHAR is produced per source sequence):
//   - 1-byte (0xxxxxxx), 2-byte and 3-byte sequences are decoded normally.
//   - 4-, 5- and 6-byte sequences are not supported: the leading bytes are
//     skipped and the last three bytes are decoded as if they were a
//     three-byte sequence.
//   - A byte that cannot start a sequence (a stray continuation byte, 0xFE,
//     0xFF) is replaced by L'?'.
//
// The input is treated as untrusted: no byte is consumed past the NUL
// terminator, even when a multi-byte sequence is truncated.
VOID CUTF8String::DecodeUTF8()
{
    DebugEntry(CUTF8String::DecodeUTF8);
    m_hr = S_OK;
    ASSERT(NULL != m_pszUTF8);
    int cchUnicode = 1; // always include a NULL terminator
    LPSTR psz = m_pszUTF8;
    // First determine the destination size (cchUnicode)
    while ('\0' != *psz)
    {
        // The number of leading 1 bits in the first byte is the sequence length.
        int cbChar = 0;
        BYTE bFirst = (BYTE) *psz;
        while (bFirst & 0x80)
        {
            cbChar++;
            ASSERT(cbChar < 8);
            bFirst <<= 1;
        }
        cbChar = max(1, cbChar);
        // Step one byte at a time so that a truncated sequence can never
        // carry the scan over the terminator.  For malformed input this may
        // over-estimate the size, which is harmless.
        while ((cbChar > 0) && ('\0' != *psz))
        {
            psz++;
            cbChar--;
        }
        cchUnicode++;
    }
    ASSERT(NULL == m_pwszUnicode);
    m_pwszUnicode = new WCHAR[cchUnicode];
    if (NULL != m_pwszUnicode)
    {
        ASSERT(ALLOC_NONE == m_eAlloc);
        m_eAlloc = ALLOC_UNICODE;
        // Start decoding here:
        LPWSTR pwszStop = m_pwszUnicode + cchUnicode;
        LPWSTR pwszDst = m_pwszUnicode;
        psz = m_pszUTF8;
        while (pwszDst < pwszStop)
        {
            int cbChar = 0;
            BYTE bFirst = (BYTE) *psz;
            while (bFirst & 0x80)
            {
                cbChar++;
                ASSERT(cbChar < 8);
                bFirst <<= 1;
            }
            BOOL fValid = TRUE;
            WCHAR wch = L'\0';
            // Invariant for the multi-byte cases below: a byte is only
            // consumed once it is known not to be the terminator (either it
            // is the lead byte, or it was just verified to be 10xxxxxx).
            switch (cbChar)
            {
                case 6:
                case 5:
                case 4:
                {
                    // We don't handle UCS-4: skip the first (cbChar - 3)
                    // bytes and decode the remaining three below.
                    psz++; // lead byte
                    for (int cbSkip = cbChar - 4; (cbSkip > 0) && fValid; cbSkip--)
                    {
                        fValid = ((*psz & 0xc0) == 0x80);
                        if (fValid)
                        {
                            psz++;
                        }
                    }
                    // The byte that case 3 consumes first must be present too.
                    fValid = fValid && ((*psz & 0xc0) == 0x80);
                    if (FALSE == fValid)
                    {
                        break;
                    }
                }
                // FALLTHROUGH
                case 3:
                    wch = WCHAR(*psz++ & 0x0f) << 12; // 0x0800 - 0xffff
                    fValid = ((*psz & 0xc0) == 0x80);
                    if (FALSE == fValid)
                    {
                        break;
                    }
                    // FALLTHROUGH
                case 2:
                    wch |= WCHAR(*psz++ & 0x3f) << 6; // 0x0080 - 0x07ff
                    fValid = ((*psz & 0xc0) == 0x80);
                    if (FALSE == fValid)
                    {
                        break;
                    }
                    wch |= WCHAR(*psz++ & 0x3f);
                    break;
                case 0:
                    wch = WCHAR(*psz++); // 0x0000 - 0x007f
                    break;
                default:
                    ERROR_OUT(("CUTF8String::DecodeUTF8 found invalid UTF-8 lead byte"));
                    wch = L'?';
                    // Skip the bogus sequence exactly as the sizing pass
                    // did: at most cbChar bytes, never over the terminator.
                    while ((cbChar > 0) && ('\0' != *psz))
                    {
                        psz++;
                        cbChar--;
                    }
                    break;
            }
            if (FALSE == fValid)
            {
                ERROR_OUT(("CUTF8String::DecodeUTF8 found bad UTF-8 sequence"));
                *pwszDst = L'\0';
                m_hr = E_FAIL;
                break;
            }
#ifdef DEBUG
            cchUnicode--;
#endif // DEBUG
            *pwszDst++ = wch;
        }
        // Every slot must have been filled unless decoding stopped early on
        // a bad sequence.
        ASSERT((0 == cchUnicode) || (S_OK != m_hr));
    }
    else
    {
        m_hr = E_OUTOFMEMORY;
    }
    DebugExitVOID(CUTF8String::DecodeUTF8);
}