/*
 * @(#)EncodingStream.cxx 1.0 6/10/97
 *
 * Copyright (c) 1997 - 1999 Microsoft Corporation. All rights reserved.
 *
 */
#include "stdinc.h"
#include "core.hxx"
#include "xmlhelper.hxx"
#include "encodingstream.hxx"
#pragma hdrstop

const int EncodingStream::BUFFERSIZE = 4096*sizeof(WCHAR);

//////////////////////////////////////////////////////////////////////////////////
/**
 * Constructs a stream wrapper around pStream with an empty raw-byte buffer
 * and no encoding selected yet (the decoder is chosen later by autoDetect()
 * or switchEncodingAt()).
 */
EncodingStream::EncodingStream(IStream * pStream):
    stream(pStream),
    encoding(NULL),
    buf(NULL),
    pfnWideCharFromMultiByte(NULL),
    btotal(0),
    bnext(0),
    startAt(0),
    lastBuffer(false),
    bufsize(0),
    _fEOF(false),
    _fReadStream(true),
    _fUTF8BOM(false),
    _dwMode(0),
    codepage(CP_UNDEFINED)
{
    // These two members were previously left uninitialized by the constructor.
    // They are assigned in the body (rather than the initializer list) so that
    // no assumption is made about their declaration order in the header.
    isInput = false;
    maxCharSize = 0;
}

//////////////////////////////////////////////////////////////////////////////////
/**
 * Builds the EncodingStream for input.
 *
 * Allocates the wrapper, takes one reference on it and marks it as an input
 * stream. No data is read from pStream here; the character encoding is
 * guessed later, on the first Read() / AppendData() call (see autoDetect()).
 *
 * Returns NULL if the allocation fails.
 */
IStream * EncodingStream::newEncodingStream(IStream * pStream)
{
    EncodingStream * es = NEW (EncodingStream(pStream));
    if (es == NULL)
        return NULL;

    //
    // REVIEW REVIEW:
    // Shouldn't this rewind the stream cursor back to a known good point?
    //
    es->AddRef(); // xwu@@ : check this addRef()!

    es->isInput = true;
    es->buf = NULL;
    return es;
}

//////////////////////////////////////////////////////////////////////////////////
/**
 * Releases the raw-byte buffer and the Encoding object owned by this stream.
 */
EncodingStream::~EncodingStream()
{
    if (buf)
    {
        delete [] buf;
        buf = NULL;
    }
    if (encoding != NULL)
    {
        delete encoding;
        encoding = NULL;
    }

    stream = NULL; // smart pointer
}

//////////////////////////////////////////////////////////////////////////////////
/**
 * Reads characters from stream and encode it to Unicode
 *
 * pv      - caller's buffer; receives WCHARs.
 * cb      - size of pv in BYTES (converted to a WCHAR count internally).
 * pcbRead - optional; receives the number of BYTES written to pv.
 *
 * Returns:
 *   S_OK      - data was produced, or EOF had already been reached.
 *   E_PENDING - no characters could be produced yet (more input needed).
 *   S_FALSE   - no stream to read, buffer empty and the last buffer has been
 *               appended; or encoding still undetermined after the last buffer.
 *   XML_E_INCOMPLETE_ENCODING - input ended in the middle of a character.
 *   Otherwise the failure code from the underlying stream, autoDetect(),
 *   prepareForInput() or the decoder function.
 */
HRESULT STDMETHODCALLTYPE EncodingStream::Read(void * pv, ULONG cb, ULONG * pcbRead)
{
    HRESULT hr;

    ULONG num = 0;

    if (pcbRead != NULL)
        *pcbRead = 0;

    if (btotal == 0 && _fEOF) // we already hit EOF - so return right away.
        return S_OK;

    // Calculate how many UNICODE chars we are allowed to return,
    // xiaoyu : which is the same as the number of BYTES read from the file
    cb /= sizeof(WCHAR);

    checkhr2(prepareForInput(cb));

    if (stream && _fReadStream)
    {
        // btotal = number of bytes already in start of buffer.
        if (cb > btotal)
        {
            hr = stream->Read(buf + btotal, cb - btotal, &num);

            // Let's show what we've seen in the debugger so that we can diagnose bad manifests
            // more easily. mgrier 12/28/2000
            if (::FusionpDbgWouldPrintAtFilterLevel(FUSION_DBG_LEVEL_XMLSTREAM))
            {
                ::FusionpDbgPrintEx(
                    FUSION_DBG_LEVEL_XMLSTREAM,
                    "SXS.DLL: Read %lu bytes from XML stream; HRESULT returned = 0x%08lx\n", num, hr);

                if (num > 0)
                {
                    ::FusionpDbgPrintBlob(
                        FUSION_DBG_LEVEL_XMLSTREAM,
                        buf + btotal,
                        num,
                        L" ");
                }
            }

            if ((hr == E_PENDING) && (num > 0))
            {
                // in which case we ignore the error, and continue on !!.
                // BUGBUG - this may be a problem.since we are changing the
                // return code returned from the stream. This may mean we
                // should not ever hand out this stream outside of MSXML.
                hr = S_OK;
            }

            if (FAILED(hr))
            {
                return hr;
            }

            if (btotal == 0 && num == 0)
            {
                _fEOF = true;
                return hr;
            }
        }
        else
        {
            hr = S_OK;
        }
    }
    else if (btotal == 0)
    {
        return (lastBuffer) ? S_FALSE : E_PENDING;
    }

    btotal += num;

    UINT b = btotal, utotal = cb;

    if (pfnWideCharFromMultiByte == NULL) // first read() call
    {
        checkhr2(autoDetect());
        if (pfnWideCharFromMultiByte == NULL) // failed to fully determine encoding
            return (lastBuffer) ? S_FALSE : E_PENDING;

        // Skip the byte order mark (if any). autoDetect() only sets bnext
        // after it has seen at least that many bytes, so this cannot underflow.
        // The subtraction is done BEFORE clipping to cb: clipping first could
        // leave b < bnext and wrap the unsigned value.
        b -= bnext;
        startAt -= bnext;
    }

    if (b > cb)
    {
        // If we have more bytes in our buffer than the caller has
        // room for, then only return the number of bytes the caller
        // asked for -- otherwise pfnWideCharFromMultiByte will write
        // off the end of the caller's buffer.
        b = cb;
    }

    hr = (this->pfnWideCharFromMultiByte)(&_dwMode, codepage, buf + bnext, &b, (WCHAR *)pv, &utotal);
    if (hr != S_OK)
        return hr;

    if (b == 0 && num == 0 && (stream || lastBuffer))
    {
        // stream says we're at the end, but pfnWideCharFromMultiByte
        // disagrees !!
        ::FusionpDbgPrintEx(
            FUSION_DBG_LEVEL_ERROR,
            "SXS.DLL: XML Parser found incomplete encoding\n");

        return XML_E_INCOMPLETE_ENCODING;
    }

    // b now holds the number of bytes the decoder consumed.
    bnext += b;

    if (pcbRead != NULL)
        *pcbRead = utotal*sizeof(WCHAR);

    return (utotal == 0) ? E_PENDING : S_OK;
}

//////////////////////////////////////////////////////////////////////////////////
/**
 * Checks the first two/four bytes of the input Stream in order to
 * detect UTF-16/UCS-4 or UTF-8 encoding;
 * otherwise assume it is UTF-8
 * xiaoyu : since only UCS-2 and UTF-8 are support, we do not deal with others...
 *
 * On success with enough data, sets encoding, codepage, maxCharSize and
 * pfnWideCharFromMultiByte, and sets bnext to the length of the byte order
 * mark so that the caller skips it.
 *
 * Returns:
 *   S_OK - either the encoding was determined, or there is not yet enough
 *          data (callers distinguish by testing pfnWideCharFromMultiByte).
 *   XML_E_INVALIDENCODING - the input looks like UCS-4, which is unsupported.
 *   E_OUTOFMEMORY - the Encoding object could not be allocated.
 *   Otherwise the error from CharEncoder::getWideCharFromMultiByteInfo.
 */
HRESULT EncodingStream::autoDetect()
{
    // wait until we have enough to be sure.
    if (btotal < 2)
        return S_OK;

    unsigned int guess = (((unsigned char)buf[0]) << 8) + ((unsigned char)buf[1]);
    HRESULT hr;

    if (guess == 0xFEFF || guess == 0xFFFE) // BOM found
    {
        // wait until we have enough to be sure.
        if (btotal < 4)
            return S_OK;

        unsigned int guess1 = (((unsigned char)buf[2]) << 8) + ((unsigned char)buf[3]);
        if (guess == guess1)
        {
            /*
            if (!encoding)
            {
                static const WCHAR* wchUCS4 = TEXT("UCS-4");
                encoding = Encoding::newEncoding(wchUCS4, 5, (0xFFFE == guess), true);
            }
            bnext = 4;
            */
            // FUSION_XML_PARSER does not support UCS4
            return XML_E_INVALIDENCODING;
        }
        else
        {
            if (!encoding)
            {
                static const WCHAR wchUCS2[] = L"UCS-2";
                encoding = Encoding::newEncoding(wchUCS2, LENGTH(wchUCS2), (0xFFFE == guess), true);
            }
            bnext = 2;
        }

        if (NULL == encoding)
            return E_OUTOFMEMORY;
        encoding->littleendian = (0xFFFE == guess);
    }
    else
    {
        if (!encoding)
        {
            encoding = Encoding::newEncoding(L"UTF-8", 5, false, false);
            if (NULL == encoding)
                return E_OUTOFMEMORY;
        }

        // In some system, such as win2k, there is BOM 0xEF BB BF for UTF8
        if (guess == 0xEFBB)
        {
            if (btotal < 3)
                return S_OK;

            if ((unsigned char)buf[2] == 0xBF)
            {
                // Only skip the three bytes when they really are the UTF-8
                // BOM. Previously bnext was set to 3 unconditionally, which
                // silently discarded EF BB xx sequences that are not a BOM.
                _fUTF8BOM = true;
                bnext = 3;
            }
            else
            {
                // EF BB xx with xx != BF is ordinary data: no BOM present.
                encoding->byteOrderMark = false;
            }
        }
        else
        {
            encoding->byteOrderMark = false;
        }
    }

    checkhr2(CharEncoder::getWideCharFromMultiByteInfo(encoding, &codepage, &pfnWideCharFromMultiByte, &maxCharSize));
    return S_OK;
}

/////////////////////////////////////////////////////////////////////////////////////////
/**
 * Switchs the character encoding of the input stream
 * Returns:
 *      S_OK: succeeded, and do not need re-read
 *      S_FALSE: succeeded, needs to re-read from newPosition
 *      Otherwise: error code
 * Notice:
 *      This method only works for input stream, newPosition starts with 1
 *
 * Ownership: newEncoding is always consumed. It is either adopted as the
 * stream's encoding or deleted on every other return path.
 */
HRESULT EncodingStream::switchEncodingAt(Encoding * newEncoding, int newPosition)
{
    // Ignore encoding information in the document when charset information is set from outside
    // xwu: fusion xml parsed does not use Charset
    //if (_fSetCharset)
    //    return S_OK;

    // Offset of newPosition relative to the start of the current buffer.
    int l = newPosition - startAt;
    if (l < 0 || l > (int)bnext)
    {
        // out of range
        delete newEncoding;
        return E_INVALIDARG;
    }

    UINT newcodepage;
    UINT newCharSize;

    //
    // get and check charset information
    //
    WideCharFromMultiByteFunc * pfn;
    HRESULT hr = CharEncoder::getWideCharFromMultiByteInfo(newEncoding, &newcodepage, &pfn, &newCharSize);
    if (hr != S_OK)
    {
        delete newEncoding;
        return E_INVALIDARG;
    }

    if (codepage == newcodepage)
    {
        // Nothing to switch.
        delete newEncoding;
        return S_OK;
    }

    // Now if we are in UCS-2/UCS-4 we cannot switch out of UCS-2/UCS-4 and if we are
    // not in UCS-2/UCS-4 we cannot switch into UCS-2/UCS-4.
    // Also if UTF-8 BOM is presented, we cannot switch away
    if ((codepage != CP_UCS_2 && newcodepage == CP_UCS_2) ||
        (codepage == CP_UCS_2 && newcodepage != CP_UCS_2) ||
        (codepage == CP_UTF_8 && newcodepage != CP_UTF_8 && _fUTF8BOM))
    {
        delete newEncoding;
        return E_FAIL;
    }

    // Ok, then, let's make the switch.
    if (encoding)
    {
        delete encoding;
    }
    encoding = newEncoding;
    maxCharSize = newCharSize;
    codepage = newcodepage;
    pfnWideCharFromMultiByte = pfn;

    // Because the XML declaration is encoded in UTF-8,
    // Mapping input characters to wide characters is one-to-one mapping
    if ((int)bnext != l)
    {
        // Rewind the consume cursor so the bytes after newPosition are
        // decoded again with the new encoding.
        bnext = l;
        return S_FALSE;
    }
    return S_OK;
}

//////////////////////////////////////////////////////////////////////////////////
// minlen is the number of UNICODE, which is the same number of byte we read from the file
/**
 * Discards the bnext bytes already consumed from the front of buf, moves the
 * remaining bytes to offset 0, and makes sure buf holds at least minlen bytes.
 * On success: btotal is the number of unconsumed bytes, bnext is 0 and
 * startAt has been advanced by the number of bytes discarded.
 *
 * Returns S_OK, or E_OUTOFMEMORY if the buffer could not be grown. In the
 * failure case no member is modified, so the stream state stays consistent.
 * (Previously btotal was decremented before the allocation was attempted.)
 */
HRESULT EncodingStream::prepareForInput(ULONG minlen)
{
    Assert(btotal >= bnext);
    const ULONG remaining = btotal - bnext;

    if (bufsize < minlen)
    {
        BYTE* newbuf = NEW (BYTE[minlen]);
        if (newbuf == NULL)
        {
            return E_OUTOFMEMORY;
        }

        if (buf)
        {
            // remaining <= bufsize < minlen, so the copy always fits.
            ::memcpy(newbuf, buf + bnext, remaining);
            delete[] buf;
        }

        buf = newbuf;
        bufsize = minlen;
    }
    else if (bnext > 0 && remaining > 0)
    {
        // Shift remaining bytes down to beginning of buffer.
        ::memmove(buf, buf + bnext, remaining);
    }

    btotal = remaining;
    startAt += bnext;
    bnext = 0;
    return S_OK;
}

//////////////////////////////////////////////////////////////////////////////////
// xiaoyu : here it assumes that it is a BYTE buffer, not a WCHAR byte, so it can be copied directly
/**
 * Appends length raw bytes from buffer to the internal buffer (push model).
 * fLastBuffer records whether this is the final chunk of input. If the
 * encoding has not been determined yet, autoDetect() is attempted.
 *
 * Returns S_OK, E_OUTOFMEMORY, or an error from autoDetect().
 */
HRESULT EncodingStream::AppendData( const BYTE* buffer, ULONG length, BOOL fLastBuffer)
{
    Assert(btotal >= bnext);
    lastBuffer = (fLastBuffer != FALSE);
    HRESULT hr = S_OK;

    ULONG minlen = length + (btotal - bnext); // make sure we don't loose any data
    if (minlen < BUFFERSIZE)
        minlen = BUFFERSIZE;

    checkhr2( prepareForInput(minlen)); // guarantee enough space in the array

    if (length > 0 && buffer != NULL)
    {
        // Copy raw data into new buffer.
        ::memcpy(buf + btotal, buffer, length);
        btotal += length;
    }

    if (pfnWideCharFromMultiByte == NULL) // first AppendData call
    {
        checkhr2(autoDetect());
    }

    return hr;
}

//////////////////////////////////////////////////////////////////////////////////
/**
 * Reads from the underlying stream in 4096-byte chunks, growing the internal
 * buffer as needed, until a read returns no data or a non-S_OK result.
 *
 * Returns S_FALSE once the end of the stream has been reached (also when it
 * had been reached by an earlier call); otherwise the result of the last
 * stream read or an error from prepareForInput().
 *
 * NOTE(review): stream is dereferenced without a null check here, unlike in
 * Read() - presumably callers only use this in pull mode; confirm.
 */
HRESULT EncodingStream::BufferData()
{
    HRESULT hr = S_OK;

    checkhr2(prepareForInput(0)); // 0 is used just for shift down (so bnext=0).

    if (_fEOF) // already hit the end of the stream.
        return S_FALSE;

    const DWORD BUFSIZE = 4096;

    DWORD dwRead = 1;

    while (S_OK == hr && dwRead > 0)
    {
        // if we cannot fit another buffer full, then re-allocate.
        DWORD minsize = (btotal+BUFSIZE > bufsize) ? bufsize + BUFSIZE : bufsize;
        checkhr2( prepareForInput(minsize)); // make space available.

        dwRead = 0;
        hr = stream->Read(buf + btotal, BUFSIZE, &dwRead);
        btotal += dwRead;
    }

    if (SUCCEEDED(hr) && dwRead == 0)
    {
        _fEOF = true;
        hr = S_FALSE; // return S_FALSE when at eof.
    }
    return hr;
}