920 lines
22 KiB
C++
Raw Normal View History

2001-01-01 00:00:00 +01:00
// ==========================================================================================
// RtfParser.cpp
//
// Impl RTF parser
//
// History:
// first created
// ==========================================================================================
//#include <windows.h>
#include "stdafx.h"
#include <stdio.h>
#include <assert.h>
#include "rtfparser.h"
#include "ConvEng.h"
//extern BOOL MapFunc(PBYTE, UINT, PBYTE, UINT*);
// Byte sequence compared against the start of the input buffer by
// CRtfParser::fRTFFile() to decide whether the input is RTF.
const char szRTFSignature[] = "{\\rtf";
// Keyword descriptions
//
// Flat initializer list: every three consecutive values describe one entry --
// the control word text (without the leading backslash), its keyword class
// (kwdSpec / kwdDest), and a class-specific index (an ipfn... value for
// kwdSpec entries, an idest... value for kwdDest entries).
// CRtfParser::TranslateKeyword() searches this table linearly with strcmp().
SYM g_rgSymRtf[] = {
// keyword kwd idx
"*", kwdSpec, ipfnSkipDest,
"'", kwdSpec, ipfnHex,
"bin", kwdSpec, ipfnBin,
"upr", kwdDest, idestSkip,
"fonttbl", kwdDest, idestSkip,
// The entries below are disabled: because they are commented out, these
// destinations are not in the table and therefore are not skipped.
/*
// we will search through following destinations
"author", kwdDest, idestSkip,
"buptim", kwdDest, idestSkip,
"colortbl", kwdDest, idestSkip,
"comment", kwdDest, idestSkip,
"creatim", kwdDest, idestSkip,
"doccomm", kwdDest, idestSkip,
"fonttbl", kwdDest, idestSkip,
"footer", kwdDest, idestSkip,
"footerf", kwdDest, idestSkip,
"footerl", kwdDest, idestSkip,
"footerr", kwdDest, idestSkip,
"footnote", kwdDest, idestSkip,
"ftncn", kwdDest, idestSkip,
"ftnsep", kwdDest, idestSkip,
"ftnsepc", kwdDest, idestSkip,
"header", kwdDest, idestSkip,
"headerf", kwdDest, idestSkip,
"headerl", kwdDest, idestSkip,
"headerr", kwdDest, idestSkip,
"info", kwdDest, idestSkip,
"keywords", kwdDest, idestSkip,
"operator", kwdDest, idestSkip,
"pict", kwdDest, idestSkip,
"printim", kwdDest, idestSkip,
"private1", kwdDest, idestSkip,
"revtim", kwdDest, idestSkip,
"rxe", kwdDest, idestSkip,
"stylesheet", kwdDest, idestSkip,
"subject", kwdDest, idestSkip,
"tc", kwdDest, idestSkip,
"title", kwdDest, idestSkip,
"txe", kwdDest, idestSkip,
"xe", kwdDest, idestSkip,
*/
};
// Number of entries in g_rgSymRtf (array byte size divided by entry size).
int g_iSymMax = sizeof(g_rgSymRtf) / sizeof(SYM);
// Constructor
//
// Binds the parser to a caller-supplied input buffer (pchInput, cchInput
// bytes) and output buffer (pchOutput, cchOutput bytes). Only the pointers
// are stored; nothing is copied here. The parser is flagged as initialized
// only when both pointers are non-NULL and both lengths are non-zero.
CRtfParser::CRtfParser( BYTE* pchInput, UINT cchInput,
                        BYTE* pchOutput, UINT cchOutput)
{
    m_pchInput  = pchInput;
    m_cchInput  = cchInput;
    m_pchOutput = pchOutput;
    m_cchOutput = cchOutput;

    // Bring every piece of parsing state to its start value.
    Reset();

    const BOOL fInputUsable  = (pchInput  != NULL) && (cchInput  != 0);
    const BOOL fOutputUsable = (pchOutput != NULL) && (cchOutput != 0);
    m_fInit = (fInputUsable && fOutputUsable) ? TRUE : FALSE;
}
// Reset
//
// Puts the parser back into its start-of-run state: cursors rewound, group
// stack head cleared, pending conversion range emptied and the keyword
// search request zeroed. The stack head is only set to NULL here; no SAVE
// nodes are freed by this function.
void CRtfParser::Reset(void)
{
    // keyword search request
    memset(&m_sKeyword, 0, sizeof(SKeyword));

    // input / output positions and the pending conversion range
    m_uCursor    = 0;
    m_uOutPos    = 0;
    m_uConvStart = 0;
    m_cchConvLen = 0;
    m_bsStatus   = bsDefault;

    // reader state
    m_ris            = risNorm;
    m_rds            = rdsNorm;
    m_cbBin          = 0;
    m_fSkipDestIfUnk = FALSE;

    // group stack
    m_psave  = NULL;
    m_cGroup = 0;
}
// fRTFFile
//
// Checks the RTF signature.
// Returns TRUE when the parser was initialized successfully and the input
// buffer starts with szRTFSignature; FALSE otherwise. Inputs shorter than
// the signature are rejected up front so the comparison never reads past
// the end of the caller's buffer.
BOOL CRtfParser::fRTFFile()
{
    const size_t cchSignature = sizeof(szRTFSignature) - 1; // exclude the NUL

    if (!m_fInit || m_cchInput < cchSignature)
    {
        return FALSE;
    }
    return (0 == memcmp(m_pchInput, szRTFSignature, cchSignature)) ? TRUE : FALSE;
}
// GetVersion
//
// Gets the major RTF version, i.e. the numeric parameter of the "rtf"
// control word.
//   pdwVersion: receives the version; preset to 1 and left at 1 when the
//               keyword is missing, has no parameter, or parsing fails.
// Returns the error code produced by Do().
// The parser state is cleared again before returning.
int
CRtfParser::GetVersion(PDWORD pdwVersion)
{
    int ec;

    *pdwVersion = 1;

    // set keyword to get
    m_sKeyword.wStatus |= KW_ENABLE;
    strcpy(m_sKeyword.szKeyword, "rtf");

    ec = Do();
    if (ec == ecOK &&
        (m_sKeyword.wStatus & KW_FOUND) &&
        (m_sKeyword.wStatus & KW_PARAM))
    {
        // The stored parameter may carry the trailing delimiter space;
        // atoi stops at it.
        *pdwVersion = (DWORD) atoi(m_sKeyword.szParameter);
    }

    // Do() can return (on its error exits) with groups still open. Reset()
    // merely clears the stack head, so free any remaining SAVE nodes first
    // or they are leaked.
    ReleaseRtfState();
    Reset();
    return ec;
}
// GetCodepage
//
// Gets the ANSI code page, i.e. the numeric parameter of the "ansicpg"
// control word.
//   pdwCodepage: receives the code page; preset to 0 and left at 0 when the
//                keyword is missing, has no parameter, or parsing fails.
// Returns the error code produced by Do().
// The parser state is cleared again before returning.
int
CRtfParser::GetCodepage(PDWORD pdwCodepage)
{
    int ec;

    *pdwCodepage = 0;

    // set keyword to get
    m_sKeyword.wStatus |= KW_ENABLE;
    strcpy(m_sKeyword.szKeyword, "ansicpg");

    ec = Do();
    if (ec == ecOK &&
        (m_sKeyword.wStatus & KW_FOUND) &&
        (m_sKeyword.wStatus & KW_PARAM))
    {
        // The stored parameter may carry the trailing delimiter space;
        // atoi stops at it.
        *pdwCodepage = (DWORD) atoi(m_sKeyword.szParameter);
    }

    // Do() can return (on its error exits) with groups still open. Reset()
    // merely clears the stack head, so free any remaining SAVE nodes first
    // or they are leaked.
    ReleaseRtfState();
    Reset();
    return ec;
}
// Do
//
// Main parser loop. Reads the input one byte at a time and dispatches on it:
//   '{'  pushes a group state, '}' pops one,
//   '\'  hands over to ParseRtfKeyword (which also copies the keyword out),
//   everything else is classified (default / text / hex) and passed to
//   ParseChar, which copies it to the output.
// While in binary mode (risBin) bytes are passed to ParseChar unexamined.
//
// When a keyword search is active (KW_ENABLE) the loop stops as soon as the
// keyword has been found; the group stack is released and ecOK is returned
// regardless of how many groups were still open.
//
// Returns ecOK on success, ecStackUnderflow / ecUnmatchedBrace for unbalanced
// groups, or the first error reported by a helper. On an error return the
// group stack is left as it was.
int
CRtfParser::Do()
{
    int ec;
    int cNibble = 2;        // hex digits still expected after \'
    BYTE ch;
    BSTATUS bsStatus;

    while ((ec = GetByte(&ch)) == ecOK)
    {
        if (m_cGroup < 0)
            return ecStackUnderflow;

        // check if search specific keyword
        if ((m_sKeyword.wStatus & KW_ENABLE) &&
            (m_sKeyword.wStatus & KW_FOUND))
        {
            ReleaseRtfState();
            break;
        }

        // set buf status
        bsStatus = bsDefault;

        // if we're parsing binary data, hand the byte over untouched
        if (m_ris != risBin)
        {
            switch (ch)
            {
            case '{':
                if ((ec = PushRtfState()) != ecOK)
                    return ec;
                break;

            case '}':
                if ((ec = PopRtfState()) != ecOK)
                    return ec;
                break;

            case '\\':
                if ((ec = ParseRtfKeyword()) != ecOK)
                    return ec;
                continue;       // all keyword is processed in ParseRtfKeyword

            case 0x0d:
            case 0x0a:          // cr and lf are noise characters:
                break;          // still copied out, but with default status

            default:
                if (m_ris == risNorm)
                {
                    bsStatus = bsText;
                }
                else if (m_ris == risHex)
                {
                    // one of the two hex digits following \'
                    cNibble--;
                    if (!cNibble) {
                        cNibble = 2;
                        m_ris = risNorm;
                    }
                    bsStatus = bsHex;
                }
                else
                {
                    return ecAssertion;
                }
                break;
            } // switch
        }

        if ((ec = ParseChar(ch, bsStatus)) != ecOK)
            return ec;
    } // while

    // The searched keyword may have been the very last token of the input,
    // in which case the in-loop check never ran. A successful search must not
    // be reported as an unmatched brace.
    if ((m_sKeyword.wStatus & KW_ENABLE) &&
        (m_sKeyword.wStatus & KW_FOUND))
    {
        ReleaseRtfState();
        return ecOK;
    }

    if (m_cGroup < 0)
        return ecStackUnderflow;
    if (m_cGroup > 0)
        return ecUnmatchedBrace;
    return ecOK;
}
//
// PushRtfState
//
// Save relevant info on a linked list of SAVE structures.
//
int
CRtfParser::PushRtfState(void)
{
SAVE *psaveNew = new SAVE;
if (!psaveNew)
return ecStackOverflow;
psaveNew -> pNext = m_psave;
psaveNew -> rds = m_rds;
psaveNew -> ris = m_ris;
m_ris = risNorm;
// do not save rds, rds status spread to sub destination until this destination
// terminated
m_psave = psaveNew;
m_cGroup++;
return ecOK;
}
//
// PopRtfState
//
// If we're ending a destination (that is, the destination is changing),
// call ecEndGroupAction.
// Always restore relevant info from the top of the SAVE list.
//
int
CRtfParser::PopRtfState(void)
{
SAVE *psaveOld;
if (!m_psave)
return ecStackUnderflow;
if (m_rds != m_psave->rds)
{ // todo:
// if ((ec = EndGroupAction(rds)) != ecOK)
// return ec;
}
m_rds = m_psave->rds;
m_ris = m_psave->ris;
psaveOld = m_psave;
m_psave = m_psave->pNext;
m_cGroup--;
delete psaveOld;
return ecOK;
}
//
// ReleaseRtfState
// when find specific keyword and want to abort the parser abnormally
// call this function to flash the state stack
//
int CRtfParser::ReleaseRtfState(void)
{
SAVE *psaveOld;
while(psaveOld = m_psave)
{
assert(m_cGroup);
m_psave = m_psave->pNext;
m_cGroup--;
delete psaveOld;
}
return ecOK;
}
//
// ParseChar
//
// Routes one byte to the output stream.
//   ch:       byte to store
//   bsStatus: classification of the byte; forced to bsDefault while the
//             current destination is being skipped (rdsSkip)
// In binary mode each call counts one byte off m_cbBin, and the internal
// state returns to risNorm once the count is used up.
// Returns ecOK, or the error from SetStatus / SaveByte.
//
int
CRtfParser::ParseChar(BYTE ch, BSTATUS bsStatus)
{
    // account for one more byte of \bin data
    if (m_ris == risBin)
    {
        m_cbBin--;
        if (m_cbBin <= 0)
            m_ris = risNorm;
    }

    // Bytes of a skipped destination never take part in a conversion.
    // (rdsNorm and any other destination keep the caller's status.)
    if (m_rds == rdsSkip)
        bsStatus = bsDefault;

    // set status, trigger the conversion if any
    int ec = SetStatus(bsStatus);

    // save the char
    if (ec == ecOK)
        ec = SaveByte(ch);

    return ec;
}
//
// ParseRtfKeyword
//
// get a control word (and its associated value) and
// call TranslateKeyword to dispatch the control.
//
int
CRtfParser::ParseRtfKeyword()
{
BOOL fNeg = FALSE;
char *pch;
char szKeyword[30];
char szParameter[20];
BYTE ch;
szKeyword[0] = '\0';
szParameter[0] = '\0';
if (GetByte(&ch) != ecOK)
return ecEndOfFile;
if (!isalpha(ch)) // a control symbol; no delimiter.
{
szKeyword[0] = (char) ch;
szKeyword[1] = '\0';
return TranslateKeyword(szKeyword, szParameter);
}
for (pch = szKeyword; isalpha(ch); GetByte(&ch))
*pch++ = (char) ch;
*pch = '\0';
if (ch == '-')
{
fNeg = TRUE;
if (GetByte(&ch) != ecOK)
return ecEndOfFile;
}
if (isdigit(ch))
{
pch = szParameter;
if (fNeg) *pch++ = '-';
for (; isdigit(ch); GetByte(&ch))
*pch++ = (char) ch;
*pch = '\0';
}
if (ch != ' ') {
unGetByte(ch);
} else {
strcat(szParameter, " "); // append the space to keyword
}
return TranslateKeyword(szKeyword, szParameter);
}
//
// TranslateKeyword.
// Inputs:
// szKeyword: The RTF control to evaluate.
int
CRtfParser::TranslateKeyword(char *szKeyword, char* szParameter)
{
BSTATUS bsStatus;
int isym;
int ec;
BYTE ch;
// check specific keyword first
if (m_sKeyword.wStatus & KW_ENABLE)
{
if (strcmp(szKeyword, m_sKeyword.szKeyword) == 0)
{
strcpy(m_sKeyword.szParameter, szParameter);
if (szParameter[0] != '\0' && szParameter[0] != ' ')
m_sKeyword.wStatus |= KW_PARAM;
m_sKeyword.wStatus |= KW_FOUND;
return ecOK;
}
}
// search for szKeyword in rgsymRtf
for (isym = 0; isym < g_iSymMax; isym++) {
if (strcmp(szKeyword, g_rgSymRtf[isym].szKeyword) == 0)
break;
}
if (isym == g_iSymMax) // control word not found
{
if (m_fSkipDestIfUnk) // if this is a new destination
m_rds = rdsSkip; // skip the destination
// else just discard it
m_fSkipDestIfUnk = FALSE;
ec = ecOK;
goto gotoExit;
}
// found it! use kwd and idx to determine what to do with it.
m_fSkipDestIfUnk = FALSE;
switch (g_rgSymRtf[isym].kwd)
{
case kwdChar:
break;
case kwdDest:
ec = ChangeDest((IDEST)g_rgSymRtf[isym].idx);
break;
case kwdSpec:
ec = ParseSpecialKeyword((IPFN)g_rgSymRtf[isym].idx, szParameter);
break;
default:
ec = ecBadTable;
}
gotoExit:
// save keyword and parameter
if (m_ris == risHex) {
bsStatus = bsHex;
} else {
bsStatus =bsDefault;
}
ParseChar('\\', bsStatus);
while (ch = *szKeyword++) ParseChar(ch, bsStatus);
while (ch = *szParameter++) ParseChar(ch, bsStatus);
return ec;
}
//
// ParseSpecialKeyword
//
// Handles the controls that need more than a table lookup:
//   ipfnBin      - enter binary mode for atol(szParameter) bytes
//   ipfnSkipDest - remember that the next unknown keyword starts a
//                  destination that must be skipped
//   ipfnHex      - enter hex mode
// While a destination is being skipped everything except ipfnBin is ignored.
// Returns ecOK, or ecBadTable for an unknown ipfn.
//
int
CRtfParser::ParseSpecialKeyword(IPFN ipfn, char* szParameter)
{
    const BOOL fSkipping = (m_rds == rdsSkip);

    // if we're skipping, and it's not the \bin keyword, ignore it.
    if (fSkipping && ipfn != ipfnBin)
        return ecOK;

    if (ipfn == ipfnBin)
    {
        m_ris = risBin;
        m_cbBin = atol(szParameter);
    }
    else if (ipfn == ipfnSkipDest)
    {
        m_fSkipDestIfUnk = TRUE;
    }
    else if (ipfn == ipfnHex)
    {
        m_ris = risHex;
    }
    else
    {
        return ecBadTable;
    }
    return ecOK;
}
//
// ChangeDest
//
// Switches to the destination identified by idest. At present every
// destination, known or not, is simply skipped; the switch is kept as the
// place where real destination handling would go.
// Does nothing when text is already being skipped. Always returns ecOK.
//
int
CRtfParser::ChangeDest(IDEST idest)
{
    if (m_rds != rdsSkip)           // already skipping: leave it alone
    {
        switch (idest)
        {
        case idestPict:
        case idestSkip:
        default:
            m_rds = rdsSkip;        // when in doubt, skip it...
            break;
        }
    }
    return ecOK;
}
//
// GetByte
//
// Fetches the byte at the input cursor into *pch and advances the cursor.
// Returns ecOK, or ecEndOfFile when the cursor is at the end of the input;
// in that case *pch is left unmodified.
//
int
CRtfParser::GetByte(BYTE* pch)
{
    if (m_uCursor < m_cchInput)
    {
        *pch = m_pchInput[m_uCursor];
        ++m_uCursor;
        return ecOK;
    }
    return ecEndOfFile;
}
//
// unGetByte
//
// Gives the most recently read byte back to the input by moving the cursor
// one position backwards (never below zero). The ch argument is not used;
// the byte is re-read from the input buffer. Always returns ecOK.
//
int
CRtfParser::unGetByte(BYTE ch)
{
    (void) ch;

    if (m_uCursor != 0)
        --m_uCursor;

    return ecOK;
}
//
// SaveByte
//
// Appends one byte to the output buffer and extends the pending conversion
// range by one. Returns ecOK, or ecBufTooSmall when the output buffer is
// already full (nothing is written in that case).
//
int
CRtfParser::SaveByte(BYTE ch)
{
    if (m_uOutPos < m_cchOutput)
    {
        m_pchOutput[m_uOutPos] = ch;
        ++m_uOutPos;        // output buffer ++
        ++m_cchConvLen;     // mapping range also ++
        return ecOK;
    }
    return ecBufTooSmall;
}
//
// SetStatus
//
// Sets the buffer status for the bytes that follow. When the status changes,
// the run of output bytes collected under the previous status
// ([m_uConvStart, m_uConvStart + m_cchConvLen)) is finished:
//   bsDefault / bsText - left as is (no conversion for ANSI text)
//   bsHex              - a run like "\'xx\'xx" is rewritten in place as
//                        {\upr{\'xx\'xx}{\*\ud{\uc0 \uN\uN...}}}
// Afterwards a new, empty run is started at the current output position.
//
// Returns ecOK, ecOutOfMemory, ecBufTooSmall when the rewritten run does not
// fit into the output buffer, or ecAssertion for an unknown previous status.
//
int
CRtfParser::SetStatus(BSTATUS bsStatus)
{
    PBYTE pchDBCS, pchWCHAR, pchUniDes;
    UINT cchLen;

    assert(m_uOutPos == m_uConvStart + m_cchConvLen);

    if (bsStatus == m_bsStatus)
        return ecOK;

    switch (m_bsStatus)
    {
    case bsDefault:
        // control symbol, keyword, group char...
        break;

    case bsText:
        // here we got Ansi text
        // we do not do conversion for ansi text
        break;

    case bsHex:
        // when we are here,
        // the rtf contains DBCS chars like "\'xx\'xx"
        // we only need to do DBCS->Unicode conversion, since we can not get
        // \upr keyword here (\upr is skipped, see keyword table)
        // we will map DBCS string "\'xx\'xx" to
        // "{\upr{"\'xx\'xx"}{\*\ud{\uc0 "Unicode string"}}}
        // in which Unicode string is like this:
        // \u12345\u-12345....
        // rtf treat unicode value as signed 16-bit decimal
        // so we don't distinguish 16-bit or 32-bit wide char, all
        // processed as 2-byte WCHAR
        if (m_cchConvLen == 0) {
            break;
        }

        // One allocation, two parts:
        //   pchDBCS  : m_cchConvLen bytes         (decoded DBCS string)
        //   pchWCHAR : m_cchConvLen * 2 + 8 bytes (wide string)
        pchDBCS = new BYTE[m_cchConvLen * 3 + 8];
        if (!pchDBCS) return ecOutOfMemory;
        pchWCHAR = pchDBCS + m_cchConvLen;

        // map Hex string to DBCS string, cchLen returned in bytes.
        // A run that is not well-formed hex cannot be converted; leave it
        // in the output exactly as it was read instead of wrapping it in a
        // \upr group with an empty Unicode part.
        if (Hex2Char(m_pchOutput + m_uConvStart, m_cchConvLen,
                     pchDBCS, m_cchConvLen, &cchLen) != ecOK)
        {
            delete [] pchDBCS;
            break;
        }

        // map DBCS string to Unicode string, cchLen returned in WCHARs
        cchLen = AnsiStrToUnicodeStr(pchDBCS, cchLen, (PWCH)pchWCHAR, cchLen+4);

        // allocate a buffer for unicode destination
        // since one WCHAR map to max \u-xxxxx, that's 8 bytes
        // adding other bytes for surrounding keywords and group chars
        // adding DBCS strings
        pchUniDes = new BYTE[cchLen * 8 + 32 + m_cchConvLen];
        if (!pchUniDes) {
            delete [] pchDBCS;
            return ecOutOfMemory;
        }

        // map to unicode destination, cchLen returned in bytes
        GetUnicodeDestination(pchUniDes, (LPWSTR)pchWCHAR, cchLen, &cchLen);

        // The rewritten run is longer than the hex run it replaces; make
        // sure it still fits before touching the caller's buffer.
        // (m_uConvStart never exceeds m_cchOutput, so no wrap-around here.)
        if (cchLen > m_cchOutput - m_uConvStart) {
            delete [] pchDBCS;
            delete [] pchUniDes;
            return ecBufTooSmall;
        }

        // replace old hex with new hex
        memcpy(m_pchOutput + m_uConvStart, pchUniDes, cchLen);
        m_uConvStart += cchLen;

        // set new output position
        m_uOutPos = m_uConvStart;

        delete [] pchDBCS;
        delete [] pchUniDes;
        break;

    default:
        assert(0);
        return ecAssertion;
    }

    // clean map buffer
    m_uConvStart = m_uOutPos;
    m_cchConvLen = 0;

    // set status
    m_bsStatus = bsStatus;
    return ecOK;
}
//
// Hex2Char
//
// Converts an RTF hex string of the form \'xx\'xx\'xx to the bytes it
// encodes.
//   pchSrc, cchSrc: hex string and its length in bytes
//   pchDes, cchDes: destination buffer and its capacity in bytes
//   pcchLen:        receives the number of bytes written (0 on error)
// Returns ecOK, or ecInvalidHex when the source is malformed or the
// destination is too small. Upper- and lower-case hex digits are accepted.
// A trailing single hex digit (odd digit count) is dropped.
//
int
CRtfParser::Hex2Char(BYTE* pchSrc, UINT cchSrc, BYTE* pchDes, UINT cchDes, UINT* pcchLen)
{
    BYTE* pchTmp = pchDes;
    BYTE ch;
    BYTE b = 0;
    BYTE bDigit;
    BYTE cNibble = 2;

    // should be \'xx\'xx\'xx
    assert (cchSrc % 4 == 0);

    *pcchLen = 0;
    if (cchDes < cchSrc/4) {
        goto gotoError;
    }

    while (cchSrc--)
    {
        ch = *pchSrc++;
        if (ch == '\\') {
            // Must be followed by a quote. Only peek at the next byte if
            // there is one; cchSrc now counts the bytes still unread.
            if (cchSrc == 0 || *pchSrc != '\'') {
                goto gotoError;
            }
        } else if (ch == '\'') {
            // second half of the \' introducer, nothing to do
        }
        else
        {
            // Explicit ASCII ranges: hex digits must not depend on the
            // C locale.
            if (ch >= '0' && ch <= '9')
                bDigit = (BYTE)(ch - '0');
            else if (ch >= 'a' && ch <= 'f')
                bDigit = (BYTE)(ch - 'a' + 10);
            else if (ch >= 'A' && ch <= 'F')
                bDigit = (BYTE)(ch - 'A' + 10);
            else
                goto gotoError;

            b = (BYTE)((b << 4) + bDigit);

            cNibble--;
            if (!cNibble)
            {
                // The up-front size check assumes well-formed \'xx groups;
                // guard each store so that other digit layouts cannot
                // overrun the destination.
                if ((UINT)(pchDes - pchTmp) >= cchDes)
                    goto gotoError;
                *pchDes++ = b;
                cNibble = 2;
                b = 0;
            }
        }
    }

    *pcchLen = (UINT)(pchDes - pchTmp);
    return ecOK;

gotoError:
    assert(0);
    return ecInvalidHex;
}
// Low / high 4 bits of a byte. The argument is parenthesized so that
// expressions can be passed safely.
#define LONIBBLE(c) ((c) & 0x0f)
#define HINIBBLE(c) (((c) & 0xf0) >> 4)
//
// Char2Hex
//
// Converts a byte string to an RTF hex string: every source byte becomes the
// four characters \'xx, with lower-case hex digits.
//   pchSrc, cchSrc: source bytes and their count
//   pchDes, cchDes: destination buffer and its capacity in bytes
//                   (cchSrc * 4 bytes are needed)
//   pcchLen:        receives the number of bytes written (0 on error)
// Returns ecOK, or ecInvalidHex when the destination is too small.
//
int
CRtfParser::Char2Hex(BYTE* pchSrc, UINT cchSrc, BYTE* pchDes, UINT cchDes, UINT* pcchLen)
{
    static const char rgchHex[] = "0123456789abcdef";
    BYTE* pchTmp = pchDes;
    BYTE ch;

    *pcchLen = 0;

    // Same test as "cchDes < cchSrc * 4", written so that the
    // multiplication cannot wrap around for a very large cchSrc.
    if (cchSrc > cchDes / 4) {
        assert(0);
        return ecInvalidHex;
    }

    while (cchSrc--)
    {
        ch = *pchSrc++;
        *pchDes++ = '\\';
        *pchDes++ = '\'';
        // A nibble is always in 0..15, so the table lookup cannot fail.
        *pchDes++ = (BYTE) rgchHex[HINIBBLE(ch)];
        *pchDes++ = (BYTE) rgchHex[LONIBBLE(ch)];
    }

    *pcchLen = (UINT)(pchDes - pchTmp);
    return ecOK;
}
//
// GetUnicodeDestination
//
// Builds the RTF Unicode destination for the pending hex run:
//   {\upr{\'xx\'xx}{\*\ud{\uc0 \u12345\u-12345}}}
// The \'xx part is copied from the output buffer (m_cchConvLen bytes at
// m_uConvStart); the \u part is generated from pwchStr.
//   pchUniDes: destination buffer; its size is not passed in, so the caller
//              must have made it large enough
//   pwchStr, wchLen: wide string and its length in WCHARs
//   pcchLen:   receives the number of bytes written
// Always returns ecOK.
//
int
CRtfParser::GetUnicodeDestination(BYTE* pchUniDes, LPWSTR pwchStr, UINT wchLen, UINT* pcchLen)
{
    static char szHead[] = "{\\upr{";
    static char szMid[]  = "}{\\*\\ud{\\uc0 ";
    static char szTail[] = "}}}";

    BYTE* pchOut = pchUniDes;       // write cursor
    UINT  cchPiece;
    UINT  iwch;

    // \upr opener
    cchPiece = strlen(szHead);
    memcpy(pchOut, szHead, cchPiece);
    pchOut += cchPiece;

    // the original DBCS hex string
    memcpy(pchOut, m_pchOutput + m_uConvStart, m_cchConvLen);
    pchOut += m_cchConvLen;

    // middle part
    cchPiece = strlen(szMid);
    memcpy(pchOut, szMid, cchPiece);
    pchOut += cchPiece;

    // one \uN keyword per wide char
    for (iwch = 0; iwch < wchLen; iwch++)
    {
        WideCharToKeyword(pwchStr[iwch], pchOut, &cchPiece);
        pchOut += cchPiece;
    }

    // closing braces
    cchPiece = strlen(szTail);
    memcpy(pchOut, szTail, cchPiece);
    pchOut += cchPiece;

    *pcchLen = (UINT)(pchOut - pchUniDes);
    return ecOK;
}
//
// WideCharToKeyword
//
// Writes one wide char as an RTF \u keyword ("\uN", N being the character
// value taken as a signed 16-bit decimal number) to pchDes, followed by a
// terminating NUL.
//   pcchLen: receives the keyword length, not counting the NUL
// Always returns ecOK.
//
int
CRtfParser::WideCharToKeyword(WCHAR wch, BYTE* pchDes, UINT* pcchLen)
{
    char* szOut = (char*) pchDes;
    const short nValue = (short) wch;

    sprintf(szOut, "\\u%d", nValue);
    *pcchLen = strlen(szOut);

    return ecOK;
}