Windows2003-3790/inetcore/outlookexpress/oejunk/junkutil.cpp
2020-09-30 16:53:55 +02:00

528 lines
16 KiB
C++

/*
JUNKUTIL.CPP
(c) copyright 1998 Microsoft Corp
Shared utility functions
Created by Robert Rounthwaite (RobertRo@microsoft.com)
Modified by Brian Moore (brimo@microsoft.com)
*/
#include <pch.hxx>
#include "junkutil.h"
#include <msoedbg.h>
#define _WIN32_OE 0x0501
#include <mimeole.h>
// Returns the CT_CTYPE1 classification bits (C1_* flags) for the character
// that pszText points at, or 0 when pszText is NULL.
//
// A DBCS lead byte is classified together with the byte that follows it;
// any other byte is classified on its own.
WORD WGetStringTypeEx(LPCSTR pszText)
{
    WORD wCharType = 0;

    if (NULL != pszText)
    {
        // Two bytes for a double-byte character, one otherwise
        int cbChar = IsDBCSLeadByte(*pszText) ? 2 : 1;

        SideAssert(GetStringTypeEx(LOCALE_USER_DEFAULT, CT_CTYPE1, pszText, cbChar, &wCharType));
    }

    return wCharType;
}
// FMatchToken
//
// Decides whether a substring hit is a whole token rather than a piece of a
// larger token, by classifying the characters just before and just after it.
//
// fStart        - TRUE when the hit begins the text; the front check is skipped
// fEnd          - TRUE when the hit ends the text; the rear check is skipped
// pszPrev       - character preceding the hit; only dereferenced when fStart is FALSE
// pdwFlagsPrev  - flags DWORD handed to FDoWordMatchStart together with pszPrev
// pszWord       - the word that was searched for
// cchWord       - length of pszWord; pszWord + cchWord - 1 is used as its last character
// pdwFlagsWord  - flags DWORD handed to the helpers together with pszWord / its last character
// pszEnd        - character following the hit; only dereferenced when fEnd is FALSE
//
// Returns TRUE only when both the front test (f1-f3) and the rear test (r1-r6) pass.
//
// NOTE(review): FDoWordMatchStart, FDoWordMatchEnd, FIsInternalChar and the CT_* classes
// are declared outside this file; presumably the flags DWORDs cache character
// classification between calls - confirm against junkutil.h.
BOOL FMatchToken(BOOL fStart, BOOL fEnd, LPCSTR pszPrev, DWORD * pdwFlagsPrev, LPCSTR pszWord, ULONG cchWord, DWORD * pdwFlagsWord, LPCSTR pszEnd)
{
BOOL fRet = FALSE;
// Scratch flags for the character after the word (never shared with the caller)
DWORD dwFlagsEnd = 0;
LPCSTR pszEndWord = NULL;
// this code checks to see that the spot we found is a "word" and not a subword
// we want the character before and after to be word break, unless the character on that end of the
// string already is not a word break (or we're at the beginning of the string, for the char before)
// front checking
// f1: in either case we don't have to check the front if this is the first character; otherwise,
// f2: either the first character of the string is alnum and the previous character is not (and is not an "internal" character)
// f3: or the first character of the string isn't alnum, the previous character either is, or is a whitespace character
// rear checking
// r1: either we are at the end of the string
// r2: or the last character is alpha and the following character is not alpha or number (and is not an "internal" character)
// r3: or the last character is not alpha or num and the following character either is, or is a whitespace character
// r4: or the last character is num and the test depends on the first character:
// r5: if it was alphanum, then the following character is not alpha or number (and is not an "internal" character)
// r6: or it wasn't alphanum, then the following character is alpha or is a whitespace character
// Whew! This mimics the criteria used by GetNextFeature() in splitting up the string. The easiest way to understand this criteria
// is to examine that function
//
// The || / && short-circuiting below is load-bearing: fStart is tested first so that
// pszPrev (which FWordPresent passes as NULL at the start of the text) is never dereferenced.
if ((FALSE != fStart) || // f1
((FALSE != FDoWordMatchStart(pszWord, pdwFlagsWord, CT_START_ALPHANUM)) &&
(FALSE == FDoWordMatchStart(pszPrev, pdwFlagsPrev, CT_START_ALPHANUM)) &&
(FALSE == FIsInternalChar(*pszPrev))) || // f2
((FALSE == FDoWordMatchStart(pszWord, pdwFlagsWord, CT_START_ALPHANUM)) &&
(FALSE != FDoWordMatchStart(pszPrev, pdwFlagsPrev, CT_START_ALPHANUMSPACE)))) // f3
{
// Make it a little more readable
pszEndWord = pszWord + cchWord - 1;
// Likewise fEnd is tested first, so pszEnd is only examined when text follows the hit.
// NOTE(review): r3 passes CT_END_ALPHASPACE and r6 passes CT_END_ALPHANUMSPACE, which reads
// as swapped relative to the r3/r6 descriptions above - confirm which one is intended.
if ((FALSE != fEnd) || // r1
((FALSE != FDoWordMatchEnd(pszEndWord, pdwFlagsWord, CT_END_ALPHA)) &&
(FALSE == FDoWordMatchEnd(pszEnd, &dwFlagsEnd, CT_END_ALPHANUM)) &&
(FALSE == FIsInternalChar(*pszEnd))) || // r2
((FALSE == FDoWordMatchEnd(pszEndWord, pdwFlagsWord, CT_END_ALPHANUM)) &&
(FALSE != FDoWordMatchEnd(pszEnd, &dwFlagsEnd, CT_END_ALPHASPACE))) || // r3
((FALSE != FDoWordMatchEnd(pszEndWord, pdwFlagsWord, CT_END_NUM)) && // r4
(((FALSE != FDoWordMatchStart(pszWord, pdwFlagsWord, CT_START_ALPHANUM)) &&
(FALSE == FDoWordMatchEnd(pszEnd, &dwFlagsEnd, CT_END_ALPHANUM)) &&
(FALSE == FIsInternalChar(*pszEnd))) || // r5
((FALSE == FDoWordMatchStart(pszWord, pdwFlagsWord, CT_START_ALPHANUM)) &&
(FALSE != FDoWordMatchEnd(pszEnd, &dwFlagsEnd, CT_END_ALPHANUMSPACE)))))) // r6
{
// Good match
fRet = TRUE;
}
}
return fRet;
}
/////////////////////////////////////////////////////////////////////////////
// FWordPresent
//
// Looks for pszWord inside pszText as a whole token: every substring hit
// is handed to FMatchToken, which inspects the characters on either side
// of it (or the start / end of the text). The search is case sensitive.
//
// pszText    - text to search; NULL or empty yields FALSE
// pdwFlags   - flags for pszWord, passed through to FMatchToken; must not be NULL
// pszWord    - NUL-terminated word to look for; must not be NULL
// cchWord    - length of pszWord; must not be 0
// ppszMatch  - optional; receives the position of the accepted hit
//
// Returns TRUE on the first hit FMatchToken accepts, FALSE otherwise.
/////////////////////////////////////////////////////////////////////////////
BOOL FWordPresent(LPSTR pszText, DWORD * pdwFlags, LPSTR pszWord, ULONG cchWord, LPSTR * ppszMatch)
{
    LPSTR   pszHit = NULL;
    LPCSTR  pszBefore = NULL;
    DWORD   dwFlagsBefore = 0;

    // Nothing to search, or nothing to search for
    if ((NULL == pszText) || ('\0' == pszText[0]) || (NULL == pszWord) || (NULL == pdwFlags) || (0 == cchWord))
    {
        return FALSE;
    }

    // Visit each occurrence of the word in turn
    pszHit = StrStr(pszText, pszWord);
    while (NULL != pszHit)
    {
        BOOL fAtStart = (pszHit == pszText);
        BOOL fAtEnd = ('\0' == pszHit[cchWord]);

        // There is no previous character when the hit opens the text
        pszBefore = fAtStart ? NULL : CharPrev(pszText, pszHit);

        if (FALSE != FMatchToken(fAtStart, fAtEnd, pszBefore, &dwFlagsBefore,
                                 pszWord, cchWord, pdwFlags, pszHit + cchWord))
        {
            if (NULL != ppszMatch)
            {
                *ppszMatch = pszHit;
            }
            return TRUE;
        }

        // The previous character differs for every hit, so its flags can't be reused
        dwFlagsBefore = 0;

        // Resume the search one character past this hit
        pszHit = StrStr(CharNext(pszHit), pszWord);
    }

    return FALSE;
}
/////////////////////////////////////////////////////////////////////////////
// Special feature implementations
//
/////////////////////////////////////////////////////////////////////////////
// This feature fires when at least 25% of the first 50 words contain no lowercase letters (words with no letters at all count too)
// p20_BODY_INTRO_UPPERCASE_WORDS
const UINT g_cWordsMax = 50;
const DOUBLE g_cNonLowerWordsThreshold = 0.25;
// Scans up to g_cWordsMax whitespace-delimited words of pszText and reports
// whether the share of words containing no lowercase character is at least
// g_cNonLowerWordsThreshold.
//
// Returns FALSE for NULL text or text that is empty after leading whitespace.
BOOL FSpecialFeatureUpperCaseWords(LPCSTR pszText)
{
    UINT    cTotal = 0;
    UINT    cNoLower = 0;
    BOOL    fSawLower = FALSE;
    LPCSTR  pszCur = NULL;

    if (NULL == pszText)
    {
        return FALSE;
    }

    // Start at the first word
    pszCur = PszSkipWhiteSpace(pszText);
    if ('\0' == *pszCur)
    {
        return FALSE;
    }

    while (cTotal < g_cWordsMax)
    {
        WORD wType = WGetStringTypeEx(pszCur);
        BOOL fWordEnd = ((0 != (wType & C1_SPACE)) || ('\0' == *pszCur));

        if (FALSE == fWordEnd)
        {
            // Still inside a word: remember any lowercase character and advance
            if (0 != (wType & C1_LOWER))
            {
                fSawLower = TRUE;
            }
            pszCur = CharNext(pszCur);
            continue;
        }

        // A word just ended: tally it and reset the per-word state
        cTotal++;
        if (FALSE == fSawLower)
        {
            cNoLower++;
        }
        fSawLower = FALSE;

        // Move on to the next word, if there is one
        pszCur = PszSkipWhiteSpace(pszCur);
        if ('\0' == *pszCur)
        {
            break;
        }
    }

    return ((cTotal > 0) && (((double)cNoLower / cTotal) >= g_cNonLowerWordsThreshold));
}
// Applies FSpecialFeatureUpperCaseWords to the leading bytes of a stream.
//
// The stream is rewound to its start and up to 4096 bytes are read into a
// local buffer, which is NUL-terminated and scanned. The stream position is
// not restored afterwards.
//
// Returns FALSE when pIStm is NULL or when the seek or the read fails;
// otherwise the result of FSpecialFeatureUpperCaseWords on the bytes read.
BOOL FSpecialFeatureUpperCaseWordsStm(IStream * pIStm)
{
    BOOL            fRet = FALSE;
    // The stream delivers bytes and the scanner takes an LPCSTR, so the buffer
    // must be CHAR; a TCHAR buffer would change element size in a UNICODE build
    // while still being filled and indexed by a byte count.
    CHAR            rgchBuff[4096 + 1];
    const ULONG     cbMax = ARRAYSIZE(rgchBuff) - 1;   // keep one byte for the terminator
    ULONG           cbRead = 0;
    LARGE_INTEGER   liZero = {0};

    if (NULL == pIStm)
    {
        goto exit;
    }

    // Seek to the start of the stream
    if (FAILED(pIStm->Seek(liZero, STREAM_SEEK_SET, NULL)))
    {
        goto exit;
    }

    // Fill up the buffer
    if (FAILED(pIStm->Read(rgchBuff, cbMax, &cbRead)))
    {
        goto exit;
    }

    // Never index past the buffer, even if the stream reports a bogus count
    if (cbRead > cbMax)
    {
        cbRead = cbMax;
    }

    // Make sure the buffer is zero terminated
    rgchBuff[cbRead] = '\0';

    fRet = FSpecialFeatureUpperCaseWords(rgchBuff);

exit:
    return fRet;
}
// This feature fires when at least 8% of the first 200 non-space, non-digit characters are not letters
// p20_BODY_INTRO_NONALPHA
const UINT g_cchTextMax = 200;
const DOUBLE g_cNonSpaceNumThreshold = 0.08;
// Examines up to g_cchTextMax characters of pszText that are neither
// whitespace nor digits and reports whether the share of those that are not
// alphabetic is at least g_cNonSpaceNumThreshold.
//
// Returns FALSE for NULL text or when no character qualified for the count.
BOOL FSpecialFeatureNonAlpha(LPCSTR pszText)
{
    UINT    cchCounted = 0;
    UINT    cchNotLetter = 0;
    LPCSTR  pszCur = NULL;

    if (NULL == pszText)
    {
        return FALSE;
    }

    // Skip over the leading spaces
    pszCur = PszSkipWhiteSpace(pszText);

    // Walk the text until it ends or enough characters have been counted
    while (('\0' != *pszCur) && (cchCounted < g_cchTextMax))
    {
        WORD wType = WGetStringTypeEx(pszCur);

        // Spaces and digits are ignored entirely
        if (0 == (wType & (C1_SPACE | C1_DIGIT)))
        {
            cchCounted++;
            if (0 == (wType & C1_ALPHA))
            {
                cchNotLetter++;
            }
        }

        pszCur = CharNext(pszCur);
    }

    return ((cchCounted > 0) && (((double)cchNotLetter / cchCounted) >= g_cNonSpaceNumThreshold));
}
// Applies FSpecialFeatureNonAlpha to the leading bytes of a stream.
//
// The stream is rewound to its start and up to 1024 bytes are read into a
// local buffer, which is NUL-terminated and scanned. The stream position is
// not restored afterwards.
//
// Returns FALSE when pIStm is NULL or when the seek or the read fails;
// otherwise the result of FSpecialFeatureNonAlpha on the bytes read.
BOOL FSpecialFeatureNonAlphaStm(IStream * pIStm)
{
    BOOL            fRet = FALSE;
    // The stream delivers bytes and the scanner takes an LPCSTR, so the buffer
    // must be CHAR; a TCHAR buffer would change element size in a UNICODE build
    // while still being filled and indexed by a byte count.
    CHAR            rgchBuff[1024 + 1];
    const ULONG     cbMax = ARRAYSIZE(rgchBuff) - 1;   // keep one byte for the terminator
    ULONG           cbRead = 0;
    LARGE_INTEGER   liZero = {0};

    if (NULL == pIStm)
    {
        goto exit;
    }

    // Seek to the start of the stream
    if (FAILED(pIStm->Seek(liZero, STREAM_SEEK_SET, NULL)))
    {
        goto exit;
    }

    // Fill up the buffer
    if (FAILED(pIStm->Read(rgchBuff, cbMax, &cbRead)))
    {
        goto exit;
    }

    // Never index past the buffer, even if the stream reports a bogus count
    if (cbRead > cbMax)
    {
        cbRead = cbMax;
    }

    // Make sure the buffer is zero terminated
    rgchBuff[cbRead] = '\0';

    fRet = FSpecialFeatureNonAlpha(rgchBuff);

exit:
    return fRet;
}
// --------------------------------------------------------------------------------
// FStreamStringSearch
// --------------------------------------------------------------------------------
#define CB_STREAMMATCH 0x00000FFF
// Searches a whole stream for pszSearch as a whole word (see FWordPresent),
// reading it in chunks through a CB_STREAMMATCH-byte stack buffer.
//
// pstm            - stream to search; it is rewound first and its position is not restored
// pdwFlagsSearch  - flags for pszSearch, passed through to FWordPresent
// pszSearch       - NUL-terminated word to look for
// cchSearch       - length of pszSearch; must leave room in the buffer for the
//                   carry-over tail (cchSearch + 2 <= CB_STREAMMATCH)
// dwFlags         - without SSF_CASESENSITIVE each chunk is upper-cased before the
//                   search (pszSearch itself is not converted here)
//
// Returns TRUE when a whole-word match is found, FALSE otherwise (including
// bad parameters and stream errors).
//
// Between chunks the last cchSearch + 2 bytes are carried over to the front of
// the buffer so that a word straddling a chunk boundary is still seen in one
// piece. A match that ends exactly at the end of the buffered text is not
// trusted while more data remains, because the character after it is unknown.
//
// NOTE(review): a hit at offset 0 of a carried-over tail is treated by
// FWordPresent as "start of text"; whether that can produce a false positive
// depends on FMatchToken's front check - worth confirming.
BOOL FStreamStringSearch(LPSTREAM pstm, DWORD * pdwFlagsSearch, LPSTR pszSearch, ULONG cchSearch, DWORD dwFlags)
{
    BOOL    fRet = FALSE;
    ULONG   cbSave = 0;
    CHAR    rgchBuff[CB_STREAMMATCH + 1];
    LPSTR   pszRead = NULL;
    ULONG   cbRead = 0;
    ULONG   cbIn = 0;
    ULONG   cbValid = 0;
    ULONG   cbKeep = 0;
    ULONG   cchGood = 0;
    CHAR    chSave = '\0';
    LONG    cbSize = 0;
    LPSTR   pszMatch = NULL;

    // Check incoming params
    if ((NULL == pstm) || (NULL == pszSearch) || (0 == cchSearch))
    {
        goto exit;
    }

    // We want to save off the lead char and a possible ending lead byte, so
    // the carry-over is cchSearch + 2 bytes. It has to fit in the data part of
    // the buffer; otherwise CB_STREAMMATCH - cbSave below would wrap around
    // and the next read / move would run off the end of rgchBuff. Testing
    // cchSearch first also keeps cchSearch + 2 itself from overflowing.
    if (cchSearch > (CB_STREAMMATCH - 2))
    {
        Assert(0);
        goto exit;
    }
    cbSave = cchSearch + 2;

    // Get the stream size
    if (FAILED(HrGetStreamSize(pstm, (ULONG *) &cbSize)))
    {
        goto exit;
    }

    // Reset the stream to the beginning
    if (FAILED(HrRewindStream(pstm)))
    {
        goto exit;
    }

    // The first chunk fills the whole buffer
    pszRead = rgchBuff;
    cbRead = CB_STREAMMATCH;

    // Search for string through the entire stream. Any success code is
    // accepted: a stream may report a short final read as S_FALSE, and that
    // last chunk still has to be searched.
    while ((cbSize > 0) && SUCCEEDED(pstm->Read(pszRead, cbRead, &cbIn)))
    {
        // We're done if we read nothing...
        if (0 == cbIn)
        {
            goto exit;
        }

        // Note that we've read the bytes
        cbSize -= cbIn;

        // Bytes now valid in the buffer: carried-over tail plus the new data
        cbValid = (ULONG) (cbIn + (pszRead - rgchBuff));

        // Zero terminate the buffer
        rgchBuff[cbValid] = '\0';

        if (0 == (dwFlags & SSF_CASESENSITIVE))
        {
            // Case-insensitive search: upper-case everything buffered
            cchGood = CharUpperBuff(rgchBuff, cbValid);
        }
        else
        {
            // Spin over the buffer to find out whether it ends in a lead byte
            // without its trail byte; if so, stop just before that byte
            for (cchGood = 0; cchGood < cbValid; cchGood++)
            {
                if (IsDBCSLeadByte(rgchBuff[cchGood]))
                {
                    if ((cchGood + 1) >= cbValid)
                    {
                        break;
                    }
                    cchGood++;
                }
            }
        }

        // Temporarily end the text after the last complete character
        chSave = rgchBuff[cchGood];
        rgchBuff[cchGood] = '\0';

        // Search for string
        if (FALSE != FWordPresent(rgchBuff, pdwFlagsSearch, pszSearch, cchSearch, &pszMatch))
        {
            // Accept the match if the stream is exhausted, or if the match
            // does not end exactly at the end of the buffered text. cchGood is
            // measured from rgchBuff, so the end of the text is rgchBuff + cchGood.
            if ((0 >= cbSize) || ((pszMatch + cchSearch) != (rgchBuff + cchGood)))
            {
                fRet = TRUE;
                break;
            }
        }

        // Are we done with the stream
        if (0 >= cbSize)
        {
            break;
        }

        // Undo the temporary terminator
        rgchBuff[cchGood] = chSave;

        // Carry the tail of the valid data over to the front of the buffer.
        // Working from cbValid rather than a fixed offset keeps this correct
        // when a read returned fewer bytes than requested.
        cbKeep = (cbValid > cbSave) ? cbSave : cbValid;
        MoveMemory(rgchBuff, rgchBuff + (cbValid - cbKeep), cbKeep);

        // The next read lands right after the carried-over bytes
        pszRead = rgchBuff + cbKeep;
        cbRead = CB_STREAMMATCH - cbKeep;
    }

exit:
    return(fRet);
}
// Renders an HTML stream as plain text.
//
// The HTML is loaded through MimeEditDocumentFromStream and the resulting
// IDataObject is asked for CF_TEXT delivered as an IStream.
//
// pIStmHtml   - HTML source stream
// ppIStmText  - receives an AddRef'd plain-text stream on success, NULL otherwise
//
// Returns S_OK on success, E_INVALIDARG for a NULL argument, E_FAIL when the
// data object hands back no stream, or the failure code of the call that failed.
HRESULT HrConvertHTMLToPlainText(IStream * pIStmHtml, IStream ** ppIStmText)
{
    HRESULT         hr = E_INVALIDARG;
    IDataObject *   pIDataObj = NULL;
    FORMATETC       fetc = {0};
    STGMEDIUM       stgmed = {0};

    if ((NULL != pIStmHtml) && (NULL != ppIStmText))
    {
        // Initialize the outgoing param
        *ppIStmText = NULL;

        hr = MimeEditDocumentFromStream(pIStmHtml, IID_IDataObject, (VOID **)&pIDataObj);
        if (SUCCEEDED(hr))
        {
            // Ask for plain text, delivered as a stream
            fetc.cfFormat = CF_TEXT;
            fetc.dwAspect = DVASPECT_CONTENT;
            fetc.lindex = -1;
            fetc.tymed = TYMED_ISTREAM;

            hr = pIDataObj->GetData(&fetc, &stgmed);
            if (SUCCEEDED(hr))
            {
                if (NULL == stgmed.pstm)
                {
                    hr = E_FAIL;
                }
                else
                {
                    // Hand the stream out with a reference of its own
                    *ppIStmText = stgmed.pstm;
                    (*ppIStmText)->AddRef();

                    // ReleaseStgMedium below releases pUnkForRelease, so take an
                    // extra reference on it first
                    if (NULL != stgmed.pUnkForRelease)
                    {
                        (stgmed.pUnkForRelease)->AddRef();
                    }

                    hr = S_OK;
                }
            }
        }
    }

    // Cleanup runs on every path, including the early failures
    ReleaseStgMedium(&stgmed);
    ReleaseObj(pIDataObj);
    return hr;
}