Windows2003-3790/inetsrv/query/deflang/defbreak.cxx
2020-09-30 16:53:55 +02:00

743 lines
22 KiB
C++

//+---------------------------------------------------------------------------
//
// Microsoft Windows
// Copyright (C) Microsoft Corporation, 1991 - 1994.
//
// File: DefBreak.cxx
//
// Contents: Text Word Breaker
//
// History: 08-May-91 t-WadeR Created stubs, filled in ASCII code.
// 06-Jun-91 t-WadeR Changed to use input-based pipeline
// 11-Apr-92 KyleP Sync to spec
//
//----------------------------------------------------------------------------
#include <pch.cxx>
#pragma hdrstop
#include <DefBreak.hxx>
//+---------------------------------------------------------------------------
//
// Member: CDefWordBreaker::CDefWordBreaker
//
// Synopsis: Constructs the default word breaker with a reference count of
// one and plants the look-ahead sentinel consulted by IsWordChar.
//
// History: 07-June-91 t-WadeR Created
// 12-Oct-92 AmyA Added Unicode support
//
//----------------------------------------------------------------------------
CDefWordBreaker::CDefWordBreaker()
    : _cRefs( 1 )
{
    //
    // See IsWordChar: it peeks at the entry following a non-breaking space.
    // Marking the slot just past a full chunk as non-spacing keeps the last
    // non-breaking space in the chunk from being considered a word break.
    // It will be processed again (correctly) when we move to the next chunk.
    //
    _aCharInfo3[ CDefWordBreaker::ccCompare ] = C3_NONSPACING;

    ciDebugOut(( DEB_ITRACE, "Creating default wordbreaker\n" ));
}
//+---------------------------------------------------------------------------
//
// Member: CDefWordBreaker::~CDefWordBreaker
//
// Synopsis: Destructor for the CDefWordBreaker class. Performs no work of
// its own.
//
//----------------------------------------------------------------------------
CDefWordBreaker::~CDefWordBreaker()
{
    // Intentionally empty.
}
//+-------------------------------------------------------------------------
//
// Method: CDefWordBreaker::QueryInterface
//
// Synopsis: Rebind to other interface
//
// Arguments: [riid] -- IID of new interface
// [ppvObject] -- New interface * returned here. Set to 0 when the
// interface is not supported.
//
// Returns: S_OK if bind succeeded, E_NOINTERFACE if bind failed,
// E_INVALIDARG if ppvObject is null.
//
// Notes: Only IID_IWordBreaker and IID_IUnknown are recognized. A
// successful bind takes a reference via AddRef().
//
// History: 23-Feb-1994 KyleP Created
//
//--------------------------------------------------------------------------
SCODE STDMETHODCALLTYPE CDefWordBreaker::QueryInterface( REFIID riid,
                                                         void ** ppvObject)
{
    if ( 0 == ppvObject )
        return E_INVALIDARG;

    //
    // Both supported IIDs are answered through the IWordBreaker base so that
    // every request for IUnknown yields the same pointer. The IUnknown case
    // previously cast through IPersist/IPersistFile, which nothing else in
    // this file suggests the class implements; a C-style cast between
    // unrelated types is an unchecked reinterpretation. static_cast lets
    // the compiler verify the conversion instead.
    //
    if ( IID_IWordBreaker == riid || IID_IUnknown == riid )
    {
        *ppvObject = static_cast<IUnknown *>( static_cast<IWordBreaker *>( this ) );
    }
    else
    {
        *ppvObject = 0;
        return E_NOINTERFACE;
    }

    AddRef();
    return S_OK;
}
//+-------------------------------------------------------------------------
//
// Method: CDefWordBreaker::AddRef
//
// Synopsis: Increments refcount
//
// Returns: The value InterlockedIncrement reports for _cRefs.
//
// History: 23-Feb-1994 KyleP Created
//
//--------------------------------------------------------------------------
ULONG STDMETHODCALLTYPE CDefWordBreaker::AddRef()
{
    ULONG const cRefsNow = InterlockedIncrement( &_cRefs );

    return cRefsNow;
}
//+-------------------------------------------------------------------------
//
// Method: CDefWordBreaker::Release
//
// Synopsis: Decrement refcount. Delete if necessary.
//
// Returns: The decremented count; 0 means the object has been deleted.
//
// History: 23-Feb-1994 KyleP Created
//
//--------------------------------------------------------------------------
ULONG STDMETHODCALLTYPE CDefWordBreaker::Release()
{
    unsigned long const cRemaining = InterlockedDecrement( &_cRefs );

    // Still referenced: just report the new count.
    if ( 0 != cRemaining )
        return cRemaining;

    // Last reference dropped. No member may be touched after this point.
    delete this;
    return 0;
}
//+-------------------------------------------------------------------------
//
// Method: CDefWordBreaker::Init
//
// Synopsis: Initialize word-breaker
//
// Arguments: [fQuery] -- TRUE if query-time (not examined here)
// [ulMaxTokenSize] -- Maximum size token stored by caller
// (not examined here)
// [pfLicense] -- Set to true if use restricted; this
// implementation always stores FALSE
//
// Returns: S_OK, or E_INVALIDARG if pfLicense is null
//
// History: 11-Apr-1994 KyleP Created
//
//--------------------------------------------------------------------------
SCODE STDMETHODCALLTYPE CDefWordBreaker::Init( BOOL fQuery,
                                               ULONG ulMaxTokenSize,
                                               BOOL *pfLicense )
{
    if ( 0 != pfLicense )
    {
        // No license restriction is reported by the default breaker.
        *pfLicense = FALSE;
        return S_OK;
    }

    return E_INVALIDARG;
}
//+-------------------------------------------------------------------------
//
// Method: CDefWordBreaker::IsWordChar
//
// Synopsis: Find whether the i'th character in the chunk _pwcChunk
// is a word character (rather than word break)
//
// Arguments: [i] -- index into _pwcChunk (and into the parallel
// _aCharInfo1 / _aCharInfo3 arrays)
//
// Returns: TRUE for alphabetic, digit and non-spacing characters, for
// L'_', and for a non-breaking space (0xa0) whose following
// entry is non-spacing; FALSE otherwise.
//
// Notes: The non-breaking-space test reads _aCharInfo3[i+1], which can
// be the entry one past the last character of the chunk. The
// constructor plants C3_NONSPACING at index ccCompare for that
// purpose.
//
// History: 22-Jul-1994 BartoszM Created
//
//--------------------------------------------------------------------------
inline BOOL CDefWordBreaker::IsWordChar (int i) const
{
    // Letters and digits are always part of a word.
    if ( 0 != (_aCharInfo1[i] & (C1_ALPHA | C1_DIGIT)) )
        return TRUE;

    // So are non-spacing marks.
    if ( 0 != (_aCharInfo3[i] & C3_NONSPACING) )
        return TRUE;

    switch ( _pwcChunk[i] )
    {
    case L'_':
        return TRUE;

    case 0xa0: // non breaking space
        // Counts as a word character only when followed by a non-spacing
        // character (looking ahead is okay).
        return ( 0 != (_aCharInfo3[i+1] & C3_NONSPACING) ) ? TRUE : FALSE;

    default:
        return FALSE;
    }
}
//+---------------------------------------------------------------------------
//
// Member: CDefWordBreaker::ScanChunk
//
// Synopsis: For each character find its type
//
// Returns: TRUE if both GetStringTypeW calls succeeded, FALSE otherwise.
//
// Notes: Fills _aCharInfo1 (CT_CTYPE1) and _aCharInfo3 (CT_CTYPE3) for
// the first _cMapped characters of _pwcChunk. For a chunk shorter
// than ccCompare the entry just past the chunk in _aCharInfo3 is
// cleared, so that the one-character look-ahead in IsWordChar
// does not act on a value left over from an earlier, longer
// chunk. The sentinel stored at index ccCompare by the
// constructor is never overwritten.
//
// History: 16-Aug-94 BartoszM Created
//
//----------------------------------------------------------------------------
BOOL CDefWordBreaker::ScanChunk ()
{
    //
    // GetStringTypeW is returning error 87 (ERROR_INVALID_PARAMETER) if
    // we pass in a null string.
    //
    Win4Assert( (0 != _cMapped) && (0 != _pwcChunk) );

    if ( !GetStringTypeW( CT_CTYPE1,       // POSIX character typing
                          _pwcChunk,       // Source
                          _cMapped,        // Size of source
                          _aCharInfo1 ) )  // Character info
    {
        ciDebugOut(( DEB_ERROR, "GetStringTypeW returned %d\n",
                     GetLastError() ));
        return FALSE;
    }

    if ( !GetStringTypeW( CT_CTYPE3,       // Additional POSIX
                          _pwcChunk,
                          _cMapped,        // Size of source
                          _aCharInfo3 ) )  // Character info 3
    {
        ciDebugOut(( DEB_ERROR, "GetStringTypeW CTYPE3 returned %d\n",
                     GetLastError() ));
        return FALSE;
    }

    //
    // GetStringTypeW writes only _cMapped entries. IsWordChar may look at
    // entry [_cMapped] when the chunk ends in a non-breaking space, so give
    // that slot a defined "not non-spacing" value for short chunks. A full
    // chunk keeps the constructor's C3_NONSPACING sentinel at [ccCompare].
    //
    if ( _cMapped < CDefWordBreaker::ccCompare )
        _aCharInfo3[_cMapped] = 0;

    return TRUE;
}
//+---------------------------------------------------------------------------
//
// Member: CDefWordBreaker::BreakText
//
// Synopsis: Break input stream into words.
//
// Arguments: [pTextSource] - source of input buffers
// [pWordSink] - sink for words
// [pPhraseSink] - sink for noun phrases (not referenced by this
// implementation)
//
// Returns: E_INVALIDARG if pTextSource is null. S_OK without doing any
// work if pWordSink is null or the buffer is empty
// (iCur == iEnd). E_FAIL if iCur > iEnd. Otherwise S_OK, or the
// code GetOleError yields for a CException caught during
// tokenization.
//
// Notes: The buffer is handed to Tokenize in buckets of at most
// CDefWordBreaker::ccCompare characters. pfnFillTextBuffer is
// called to refill the buffer until it returns a failure code;
// whatever is left in the buffer is then tokenized as the last
// fragment.
//
// History: 07-June-91 t-WadeR Created
// 12-Oct-92 AmyA Added Unicode support
// 18-Nov-92 AmyA Overloaded
// 11-Apr-94 KyleP Sync with spec
// 26-Aug-94 BartoszM Fixed Unicode parsing
//
//----------------------------------------------------------------------------
SCODE STDMETHODCALLTYPE CDefWordBreaker::BreakText( TEXT_SOURCE *pTextSource,
IWordSink *pWordSink,
IPhraseSink *pPhraseSink )
{
if ( 0 == pTextSource )
return E_INVALIDARG;
// Nothing to emit to, or nothing to emit: treated as success, not an error.
if ( 0 == pWordSink || pTextSource->iCur == pTextSource->iEnd)
return S_OK;
if (pTextSource->iCur > pTextSource->iEnd)
{
Win4Assert ( !"BreakText called with bad TEXT_SOURCE" );
return E_FAIL;
}
SCODE sc = S_OK;
ULONG cwc, cwcProcd; // cwcProcd is # chars actually processed by Tokenize()
// Tokenize reports failures by throwing CException; they are converted to
// an SCODE in the CATCH block below.
TRY
{
do
{
//
// Flag for first time thru loop below. This is to fix the case
// where the length of the buffer passed in is less than
// MAX_II_BUFFER_LEN. In this case iEnd-iCur is <= MAX_II_BUFFER_LEN
// and we break out the inner loop and call
// pfnFillTextBuffer without having processed any characters,
// and so pfnFillTextBuffer returns TRUE without adding any new
// characters and this results in an infinite loop.
//
BOOL fFirstTime = TRUE;
while ( pTextSource->iCur < pTextSource->iEnd )
{
cwc = pTextSource->iEnd - pTextSource->iCur;
//
// Process in buckets of MAX_II_BUFFER_LEN only
// (the bucket size used by the code is CDefWordBreaker::ccCompare)
//
if ( cwc >= CDefWordBreaker::ccCompare )
cwc = CDefWordBreaker::ccCompare;
else if ( !fFirstTime )
break;
Tokenize( pTextSource, cwc, pWordSink, cwcProcd );
Win4Assert( cwcProcd <= cwc );
// Advance only past what Tokenize consumed; an unfinished trailing word
// stays in the buffer for the next pass.
pTextSource->iCur += cwcProcd;
fFirstTime = FALSE;
}
// Any failure code from pfnFillTextBuffer ends the refill loop.
} while ( SUCCEEDED(pTextSource->pfnFillTextBuffer(pTextSource)) );
cwc = pTextSource->iEnd - pTextSource->iCur;
// we know that the remaining text should be less than ccCompare
Win4Assert( cwc < CDefWordBreaker::ccCompare );
if ( 0 != cwc )
{
// Final fragment. Note that pTextSource->iCur is not advanced after this
// call.
Tokenize( pTextSource, cwc, pWordSink, cwcProcd );
}
}
CATCH (CException, e)
{
ciDebugOut(( DEB_ITRACE,
"Exception 0x%x caught when breaking text in default wordbreaker\n",
e.GetErrorCode() ));
sc = GetOleError( e );
}
END_CATCH
return sc;
}
//+---------------------------------------------------------------------------
//
// Member: CDefWordBreaker::Tokenize
//
// Synopsis: Tokenize the input buffer into words
//
// Arguments: [pTextSource] -- input text source
// [cwc] -- # chars to process
// [pWordSink] -- sink for words
// [cwcProcd] -- # chars actually processed returned here
//
// Notes: Throws CException( E_FAIL ) if ScanChunk fails and
// CException( sc ) if IWordSink::PutWord returns a failure code.
// A chunk with cwc < ccCompare is treated as the last chunk from
// the text source: its trailing word is emitted instead of being
// left for a later call. Words containing ZERO_WIDTH_SPACE are
// copied into _awcBufZWS with those characters removed before
// being handed to the sink.
//
// History: 10-Aug-95 SitaramR Created
//
//----------------------------------------------------------------------------
void CDefWordBreaker::Tokenize( TEXT_SOURCE *pTextSource,
ULONG cwc,
IWordSink *pWordSink,
ULONG& cwcProcd )
{
// Map the chunk: it starts at the text source's current position and is
// cwc characters long.
_pwcChunk = &pTextSource->awcBuffer[pTextSource->iCur];
_cMapped = cwc;
// Classify every character of the chunk up front; IsWordChar relies on it.
if ( !ScanChunk() )
THROW( CException( E_FAIL ) );
BOOL fWordHasZWS = FALSE; // Does the current word have a zero-width-space ?
// Only assigned (and only read) when fWordHasZWS is TRUE.
unsigned uLenZWS; // Length of a word minus embedded zero-width-spaces
//
// iBeginWord is the offset into _aCharInfo of the beginning character of
// a word. iCur is the first *unprocessed* character.
// They are indexes into the mapped chunk.
//
unsigned iBeginWord = 0;
unsigned iCur = 0;
SCODE sc = S_OK;
//
// Pump words from mapped chunk to word sink
//
while ( iCur < _cMapped )
{
//
// Skip whitespace, punctuation, etc.
//
for (; iCur < _cMapped; iCur++)
if ( IsWordChar (iCur) )
break;
// iCur points to a word char or is equal to _cMapped
iBeginWord = iCur;
if (iCur < _cMapped)
iCur++; // we knew it pointed at word character
//
// Find word break. Filter may output Unicode zero-width-space, which
// should be ignored by the wordbreaker.
//
fWordHasZWS = FALSE;
for (; iCur < _cMapped; iCur++)
{
if ( !IsWordChar (iCur) )
{
// A zero-width-space does not end the word; it is only remembered so the
// word can be stripped before it is emitted.
if ( _pwcChunk[iCur] == ZERO_WIDTH_SPACE )
fWordHasZWS = TRUE;
else
break;
}
}
if ( fWordHasZWS )
{
//
// Copy word into _awcBufZWS after stripping zero-width-spaces
//
uLenZWS = 0;
for ( unsigned i=iBeginWord; i<iCur; i++ )
{
if ( _pwcChunk[i] != ZERO_WIDTH_SPACE )
_awcBufZWS[uLenZWS++] = _pwcChunk[i];
}
}
// iCur points to a non-word char or is equal to _cMapped
if ( iCur < _cMapped )
{
// A word break was found inside the chunk, so the word is complete.
// store the word and its source position
if ( fWordHasZWS )
sc = pWordSink->PutWord( uLenZWS,
_awcBufZWS, // stripped word
iCur - iBeginWord,
pTextSource->iCur + iBeginWord );
else
sc = pWordSink->PutWord( iCur - iBeginWord,
_pwcChunk + iBeginWord, // the word
iCur - iBeginWord,
pTextSource->iCur + iBeginWord );
if ( FAILED( sc ) )
THROW( CException( sc ) );
iCur++; // we knew it pointed at non-word char
iBeginWord = iCur; // in case we exit the loop now
}
// Otherwise the word runs to the end of the chunk and has not been emitted;
// iBeginWord still marks its start for the handling below.
} // next word
Win4Assert( iCur == _cMapped );
// End of words in chunk.
// iCur == _cMapped
// iBeginWord points at beginning of word or == _cMapped
if ( 0 == iBeginWord )
{
// A single word fills from beginning of this chunk
// to the end. This is either a very long word or
// a short word in a leftover buffer.
// store the word and its source position
if ( fWordHasZWS )
sc = pWordSink->PutWord( uLenZWS,
_awcBufZWS, // stripped word
iCur,
pTextSource->iCur ); // its source pos.
else
sc = pWordSink->PutWord( iCur,
_pwcChunk, // the word
iCur,
pTextSource->iCur ); // its source pos.
if ( FAILED( sc ) )
THROW( CException( sc ) );
//
// Position it to not add the word twice.
//
iBeginWord = iCur;
}
//
// If this is the last chunk from text source, then process the
// last fragment
//
if ( cwc < CDefWordBreaker::ccCompare && iBeginWord != iCur )
{
// store the word and its source position
if ( fWordHasZWS )
sc = pWordSink->PutWord( uLenZWS,
_awcBufZWS, // stripped word
iCur - iBeginWord,
pTextSource->iCur + iBeginWord );
else
sc = pWordSink->PutWord( iCur - iBeginWord,
_pwcChunk + iBeginWord, // the word
iCur - iBeginWord,
pTextSource->iCur + iBeginWord );
if ( FAILED( sc ) )
THROW( CException( sc ) );
iBeginWord = iCur;
}
// Everything before iBeginWord has been consumed. For a full-size chunk an
// unfinished trailing word is left unconsumed so the caller presents it
// again.
cwcProcd = iBeginWord;
}
//+---------------------------------------------------------------------------
//
// Member: CDefWordBreaker::ComposePhrase
//
// Synopsis: Convert a noun and a modifier into a phrase. Not supported by
// the default word breaker: it logs a warning and fails.
//
// Arguments: [pwcNoun] -- pointer to noun.
// [cwcNoun] -- count of chars in pwcNoun
// [pwcModifier] -- pointer to word modifying pwcNoun
// [cwcModifier] -- count of chars in pwcModifier
// [ulAttachmentType] -- relationship between pwcNoun &pwcModifier
// [pwcPhrase] -- not written by this implementation
// [pcwcPhrase] -- not written by this implementation
//
// Returns: E_FAIL, always.
//
// History: 10-Aug-95 SitaramR Created Header
//
//----------------------------------------------------------------------------
SCODE STDMETHODCALLTYPE CDefWordBreaker::ComposePhrase( WCHAR const *pwcNoun,
                                                        ULONG cwcNoun,
                                                        WCHAR const *pwcModifier,
                                                        ULONG cwcModifier,
                                                        ULONG ulAttachmentType,
                                                        WCHAR *pwcPhrase,
                                                        ULONG *pcwcPhrase )
{
    //
    // This breaker never emitted a phrase in the first place, so there is
    // nothing it could be asked to compose.
    //
    SCODE const scUnsupported = E_FAIL;

    ciDebugOut(( DEB_WARN,
                 "IWordBreaker::ComposePhrase called on default word breaker\n" ));

    return scUnsupported;
}
//+---------------------------------------------------------------------------
//
// Member: CDefWordBreaker::GetLicenseToUse
//
// Synopsis: Returns a pointer to vendors license information
//
// Arguments: [ppwcsLicense] -- ptr to ptr to which license info is returned
//
// Returns: S_OK, or E_INVALIDARG if ppwcsLicense is null.
//
// Notes: The returned pointer refers to a string literal held in a
// function-local static; the caller does not receive a copy.
//
// History: 10-Aug-95 SitaramR Created Header
//
//----------------------------------------------------------------------------
SCODE STDMETHODCALLTYPE CDefWordBreaker::GetLicenseToUse( const WCHAR **ppwcsLicense )
{
    if ( 0 != ppwcsLicense )
    {
        static WCHAR const * wcsCopyright = L"Copyright (c) Microsoft Corporation, 1991-1998";

        *ppwcsLicense = wcsCopyright;
        return S_OK;
    }

    return E_INVALIDARG;
}
// Counter defined outside this file. Within this file it is incremented by
// the CDefWordBreakerCF constructor and LockServer(TRUE), and decremented by
// the CDefWordBreakerCF destructor and LockServer(FALSE).
extern long gulcInstances;
//+-------------------------------------------------------------------------
//
// Method: CDefWordBreakerCF::CDefWordBreakerCF
//
// Synopsis: Default Word Breaker class factory constructor. Starts with a
// reference count of one and registers the factory in the
// gulcInstances counter.
//
// History: 07-Feb-1995 SitaramR Created
//
//--------------------------------------------------------------------------
CDefWordBreakerCF::CDefWordBreakerCF( )
    : _cRefs( 1 )
{
    // Balanced by the decrement in the destructor.
    InterlockedIncrement( &gulcInstances );
}
//+-------------------------------------------------------------------------
//
// Method: CDefWordBreakerCF::~CDefWordBreakerCF
//
// Synopsis: Default Word Breaker class factory destructor. Removes the
// factory from the gulcInstances counter.
//
// History: 07-Feb-1995 SitaramR Created
//
//--------------------------------------------------------------------------
CDefWordBreakerCF::~CDefWordBreakerCF()
{
    // Balances the increment made by the constructor.
    InterlockedDecrement( &gulcInstances );
}
//+-------------------------------------------------------------------------
//
// Method: CDefWordBreakerCF::QueryInterface
//
// Synopsis: Rebind to other interface
//
// Arguments: [riid] -- IID of new interface
// [ppvObject] -- New interface * returned here. Set to 0 when the
// interface is not supported.
//
// Returns: S_OK if bind succeeded, E_NOINTERFACE if bind failed,
// E_INVALIDARG if ppvObject is null.
//
// Notes: Only IID_IClassFactory and IID_IUnknown are recognized. A
// successful bind takes a reference via AddRef().
//
// History: 07-Feb-1995 SitaramR Created
//
//--------------------------------------------------------------------------
SCODE STDMETHODCALLTYPE CDefWordBreakerCF::QueryInterface(
    REFIID riid,
    void ** ppvObject )
{
    //
    // Reject a null out-pointer instead of writing through it, matching
    // CDefWordBreaker::QueryInterface.
    //
    if ( 0 == ppvObject )
        return E_INVALIDARG;

    if ( IID_IClassFactory == riid )
        *ppvObject = (IUnknown *)(IClassFactory *)this;
    else if ( IID_IUnknown == riid )
        *ppvObject = (IUnknown *)this;
    else
    {
        *ppvObject = 0;
        return E_NOINTERFACE;
    }

    AddRef();
    return S_OK;
}
//+-------------------------------------------------------------------------
//
// Method: CDefWordBreakerCF::AddRef
//
// Synopsis: Increments refcount
//
// Returns: The value InterlockedIncrement reports for _cRefs.
//
// History: 07-Feb-1995 SitaramR Created
//
//--------------------------------------------------------------------------
ULONG STDMETHODCALLTYPE CDefWordBreakerCF::AddRef()
{
    ULONG const cFactoryRefs = InterlockedIncrement( &_cRefs );

    return cFactoryRefs;
}
//+-------------------------------------------------------------------------
//
// Method: CDefWordBreakerCF::Release
//
// Synopsis: Decrement refcount. Delete if necessary.
//
// Returns: The decremented count; 0 means the factory has been deleted.
//
// History: 07-Feb-1995 SitaramR Created
//
//--------------------------------------------------------------------------
ULONG STDMETHODCALLTYPE CDefWordBreakerCF::Release()
{
    unsigned long const cLeft = InterlockedDecrement( &_cRefs );

    // Other holders remain: nothing more to do.
    if ( 0 != cLeft )
        return cLeft;

    // That was the last reference. No member may be touched after this.
    delete this;
    return 0;
}
//+-------------------------------------------------------------------------
//
// Method: CDefWordBreakerCF::CreateInstance
//
// Synopsis: Creates new CDefWordBreaker object
//
// Arguments: [pUnkOuter] -- 'Outer' IUnknown (not examined here)
// [riid] -- Interface to bind
// [ppvObject] -- Interface returned here; set to 0 on failure
//
// Returns: The result of CDefWordBreaker::QueryInterface for riid;
// E_INVALIDARG if ppvObject is null; E_OUTOFMEMORY or
// E_UNEXPECTED if a CException is caught.
//
// Notes: NOTE(review): pUnkOuter is ignored, so a request to aggregate
// silently produces a non-aggregated object. Returning
// CLASS_E_NOAGGREGATION for a non-null pUnkOuter would follow the
// IClassFactory contract, but confirm no caller relies on the
// current behavior before changing it.
//
// History: 07-Feb-1995 SitaramR Created
//
//--------------------------------------------------------------------------
SCODE STDMETHODCALLTYPE CDefWordBreakerCF::CreateInstance( IUnknown * pUnkOuter,
                                                           REFIID riid,
                                                           void * * ppvObject )
{
    if ( 0 == ppvObject )
        return E_INVALIDARG;

    //
    // Give the out-parameter a defined value for the path where the
    // allocation throws and QueryInterface is never reached.
    //
    *ppvObject = 0;

    CDefWordBreaker *pIUnk = 0;
    SCODE sc = S_OK;

    TRY
    {
        pIUnk = new CDefWordBreaker();
        sc = pIUnk->QueryInterface( riid , ppvObject );

        //
        // Drop the reference the constructor started with. On success the
        // caller now owns the one taken by QueryInterface; on failure this
        // was the only reference and the object is deleted here.
        //
        pIUnk->Release();
    }
    CATCH(CException, e)
    {
        // The handler expects the exception to come from the allocation, in
        // which case pIUnk was never assigned.
        Win4Assert( 0 == pIUnk );

        switch( e.GetErrorCode() )
        {
        case E_OUTOFMEMORY:
            sc = (E_OUTOFMEMORY);
            break;
        default:
            sc = (E_UNEXPECTED);
        }
    }
    END_CATCH;

    return (sc);
}
//+-------------------------------------------------------------------------
//
// Method: CDefWordBreakerCF::LockServer
//
// Synopsis: Force class factory to remain loaded. A lock increments
// gulcInstances and an unlock decrements it.
//
// Arguments: [fLock] -- TRUE if locking, FALSE if unlocking
//
// Returns: S_OK
//
// History: 07-Feb-1995 SitaramR Created
//
//--------------------------------------------------------------------------
SCODE STDMETHODCALLTYPE CDefWordBreakerCF::LockServer(BOOL fLock)
{
    if ( !fLock )
    {
        // Unlock: give back one count.
        InterlockedDecrement( &gulcInstances );
    }
    else
    {
        // Lock: hold one more count.
        InterlockedIncrement( &gulcInstances );
    }

    return S_OK;
}