2020-09-30 16:53:55 +02:00

840 lines
27 KiB

// Copyright (C) Microsoft Corporation, 1996 - 2001.
// File: docsum.cxx
// Contents: document summary helper class
// Classes: CDocCharacterization
// History: 12-Jan-96 dlee Created
// Todo: try to end summary on sentence or word boundary.
#include <pch.cxx>
#pragma hdrstop
#include <propspec.hxx>
#include <ciguid.hxx>
#include "docsum.hxx"
// 0x2029 is the Unicode PARAGRAPH SEPARATOR; YankNoise turns it into a space.
const WCHAR wcParagraph = 0x2029;
// Property name that Add() compares (case-insensitively) against a meta
// property's name to recognize the html meta description.
const WCHAR *pwcDescription = L"DESCRIPTION";
// Properties that Add() refuses to put into abstracts (see the
// "file names or meta properties" check there).
static CFullPropSpec psRevName( guidQuery, DISPID_QUERY_REVNAME );
static CFullPropSpec psName( guidStorage, PID_STG_NAME );
const GUID guidCharacterization = PSGUID_CHARACTERIZATION;
// Property sets whose text Add() skips (urls, comments, scripts).
const GUID guidHTMLUrl = HTMLUrl;
const GUID guidHTMLComment = HTMLComment;
const GUID guidHTMLScript = HTMLScriptGuid;
// NOTE(review): not referenced in the portion of the file visible here.
static CFullPropSpec psCharacterization( guidCharacterization,
propidCharacterization );
// Document summary property set; its properties are scored by DocSumScore.
const GUID guidDocSummary = defGuidDocSummary;
// The title: never queued as a property value, and its raw-text copy is
// passed to Ignore().
static CFullPropSpec psTitle( guidDocSummary, propidTitle );
// Html heading properties; scored by HtmlPropScore.
static const GUID guidHtmlInformation = defGuidHtmlInformation;
// Html meta property set: {d1b5d3f0-c0b3-11cf-9a92-00a0c908dbf1}
static GUID const guidMeta = { 0xd1b5d3f0,
0xc0b3, 0x11cf,
0x9a, 0x92, 0x00, 0xa0,
0xc9, 0x08, 0xdb, 0xf1 };
// Function:  DocSumScore
// Synopsis:  Maps a property id from the document summary property set to
//            the utility score used when its text is queued.
// Arguments: [propid] -- property id to score
// Returns:   The score for the title, subject, keywords and comments
//            properties; scoreIgnore for properties that never belong in
//            an abstract (template, last author, revision number,
//            application name, author); scoreIfNothingElse for anything
//            else.
inline unsigned DocSumScore( PROPID propid )
{
    switch ( propid )
    {
        case propidTitle :
            return scoreTitle;

        case propidSubject :
            return scoreSubject;

        case propidKeywords :
            return scoreKeywords;

        case propidComments :
            return scoreComments;

        // Bookkeeping properties: deliberately share one result.
        case propidTemplate :
        case propidLastAuthor :
        case propidRevNumber :
        case propidAppName :
        case propidAuthor :
            return scoreIgnore;

        default :
            return scoreIfNothingElse;
    }
} //DocSumScore
// Function:  HtmlPropScore
// Synopsis:  Maps an html heading property id to the utility score used
//            when its text is queued.
// Arguments: [propid] -- property id to score
// Returns:   scoreHeader1 .. scoreHeader6 for PID_HEADING_1 .. PID_HEADING_6,
//            scoreIgnore for every other property id.
inline unsigned HtmlPropScore( PROPID propid )
{
    switch ( propid )
    {
        case PID_HEADING_1 :
            return scoreHeader1;

        case PID_HEADING_2 :
            return scoreHeader2;

        case PID_HEADING_3 :
            return scoreHeader3;

        case PID_HEADING_4 :
            return scoreHeader4;

        case PID_HEADING_5 :
            return scoreHeader5;

        case PID_HEADING_6 :
            return scoreHeader6;

        default :
            return scoreIgnore;
    }
} //HtmlPropScore
// Function:  StringToClsid
// Synopsis:  Convert string containing CLSID to CLSID.
//            The string must be of the form:
//                {d1b5d3f0-c0b3-11cf-9a92-00a0c908dbf1}
// Arguments: [wszClass]  -- string containing CLSID.  The buffer is modified:
//                           the four hyphens (offsets 9, 14, 19 and 24) are
//                           overwritten with null characters and are NOT
//                           restored, so the string is truncated after the
//                           call.  Characters temporarily nulled inside the
//                           Data4 section are restored.
//            [guidClass] -- output guid
// Notes:     No validation of the hex digits is performed; each field is
//            whatever wcstoul makes of it.
void StringToClsid( WCHAR *wszClass, GUID& guidClass )
{
    // The parse touches offsets up to 37 (the closing brace), so anything
    // shorter than 37 characters would be read and written out of bounds.
    Win4Assert( 0 != wszClass && wcslen( wszClass ) >= 37 );

    // Data1: 8 hex digits at offset 1, terminated at the first hyphen.
    wszClass[9] = 0;
    guidClass.Data1 = wcstoul( &wszClass[1], 0, 16 );

    // Data2: 4 hex digits at offset 10.
    wszClass[14] = 0;
    guidClass.Data2 = (USHORT)wcstoul( &wszClass[10], 0, 16 );

    // Data3: 4 hex digits at offset 15.
    wszClass[19] = 0;
    guidClass.Data3 = (USHORT)wcstoul( &wszClass[15], 0, 16 );

    // Data4[0]: 2 hex digits at offset 20.  The following digit is saved,
    // nulled to terminate the field, then put back because it starts
    // Data4[1].
    WCHAR wc = wszClass[22];
    wszClass[22] = 0;
    guidClass.Data4[0] = (unsigned char)wcstoul( &wszClass[20], 0, 16 );
    wszClass[22] = wc;

    // Data4[1]: 2 hex digits at offset 22, terminated at the last hyphen.
    wszClass[24] = 0;
    guidClass.Data4[1] = (unsigned char)wcstoul( &wszClass[22], 0, 16 );

    // Data4[2..7]: six 2-digit fields starting at offset 25.  Same
    // save / null / parse / restore dance for each one.
    for ( int i = 0; i < 6; i++ )
    {
        wc = wszClass[27+i*2];
        wszClass[27+i*2] = 0;
        guidClass.Data4[2+i] = (unsigned char)wcstoul( &wszClass[25+i*2], 0, 16 );
        wszClass[27+i*2] = wc;
    }
} //StringToClsid
// Method:    CDocCharacterization::CDocCharacterization, public
// Synopsis:  constructor
// Arguments: [cwcAtMost] -- Max size of characterization.  0 --> Don't
//                           generate one.
// Notes:     _fIsGenerating records whether a characterization was
//            requested; most other methods assert on it or return early
//            when it is FALSE.
// History:   12-Jan-96 dlee Created
//            20-Jun-97 KyleP Make 0 --> no characterization
CDocCharacterization::CDocCharacterization( unsigned cwcAtMost )
    : _queue( FALSE, cwcAtMost ),
      _scoreRawText( scoreRawText ),
      _cwcIgnoreBuf( 0 ),
      _fMetaDescriptionAdded( FALSE )
{
    // A size of zero means the caller doesn't want a characterization.
    _fIsGenerating = (0 != cwcAtMost);
} //CDocCharacterization
// Method: CDocCharacterization::~CDocCharacterization, public
// Synopsis: destructor
// History: 12-Jan-96 dlee Created
// clean up anything left in the queue -- it should be empty, except for
// the exception case.
CSummaryText text;
while ( _queue.DeQueue( text ) )
delete [] text.GetText();
} //~CDocCharacterization
// Method: CDocCharacterization::AddCleanedString, private
// Synopsis: Adds a noise-free string to the queue if it belongs
// Arguments: [pwcSummary] -- string to add to the summary
// [cwcSummary] -- # of characters in the string
// [utility] -- score for the string
// [fDeliniate] -- if TRUE, a termination is added to the string
// (cwcSummarySpace extra characters are counted for it) and the
// queue is first scanned for a duplicate of the string
// Returns: TRUE after the copy has been queued and on the duplicate path;
// presumably FALSE only when ShouldEnQueue rejects the item --
// NOTE(review): confirm against the full control flow.
// Notes: The queue stores a heap copy of the text; entries removed here
// have their text freed with delete [].
// History: 12-Jan-96 dlee Created
BOOL CDocCharacterization::AddCleanedString(
const WCHAR * pwcSummary,
unsigned cwcSummary,
unsigned utility,
BOOL fDeliniate )
Win4Assert( _fIsGenerating );
// Candidate item.  It points at the caller's buffer for now (hence the
// const cast); SetText below redirects it to the private copy.
CSummaryText text( (WCHAR *) pwcSummary,
cwcSummary + ( fDeliniate ? cwcSummarySpace : 0 ),
utility );
// Filled in by ShouldEnQueue: how many items must be dequeued to make room.
unsigned cDeQueue = 0;
// Check if the item will make it on the queue
if ( _queue.ShouldEnQueue( text, cDeQueue ) )
// Don't add duplicates. If the duplicate has a worse score than
// the new item, remove it and add the new item.
CSummaryText testText;
// Only delineated strings are checked for duplicates.
for ( unsigned x = 0; fDeliniate && x < _queue.Count(); x++ )
// NOTE(review): this reference shadows the testText declared above.
CSummaryText & testText = _queue.Peek( x );
// Compare only as many characters as both strings have.
if ( testText.isSame( pwcSummary,
__min( cwcSummary, testText.GetSize() ) ) )
if ( testText.GetUtility() < utility )
delete [] testText.GetText();
_queue.Remove( x );
// don't have to dequeue anymore if the old duplicate
// is large enough
BOOL f = _queue.ShouldEnQueue( text, cDeQueue );
Win4Assert( f );
// Presumably reached when the existing duplicate scores at least as well,
// so the new string is dropped -- NOTE(review): confirm.
return TRUE;
// need to remove the worst item to make room for this one?
for ( ; cDeQueue > 0; cDeQueue-- )
Win4Assert( 0 != _queue.Count() );
CSummaryText temp;
_queue.DeQueue( temp );
delete [] temp.GetText();
Win4Assert( _queue.CurrentSize() <= _queue.MaxTotalSize() );
// make a copy of the summary string and put it in the queue
unsigned cwc = cwcSummary + ( fDeliniate ? cwcSummarySpace : 0 );
XArray<WCHAR> xCopy( cwc );
// NOTE(review): only the destination and byte count are visible for the
// two copies below; the source arguments appear to be missing from this
// listing.
RtlCopyMemory( xCopy.GetPointer(),
cwcSummary * sizeof WCHAR );
if ( fDeliniate )
RtlCopyMemory( xCopy.GetPointer() + cwcSummary,
cwcSummarySpace * sizeof WCHAR );
text.SetText( xCopy.GetPointer() );
_queue.EnQueue( text );
Win4Assert( _queue.CurrentSize() <= _queue.MaxTotalSize() );
// if the EnQueue doesn't throw, the queue owns the memory
// NOTE(review): no release of xCopy's ownership is visible here; if XArray
// frees its buffer on destruction the queued pointer would dangle -- confirm.
return TRUE;
return FALSE;
} //_AddCleanedString
// Method: CDocCharacterization::YankNoise, private
// Synopsis: Creates a new string that has "noise" stripped out.
// Arguments: [pwcIn] -- string to add to the summary
// [pwcOut] -- resulting cleaned string
// [cwc] -- in/out number of characters
// History: 12-Jan-96 dlee Created
// Character-type masks built from the CT_CTYPE1 classification bits.
//   C1_OK  -- text worth keeping: digits, white space, letters
//   C1_CP  -- control characters and punctuation
//   C1_CSP -- control characters, white space and punctuation
const WORD C1_OK  = ( C1_ALPHA | C1_DIGIT | C1_SPACE );
const WORD C1_CP  = ( C1_PUNCT | C1_CNTRL );
const WORD C1_CSP = ( C1_PUNCT | C1_SPACE | C1_CNTRL );

// Predicates over a CT_CTYPE1 type word.  Each one is TRUE when the word
// has at least one bit of the corresponding mask set.

inline BOOL isCP( WORD wC1 )
{
    return ( wC1 & C1_CP ) != 0;
}

inline BOOL isCSP( WORD wC1 )
{
    return ( wC1 & C1_CSP ) != 0;
}

inline BOOL isOK( WORD wC1 )
{
    return ( wC1 & C1_OK ) != 0;
}

// 0x200 is tested as a literal; it is the value the Windows SDK names
// C1_DEFINED.
inline BOOL isDefined( WORD wC1 )
{
    return ( wC1 & 0x200 ) != 0;
}

inline BOOL isSpace( WORD wC1 )
{
    return ( wC1 & C1_SPACE ) != 0;
}

inline BOOL isCntrl( WORD wC1 )
{
    return ( wC1 & C1_CNTRL ) != 0;
}

inline BOOL isPunct( WORD wC1 )
{
    return ( wC1 & C1_PUNCT ) != 0;
}

// Predicate over a CT_CTYPE3 type word.
// For example: a Japanese vowel elongating symbol
inline BOOL isDiacritic( WORD wC3 )
{
    return ( wC3 & C3_DIACRITIC ) != 0;
}
// Copies pwcIn to pwcOut with "noise" removed: leading white space and
// punctuation are skipped, control characters and wcParagraph become a
// space, runs of white space and of punctuation are collapsed, and
// trailing spaces are dropped.
//   [pwcIn]  -- input text; at most cwcMaxRawUsed characters (asserted)
//   [pwcOut] -- output buffer; up to cwcMaxRawUsed characters may be
//               written, and no terminator is appended
//   [cwc]    -- in: # of characters in pwcIn
//               out: # of characters written, or 0 (presumably when the
//               character classification fails -- NOTE(review): confirm)
void CDocCharacterization::YankNoise(
const WCHAR * pwcIn,
WCHAR * pwcOut,
unsigned & cwc )
Win4Assert( _fIsGenerating );
// One CT_CTYPE1 type word per input character.
WORD awType[ cwcMaxRawUsed ];
Win4Assert( cwc <= cwcMaxRawUsed );
if ( GetStringTypeW( CT_CTYPE1, pwcIn, cwc, awType ) )
// eat any leading white space or punctuation
unsigned iIn = 0;
// NOTE(review): no loop body is visible in this listing; an increment of
// iIn is presumably missing.
while ( ( iIn < cwc ) &&
( isCSP( awType[ iIn ] ) ) )
// make it look like the previous line ended with a CR/LF
WORD wPrev = C1_CNTRL;
unsigned iOut = 0;
// filter the text, stripping redundant punctuation and white space
while ( ( iIn < cwc ) &&
( iOut < cwcMaxRawUsed ) )
// Skip a space that directly follows another space.
if ( ! ( isSpace( wPrev ) && isSpace( awType[ iIn ] ) ) )
// convert control characters and wcParagraph to ' '
if ( ( isCntrl( awType[ iIn ] ) ) ||
( wcParagraph == pwcIn[ iIn ] ) )
pwcOut[ iOut++ ] = L' ';
else if ( isOK( awType[ iIn ] ) )
pwcOut[ iOut++ ] = pwcIn[ iIn ];
// Punctuation is kept only when the previous character was neither
// punctuation nor a control character.
else if ( ( isPunct( awType[ iIn ] ) ) &&
( !isCP( wPrev ) ) )
pwcOut[ iOut++ ] = pwcIn[ iIn ];
// Characters outside the classes above are kept only if CT_CTYPE3 marks
// them as diacritics.
// NOTE(review): presumably this is the final branch of the chain above
// (otherwise a character could be emitted twice) -- confirm.
if ( isDefined( awType[ iIn ] ) )
WCHAR pwszSingleChar[2];
pwszSingleChar[0] = pwcIn[iIn];
// NOTE(review): this stores the digit L'0', not a null terminator.  Only
// one character is passed to GetStringTypeW below, so the second element
// is not read by that call.
pwszSingleChar[1] = L'0';
WORD wType;
// NOTE(review): the return value is not checked; wType is uninitialized if
// the call fails.
GetStringTypeW( CT_CTYPE3, pwszSingleChar, 1, &wType );
if ( isDiacritic( wType ) )
pwcOut[ iOut++ ] = pwcIn[ iIn ];
// Remember this character's type for the next iteration and advance.
wPrev = awType[ iIn++ ];
// eat any trailing spaces
// NOTE(review): no loop body is visible in this listing; a decrement of
// iOut is presumably missing.
while ( iOut > 0 && L' ' == pwcOut[iOut-1] )
cwc = iOut;
cwc = 0;
} //_YankNoise
// Method: CDocCharacterization::Add, private
// Synopsis: Preps and adds a string to the queue.
// Arguments: [pwcSummary] -- string to add to the summary
// [cwcSummary] -- # characters in the string; only the first
// cwcMaxRawUsed characters are used
// [utility] -- score for the string, higher is better;
// scoreIgnore is rejected immediately
// [fYankNoise] -- if TRUE, noise is removed from the string
// Returns: FALSE if the item was rejected from a full queue because
// it was worse than anything in the queue, TRUE otherwise.
// FALSE is also returned when utility is scoreIgnore.
// History: 12-Jan-96 dlee Created
// Chunk size, in characters, used when a large block of text is queued
// piecemeal; AddRawText also uses it to age the raw text score.
const unsigned cwcTextAtATime = 25;
BOOL CDocCharacterization::Add(
const WCHAR * pwcSummary,
unsigned cwcSummary,
unsigned utility,
BOOL fYankNoise )
Win4Assert( _fIsGenerating );
if ( scoreIgnore == utility )
return FALSE;
if ( 0 != cwcSummary )
// Work on a bounded stack copy of the text.
unsigned cwcBuf = __min( cwcSummary, cwcMaxRawUsed );
WCHAR awcBuf[ cwcMaxRawUsed ];
if ( fYankNoise )
// cwcBuf is updated to the cleaned length.
YankNoise( pwcSummary, awcBuf, cwcBuf );
// no text left after removal of noise?
if ( 0 == cwcBuf )
return TRUE;
// something we should ignore (the raw text version of the title)?
// NOTE(review): the length test compares _cwcIgnoreBuf with the constant
// cwcSummarySpace rather than with cwcBuf, the length of the cleaned text
// that wcsncmp then inspects.  As written the ignore string can only match
// when it is exactly cwcSummarySpace characters long -- looks like cwcBuf
// was intended; confirm.
if ( ( _cwcIgnoreBuf == cwcSummarySpace ) &&
( !wcsncmp( awcBuf, _awcIgnoreBuf, _cwcIgnoreBuf ) ) )
return TRUE;
// Presumably the path taken when fYankNoise is FALSE: the text is copied
// as is -- NOTE(review): confirm.
RtlCopyMemory( awcBuf, pwcSummary, cwcBuf * sizeof WCHAR );
// if it looks like it's one sentence, send it all at once
if ( ( utility > scoreRawText ) ||
( cwcBuf <= cwcMaxIgnoreBuf ) )
return AddCleanedString( awcBuf, cwcBuf, utility, fYankNoise );
// large block of text, so send a little at a time to the queue.
for ( unsigned owc = 0; owc < cwcBuf; )
unsigned cwcNow = __min( cwcBuf - owc, cwcTextAtATime );
// NOTE(review): the remaining arguments of this call are not visible in
// this listing.
if ( !AddCleanedString( awcBuf + owc,
return FALSE;
owc += cwcNow;
return TRUE;
} //_Add
// Method: CDocCharacterization::AddRawText, private
// Synopsis: Adds some text to the queue with a utility of raw text.
// Arguments: [pwcRawText] -- string to add to the summary
// [cwcText] -- # characters in the string
// Notes: _scoreRawText is the score given to the next piece of raw text.
// It is lowered after each piece, and once it is 0 no more raw text is
// offered to the queue.
// History: 12-Jan-96 dlee Created
void CDocCharacterization::AddRawText(
const WCHAR * pwcRawText,
unsigned cwcText )
Win4Assert( _fIsGenerating );
Win4Assert( _queue.CurrentSize() <= _queue.MaxTotalSize() );
if ( 0 != _scoreRawText )
if ( Add( pwcRawText, cwcText, _scoreRawText ) )
// Lower the score by one for every cwcTextAtATime characters just offered.
// NOTE(review): _scoreRawText is used as an unsigned score; if
// cwcText / cwcTextAtATime exceeds it this subtraction would wrap around
// to a very large score -- confirm the value ranges.
_scoreRawText -= cwcText / cwcTextAtATime;
// Presumably the rejection path: the queue turned the text down, so stop
// offering raw text -- NOTE(review): confirm.
_scoreRawText = 0;
} //_AddRawText
// Method: CDocCharacterization::RemoveLowScoringItems, private
// Synopsis: Removes low-scoring items from the queue
// Arguments: [iLimit] -- items scoring <= iLimit are removed
// History: 29-Aug-96 dlee Created
void CDocCharacterization::RemoveLowScoringItems(
unsigned iLimit )
Win4Assert( _fIsGenerating );
while ( 0 != _queue.Count() )
CSummaryText &top = _queue.PeekTop();
if ( top.GetUtility() <= iLimit )
CSummaryText text;
_queue.DeQueue( text );
delete [] text.GetText();
} //_RemoveLowScoringItems
// Method: CDocCharacterization::Get, public
// Synopsis: Returns the summary in one string.
// Arguments: [awcSummary] -- output string, null terminated on return
// [cwcSummary] -- in: size of awcSummary in characters; must be
// larger than the queue's maximum total size
// out: the length of the string, not counting the terminator
// [fUseRawText] -- TRUE if raw text should be included
// Notes: The queue is emptied and the text of each item is freed as it is
// copied out.
// History: 12-Jan-96 dlee Created
void CDocCharacterization::Get(
WCHAR * awcSummary,
unsigned & cwcSummary,
BOOL fUseRawText )
Win4Assert( _fIsGenerating );
// Caller should give us a buffer large enough to hold the
// characterization they requested and a null termination.
Win4Assert( _queue.CurrentSize() <= _queue.MaxTotalSize() );
Win4Assert( cwcSummary > _queue.MaxTotalSize() );
Win4Assert( cwcSummary > _queue.CurrentSize() );
// If we shouldn't include raw text, pop low-scoring items off the
// top of the queue.
if ( !fUseRawText )
RemoveLowScoringItems( scoreRawText );
// If a meta description was added, there's no point in tacking on
// additional text in the abstract.
if ( _fMetaDescriptionAdded )
Win4Assert( cwcSummary > _awcMetaDescription.Count() );
// NOTE(review): the source argument of this copy is not visible in this
// listing.
RtlCopyMemory( awcSummary,
_awcMetaDescription.SizeOf() );
cwcSummary = _awcMetaDescription.Count();
awcSummary[ cwcSummary ] = 0;
// NOTE(review): presumably the meta description case ends here (return or
// else); otherwise the lines below would overwrite it -- confirm.
cwcSummary = _queue.CurrentSize();
// The item on the top of the queue is the least useful item, so
// we have to invert the order.
// Start at the terminator position and fill the buffer back to front.
WCHAR *pwcSummary = awcSummary + cwcSummary;
*pwcSummary = 0;
CSummaryText text;
while ( _queue.DeQueue( text ) )
pwcSummary -= text.GetSize();
// NOTE(review): the source argument of this copy is not visible in this
// listing.
RtlCopyMemory( pwcSummary,
text.GetSize() * sizeof WCHAR );
delete [] text.GetText();
// Every queued character has been placed, so we are back at the start.
Win4Assert( pwcSummary == awcSummary );
} //GetSummary
// Method: CDocCharacterization::Ignore, private
// Synopsis: Tells the class to ignore this string in the generation
// of a summary. This is probably the "title" of an html
// document, which is stored in a separate property, and it
// would be redundant to store it twice.
// Arguments: [pwcIgnore] -- string to ignore
// [cwcText] -- # characters in the string; only the first
// cwcMaxIgnoreBuf characters are considered
// Notes: The cleaned string is kept in _awcIgnoreBuf / _cwcIgnoreBuf so
// that later additions can be compared against it.
// History: 12-Jan-96 dlee Created
void CDocCharacterization::Ignore(
const WCHAR * pwcIgnore,
unsigned cwcText )
Win4Assert( _fIsGenerating );
// clean and save the string to ignore
_cwcIgnoreBuf = __min( cwcText, cwcMaxIgnoreBuf );
// NOTE(review): YankNoise asserts its length is <= cwcMaxRawUsed and may
// write up to cwcMaxRawUsed characters; this assumes cwcMaxIgnoreBuf and
// _awcIgnoreBuf are sized accordingly -- confirm against the header.
YankNoise( pwcIgnore, _awcIgnoreBuf, _cwcIgnoreBuf );
// remove any instance of the string in the queue
// Queued strings that were cleaned carry cwcSummarySpace extra characters,
// so the stored size to look for is that much longer.
unsigned cwcTest = _cwcIgnoreBuf + cwcSummarySpace;
for ( unsigned x = 0; x < _queue.Count(); x++ )
CSummaryText &testText = _queue.Peek( x );
if ( ( cwcTest == testText.GetSize() ) &&
( testText.isSame( _awcIgnoreBuf, _cwcIgnoreBuf ) ) )
delete [] testText.GetText();
_queue.Remove( x );
// NOTE(review): whether the loop stops after a removal is not visible in
// this listing; continuing with x++ would skip the item that follows the
// removed one, assuming Remove shifts later items down -- confirm.
} //_Ignore
// Method: CDocCharacterization::Add, public
// Synopsis: Adds a string value to the queue if appropriate, based on the
// propspec and the nature of the string.
// Arguments: [var] -- property value; only VT_LPWSTR values are considered
// [ps] -- property the value belongs to
// Notes: The title, psRevName and psName are never added. In the html
// meta property set only the property named "DESCRIPTION" is
// used: it is saved as the meta description (truncated to the
// queue's maximum total size) and the queue is emptied.
// History: 12-Jan-96 dlee Created
void CDocCharacterization::Add( CStorageVariant const & var,
CFullPropSpec & ps )
// if the meta description has been added already, we're done.
// NOTE(review): the statement controlled by this test is not visible in
// this listing; presumably an early return.
if ( _fMetaDescriptionAdded || !_fIsGenerating )
#if CIDBG == 1
ciDebugOut(( DEB_DOCSUM, "docchar::Add variant type %#x\n", var.vt ));
if ( VT_LPWSTR == var.vt )
ciDebugOut(( DEB_DOCSUM, " wstr: '%ws'\n", var.pwszVal ));
else if ( VT_LPSTR == var.vt )
ciDebugOut(( DEB_DOCSUM, " str: '%s'\n", var.pszVal ));
// NOTE(review): the format expects Data1, Data2 and Data3 before the eight
// Data4 bytes; those arguments are not visible in this listing.
ciDebugOut(( DEB_DOCSUM,
" guid {%08lx-%04x-%04x-%02x%02x-%02x%02x%02x%02x%02x%02x}\n",
ps.GetPropSet().Data4[0], ps.GetPropSet().Data4[1],
ps.GetPropSet().Data4[2], ps.GetPropSet().Data4[3],
ps.GetPropSet().Data4[4], ps.GetPropSet().Data4[5],
ps.GetPropSet().Data4[6], ps.GetPropSet().Data4[7] ));
if ( ps.IsPropertyName() )
ciDebugOut(( DEB_DOCSUM, " string: '%ws'\n", ps.GetPropertyName() ));
ciDebugOut(( DEB_DOCSUM, " id: '%d'\n", ps.GetPropertyPropid() ));
#endif // CIDBG
// title is added as plain text and _Ignore() is called then.
if ( ps != psTitle )
if ( VT_LPWSTR == var.Type() )
// Don't put file names or meta properties in abstracts.
if ( ( psRevName != ps ) &&
( psName != ps ) )
if ( guidMeta == ps.GetPropSet() )
// This is the ideal string, based on html spec.
// Toss all other meta property values.
if ( ( ps.IsPropertyName() ) &&
( 0 == _wcsicmp( ps.GetPropertyName(), pwcDescription ) ) )
// From now on Add() ignores everything and Get() returns this text.
_fMetaDescriptionAdded = TRUE;
// make a copy of the meta description
// A null string yields an empty description.
if ( 0 == var.GetLPWSTR() )
_awcMetaDescription.Init( 0 );
// Keep no more than the maximum size of a characterization.
unsigned cwc = __min( wcslen( var.GetLPWSTR() ),
_queue.MaxTotalSize() );
_awcMetaDescription.Init( cwc );
// NOTE(review): the source argument of this copy is not visible in this
// listing.
RtlCopyMemory( _awcMetaDescription.GetPointer(),
_awcMetaDescription.SizeOf() );
// toss everything in the queue
CSummaryText text;
while ( _queue.DeQueue( text ) )
delete [] text.GetText();
// Document summary properties get a score that depends on the property.
else if ( 0 != var.GetLPWSTR() &&
( guidDocSummary == ps.GetPropSet() ) )
Win4Assert( ps.IsPropertyPropid() );
Add( var.GetLPWSTR(),
wcslen( var.GetLPWSTR() ),
DocSumScore( ps.GetPropertyPropid() ) );
// Presumably the fallback for any other property set: a fixed score --
// NOTE(review): confirm.
if ( 0 != var.GetLPWSTR() )
Add( var.GetLPWSTR(),
wcslen( var.GetLPWSTR() ),
scoreOtherProperty );
} // if VT_LPWSTR
} // ps != psTitle
} //Add
// Method: CDocCharacterization::Add, public
// Synopsis: Adds a string to the queue if appropriate, based on the
// propspec and the nature of the string.
// Arguments: [pwcSummary] -- text to consider for the summary
// [cwcSummary] -- # characters in the string
// [ps] -- Property being added
// Notes: Html heading text is scored by HtmlPropScore; urls, comments
// and scripts are skipped; the title is passed to Ignore();
// presumably everything else is added as raw text.
// History: 12-Jan-96 dlee Created
// NOTE(review): the declaration of the [ps] parameter is not visible in
// this listing; the body uses ps.guidPropSet and ps.psProperty.
void CDocCharacterization::Add(
const WCHAR * pwcSummary,
unsigned cwcSummary,
Win4Assert( _queue.CurrentSize() <= _queue.MaxTotalSize() );
// if the meta description has been added already, we're done.
// NOTE(review): the statement controlled by this test is not visible in
// this listing; presumably an early return.
if ( _fMetaDescriptionAdded || !_fIsGenerating )
#if CIDBG == 1
ciDebugOut(( DEB_DOCSUM, "docchar::Add: '%.*ws'\n", cwcSummary, pwcSummary ));
// NOTE(review): the format expects Data1, Data2 and Data3 before the eight
// Data4 bytes; those arguments are not visible in this listing.
ciDebugOut(( DEB_DOCSUM,
" guid {%08lx-%04x-%04x-%02x%02x-%02x%02x%02x%02x%02x%02x}\n",
ps.guidPropSet.Data4[0], ps.guidPropSet.Data4[1],
ps.guidPropSet.Data4[2], ps.guidPropSet.Data4[3],
ps.guidPropSet.Data4[4], ps.guidPropSet.Data4[5],
ps.guidPropSet.Data4[6], ps.guidPropSet.Data4[7] ));
if ( PRSPEC_LPWSTR == ps.psProperty.ulKind )
ciDebugOut(( DEB_DOCSUM, " string: '%ws'\n", ps.psProperty.lpwstr ));
ciDebugOut(( DEB_DOCSUM, " id: '%d'\n", ps.psProperty.propid ));
#endif // CIDBG
// add raw text unless it's the title
// Html headings are scored by heading level.
if ( guidHtmlInformation == ps.guidPropSet )
// NOTE(review): the length argument of this call is not visible in this
// listing.
Add( pwcSummary,
HtmlPropScore( ps.psProperty.propid ) );
else if ( guidHTMLUrl == ps.guidPropSet ||
guidHTMLComment == ps.guidPropSet )
// just ignore it
else if ( guidHTMLScript == ps.guidPropSet )
// note: the current html filter doesn't emit scripts, but just
// in case that changes this case is checked.
ciDebugOut(( DEB_DOCSUM, "ignoring script\n" ));
// The title lives in its own property, so keep it out of the summary.
else if ( psTitle == * ( (CFullPropSpec *)&ps ) )
Ignore( pwcSummary, cwcSummary );
// Presumably the final else branch: plain text from any other property --
// NOTE(review): confirm.
AddRawText( pwcSummary, cwcSummary );
Win4Assert( _queue.CurrentSize() <= _queue.MaxTotalSize() );
} //Add