879 lines
26 KiB
C++
879 lines
26 KiB
C++
// FragInfo.cpp -- Implementation for class CFragInfo
|
|
|
|
#include "stdafx.h"
|
|
|
|
#include "FragInfo.h"
|
|
#include "Memex.h"
|
|
|
|
extern char acMap[];
|
|
|
|
/////////////////////////////////////////////////////////////////////////////
|
|
// Worker functions
|
|
|
|
// AllLowerCase -- scan the weight section of a sort key.
//
//   pwText, cwText -- sort key words and their count.
//
// Skips forward to the first word whose high byte is SORT_KEY_SEPARATOR,
// then examines that word and every word after it.  Returns FALSE as soon
// as one of them is greater than the word made of two separator bytes,
// TRUE otherwise (including when no separator word exists).
BOOL AllLowerCase(PWCHAR pwText, UINT cwText)
{
    const WORD wSepMax = SORT_KEY_SEPARATOR | (SORT_KEY_SEPARATOR << 8);

    // Step over the leading words until the separator word is reached.
    while (cwText && SORT_KEY_SEPARATOR != ((*pwText) >> 8))
    {
        pwText++;
        cwText--;
    }

    // Every remaining word must not exceed the separator/separator word.
    while (cwText--)
    {
        if (wSepMax < *pwText++)
            return FALSE;
    }

    return TRUE;
}
|
|
|
|
// SortKeyText -- convert a sort key into a text order based representation
// for IsAPrefix / IsASuffix / IsASubstring type text operations.
//
// The output is built up as one 4-byte group per alpha sort weight: the
// alpha weight word, immediately followed by a word whose two bytes receive
// that character's case weight (byte offset 2 of the group) and diacritic
// weight (byte offset 3 of the group).  Non-existent diacritic and case
// weights are left as SORT_KEY_SEPARATOR (lowest sort value).
//
//   pwText, cwText -- sort key words.  The weight section is byte swapped
//                     in place while it is scanned and swapped back before
//                     the function returns.
//   pwOut, cwOut   -- output buffer and its capacity in WCHARs.
//
// Returns the number of WCHARs stored in pwOut (not counting the
// terminating 0), or 0 when the input is empty or pwOut is too small.
UINT SortKeyText(PWCHAR pwText, UINT cwText, PWCHAR pwOut, UINT cwOut)
{
    UINT   nChar  = 0;
    PCHAR  pWork, pAlpha, pBuild, pSeparator;
    PWCHAR pwTemp;
    PWCHAR pwEnd  = pwText + cwText;
    PCHAR  pStart = (PCHAR)pwOut;

    static BOOL fSetup, fCaseLR, fDiacriticLR;  // determine LR nature of case and diacritic weights.

    if (!fSetup)
    {
        BYTE szTest[] = "Au";       // test case: one capital letter and one diacritic.
        char szSort[25] = { 0 };    // zero filled so the probes below never read garbage

        szTest[1] = 250;            // converts second char to 'u' with diacritic.

        fSetup = fCaseLR = fDiacriticLR = TRUE;  // set defaults to L --> R case and diacritic processing.

        int nLen = LCMapStringA(GetUserDefaultLCID(), LCMAP_SORTKEY, (PSTR)szTest, 2, szSort, sizeof(szSort));

        if (nLen > 8 && szSort[4] == SORT_KEY_SEPARATOR && szSort[5] != SORT_KEY_SEPARATOR)
        {
            nLen = 8;

            if (szSort[6] == SORT_KEY_SEPARATOR)
            {
                nLen--;
                fDiacriticLR = FALSE;   // we found R->L diacritic sort key generation.
            }

            if (szSort[nLen-1] == SORT_KEY_SEPARATOR && szSort[nLen] != SORT_KEY_SEPARATOR &&
                szSort[nLen+1] != SORT_KEY_SEPARATOR && szSort[nLen+2] == SORT_KEY_SEPARATOR)
                fCaseLR = FALSE;        // we found R->L case weight sort key generation.
        }
    }

    if (!cwText || !cwOut)
        return 0;

    // Copy the alpha weights up to the first weight separator.  The scan is
    // bounded by the end of the key, and capacity is checked before storing:
    // each step needs room for the pair plus the terminating 0.
    while (pwText < pwEnd && HIBYTE(*pwText) != SORT_KEY_SEPARATOR)
    {
        if (nChar + 2 >= cwOut)
            return 0;

        pwOut[nChar++] = *pwText++;
        pwOut[nChar++] = MAKEWORD(SORT_KEY_SEPARATOR, SORT_KEY_SEPARATOR);
    }

    pwOut[nChar] = 0;   // terminating NULL for "strstr"

    const UINT cbOut = UINT(nChar * sizeof(WCHAR));  // bytes occupied by the pairs

    for (pwTemp = pwText; pwTemp < pwEnd; pwTemp++)
        *pwTemp = (*pwTemp >> 8) | (*pwTemp << 8);   // bring sort key weights in byte order

    if (fDiacriticLR)   // L->R diacritic weights.
    {
        UINT ibOut = 3;                 // diacritic slot of the first group

        pAlpha = (PCHAR)pwText + 1;     // start at beginning of diacritics

        // Bounds test comes first so *pAlpha is never read past the key.
        while (pAlpha < (PCHAR)pwEnd && *pAlpha != SORT_KEY_SEPARATOR)
        {
            if (ibOut < cbOut)
                pStart[ibOut] = *pAlpha;    // fill out diacritic weights

            pAlpha++;
            ibOut += 4;                     // next diacritic, moving from start to end
        }

        pWork = pAlpha;
    }
    else                // R->L diacritic weights (old style and French).
    {
        for (pWork = (PCHAR)pwText + 1; pWork < (PCHAR)pwEnd; pWork++)
            if (*pWork == SORT_KEY_SEPARATOR)   // find the separator ending the diacritics
                break;

        pBuild = PCHAR(pwOut + nChar) - 1;

        for (pAlpha = pWork - 1; pAlpha > (PCHAR)pwText && pBuild > pStart; pAlpha--)
        {
            *pBuild = *pAlpha;  // fill out diacritic weights
            pBuild -= 4;        // next diacritic, moving from end to start
        }
    }

    if (fCaseLR)        // L->R case weights.
    {
        UINT ibOut = 2;                 // case slot of the first group

        pWork++;                        // skip case separator

        while (pWork < (PCHAR)pwEnd && *pWork != SORT_KEY_SEPARATOR)
        {
            if (ibOut < cbOut)
                pStart[ibOut] = *pWork; // fill out case weights

            pWork++;
            ibOut += 4;                 // next case weight, moving from start to end
        }
    }
    else                // R->L case weights (old style sort keys).
    {
        pSeparator = pWork++;

        for ( ; pWork < (PCHAR)pwEnd; pWork++)      // skip case separator
            if (*pWork == SORT_KEY_SEPARATOR)       // find special weights separator
                break;

        pBuild = PCHAR(pwOut + nChar) - 2;

        for (pAlpha = pWork - 1; pAlpha > pSeparator && pBuild > pStart; pAlpha--)
        {
            *pBuild = *pAlpha;  // fill out case weights
            pBuild -= 4;        // next case, moving from end to start
        }
    }

    for (pwTemp = pwText; pwTemp < pwEnd; pwTemp++)
        *pwTemp = (*pwTemp >> 8) | (*pwTemp << 8);   // byte reverse sort keys weights (restore input)

    return nChar;
}
|
|
|
|
// IsAPrefix -- TRUE when sort key L is a prefix of sort key R.
//
// The first word of each key (the alpha-num-punc prefix) is skipped.  The
// alpha weights of L must match the leading alpha weights of R, and the
// SortKeyText() expansion of L must match the start of the expansion of R.
// Returns FALSE when either expansion comes back empty.
BOOL IsAPrefix(PWCHAR pwStringL, UINT cwStringL, PWCHAR pwStringR, UINT cwStringR)
{
    WCHAR awExpandedL[512];
    WCHAR awExpandedR[512];
    UINT  i;

    // skip alpha-num-punc prefix
    if (cwStringL) { pwStringL++; cwStringL--; }
    if (cwStringR) { pwStringR++; cwStringR--; }

    // Count the alpha weight words in front of the first separator.
    UINT cAlphaL = 0;
    while (cAlphaL < cwStringL && HIBYTE(pwStringL[cAlphaL]) != SORT_KEY_SEPARATOR)
        cAlphaL++;

    UINT cAlphaR = 0;
    while (cAlphaR < cwStringR && HIBYTE(pwStringR[cAlphaR]) != SORT_KEY_SEPARATOR)
        cAlphaR++;

    if (cAlphaL > cAlphaR)
        return FALSE;               // prefix is larger than base word

    // Cheap test first: the raw alpha weights have to agree.
    for (i = 0; i < cAlphaL; i++)
    {
        if (pwStringL[i] != pwStringR[i])
            return FALSE;
    }

    // Then compare the per-character (alpha, diacritic, case) expansions.
    UINT cwL = SortKeyText(pwStringL, cwStringL, awExpandedL, sizeof(awExpandedL)/sizeof(WCHAR));
    UINT cwR = SortKeyText(pwStringR, cwStringR, awExpandedR, sizeof(awExpandedR)/sizeof(WCHAR));

    if (cwL == 0 || cwR == 0)
        return FALSE;

    for (i = 0; i < cwL; i++)
    {
        if (awExpandedL[i] != awExpandedR[i])
            return FALSE;
    }

    return TRUE;
}
|
|
|
|
// IsASuffix -- TRUE when sort key L is a suffix of sort key R.
//
// The first word of each key (the alpha-num-punc prefix) is skipped.  The
// alpha weights of L must match the trailing alpha weights of R, and the
// SortKeyText() expansion of L must match the tail of the expansion of R.
// Returns FALSE when either expansion comes back empty.
BOOL IsASuffix(PWCHAR pwStringL, UINT cwStringL, PWCHAR pwStringR, UINT cwStringR)
{
    WCHAR awExpandedL[512];
    WCHAR awExpandedR[512];
    UINT  i;

    // skip alpha-num-punc prefix
    if (cwStringL) { pwStringL++; cwStringL--; }
    if (cwStringR) { pwStringR++; cwStringR--; }

    // Count the alpha weight words in front of the first separator.
    UINT cAlphaL = 0;
    while (cAlphaL < cwStringL && HIBYTE(pwStringL[cAlphaL]) != SORT_KEY_SEPARATOR)
        cAlphaL++;

    UINT cAlphaR = 0;
    while (cAlphaR < cwStringR && HIBYTE(pwStringR[cAlphaR]) != SORT_KEY_SEPARATOR)
        cAlphaR++;

    if (cAlphaL > cAlphaR)
        return FALSE;               // suffix is larger than base word

    // Line the end of L's alpha weights up with the end of R's.
    PWCHAR pwTailR = pwStringR + (cAlphaR - cAlphaL);

    for (i = 0; i < cAlphaL; i++)
    {
        if (pwStringL[i] != pwTailR[i])
            return FALSE;
    }

    // Then compare the per-character (alpha, diacritic, case) expansions.
    UINT cwL = SortKeyText(pwStringL, cwStringL, awExpandedL, sizeof(awExpandedL)/sizeof(WCHAR));
    UINT cwR = SortKeyText(pwStringR, cwStringR, awExpandedR, sizeof(awExpandedR)/sizeof(WCHAR));

    if (cwL == 0 || cwR == 0)
        return FALSE;

    PWCHAR pwExpandedTailR = awExpandedR + cwR - cwL;

    for (i = 0; i < cwL; i++)
    {
        if (awExpandedL[i] != pwExpandedTailR[i])
            return FALSE;
    }

    return TRUE;
}
|
|
|
|
// IsASubstring -- TRUE when the SortKeyText() expansion of sort key L occurs
// somewhere inside the expansion of sort key R.
//
// The first word of each key is skipped before expansion, as in IsAPrefix
// and IsASuffix.  Empty or NULL keys yield FALSE; previously a zero length
// was decremented and wrapped around to UINT_MAX before being handed to
// SortKeyText().
BOOL IsASubstring(PWCHAR pwL, UINT cwL, PWCHAR pwR, UINT cwR)
{
    WCHAR workL[512], workR[512];

    if (!pwL || !pwR || !cwL || !cwR)
        return FALSE;

    cwL = SortKeyText(pwL + 1, cwL - 1, workL, sizeof(workL)/sizeof(WCHAR));
    cwR = SortKeyText(pwR + 1, cwR - 1, workR, sizeof(workR)/sizeof(WCHAR));

    if (!cwL || !cwR || cwL > cwR)
        return FALSE;

    // SortKeyText() zero terminates its output, which wcsstr relies on.
    return (wcsstr(workR, workL) != NULL);
}
|
|
|
|
// End of Worker functions
|
|
/////////////////////////////////////////////////////////////////////////////
|
|
|
|
// Constructs an empty fragment: no image, no collections attached, no
// cached word/article/reference sets, and reference type NoRefs.
CFragInfo::CFragInfo()
{
    m_pwcFrag          = NULL;
    m_cwcFrag          = 0;
    m_pFrag            = NULL;
    m_cFrag            = 0;
    m_fEvaluated       = FALSE;
//  m_cwcAllocated     = 0;
    m_ptkc             = NULL;
    m_ptlc             = NULL;
    m_pcsVisibleWords  = NULL;
    m_pcsSelectedWords = NULL;
    m_pcsArticleSet    = NULL;
    m_fFlags           = 0;
    m_pRefList         = NULL;
    m_pRefNext         = NULL;      // was left uninitialized
    m_rt               = NoRefs;
    m_fFeedback        = FALSE;     // was left uninitialized until AttachParameters
    m_iWordMatchType   = 0;         // was left uninitialized until AttachParameters
}
|
|
|
|
// Releases every reference this fragment still holds and frees the
// per-text-set reference list.  The fragment image buffers (m_pwcFrag,
// m_pFrag) are not freed here.
CFragInfo::~CFragInfo()
{
//  if (m_pwcFrag) VFree(m_pwcFrag);

    if (m_ptkc) { DetachRef(m_ptkc); }
    if (m_ptlc) { DetachRef(m_ptlc); }

    if (m_pcsVisibleWords ) { DetachRef(m_pcsVisibleWords ); }
    if (m_pcsSelectedWords) { DetachRef(m_pcsSelectedWords); }
    if (m_pcsArticleSet   ) { DetachRef(m_pcsArticleSet   ); }

    // Walk the chain, remembering the successor before each node is freed.
    PPerTextSet pptsCurrent = m_pRefList;

    while (pptsCurrent)
    {
        PPerTextSet pptsFollowing = pptsCurrent->pptsNext;

        if (pptsCurrent->pcsRefs) { DetachRef(pptsCurrent->pcsRefs); }

        VFree(pptsCurrent);

        pptsCurrent = pptsFollowing;
    }
}
|
|
|
|
// Factory: allocates a CFragInfo and hands it the given collections,
// reference type, feedback flag, match type and fragment images via
// AttachParameters().  If the guarded section terminates abnormally, the
// partially built object is deleted before the exception continues.
CFragInfo *CFragInfo::NewFragInfo(CTokenCollection *ptkc, CTitleCollection *ptlc,
                                  RefType rt, BOOL fFeedback, UINT iWordMatchType, PWCHAR pwcFrag, UINT cwcFrag, PWCHAR pFrag, UINT cFrag)
{
    CFragInfo *pfiNew = NULL;

    __try
    {
        pfiNew = New CFragInfo();

        pfiNew->AttachParameters(ptkc, ptlc, rt, fFeedback, iWordMatchType, pwcFrag, cwcFrag, pFrag, cFrag);
    }
    __finally
    {
        if (_abnormal_termination())
        {
            if (pfiNew)
            {
                delete pfiNew;
                pfiNew = NULL;
            }
        }
    }

    return pfiNew;
}
|
|
|
|
// Binds the token and title collections, records the reference type,
// feedback flag and word match type, then installs the fragment images
// through SetImage().  Always returns TRUE.
BOOL CFragInfo::AttachParameters(CTokenCollection *ptkc, CTitleCollection *ptlc,
                                 RefType rt, BOOL fFeedback, UINT iWordMatchType, PWCHAR pwcFrag, UINT cwcFrag, PWCHAR pFrag, UINT cFrag)
{
    AttachRef(m_ptkc, ptkc);
    AttachRef(m_ptlc, ptlc);

    // These must be in place before SetImage(), which evaluates the change
    // against the current reference type, feedback flag and match type.
    m_rt             = rt;
    m_fFeedback      = fFeedback;
    m_iWordMatchType = iWordMatchType;

//  m_cwcAllocated= CWC_FRAGMENT_GRANULE * ((cwcFrag + CWC_FRAGMENT_GRANULE - 1) / CWC_FRAGMENT_GRANULE);
//  m_pwcFrag= PWCHAR(VAlloc(FALSE, m_cwcAllocated * sizeof(WCHAR));
//  if (!m_pwcFrag) return FALSE;

    // Start from an empty image so SetImage() sees "old length 0".
    m_cwcFrag = 0;

    SetImage(pwcFrag, cwcFrag, pFrag, cFrag);

    return TRUE;
}
|
|
|
|
// Returns the reference type currently recorded for this fragment.
RefType CFragInfo::GetRefType()
{
    return this->m_rt;
}
|
|
|
|
// Returns the compressed article set, evaluating the fragment first when
// no value is cached.  Only meaningful for the AllWords and AnyWord
// reference types (asserted).
CCompressedSet *CFragInfo::GetCSArticleSet()
{
    ASSERT(m_rt == AllWords || m_rt == AnyWord);

    if (!HasValue())
    {
        CoerceToValue();
    }

    return m_pcsArticleSet;
}
|
|
|
|
// Returns a new indicator set built from the compressed article set, or
// NULL when there is no article set.
CIndicatorSet *CFragInfo::GetArticleSet()
{
    CCompressedSet *pcsArticles = GetCSArticleSet();

    if (pcsArticles)
    {
        return CIndicatorSet::NewIndicatorSet(pcsArticles);
    }

    return NULL;
}
|
|
|
|
void CFragInfo::MoveToFirstLocationSet()
|
|
{
|
|
if (!HasValue()) CoerceToValue();
|
|
|
|
m_pRefNext= m_pRefList;
|
|
}
|
|
|
|
// Returns the compressed reference set recorded for text set iTS, or NULL
// when the list has no entry for it.  The list is ordered by text set
// index, so the walk stops at the first entry beyond iTS.  m_pRefNext is
// used as the cursor and is left where the walk ended.
CCompressedSet *CFragInfo::GetCSLocationSet(UINT iTS)
{
    if (!HasValue())
    {
        CoerceToValue();
    }

    m_pRefNext = m_pRefList;

    while (m_pRefNext && m_pRefNext->its <= iTS)
    {
        if (m_pRefNext->its == iTS)
        {
            return m_pRefNext->pcsRefs;
        }

        m_pRefNext = m_pRefNext->pptsNext;
    }

    return NULL;
}
|
|
|
|
// Returns a new indicator set built from the compressed reference set of
// text set iTS, or NULL when that text set has no references.
CIndicatorSet *CFragInfo::GetLocationSet(UINT iTS)
{
    CCompressedSet *pcsLocations = GetCSLocationSet(iTS);

    if (pcsLocations)
    {
        return CIndicatorSet::NewIndicatorSet(pcsLocations);
    }

    return NULL;
}
|
|
|
|
// Stores the address of the fragment image in *ppwc and returns the image
// length in WCHARs.
UINT CFragInfo::GetImage (const WCHAR **ppwc)
{
    const UINT cwcImage = m_cwcFrag;

    *ppwc = m_pwcFrag;

    return cwcImage;
}
|
|
|
|
// Installs new fragment images and lets EvaluateChange() compare them with
// the previous ones.  Reference type, feedback flag and match type are
// passed through unchanged.  Returns EvaluateChange()'s result.
BOOL CFragInfo::SetImage(PWCHAR pwcFrag, UINT cwcFrag, PWCHAR pFrag, UINT cFrag)
{
    // Remember the outgoing images ...
    PWCHAR pwcPrevious = m_pwcFrag;
    UINT   cwcPrevious = m_cwcFrag;
    PWCHAR pPrevious   = m_pFrag;
    UINT   cPrevious   = m_cFrag;

    // ... and switch to the incoming ones.
    m_pwcFrag = pwcFrag;
    m_cwcFrag = cwcFrag;
    m_pFrag   = pFrag;
    m_cFrag   = cFrag;

    return EvaluateChange(pwcPrevious, cwcPrevious, pPrevious, cPrevious, m_rt, m_fFeedback, m_iWordMatchType);
}
|
|
|
|
// Changes the reference type and feedback flag, then lets EvaluateChange()
// decide whether the cached value has to be discarded.  Returns
// EvaluateChange()'s result.
//
// The PREVIOUS feedback flag is handed to EvaluateChange().  The earlier
// code passed the new value, so a change of the feedback flag alone was
// never recognized as a change.
BOOL CFragInfo::SetReferenceType(RefType rt, BOOL fFeedback)
{
    RefType rtOld        = m_rt;
    BOOL    fFeedbackOld = m_fFeedback;

    m_rt        = rt;
    m_fFeedback = fFeedback;

    return EvaluateChange(m_pwcFrag, m_cwcFrag, m_pFrag, m_cFrag, rtOld, fFeedbackOld, m_iWordMatchType);
}
|
|
|
|
// Installs new fragment images together with a new reference type and
// feedback flag, then lets EvaluateChange() compare everything with the
// previous state.  Returns EvaluateChange()'s result.
//
// The previous feedback flag is now captured from m_fFeedback before it is
// overwritten.  The earlier code initialized it from the new parameter and
// passed the new member value as "old", so a feedback change went unnoticed.
BOOL CFragInfo::SetImageAndType(PWCHAR pwcFrag, UINT cwcFrag, PWCHAR pFrag, UINT cFrag, RefType rt, BOOL fFeedback)
{
    // Snapshot of the outgoing state.
    PWCHAR  pwcOld       = m_pwcFrag;
    UINT    cwcOld       = m_cwcFrag;
    RefType rtOld        = m_rt;
    BOOL    fFeedbackOld = m_fFeedback;
    PWCHAR  pOld         = m_pFrag;
    UINT    cOld         = m_cFrag;

    // Install the incoming state.
    m_pwcFrag   = pwcFrag;
    m_cwcFrag   = cwcFrag;
    m_rt        = rt;
    m_fFeedback = fFeedback;
    m_pFrag     = pFrag;
    m_cFrag     = cFrag;

    return EvaluateChange(pwcOld, cwcOld, pOld, cOld, rtOld, fFeedbackOld, m_iWordMatchType);
}
|
|
|
|
// Switches the word match type.  Returns TRUE at once when the type is
// unchanged; otherwise records the new type and returns EvaluateChange()'s
// result for the transition.
BOOL CFragInfo::SetMatchCriteria(UINT iWordMatchType)
{
    if (iWordMatchType == m_iWordMatchType)
    {
        return TRUE;
    }

    UINT iWordMatchPrevious = m_iWordMatchType;

    m_iWordMatchType = iWordMatchType;

    return EvaluateChange(m_pwcFrag, m_cwcFrag, m_pFrag, m_cFrag, m_rt, m_fFeedback, iWordMatchPrevious);
}
|
|
|
|
// Forces re-evaluation by reporting UINT(-1) as the previous match type,
// while the images, reference type and feedback flag are reported as
// unchanged.
BOOL CFragInfo::InvalidateMatches()
{
    const UINT iNoSuchMatchType = UINT(-1);

    return EvaluateChange(m_pwcFrag, m_cwcFrag, m_pFrag, m_cFrag, m_rt, m_fFeedback, iNoSuchMatchType);
}
|
|
|
|
// EvaluateChange -- compare the previous state (passed in) with the current
// members and throw away whatever cached results are no longer valid.
//
//   pwcOld, cwcOld    -- previous sort key image and its length
//   pOld, cOld        -- previous text image and its length (not examined)
//   rtOld             -- previous reference type
//   fFeedbackOld      -- previous feedback flag
//   iWordMatchTypeOld -- previous word match type
//
// Returns FALSE when nothing changed, TRUE otherwise.  When the image or
// the match type changed and the new image is non-empty, the visible word
// set is recomputed.
BOOL CFragInfo::EvaluateChange(PWCHAR pwcOld, UINT cwcOld, PWCHAR pOld, UINT cOld,
                               RefType rtOld, BOOL fFeedbackOld, UINT iWordMatchTypeOld)
{
    // The image changed if the lengths differ or any word differs.
    BOOL fImageChanged = FALSE;

    if (cwcOld != m_cwcFrag)
    {
        fImageChanged = TRUE;
    }
    else
    {
        UINT iwc;

        for (iwc = 0; iwc < cwcOld; iwc++)
        {
            if (pwcOld[iwc] != m_pwcFrag[iwc])
            {
                fImageChanged = TRUE;
                break;
            }
        }
    }

    BOOL fMatchTypeChanged = m_iWordMatchType != iWordMatchTypeOld;

    if (m_pcsVisibleWords)
    {
        if (fMatchTypeChanged)
        {
            DetachRef(m_pcsVisibleWords);
        }
        else if (fImageChanged)
        {
            // The old visible set can be kept as a starting point only when
            // the old image is contained in the longer new image in the way
            // the match type demands.
            BOOL fSubset = FALSE;

            if (cwcOld < m_cwcFrag)
            {
                switch (m_iWordMatchType)
                {
                case BEGIN_WITH:        // Begins With...
                    fSubset = IsAPrefix(pwcOld, cwcOld, m_pwcFrag, m_cwcFrag);
                    break;

                case CONTAIN:           // Contains...
                    fSubset = IsASubstring(pwcOld, cwcOld, m_pwcFrag, m_cwcFrag);
                    break;

                case END_WITH:          // Ends With...
                    fSubset = IsASuffix(pwcOld, cwcOld, m_pwcFrag, m_cwcFrag);
                    break;

                case MATCH:             // Matches
                case HAVE_SAME_STEM:    // From the same root word
                    fSubset = FALSE;
                    break;
                }
            }

            if (!fSubset)
            {
                DetachRef(m_pcsVisibleWords);
            }
        }
    }

    if (m_pcsSelectedWords && (fImageChanged || fMatchTypeChanged))
    {
        DetachRef(m_pcsSelectedWords);
    }

    BOOL fRefTypeChanged = m_rt != rtOld || m_fFeedback != fFeedbackOld;

    if (!fImageChanged && !fMatchTypeChanged && !fRefTypeChanged)
    {
        return FALSE;
    }

    DiscardValue(rtOld);

    CIndicatorSet *pisOldTerms = NULL;
    CIndicatorSet *pisTerms    = NULL;

    if ((fImageChanged || fMatchTypeChanged) && m_cwcFrag)
    {
        __try
        {
            // Need to recompute the visible word set, starting from the
            // surviving visible set if there is one, else the active tokens.
            if (m_pcsVisibleWords)
            {
                AttachRef(pisOldTerms, CIndicatorSet::NewIndicatorSet(m_pcsVisibleWords));
            }
            else
            {
                AttachRef(pisOldTerms, CIndicatorSet::NewIndicatorSet(m_ptkc->ActiveTokens()));
            }

            if (m_iWordMatchType == HAVE_SAME_STEM)
            {
                ASSERT(m_ptkc->SimilaritySearch());

                // Zero terminated stack copy of the text image.
                PWCHAR pwcText = PWCHAR(_alloca(sizeof(WCHAR) * (m_cFrag + 1)));

                CopyMemory(pwcText, m_pFrag, m_cFrag * sizeof(WCHAR));

                pwcText[m_cFrag] = 0;

                pisTerms = GetWordsWithTheSameStem(pwcText, m_cFrag, m_ptkc->RowCount());
            }
            else
            {
                // Zero terminated stack copy of the sort key image.
                PWCHAR pwcKey = PWCHAR(_alloca(sizeof(WCHAR) * (m_cwcFrag + 1)));

                CopyMemory(pwcKey, m_pwcFrag, m_cwcFrag * sizeof(WCHAR));

                pwcKey[m_cwcFrag] = 0;

                pisTerms = m_ptkc->TokensContaining(pwcKey,
                                                    acMap[m_iWordMatchType] & 1,
                                                    acMap[m_iWordMatchType] & 2,
                                                    pisOldTerms
                                                   );
            }

            if (pisOldTerms)
            {
                DetachRef(pisOldTerms);
            }

            ChangeRef(m_pcsVisibleWords, CCompressedSet::NewCompressedSet(pisTerms));
        }
        __finally
        {
            if (pisOldTerms)
            {
                DetachRef(pisOldTerms);
            }

            if (pisTerms)
            {
                delete pisTerms;
                pisTerms = NULL;
            }
        }
    }

    return TRUE;
}
|
|
|
|
// Drops the cached evaluation result that belongs to reference type rtOld
// (if a value is present) and marks the fragment as not evaluated.
void CFragInfo::DiscardValue(RefType rtOld)
{
    if (HasValue())
    {
        if (rtOld == AllWords || rtOld == AnyWord)
        {
            if (m_pcsArticleSet)
            {
                DetachRef(m_pcsArticleSet);
            }
        }
        else if (rtOld == TokenRefs)
        {
            // Tear the per-text-set list down node by node.
            while (m_pRefList)
            {
                PPerTextSet pptsFollowing = m_pRefList->pptsNext;

                DetachRef(m_pRefList->pcsRefs);

                VFree(m_pRefList);

                m_pRefList = pptsFollowing;
            }

            m_pRefNext = NULL;
        }
        // NoRefs (and anything else): nothing is cached.
    }

    m_fEvaluated = FALSE;
}
|
|
|
|
void CFragInfo::CoerceToValue()
|
|
{
|
|
ASSERT(!m_fEvaluated );
|
|
ASSERT(m_rt != NoRefs);
|
|
|
|
UINT cts= m_ptkc->TextSetCount();
|
|
|
|
PTokenInfo *ppti= (PTokenInfo *) _alloca(cts * sizeof(PTokenInfo));
|
|
|
|
ASSERT(ppti);
|
|
|
|
CIndicatorSet *pisTemp = NULL;
|
|
CIndicatorSet *pisArticles = NULL;
|
|
CIndicatorSet *pisTokens = NULL;
|
|
PPerTextSet pPTS = NULL;
|
|
|
|
__try
|
|
{
|
|
pisTemp= GetSelection();
|
|
|
|
if (!pisTemp)
|
|
{
|
|
m_fEvaluated= TRUE;
|
|
|
|
__leave;
|
|
}
|
|
|
|
m_ptkc->MapToTokenLists(pisTemp, ppti, cts);
|
|
|
|
delete pisTemp; pisTemp= NULL;
|
|
|
|
if (m_rt == AllWords || m_rt == AnyWord)
|
|
{
|
|
AttachRef(pisArticles, CIndicatorSet::NewIndicatorSet(m_ptlc->RowCount()));
|
|
|
|
UINT iTextSet;
|
|
|
|
for (iTextSet= cts; iTextSet--; )
|
|
{
|
|
PTokenInfo pti= ppti[iTextSet];
|
|
|
|
if (!pti) continue;
|
|
|
|
CTextSet *pts = m_ptkc->GetTextSet (iTextSet);
|
|
const UINT *piMap = m_ptlc->UniversalTitleMap(iTextSet);
|
|
|
|
for (; pti; pti= pti->ptiTextSetLink)
|
|
pts->IndicateArticleRefs(pisArticles, pti->iDescriptor, piMap);
|
|
}
|
|
|
|
AttachRef(m_pcsArticleSet, CCompressedSet::NewCompressedSet(pisArticles));
|
|
}
|
|
else
|
|
if (m_rt == TokenRefs)
|
|
{
|
|
// The code below pushes PerTextSet items on from high iTextSet to low iTextSet.
|
|
// This leaves the resulting m_pRefList chain ordered. The code which reads
|
|
// the chain presumes this ordering.
|
|
|
|
UINT iTextSet= cts;
|
|
|
|
for (; iTextSet--; )
|
|
{
|
|
PTokenInfo pti= ppti[iTextSet];
|
|
|
|
if (!pti) continue;
|
|
|
|
CTextSet *pts= m_ptkc->GetTextSet(iTextSet);
|
|
|
|
ChangeRef(pisTokens, CIndicatorSet::NewIndicatorSet(pts->TokenCount()));
|
|
|
|
for (; pti; pti= pti->ptiTextSetLink)
|
|
pts->IndicateTokenRefs(pisTokens, pti->iDescriptor);
|
|
|
|
pPTS= PPerTextSet(VAlloc(FALSE, sizeof(PerTextSet)));
|
|
|
|
pPTS->its= iTextSet;
|
|
|
|
AttachRef(pPTS->pcsRefs, CCompressedSet::NewCompressedSet(pisTokens));
|
|
|
|
pPTS->pptsNext= m_pRefList;
|
|
|
|
m_pRefList= pPTS; pPTS= NULL;
|
|
}
|
|
|
|
m_pRefNext= m_pRefList;
|
|
}
|
|
}
|
|
__finally
|
|
{
|
|
if (pisTemp) { delete pisTemp; pisTemp = NULL; }
|
|
|
|
if (pisArticles) DetachRef(pisArticles);
|
|
if (pisTokens ) DetachRef(pisTokens);
|
|
|
|
if (pPTS) VFree(pPTS);
|
|
}
|
|
|
|
m_fEvaluated= TRUE;
|
|
}
|
|
|
|
// This routine is optimized as follows...
|
|
// First we look at each dictionary for words that have the same stem as the given word
|
|
// For each match, we enter that word into a new dictionary, pCombinedDict. This will
|
|
// ensure that duplicates do not exist.
|
|
// After we do this for all the dictionaries pCombinedDict has the unique list of words
|
|
// across all the dictionaries that have the common stem with the given word.
|
|
// Now, for each word in the dictionary, we will find variants using m_ptkc and OR the
|
|
// results into a CIndicatorSet, pTmp.
|
|
//
|
|
// When we are attempting to find words with the same stem as a designated stop word, there
|
|
// will be no matching words. When there are no matching words, we simply "match exact word"
|
|
// and get the results from TokensContaining.
|
|
|
|
// GetWordsWithTheSameStem -- build the set of tokens whose words share a
// stem with the given word.
//
//   lpsubstring, cblpsubstring -- the word and its length in characters
//   cTokens                    -- size of the indicator set to create
//
// Returns a new indicator set.  It is empty when the word is empty.  When
// no active dictionary knows the word, the result is whatever
// TokensContaining() reports for the word's own sort key.  If the guarded
// section terminates abnormally the partial result is deleted.
CIndicatorSet * CFragInfo::GetWordsWithTheSameStem(PWCHAR lpsubstring, WORD cblpsubstring, DWORD cTokens)
{
    CIndicatorSet *pisTmp        = NULL;    // accumulated result, returned to the caller
    CIndicatorSet *pisTmp2       = NULL;    // tokens for one matching word
    PWCHAR         pbDest        = NULL;    // sort key work buffer
    CDictionary   *pCombinedDict = NULL;    // pCombinedDict is used to record all the words resulting
                                            // from the stem match in all the dicts.

    __try
    {
        DWORD  dwConId,
               cWordCount;
        WORD   cWord = cblpsubstring,
               cTmp;
        PWCHAR pMatchingWord;
        BOOL   fFound = FALSE;

        pisTmp = CIndicatorSet::NewIndicatorSet(cTokens, FALSE);
        pbDest = (PWCHAR) VAlloc(FALSE, MaxSortKeyBytes(cblpsubstring));

        if (cblpsubstring == 0) __leave;

        // Create a combined dictionary
        pCombinedDict = CDictionary::NewDictionary(FALSE);

        // For each collection's dictionary, find the words that have the same stem as the passed in word and
        // enter the matching words into a new dictionary, pCombinedDict
        UINT cts = m_ptkc->TextSetCount();

        for (UINT j = 0; j < cts; j++)
        {
            // Ignore collections that are currently not active
            if (!m_ptkc->IsActive(j)) continue;

            CDictionary *pDict = m_ptkc->GetTextSet(j)->PDict();
            if (!pDict) continue;

            // EnterWord with the last param set to TRUE is only looking up, not entering a word.
            dwConId = pDict->EnterWord(lpsubstring, cblpsubstring, TRUE, TRUE);
            if (dwConId == EOL)     // This word doesn't exist in the current dictionary!
                continue;

            fFound = TRUE;          // We have matching words in at least one of the dictionaries!
            cWordCount = pDict->GetWordCountOfConcept(dwConId);
            ASSERT(cWordCount);

            for (UINT i = 0; i < cWordCount; i++)
            {
                if (i == 0)
                    pMatchingWord = pDict->GetFirstWordOfConcept(dwConId);
                else
                    pMatchingWord = pDict->GetNextWordOfConcept(dwConId);

                // Avoid the overhead of stemming. Enter each word as if it were a stop word. We are only REUSING
                // CDict as a data structure to hold our strings, so the semantics of the dictionary don't matter much
                pCombinedDict->EnterWord(pMatchingWord, wcslen(pMatchingWord), TRUE, FALSE);
            }
        }

        pCombinedDict->EndDictInsertions();

        if (fFound)
        {
            // Now, for each unique word in the pCombinedDict dictionary, find the variants from m_ptkc
            cWordCount = pCombinedDict->GetWordCountOfConcept(EOL);
            ASSERT(cWordCount);

            for (UINT i = 0; i < cWordCount; i++)
            {
                if (i == 0)
                    pMatchingWord = pCombinedDict->GetFirstWordOfConcept(EOL);
                else
                    pMatchingWord = pCombinedDict->GetNextWordOfConcept(EOL);

                cTmp = wcslen(pMatchingWord);

                // Reallocate the work buffer only if this word's sort key needs more than
                // twice the previous word's length; otherwise the existing buffer is reused.
                if (MaxSortKeyBytes(cTmp) > UINT(2*cWord))
                {
                    VFree(pbDest);
                    pbDest = (PWCHAR) VAlloc(FALSE, MaxSortKeyBytes(cTmp));
                }

                cWord = cTmp;
                LCSortKeyW(GetUserDefaultLCID(), 0, pMatchingWord, cWord, pbDest, MaxSortKeyBytes(cWord));
                pisTmp2 = m_ptkc->TokensContaining(pbDest, 1, 1);

                // The original tested pisTmp2 for NULL only after ORing it in;
                // test first so a NULL result is never passed to ORWith.
                if (pisTmp2)
                {
                    pisTmp->ORWith(pisTmp2);

                    delete pisTmp2;
                    pisTmp2 = NULL;
                }
            }
        }
        else
        {
            // No dictionary knows the word: fall back to the tokens found for
            // the word's own sort key.  Clear the pointer after deleting, so
            // that __finally cannot delete the same set a second time if one
            // of the calls below terminates abnormally.
            if (pisTmp)
            {
                delete pisTmp;
                pisTmp = NULL;
            }

            LCSortKeyW(GetUserDefaultLCID(), 0, lpsubstring, cblpsubstring, pbDest, MaxSortKeyBytes(cblpsubstring));
            pisTmp = m_ptkc->TokensContaining(pbDest, 1, 1);
        }
    }
    __finally
    {
        if (pCombinedDict) { delete pCombinedDict; pCombinedDict = NULL; }
        if (pbDest       ) { VFree(pbDest);        pbDest        = NULL; }
        if (pisTmp2      ) { delete pisTmp2;       pisTmp2       = NULL; }

        if (_abnormal_termination() && pisTmp) { delete pisTmp; pisTmp = NULL; }
    }

    return pisTmp;
}
|
|
|
|
// Returns the compressed set of visible words (NULL when none is cached).
CCompressedSet *CFragInfo::GetCSWordSet()
{
    return this->m_pcsVisibleWords;
}
|
|
|
|
// Returns a new indicator set built from the visible word set, or NULL
// when there is no visible word set.
CIndicatorSet *CFragInfo::GetWordSet()
{
    CCompressedSet *pcsWords = GetCSWordSet();

    if (!pcsWords)
    {
        return NULL;
    }

    return CIndicatorSet::NewIndicatorSet(pcsWords);
}
|
|
|
|
// Records an explicit word selection.  A NULL selection, or one whose
// selection count equals that of the active tokens, clears the stored
// selection instead.  The cached value is discarded either way.
void CFragInfo::SetSelection(CIndicatorSet *pisSelection)
{
    const BOOL fNoExplicitSelection =
        !pisSelection || pisSelection->SelectionCount() == m_ptkc->ActiveTokens()->SelectionCount();

    if (fNoExplicitSelection)
    {
        if (m_pcsSelectedWords)
        {
            DetachRef(m_pcsSelectedWords);
        }
    }
    else
    {
        ChangeRef(m_pcsSelectedWords, CCompressedSet::NewCompressedSet(pisSelection));
    }

    DiscardValue(m_rt);

    m_fEvaluated = FALSE;
}
|
|
|
|
|
|
// Returns the explicit selection when one is stored, otherwise the visible
// word set (which may itself be NULL).
CCompressedSet *CFragInfo::GetCSSelection()
{
    if (m_pcsSelectedWords)
    {
        return m_pcsSelectedWords;
    }

    return m_pcsVisibleWords;
}
|
|
|
|
// Returns a new indicator set built from the effective selection (see
// GetCSSelection), or NULL when there is none.
CIndicatorSet *CFragInfo::GetSelection()
{
    CCompressedSet *pcsEffective = GetCSSelection();

    if (!pcsEffective)
    {
        return NULL;
    }

    return CIndicatorSet::NewIndicatorSet(pcsEffective);
}
|