2020-09-30 17:12:29 +02:00

527 lines
21 KiB
C++

// Hilite.cpp -- Implementation of the text Hilite feature
//
#include "stdafx.h"
#include "hilite.h"
#include "ftslex.h"
///////////////////////////// Hiliting Functions ////////////////////////////
// Phase (1) Set up a new Hiliter
CHiliter* CHiliter::NewHiliter(HSEARCHER hSearcher) {
    // Factory: allocate a hiliter and attach it to the given searcher.
    // Returns NULL if allocation or initialization fails.
    CHiliter* pNew = NULL;
    __try { // New is our version of new with __FILE__ and __LINE__ added
        pNew = New CHiliter();
        pNew->InitHiliter(hSearcher);
    }
    __except (FilterFTExceptions(_exception_code())) {
        // construction failed part-way -- tear down the partial object
        if (pNew != NULL) {
            delete pNew;
            pNew = NULL;
        }
    }
    return pNew;
}
CHiliter::CHiliter() : CGlobals(Hiliter) {
    // Put every member into a known empty state. Real setup (registration
    // with the searcher) happens afterwards in InitHiliter().
    m_philNext = NULL;              // chain link so hiliters can form a list
    m_pSearcher = NULL;             // owning searcher, filled in later

    // incoming-text scanning state
    m_pbTextBuffer = NULL;
    m_cbCarryOver = 0;              // carry-over for tokens split across calls
    m_iCharSetCarryOver = 0;
    m_lcidCarryOver = 0;
    m_cbScanned = 0;                // bytes consumed in previous passes

    // token bookkeeping
    m_paTokenInfo = NULL;           // virtual buffer of TOKEN_INFO entries
    m_cTokenInfo = 0;
    m_pavr = NULL;                  // value references for incoming words

    // query state
    m_serialFind = -1;              // forces the first UpdateMask[s]() to run
    m_baseNext = 0;                 // serial-access hint for LocateBase()
    m_iTokenNext = 0;
    m_base = m_iTokenStart = 0;
    m_limit = m_iLimit = 0;
    m_cLit = m_cMax = 0;
    m_paHilites = NULL;

    // selection masks
    m_pMask = NULL;                 // OR-of-fragments word mask
    m_cFrags = 0;
    for (int i = 0; i < cFRAG_MAX; i++)
        m_apMasks[i] = NULL;        // per-fragment masks for phrase search
}
void CHiliter::InitHiliter(HSEARCHER hSearcher) {
    // Second-phase construction: link into the global object list and
    // register with the searcher the caller handed us.
    m_idProcess = ::GetCurrentProcessId();
    Link();     // chains this global object into a list so everything gets
                // deleted on exit; ~CGlobals() performs the matching UnLink()
    m_Signature = GLOBAL_SIGNATURE;       // mark the object as being valid
    m_pSearcher = (CGlobals*)hSearcher;   // the handle is really a pointer
    m_pSearcher->RegisterHiliter(this);   // let the searcher track us
}
CHiliter::~CHiliter() {
    // Release the virtual buffers and the value-reference array.
    if (m_pbTextBuffer)
        FreeVirtualBuffer(&m_vbTextBuffer);
    if (m_paTokenInfo)
        FreeVirtualBuffer(&m_vbTokenInfo);
    delete m_pavr;  // delete of NULL is a no-op
    // If the searcher still exists, tell it it has one less hiliter.
    if (CGlobals::ValidObject((CGlobals*)m_pSearcher, Searcher))
        m_pSearcher->UnRegisterHiliter(this);
    // Drop the AND/OR mask and any per-fragment phrase-search masks.
    if (m_pMask)
        DetachRef(m_pMask);
    for (int i = 0; i < cFRAG_MAX; i++)
        if (m_apMasks[i])
            DetachRef(m_apMasks[i]);
}
BOOL CHiliter::ValidObject(HHILITER hhil) {
    // TRUE when the handle refers to a live CHiliter in good standing.
    return CGlobals::ValidObject((CGlobals*)hhil, Hiliter);
}
//////////////////////////////// Phase (2) ////////////////////////////////////////
/*
The text is passed in sections by ScanDisplayText(). Each section is assumed to be
in Multi-byte code and has a locale associated with it. The locale is important
for tokenizing.
The first step is to break the text into tokens. We do not store the actual text
but store the following information about each token:
typedef struct {
int base; // offset of token (in bytes from start of text)
int limit; // offset of end of token
int type; // type (Punctuation=0, Numbers=1, Words=2)
int iSorted; // index into m_ppdSorted
} TOKEN_INFO;
We don't know how many tokens there are going to be, so we allocate a virtual
array to hold this information.
In order to find iToken, we need to convert the Multi-byte string to Unicode and
look this up in our hash table -- where we previously saved these indexes for
each token. If no entry in the hash table is found, we store -1.
*/
// Allocate the buffers that ScanDisplayText() needs: a text buffer for tokens
// split across calls, a virtual TOKEN_INFO array, and a CAValRef array for the
// token strings. On any failure everything allocated so far is released and
// the members stay NULL -- callers detect success via m_paTokenInfo != NULL.
void CHiliter::InitTokenInfo() { // do any initialization needed
ASSERT (!m_pbTextBuffer);
ASSERT (!m_pavr);
ClearDisplayText(); // initialize variables
// allocate our virtual memory buffers
// results are held in locals until ALL allocations succeed, so a failure
// part-way through cannot leave half-initialized members behind
BYTE* pbTextBuffer = NULL;
TOKEN_INFO* paTokenInfo = NULL;
CAValRef* pavr = NULL;
__try {
CreateVirtualBuffer(&m_vbTextBuffer, 0, 4096*4096); // buffer for incoming Hilite text
pbTextBuffer = (BYTE*)m_vbTextBuffer.Base; // (needed for words split on input)
CreateVirtualBuffer(&m_vbTokenInfo, 0, 4096*4096); // buffer to store token spreads
paTokenInfo = (TOKEN_INFO*)m_vbTokenInfo.Base;
pavr = CAValRef::NewValRef(MAX_TOKENS_HILITE); // allocate a CAValRef object
}
__except(FilterFTExceptions(_exception_code())) {
// free whatever DID get allocated; the non-NULL locals record that
if (pbTextBuffer) FreeVirtualBuffer(&m_vbTextBuffer);
if (paTokenInfo) FreeVirtualBuffer(&m_vbTokenInfo);
if (pavr) delete pavr;
return;
}
// commit to the members only on complete success
m_pbTextBuffer = pbTextBuffer;
m_paTokenInfo = paTokenInfo;
m_pavr = pavr;
}
// Phase (2) entry point: called repeatedly with successive sections of the
// display text (multi-byte, in the given charset/locale). A token that may be
// split across calls is held back in m_pbTextBuffer ("carry over") and is
// re-fed at the start of the next call, or flushed by FlushCarryOverText().
// Returns 0 on success, otherwise an ERRORCODE.
ERRORCODE CHiliter::ScanDisplayText(BYTE* pbText, int cbText, UINT iCharset, UINT lcid) {
// this can get called repeatedly
if (!cbText || !pbText) return 0; // if there was no text we are done now
__try {
// do any initialization needed
if (!m_paTokenInfo) InitTokenInfo();
ASSERT (m_pbTextBuffer);
ASSERT (m_pavr);
// add the tokens to our custom hash table
m_pHash = m_pSearcher->GetHiliterHashTable();
// tokenize the text and convert tokens to value references
// NOTE: we don't process the last token we are passed in case it is split
// we call this routine one more time to process the last token
// .. when we know that there is no more input
// we can't split tokenizing across a change in charset or locale so just process the carryover
if (m_cbCarryOver && (m_iCharSetCarryOver != iCharset || m_lcidCarryOver != lcid))
m_cbCarryOver = FlushCarryOverText();
// if we had a split token last time, we have to join the old and new text
if (m_cbCarryOver) { // first check that our buffer is large enough
// grow the committed region of the virtual buffer if the joined text
// (carry over + new section) would run past the current commit limit
if (PBYTE(m_pbTextBuffer + m_cbCarryOver + cbText) >= PBYTE(m_vbTextBuffer.CommitLimit)) {
PVOID pNewEnd = PVOID(m_pbTextBuffer+m_cbCarryOver+cbText+CB_COMMIT_HILITEBUF);
if (!ExtendVirtualBuffer(&m_vbTextBuffer, pNewEnd))
return OUT_OF_MEMORY;
}
MoveMemory(m_pbTextBuffer+m_cbCarryOver, pbText, cbText); // join buffers
pbText = m_pbTextBuffer; // switch the pointer
cbText += m_cbCarryOver; // adjust the length
}
// now scan the text we were passed
// AppendText returns the number of unconsumed bytes (the held-back token)
m_cbCarryOver = AppendText(pbText, cbText, FALSE, iCharset, lcid);
if (m_cbCarryOver) { // copy carry text to start of buffer
if (PBYTE(m_pbTextBuffer+m_cbCarryOver) >= PBYTE(m_vbTextBuffer.CommitLimit)) {
PVOID pNewEnd = PVOID(m_pbTextBuffer+m_cbCarryOver+CB_COMMIT_HILITEBUF);
if (!ExtendVirtualBuffer(&m_vbTextBuffer, pNewEnd))
return OUT_OF_MEMORY;
}
// stash the tail at the front of our buffer and remember its encoding
// so the next call (or the flush) can tokenize it correctly
MoveMemory(m_pbTextBuffer, pbText+cbText-m_cbCarryOver, m_cbCarryOver);
m_iCharSetCarryOver = iCharset;
m_lcidCarryOver = lcid;
}
}
__except(FilterFTExceptions(_exception_code())) {
return ErrorCodeForExceptions(_exception_code());
}
return 0;
}
int CHiliter::FlushCarryOverText() {
    // Push the held-back (possibly split) text through AppendText, telling
    // it this is the end of the article so the final token is processed too.
    // Returns the bytes still unconsumed (the new carry-over count).
    int cbLeft = AppendText(m_pbTextBuffer, m_cbCarryOver, TRUE,
                            m_iCharSetCarryOver, m_lcidCarryOver);
    return cbLeft;
}
// Feed the text to AppendSlave() in chunks that fit the unicode buffer.
// fArticleEnd==TRUE means no more input follows, so the final token can be
// processed instead of held back. Returns the number of bytes NOT consumed
// (the carry-over for the next call).
int CHiliter::AppendText(BYTE* pbText, int cbText, BOOL fArticleEnd, UINT iCharset, UINT lcid) {
// this code mimics the code in txdbase.cpp so that the handling of split
// words with punctuation (such as can't) works the same in both pieces of code
int cbScanned;
while (cbText) {
int cbChunk = min(cbText, MAX_HILITE_WORDS); // don't exceed our unicode buffer
int cbHeld = cbText - cbChunk; // bytes beyond this chunk still waiting
cbScanned = AppendSlave(pbText, cbChunk, fArticleEnd, iCharset, lcid);
if (cbScanned >= 0) { // continue passing partial buffers
pbText += cbScanned;
cbText -= cbScanned;
}
else { // reached end of passed buffer (AppendSlave negates the count)
pbText -= cbScanned; // cbScanned < 0, so this ADVANCES the pointer
cbText += cbScanned; // .. and this SHRINKS the remaining count
if (cbHeld==0) break; // .. so break out of the loop
}
}
return cbText;
}
// Tokenize one chunk of text with pWordBreakA and record the token spreads
// via CopySpreads(). Returns the number of bytes consumed; the value is
// NEGATED when the whole buffer was processed (nMore == 0) -- AppendText
// uses the sign as an end-of-buffer marker.
int CHiliter::AppendSlave(BYTE* pbText, int cbText, BOOL fArticleEnd, UINT iCharset, UINT lcid) {
CAbortSearch::CheckContinueState();
int nChar, nTokens;
int nMore = cbText;
PBYTE pbTextStart = pbText;
nTokens = pWordBreakA(iCharset, (char**)(&pbText), &nMore, // these variables get changed
(char**)m_paStart, (char**)m_paEnd, m_pbType, NULL, // no hash needed
MAX_TOKENS_HILITE, 0); // leave spaces in as part of punctuation
// if the tokenizer ran out of slots, or more of the article is coming,
// hold back the final token in case it is split across buffers
if (nTokens > 1 && (nMore || !fArticleEnd)) { // exhausted token space OR more article
if (nTokens > 2 && !(m_pbType[nTokens-1] & WORD_TYPE))
nTokens--; // break at word starts (punc not to span)
nChar = m_paStart[--nTokens] - pbTextStart; // reprocess last token
}
else nChar = cbText; // processed entire buffer
// now go and move the token spreads into our token info array
CopySpreads(nTokens, iCharset);
// .. and set the counts for the next pass
m_cbScanned += nChar;
if (!nMore) nChar = -nChar; // marks that text buffer is fully processed
return nChar; // amount still to be done
}
// Assimilate callback: store, for token number iValue, its index into
// ppdSorted. pvTag points at that index; pvEnvironment is the base of the
// TOKEN_INFO array being filled in.
void FindToken(UINT iValue, PVOID pvTag, PVOID pvEnvironment) {
    TOKEN_INFO* paInfo = (TOKEN_INFO*)pvEnvironment;
    paInfo[iValue].iSorted = *(int*)pvTag;
}
// Move the token spreads produced by pWordBreakA into m_paTokenInfo, convert
// each token to Unicode, and hand the Unicode tokens to the hash table so
// each TOKEN_INFO gets its sorted-word index filled in (via FindToken).
void CHiliter::CopySpreads(int nTokens, UINT iCharset) {
WCHAR* pwBuf = m_wTextBuf; // pointer to unicode buffer
BYTE* pOrigin = m_paStart[0] - m_cbScanned; // where we got to so far
if (m_pavr) m_pavr->DiscardRefs(); // discard all the value references
// process through the list of incoming words
for (int iToken=0; iToken<nTokens; iToken++) {
// move the token locations into our TOKEN_INFO array
BYTE* pStart = m_paStart[iToken];
BYTE* pEnd = m_paEnd[iToken];
// convert to unicode + move to unicode buffer
UINT cw;
for (;;)
{
cw = MultiByteToWideChar(
iCharset, // code page
0, // character-type options
(char*)pStart, // address of string to map
pEnd - pStart, // number of characters in string
pwBuf, // address of wide-character buffer
MAX_HILITE_WORDS); // size of buffer in words
// cw == 0 means conversion failed (empty tokens are let through);
// retry once with the ANSI charset as a fallback
if (cw || pEnd == pStart) break;
iCharset= ANSI_CHARSET; // Force to ANSI if the previous charset
// didn't work.
}
if (m_pbType[iToken]==0) { // for punctuation remove blanks ..
int cBase; // number of leading white space chars.
int cLimit; // number of trailing white space chars.
int cwOld = cw; // needed to compute cEnd
cw = ::RemoveWhiteSpace(pwBuf, cw, cBase, cLimit);
// now adjust the base and limit if necessary to remove white space
// NOTE(review): cBase/cLimit/cwOld count WIDE characters while
// CharNextMult steps over multi-byte characters; for DBCS text these
// counts may disagree -- confirm this is the intended behavior.
if (cBase) pStart = (BYTE*)CharNextMult(iCharset, (char*)pStart, cBase);
if (cLimit) pEnd = (BYTE*)CharNextMult(iCharset, (char*)pStart, cwOld - cBase - cLimit);
}
int iTokenInfo = m_cTokenInfo + iToken; // slot for latest results
// offsets are stored relative to the start of the whole article, not
// just this chunk -- hence the pOrigin adjustment above
__try {
m_paTokenInfo[iTokenInfo].base = pStart - pOrigin;
m_paTokenInfo[iTokenInfo].limit = pEnd - pOrigin;
m_paTokenInfo[iTokenInfo].type = m_pbType[iToken];
m_paTokenInfo[iTokenInfo].iSorted = -1; // default -- means not in our token list
}
__except(VirtualBufferExceptionFilter(GetExceptionCode(), GetExceptionInformation(), &m_vbTokenInfo)) {
// the filter extends the virtual buffer's commit; if even that failed
// we are genuinely out of memory
RaiseException(STATUS_NO_MEMORY, EXCEPTION_NONCONTINUABLE, 0, NULL);
}
// add to pavr
m_pavr->AddValRef(pwBuf, cw * sizeof(WCHAR)); // returns index -- but we already know that
pwBuf += cw; // slot for next Unicode token
}
// get our token info table set up
m_pHash->Assimilate(m_pavr, &m_paTokenInfo[m_cTokenInfo], FindToken, NULL);
m_cTokenInfo += nTokens; // ready for next batch
}
ERRORCODE CHiliter::ClearDisplayText() {
    // Forget all scanned text and reset the carry-over state; the next
    // ScanDisplayText() starts a fresh article. Always succeeds.
    m_cTokenInfo = 0;           // no tokens recorded
    m_cbScanned = 0;            // no bytes consumed yet
    m_cbCarryOver = 0;          // no split token pending
    m_iCharSetCarryOver = 0;
    m_lcidCarryOver = 0;
    return 0;
}
//////////////////////////////// Phase (3) //////////////////////////////////
/*
Analysing the search parameters
When the user asks for a count or for actual hilites, we must go and find the
current state of the search parameter to find out which of the words in our
word list are selected by those criteria. This information is condensed into
an indicator set, m_pMask, which is ordered the same as m_ppdSorted.
We scan our array of TOKEN_INFO structures and use each index to access our
indicator set. On the first pass we just count the number of hits. Then we
allocate space to save the actual information which we put in on the second pass.
For each hilite we need to save the following:
typedef struct {
int base;
int limit;
} HILITE;
Every time there is a change in the search parameters, CFind must increment its
copy of m_serialFind. This way we can tell if we need to recompute.
When we get a QueryHiliter() call, we first check m_iTokenNext to see if the
caller is moving sequentially through the list. If this does not match, we do
a binary search.
*/
// Recompute m_pMask: the OR of the indicator sets of all fragments, i.e. the
// set of sorted words currently selected by the search parameters. Skipped
// entirely when the find serial number shows nothing has changed.
void CHiliter::UpdateMask() {
CFind* pFind = m_pSearcher->m_pFind;
if (m_serialFind==pFind->GetSerial()) return;
// we don't have the indicator set -- go compute it
int cFrag = pFind->GetFragmentCount(); // number of fragments
if (m_pMask) DetachRef(m_pMask); // remove previous mask
m_pMask = NULL; // initial empty set
for (int iFrag=0; iFrag<cFrag; iFrag++) { // look at all the fragments
CFragInfo* pFrag = pFind->GetFragment(iFrag); // get current fragment
CIndicatorSet* pMask = pFrag->GetSelection(); // get new indicator set
if (pMask) { // NULL implies "all"
if (!m_pMask) AttachRef(m_pMask, CIndicatorSet::NewIndicatorSet(pMask));
// first time thru we create a mask
else m_pMask->ORWith(pMask); // or the masks!
// NOTE(review): the set returned by GetSelection() is deleted here,
// whereas UpdateMasks() AttachRef's the same kind of set -- confirm
// GetSelection()'s ownership convention permits both uses.
delete pMask;
}
}
m_serialFind = pFind->GetSerial(); // avoid unwanted recomputes
}
BOOL CHiliter::PhraseSearch() {
    // TRUE when the current search is a phrase search, i.e. the fragments
    // carry token references rather than word references.
    CFragInfo* pFirstFrag = m_pSearcher->m_pFind->GetFragment(0);
    return (pFirstFrag->GetRefType() == TokenRefs);
}
int CHiliter::CountHilites(int base, int limit) {
    // Counting pass: run the full query without storing results so the
    // caller can size a HILITE buffer before calling QueryHilites() proper.
    int cHits = QueryHilites(base, limit, COUNTING, NULL);
    return cHits;
}
// Report the hilites falling in the byte range [base, limit) of the display
// text. cMax/paHilites describe the output buffer; cMax == COUNTING means
// just count (paHilites may be NULL). limit == -1 means "to the end".
// Returns the number of hilites copied (or counted), or an error code.
int CHiliter::QueryHilites(int base, int limit, int cMax, HILITE* paHilites) {
if (!m_paTokenInfo) return 0; // no text -- no hilites
if (m_cbCarryOver) m_cbCarryOver = FlushCarryOverText();
__try {
if (base<0) base = 0;
int iEnd = m_cTokenInfo? m_paTokenInfo[m_cTokenInfo-1].limit
: 0;
if (limit==-1 || limit>=iEnd) limit = iEnd;
if (base>=limit) return 0;
m_base = base;
m_limit = limit; // we need these globals
m_cMax = cMax; // .. inside called functions
m_paHilites = paHilites;
m_cLit = 0;
int iToken = LocateBase(base); // convert offset to indexes
m_iLimit = LocateLimit(limit);
if (PhraseSearch()) {
UpdateMasks();
if (!m_apMasks || !m_apMasks[0]) return 0; // none is none
// we need to consider phrases that start outside the range
int iSlop = 2*(m_cFrags-1); // max we can be off
iToken = max(0, iToken-iSlop);
// .. and end outside the range
m_iLimit = min(m_cTokenInfo, m_iLimit+iSlop);
// loop thru our hiliter tokens and copy the valid ones
while (iToken<m_iLimit && m_cLit<m_cMax) {
// loop can end because we got the count we want
CheckNextToken(0, m_iTokenStart = iToken);
iToken++;
}
}
else {
UpdateMask();
if (!m_pMask) return 0; // return "all" == NONE!
while (iToken<m_iLimit && m_cLit<m_cMax) {
// loop can end because we got the count we want
CheckToken(iToken);
iToken++;
}
}
m_iTokenNext = iToken;
// BUGFIX: when the loop consumes the last token, iToken == m_cTokenInfo
// and m_paTokenInfo[m_iTokenNext] is one past the last valid entry, so
// reading it was an out-of-bounds access. Use -1 in that case: base is
// clamped to >= 0 above, so -1 can never match and the serial-access
// fast path in LocateBase() simply falls through to the binary search.
m_baseNext = (m_iTokenNext < m_cTokenInfo)
? m_paTokenInfo[m_iTokenNext].base : -1;
}
__except(FilterFTExceptions(_exception_code())) {
return ErrorCodeForExceptions(_exception_code());
}
return m_cLit;
}
void CHiliter::CheckToken(int iToken) {
    // If this token is a word selected by the current mask, record a hilite
    // (or, when counting, just bump the count).
    int iSorted = m_paTokenInfo[iToken].iSorted;
    if (iSorted == -1) return;                            // not a word in our text sets
    if (!m_pMask || !m_pMask->IsBitSet(iSorted)) return;  // word not selected
    if (m_cMax != COUNTING) {
        // clip the hilite to the caller's requested [m_base, m_limit) range
        m_paHilites[m_cLit].base  = max(m_base, m_paTokenInfo[iToken].base);
        m_paHilites[m_cLit].limit = min(m_limit, m_paTokenInfo[iToken].limit);
    }
    m_cLit++;
}
int CHiliter::LocateBase(int base) {
    // Map a byte offset onto the index of the token span containing it.
    if (base == 0) return 0;                      // easy case: start of text
    if (base == m_baseNext) return m_iTokenNext;  // fast path: serial access
    return LocateOffset(base);                    // hard case: binary search
}
int CHiliter::LocateLimit(int limit) {
    // Map a byte offset onto the (exclusive) token index that ends the range.
    // A token only partly inside the range is still included.
    if (limit >= m_paTokenInfo[m_cTokenInfo-1].limit)
        return m_cTokenInfo;            // range reaches (past) the last token
    return LocateOffset(limit) + 1;     // +1 includes the containing token
}
int CHiliter::LocateOffset(int offset) {
    // Binary search for the index of the token span containing this offset.
    // NOTE: assumes spans are stored in ascending order; may fail to
    // terminate if they are not.
    int lo = 0;
    int hi = m_cTokenInfo - 1;
    int mid = (lo + hi) / 2;            // start in the middle
    while (lo < hi) {
        if (offset < m_paTokenInfo[mid].base) {
            // offset lies before this span -- continue in the lower half
            hi = mid;
            mid = (lo + mid) / 2;
        }
        else if (offset < m_paTokenInfo[mid].limit) {
            break;                      // containing span found
        }
        else {
            // offset lies after this span -- continue in the upper half
            lo = mid;
            mid = (mid + hi + 1) / 2;
        }
    }
    return mid;
}
/////////////////////////////// Phase 4 ///////////////////////////////////////////
/* This is special code to handle phrase searching. Instead of ORing all our fragment
masks we simply store them in an array. A typical mask should be 5-10K so storing 20 of
them would mean allocating at most 200K.
We proceed by walking our display text as before. When we get to a word we ask does
this word occur in fragment[0]. If it does, we move to the next word. The second word
may be punctuation which generates two cases (a) this is our next word (check fragment[1])
or (b) this is punctuation to be ignored. Since we count space as a valid token, this
second alternative will be relatively common.
Suppose that the number of fragments is nFrag. Imagine a binary tree of depth nFrag.
All the left branches are where we have case (a) -- our next word. The right branches are
case (b) -- where we skip a punctuation token. We do a tree sweep. If we ever reach a
terminal node we have a hit. We then add an element to our array and store the base of
the first word and the limit of the last word.
Note that we may have several hits in a single tree and that hits may overlap. We
return to the caller all the overlapping hits. This would be a good form if he wants to
display them sequentially. If he wants to show them all at once he will have to write
code to combine ones that overlap.
*/
void CHiliter::UpdateMasks() {
// make sure we have the current array of indicator sets for each fragment
CFind* pFind = m_pSearcher->m_pFind;
if (m_serialFind==pFind->GetSerial()) return;
// we don't have the indicator sets -- go fetch them
for (int iFrag=0; iFrag<m_cFrags; iFrag++) { // remove previous masks
if (m_apMasks[iFrag]) DetachRef(m_apMasks[iFrag]);
m_apMasks[iFrag] = NULL; // empty sets
}
m_cFrags = pFind->GetFragmentCount(); // number of fragments now
if (m_cFrags>cFRAG_MAX) m_cFrags = cFRAG_MAX; // upper limit
for (iFrag=0; iFrag<m_cFrags; iFrag++) {
CFragInfo* pFrag = pFind->GetFragment(iFrag); // get current fragment
CIndicatorSet* pMask = pFrag->GetSelection();
if (pMask) AttachRef(m_apMasks[iFrag], pMask); // get new indicator set
}
m_serialFind = pFind->GetSerial(); // avoid unwanted recomputes
}
// Recursive tree walk for phrase matching (see the phase 4 comment above).
// depth is the fragment index we are trying to match at this node; the token
// at the root of the tree is m_iTokenStart. Reaching depth == m_cFrags-1
// with every fragment matched means a complete phrase: record the hilite.
// At each inner node we can go two ways: match the next token directly, or
// skip one punctuation token first.
void CHiliter::CheckNextToken(int depth, int iToken) {
if (m_cLit>=m_cMax) return; // already produced all the caller wants
int iSorted = m_paTokenInfo[iToken].iSorted;
if (iSorted==-1) return; // not a word in our text sets
if (m_apMasks[depth] && !m_apMasks[depth]->IsBitSet(iSorted)) return;
// [.. he selected words in this fragment but not this one]
if (depth>=m_cFrags-1) { // ==== we got a hit ==========
int base = m_paTokenInfo[m_iTokenStart].base;
int limit = m_paTokenInfo[iToken].limit;
// BUGFIX: was 'base<m_limit || limit>m_base', which is a tautology
// (base <= limit and m_base < m_limit make one disjunct always true),
// so phrases lying entirely outside [m_base, m_limit) were also
// reported, producing degenerate clamped hilites. The interval-overlap
// test requires AND.
if (base<m_limit && limit>m_base) {
if (m_cMax!=COUNTING) { // we were called by QueryHilites
m_paHilites[m_cLit].base = max(m_base, base);
// the span starts at the start of the token at the top of the tree
m_paHilites[m_cLit].limit = min(m_limit, limit);
// .. and ends at the end of the token at the bottom of the tree
}
m_cLit++; // count one more for the gipper
}
}
else {
if (++iToken>=m_iLimit) return;
// we survived ferocious pruning so far so let us press deeper
depth++;
CheckNextToken(depth, iToken); // any sort of token -- just delve
// notice we can go two different ways at any node
if (m_paTokenInfo[iToken].type==0) { // [0==punctuation]
if (++iToken>=m_iLimit) return; // punctuation -- skip before delving
CheckNextToken(depth, iToken); // delve on MacDuff
}
}
}