#include "stdafx.h"
#include <math.h>
#include "vmbuffer.h"
#include "memex.h"
#include "saveload.h"
#include "TXDBase.h"
#include "bytemaps.h"
#include "textset.h"
#include "dict.h"
#include "vector.h"
#include "query.h"

// This file contains the definition of class CQuery

// Constructors
/*************************************************************************
 *  FUNCTION     :
 *  RETURNS      :
 *  PURPOSE      :
 *  PARAMETERS   :
 *  SIDE EFFECTS :
 *  DESCRIPTION  :
 *  HISTORY      :
 *      Author          Date        Action
 *      ------          ----        ------
 *      KrishnaN        4/23/94     Creation.
 *************************************************************************/

CQuery::CQuery()
{
    m_cConWts = 0;
#if 0
    m_cOverFlows = 0;
#endif
    m_pszQueryText = NULL;
}

// Destructor
/*************************************************************************
 *  FUNCTION     :
 *  RETURNS      :
 *  PURPOSE      :
 *  PARAMETERS   :
 *  SIDE EFFECTS :
 *  DESCRIPTION  :
 *  HISTORY      :
 *      Author          Date        Action
 *      ------          ----        ------
 *      KrishnaN        4/23/94     Creation.
 *************************************************************************/

CQuery::~CQuery()
{
    if (m_vbVectorConcept.Base)
        FreeVirtualBuffer(&m_vbVectorConcept);

    if (m_vbVectorTermFreq.Base)
        FreeVirtualBuffer(&m_vbVectorTermFreq);

    if (m_vbVectorWt.Base)
        FreeVirtualBuffer(&m_vbVectorWt);
#if 0
    if (m_vbTFOverFlow.Base)
        FreeVirtualBuffer(&m_vbTFOverFlow);
#endif
}

CQuery *CQuery::NewQuery(CTextSet *pts)
{
    CQuery *pQuery = NULL;

    __try
    {
        pQuery = New CQuery;

        pQuery->Initialize(pts, 100, 100000);
    }
    __finally
    {
        if (_abnormal_termination() && pQuery)
        {
            delete pQuery; pQuery = NULL;
        }
    }

    return pQuery;
}

// Access Functions:
/*************************************************************************
 *  FUNCTION     :
 *  RETURNS      :
 *  PURPOSE      :
 *  PARAMETERS   :
 *  SIDE EFFECTS :
 *  DESCRIPTION  :
 *  HISTORY      :
 *      Author          Date        Action
 *      ------          ----        ------
 *      KrishnaN        4/23/94     Creation.
 *************************************************************************/

void CQuery::Initialize(CTextSet *textsetIn, DWORD cInEstConWtPairs, DWORD cInMaxConWtPairs)
{
    ASSERT(textsetIn != NULL);

    m_ptdb = textsetIn;

    m_cDocuments = m_ptdb->PColl()->m_cDocuments;
    ASSERT(m_cDocuments != 0);

    m_vbConcepts = m_ptdb->PColl()->m_vbConcepts;
    ASSERT(m_vbConcepts.Base);

    m_aWtInvIndex = m_ptdb->PColl()->m_aWtInvIndex;
    ASSERT(m_aWtInvIndex);

    CreateVirtualBuffer(&m_vbVectorConcept , cInEstConWtPairs * sizeof(DWORD), cInMaxConWtPairs * sizeof(DWORD));
    CreateVirtualBuffer(&m_vbVectorTermFreq, cInEstConWtPairs * sizeof(DWORD), cInMaxConWtPairs * sizeof(DWORD));
    CreateVirtualBuffer(&m_vbVectorWt      , 0                               , cInMaxConWtPairs * sizeof(float));
#if 0
    CreateVirtualBuffer(&m_vbTFOverFlow    , 0                               , 0x4000 * sizeof(TFOverFlowStruct));
#endif

    // Initialize allocated memory:
    // VirtualAlloc zeroes all memory it commits, so we don't have to worry about zeroing the virtual buffers.
}

/*************************************************************************
 *  FUNCTION     :
 *  RETURNS      :
 *  PURPOSE      :
 *  PARAMETERS   :
 *  SIDE EFFECTS :
 *  DESCRIPTION  :
 *  HISTORY      :
 *      Author          Date        Action
 *      ------          ----        ------
 *      KrishnaN        4/23/94     Creation.
 *************************************************************************/

void CQuery::RecordConcept(DWORD ConceptId)
{
    // Search for this concept id in the current document. If you find it,
    // simply increment its frequency and that will take care of everything.
    // If you don't find it, then enter the concept for the document.

    DWORD i;    // index of the con,wt pair being considered for a match

    for (i = 0; i < m_cConWts && Concept(i) != ConceptId; i++);

    if (i == m_cConWts)
    {
        // This concept doesn't exist in the query. Record it.
        __try
        {
            Concept(m_cConWts) = ConceptId;
        }
        __except (VirtualBufferExceptionFilter(GetExceptionCode(), GetExceptionInformation(), &m_vbVectorConcept))
        {
            RaiseException(STATUS_NO_MEMORY, EXCEPTION_NONCONTINUABLE, 0, NULL);
        }

        __try
        {
            TermFreq(m_cConWts) = 1;    // this is the first time this concept occurred for this document
        }
        __except (VirtualBufferExceptionFilter(GetExceptionCode(), GetExceptionInformation(), &m_vbVectorTermFreq))
        {
            RaiseException(STATUS_NO_MEMORY, EXCEPTION_NONCONTINUABLE, 0, NULL);
        }

        m_cConWts++;
    }
    else
    {
        // Term already exists in this document. Increase the occurrence frequency.
        // Since the term already exists in the document, it has a frequency of at least 1.
#if 0
        // The only time the value can be 0 is when the frequency has exceeded 0xFFFF. In
        // that case, the overflowing value is stored in the overflow area.
        if (TermFreq(i) == 0)
        {
            // go to the overflow area and update the value that tracks this term frequency
        }
        else
#endif
        if (TermFreq(i) == 0xFFFF)
        {
            // we reached the upper bound on this value.
        }
        else    // normal case. No overflow is involved. This is what happens MOST of the time.
            (TermFreq(i))++;
    }
}

/*************************************************************************
 *  FUNCTION     :
 *  RETURNS      :
 *  PURPOSE      :
 *  PARAMETERS   :
 *  SIDE EFFECTS :
 *  DESCRIPTION  :
 *  HISTORY      :
 *      Author          Date        Action
 *      ------          ----        ------
 *      KrishnaN        4/23/94     Creation.
 *************************************************************************/

// ASSUMPTION : We are only weighting one query vector. This will hold true all the time.

BOOL CQuery::WeightVector(BYTE TFModType, BYTE WeightType, BYTE NormType)
{
    DWORD i;

    // Copy the term frequencies into an array of floating point values. All operations will be computed
    // on these floating point weights. The final results can then be converted back to fixed point.
    // IMPORTANT : ALL WEIGHTS SHOULD BE NORMALIZED TO ENSURE THAT EACH WEIGHT IS LESS THAN ONE.
    // THE FIXED POINT VALUE ONLY REPRESENTS VALUES BETWEEN 0.0 AND 1.0.
    for (i = 0; i < m_cConWts; i++)
    {
        __try
        {
            TermWt(i) = (float)GetRealTermFreq(i);
        }
        __except (VirtualBufferExceptionFilter(GetExceptionCode(), GetExceptionInformation(), &m_vbVectorWt))
        {
            RaiseException(STATUS_NO_MEMORY, EXCEPTION_NONCONTINUABLE, 0, NULL);
        }
    }

    ApplyWeightingScheme(TFModType, WeightType, NormType, 0, m_cConWts);

    // Plug the weighted values back into the term frequency array.
    // ASSUMPTION : Each weight in TermWt is between 0.0 and 1.0.
    // Multiplying this with WT_ONE forces each TermWt weight to be a
    // fixed point number ranging between 0 and WT_ONE.
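    // Illustration (WT_ONE's actual value is defined elsewhere; assume for the sake of
    // this example that it is on the order of 2^16): a normalized weight of 0.25 would
    // be stored as roughly 0.25 * WT_ONE, i.e. about one quarter of the fixed-point range.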
    for (i = 0; i < m_cConWts; i++)
        TermFreq(i) = (WORD)((double)TermWt(i) * (double)WT_ONE);

    return TRUE;
}

/*************************************************************************
 *  FUNCTION     :
 *  RETURNS      :
 *  PURPOSE      :
 *  PARAMETERS   :
 *  SIDE EFFECTS :
 *  DESCRIPTION  :
 *  HISTORY      :
 *      Author          Date        Action
 *      ------          ----        ------
 *      KrishnaN        4/23/94     Creation.
 *************************************************************************/

void CQuery::ApplyWeightingScheme(BYTE TFModType, BYTE WeightType, BYTE NormType, DWORD iFirstConWt, DWORD cConWts)
{
    register DWORD i;
    double Wt;      // used to hold different types of cumulative values at various points in the computations

    // First modify the weight based on the term frequency component
    switch (TFModType)
    {
    case NEWTF_NONE:        // do nothing
        break;

    case NEWTF_BINARY:      // Since all the terms are in, turn them on
        for (i = 0; i < cConWts; i++)
            TermWt(i) = (float)1.0;
        break;

    case NEWTF_MAXNORM:
        Wt = 0.0;
        for (i = 0; i < cConWts; i++)
            if (TermWt(i) > Wt)
                Wt = TermWt(i);

        // increase Max by 0.00001 to place all normalized TFs between 0.0 and 1.0
        Wt += 0.00001;

        for (i = 0; i < cConWts; i++)
            TermWt(i) = (float) ((double)TermWt(i) / Wt);
        break;

    case NEWTF_AUGNORM:
        Wt = 0.0;
        for (i = 0; i < cConWts; i++)
            if (TermWt(i) > Wt)
                Wt = TermWt(i);

        // increase Max by 0.00001 to place all normalized TFs between 0.0 and 1.0
        Wt += 0.00001;

        for (i = 0; i < cConWts; i++)
            TermWt(i) = (float) (0.5 + 0.5 * (double)TermWt(i) / Wt);
        break;
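        // Worked example (illustrative raw frequencies, not from the original comments):
        // with term frequencies {1, 4, 2} the maximum is 4, so NEWTF_MAXNORM produces
        // roughly {0.25, 1.0, 0.5}, while NEWTF_AUGNORM (0.5 + 0.5*tf/max) produces
        // roughly {0.625, 1.0, 0.75}; both keep every weight in (0.0, 1.0].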

    default:
        // Assertion failure.
        break;
    }

    // Then modify the weight based on the collection frequency component
    switch (WeightType)
    {
    case WT_NONE:       // do nothing
        break;

    // if a concept occurs in all docs, assign it a small value instead of 0.0
    case WT_TFIDF:
        for (i = 0; i < cConWts; i++)
            if (m_cDocuments == DocFromCumFreq(Concept(i + iFirstConWt)))
                TermWt(i) = (float) 0.005;
            else
                TermWt(i) = (float) ((double)TermWt(i) * log((double)m_cDocuments / (double)DocFromCumFreq(Concept(i + iFirstConWt))));
        break;
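        // For example (illustrative numbers): with m_cDocuments = 1000 and a concept
        // that appears in 10 documents, the multiplier is log(1000/10) = log(100), about
        // 4.6 with the natural log used here, so rare concepts are boosted relative to
        // concepts that occur in most documents.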

    case WT_PROB:
        for (i = 0; i < cConWts; i++)
            if (m_cDocuments == DocFromCumFreq(Concept(i + iFirstConWt)))
                TermWt(i) = (float) 0.005;
            else
                TermWt(i) = (float) ((double)TermWt(i) * log((double)(m_cDocuments - DocFromCumFreq(Concept(i + iFirstConWt))) / (double)DocFromCumFreq(Concept(i + iFirstConWt))));
        break;

    default:
        // ASSERTion failure.
        break;
    }

    switch (NormType)
    {
    case NORM_NONE:
        break;

    case NORM_SUM:
        Wt = 0.0;
        for (i = 0; i < cConWts; i++)
            Wt += (double) TermWt(i);

        for (i = 0; i < cConWts; i++)
            TermWt(i) = (float) ((double)TermWt(i) / Wt);
        break;

    case NORM_COSINE:
        Wt = 0.0;
        // compute the sum of squares of the weights in the vector
        for (i = 0; i < cConWts; i++)
            Wt += TermWt(i) * TermWt(i);

        Wt = sqrt(Wt);
        // normalize each weight by the square root of the sum of squares computed above
        for (i = 0; i < cConWts; i++)
            TermWt(i) = (float) ((double)TermWt(i) / Wt);
        break;
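        // In other words (illustrative): each weight becomes w_i / sqrt(w_1^2 + ... + w_n^2),
        // so the query vector ends up with unit Euclidean length; e.g. weights {3.0, 4.0}
        // normalize to {0.6, 0.8}.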

    case NORM_MAX:
        break;
    }
}

/*************************************************************************
 *  FUNCTION     :
 *  RETURNS      :
 *  PURPOSE      :
 *  PARAMETERS   :
 *  SIDE EFFECTS :
 *  DESCRIPTION  :
 *  HISTORY      :
 *      Author          Date        Action
 *      ------          ----        ------
 *      KrishnaN        4/23/94     Creation.
 *************************************************************************/

BOOL CQuery::RankDocuments(SimStruct *aInSimilarity, DWORD cInHits)
{
    register DWORD i, j;
    DWORD ConceptId, DocId;
    DWORD cDocs;
    DWORD DocPos;       // tracks the position of a document in the similarity structure
    DWORD startDocPos;

    if (cInHits == 0)
    {
        SetLastError(QUERYERROR_NOHITS);
        return FALSE;
    }

    if (aInSimilarity == NULL)
    {
        SetLastError(QUERYERROR_EMPTYSIMARRAY);
        return FALSE;
    }

    // ASSUME THAT THE SIMILARITY STRUCTURE ARRAY HAS ENOUGH ENTRIES TO SUPPORT cInHits

    // Zero out any existing similarity values
    for (i = 0; i < cInHits; i++)
        aInSimilarity[i].Similarity = 0;

    // Compute similarity. Walk the doc,wt list for each concept
    for (i = 0; i < m_cConWts; i++)
    {
        // Ignore concepts that have a zero weight. Later, we may want to extend this idea
        // to suppress weights below a small value.
        if (TermFreq(i) == 0)
            continue;

        ConceptId = Concept(i);
        cDocs = DocFromCumFreq(ConceptId);

        // Consider each doc in the (Doc, Wt) list for this concept and score docs that
        // are in the predetermined hit list.
        startDocPos = DocList(ConceptId);   // get the starting point of the inverted list
        for (j = 0; j < cDocs; j++)
        {
            if (j == 0)
                DocId = m_ptdb->PColl()->GetDocumentGap(&startDocPos) - 1;
            else
                DocId += m_ptdb->PColl()->GetDocumentGap(&startDocPos);
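            // For example (illustrative gaps): if GetDocumentGap returns 3, 2, 5 for this
            // concept, the decoded DocIds are 2 (= 3 - 1), 4 and 9, since the first entry
            // is stored as DocId + 1 and later entries are stored as gaps from the
            // previous DocId.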

            DocPos = GetDocPosInList(aInSimilarity, cInHits, DocId);

            if (DocPos != DOESNOTEXIST)
                aInSimilarity[DocPos].Similarity += TermFreq(i) * WtFromInvList(ConceptId, j);

            /* IF WE LIMIT SIMILARITY TO 24 BITS, USE THE FOLLOWING LINE. IF WE LIMIT TO ANY OTHER NUMBER
               OF BITS n, n < 32, RIGHT SHIFT THE RHS BY 32 - n.
            aInSimilarity[DocPos].Similarity += (TermFreq(i) * WtFromInvList(ConceptId, j)) >> 8;
            */
        }
    }

    /* MOVE SORTING TO THE CALLER!
    // sort the scored documents.
    qsort(aInSimilarity, cInHits, sizeof(SimStruct), CompareSimStruct);
    */

    // return the number of hits
    return cInHits;
}

/*************************************************************************
 *  FUNCTION     :
 *  RETURNS      :
 *  PURPOSE      :
 *  PARAMETERS   :
 *  SIDE EFFECTS :
 *  DESCRIPTION  :
 *  HISTORY      :
 *      Author          Date        Action
 *      ------          ----        ------
 *      KrishnaN        4/23/94     Creation.
 *************************************************************************/

// cInHits is the number of previous hits. cInMaxHits is the maximum number of documents to retrieve.
DWORD CQuery::RetrieveWithFeedback( SimStruct *aInSimilarity, DWORD cInMaxHits,
                                    PWCHAR pwRelDocText, int cwRelDocText,
                                    PWCHAR pwNonRelDocText, int cwNonRelDocText
                                  )
{
    DWORD i, j, k;
    DWORD DocId;
    DWORD cHits = 0;

    DWORD cDocs, DocPos;
    DWORD LastDocInPos = 0;         // Document position which has the least partial similarity match
    DWORD UBCurrentDoc;             // Upper bound of the current document
    DWORD CCurrentDoc;              // C (CurrentDoc)
    DWORD CFirstDocOut = 0;         // C (FirstDocOut)
    DWORD UBFirstDocOut;            // Upper bound of the first document outside the RSet
    LPDWORD aQTermSummation = NULL; // summation of query terms

    DWORD startDocPos;

    ASSERT(aInSimilarity);
    ASSERT(pwRelDocText && cwRelDocText);

    __try
    {
        // Add terms from the query. We will either have to reindex the initial query
        // or store the term frequencies the first time they were computed.
        // Assume that each term only occurs once in the query.
        // This assumption usually holds good for queries typed in by the user.
        // It doesn't matter much even if it doesn't hold good because the document
        // text overwhelms the original query.
        for (i = 0; i < m_cConWts; i++)     // Enforce the above assumption.
            TermFreq(i) = 1;

        // Add terms from the relevant documents to the query
        IndexDocumentText(pwRelDocText, cwRelDocText, TRUE);

        // For the non-relevant document text, decrease the term frequencies of concepts it has
        // in common with the newly formed query.
        // NOTE : The caller should pass in the document text of only the highest ranked
        // non-relevant document to get the best results (Dec-Hi relevance feedback method).
        if (pwNonRelDocText && cwNonRelDocText)
        {
            IndexDocumentText(pwNonRelDocText, cwNonRelDocText, FALSE);

            // At this point, we may have some zero weighted concepts in the
            // query. Remove any such concept, weight pairs.
            for (i = j = 0; i < m_cConWts;)
            {
                // search for the next zero weighted concept
                for (; j < m_cConWts && TermFreq(j) > 0; j++);
                i = j;      // update i so that the outer loop terminates appropriately
                if (j < m_cConWts)          // we found a zero weighted concept
                {
                    // search for the next non-zero weighted concept
                    for (k = j + 1; k < m_cConWts && TermFreq(k) == 0; k++);
                    if (k < m_cConWts)      // we found a non-zero weighted concept
                    {
                        // copy the con,wt pair
                        Concept(j) = Concept(k);
                        TermFreq(j) = TermFreq(k);
                        // erase the copied pair
                        TermFreq(k) = 0;
                        j++;                // update j so that the for loop advances
                    }
                    else                    // no more non-zero weighted concepts. we are done.
                        i = k;
                }
            }

            ASSERT(i <= m_cConWts);
            // Count the new number of ConWt pairs
            for (m_cConWts = 0; TermFreq(m_cConWts) > 0; m_cConWts++);
        }

        if (!m_cConWts) __leave;

        // Now weight the query vector
        WeightVector(NEWTF_NONE, WT_TFIDF, NORM_COSINE);

        SortQuery();

        aQTermSummation = (LPDWORD)VAlloc(FALSE, m_cConWts * sizeof(DWORD));

        // Compute the summation of query terms. This summation will be used to compute the upper bound.
        // aQTermSummation[j] gives the sum of weights of query terms j+1 to the last term in the query.
        // The << 16 left shift accounts for multiplication by a term frequency of 1, i.e. it makes each weight a true 32-bit value.
        for (aQTermSummation[m_cConWts - 1] = TermFreq(m_cConWts - 1) << 16, i = m_cConWts - 1; i > 0; i--)
            aQTermSummation[i - 1] = aQTermSummation[i] + (TermFreq(i - 1) << 16);
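        // Worked example (illustrative weights): if the sorted query weights are {6, 3, 1}
        // in 16-bit fixed point, aQTermSummation becomes {10 << 16, 4 << 16, 1 << 16};
        // aQTermSummation[i] bounds how much any document's score can still grow from
        // query terms i through the last one, which is what the upper bound test below uses.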

        /*
        // scale the values to 24 bit
        for (i = 0; i < m_cConWts; i++)
            aQTermSummation[i] = aQTermSummation[i] >> 8;
        */

        /* IMPORTANT ASSUMPTION : The aInSimilarity array is properly initialized.
           Proper initialization includes resetting all docid and sim values to 0
           and the CollId field to the appropriate collection id.

           If aInSimilarity is not properly initialized, the docid, sim values will
           still be correct, but the caller will have no way of finding the collection
           id of the docid, sim values set here.
        */

        // Compute similarity. Walk the doc,wt list for each concept.
        // Compute until all terms are exhausted or the stopping conditions are met.
        // Skip terms that occur too frequently (how frequent is too frequent ?)
        i = 0;
        do
        {
            // CODE TO SKIP TERMS THAT ARE TOO FREQUENT CAN APPEAR HERE
            // if (term is too frequent)
            // {
            //     i++;
            //     continue;
            // }

            cDocs = DocFromCumFreq(Concept(i));
            DocId = 0;
            startDocPos = DocList(Concept(i));
            // Consider each doc in the (Doc, Wt) list for this concept and score docs that
            // are in the predetermined hit list.
            for (j = 0; j < cDocs; j++)
            {
                // The first doc in an inverted list for a concept is encoded as docid + 1. The subsequent
                // gaps are encoded as they are.
                if (j == 0)
                    DocId = m_ptdb->PColl()->GetDocumentGap(&startDocPos) - 1;
                else
                    DocId += m_ptdb->PColl()->GetDocumentGap(&startDocPos);

                DocPos = GetDocPosInList2(aInSimilarity, cHits, DocId);
                // ALG : If RsetNotFull then
                if (cHits < cInMaxHits)
                {
                    // ALG : Compute C(Document);
                    // ALG : Enter Document into the RSet
                    if (DocPos == DOESNOTEXIST)
                    {
                        // Add this new document
                        DocPos = cHits;
                        aInSimilarity[DocPos].DocId = DocId;
                        cHits++;
                        aInSimilarity[DocPos].Similarity += TermFreq(i) * WtFromInvList(Concept(i), j);
                        /* If we scale similarity to 24 bits, use this line instead of the above
                        aInSimilarity[DocPos].Similarity += (TermFreq(i) * WtFromInvList(Concept(i), j)) >> 8;
                        */
                        if (aInSimilarity[DocPos].Similarity < aInSimilarity[LastDocInPos].Similarity)
                            LastDocInPos = DocPos;
                    }
                    else
                    {
                        // recompute the LastDocIn document if this document was LastDocIn before this cumulation
                        aInSimilarity[DocPos].Similarity += TermFreq(i) * WtFromInvList(Concept(i), j);
                        /* If we scale similarity to 24 bits, use this line instead of the above
                        aInSimilarity[DocPos].Similarity += (TermFreq(i) * WtFromInvList(Concept(i), j)) >> 8;
                        */
                        if (DocPos == LastDocInPos)
                            for (k = 0; k < cHits; k++)
                                if (aInSimilarity[k].Similarity < aInSimilarity[LastDocInPos].Similarity)
                                    LastDocInPos = k;
                    }
                }
                // ALG : else
                else
                {
                    // ALG : Compute Upperbound (Document)
                    // At this point we will also compute the partial similarity for this document
                    if (DocPos == DOESNOTEXIST)
                    {
                        CCurrentDoc = TermFreq(i) * WtFromInvList(Concept(i), j);
                        /* If we scale similarity to 24 bits, use this line instead of the above
                        CCurrentDoc = (TermFreq(i) * WtFromInvList(Concept(i), j)) >> 8;
                        */
                        UBCurrentDoc = aQTermSummation[i];
                    }
                    else
                    {
                        CCurrentDoc = aInSimilarity[DocPos].Similarity + (TermFreq(i) * WtFromInvList(Concept(i), j));
                        /* If we scale similarity to 24 bits, use this line instead of the above
                        CCurrentDoc = aInSimilarity[DocPos].Similarity + ((TermFreq(i) * WtFromInvList(Concept(i), j)) >> 8);
                        */
                        // The upper bound could exceed the maximum possible similarity value. We should protect
                        // against that by bounding the upper bound.
                        if ((MAXSIM - aInSimilarity[DocPos].Similarity) < aQTermSummation[i])
                            UBCurrentDoc = MAXSIM;
                        else
                            UBCurrentDoc = aInSimilarity[DocPos].Similarity + aQTermSummation[i];
                    }

                    // ALG : If U(Document) <= C(LastDoc) then
                    // ALG :     DoNotAllocate / Remove Document
                    // If the U <= C condition is met and the doc is already in, remove it
                    if (UBCurrentDoc <= aInSimilarity[LastDocInPos].Similarity)
                    {
                        // This document is a loser. Check to see if it is at least better than
                        // the first document outside the RSet.
                        if (CCurrentDoc > CFirstDocOut)
                            CFirstDocOut = CCurrentDoc;

                        // Remove this loser if it was already entered
                        if (DocPos != DOESNOTEXIST)
                        {
                            // remove the current document from the list
                            // remove by copying the document at the end into this document's position
                            aInSimilarity[DocPos].Similarity = aInSimilarity[cHits - 1].Similarity;
                            aInSimilarity[DocPos].DocId = aInSimilarity[cHits - 1].DocId;
                            cHits--;
                            ASSERT (cHits);

                            // Now that we changed the document set, recompute the last doc position
                            for (k = 0; k < cHits; k++)
                                if (aInSimilarity[k].Similarity < aInSimilarity[LastDocInPos].Similarity)
                                    LastDocInPos = k;
                        }
                    }
                    // ALG : else
                    // ALG :     Compute C (Document)
                    // ALG :     if (C (Document) > C (LastDoc) then
                    // ALG :         Enter Document into the RSet
                    else
                    {
                        if (CCurrentDoc > aInSimilarity[LastDocInPos].Similarity)
                        {
                            if (DocPos == DOESNOTEXIST)
                            {
                                // Since the RSet is already full, the only way to enter the current document
                                // is by replacing the document at LastDocInPos - i.e. replacing the doc with
                                // the least partial match.

                                // Before replacing the LastDocIn, let us save it as the FirstDocOut
                                CFirstDocOut = aInSimilarity[LastDocInPos].Similarity;

                                // Replace
                                aInSimilarity[LastDocInPos].DocId = DocId;
                                aInSimilarity[LastDocInPos].Similarity = CCurrentDoc;

                                // Now that we changed the document set, recompute the last doc position
                                for (k = 0; k < cHits; k++)
                                    if (aInSimilarity[k].Similarity < aInSimilarity[LastDocInPos].Similarity)
                                        LastDocInPos = k;
                            }
                            else
                            {
                                aInSimilarity[DocPos].Similarity = CCurrentDoc;
                                // recompute the LastDocIn document if this document was LastDocIn before this cumulation
                                if (DocPos == LastDocInPos)
                                    for (k = 0; k < cHits; k++)
                                        if (aInSimilarity[k].Similarity < aInSimilarity[LastDocInPos].Similarity)
                                            LastDocInPos = k;
                            }
                        }
                    }
                }
            }
            /* BEGIN : Fix for BUG 18016 */
            if (cHits < cInMaxHits)
                UBFirstDocOut = 0xFFFFFFFF;     // No doc outside the RSet, so the first doc out potentially has an infinite upper bound
            else
            /* END : Fix for BUG 18016 */
                // Compute the upper bound of FirstDocOut
                UBFirstDocOut = CFirstDocOut + aQTermSummation[i];
            i++;
            // ALG : until LastQueryTerm or U(FirstDocOut) <= C(LastDocIn)
            // NOTE : We converted a repeat - until into a do - while, so the loop termination conditions are different
            // between the algorithm and the implementation.
        } while (i < m_cConWts && UBFirstDocOut > aInSimilarity[LastDocInPos].Similarity && TermFreq(i) > 0);   // INTRODUCE MORE STOPPING CONDITIONS HERE
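        // Illustration of the stopping rule (made-up numbers): if the lowest partial score
        // currently inside the RSet is C(LastDocIn) = 50 and UBFirstDocOut = 42, then no
        // document still outside the RSet can catch up on the remaining query terms, so the
        // remaining inverted lists need not be scanned.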

#if 0   // statistics
        if ( i < m_cConWts )
        {
            char szBuffer[200];
            DWORD cDocsExamined = 0, cDocsNotExamined = 0;

            for (k = 0; k < i; k++)
                cDocsExamined += DocFromCumFreq(Concept(k));

            for (k = i; k < m_cConWts; k++)
                cDocsNotExamined += DocFromCumFreq(Concept(k));

            wsprintf(szBuffer, "Examined only %u lists out of %u lists and only %u docs out of %u docs", i, m_cConWts, cDocsExamined, cDocsExamined + cDocsNotExamined);
            MessageBox(GetFocus(), szBuffer, "Query Optimization", MB_OK);
        }
#endif  // 0, statistics

        /* MOVE SORTING TO THE CALLER. THIS IS DONE TO ENABLE MULTIPLE FILE SEARCHES. THE CALLER WILL
           GET ALL THE RESULTS INTO A HUGE SIMSTRUCT ARRAY AND SORT IT
        // sort the scored documents.
        qsort(aInSimilarity, cHits, sizeof(SimStruct), CompareSimStruct);
        */
    }
    __finally
    {
        if (aQTermSummation) VFree(aQTermSummation);
    }

    return cHits;
}

void CQuery::SortQuery()
{
    TempConWtStruct *aConWts = NULL;
    register DWORD i;

    __try
    {
        // Sort the query concept, wt pairs based on the weight of the concepts. This will be used
        // when we employ stop conditions to reduce the number of documents considered.
        // Since the concepts and weights are not in the same structure, we need to
        // copy them to a temporary buffer and then copy the sorted values back.
        aConWts = (TempConWtStruct *) VAlloc(FALSE, sizeof(TempConWtStruct) * m_cConWts);

        for (i = 0; i < m_cConWts; i++)
        {
            aConWts[i].ConceptId = Concept(i);
            aConWts[i].Weight = TermFreq(i);
        }

        qsort(aConWts, m_cConWts, sizeof(TempConWtStruct), CompareTempConWtStruct);

        for (i = 0; i < m_cConWts; i++)
        {
            Concept(i) = aConWts[i].ConceptId;
            TermFreq(i) = (WORD)aConWts[i].Weight;
        }
    }
    __finally
    {
        if (aConWts) VFree(aConWts);
    }
}

// Compare two TempConWtStructs by weight and return < 0, 0, or > 0 to help qsort sort in decreasing order
int _cdecl CompareTempConWtStruct(const void *arg1, const void *arg2)
{
    if (((TempConWtStruct *)arg2)->Weight > ((TempConWtStruct *)arg1)->Weight)
        return 1;
    else if (((TempConWtStruct *)arg2)->Weight < ((TempConWtStruct *)arg1)->Weight)
        return -1;
    else
        return 0;
}

// Compare two SimStructs by similarity and return < 0, 0, or > 0 to help qsort sort in decreasing order
int _cdecl CompareSimStruct(const void *arg1, const void *arg2)
{
    if (((SimStruct *)arg2)->Similarity > ((SimStruct *)arg1)->Similarity)
        return 1;
    else if (((SimStruct *)arg2)->Similarity < ((SimStruct *)arg1)->Similarity)
        return -1;
    else
        return 0;
}
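
// Since sorting was moved to the caller (see the MOVE SORTING comments above), a caller that
// has gathered hits from several collections into one array would sort them in decreasing
// similarity order roughly like this (aAllHits and cAllHits are hypothetical names used only
// for illustration):
//
//     qsort(aAllHits, cAllHits, sizeof(SimStruct), CompareSimStruct);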

// ASSUMPTION : There are at least two elements in the list
__inline DWORD CQuery::GetDocPosInList(SimStruct *aInSimilarity, DWORD cInHits, DWORD DocId)
{
    register DWORD high = cInHits, low = 0, mid;

    while (low < high)
    {
        mid = low + (high - low)/2;
        if (DocId < aInSimilarity[mid].DocId)
            high = mid;
        else if (DocId > aInSimilarity[mid].DocId)
            low = mid + 1;
        else
            return mid;
    }
    return DOESNOTEXIST;
}
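
// Note: the binary search above only finds a document if aInSimilarity is kept sorted by
// DocId; GetDocPosInList2 below makes no such assumption and simply scans the array linearly.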

__inline DWORD CQuery::GetDocPosInList2(SimStruct *aInSimilarity, DWORD cInHits, DWORD DocId)
{
    register DWORD i;

    for (i = 0; i < cInHits; i++)
        if (aInSimilarity[i].DocId == DocId)
            return i;
    // the doc has not been found
    return DOESNOTEXIST;
}

void CQuery::IndexDocumentText(PWCHAR pwDocText, int cwText, BOOL fRelevant)
{
    int n, nTokens, nMore;
    PUINT pwHash = NULL;
    PBYTE pbType = NULL;
    PWCHAR *paStart = NULL,
           *paEnd = NULL;

    PWCHAR pwText = pwDocText;  // we will leave the pwDocText untouched so that
                                // the caller can delete that memory buffer.
    DWORD ConId;

    nMore = cwText;

    ASSERT(pwText && cwText);

    __try
    {
        // cwText is probably a lot more than we need, but it guarantees us that we won't run out of memory
        // for tokens
        pwHash = New UINT[cwText];
        pbType = New BYTE[cwText];
        paStart = New PWCHAR[cwText];
        paEnd = New PWCHAR[cwText];

        if (pwText && pwHash && paStart && pbType && paEnd)
        {
            nTokens = WordBreakW(&pwText, &nMore, paStart, paEnd, pbType, pwHash, cwText, REMOVE_SPACE_CHARS);

            for (n = 0; n < nTokens; n++)
            {
                // EnterWord with the last param set to TRUE is only looking up, not entering, a word
                ConId = m_ptdb->PDict()->EnterWord(paStart[n], paEnd[n] - paStart[n], TRUE, TRUE);
                if (ConId != EOL && ConId != STOPWORD)
                    if (fRelevant)
                        RecordConcept(ConId);
                    else    // not relevant
                    {
                        DWORD i;
                        // For each concept in the document, check to see if it exists
                        // in the query. If it does, subtract it from the query's term frequency.

                        for (i = 0; i < m_cConWts && Concept(i) != ConId; i++);

                        if (i < m_cConWts)
                            // This concept exists in the query. Subtract this term from the query.
                            if (TermFreq(i) > 0)
                                TermFreq(i) -= 1;
                    }
            }
        }
    }
    __finally
    {
        if (paEnd)   { delete paEnd;   paEnd = NULL; }
        if (paStart) { delete paStart; paStart = NULL; }
        if (pbType)  { delete pbType;  pbType = NULL; }
        if (pwHash)  { delete pwHash;  pwHash = NULL; }
    }
}