840 lines
32 KiB
C++
840 lines
32 KiB
C++
// This file contains the definition of class CCollection
|
|
|
|
#include "stdafx.h"
|
|
#include <math.h>
|
|
#include "vmbuffer.h"
|
|
#include "memex.h"
|
|
#include "saveload.h"
|
|
#include "textset.h"
|
|
|
|
#include "vector.h"
|
|
|
|
// bitmasks for bit manipulations
|
|
|
|
DWORD bitMask32[] = {
|
|
0x80000000, 0x40000000, 0x20000000, 0x10000000,
|
|
0x08000000, 0x04000000, 0x02000000, 0x01000000,
|
|
0x00800000, 0x00400000, 0x00200000, 0x00100000,
|
|
0x00080000, 0x00040000, 0x00020000, 0x00010000,
|
|
0x00008000, 0x00004000, 0x00002000, 0x00001000,
|
|
0x00000800, 0x00000400, 0x00000200, 0x00000100,
|
|
0x00000080, 0x00000040, 0x00000020, 0x00000010,
|
|
0x00000008, 0x00000004, 0x00000002, 0x00000001
|
|
};
|
|
|
|
BYTE bitMask8[] = {0x80, 0x40, 0x20, 0x10, 0x08, 0x04, 0x02, 0x01};
|
|
|
|
// Constructors
|
|
/*************************************************************************
|
|
* FUNCTION : *
|
|
* *
|
|
* RETURNS : *
|
|
* *
|
|
* PURPOSE : *
|
|
* *
|
|
* PARAMETERS : *
|
|
* *
|
|
* SIDE EFFECTS : *
|
|
* *
|
|
* DESCRIPTION : *
|
|
* *
|
|
* HISTORY : *
|
|
* *
|
|
* Author Date Action *
|
|
* ------ ---- ------ *
|
|
* *
|
|
* KrishnaN 4/23/94 Creation. *
|
|
* *
|
|
*************************************************************************/
|
|
|
|
// Builds an empty collection in the COLL_UNUSABLE state. No memory is
// allocated here; Initialize() or ConnectImage() must be called before use.
CCollection::CCollection()
{
    // Nothing has been recorded yet.
    m_cConcepts           = 0;
    m_cDocuments          = 0;
    m_cConWts             = 0;
    m_cBitsUsedInEncoding = 0;

    m_bCollState      = COLL_UNUSABLE;
    m_fLoadedFromDisk = FALSE;

#if 0
    m_cOverFlows   = 0;
    m_vbTFOverFlow = NULL;
#endif

    // Heap arrays are created later by WeightAndInvertVectors().
    m_acDocWts     = NULL;
    m_aWtInvIndex  = NULL;
    m_aDocInvIndex = NULL;

    // A NULL Base marks a virtual buffer as "not created"; the destructor
    // relies on this to decide what to release.
    m_vbConcepts.Base       = NULL;
    m_vbVectorRange.Base    = NULL;
    m_vbVectorConcept.Base  = NULL;
    m_vbVectorTermFreq.Base = NULL;
    m_vbVectorWt.Base       = NULL;
    m_vbDocInvIndex.Base    = NULL;

    // Used for integration with Ron's code
    m_pts = NULL;
}
|
|
|
|
// Factory: allocates a CCollection and initializes it with the default
// capacity estimates. Returns the new object. If construction or
// initialization raises an exception, the partially built object is deleted
// while the exception continues to propagate to the caller.
CCollection *CCollection::NewCollection()
{
    CCollection *pNewColl = NULL;

    __try
    {
        pNewColl = New CCollection;

        // Each pair is (estimated count, maximum count).
        // Assuming a minimum of one char per word and one separator, the
        // maximum number of words in the document set is at most cbArticles/2.
        pNewColl->Initialize(1024,  2000000,     // unique concepts (stems)
                             1024, 10000000,     // documents
                             1024, 10000000);    // concepts across all documents
    }
    __finally
    {
        // Only clean up when we are leaving because of an exception.
        if (pNewColl && _abnormal_termination())
        {
            delete pNewColl;
            pNewColl = NULL;
        }
    }

    return pNewColl;
}
|
|
|
|
// Destructor
|
|
/*************************************************************************
|
|
* FUNCTION : *
|
|
* *
|
|
* RETURNS : *
|
|
* *
|
|
* PURPOSE : *
|
|
* *
|
|
* PARAMETERS : *
|
|
* *
|
|
* SIDE EFFECTS : *
|
|
* *
|
|
* DESCRIPTION : *
|
|
* *
|
|
* HISTORY : *
|
|
* *
|
|
* Author Date Action *
|
|
* ------ ---- ------ *
|
|
* *
|
|
* KrishnaN 4/23/94 Creation. *
|
|
* *
|
|
*************************************************************************/
|
|
|
|
// Releases everything the collection allocated for itself.
// A collection attached by ConnectImage() only holds pointers into the disk
// image (it sets m_fLoadedFromDisk), so in that case nothing is freed here.
CCollection::~CCollection()
{
    if (!m_fLoadedFromDisk)
    {
        // Plain heap arrays.
        if (m_acDocWts     != NULL) VFree(m_acDocWts);
        if (m_aWtInvIndex  != NULL) VFree(m_aWtInvIndex);
        if (m_aDocInvIndex != NULL) VFree(m_aDocInvIndex);

        // Virtual buffers; a NULL Base means the buffer was never created
        // or has already been released.
        if (m_vbConcepts.Base       != NULL) FreeVirtualBuffer(&m_vbConcepts);
        if (m_vbVectorRange.Base    != NULL) FreeVirtualBuffer(&m_vbVectorRange);
        if (m_vbVectorConcept.Base  != NULL) FreeVirtualBuffer(&m_vbVectorConcept);
        if (m_vbVectorTermFreq.Base != NULL) FreeVirtualBuffer(&m_vbVectorTermFreq);
        if (m_vbVectorWt.Base       != NULL) FreeVirtualBuffer(&m_vbVectorWt);
        if (m_vbDocInvIndex.Base    != NULL) FreeVirtualBuffer(&m_vbDocInvIndex);
#if 0
        if (m_vbTFOverFlow.Base     != NULL) FreeVirtualBuffer(&m_vbTFOverFlow);
#endif
    }
}
|
|
|
|
// Access Functions:
|
|
/*************************************************************************
|
|
* FUNCTION : *
|
|
* *
|
|
* RETURNS : *
|
|
* *
|
|
* PURPOSE : *
|
|
* *
|
|
* PARAMETERS : *
|
|
* *
|
|
* SIDE EFFECTS : *
|
|
* *
|
|
* DESCRIPTION : *
|
|
* *
|
|
* HISTORY : *
|
|
* *
|
|
* Author Date Action *
|
|
* ------ ---- ------ *
|
|
* *
|
|
* KrishnaN 4/23/94 Creation. *
|
|
* *
|
|
*************************************************************************/
|
|
|
|
// Creates the virtual buffers used while documents are being recorded and
// moves the collection from COLL_UNUSABLE to COLL_USABLE.
//
//   cInEstConcepts   / cInMaxConcepts   : estimated / maximum unique concepts
//   cInEstDocuments  / cInMaxDocuments  : estimated / maximum documents
//   cInEstConWtPairs / cInMaxConWtPairs : estimated / maximum (concept, weight)
//                                         pairs across all documents
//
// Must only be called on a collection that is still COLL_UNUSABLE; calling it
// in any other state leaves the state undefined.
void CCollection::Initialize(DWORD cInEstConcepts, DWORD cInMaxConcepts, DWORD cInEstDocuments, DWORD cInMaxDocuments, DWORD cInEstConWtPairs, DWORD cInMaxConWtPairs)
{
    ASSERT(m_bCollState == COLL_UNUSABLE);

    ASSERT(cInEstConcepts);

    // Concept dictionary.
    CreateVirtualBuffer(&m_vbConcepts,
                        cInEstConcepts * sizeof(ConceptStruct),
                        cInMaxConcepts * sizeof(ConceptStruct));

    // Concept id of every (concept, weight) pair.
    CreateVirtualBuffer(&m_vbVectorConcept,
                        cInEstConWtPairs * sizeof(DWORD),
                        cInMaxConWtPairs * sizeof(DWORD));

    // Term frequency of every (concept, weight) pair.
    CreateVirtualBuffer(&m_vbVectorTermFreq,
                        cInEstConWtPairs * sizeof(WORD),
                        cInMaxConWtPairs * sizeof(WORD));

    // Floating point weights; no initial size is requested for this buffer.
    CreateVirtualBuffer(&m_vbVectorWt,
                        0,
                        cInMaxConWtPairs * sizeof(float));

    // Per-document sentinel (index of the document's first pair).
    CreateVirtualBuffer(&m_vbVectorRange,
                        cInEstDocuments * sizeof(DWORD),
                        cInMaxDocuments * sizeof(DWORD));
#if 0
    CreateVirtualBuffer(&m_vbTFOverFlow,
                        0,
                        0x4000 * sizeof(TFOverFlowStruct));
#endif

    // VirtualAlloc zeroes all memory it commits, so the virtual buffers do
    // not need to be cleared explicitly.

    m_bCollState = COLL_USABLE;
}
|
|
|
|
// Stores the number of unique concepts in the collection.
// WeightAndInvertVectors() sizes and walks its per-concept tables with this
// value.
void CCollection::SetNumberOfConcepts(DWORD cInConcepts)
{
    this->m_cConcepts = cInConcepts;
}
|
|
|
|
/*************************************************************************
|
|
* FUNCTION : *
|
|
* *
|
|
* RETURNS : *
|
|
* *
|
|
* PURPOSE : *
|
|
* *
|
|
* PARAMETERS : *
|
|
* *
|
|
* SIDE EFFECTS : *
|
|
* *
|
|
* DESCRIPTION : *
|
|
* *
|
|
* HISTORY : *
|
|
* *
|
|
* Author Date Action *
|
|
* ------ ---- ------ *
|
|
* *
|
|
* KrishnaN 4/23/94 Creation. *
|
|
* *
|
|
*************************************************************************/
|
|
|
|
// Records one occurrence of ConceptId in the document currently being built.
//
// If the concept already appears in the current document its term frequency
// is incremented (saturating at 0xFFFF). Otherwise a new (concept, frequency)
// pair with frequency 1 is appended and the concept's document frequency is
// incremented. Raises STATUS_NO_MEMORY if a virtual buffer cannot grow.
void CCollection::RecordConcept(DWORD ConceptId)
{
    // Scan the pairs of the current document for this concept.
    DWORD iPair = DocSentinel(m_cDocuments);

    while (iPair < m_cConWts && Concept(iPair) != ConceptId)
        iPair++;

    if (iPair != m_cConWts)
    {
        // The concept is already in this document, so its frequency is at
        // least 1. Bump it unless it has reached the 16-bit upper bound.
#if 0
        // The only time when the value can be 0 is when the frequency has
        // exceeded 0xFFFF. In that case, the overflowing value is stored in
        // the overflow area.
        if (TermFreq(iPair) == 0)
        {
            // go to the overflow area and update the value that tracks this term frequency
        }
        else
#endif
        if (TermFreq(iPair) != 0xFFFF)
        {
            // Normal case: no overflow is involved.
            (TermFreq(iPair))++;
        }
        // else: upper bound reached. Later this should be placed in an
        // overflow area; for now the count simply stays at 0xFFFF.

        return;
    }

    // First occurrence of the concept in this document: append a new pair.
    __try
    {
        Concept(m_cConWts) = ConceptId;
    }
    __except (VirtualBufferExceptionFilter(GetExceptionCode(), GetExceptionInformation(), &m_vbVectorConcept))
    {
        RaiseException(STATUS_NO_MEMORY, EXCEPTION_NONCONTINUABLE, 0, NULL);
    }

    __try
    {
        TermFreq(m_cConWts) = 1;
    }
    __except (VirtualBufferExceptionFilter(GetExceptionCode(), GetExceptionInformation(), &m_vbVectorTermFreq))
    {
        RaiseException(STATUS_NO_MEMORY, EXCEPTION_NONCONTINUABLE, 0, NULL);
    }

    m_cConWts++;

    // One more document now contains this concept.
    __try
    {
        DocFreq(ConceptId)++;
    }
    __except (VirtualBufferExceptionFilter(GetExceptionCode(), GetExceptionInformation(), &m_vbConcepts))
    {
        RaiseException(STATUS_NO_MEMORY, EXCEPTION_NONCONTINUABLE, 0, NULL);
    }
}
|
|
|
|
/*************************************************************************
|
|
* FUNCTION : *
|
|
* *
|
|
* RETURNS : *
|
|
* *
|
|
* PURPOSE : *
|
|
* *
|
|
* PARAMETERS : *
|
|
* *
|
|
* SIDE EFFECTS : *
|
|
* *
|
|
* DESCRIPTION : *
|
|
* *
|
|
* HISTORY : *
|
|
* *
|
|
* Author Date Action *
|
|
* ------ ---- ------ *
|
|
* *
|
|
* KrishnaN 4/23/94 Creation. *
|
|
* *
|
|
*************************************************************************/
|
|
|
|
void CCollection::NewDocument()
|
|
{
|
|
m_cDocuments++;
|
|
__try
|
|
{
|
|
// record the last conwt pair's index (location in the conwt array) for the
|
|
// document we just finished processing. When we need to get the range of ConWts for
|
|
// a document i, we get DocSentinel(i) to DocSentinel(i+1) - 1
|
|
DocSentinel(m_cDocuments) = m_cConWts;
|
|
}
|
|
__except (VirtualBufferExceptionFilter(GetExceptionCode(), GetExceptionInformation(), &m_vbVectorRange))
|
|
{
|
|
RaiseException(STATUS_NO_MEMORY, EXCEPTION_NONCONTINUABLE, 0, NULL);
|
|
}
|
|
}
|
|
|
|
/*************************************************************************
|
|
* FUNCTION : *
|
|
* *
|
|
* RETURNS : *
|
|
* *
|
|
* PURPOSE : *
|
|
* *
|
|
* PARAMETERS : *
|
|
* *
|
|
* SIDE EFFECTS : *
|
|
* *
|
|
* DESCRIPTION : *
|
|
* *
|
|
* HISTORY : *
|
|
* *
|
|
* Author Date Action *
|
|
* ------ ---- ------ *
|
|
* *
|
|
* KrishnaN 4/23/94 Creation. *
|
|
* *
|
|
*************************************************************************/
|
|
|
|
// Computing the inverted index : The inverted index is a structure that lets us get a list of
|
|
// (document, weight) pairs for each concept. The document tells us the document in which this
|
|
// concept is present and the weight tells us the weight of this concept in the corresponding
|
|
// document. The inverted index is implemented as a huge array. We have as many entries in this
|
|
// index as we have (Concept, Wt) pairs. So we will first create memory based on that number.
|
|
// Then we create an array so that we can track the number of (Doc,Wt) pairs that went into the
|
|
// concept's doc,wt list at any given point. We then initialize the starting points of each
|
|
// concept's (doc, wt) list in this huge array. This computation is accomplished by using the
|
|
// DocFreq information we already computed for each concept.
|
|
|
|
// When we process a (concept, wt) pair in a document D, we obtain the location of the (D, wt)
|
|
// pair from the information we computed as described in the above paragraph. This scheme allows
|
|
// us to compute an inverted index using an O(n) complex algorithm where n is the number of
|
|
// (Doc,Wt) pairs that constitute the inverted index.
|
|
|
|
// We will be using an intermediate array of floats to compute the weights. We will first copy
|
|
// the term frequencies from the document vectors into this array of floats. Weighting is
|
|
// performed on the floats and they are directly plugged into the inverted index with appropriate
|
|
// computation to convert them to fixed points. After all is said and done, the document vectors
|
|
// only contain the term frequencies. PERFECT!
|
|
|
|
void CCollection::WeightAndInvertVectors(BYTE TFModType, BYTE WeightType, BYTE NormType)
|
|
{
|
|
register DWORD i, j; // variables to implement for loops
|
|
DWORD k, l, m; // variables to hold temporary values
|
|
DWORD iFirstConWt; // the first con,wt pair for this vector
|
|
DWORD cConWts; // number of conwts for this document
|
|
|
|
// Compute the deltas
|
|
DWORD dwDelta; // used to hold the delta between successive document ids
|
|
int cSavOneBits, cOneBits; // holds the number of bits to be used to represent the first part of the gamma encoding
|
|
DWORD dwSavBitPos;
|
|
DWORD cByte = 0; // used to track the number of bytes used in the coding scheme
|
|
BYTE bitPos = 0; // used to track the position in the byte where the current bit should be encoded
|
|
|
|
// This routine is called to weight a collection. There is no reason to weight an
|
|
// already weighted collection. It is illegal to weight an COLL_UNUSABLE collection. Refuse to do so.
|
|
ASSERT(m_bCollState != COLL_UNUSABLE && m_bCollState != WEIGHTED);
|
|
|
|
ASSERT(!m_acDocWts);
|
|
|
|
__try
|
|
{
|
|
// create an array to hold the count of (doc, wt) pairs added so far to a given concept
|
|
m_acDocWts = (LPDWORD)VAlloc(FALSE, m_cConcepts*sizeof(DWORD));
|
|
|
|
ZeroMemory(m_acDocWts, m_cConcepts*sizeof(DWORD));
|
|
|
|
ASSERT(!m_aDocInvIndex);
|
|
|
|
m_aDocInvIndex = (LPDWORD) VAlloc(FALSE, sizeof(DWORD) * m_cConWts);
|
|
|
|
ASSERT(!m_aWtInvIndex);
|
|
|
|
m_aWtInvIndex = (LPWORD) VAlloc(NULL, sizeof(WORD) * m_cConWts);
|
|
|
|
// Set the pointers in the conceptstruct array so that they point to the right places in the array of
|
|
// (doc, wt) pairs
|
|
DocList(0) = 0;
|
|
// index for concept i+1 = index for concept i + number of documents in concept i+1
|
|
for (i = 1; i < m_cConcepts; i++)
|
|
DocList(i) = DocList(i - 1) + DocFreq(i - 1);
|
|
|
|
// now change the docfreq to hold the cumulative frequency, not the raw frequency
|
|
// the raw frequency for i can be recomputed by subtracting i from i + 1.
|
|
|
|
for (i = 0; i < m_cConcepts; i++)
|
|
DocFreq(i) = DocList(i);
|
|
|
|
// Cause an extra ConceptStruct to be allocated. This extra will be used to hold the
|
|
// total number of m_conwts. This can be used to compute the docfreq (df) for i as df(i+1) - df(i)
|
|
__try
|
|
{
|
|
DocFreq(m_cConcepts) = m_cConWts;
|
|
}
|
|
__except (VirtualBufferExceptionFilter(GetExceptionCode(), GetExceptionInformation(), &m_vbConcepts))
|
|
{
|
|
RaiseException(STATUS_NO_MEMORY, EXCEPTION_NONCONTINUABLE, 0, NULL);
|
|
}
|
|
|
|
// IMPORTANT : This for loop allows us to read each document vector from the disk and process the
|
|
// document completely before moving on to the next document vector. We first weight and
|
|
// normalize the vector and invert the vector after that.
|
|
|
|
for (i = 0; i < m_cDocuments; i++)
|
|
{
|
|
iFirstConWt = DocSentinel(i); // the first conwt of this doc vector
|
|
|
|
// Copy the Term Frequencies into an array of floating points. All operations will be computed
|
|
// on these floating point weights. The final results can then be converted to a fixed point.
|
|
// IMPORTANT : ALL WEIGHTS SHOULD BE NORMALIZED TO ENSURE THAT EACH WEIGHT IS LESS THAN ONE.
|
|
// THE FIXED POINT VALUE ONLY REPRESENTS VALUES BETWEEN 0.0 AND 1.0
|
|
|
|
cConWts = DocSentinel(i + 1) - iFirstConWt; // number of conwts in this vector
|
|
for (j = 0; j < cConWts; j++)
|
|
{
|
|
__try
|
|
{
|
|
TermWt(j) = (float)GetRealTermFreq(j + iFirstConWt);
|
|
}
|
|
__except (VirtualBufferExceptionFilter(GetExceptionCode(), GetExceptionInformation(), &m_vbVectorWt))
|
|
{
|
|
RaiseException(STATUS_NO_MEMORY, EXCEPTION_NONCONTINUABLE, 0, NULL);
|
|
}
|
|
}
|
|
|
|
//ApplyWeightingScheme(NEWTF_NONE, WT_TFIDF, NORM_COSINE, i);
|
|
ApplyWeightingScheme(TFModType, WeightType, NormType, iFirstConWt, cConWts);
|
|
|
|
// Now invert this document.
|
|
|
|
// k is the number of conwts in this vector
|
|
// m is the (Concept, Wt) pair of interest to us
|
|
// j tracks the number of (Concept, Wt) pairs processed so far for this document.
|
|
for (j = 0, k = DocSentinel(i + 1) - iFirstConWt, m = iFirstConWt; j < k; j++, m++)
|
|
{
|
|
// l is the index of the location in the inverted list array where the current (Doc,Wt) should go
|
|
// It is computed by adding the number of (Doc,Wt)s added so far to the current concept and the
|
|
// index where the first (Doc,Wt) for this concept should begin.
|
|
l = DocList(Concept(m)) + DocWtCount(Concept(m));
|
|
|
|
// Now copy the current (doc,wt) pair to the correct place in the inverted index array
|
|
Document(l) = i;
|
|
// ASSUMPTION : Each weight in TermWt is between 0.0 and 1.0
|
|
Weight(l) = (WORD)((double)TermWt(j) * (double)WT_ONE);
|
|
|
|
// Increase the counter to account for the addition of this document
|
|
DocWtCount(Concept(m))++;
|
|
}
|
|
}
|
|
|
|
// We don't need the m_acDocWts array any more
|
|
VFree(m_acDocWts); m_acDocWts= NULL;
|
|
|
|
// Now that we have an inverted index, we don't need the con,wt document vectors anymore
|
|
FreeVirtualBuffer(&m_vbVectorConcept );
|
|
FreeVirtualBuffer(&m_vbVectorTermFreq);
|
|
FreeVirtualBuffer(&m_vbVectorWt );
|
|
FreeVirtualBuffer(&m_vbVectorRange );
|
|
#if 0
|
|
FreeVirtualBuffer(&m_vbTFOverFlow );
|
|
#endif
|
|
|
|
// Now compress the documents in the inverted index
|
|
// Estimate that we will need only a fourth of the space (m_cConWts * 4 is the full number
|
|
// of bytes need to store the docs without compression)
|
|
CreateVirtualBuffer(&m_vbDocInvIndex, m_cConWts, m_cConWts * 4);
|
|
|
|
for (i = 0; i < m_cConcepts; i++)
|
|
{
|
|
// j holds the previous document id
|
|
// k holds the number of documents in the inverted list for this concept
|
|
k = DocFromCumFreq(i);
|
|
dwSavBitPos = m_cBitsUsedInEncoding;
|
|
|
|
for (j = l = 0; l < k; l++)
|
|
{
|
|
// compute the compressed representation and add it
|
|
// The encoding scheme cannot encode 0, so we will map (0 to numdocs - 1) to (1 to numdocs)
|
|
// The 1 being added here accomplishes that mapping.
|
|
// As a result of this, the first document id is stored as docId + 1, but the subsequent
|
|
// gaps are stored as they are. When decoding, therefore, we have to adjust for the first
|
|
// doc and do not need to adjust for the remaining docs. in the inverted list for a concept.
|
|
dwDelta = DocIdFromInvList(i, l) + 1 - j;
|
|
ASSERT(dwDelta); // dwDelta should always be greater than 0.
|
|
// Assume that there are at most 32 bits in the value that is being encoded.
|
|
// This assumption holds as long as we use a 32-bit value to store the initial document id
|
|
for (m = 0; m < 32 && !(bitMask32[m] & dwDelta); m++);
|
|
ASSERT(m < 32);
|
|
cSavOneBits = cOneBits = 31 - m;
|
|
// remove the highest 1 bit to get the reminder. removal is accomplished by xor'ing 1 with 1.
|
|
dwDelta ^= bitMask32[m];
|
|
|
|
m_cBitsUsedInEncoding += 2*cOneBits + 1;
|
|
|
|
// NOW ADD THE CODE BITS TO THE STREAM
|
|
__try
|
|
{
|
|
// add cOneBits bits to the stream
|
|
for (; cOneBits; cOneBits--)
|
|
{
|
|
CodeByte(cByte) |= bitMask8[bitPos];
|
|
bitPos = (bitPos + 1) % 8;
|
|
if (bitPos == 0) cByte++;
|
|
}
|
|
|
|
ASSERT(bitPos < 8);
|
|
// add a terminating 0 at the end
|
|
CodeByte(cByte) &= ~bitMask8[bitPos];
|
|
// advance the bit position
|
|
bitPos = (bitPos + 1) % 8;
|
|
if (bitPos == 0) cByte++;
|
|
|
|
// Add the reminder bits from dwDelta. The number of reminder bits is equal to the number of one bits
|
|
// Remember that m indicates the position of the highest 1 bit. Start there and write cSavOneBits bits.
|
|
for (; cSavOneBits; cSavOneBits--)
|
|
{
|
|
if (bitMask32[++m] & dwDelta) // if true, we have a 1 bit
|
|
CodeByte(cByte) |= bitMask8[bitPos];
|
|
else // we have a 0 bit
|
|
CodeByte(cByte) &= ~bitMask8[bitPos];
|
|
|
|
bitPos = (bitPos + 1) % 8;
|
|
if (bitPos == 0) cByte++;
|
|
}
|
|
}
|
|
__except (VirtualBufferExceptionFilter(GetExceptionCode(), GetExceptionInformation(), &m_vbDocInvIndex))
|
|
{
|
|
RaiseException(STATUS_NO_MEMORY, EXCEPTION_NONCONTINUABLE, 0, NULL);
|
|
}
|
|
|
|
// save doc id for use in the next iteration
|
|
// 1 is being added to map from 0 based numbering to 1 based numbering
|
|
j = DocIdFromInvList(i, l) + 1;
|
|
}
|
|
|
|
// now store the position of the first bit that codes the first document gap of the document inverted list
|
|
// Caution : This replaces the index previously stored there.
|
|
DocList(i) = dwSavBitPos;
|
|
}
|
|
|
|
m_bCollState = WEIGHTED;
|
|
}
|
|
__finally
|
|
{
|
|
// We don't need the uncompressed inverted index any more
|
|
|
|
if (m_aDocInvIndex) { VFree(m_aDocInvIndex); m_aDocInvIndex = NULL; }
|
|
if (m_acDocWts ) { VFree(m_acDocWts ); m_acDocWts = NULL; }
|
|
|
|
if (_abnormal_termination())
|
|
{
|
|
if (m_vbDocInvIndex.Base) FreeVirtualBuffer(&m_vbDocInvIndex);
|
|
|
|
if (m_aWtInvIndex ) { VFree(m_aWtInvIndex ); m_aWtInvIndex = NULL; }
|
|
|
|
m_bCollState= COLL_UNUSABLE;
|
|
}
|
|
}
|
|
}
|
|
|
|
/*************************************************************************
|
|
* FUNCTION : *
|
|
* *
|
|
* RETURNS : *
|
|
* *
|
|
* PURPOSE : *
|
|
* *
|
|
* PARAMETERS : *
|
|
* *
|
|
* SIDE EFFECTS : *
|
|
* *
|
|
* DESCRIPTION : *
|
|
* *
|
|
* HISTORY : *
|
|
* *
|
|
* Author Date Action *
|
|
* ------ ---- ------ *
|
|
* *
|
|
* KrishnaN 4/23/94 Creation. *
|
|
* *
|
|
*************************************************************************/
|
|
|
|
// Finishes the collection and writes it to a disk image.
//
// Closes the last document, weights and inverts the vectors (no TF
// modification, TF-IDF weighting, cosine normalization), then writes a
// CollHdr followed by the concept table, the weight index and the compressed
// document index. The header records the counts and the offset of each part.
// The collection must be usable and not yet weighted.
void CCollection::StoreImage(CPersist *pDiskImage)
{
    ASSERT(GetCollState() != COLL_UNUSABLE && GetCollState() != WEIGHTED);

    // Account for the last document
    NewDocument();

    WeightAndInvertVectors(NEWTF_NONE, WT_TFIDF, NORM_COSINE);

    CollHdr *pHeader = (CollHdr *) pDiskImage->ReserveTableSpace(sizeof(CollHdr));

    pHeader->cConcepts           = m_cConcepts;
    pHeader->cDocuments          = m_cDocuments;
    pHeader->cDocWtPairs         = m_cConWts;
    pHeader->cBitsUsedInEncoding = m_cBitsUsedInEncoding;

    // Each offset must be sampled immediately before its data is written.

    // Concept table, including the extra trailing entry.
    pHeader->offConcepts = pDiskImage->NextOffset();
    pDiskImage->SaveData((PBYTE) m_vbConcepts.Base, (m_cConcepts + 1) * sizeof(ConceptStruct));

    // Weights of the inverted index.
    pHeader->offWtInvIndex = pDiskImage->NextOffset();
    pDiskImage->WriteWords(m_aWtInvIndex, m_cConWts);

    // Compressed document ids, rounded up to whole bytes.
    pHeader->offDocInvIndex = pDiskImage->NextOffset();
    pDiskImage->WriteBytes((PBYTE) m_vbDocInvIndex.Base, (m_cBitsUsedInEncoding + 7)/8);
}
|
|
|
|
// Factory: allocates a CCollection and attaches it to an existing disk image.
// Returns the new object. If an exception is raised along the way, the
// object is deleted while the exception continues to propagate.
CCollection * CCollection::CreateImage(CPersist *pDiskImage)
{
    CCollection *pLoaded = NULL;

    __try
    {
        pLoaded = New CCollection();

        pLoaded->ConnectImage(pDiskImage);
    }
    __finally
    {
        // Only clean up when we are leaving because of an exception.
        if (pLoaded && _abnormal_termination())
        {
            delete pLoaded;
            pLoaded = NULL;
        }
    }

    return pLoaded;
}
|
|
|
|
// Attaches this collection to a disk image.
//
// Reads the counts from the image's CollHdr and points the concept table,
// the compressed document index and the weight index at their locations
// inside the image. Nothing is copied, and m_fLoadedFromDisk is set so the
// destructor will not free those pointers. Leaves the state COLL_USABLE.
void CCollection::ConnectImage(CPersist *pDiskImage)
{
    // Set first so that destruction never frees image memory.
    m_fLoadedFromDisk = TRUE;

    CollHdr *pHeader = (CollHdr *) pDiskImage->ReserveTableSpace(sizeof(CollHdr));

    m_cConcepts           = pHeader->cConcepts;
    m_cDocuments          = pHeader->cDocuments;
    m_cConWts             = pHeader->cDocWtPairs;
    m_cBitsUsedInEncoding = pHeader->cBitsUsedInEncoding;

    m_vbConcepts.Base    = (LPVOID) pDiskImage->LocationOf(pHeader->offConcepts);
    m_vbDocInvIndex.Base = (LPVOID) pDiskImage->LocationOf(pHeader->offDocInvIndex);
    m_aWtInvIndex        = (LPWORD) pDiskImage->LocationOf(pHeader->offWtInvIndex);

    // ready to use!
    m_bCollState = COLL_USABLE;
}
|
|
|
|
/*************************************************************************
|
|
* FUNCTION : *
|
|
* *
|
|
* RETURNS : *
|
|
* *
|
|
* PURPOSE : *
|
|
* *
|
|
* PARAMETERS : *
|
|
* *
|
|
* SIDE EFFECTS : *
|
|
* *
|
|
* DESCRIPTION : *
|
|
* *
|
|
* HISTORY : *
|
|
* *
|
|
* Author Date Action *
|
|
* ------ ---- ------ *
|
|
* *
|
|
* KrishnaN 4/23/94 Creation. *
|
|
* *
|
|
*************************************************************************/
|
|
|
|
// Weights one document vector in place.
//
// The vector's weights live in TermWt(0 .. cConWts - 1) and initially hold
// raw term frequencies; the matching concept ids are
// Concept(iFirstConWt .. iFirstConWt + cConWts - 1).
//
//   TFModType   : term-frequency modification (NEWTF_NONE, NEWTF_BINARY,
//                 NEWTF_MAXNORM, NEWTF_AUGNORM)
//   WeightType  : collection-frequency component (WT_NONE, WT_TFIDF, WT_PROB)
//   NormType    : vector normalization (only NORM_COSINE does any work;
//                 NORM_NONE, NORM_SUM and NORM_MAX leave the weights as is)
//   iFirstConWt : index of the vector's first (concept, weight) pair
//   cConWts     : number of pairs in the vector
//
// The three stages are applied in that order. An unknown TFModType or
// WeightType trips an ASSERT.
void CCollection::ApplyWeightingScheme(BYTE TFModType, BYTE WeightType, BYTE NormType, DWORD iFirstConWt, DWORD cConWts)
{
    DWORD  i;
    double Wt;  // running maximum or sum, depending on the stage

    // First modify weight based on the term frequency component
    switch (TFModType)
    {
    case NEWTF_NONE:    // do nothing
        break;

    case NEWTF_BINARY:  // Since all the terms are in, turn them on
        for (i = 0; i < cConWts; i++)
            TermWt(i) = (float)1.0;
        break;

    case NEWTF_MAXNORM:
        // Divide every frequency by the largest one in the vector.
        Wt = 0.0;
        for (i = 0; i < cConWts; i++)
            if (TermWt(i) > Wt)
                Wt = TermWt(i);

        // increase Max by 0.00001 to place all normalized TFs between 0.0 and 1.0
        Wt += 0.00001;

        for (i = 0; i < cConWts; i++)
            TermWt(i) = (float) ((double)TermWt(i)/Wt);
        break;

    case NEWTF_AUGNORM:
        // Same as MAXNORM, then mapped into the upper half of the range.
        Wt = 0.0;
        for (i = 0; i < cConWts; i++)
            if (TermWt(i) > Wt)
                Wt = TermWt(i);

        // increase Max by 0.00001 to place all normalized TFs between 0.0 and 1.0
        Wt += 0.00001;

        for (i = 0; i < cConWts; i++)
            TermWt(i) = (float) (0.5 + 0.5 * (double)TermWt(i) / Wt);
        break;

    default:
        ASSERT(FALSE);
        break;
    }

    // Then modify the weight based on the collection frequency component
    switch (WeightType)
    {
    case WT_NONE:   // do nothing
        break;

    // if a concept occurs in all docs, let's assign it a small value instead of assigning it a 0.0
    case WT_TFIDF:
        for (i = 0; i < cConWts; i++)
            if (m_cDocuments == DocFromCumFreq(Concept(i + iFirstConWt)))
                TermWt(i) = (float) 0.005;
            else
                TermWt(i) = (float) ((double)TermWt(i) * log((double)m_cDocuments / (double)DocFromCumFreq(Concept(i + iFirstConWt))));
        break;

    case WT_PROB:
        // NOTE(review): the log ratio is 0 when a concept is in exactly half
        // of the documents and negative when it is in more than half, so
        // this scheme can yield zero or negative weights - confirm that the
        // callers' fixed-point conversion expects that.
        for (i = 0; i < cConWts; i++)
            if (m_cDocuments == DocFromCumFreq(Concept(i + iFirstConWt)))
                TermWt(i) = (float) 0.005;
            else
                TermWt(i) = (float) ((double)TermWt(i) * log((double)(m_cDocuments - DocFromCumFreq(Concept(i + iFirstConWt))) / (double)DocFromCumFreq(Concept(i + iFirstConWt))));
        break;

    default:
        ASSERT(FALSE);
        break;
    }

    // Finally normalize the vector
    switch (NormType)
    {
    case NORM_NONE:
        break;

    case NORM_SUM:
        break;

    case NORM_COSINE:
        Wt = 0.0;
        // compute sum of squares of weights in the vector
        for (i = 0; i < cConWts; i++)
            Wt += TermWt(i) * TermWt(i);

        // the vector length is the square root of that sum
        Wt = sqrt(Wt);

        // Normalize each weight by the vector length. A vector whose weights
        // are all zero has length zero; dividing would turn every weight
        // into NaN (0/0), so such a vector is left untouched.
        if (Wt > 0.0)
        {
            for (i = 0; i < cConWts; i++)
                TermWt(i) = (float) ((double)TermWt(i) / Wt);
        }
        break;

    case NORM_MAX:
        break;
    }
}
|
|
|
|
// Decodes one value from the compressed document index.
//
//   startBitPos : in  - bit offset of the code to decode
//                 out - bit offset just past that code
//
// A code is a run of N 1 bits, a 0 bit, then N payload bits. The decoded
// value is a leading 1 followed by the payload bits. Returns that value
// exactly as stored (the first entry of a list was stored as docId + 1).
DWORD CCollection::GetDocumentGap(LPDWORD startBitPos)
{
    ASSERT(*startBitPos < m_cBitsUsedInEncoding);

    DWORD iByte       = *startBitPos / 8;
    BYTE  iBit        = (BYTE) (*startBitPos % 8);
    int   cPrefixBits = 0;
    int   cLeft;
    DWORD dwValue;

    // Count the leading 1 bits; the cursor stops on the terminating 0.
    while (CodeByte(iByte) & bitMask8[iBit])
    {
        cPrefixBits++;

        if (++iBit == 8)
        {
            iBit = 0;
            iByte++;
        }
    }

    // The whole code is prefix + terminator + payload.
    *startBitPos += 2*cPrefixBits + 1;
    ASSERT(*startBitPos <= m_cBitsUsedInEncoding);

    // Rebuild the value: start from the implied top bit and shift the
    // payload bits in from the right, one at a time.
    dwValue = 1;

    for (cLeft = cPrefixBits; cLeft > 0; cLeft--)
    {
        // Step to the next bit (the first step skips the terminating 0).
        if (++iBit == 8)
        {
            iBit = 0;
            iByte++;
        }

        dwValue <<= 1;

        // After the shift the lowest bit is 0; set it when the payload bit is 1.
        if (CodeByte(iByte) & bitMask8[iBit])
            dwValue |= bitMask32[31];
    }

    // Remember that we stored gap + 1
    return dwValue;
}
|