NT4/private/windows/win4help/ftsrch/vector.cpp

// This file contains the definition of class CCollection

#include  "stdafx.h"
#include <math.h>
#include "vmbuffer.h"
#include "memex.h"
#include "saveload.h"
#include "textset.h"

#include "vector.h"

// bitmasks for bit manipulations

// Single-bit masks for a 32-bit value, ordered from the most significant bit
// down to the least significant one: bitMask32[n] has only bit (31 - n) set.
DWORD bitMask32[] =
{
	0x80000000, 0x40000000, 0x20000000, 0x10000000, 0x08000000, 0x04000000, 0x02000000, 0x01000000,
	0x00800000, 0x00400000, 0x00200000, 0x00100000, 0x00080000, 0x00040000, 0x00020000, 0x00010000,
	0x00008000, 0x00004000, 0x00002000, 0x00001000, 0x00000800, 0x00000400, 0x00000200, 0x00000100,
	0x00000080, 0x00000040, 0x00000020, 0x00000010, 0x00000008, 0x00000004, 0x00000002, 0x00000001
};

// Same layout for a single byte: bitMask8[n] has only bit (7 - n) set.
BYTE bitMask8[] =
{
	0x80, 0x40, 0x20, 0x10,
	0x08, 0x04, 0x02, 0x01
};

// Constructors
/*************************************************************************
 *	FUNCTION : 															 *
 *                                                                       *
 *  RETURNS  :															 *
 *																		 *
 *	PURPOSE :															 *
 *																		 *
 *	PARAMETERS :														 *
 *																		 *
 *	SIDE EFFECTS :														 *
 *                                                                       *
 *  DESCRIPTION :                                                        *
 *                                                                       *
 *  HISTORY :                                                            *
 *                                                                       *
 *          Author          Date              Action                     *
 *          ------          ----              ------                     *
 *                                                                       *
 *          KrishnaN        4/23/94           Creation.                  *
 *																		 *
 *************************************************************************/

// Builds an empty collection: all counters are zero, no buffer is attached,
// and the state is COLL_UNUSABLE until Initialize() or ConnectImage() runs.
CCollection::CCollection()
{
	// Lifecycle flags
	m_bCollState            = COLL_UNUSABLE;
	m_fLoadedFromDisk       = FALSE;

	// Counters
	m_cConcepts             = 0;
	m_cDocuments            = 0;
	m_cConWts               = 0;
	m_cBitsUsedInEncoding   = 0;
#if 0
	m_cOverFlows            = 0;
	m_vbTFOverFlow          = NULL;
#endif

	// Plain allocations used while inverting (see WeightAndInvertVectors)
	m_acDocWts              = NULL;
	m_aWtInvIndex           = NULL;
	m_aDocInvIndex          = NULL;

	// Virtual buffers; a NULL Base marks a buffer that owns no memory
	m_vbConcepts      .Base = NULL;
	m_vbVectorRange   .Base = NULL;
	m_vbVectorConcept .Base = NULL;
	m_vbVectorTermFreq.Base = NULL;
	m_vbVectorWt      .Base = NULL;
	m_vbDocInvIndex   .Base = NULL;

	// Used for integration with Ron's code
	m_pts                   = NULL;
}

// Factory: allocates a CCollection and initializes it with the default sizing.
// Returns the new collection. If allocation or initialization raises an
// exception, the partially built object is deleted while the exception unwinds.
CCollection *CCollection::NewCollection()
{
    // Sizing passed to Initialize(): an initial estimate and a hard maximum for
    // (a) unique concepts (stems), (b) documents and (c) concepts summed across
    // all documents.
    // Original rationale for the maxima: assuming a minimum of one char per word
    // and one separator, the number of words in the document set is at most
    // cbArticles/2.
    const DWORD cInitialEstimate = 1024;
    const DWORD cMaxConcepts     = 2000000;
    const DWORD cMaxDocuments    = 10000000;
    const DWORD cMaxConWtPairs   = 10000000;

    CCollection *pNewColl = NULL;

    __try
    {
        pNewColl = New CCollection;

        pNewColl->Initialize(cInitialEstimate, cMaxConcepts,
                             cInitialEstimate, cMaxDocuments,
                             cInitialEstimate, cMaxConWtPairs);
    }
    __finally
    {
        // Only clean up when we are leaving because of an exception
        if (_abnormal_termination())
        {
            if (pNewColl)
            {
                delete pNewColl;
                pNewColl = NULL;
            }
        }
    }

    return pNewColl;
}

// Destructor
/*************************************************************************
 *	FUNCTION : 															 *
 *                                                                       *
 *  RETURNS  :															 *
 *																		 *
 *	PURPOSE :															 *
 *																		 *
 *	PARAMETERS :														 *
 *																		 *
 *	SIDE EFFECTS :														 *
 *                                                                       *
 *  DESCRIPTION :                                                        *
 *                                                                       *
 *  HISTORY :                                                            *
 *                                                                       *
 *          Author          Date              Action                     *
 *          ------          ----              ------                     *
 *                                                                       *
 *          KrishnaN        4/23/94           Creation.                  *
 *																		 *
 *************************************************************************/

// Releases everything the collection allocated itself. A collection that was
// connected to a disk image (m_fLoadedFromDisk) releases nothing here: its
// pointers were obtained from the CPersist object in ConnectImage().
CCollection::~CCollection()
{
	if (!m_fLoadedFromDisk)
	{
		// Plain allocations
		if (m_acDocWts    ) VFree(m_acDocWts    );
		if (m_aWtInvIndex ) VFree(m_aWtInvIndex );
		if (m_aDocInvIndex) VFree(m_aDocInvIndex);

		// Virtual buffers; a non-NULL Base means the buffer is still attached
		if (m_vbConcepts      .Base) FreeVirtualBuffer(&m_vbConcepts      );
		if (m_vbVectorRange   .Base) FreeVirtualBuffer(&m_vbVectorRange   );
		if (m_vbVectorConcept .Base) FreeVirtualBuffer(&m_vbVectorConcept );
		if (m_vbVectorTermFreq.Base) FreeVirtualBuffer(&m_vbVectorTermFreq);
		if (m_vbVectorWt      .Base) FreeVirtualBuffer(&m_vbVectorWt      );
		if (m_vbDocInvIndex   .Base) FreeVirtualBuffer(&m_vbDocInvIndex   );
#if 0
		if (m_vbTFOverFlow    .Base) FreeVirtualBuffer(&m_vbTFOverFlow);
#endif
	}
}

// Access Functions:
/*************************************************************************
 *	FUNCTION : 															 *
 *                                                                       *
 *  RETURNS  :															 *
 *																		 *
 *	PURPOSE :															 *
 *																		 *
 *	PARAMETERS :														 *
 *																		 *
 *	SIDE EFFECTS :														 *
 *                                                                       *
 *  DESCRIPTION :                                                        *
 *                                                                       *
 *  HISTORY :                                                            *
 *                                                                       *
 *          Author          Date              Action                     *
 *          ------          ----              ------                     *
 *                                                                       *
 *          KrishnaN        4/23/94           Creation.                  *
 *																		 *
 *************************************************************************/

// Creates the virtual buffers used while documents are being recorded and
// moves the collection from COLL_UNUSABLE to COLL_USABLE.
//
//   cInEstConcepts   / cInMaxConcepts   : initial / maximum number of concepts
//   cInEstDocuments  / cInMaxDocuments  : initial / maximum number of documents
//   cInEstConWtPairs / cInMaxConWtPairs : initial / maximum number of
//                                         (concept, weight) pairs over all documents
//
// Must only be called while the collection is COLL_UNUSABLE; calling it in any
// other state would leave the state undefined, so that is asserted.
void CCollection::Initialize(DWORD cInEstConcepts, DWORD cInMaxConcepts, DWORD cInEstDocuments, DWORD cInMaxDocuments, DWORD cInEstConWtPairs, DWORD cInMaxConWtPairs)
{
	ASSERT(m_bCollState == COLL_UNUSABLE);
	ASSERT(cInEstConcepts);

	// Concept dictionary
	CreateVirtualBuffer(&m_vbConcepts,
	                    cInEstConcepts * sizeof(ConceptStruct),
	                    cInMaxConcepts * sizeof(ConceptStruct));

	// Concept id of every (concept, weight) pair
	CreateVirtualBuffer(&m_vbVectorConcept,
	                    cInEstConWtPairs * sizeof(DWORD),
	                    cInMaxConWtPairs * sizeof(DWORD));

	// Term frequency of every (concept, weight) pair
	CreateVirtualBuffer(&m_vbVectorTermFreq,
	                    cInEstConWtPairs * sizeof(WORD),
	                    cInMaxConWtPairs * sizeof(WORD));

	// Floating point weights; created with an initial size of 0
	CreateVirtualBuffer(&m_vbVectorWt,
	                    0,
	                    cInMaxConWtPairs * sizeof(float));

	// Per-document sentinels into the (concept, weight) arrays
	CreateVirtualBuffer(&m_vbVectorRange,
	                    cInEstDocuments * sizeof(DWORD),
	                    cInMaxDocuments * sizeof(DWORD));
#if 0
	CreateVirtualBuffer(&m_vbTFOverFlow,
	                    0,
	                    0x4000 * sizeof(TFOverFlowStruct));
#endif

	// VirtualAlloc zeroes all memory it commits, so the virtual buffers do not
	// have to be cleared here.

	m_bCollState = COLL_USABLE;
}

// Records how many concepts the collection holds. The value is stored as is;
// no buffer is resized here.
void CCollection::SetNumberOfConcepts(DWORD cInConcepts)
{
	this->m_cConcepts = cInConcepts;
}

/*************************************************************************
 *	FUNCTION : 															 *
 *                                                                       *
 *  RETURNS  :															 *
 *																		 *
 *	PURPOSE :															 *
 *																		 *
 *	PARAMETERS :														 *
 *																		 *
 *	SIDE EFFECTS :														 *
 *                                                                       *
 *  DESCRIPTION :                                                        *
 *                                                                       *
 *  HISTORY :                                                            *
 *                                                                       *
 *          Author          Date              Action                     *
 *          ------          ----              ------                     *
 *                                                                       *
 *          KrishnaN        4/23/94           Creation.                  *
 *																		 *
 *************************************************************************/

// Records one occurrence of ConceptId in the document currently being built.
//
// If the concept is already present in the current document its term frequency
// is incremented (saturating at 0xFFFF). Otherwise a new (concept, frequency)
// pair with frequency 1 is appended and the concept's document frequency in
// the dictionary is incremented.
//
// Raises STATUS_NO_MEMORY when a virtual buffer cannot be extended.
void CCollection::RecordConcept(DWORD ConceptId)
{
	// Look for the concept among the con,wt pairs of the current document,
	// which occupy indices DocSentinel(m_cDocuments) .. m_cConWts - 1.
	DWORD iPair = DocSentinel(m_cDocuments);

	while (iPair < m_cConWts && Concept(iPair) != ConceptId)
		iPair++;

	if (iPair != m_cConWts)
	{
		// The term already exists in this document, so its frequency is at
		// least 1. Bump it unless it already sits at the 0xFFFF upper bound;
		// an overflow area for larger counts is not implemented.
		if (TermFreq(iPair) != 0xFFFF)
			(TermFreq(iPair))++;

		return;
	}

	// First occurrence in this document: append a new pair. Each store may
	// touch uncommitted memory; the exception filter extends the corresponding
	// virtual buffer, and if it cannot, we report an out of memory condition.
	__try
	{
		Concept(m_cConWts) = ConceptId;
	}
	__except (VirtualBufferExceptionFilter(GetExceptionCode(), GetExceptionInformation(), &m_vbVectorConcept))
	{
		RaiseException(STATUS_NO_MEMORY, EXCEPTION_NONCONTINUABLE, 0, NULL);
	}

	__try
	{
		TermFreq(m_cConWts) = 1;
	}
	__except (VirtualBufferExceptionFilter(GetExceptionCode(), GetExceptionInformation(), &m_vbVectorTermFreq))
	{
		RaiseException(STATUS_NO_MEMORY, EXCEPTION_NONCONTINUABLE, 0, NULL);
	}

	m_cConWts++;

	// One more document contains this concept
	__try
	{
		DocFreq(ConceptId)++;
	}
	__except (VirtualBufferExceptionFilter(GetExceptionCode(), GetExceptionInformation(), &m_vbConcepts))
	{
		RaiseException(STATUS_NO_MEMORY, EXCEPTION_NONCONTINUABLE, 0, NULL);
	}
}

/*************************************************************************
 *	FUNCTION : 															 *
 *                                                                       *
 *  RETURNS  :															 *
 *																		 *
 *	PURPOSE :															 *
 *																		 *
 *	PARAMETERS :														 *
 *																		 *
 *	SIDE EFFECTS :														 *
 *                                                                       *
 *  DESCRIPTION :                                                        *
 *                                                                       *
 *  HISTORY :                                                            *
 *                                                                       *
 *          Author          Date              Action                     *
 *          ------          ----              ------                     *
 *                                                                       *
 *          KrishnaN        4/23/94           Creation.                  *
 *																		 *
 *************************************************************************/

// Closes the document that was being recorded and starts the next one.
//
// The sentinel stored for the new document is the index of its first con,wt
// pair, so the pairs of document i span DocSentinel(i) .. DocSentinel(i+1) - 1.
//
// Raises STATUS_NO_MEMORY when the sentinel buffer cannot be extended.
void CCollection::NewDocument()
{
	++m_cDocuments;

	__try
	{
		DocSentinel(m_cDocuments) = m_cConWts;
	}
	__except (VirtualBufferExceptionFilter(GetExceptionCode(), GetExceptionInformation(), &m_vbVectorRange))
	{
		// The filter could not extend the buffer
		RaiseException(STATUS_NO_MEMORY, EXCEPTION_NONCONTINUABLE, 0, NULL);
	}
}

/*************************************************************************
 *	FUNCTION : 															 *
 *                                                                       *
 *  RETURNS  :															 *
 *																		 *
 *	PURPOSE :															 *
 *																		 *
 *	PARAMETERS :														 *
 *																		 *
 *	SIDE EFFECTS :														 *
 *                                                                       *
 *  DESCRIPTION :                                                        *
 *                                                                       *
 *  HISTORY :                                                            *
 *                                                                       *
 *          Author          Date              Action                     *
 *          ------          ----              ------                     *
 *                                                                       *
 *          KrishnaN        4/23/94           Creation.                  *
 *																		 *
 *************************************************************************/

// Computing the inverted index : The inverted index is a structure that lets us get a list of
// (document, weight) pairs for each concept. The document tells us the document in which this
// concept is present and the weight tells us the weight of this concept in the corresponding
// document. The inverted index is implemented as a huge array. We have as many entries in this
// index as we have (Concept, Wt) pairs. So we will first create memory based on that number.
// Then we create an array so that we can track the number of (Doc,Wt) pairs that went into the
// concept's doc,wt list at any given point. We then initialize the starting points of each
// concept's (doc, wt) list in this huge array. This computation is accomplished by using the
// DocFreq information we already computed for each concept.

// When we process a (concept, wt) pair in a document D, we obtain the location of the (D, wt)
// pair from the information we computed as described in the above paragraph. This scheme allows
// us to compute the inverted index with an O(n) algorithm, where n is the number of
// (Doc,Wt) pairs that constitute the inverted index.

// We will be using an intermediate array of floats to compute the weights. We will first copy
// the term frequencies from the document vectors into this array of floats. Weighting is
// performed on the floats, and they are plugged directly into the inverted index after being
// converted to fixed point. After all is said and done, the document vectors
// only contain the term frequencies. PERFECT!

// Weights every document vector and converts the collection into a compressed
// inverted index.
//
// TFModType, WeightType and NormType are passed unchanged to
// ApplyWeightingScheme() for every document.
//
// Steps performed:
//   1. Allocate the per-concept counters and the uncompressed inverted index
//      (document ids in m_aDocInvIndex, fixed point weights in m_aWtInvIndex).
//   2. Turn the per-concept document frequencies into starting offsets
//      (DocList) and cumulative frequencies (DocFreq), with one extra entry at
//      index m_cConcepts holding m_cConWts.
//   3. For each document: copy its term frequencies into the float scratch
//      array, weight them, then scatter (doc, weight) into the inverted lists.
//   4. Free the document vectors and encode each concept's document list as
//      gaps into m_vbDocInvIndex; DocList(i) is replaced by the bit position
//      at which concept i's list starts.
//
// On success the state becomes WEIGHTED. If an exception unwinds through this
// function, the partially built index is released and the state is set to
// COLL_UNUSABLE. Raises STATUS_NO_MEMORY when a virtual buffer cannot grow.
void CCollection::WeightAndInvertVectors(BYTE TFModType, BYTE WeightType, BYTE NormType)
{
	register DWORD i, j;	// variables to implement for loops
	DWORD k, l, m;			// variables to hold temporary values
	DWORD iFirstConWt;		// the first con,wt pair for this vector
	DWORD cConWts;			// number of conwts for this document

	// Compute the deltas
	DWORD dwDelta;	// used to hold the delta between successive document ids
	int cSavOneBits, cOneBits;	// holds the number of bits to be used to represent the first part of the gamma encoding
	DWORD dwSavBitPos;
	DWORD cByte = 0;	// used to track the number of bytes used in the coding scheme
	BYTE bitPos = 0;	// used to track the position in the byte where the current bit should be encoded

	// This routine is called to weight a collection. There is no reason to weight an
	// already weighted collection. It is illegal to weight an COLL_UNUSABLE collection. Refuse to do so.
	ASSERT(m_bCollState != COLL_UNUSABLE && m_bCollState != WEIGHTED);

	ASSERT(!m_acDocWts);

	__try
    {
    	// create an array to hold the count of (doc, wt) pairs added so far to a given concept
    	m_acDocWts = (LPDWORD)VAlloc(FALSE, m_cConcepts*sizeof(DWORD));

    	ZeroMemory(m_acDocWts, m_cConcepts*sizeof(DWORD));

        ASSERT(!m_aDocInvIndex);

    	// uncompressed inverted index, document ids: one DWORD per (doc, wt) pair
    	m_aDocInvIndex = (LPDWORD) VAlloc(FALSE, sizeof(DWORD) * m_cConWts);

        ASSERT(!m_aWtInvIndex);

    	// inverted index, weights: one fixed point WORD per (doc, wt) pair
    	m_aWtInvIndex = (LPWORD) VAlloc(NULL, sizeof(WORD) * m_cConWts);

    	// Set the pointers in the conceptstruct array so that they point to the right places in the array of
    	// (doc, wt) pairs
    	DocList(0) = 0;
    	// index for concept i = index for concept i-1 + number of documents in concept i-1
    	for (i = 1; i < m_cConcepts; i++)
    		DocList(i) = DocList(i - 1) + DocFreq(i - 1);

    	// now change the docfreq to hold the cumulative frequency, not the raw frequency
    	// the raw frequency for i can be recomputed by subtracting the value for i from the value for i + 1.

    	for (i = 0; i < m_cConcepts; i++)
    		DocFreq(i) = DocList(i);

    	// Cause an extra ConceptStruct to be allocated. This extra will be used to hold the
    	// total number of m_conwts. This can be used to compute the docfreq (df) for i as df(i+1) - df(i)
    	__try
    	{
    		DocFreq(m_cConcepts) = m_cConWts;
    	}
    	__except (VirtualBufferExceptionFilter(GetExceptionCode(), GetExceptionInformation(), &m_vbConcepts))
    	{
    		RaiseException(STATUS_NO_MEMORY, EXCEPTION_NONCONTINUABLE, 0, NULL);
    	}

    	// IMPORTANT : This for loop allows us to read each document vector from the disk and process the
    	//             document completely before moving on to the next document vector. We first weight and
    	//             normalize the vector and invert the vector after that.

    	for (i = 0; i < m_cDocuments; i++)
    	{
    		iFirstConWt = DocSentinel(i);	// the first conwt of this doc vector

    		// Copy the Term Frequencies into an array of floating points. All operations will be computed
    		// on these floating point weights. The final results can then be converted to a fixed point.
    		// IMPORTANT : ALL WEIGHTS SHOULD BE NORMALIZED TO ENSURE THAT EACH WEIGHT IS LESS THAN ONE.
    		//             THE FIXED POINT VALUE ONLY REPRESENTS VALUES BETWEEN 0.0 AND 1.0

    		cConWts = DocSentinel(i + 1) - iFirstConWt;	// number of conwts in this vector
    		for (j = 0; j < cConWts; j++)
    		{
    			// TermWt is indexed relative to the current document (0 .. cConWts - 1); the same
    			// scratch entries are overwritten for every document.
    			__try
    			{
    				TermWt(j) = (float)GetRealTermFreq(j + iFirstConWt);
    			}
    			__except (VirtualBufferExceptionFilter(GetExceptionCode(), GetExceptionInformation(), &m_vbVectorWt))
    			{
    				RaiseException(STATUS_NO_MEMORY, EXCEPTION_NONCONTINUABLE, 0, NULL);
    			}
    		}

    		//ApplyWeightingScheme(NEWTF_NONE, WT_TFIDF, NORM_COSINE, i);
    		ApplyWeightingScheme(TFModType, WeightType, NormType, iFirstConWt, cConWts);

    		// Now invert this document.

    		// k is the number of conwts in this vector
    		// m is the (Concept, Wt) pair of interest to us
    		// j tracks the number of (Concept, Wt) pairs processed so far for this document.
    		for (j = 0, k = DocSentinel(i + 1) - iFirstConWt, m = iFirstConWt; j < k; j++, m++)
    		{
    			// l is the index of the location in the inverted list array where the current (Doc,Wt) should go
    			// It is computed by adding the number of (Doc,Wt)s added so far to the current concept and the
    			// index where the first (Doc,Wt) for this concept should begin.
    			l = DocList(Concept(m)) + DocWtCount(Concept(m));

    			// Now copy the current (doc,wt) pair to the correct place in the inverted index array
    			Document(l) = i;
    			// ASSUMPTION : Each weight in TermWt is between 0.0 and 1.0
    			Weight(l) = (WORD)((double)TermWt(j) * (double)WT_ONE);

    			// Increase the counter to account for the addition of this document
    			DocWtCount(Concept(m))++;
    		}
    	}

    	// We don't need the m_acDocWts array any more
    	VFree(m_acDocWts);  m_acDocWts= NULL;

    	// Now that we have an inverted index, we don't need the con,wt document vectors anymore
        FreeVirtualBuffer(&m_vbVectorConcept );
    	FreeVirtualBuffer(&m_vbVectorTermFreq);
    	FreeVirtualBuffer(&m_vbVectorWt      );
    	FreeVirtualBuffer(&m_vbVectorRange   );
    #if 0
    	FreeVirtualBuffer(&m_vbTFOverFlow    );
    #endif

    	// Now compress the documents in the inverted index
    	// Estimate that we will need only a fourth	of the space (m_cConWts * 4 is the full number
    	// of bytes need to store the docs without compression)
    	// NOTE(review): a value v is coded below in 2*floor(log2(v)) + 1 bits, which is more than
    	// 32 bits once v >= 65536. If the third argument is a hard upper limit on the buffer size,
    	// a collection with many large first-document ids could exceed it - TODO confirm against
    	// CreateVirtualBuffer.
        CreateVirtualBuffer(&m_vbDocInvIndex, m_cConWts, m_cConWts * 4);

    	for (i = 0; i < m_cConcepts; i++)
    	{
    		// j holds the previous document id
    		// k holds the number of documents in the inverted list for this concept
    		k = DocFromCumFreq(i);
    		// remember the bit position at which this concept's list starts
    		dwSavBitPos = m_cBitsUsedInEncoding;

    		for (j = l = 0; l < k; l++)
    		{
    			// compute the compressed representation and add it
    			// The encoding scheme cannot encode 0, so we will map (0 to numdocs - 1) to (1 to numdocs)
    			// The 1 being added here accomplishes that mapping.
    			// As a result of this, the first document id is stored as docId + 1, but the subsequent
    			// gaps are stored as they are. When decoding, therefore, we have to adjust for the first
    			// doc and do not need to adjust for the remaining docs. in the inverted list for a concept.
    			dwDelta = DocIdFromInvList(i, l) + 1 - j;
    			ASSERT(dwDelta);	// dwDelta should always be greater than 0.
    			// Assume that there are at most 32 bits in the value that is being encoded.
    			// This assumption holds as long as we use a 32-bit value to store the initial document id
    			// Find m, the index into bitMask32 of the highest set bit of dwDelta.
    			for (m = 0; m < 32 && !(bitMask32[m] & dwDelta); m++);
    			ASSERT(m < 32);
    			// number of bits below the highest set bit
    			cSavOneBits = cOneBits = 31 - m;
    			// remove the highest 1 bit to get the remainder. removal is accomplished by xor'ing 1 with 1.
    			dwDelta ^= bitMask32[m];

    			// cOneBits one bits + one terminating zero bit + cOneBits remainder bits
    			m_cBitsUsedInEncoding += 2*cOneBits + 1;

    			// NOW ADD THE CODE BITS TO THE STREAM
    			// cByte/bitPos carry over from one code to the next, so all codes are packed back to back.
    			__try
    			{
    				// add cOneBits bits to the stream
    				for (; cOneBits; cOneBits--)
    				{
    					CodeByte(cByte) |= bitMask8[bitPos];
    					bitPos = (bitPos + 1) % 8;
    					if (bitPos == 0) cByte++;
    				}

    				ASSERT(bitPos < 8);
    				// add a terminating 0 at the end
    				CodeByte(cByte) &= ~bitMask8[bitPos];
    				// advance the bit position
    				bitPos = (bitPos + 1) % 8;
    				if (bitPos == 0) cByte++;

    				// Add the remainder bits from dwDelta. The number of remainder bits is equal to the number of one bits
    				// Remember that m indicates the position of the highest 1 bit. Start just below it and write cSavOneBits bits.
    				for (; cSavOneBits; cSavOneBits--)
    				{
    					if (bitMask32[++m] & dwDelta) // if true, we have a 1 bit
    						CodeByte(cByte) |= bitMask8[bitPos];
    					else	// we have a 0 bit
    						CodeByte(cByte) &= ~bitMask8[bitPos];

    					bitPos = (bitPos + 1) % 8;
    					if (bitPos == 0) cByte++;
    				}
    			}
    			__except (VirtualBufferExceptionFilter(GetExceptionCode(), GetExceptionInformation(), &m_vbDocInvIndex))
    			{
    				RaiseException(STATUS_NO_MEMORY, EXCEPTION_NONCONTINUABLE, 0, NULL);
    			}

    			// save doc id for use in the next iteration
    			// 1 is being added to map from 0 based numbering to 1 based numbering
    			j = DocIdFromInvList(i, l) + 1;
    		}

    		// now store the position of the first bit that codes the first document gap of the document inverted list
    		// Caution : This replaces the index previously stored there.
    		DocList(i) = dwSavBitPos;
    	}

    	m_bCollState = WEIGHTED;
    }
    __finally
    {
    	// We don't need the uncompressed inverted index any more
    	// (this cleanup runs on both the normal and the exceptional path)

    	if (m_aDocInvIndex) { VFree(m_aDocInvIndex);  m_aDocInvIndex = NULL; }
    	if (m_acDocWts    ) { VFree(m_acDocWts    );  m_acDocWts     = NULL; }

        // On failure also drop the partially built results and mark the collection unusable
        if (_abnormal_termination())
        {
        	if (m_vbDocInvIndex.Base) FreeVirtualBuffer(&m_vbDocInvIndex);

        	if (m_aWtInvIndex ) { VFree(m_aWtInvIndex );  m_aWtInvIndex = NULL; }

            m_bCollState= COLL_UNUSABLE;
        }
    }
}

/*************************************************************************
 *	FUNCTION : 															 *
 *                                                                       *
 *  RETURNS  :															 *
 *																		 *
 *	PURPOSE :															 *
 *																		 *
 *	PARAMETERS :														 *
 *																		 *
 *	SIDE EFFECTS :														 *
 *                                                                       *
 *  DESCRIPTION :                                                        *
 *                                                                       *
 *  HISTORY :                                                            *
 *                                                                       *
 *          Author          Date              Action                     *
 *          ------          ----              ------                     *
 *                                                                       *
 *          KrishnaN        4/23/94           Creation.                  *
 *																		 *
 *************************************************************************/

// Finishes the collection and writes it to pDiskImage.
//
// The last document is closed, the vectors are weighted and inverted with
// (NEWTF_NONE, WT_TFIDF, NORM_COSINE), and then a CollHdr followed by the
// concept table, the weight index and the compressed document index are
// written. The header records the offset of each of the three sections.
//
// Must be called on a usable collection that has not been weighted yet.
void CCollection::StoreImage(CPersist *pDiskImage)
{
    ASSERT(GetCollState() != COLL_UNUSABLE && GetCollState() != WEIGHTED);

	// Account for the last document
	NewDocument();

	WeightAndInvertVectors(NEWTF_NONE, WT_TFIDF, NORM_COSINE);

	CollHdr *pHeader = (CollHdr *) (pDiskImage->ReserveTableSpace(sizeof(CollHdr)));

	// Counts
	pHeader->cConcepts           = m_cConcepts;
	pHeader->cDocuments          = m_cDocuments;
	pHeader->cDocWtPairs         = m_cConWts;
	pHeader->cBitsUsedInEncoding = m_cBitsUsedInEncoding;

	// Concept table, including the extra entry at index m_cConcepts
	pHeader->offConcepts = pDiskImage->NextOffset();
	pDiskImage->SaveData((PBYTE) m_vbConcepts.Base, (m_cConcepts + 1) * sizeof(ConceptStruct));

	// Weights of the inverted index
	pHeader->offWtInvIndex = pDiskImage->NextOffset();
	pDiskImage->WriteWords(m_aWtInvIndex, m_cConWts);

	// Compressed document ids; the bit count is rounded up to whole bytes
	pHeader->offDocInvIndex = pDiskImage->NextOffset();
	pDiskImage->WriteBytes((PBYTE) m_vbDocInvIndex.Base, (m_cBitsUsedInEncoding + 7)/8);
}

// Factory: allocates a CCollection and connects it to a stored image.
// Returns the new collection. If allocation or ConnectImage() raises an
// exception, the object is deleted while the exception unwinds.
CCollection	* CCollection::CreateImage(CPersist *pDiskImage)
{
	CCollection *pLoadedColl = NULL;

	__try
	{
		pLoadedColl = New CCollection();

		pLoadedColl->ConnectImage(pDiskImage);
	}
	__finally
	{
		// Only clean up when we are leaving because of an exception
		if (_abnormal_termination())
		{
			if (pLoadedColl)
			{
				delete pLoadedColl;
				pLoadedColl = NULL;
			}
		}
	}

	return pLoadedColl;
}

// Attaches this collection to an image previously written by StoreImage().
//
// The counts are copied from the CollHdr, and the concept table, the
// compressed document index and the weight index are set to the locations
// that pDiskImage reports for the offsets stored in the header. The collection
// is flagged as loaded from disk, which makes the destructor skip freeing
// these pointers.
void CCollection::ConnectImage(CPersist *pDiskImage)
{
	// Set first, so that the destructor never frees image memory
	m_fLoadedFromDisk = TRUE;

	CollHdr *pHeader = (CollHdr *) (pDiskImage->ReserveTableSpace(sizeof(CollHdr)));

	// Counts
	m_cConcepts           = pHeader->cConcepts;
	m_cDocuments          = pHeader->cDocuments;
	m_cConWts             = pHeader->cDocWtPairs;
	m_cBitsUsedInEncoding = pHeader->cBitsUsedInEncoding;

	// Section locations inside the image
	m_vbConcepts   .Base = (LPVOID) pDiskImage->LocationOf(pHeader->offConcepts   );
	m_vbDocInvIndex.Base = (LPVOID) pDiskImage->LocationOf(pHeader->offDocInvIndex);
	m_aWtInvIndex        = (LPWORD) pDiskImage->LocationOf(pHeader->offWtInvIndex );

	// ready to use!
	m_bCollState = COLL_USABLE;
}

/*************************************************************************
 *	FUNCTION : 															 *
 *                                                                       *
 *  RETURNS  :															 *
 *																		 *
 *	PURPOSE :															 *
 *																		 *
 *	PARAMETERS :														 *
 *																		 *
 *	SIDE EFFECTS :														 *
 *                                                                       *
 *  DESCRIPTION :                                                        *
 *                                                                       *
 *  HISTORY :                                                            *
 *                                                                       *
 *          Author          Date              Action                     *
 *          ------          ----              ------                     *
 *                                                                       *
 *          KrishnaN        4/23/94           Creation.                  *
 *																		 *
 *************************************************************************/

// Weights one document vector in place.
//
// The weights live in TermWt(0) .. TermWt(cConWts - 1) and initially hold the
// raw term frequencies. The concept of entry i is Concept(i + iFirstConWt).
// Three steps are applied in order:
//
//   TFModType  - term frequency component:
//                  NEWTF_NONE    leave the term frequency untouched
//                  NEWTF_BINARY  every weight becomes 1.0
//                  NEWTF_MAXNORM tf / (max tf + 0.00001)
//                  NEWTF_AUGNORM 0.5 + 0.5 * tf / (max tf + 0.00001)
//   WeightType - collection frequency component (N = m_cDocuments,
//                df = number of documents containing the concept):
//                  WT_NONE   nothing
//                  WT_TFIDF  weight * log(N / df)
//                  WT_PROB   weight * log((N - df) / df)
//                A concept present in every document gets 0.005 instead of 0.0.
//   NormType   - vector normalization:
//                  NORM_COSINE divides by the Euclidean length of the vector
//                  NORM_NONE, NORM_SUM, NORM_MAX perform no normalization here
//
// An unknown selector triggers ASSERT(FALSE) and is otherwise ignored.
//
// NOTE(review): WT_PROB yields a negative weight whenever df > N/2, while the
// caller converts the weights to an unsigned fixed point value assuming the
// range 0.0 .. 1.0 - confirm before using WT_PROB.
void CCollection::ApplyWeightingScheme(BYTE TFModType, BYTE WeightType, BYTE NormType, DWORD iFirstConWt, DWORD cConWts)
{
	DWORD i;
	DWORD cDocsWithConcept;	// number of documents that contain the concept being weighted
	double Wt;	// used to hold different types of cumulative values at various points in the computations

	// First modify weight based on the term frequency component
	switch (TFModType)
	{
		case NEWTF_NONE:	// do nothing
			break;

		case NEWTF_BINARY:	// Since all the terms are in, turn them on
			for (i = 0; i < cConWts; i++)
				TermWt(i) = (float)1.0;
			break;

		case NEWTF_MAXNORM:
			// find the largest term frequency in the vector
			Wt = 0.0;
			for (i = 0; i < cConWts; i++)
				if (TermWt(i) > Wt)
					Wt = TermWt(i);

			// increase Max by 0.00001 so that every normalized TF stays below 1.0
			Wt += 0.00001;

			for (i = 0; i < cConWts; i++)
				TermWt(i) = (float) ((double)TermWt(i)/Wt);
			break;

		case NEWTF_AUGNORM:
			// find the largest term frequency in the vector
			Wt = 0.0;
			for (i = 0; i < cConWts; i++)
				if (TermWt(i) > Wt)
					Wt = TermWt(i);

			// increase Max by 0.00001 so that every normalized TF stays below 1.0
			Wt += 0.00001;

			for (i = 0; i < cConWts; i++)
				TermWt(i) = (float) (0.5 + 0.5 * (double)TermWt(i) / Wt);
			break;

		default:
			ASSERT(FALSE);
			break;
	}

	// Then modify the weight based on the collection frequency component
	switch (WeightType)
	{
		case WT_NONE:	// do nothing
			break;

		// if a concept occurs in all docs, let's assign it a small value instead of assigning it a 0.0
		case WT_TFIDF:
			for (i = 0; i < cConWts; i++)
			{
				// look the document frequency up once per entry
				cDocsWithConcept = DocFromCumFreq(Concept(i + iFirstConWt));

				if (m_cDocuments == cDocsWithConcept)
					TermWt(i) = (float) 0.005;
				else
					TermWt(i) = (float) ((double)TermWt(i) * log((double)m_cDocuments / (double)cDocsWithConcept));
			}
			break;

		case WT_PROB:
			for (i = 0; i < cConWts; i++)
			{
				// look the document frequency up once per entry
				cDocsWithConcept = DocFromCumFreq(Concept(i + iFirstConWt));

				if (m_cDocuments == cDocsWithConcept)
					TermWt(i) = (float) 0.005;
				else
					TermWt(i) = (float) ((double)TermWt(i) * log((double)(m_cDocuments - cDocsWithConcept) / (double)cDocsWithConcept));
			}
			break;

		default:
			ASSERT(FALSE);
			break;
	}

	// Finally normalize the vector
	switch (NormType)
	{
		case NORM_NONE:
			break;

		case NORM_SUM:	// accepted, but no normalization is performed
			break;

		case NORM_COSINE:
			Wt = 0.0;
			// compute sum of squares of weights in the vector
			for (i = 0; i < cConWts; i++)
				Wt += TermWt(i) * TermWt(i);

			Wt = sqrt(Wt);

			// Normalize each weight by the square root of the sum of squares.
			// A zero length means every weight is already 0.0; dividing would
			// turn them into NaN, so such a vector is left untouched.
			if (Wt > 0.0)
			{
				for (i = 0; i < cConWts; i++)
					TermWt(i) = (float) ((double)TermWt(i) / Wt);
			}
			break;

		case NORM_MAX:	// accepted, but no normalization is performed
			break;

		default:	// consistent with the two switches above
			ASSERT(FALSE);
			break;
	}
}

// Decodes one value from the compressed document index.
//
// startBitPos : in  - bit position at which the code starts
//               out - bit position just past the code
//
// A code consists of c one bits, a terminating zero bit, and c further bits.
// The decoded value is a leading 1 followed by those c bits. Returns the value
// exactly as it was stored: the first entry of a list was stored as
// document id + 1, later entries as plain gaps.
DWORD CCollection::GetDocumentGap(LPDWORD startBitPos)
{
	ASSERT(*startBitPos < m_cBitsUsedInEncoding);

	// Absolute position of the bit being examined; byte = cursor / 8, bit = cursor % 8
	DWORD bitCursor = *startBitPos;
	DWORD iByte     = bitCursor / 8;
	DWORD iBit      = bitCursor % 8;

	// Count the leading one bits; the scan stops on the terminating zero bit
	int cPrefixBits = 0;
	while (CodeByte(iByte) & bitMask8[iBit])
	{
		cPrefixBits++;
		bitCursor++;
		iByte = bitCursor / 8;
		iBit  = bitCursor % 8;
	}

	// The complete code occupies cPrefixBits + 1 + cPrefixBits bits
	*startBitPos += 2*cPrefixBits + 1;
	ASSERT(*startBitPos <= m_cBitsUsedInEncoding);

	// Rebuild the value: start with the implicit high 1 bit, then shift in the
	// bits that follow the terminating zero, most significant first.
	DWORD dwValue = 1;
	for (int cRead = 0; cRead < cPrefixBits; cRead++)
	{
		// The first step moves past the terminating zero bit
		bitCursor++;
		iByte = bitCursor / 8;
		iBit  = bitCursor % 8;

		dwValue <<= 1;
		if (CodeByte(iByte) & bitMask8[iBit])
			dwValue |= bitMask32[31];	// set the lowest bit
	}

	// Remember that the first entry of a list was stored as id + 1
	return dwValue;
}