173 lines
6.9 KiB
C++
173 lines
6.9 KiB
C++
// This file contains the definition of class CVector
|
|
#ifndef __VECTOR_H__
|
|
|
|
#define __VECTOR_H__
|
|
|
|
// weighting constants
|
|
#define NEWTF_NONE 0
|
|
#define NEWTF_BINARY 1
|
|
#define NEWTF_MAXNORM 2
|
|
#define NEWTF_AUGNORM 3
|
|
#define WT_NONE 4
|
|
#define WT_TFIDF 5
|
|
#define WT_PROB 6
|
|
#define NORM_NONE 7
|
|
#define NORM_SUM 8
|
|
#define NORM_COSINE 9
|
|
#define NORM_MAX 10
|
|
|
|
// state definition constants
|
|
#define COLL_USABLE 0x00
|
|
#define COLL_UNUSABLE 0x01
|
|
#define WEIGHTED 0x02
|
|
|
|
// error definitions
|
|
#define COLLERROR_NOCONCEPTS 0xFFFFFF10
|
|
#define COLLERROR_OUTOFMEMORY 0xFFFFFF11
|
|
#define COLLERROR_BADSEQUENCE 0xFFFFFF12
|
|
#define COLLERROR_BADINPUT 0xFFFFFF13
|
|
|
|
// Weights. A 1.0 is represented in fixed point as 0xFFFF
|
|
#define WT_ONE 0xFFFF
|
|
|
|
// macros to enhance readability
|
|
#define DocSentinel(i) *((LPDWORD)m_vbVectorRange.Base + i)
|
|
#define Concept(i) *((LPDWORD)m_vbVectorConcept.Base + i)
|
|
#define TermFreq(i) *((LPWORD)m_vbVectorTermFreq.Base + i)
|
|
#define TermWt(i) *((float *)m_vbVectorWt.Base + i)
|
|
// BugBug : For now assume that no term frequency will overflow. So get the value from
|
|
// the term freq array. Later on, however, you will have to see if there is
|
|
// an over flow and if so, get the overflow.
|
|
#define GetRealTermFreq(i) *((LPWORD)m_vbVectorTermFreq.Base + i)
|
|
// Macro DocFreq is only meaningful UNTIL inversion. Before inversion, we reuse the cDocFreq field
|
|
// to hold the cumulative document frequencies instead of the raw frequencies. The advantage is that
|
|
// we don't have to use an extra field to hold a pointer to the wt list of a concept. We do, however,
|
|
// still have to maintain a field for the Doc list of a concept because this list if compressed.
|
|
#define DocFreq(i) ((ConceptStruct *)m_vbConcepts.Base + i)->cDocFreq
|
|
// DocFromCumFreq is only meaningful FROM inversion. cDocFreq changes from a holder of raw doc count
|
|
// to a pointer to the beginning of the list.
|
|
#define DocFromCumFreq(i) (((ConceptStruct *)m_vbConcepts.Base + i+1)->cDocFreq - ((ConceptStruct *)m_vbConcepts.Base + i)->cDocFreq)
|
|
#define DocList(i) ((ConceptStruct *)m_vbConcepts.Base + i)->pDocList
|
|
#define CodeByte(i) *((LPBYTE)m_vbDocInvIndex.Base + i)
|
|
#define DocWtCount(i) m_acDocWts[i]
|
|
#define Document(i) m_aDocInvIndex[i]
|
|
#define Weight(i) m_aWtInvIndex[i]
|
|
|
|
#define DocIdFromInvList(con, i) m_aDocInvIndex[((ConceptStruct *)m_vbConcepts.Base + con)->pDocList + i]
|
|
#define WtFromInvList(con, i) m_aWtInvIndex[((ConceptStruct *)m_vbConcepts.Base + con)->cDocFreq + i]
|
|
|
|
typedef struct
|
|
{
|
|
DWORD cDocFreq; // the number of documents in the collection, in which this concept occurs at least once
|
|
// just before inversion, cDocFreq is reused to hold the cumulative values. The advantage
|
|
// is that we can avoid using a third field to point to the wts of the docs.
|
|
DWORD pDocList; // a pointer to the list of documents in which this concept occurs. This is an index into an array.
|
|
} ConceptStruct;
|
|
|
|
#if 0
|
|
typedef struct
|
|
{
|
|
DWORD iConWtIndex; // index of the Con, Wt pair that has a termfreq greater than 64K
|
|
DWORD cTermFreq; // the overflowing value
|
|
} TFOverFlowStruct;
|
|
#endif
|
|
|
|
typedef struct
|
|
{
|
|
DWORD cConcepts; // Number of concepts in the dictionary of this coll
|
|
DWORD offConcepts;
|
|
DWORD cDocuments; // Number of documents in the collection
|
|
DWORD cDocWtPairs; // Number of doc,wt [ = con,freq ] pairs
|
|
DWORD offWtInvIndex;
|
|
DWORD offDocInvIndex;
|
|
DWORD cBitsUsedInEncoding;// Number of bits used to encode the doc inverted index.
|
|
#if 0
|
|
DWORD cOverFlows; // Number of overflows.
|
|
#endif
|
|
} CollHdr;
|
|
|
|
class CTextSet;
|
|
|
|
class CCollection
|
|
{
|
|
friend class CQuery;
|
|
|
|
public:
|
|
|
|
// Creator
|
|
|
|
static CCollection *NewCollection();
|
|
|
|
// Destructor
|
|
~CCollection();
|
|
|
|
// Access Functions:
|
|
void Initialize(DWORD cInEstConcepts, DWORD cInMaxConcepts, DWORD cInEstDocuments, DWORD cInMaxDocuments, DWORD cInEstConWtPairs, DWORD cInMaxConWtPairs);
|
|
void RecordConcept(DWORD ConceptId);
|
|
void NewDocument();
|
|
void WeightAndInvertVectors(BYTE TFModType, BYTE WeightType, BYTE NormType);
|
|
BOOL Serialize(HANDLE hInFile, BOOL fSaveVectors);
|
|
BOOL Unserialize(HANDLE hInFile);
|
|
void SetNumberOfConcepts(DWORD cInConcepts);
|
|
|
|
// Information Functions:
|
|
BYTE GetCollState() { return m_bCollState; }
|
|
BOOL IsConceptIdValid(DWORD ConceptId) { if (ConceptId > m_cConcepts) return FALSE; return TRUE;}
|
|
DWORD GetDocumentCount() {return m_cDocuments;}
|
|
|
|
// Save/Load Functions
|
|
void StoreImage(CPersist *pDiskImage);
|
|
static CCollection *CreateImage(CPersist *pDiskImage);
|
|
void ConnectImage(CPersist *pDiskImage);
|
|
|
|
private:
|
|
// Constructor
|
|
CCollection();
|
|
|
|
// Internal functions.
|
|
void ApplyWeightingScheme(BYTE TFModType, BYTE WeightType, BYTE NormType, DWORD iFirstConWt, DWORD cConWts);
|
|
DWORD GetDocumentGap(LPDWORD startBitPos);
|
|
|
|
private:
|
|
// Internal variables
|
|
// The following provides memory to implement the collection.
|
|
LPDWORD m_acDocWts; // array of doc,wt pair counts used to aid in the inversion process
|
|
LPWORD m_aWtInvIndex; // wt component of the Doc,Wt inverted index
|
|
LPDWORD m_aDocInvIndex; // Doc component of the Doc,Wt inverted index
|
|
// The vectors are (concept, freq) pairs. We are implementing that as two structures. One is an array of
|
|
// concepts and the other is an array of term frequencies. If we have to implement the tuple as one
|
|
// structure, we will be wasting a WORD for every structure.
|
|
MY_VIRTUAL_BUFFER m_vbConcepts; // buffer to hold an array of concept structures
|
|
MY_VIRTUAL_BUFFER m_vbVectorRange; // tracks the start and end of vector representation for a given document
|
|
// in the (Concept, Freq) array
|
|
MY_VIRTUAL_BUFFER m_vbVectorConcept; // the concept part of the vector representation
|
|
MY_VIRTUAL_BUFFER m_vbVectorTermFreq; // the term frequency part of the vector representation
|
|
MY_VIRTUAL_BUFFER m_vbVectorWt; // the temporary buffer used to convert term freq to buffer
|
|
MY_VIRTUAL_BUFFER m_vbDocInvIndex; // the buffer used to store the compressed document gaps in the document index
|
|
#if 0
|
|
MY_VIRTUAL_BUFFER m_vbTFOverFlow; // buffer to hold the term frequencies that are GT 64K.
|
|
// It is very unlikely that we will ever have a term that occurs more than 64K
|
|
// times in a document, but that case should be accounted for.
|
|
#endif
|
|
|
|
// The following track the state of the collection.
|
|
BYTE m_bCollState; // tracks the state of the collection
|
|
DWORD m_cConcepts; // number of unique concepts in the dictionary
|
|
DWORD m_cDocuments; // number of documents in the collection
|
|
DWORD m_cConWts; // number of ConWt pairs seen so far
|
|
#if 0
|
|
DWORD m_cOverFlows; // number of term frequency overflows
|
|
#endif
|
|
DWORD m_cBitsUsedInEncoding; // number of bits used to encode the doc gaps in the inverted list
|
|
BOOL m_fLoadedFromDisk; // indicates if it has been loaded from disk
|
|
|
|
// Used for integration with Ron's code
|
|
CTextSet *m_pts;
|
|
|
|
public:
|
|
CTextSet * PTextSet() {return m_pts;};
|
|
void SetTextSet(CTextSet *pts) {m_pts = pts;};
|
|
};
|
|
|
|
#endif // __VECTOR_H__
|