NT4/private/windows/win4help/ftsrch/query.h
2020-09-30 17:12:29 +02:00

118 lines
4.4 KiB
C++

// This file contains the definition of class CQuery
#ifndef __QUERY_H__
#define __QUERY_H__
// Error definitions.
// Ten distinct 32-bit codes with consecutive values 0xFFFFFF21 - 0xFFFFFF2A.
// NOTE(review): this header does not show which CQuery entry points report
// these codes -- presumably the DWORD-returning ones; confirm in the
// implementation.
#define QUERYERROR_OUTOFMEMORY 0xFFFFFF21
#define QUERYERROR_NODOCUMENTS 0xFFFFFF22
#define QUERYERROR_INVALIDCOLLECTION 0xFFFFFF23
#define QUERYERROR_NOCONCEPTS 0xFFFFFF24
#define QUERYERROR_EMPTYQUERY 0xFFFFFF25
#define QUERYERROR_EMPTYSIMARRAY 0xFFFFFF26
#define QUERYERROR_NOHITS 0xFFFFFF27
#define QUERYERROR_NODOCWTS 0xFFFFFF28
#define QUERYERROR_NORELEVANTDOCS 0xFFFFFF29
#define QUERYERROR_BADSIMINFO 0xFFFFFF2A
// Relevance feedback: the user's judgement of a document.
// The four marks take the values 0x00 - 0x03.
// NOTE(review): where a mark is stored per document is not visible in this
// header.
#define UNMARKED 0x00 // User hasn't seen it yet
#define RELEVANT 0x01 // User found the document to be relevant
#define NOTRELEVANT 0x02 // User found the document to be irrelevant
#define DONOTKNOW 0x03 // User could not decide if the document is relevant or not
// Weights. A 1.0 is represented in fixed point as 0xFFFF
#define WT_ONE 0xFFFF
// All-ones 32-bit value.
// NOTE(review): presumably a "not found" sentinel for DWORD results such as
// the list-position lookups in CQuery -- confirm against the implementation.
// It has the same numeric value as MAXSIM below.
#define DOESNOTEXIST 0xFFFFFFFF
// NOTE: I tested with the similarity field limited to 16 bits, and the similarity values are not that
// different from the similarity values obtained when the similarity field is limited to 24 bits or
// allowed to be 32 bits. The three most significant digits were always the same. This is not
// surprising, because we are distributing a range of 0.0 to 1.0 between 0 and 64K (when limited to
// 16 bits), etc. Since the original range is so small, the compromise in bits is not a real problem.
#define MAXSIM 0xFFFFFFFF // 0.0 - 1.0 is mapped to 0 - MAXSIM.
// One entry of a similarity (hit) array: identifies a document and records
// how similar it is to the query. Arrays of SimStruct are passed to
// CQuery::RankDocuments() and CQuery::RetrieveWithFeedback().
typedef struct
{
DWORD DocId; // Document Id
DWORD Similarity; // Similarity of this document to this query
                  // (presumably on the 0 - MAXSIM scale -- confirm)
DWORD CollId; // Collection Id
} SimStruct;
// A (concept, weight) pair.
// NOTE(review): the name suggests a temporary record used while sorting or
// merging concept weights; its actual use is not visible in this header.
typedef struct
{
DWORD ConceptId; // Concept Id
DWORD Weight; // Weight of the concept (presumably fixed point, WT_ONE == 1.0 -- confirm)
} TempConWtStruct;
// Comparison callbacks. Both have the (const void *, const void *) -> int
// shape and _cdecl calling convention that the C runtime's qsort()/bsearch()
// expect of a comparator.
// NOTE(review): the sort key and direction (e.g. ascending ConceptId,
// descending Similarity) are defined in the implementation, not here.
int _cdecl CompareTempConWtStruct(const void *arg1, const void *arg2);
int _cdecl CompareSimStruct(const void *arg1, const void *arg2);
class CDictionary;
// CQuery holds the state of one query against a text set: the query text,
// pointers to index arrays belonging to the collection, and the query's
// (concept, term-frequency) vector.
//
// The constructor is private; NewQuery() is the only public creator declared
// here. The destructor is public.
//
// NOTE(review): no copy constructor or assignment operator is declared, so
// the compiler-generated ones apply; whether copying is safe for the buffer
// members cannot be told from this header.
// NOTE(review): CTextSet, MY_VIRTUAL_BUFFER and the Win32 types (DWORD, BYTE,
// BOOL, PWCHAR, LPSTR, ...) are not declared in the visible part of this
// header, so they must come from headers included before it.
class CQuery
{
public:
// Creator
// Returns a CQuery associated with the text set pts.
// NOTE(review): failure behaviour (NULL return vs. exception) and ownership
// of the returned object are not visible here -- presumably the caller
// deletes it; confirm.
static CQuery *NewQuery(CTextSet *pts);
// Destructor
~CQuery();
// Access Functions:
// Takes one concept id for this query (presumably adds an occurrence of the
// concept to the query vector -- confirm).
void RecordConcept(DWORD ConceptId);
// Takes the same three selector bytes as ApplyWeightingScheme(); their legal
// values are not defined in this header. Presumably returns TRUE on success.
BOOL WeightVector(BYTE TFModType, BYTE WeightType, BYTE NormType);
// aInSimilarity is an array of SimStruct and cInHits presumably its element
// count. Presumably returns TRUE on success.
BOOL RankDocuments(SimStruct *aInSimilarity, DWORD cInHits);
// aInSimilarity / cInMaxHits: SimStruct array and, presumably, its capacity.
// pwRelDocText / cwRelDocText and pwNonRelDocText / cwNonRelDocText: wide-
// character text and length for the relevant and non-relevant documents
// (presumably lengths are in characters -- confirm).
// NOTE(review): the meaning of the DWORD result (hit count and/or a
// QUERYERROR_* code) is not visible here.
DWORD RetrieveWithFeedback(SimStruct *aInSimilarity, DWORD cInMaxHits,
PWCHAR pwRelDocText, int cwRelDocText,
PWCHAR pwNonRelDocText, int cwNonRelDocText
);
private:
// Constructors
// Private: outside code cannot construct a CQuery directly.
CQuery();
// Initializer
// cInEstConWtPairs / cInMaxConWtPairs: presumably the estimated and maximum
// number of (concept, weight) pairs used to size the buffers -- confirm.
void Initialize(CTextSet *textsetIn, DWORD cInEstConWtPairs, DWORD cInMaxConWtPairs);
// Internal functions.
// iFirstConWt / cConWts: presumably the first index and count of the
// (concept, weight) pairs the scheme is applied to -- confirm.
void ApplyWeightingScheme(BYTE TFModType, BYTE WeightType, BYTE NormType, DWORD iFirstConWt, DWORD cConWts);
// Two lookups with identical signatures: position of DocId within the first
// cInHits entries of aInSimilarity.
// NOTE(review): how the two differ, and the value returned when DocId is
// absent (presumably DOESNOTEXIST), are not visible here.
DWORD GetDocPosInList(SimStruct *aInSimilarity, DWORD cInHits, DWORD DocId);
DWORD GetDocPosInList2(SimStruct *aInSimilarity, DWORD cInHits, DWORD DocId);
void SortQuery();
// fRelevant presumably selects whether the text counts as relevant or
// non-relevant feedback -- confirm.
void IndexDocumentText(PWCHAR pwDocText, int cwText, BOOL fRelevant);
private:
// Internal variables
// The following provides memory to implement the query class.
LPSTR m_pszQueryText; // buffer to hold the query text.
LPWORD m_aWtInvIndex; // Pointer to corresponding array in collection
LPDWORD m_aDocInvIndex; // Pointer to corresponding array in collection
CTextSet *m_ptdb; // Pointer to the text database.
MY_VIRTUAL_BUFFER m_vbConcepts; // buffer to hold array of concepts
// The vectors are (concept, freq) pairs. We are implementing that as two structures. One is an array of
// concepts and the other is an array of term frequencies. If we have to implement the tuple as one
// structure, we will be wasting a WORD for every structure.
MY_VIRTUAL_BUFFER m_vbVectorConcept; // the concept part of the vector representation
MY_VIRTUAL_BUFFER m_vbVectorTermFreq;// the term frequency part of the vector representation
MY_VIRTUAL_BUFFER m_vbVectorWt; // the temporary buffer used to convert term freq to weight
                                // (original comment read "to buffer"; presumably "weight" -- confirm)
#if 0
MY_VIRTUAL_BUFFER m_vbTFOverFlow; // buffer to hold the term frequencies that are GT 64K.
// It is very unlikely that we will ever have a term that occurs more than 64K
// times in a document, but that case should be accounted for.
#endif
// The following track the state of the collection.
DWORD m_cConWts; // number of ConWt pairs seen so far
#if 0
DWORD m_cOverFlows; // number of term frequency overflows
#endif
DWORD m_cDocuments; // total number of documents in the collection for which this query is indexed.
};
#endif // __QUERY_H__