NT4/private/windows/win4help/ftsrch/dict.h

179 lines
6.8 KiB
C
Raw Normal View History

2001-01-01 00:00:00 +01:00
// This file contains definitions for class CDictionary
#ifndef __DICT_H__
#define __DICT_H__
// Signature of the external stemmer entry point held in CDictionary::m_lpfnStemmer.
// The (PWCHAR, WORD) argument pair mirrors CDictionary::StemWord(pWord, cCharsInWord);
// presumably a word buffer and its length in characters -- confirm against the stemmer library.
typedef void (APIENTRY *FPSTEMMER) (PWCHAR, WORD);
// Sentinel values. Both use the same bit pattern (0xFFFFFFFF): STOPWORD is
// meaningful in a concept-id field, EOL in a "pointer" (index) field.
#define STOPWORD 0xFFFFFFFF // this concept id indicates that the word is a stop word
#define EOL 0xFFFFFFFF // this pointer value indicates the end of list (NULL POINTER)
// Macros to simplify access to the virtual buffers.
//
// Each macro indexes the .Base pointer of one of CDictionary's
// MY_VIRTUAL_BUFFER members, so they can only be expanded where those members
// are in scope (i.e. inside CDictionary member functions).
//
// The DWORD/BYTE accessors expand to an lvalue and may be read or assigned.
// Stem(i) expands to a StemStruct pointer rather than a dereferenced element.
//
// The index argument and the whole replacement list are parenthesized so that
// arbitrary expressions (a | b, x << 1, c ? a : b, ...) can be passed as the
// index and so that the expansion composes safely with surrounding operators.
#define WordHashBucket(i) (*((LPDWORD)m_vbWordHashBuckets.Base + (i)))
#define StemHashBucket(i) (*((LPDWORD)m_vbStemHashBuckets.Base + (i)))
#if 0
// Disabled: m_vbWords is no longer compiled in; the word fields are held in
// separate buffers accessed through pImage/ConceptId/pNextWord below.
#define Word(i) ((WordStruct *)m_vbWords.Base + (i))
#endif // 0
#define pImage(i) (*((LPDWORD)m_vbpImage.Base + (i)))
#define ConceptId(i) (*((LPDWORD)m_vbConceptId.Base + (i)))
#define pNextWord(i) (*((LPDWORD)m_vbpNextWord.Base + (i)))
// Byte-granular views of the same two buffers as ConceptId/pNextWord.
// NOTE(review): presumably used when those arrays are stored bit-packed
// (see m_fWordsCompressed) -- confirm against the implementation file.
#define ConceptStreamByte(i) (*((LPBYTE)m_vbConceptId.Base + (i)))
#define pNextWordStreamByte(i) (*((LPBYTE)m_vbpNextWord.Base + (i)))
#define Stem(i) ((StemStruct *)m_vbStems.Base + (i))
// State of the dictionary. Defines the operations that can be performed.
// The current state is held in CDictionary::m_bDictState and reported by GetDictState().
#define UNDEFINED 0x00 // dictionary state is undefined
#define UNINSERTABLE 0x01 // dictionary does not allow insertions
#define DICT_UNUSABLE UNINSERTABLE // alias: same value as UNINSERTABLE
#define INSERTABLE 0x02 // dictionary allows insertions
#define STORABLE 0x03 // dictionary can now be stored
#define DICT_USABLE 0x04 // dictionary can now be used
// NOTE(review): HASHMASK keeps the low 14 bits, so masking yields indices
// 0..0x3FFF (0x4000 distinct values), one more than HASHTABLE_SIZE (0x3FFF).
// Confirm how the implementation derives the bucket index (mask vs. modulo)
// and how many buckets it actually allocates.
#define HASHTABLE_SIZE 0x3FFF // number of buckets in the hash table
#define HASHMASK 0x00003FFF // Used to convert a 32-bit value into a 14-bit hash index
// Library defined error codes. Bit 29 must be set!
// NOTE(review): bit 29 is the Win32 "application-defined" error-code flag, so
// these are presumably reported through SetLastError/RaiseException -- confirm
// against the implementation.
#define DICTERROR_BADINPUT 0xFFFFFF00
#define DICTERROR_OUTOFMEMORY 0xFFFFFF01
#define DICTERROR_BADMEMFREEATTEMPT 0xFFFFFF02
#define DICTERROR_BADSEQUENCE 0xFFFFFF03
#define MAXWORDLEN 128 // maximum characters in a word.
#if 0
// Disabled (compiled out). The three fields below are now kept as separate
// parallel arrays in CDictionary: m_vbpImage, m_vbConceptId and m_vbpNextWord.
// Retained only as a description of what one word entry logically contains.
typedef struct
{
DWORD pImage; // pointer to the string's image. This is an index into a character buffer.
DWORD ConceptId; // Unique concept identifier. For stop words, this is STOPWORD.
DWORD pNextWord; // pointer to the next word in the hash chain. This is an index into a WordStruct buffer.
} WordStruct;
#endif // 0
// One entry of the stem table. Entries live in CDictionary::m_vbStems and are
// addressed through the Stem(i) macro. Both fields are indices, not addresses.
// NOTE(review): the end of a hash chain is presumably marked with EOL -- confirm.
typedef struct
{
DWORD pImage; // pointer to the string's image. This is an index into a character buffer.
DWORD pNextStem; // pointer to the next stem in the hash chain. This is an index into a StemStruct buffer.
} StemStruct;
// Header describing a dictionary image; passed to CDictionary::ConnectImage
// together with the CPersist object it belongs to.
// NOTE(review): this looks like a persisted layout, so field order and sizes
// should be treated as fixed. What the "off*" offsets are relative to is not
// visible in this header -- confirm against StoreImage/ConnectImage.
typedef struct
{
DWORD cHashBuckets; // Number of buckets in the hash table
DWORD offHashBuckets; // offset of hash buckets
DWORD cWords; // Number of words in the dictionary
#if 0
DWORD offWords; // offset of words
#endif // 0
BYTE cConceptIdBits; // number of bits used to represent the concept id
BYTE cpNextWordBits; // number of bits used to represent pointers to the next word
BYTE Reserved1; // reserved
BYTE Reserved2; // reserved
DWORD offpImage; // offset of pointers to word images
DWORD offConceptId; // offset of concept ids
DWORD offpNextWord; // offset of pointers to next word
DWORD cWordChars; // Size of the word buffer, in characters - NOT BYTES
DWORD offWordChars; // offset of word images
DWORD cStems; // Number of stems in the dictionary
DWORD cStopWords; // Number of stopwords
} DictHdr;
// CDictionary: a word dictionary with hash tables for words and for stems.
// Each word carries a concept id; stop words carry the STOPWORD concept id.
//
// The constructor is private; the static functions NewDictionary and
// CreateImage are the members that hand back CDictionary pointers.
// The operations that are legal at a given time are governed by the dictionary
// state (UNDEFINED / UNINSERTABLE / INSERTABLE / STORABLE / DICT_USABLE).
class CDictionary
{
public:
//Creator
// fLoadStopWords defaults to TRUE.
// NOTE(review): ownership of the returned pointer is not visible here;
// presumably the caller deletes it -- confirm.
static CDictionary *NewDictionary(BOOL fLoadStopWords= TRUE);
// Destructor
~CDictionary();
// Access Functions:
// Insertions are bracketed by StartDictInsertions / EndDictInsertions.
// The four size arguments give estimated and maximum counts for words and
// for the word character buffer.
VOID StartDictInsertions(DWORD cEstWords, DWORD cMaxWords, DWORD cEstWordBufferSize, DWORD cMaxWordBufferSize);
VOID EndDictInsertions();
// pWord / cCharsInWord: the word and its length in characters.
// fStopWord and fLookup both default to FALSE.
// NOTE(review): the meaning of the DWORD result (concept id, STOPWORD, or a
// DICTERROR_* code) is not visible in this header -- confirm.
DWORD EnterWord(PWCHAR pWord, WORD cCharsInWord, BOOL fStopWord = FALSE, BOOL fLookup = FALSE);
// Information Functions:
BYTE GetDictState() { return m_bDictState; } // current state value
DWORD GetWordCount() { return m_cWords; } // number of words
DWORD GetStopWordCount() { return m_cStopWords; } // number of stop words
DWORD GetConceptCount() { return m_cStems; } // the concept count is the stem count
// Save/Load Functions
VOID StoreImage(CPersist *pDiskImage);
static CDictionary *CreateImage(CPersist *pDiskImage);
VOID ConnectImage(DictHdr *pdh, CPersist *pDiskImage);
// words with the common stem functions
// GetFirstWordOfConcept / GetNextWordOfConcept form a first/next enumeration;
// the position is tracked in m_ConIdInContext and m_LastOccurrenceOfConId.
// NOTE(review): what is returned when the enumeration is exhausted is not
// visible here -- confirm.
DWORD GetWordCountOfConcept(DWORD dwConId);
PWCHAR GetFirstWordOfConcept(DWORD dwConId);
PWCHAR GetNextWordOfConcept(DWORD dwConId);
private:
// Constructors
CDictionary();
// Initializer
void Initial();
// Internal functions.
void StemWord(PWCHAR pWord, WORD cCharsInWord);
DWORD ComputeHashKey(PWCHAR Word, WORD cCharsInWord);
DWORD EnterStem(PWCHAR WordStem);
DWORD AddWordToDict(PWCHAR pWord, WORD cCharsInWord);
DWORD AddStemToDict(PWCHAR pStem, WORD cCharsInStem);
// Indexed readers for the three per-word arrays.
// NOTE(review): presumably these hide the compressed vs. uncompressed
// representation (see m_fWordsCompressed) -- confirm.
DWORD GetpImage(DWORD i);
DWORD GetConceptId(DWORD i);
DWORD GetpNextWord(DWORD i);
private:
// Internal variables
// The following provide memory to implement the dictionary.
#if 0
MY_VIRTUAL_BUFFER m_vbWords; // An array of words.
#endif // 0
// WordStruct is broken into its components as follows.
MY_VIRTUAL_BUFFER m_vbpImage; // An array of pointers to Images
MY_VIRTUAL_BUFFER m_vbConceptId; // An array of concept ids
MY_VIRTUAL_BUFFER m_vbpNextWord; // An array of next words
MY_VIRTUAL_BUFFER m_vbStems; // An array of stems.
MY_VIRTUAL_BUFFER m_vbWordHashBuckets; // An array of buckets to implement the hash table for words.
MY_VIRTUAL_BUFFER m_vbStemHashBuckets; // An array of buckets to implement the hash table for stems.
MY_VIRTUAL_BUFFER m_vbWordBuffer; // Buffer to hold the word strings (images).
MY_VIRTUAL_BUFFER m_vbStemBuffer; // Buffer to hold the word stem strings (images).
MY_VIRTUAL_BUFFER m_vbpCopyOfWord; // Buffer to hold an internal copy of the word being processed.
MY_VIRTUAL_BUFFER m_vbpCopyOfWord2; // NOTE(review): presumably a second scratch copy; purpose not visible here
// The following track the state of the dictionary.
BYTE m_bDictState; // Indicates the current state of the dictionary.
DWORD m_cWordChars; // The number of word characters in the word buffer.
DWORD m_cStemChars; // The number of stem characters in the stem buffer.
DWORD m_cWords; // The number of words in the dictionary.
DWORD m_cStopWords; // The number of stop words in the dictionary.
DWORD m_cMaxWords; // The maximum number of words supported by the allocated memory.
DWORD m_cStems; // The number of stems in the dictionary.
DWORD m_cMaxStems; // The maximum number of stems supported by the allocated memory.
BYTE m_cConceptIdBits; // number of bits used to represent the concept id
BYTE m_cpNextWordBits; // number of bits used to represent the pointer to next word
BOOL m_fWordsCompressed; // Indicates if the Words array is compressed
BOOL m_fLoadedFromDisk; // Is this dictionary loaded from disk?
FPSTEMMER m_lpfnStemmer; // pointer to the stemmer function
HINSTANCE m_hStemmerInstance; // handle to the stemmer library instance
// ADDED TO TEST "WORDS OF COMMON STEM"
DWORD m_ConIdInContext; // The concept id that is being tracked in the first, <next> retrieval
DWORD m_LastOccurrenceOfConId; // last occurrence of this concept id in the word list
};
#endif //__DICT_H__