// This file contains definitions for class CDictionary #ifndef __DICT_H__ #define __DICT_H__ typedef void (APIENTRY *FPSTEMMER) (PWCHAR, WORD); #define STOPWORD 0xFFFFFFFF // this concept id indicates that the word is a stop word #define EOL 0xFFFFFFFF // this pointer value indicates the end of list (NULL POINTER) //Macros to simplify access to the virtual buffers #define WordHashBucket(i) *((LPDWORD)m_vbWordHashBuckets.Base + i) #define StemHashBucket(i) *((LPDWORD)m_vbStemHashBuckets.Base + i) #if 0 #define Word(i) ((WordStruct *)m_vbWords.Base + i) #endif // 0 #define pImage(i) *((LPDWORD)m_vbpImage.Base + i) #define ConceptId(i) *((LPDWORD)m_vbConceptId.Base + i) #define pNextWord(i) *((LPDWORD)m_vbpNextWord.Base + i) #define ConceptStreamByte(i) *((LPBYTE)m_vbConceptId.Base + i) #define pNextWordStreamByte(i) *((LPBYTE)m_vbpNextWord.Base + i) #define Stem(i) ((StemStruct *)m_vbStems.Base + i) // State of the dictionary. Defines the operations that can be performed. #define UNDEFINED 0x00 // dictionary state is undefined #define UNINSERTABLE 0x01 // dictionary does not allow insertions #define DICT_UNUSABLE UNINSERTABLE #define INSERTABLE 0x02 // dictionary allows insertions #define STORABLE 0x03 // dictionary can now be stored #define DICT_USABLE 0x04 // dictionary can now be used #define HASHTABLE_SIZE 0x3FFF // number of buckets in the hash table #define HASHMASK 0x00003FFF // Used to convert a 32-bit value into a 15-bit hash index // Library defined error codes. Bit 29 must be set! #define DICTERROR_BADINPUT 0xFFFFFF00 #define DICTERROR_OUTOFMEMORY 0xFFFFFF01 #define DICTERROR_BADMEMFREEATTEMPT 0xFFFFFF02 #define DICTERROR_BADSEQUENCE 0xFFFFFF03 #define MAXWORDLEN 128 // maximum characters in a word. #if 0 typedef struct { DWORD pImage; // pointer to the string's image. This is an index into a character buffer. DWORD ConceptId; // Unique concept identifier. For stop words, this is STOPWORD. DWORD pNextWord; // pointer to the next word in the hash chain. This is an index into a WordStruct buffer. } WordStruct; #endif // 0 typedef struct { DWORD pImage; // pointer to the string's image. This is an index into a character buffer. DWORD pNextStem; // pointer to the next stem in the hash chain. This is an index into a StemStruct buffer. } StemStruct; typedef struct { DWORD cHashBuckets; // Number of buckets in the hash table DWORD offHashBuckets; // offset of hash buckets DWORD cWords; // Number of words in the dictionary #if 0 DWORD offWords; // offset of words #endif // 0 BYTE cConceptIdBits; // number of bits used to represent the concept id BYTE cpNextWordBits; // number of bits used to represent pointers to next image BYTE Reserved1; // reserved BYTE Reserved2; // reserved DWORD offpImage; // offset of pointers to next image DWORD offConceptId; // offset of concept ids DWORD offpNextWord; // offset of pointers to next word DWORD cWordChars; // Size of the word buffer, in characters - NOT BYTES DWORD offWordChars; // offset of word images DWORD cStems; // Number of stems in the dictionary DWORD cStopWords; // Number of stopwords } DictHdr; class CDictionary { public: //Creator static CDictionary *NewDictionary(BOOL fLoadStopWords= TRUE); // Destructor ~CDictionary(); // Access Functions: VOID StartDictInsertions(DWORD cEstWords, DWORD cMaxWords, DWORD cEstWordBufferSize, DWORD cMaxWordBufferSize); VOID EndDictInsertions(); DWORD EnterWord(PWCHAR pWord, WORD cCharsInWord, BOOL fStopWord = FALSE, BOOL fLookup = FALSE); // Information Functions: BYTE GetDictState() { return m_bDictState; } DWORD GetWordCount() { return m_cWords; } DWORD GetStopWordCount() { return m_cStopWords; } DWORD GetConceptCount() { return m_cStems; } // Save/Load Functions VOID StoreImage(CPersist *pDiskImage); static CDictionary *CreateImage(CPersist *pDiskImage); VOID ConnectImage(DictHdr *pdh, CPersist *pDiskImage); // words with the common stem functions DWORD GetWordCountOfConcept(DWORD dwConId); PWCHAR GetFirstWordOfConcept(DWORD dwConId); PWCHAR GetNextWordOfConcept(DWORD dwConId); private: // Constructors CDictionary(); // Initializer void Initial(); // Internal functions. void StemWord(PWCHAR pWord, WORD cCharsInWord); DWORD ComputeHashKey(PWCHAR Word, WORD cCharsInWord); DWORD EnterStem(PWCHAR WordStem); DWORD AddWordToDict(PWCHAR pWord, WORD cCharsInWord); DWORD AddStemToDict(PWCHAR pStem, WORD cCharsInStem); DWORD GetpImage(DWORD i); DWORD GetConceptId(DWORD i); DWORD GetpNextWord(DWORD i); private: // Internal variables // The following provide memory to implement the dictionary. #if 0 MY_VIRTUAL_BUFFER m_vbWords; // An array of words. #endif // 0 // WordStruct is broken into its components as follows. MY_VIRTUAL_BUFFER m_vbpImage; // An array of pointers to Images MY_VIRTUAL_BUFFER m_vbConceptId; // An array of concept ids MY_VIRTUAL_BUFFER m_vbpNextWord; // An array of next words MY_VIRTUAL_BUFFER m_vbStems; // An array of stems. MY_VIRTUAL_BUFFER m_vbWordHashBuckets; // An array of buckets to implement the hash table for words. MY_VIRTUAL_BUFFER m_vbStemHashBuckets; // An array of buckets to implement the hash table for stems. MY_VIRTUAL_BUFFER m_vbWordBuffer; // Buffer to hold the word strings (images). MY_VIRTUAL_BUFFER m_vbStemBuffer; // Buffer to hold the word stem strings (images). MY_VIRTUAL_BUFFER m_vbpCopyOfWord; // Buffer to hold an internal copy of the word being processed. MY_VIRTUAL_BUFFER m_vbpCopyOfWord2; // The following track the state of the dictionary. BYTE m_bDictState; // Indicates the current state of the dictionary. DWORD m_cWordChars; // The number of word characters in the word buffer. DWORD m_cStemChars; // The number of stem characters in the stem buffer. DWORD m_cWords; // The number of words in the dictionary. DWORD m_cStopWords; // The number of stop words in the dictionary. DWORD m_cMaxWords; // The maximum number of words supported by the allocated memory. DWORD m_cStems; // The number of stems in the dictionary. DWORD m_cMaxStems; // The maximum number of stems supported by the allocated memory. BYTE m_cConceptIdBits; // number of bits used to represent the concept id BYTE m_cpNextWordBits; // number of bits used to represent the pointer to next word BOOL m_fWordsCompressed; // Indicates if the Words array is compressed BOOL m_fLoadedFromDisk; // Is this dictionary loaded from disk? FPSTEMMER m_lpfnStemmer; // pointer to the stemmer function HINSTANCE m_hStemmerInstance; // handle to the stemmer library instance // ADDED TO TEST "WORDS OF COMMON STEM" DWORD m_ConIdInContext; // The concept id that is being tracked in the first, retrieval DWORD m_LastOccurrenceOfConId; // last occurrence of this concept id in the word list }; #endif //__DICT_H