4298 lines
129 KiB
C++
4298 lines
129 KiB
C++
// TxDBase.cpp -- Implementation for the CTextDatabase class.
|
|
|
|
#include "stdafx.h"
|
|
#include "VMBuffer.h"
|
|
#include "TXDBase.h"
|
|
#include "ByteMaps.h"
|
|
#include "Indicate.h"
|
|
#include "Tokens.h"
|
|
#include "MemEx.h"
|
|
#include "Memory.h"
|
|
#include "Search.h"
|
|
#include "CallBkQ.h"
|
|
#include "ftslex.h"
|
|
#include "stdlib.h"
|
|
#include "search.h"
|
|
#include "memory.h"
|
|
#include "AbrtSrch.h"
|
|
#include "Except.h"
|
|
|
|
#ifdef PROFILING

// Adjustable initial commit sizes for the virtual buffers created by
// CTextDatabase::InitTextDatabase. These variables exist only in PROFILING
// builds; other builds pass the INIT_*_COMMIT constants directly.

UINT cbDisplayCommit    = INIT_DISPLAY_IMAGE_COMMIT; // vbDisplayImages;    alternative noted: 0x100000
UINT cbImageCommit      = 0x160000;                  // vbTokenImages;      normally INIT_TOKEN_IMAGE_COMMIT; alternative noted: 0x120000
UINT cbDescriptorCommit = 0x118000;                  // vbImageDescriptors; normally INIT_IMAGE_DESCRIPTOR_COMMIT; alternative noted: 0x160000
UINT cbTokenCommit      = INIT_TOKEN_REF_COMMIT;     // vbTokenStream

#endif // PROFILING
|
|
|
|
// acBits[i] gives the sum of the bits in byte value i.
|
|
|
|
BYTE acBits [256] =
|
|
{
|
|
0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4,
|
|
1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
|
|
1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
|
|
2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
|
|
1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
|
|
2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
|
|
2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
|
|
3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
|
|
1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
|
|
2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
|
|
2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
|
|
3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
|
|
2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
|
|
3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
|
|
3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
|
|
4, 5, 5, 6, 5, 6, 6, 7, 5, 6, 6, 7, 6, 7, 7, 8
|
|
};
|
|
|
|
// acLeadingZeroes gives the low order zeroes before the first
|
|
// one bit in a byte value.
|
|
|
|
BYTE acLeadingZeroes[256] =
|
|
{
|
|
8, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
|
|
4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
|
|
5, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
|
|
4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
|
|
6, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
|
|
4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
|
|
5, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
|
|
4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
|
|
7, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
|
|
4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
|
|
5, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
|
|
4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
|
|
6, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
|
|
4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
|
|
5, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
|
|
4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0
|
|
};
|
|
|
|
// aLog2[i] is the number of bits needed to write the byte value i in binary:
// 0 for 0, 1 for 1, 2 for 2..3, 3 for 4..7, and so on up to 8 for 128..255.
// CBitsToRepresent uses it to size the most significant non-zero byte of a
// 32-bit value.

BYTE aLog2[256] =
{
    0, 1, 2, 2, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4,   // 0x00
    5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,   // 0x10
    6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,   // 0x20
    6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,   // 0x30
    7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,   // 0x40
    7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,   // 0x50
    7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,   // 0x60
    7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,   // 0x70
    8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,   // 0x80
    8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,   // 0x90
    8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,   // 0xA0
    8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,   // 0xB0
    8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,   // 0xC0
    8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,   // 0xD0
    8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,   // 0xE0
    8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8    // 0xF0
};
|
|
|
|
// nescan[i]: byte-to-byte lookup table, one row per sixteen index values.
// NOTE(review): the entries are consistent with a running exclusive-or taken
// from the low-order bit upward (bit k of nescan[i] == XOR of bits 0..k of i),
// e.g. nescan[0x01] == 0xff, nescan[0x03] == 0x01, nescan[0xff] == 0x55 --
// presumably a "not-equal scan"; confirm against the code that consumes it.

BYTE nescan[256] =
{
    0x00, 0xff, 0xfe, 0x01, 0xfc, 0x03, 0x02, 0xfd, 0xf8, 0x07, 0x06, 0xf9, 0x04, 0xfb, 0xfa, 0x05,   // 0x00
    0xf0, 0x0f, 0x0e, 0xf1, 0x0c, 0xf3, 0xf2, 0x0d, 0x08, 0xf7, 0xf6, 0x09, 0xf4, 0x0b, 0x0a, 0xf5,   // 0x10
    0xe0, 0x1f, 0x1e, 0xe1, 0x1c, 0xe3, 0xe2, 0x1d, 0x18, 0xe7, 0xe6, 0x19, 0xe4, 0x1b, 0x1a, 0xe5,   // 0x20
    0x10, 0xef, 0xee, 0x11, 0xec, 0x13, 0x12, 0xed, 0xe8, 0x17, 0x16, 0xe9, 0x14, 0xeb, 0xea, 0x15,   // 0x30
    0xc0, 0x3f, 0x3e, 0xc1, 0x3c, 0xc3, 0xc2, 0x3d, 0x38, 0xc7, 0xc6, 0x39, 0xc4, 0x3b, 0x3a, 0xc5,   // 0x40
    0x30, 0xcf, 0xce, 0x31, 0xcc, 0x33, 0x32, 0xcd, 0xc8, 0x37, 0x36, 0xc9, 0x34, 0xcb, 0xca, 0x35,   // 0x50
    0x20, 0xdf, 0xde, 0x21, 0xdc, 0x23, 0x22, 0xdd, 0xd8, 0x27, 0x26, 0xd9, 0x24, 0xdb, 0xda, 0x25,   // 0x60
    0xd0, 0x2f, 0x2e, 0xd1, 0x2c, 0xd3, 0xd2, 0x2d, 0x28, 0xd7, 0xd6, 0x29, 0xd4, 0x2b, 0x2a, 0xd5,   // 0x70
    0x80, 0x7f, 0x7e, 0x81, 0x7c, 0x83, 0x82, 0x7d, 0x78, 0x87, 0x86, 0x79, 0x84, 0x7b, 0x7a, 0x85,   // 0x80
    0x70, 0x8f, 0x8e, 0x71, 0x8c, 0x73, 0x72, 0x8d, 0x88, 0x77, 0x76, 0x89, 0x74, 0x8b, 0x8a, 0x75,   // 0x90
    0x60, 0x9f, 0x9e, 0x61, 0x9c, 0x63, 0x62, 0x9d, 0x98, 0x67, 0x66, 0x99, 0x64, 0x9b, 0x9a, 0x65,   // 0xA0
    0x90, 0x6f, 0x6e, 0x91, 0x6c, 0x93, 0x92, 0x6d, 0x68, 0x97, 0x96, 0x69, 0x94, 0x6b, 0x6a, 0x95,   // 0xB0
    0x40, 0xbf, 0xbe, 0x41, 0xbc, 0x43, 0x42, 0xbd, 0xb8, 0x47, 0x46, 0xb9, 0x44, 0xbb, 0xba, 0x45,   // 0xC0
    0xb0, 0x4f, 0x4e, 0xb1, 0x4c, 0xb3, 0xb2, 0x4d, 0x48, 0xb7, 0xb6, 0x49, 0xb4, 0x4b, 0x4a, 0xb5,   // 0xD0
    0xa0, 0x5f, 0x5e, 0xa1, 0x5c, 0xa3, 0xa2, 0x5d, 0x58, 0xa7, 0xa6, 0x59, 0xa4, 0x5b, 0x5a, 0xa5,   // 0xE0
    0x50, 0xaf, 0xae, 0x51, 0xac, 0x53, 0x52, 0xad, 0xa8, 0x57, 0x56, 0xa9, 0x54, 0xab, 0xaa, 0x55    // 0xF0
};
|
|
|
|
#if 0 // byte values from 0..255 as hex numbers
|
|
|
|
0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F,
|
|
0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1A, 0x1B, 0x1C, 0x1D, 0x1E, 0x1F,
|
|
0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, 0x28, 0x29, 0x2A, 0x2B, 0x2C, 0x2D, 0x2E, 0x2F,
|
|
0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, 0x38, 0x39, 0x3A, 0x3B, 0x3C, 0x3D, 0x3E, 0x3F,
|
|
0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, 0x48, 0x49, 0x4A, 0x4B, 0x4C, 0x4D, 0x4E, 0x4F,
|
|
0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, 0x58, 0x59, 0x5A, 0x5B, 0x5C, 0x5D, 0x5E, 0x5F,
|
|
0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x6A, 0x6B, 0x6C, 0x6D, 0x6E, 0x6F,
|
|
0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, 0x78, 0x79, 0x7A, 0x7B, 0x7C, 0x7D, 0x7E, 0x7F,
|
|
0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, 0x88, 0x89, 0x8A, 0x8B, 0x8C, 0x8D, 0x8E, 0x8F,
|
|
0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, 0x98, 0x99, 0x9A, 0x9B, 0x9C, 0x9D, 0x9E, 0x9F,
|
|
0xA0, 0xA1, 0xA2, 0xA3, 0xA4, 0xA5, 0xA6, 0xA7, 0xA8, 0xA9, 0xAA, 0xAB, 0xAC, 0xAD, 0xAE, 0xAF,
|
|
0xB0, 0xB1, 0xB2, 0xB3, 0xB4, 0xB5, 0xB6, 0xB7, 0xB8, 0xB9, 0xBA, 0xBB, 0xBC, 0xBD, 0xBE, 0xBF,
|
|
0xC0, 0xC1, 0xC2, 0xC3, 0xC4, 0xC5, 0xC6, 0xC7, 0xC8, 0xC9, 0xCA, 0xCB, 0xCC, 0xCD, 0xCE, 0xCF,
|
|
0xD0, 0xD1, 0xD2, 0xD3, 0xD4, 0xD5, 0xD6, 0xD7, 0xD8, 0xD9, 0xDA, 0xDB, 0xDC, 0xDD, 0xDE, 0xDF,
|
|
0xE0, 0xE1, 0xE2, 0xE3, 0xE4, 0xE5, 0xE6, 0xE7, 0xE8, 0xE9, 0xEA, 0xEB, 0xEC, 0xED, 0xEE, 0xEF,
|
|
0xF0, 0xF1, 0xF2, 0xF3, 0xF4, 0xF5, 0xF6, 0xF7, 0xF8, 0xF9, 0xFA, 0xFB, 0xFC, 0xFD, 0xFE, 0xFF,
|
|
|
|
#endif
|
|
|
|
// Returns the number of bits needed to represent ui, i.e. the 1-based position
// of its most significant one bit (0 when ui is zero).
//
// The most significant non-zero byte is located first; aLog2 then supplies the
// bit count within that byte.

UINT CBitsToRepresent(UINT ui)
{
    UINT cShift;

    if      (ui & 0xFF000000) cShift = 24;
    else if (ui & 0x00FF0000) cShift = 16;
    else if (ui & 0x0000FF00) cShift =  8;
    else                      cShift =  0;

    return cShift + aLog2[0xFF & (ui >> cShift)];
}
|
|
|
|
#if 0
|
|
|
|
PDESCRIPTOR *GradeDescriptorVector(PDESCRIPTOR pd, UINT cd, PCompareImages pfnCompare)
|
|
{
|
|
PDESCRIPTOR *ppdResult= NULL;
|
|
|
|
__try
|
|
{
|
|
ppdResult= (PDESCRIPTOR *) VAlloc(FALSE, cd*sizeof(PDESCRIPTOR));
|
|
PDESCRIPTOR *ppd;
|
|
|
|
UINT c= cd;
|
|
|
|
for(ppd= ppdResult; c--; ) *ppd++= pd++;
|
|
|
|
qsort(ppdResult, cd, sizeof(PDESCRIPTOR), pfnCompare);
|
|
}
|
|
__finally
|
|
{
|
|
if (_abnormal_termination() && ppdResult)
|
|
{
|
|
VFree(ppdResult); ppdResult= NULL;
|
|
}
|
|
}
|
|
|
|
return ppdResult;
|
|
}
|
|
|
|
int __cdecl ComparePointers(const void *pvL, const void *pvR)
|
|
{
|
|
#define ppvL (*(PVOID **) pvL)
|
|
#define ppvR (*(PVOID **) pvR)
|
|
|
|
if (*ppvL < *ppvR) return -1;
|
|
if (*ppvL > *ppvR) return 1;
|
|
|
|
return 0;
|
|
|
|
#undef ppvL
|
|
#undef ppvR
|
|
}
|
|
|
|
PUINT GradePointers(PVOID *papv, UINT cPointers)
|
|
{
|
|
// This function returns GradeUp of a vector of pointers.
|
|
|
|
ASSERT(sizeof(UINT) == sizeof(PVOID *));
|
|
|
|
PVOID **pappv= NULL;
|
|
|
|
__try
|
|
{
|
|
pappv= (PVOID **) VAlloc(FALSE, sizeof(PVOID)*cPointers);
|
|
PVOID **pppv, *ppv;
|
|
UINT c;
|
|
|
|
for (c= cPointers, ppv= papv, pppv= pappv; c--; ) *pppv++ = ppv++;
|
|
|
|
qsort(pappv, cPointers, sizeof(PVOID *), ComparePointers);
|
|
|
|
PUINT pul = PUINT (pappv);
|
|
|
|
for (c= cPointers, pppv= pappv; c--; ) *pul++ = (*pppv++) - papv;
|
|
}
|
|
__finally
|
|
{
|
|
if (_abnormal_termination() && pappv)
|
|
{
|
|
VFree(pappv); pappv= NULL;
|
|
}
|
|
}
|
|
|
|
return PUINT (pappv);
|
|
}
|
|
|
|
PTokenImage *MergeDescriptorGrades(PDESCRIPTOR *ppdFirst, UINT cdFirst,
|
|
PDESCRIPTOR *ppdSecond, UINT cdSecond,
|
|
PCompareImages pfnCompare
|
|
)
|
|
{
|
|
UINT cdResult= cdFirst+cdSecond;
|
|
|
|
PTokenImage *ppdResult= NULL;
|
|
|
|
__try
|
|
{
|
|
ppdResult= (PDESCRIPTOR *) VAlloc(FALSE, ctiResult * sizeof(PDESCRIPTOR));
|
|
|
|
MergeImageRefSets(ppdResult, cdResult, ppdFirst, cdFirst, ppdSecond, cdSecond);
|
|
}
|
|
__finally
|
|
{
|
|
if (_abnormal_termination() && ppdResult)
|
|
{
|
|
VFree(ppdResult); ppdResult= NULL;
|
|
}
|
|
}
|
|
|
|
return ppdResult;
|
|
}
|
|
|
|
#endif // 0
|
|
|
|
#define pdL (*(PDESCRIPTOR *) (pvL))
|
|
#define pdR (*(PDESCRIPTOR *) (pvR))
|
|
|
|
// qsort-style comparator ordering two token descriptors by their image keys,
// scanning left to right.
//
// pvL and pvR each point at a PDESCRIPTOR (see the pdL / pdR macros). The
// elements of pbImage are compared pairwise over the common length; the first
// difference decides. When one key is a strict prefix of the other, the
// shorter key sorts first.
//
// Returns <0, 0 or >0 in the manner qsort expects.
//
// Fix: the shorter-left case used to test cwSortKeyL against the common
// length, which can never be smaller, so "L is a prefix of R" returned 0 while
// the mirrored call returned 1. That ordering was inconsistent, which neither
// qsort nor MergeImageRefSets tolerates.

int __cdecl CompareImagesLR(const void *pvL, const void *pvR)
{
    UINT   cwSortKeyL = CbImage(pdL);
    UINT   cwSortKeyR = CbImage(pdR);
    PWCHAR pwL        = pdL->pbImage;
    PWCHAR pwR        = pdR->pbImage;

    // Only the part both keys have in common can be compared element-wise.

    UINT cwCommon = (cwSortKeyL < cwSortKeyR)? cwSortKeyL : cwSortKeyR;
    UINT cw;

    for (cw = cwCommon; cw--; )
    {
        int diff = *pwL++ - *pwR++;

        if (diff) return diff;
    }

    // Equal over the common prefix: the longer key sorts after the shorter.

    if (cwSortKeyL > cwCommon) return  1;
    if (cwSortKeyR > cwCommon) return -1;

    return 0;
}
|
|
|
|
|
|
int __cdecl CompareImagesRL(const void *pvL, const void *pvR)
|
|
{
|
|
PWCHAR pwL, pwR, pwLL, pwRR;
|
|
int diff, nRet;
|
|
UINT i, cwR, cwL, cwRR, cwLL;
|
|
|
|
pwL = pdL->pbImage + 1; // skip alpha-num-punc prefix
|
|
pwR = pdR->pbImage + 1;
|
|
|
|
cwL = CbImage(pdL);
|
|
cwR = CbImage(pdR);
|
|
|
|
if (!cwL || !cwR)
|
|
{
|
|
if (!cwL && !cwR)
|
|
return FALSE; // both sort keys are zero length
|
|
else
|
|
return cwR ? -1 : 1; // sort zero length before others
|
|
}
|
|
|
|
cwLL = --cwL;
|
|
cwRR = --cwR;
|
|
|
|
for (i = 0; i < cwL; i++)
|
|
if (HIBYTE(pwL[i]) == SORT_KEY_SEPARATOR)
|
|
{
|
|
cwL = i;
|
|
break;
|
|
}
|
|
|
|
for (i = 0; i < cwR; i++)
|
|
if (HIBYTE(pwR[i]) == SORT_KEY_SEPARATOR)
|
|
{
|
|
cwR = i;
|
|
break;
|
|
}
|
|
|
|
pwL += cwL;
|
|
pwR += cwR;
|
|
|
|
pwLL = pwL, pwRR = pwR;
|
|
cwLL -= cwL, cwRR -= cwR;
|
|
|
|
if ((nRet = cwL - cwR) > 0)
|
|
cwL = cwR;
|
|
|
|
for ( ;cwL--; )
|
|
if (diff = *--pwL - *--pwR)
|
|
return diff;
|
|
|
|
#if 0
|
|
if (nRet)
|
|
return nRet; // base text of different lengths
|
|
|
|
if ((nRet = cwLL - cwRR) > 0)
|
|
cwLL = cwRR;
|
|
// sort case and diacritic weights
|
|
for ( ; cwLL--; )
|
|
if (diff= *pwLL++ - *pwRR++)
|
|
return diff;
|
|
#endif
|
|
return nRet;
|
|
}
|
|
|
|
#undef pdL
|
|
#undef pdR
|
|
|
|
// Merges two pointer vectors, each already ordered under pCompareImages, into
// ppvResult.
//
//   ppvResult / cpvResult    destination; cpvResult must equal
//                            cpvSrcLow + cpvSrcHigh (asserted)
//   ppvSrcLow / cpvSrcLow    first ordered input
//   ppvSrcHigh / cpvSrcHigh  second ordered input
//   pCompareImages           comparator; receives the addresses of two vector
//                            elements
//
// Either input may be empty; the other one is then copied verbatim.
//
// Each pass names the input whose head is smaller "Low", binary-searches it
// for the run of elements not greater than the head of "High", and block-copies
// that run. CAbortSearch::CheckContinueState() is consulted once on entry.

void MergeImageRefSets(PVOID *ppvResult , UINT cpvResult ,
                       PVOID *ppvSrcLow , UINT cpvSrcLow ,
                       PVOID *ppvSrcHigh, UINT cpvSrcHigh,
                       PCompareImages pCompareImages
                      )
{
    CAbortSearch::CheckContinueState();

#ifdef _DEBUG

    // Both inputs must arrive in non-decreasing order.

    int iPair;

    for (iPair= cpvSrcLow; --iPair > 0; )
        ASSERT(0 >= pCompareImages(ppvSrcLow + iPair - 1, ppvSrcLow + iPair));

    for (iPair= cpvSrcHigh; --iPair > 0; )
        ASSERT(0 >= pCompareImages(ppvSrcHigh + iPair - 1, ppvSrcHigh + iPair));

#endif // _DEBUG

    ASSERT(cpvResult == cpvSrcLow + cpvSrcHigh);

    // Keep any empty input in the "Low" slot so the loop's first test sees it.

    if (!cpvSrcHigh)
    {
        ppvSrcHigh = ppvSrcLow;
        cpvSrcHigh = cpvSrcLow;
        cpvSrcLow  = 0;
    }

    while (cpvResult)
    {
        if (!cpvSrcLow)
        {
            // One input is used up: the remainder of the other is the tail.

            CopyMemory(ppvResult, ppvSrcHigh, cpvSrcHigh * sizeof(PVOID));

            return;
        }

        // Arrange for "Low" to be the input with the smaller head element.

        if (pCompareImages(ppvSrcLow, ppvSrcHigh) >= 0)
        {
            PVOID *ppvSwap = ppvSrcHigh;
            int    cpvSwap = cpvSrcHigh;

            ppvSrcHigh = ppvSrcLow; cpvSrcHigh = cpvSrcLow;
            ppvSrcLow  = ppvSwap;   cpvSrcLow  = cpvSwap;
        }

        // Binary search in Low. *ppvKeep is known not to exceed the head of
        // High; ppvStop is either the end of Low or an element that does.

        PVOID *ppvKeep = ppvSrcLow;
        PVOID *ppvStop = ppvSrcLow + cpvSrcLow;

        for (;;)
        {
            int cSpan = ppvStop - ppvKeep;

            if (cSpan <= 1) break;

            PVOID *ppvProbe = ppvKeep + cSpan/2;

            if (pCompareImages(ppvSrcHigh, ppvProbe) >= 0) ppvKeep = ppvProbe;
            else                                           ppvStop = ppvProbe;
        }

        // Everything in Low ahead of ppvStop goes out in one block.

        int cpvRun = ppvStop - ppvSrcLow;

        CopyMemory(ppvResult, ppvSrcLow, cpvRun * sizeof(PVOID));

        ppvResult += cpvRun;
        ppvSrcLow += cpvRun;
        cpvResult -= cpvRun;
        cpvSrcLow -= cpvRun;
    }
}
|
|
|
|
// Extends the two sorted descriptor-pointer vectors of a text database to
// cover descriptors pdBase[*pcdSorted .. cd).
//
//   pdBase          base of the descriptor array
//   pppdSorted      in/out: vector ordered by CompareImagesLR
//   pppdTailSorted  in/out: vector ordered by CompareImagesRL
//   pcdSorted       in/out: number of descriptors the two vectors cover
//   cd              number of descriptors that should be covered on return
//
// The new descriptors are sorted separately under each ordering and merged
// with the existing vectors into freshly allocated vectors, which replace the
// old ones. Does nothing when cd == *pcdSorted.
//
// Changes from the previous version:
//   * The merged left-to-right vector is allocated with VAlloc like every other
//     vector here. It used to come from ExAlloc(LPTR, ...) although it is later
//     released with VFree (by this routine's next call and by the destructor).
//   * Nothing is published through the out parameters, and no old vector is
//     freed, until both new vectors are complete. Previously an abort raised
//     after the first merge left *pppdTailSorted covering cd descriptors while
//     *pppdSorted and *pcdSorted still described cdSorted.

void SortTokenImages(PDESCRIPTOR pdBase, PDESCRIPTOR **pppdSorted,
                     PDESCRIPTOR **pppdTailSorted,
                     PUINT pcdSorted, UINT cd
                    )
{
    PDESCRIPTOR *ppdNewLR = NULL,   // replacement for *pppdSorted
                *ppdNewRL = NULL,   // replacement for *pppdTailSorted
                *ppdFresh = NULL,   // pointers to the descriptors being added
                *ppdOldLR = NULL,
                *ppdOldRL = NULL,
                 pd,
                *ppd;

    UINT cdSorted = *pcdSorted,
         cdFresh,
         c;

    if (cd == cdSorted) return;

    ASSERT(cd > cdSorted);

    __try
    {
        ppdOldRL = *pppdTailSorted;
        ppdOldLR = *pppdSorted;

        cdFresh  = cd - cdSorted;
        ppdFresh = (PDESCRIPTOR *) VAlloc(FALSE, cdFresh * sizeof(PDESCRIPTOR));

        for (pd= pdBase + cdSorted, ppd= ppdFresh, c= cdFresh; c--; )
            *ppd++ = pd++;

        CAbortSearch::CheckContinueState();

        // Phase 1: right-to-left ordering.

        qsort(ppdFresh, cdFresh, sizeof(PDESCRIPTOR), CompareImagesRL);

        // Note: MergeImageRefSets is called even when cdSorted is zero. In that
        //       case it simply copies ppdFresh, which leaves ppdFresh itself
        //       free to be re-sorted left-to-right below.

        ppdNewRL = (PDESCRIPTOR *) VAlloc(FALSE, cd * sizeof(PDESCRIPTOR));

        MergeImageRefSets((PVOID *) ppdNewRL, cd, (PVOID *) ppdOldRL, cdSorted, (PVOID *) ppdFresh, cdFresh, CompareImagesRL);

#ifdef _DEBUG
        {
            for (int iCheck= cd; --iCheck > 0; )
                ASSERT(0 >= CompareImagesRL(ppdNewRL + iCheck - 1, ppdNewRL + iCheck));
        }
#endif // _DEBUG

        CAbortSearch::CheckContinueState();

        // Phase 2: left-to-right ordering.

        qsort(ppdFresh, cdFresh, sizeof(PDESCRIPTOR), CompareImagesLR);

        if (!cdSorted)
        {
            ASSERT(!ppdOldLR);

            // Nothing to merge with: the sorted scratch vector is the result.

            ppdNewLR= ppdFresh; ppdFresh= NULL;
        }
        else
        {
            ASSERT(ppdOldLR);

            ppdNewLR = (PDESCRIPTOR *) VAlloc(FALSE, cd * sizeof(PDESCRIPTOR));

            MergeImageRefSets((PVOID *) ppdNewLR, cd, (PVOID *) ppdOldLR, cdSorted, (PVOID *) ppdFresh, cdFresh, CompareImagesLR);
        }

#ifdef _DEBUG
        {
            for (int iCheck= cd; --iCheck > 0; )
                ASSERT(0 >= CompareImagesLR(ppdNewLR + iCheck - 1, ppdNewLR + iCheck));
        }
#endif // _DEBUG

        // Commit. Both vectors are complete, so the caller's state moves from
        // "cdSorted covered" to "cd covered" in one step.

        *pppdTailSorted = ppdNewRL; ppdNewRL= NULL;
        *pppdSorted     = ppdNewLR; ppdNewLR= NULL;
        *pcdSorted      = cd;

        if (ppdOldRL) { VFree(ppdOldRL); ppdOldRL= NULL; }
        if (ppdOldLR) { VFree(ppdOldLR); ppdOldLR= NULL; }
    }
    __finally
    {
        // Whatever is still owned locally (scratch vector, unpublished
        // results) is released on both the normal and the abnormal path.

        if (ppdFresh) { VFree(ppdFresh); ppdFresh = NULL; }
        if (ppdNewRL) { VFree(ppdNewRL); ppdNewRL = NULL; }
        if (ppdNewLR) { VFree(ppdNewLR); ppdNewLR = NULL; }
    }
}
|
|
|
|
// Copies the display image of a token into a line segment buffer and returns
// the column following the copied characters.
//
//   pd         descriptor of the token; its display image is pd->pwDisplay,
//              CwDisplay(pd) characters long
//   cbOffset   column within the complete line at which the token starts
//   iColStart  column of the complete line that pbLine[0] corresponds to
//   iColLimit  right boundary (exclusive) of the segment within the complete
//              line
//   pbLine     the segment buffer; may be NULL when only the new offset is
//              needed
//
// Characters are copied verbatim; no tab expansion is performed here.
//
// Result:
//   * pbLine is NULL, or the token lies wholly left of iColStart:
//     cbOffset + token length (nothing is copied).
//   * otherwise: the column after the last character actually copied, which is
//     clamped to iColLimit when the token is cut off on the right.
//   * the token starts at or beyond iColLimit: cbOffset unchanged, nothing
//     copied. Previously this case gave CopyMemory a negative length.

UINT FormatAToken(PDESCRIPTOR pd, int cbOffset, int iColStart, int iColLimit, PWCHAR pbLine)
{
    int    cbToken = CwDisplay(pd);
    PWCHAR pbToken = pd->pwDisplay;

    // Nothing to copy: either there is no buffer, or the token ends at or
    // before the left edge of the segment. Just step over it.

    if (!pbLine || iColStart >= cbOffset + cbToken) return cbOffset + cbToken;

    // If the token straddles the left boundary of the segment, skip the
    // characters which fall outside.

    if (cbOffset < iColStart)
    {
        cbToken -= iColStart - cbOffset;
        pbToken += iColStart - cbOffset;

        cbOffset = iColStart;
    }

    // If it straddles the right boundary, drop the trailing characters.

    if (iColLimit < cbOffset + cbToken) cbToken = iColLimit - cbOffset;

    // A token that begins at or past the right boundary contributes nothing.

    if (cbToken <= 0) return cbOffset;

    CopyMemory(pbLine + cbOffset - iColStart, pbToken, cbToken * sizeof(WCHAR));

    return cbOffset + cbToken;
}
|
|
|
|
// Memory figures shared by all CTextDatabase instances. Both start at zero
// (file-scope statics) and are filled in by the first constructor call.

static UINT cbPhysicalMemory;    // total physical memory reported by GlobalMemoryStatus
static UINT cbAvailableMemory;   // MEMORY_FACTOR * cbPhysicalMemory
|
|
|
|
#ifdef _DEBUG
|
|
|
|
CTextDatabase::CTextDatabase(PSZ pszTypeName) : CTextMatrix(pszTypeName)
|
|
|
|
#else // _DEBUG
|
|
|
|
CTextDatabase::CTextDatabase()
|
|
|
|
#endif // _DEBUG
|
|
|
|
{
|
|
// This routine does initializations which do not require memory allocation.
|
|
// The allocation part of instance construction is done by the routine
|
|
// InitTextDatabase below.
|
|
|
|
if (!cbPhysicalMemory) // Static class members start with a value of zero.
|
|
{
|
|
MEMORYSTATUS ms;
|
|
|
|
GlobalMemoryStatus(&ms);
|
|
|
|
cbPhysicalMemory= ms.dwTotalPhys;
|
|
|
|
cbAvailableMemory= UINT (MEMORY_FACTOR * double(cbPhysicalMemory));
|
|
}
|
|
|
|
#ifdef _DEBUG
|
|
|
|
m_fInitialized= FALSE;
|
|
|
|
#endif
|
|
|
|
m_cdSorted = 0;
|
|
m_cwDisplayMax = 0;
|
|
m_cbScanned = 0;
|
|
m_cTokensIndexed = 0;
|
|
m_cLocalDicts = 0;
|
|
m_iLocalDictBase = 0;
|
|
m_iSerialNumberNext = 0;
|
|
m_iNextRefSet = 0;
|
|
m_ibNextFileBlockLow = 0;
|
|
m_ibNextFileBlockHigh = 0;
|
|
m_cbBlockSize = 0;
|
|
m_cdwCompressedRefs = 0;
|
|
m_cTermRanks = 0;
|
|
m_cdwArticleRefs = 0;
|
|
m_cdwVocabularyRefs = 0;
|
|
m_fdwOptions = TOPIC_SEARCH | PHRASE_SEARCH | PHRASE_FEEDBACK | VECTOR_SEARCH;
|
|
m_lcidSorting = LCID(-1);
|
|
|
|
m_pwHash = NULL;
|
|
m_pbType = NULL;
|
|
m_paStart = NULL;
|
|
m_paEnd = NULL;
|
|
m_pshtGlobal = NULL;
|
|
m_pshtGalactic = NULL;
|
|
m_pisSymbols = NULL;
|
|
m_ppdTailSorted = NULL;
|
|
m_ppdSorted = NULL;
|
|
m_pafClassifications = NULL;
|
|
m_paiGlobalToRefList = NULL;
|
|
m_puioRefTemp = NULL;
|
|
m_puioCompressedRefs = NULL;
|
|
m_prldTokenRefs = NULL;
|
|
m_pdwCompressedRefs = NULL;
|
|
m_pFirstFreeFileBlock = NULL;
|
|
m_papFileBlockLinks = NULL;
|
|
m_piolLeft = NULL;
|
|
m_piolRight = NULL;
|
|
m_piolResult = NULL;
|
|
m_pTermRanks = NULL;
|
|
m_puioCompressedArticleRefs = NULL;
|
|
m_prldArticleRefs = NULL;
|
|
m_pdwArticleRefs = NULL;
|
|
m_puioCompressedVocabularyRefs = NULL;
|
|
m_prldVocabularyRefs = NULL;
|
|
m_pdwVocabularyRefs = NULL;
|
|
m_pDict = NULL;
|
|
m_pColl = NULL;
|
|
m_pulstate = NULL;
|
|
vbTokenStream .Base = NULL;
|
|
vbTokenImages .Base = NULL;
|
|
vbDisplayImages .Base = NULL;
|
|
// vbImageDescriptors.Base = NULL; // For now...
|
|
}
|
|
|
|
// Second-stage construction: performs the allocations the constructor avoids.
//
//   fFromFile  TRUE when the instance will be attached to a stored image. In
//              that case nothing is allocated here; only the flag is recorded.
//
// For a fresh (not-from-file) database this creates the temporary reference
// file, the two segmented hash tables, the unlinked-state record and the four
// virtual buffers (token stream, token images, display images, image
// descriptors), sets every "next" cursor to the base of its buffer, and -- when
// vector search is enabled -- creates the dictionary and the collection.
//
// If any step raises, the __finally handler releases what it owns and the
// exception continues to propagate.
//
// Fix: the handler now clears each member it releases and also releases
// vbDisplayImages. It used to leave freed buffers and deleted objects in the
// members, and the destructor releases the same members again.

void CTextDatabase::InitTextDatabase(BOOL fFromFile)
{
    ASSERT(!m_fInitialized);

    m_fFromFileImage= fFromFile;

    if (!fFromFile)
    {
        __try
        {
            m_lcidSorting= GetUserDefaultLCID();

            UINT cbSector;

            m_puioRefTemp= CUnbufferedIO::NewTempFile((PSZ)GetSourceName());

            cbSector= m_puioRefTemp->CbSector();

            // Round both sizes up to a whole number of sectors.

            m_cbBlockSize        = cbSector * ((CB_TEMP_BLOCKS       + cbSector - 1) / cbSector);
            m_cbTransactionLimit = cbSector * ((CB_TRANSACTION_LIMIT + cbSector - 1) / cbSector);

            m_pshtGlobal   = CSegHashTable::NewSegHashTable(sizeof(TermTagGlobal  ), sizeof(UINT ));
            m_pshtGalactic = CSegHashTable::NewSegHashTable(sizeof(TermTagGalactic), sizeof(UINT ));

            m_pulstate = (UnlinkedState *) VAlloc(TRUE, sizeof(UnlinkedState));

            m_pulstate->m_aiBaseCByte[0] = 0;
            m_pulstate->m_aiBaseToken[0] = 0;

#ifdef PROFILING
            CreateVirtualBuffer(&vbTokenStream, cbTokenCommit, INIT_TOKEN_REF_RESERVATION);
#else // PROFILING
            CreateVirtualBuffer(&vbTokenStream, INIT_TOKEN_REF_COMMIT, INIT_TOKEN_REF_RESERVATION);
#endif // PROFILING

            m_puiTokenNext = TokenBase();
            m_pltNext      = (PLocalToken) TokenBase();

#ifdef PROFILING
            CreateVirtualBuffer(&vbTokenImages, cbImageCommit, INIT_TOKEN_IMAGE_RESERVATION);
#else // PROFILING
            CreateVirtualBuffer(&vbTokenImages, INIT_TOKEN_IMAGE_COMMIT, INIT_TOKEN_IMAGE_RESERVATION);
#endif // PROFILING

            m_pbLastGalactic =
            m_pbNext         =
            m_pbNextGalactic =
            m_pbNextGlobal   = ImageBase();

#ifdef PROFILING
            CreateVirtualBuffer(&vbDisplayImages, cbDisplayCommit, INIT_DISPLAY_IMAGE_RESERVATION);
#else // PROFILING
            CreateVirtualBuffer(&vbDisplayImages, INIT_DISPLAY_IMAGE_COMMIT, INIT_DISPLAY_IMAGE_RESERVATION);
#endif // PROFILING

            m_pwDispLastGalactic =
            m_pwDispNext         =
            m_pwDispNextGalactic =
            m_pwDispNextGlobal   = DisplayBase();

#ifdef PROFILING
            CreateVirtualBuffer(&vbImageDescriptors, cbDescriptorCommit, INIT_IMAGE_DESCRIPTOR_RESERVATION);
#else // PROFILING
            CreateVirtualBuffer(&vbImageDescriptors, INIT_IMAGE_DESCRIPTOR_COMMIT, INIT_IMAGE_DESCRIPTOR_RESERVATION);
#endif // PROFILING

            m_pdNextBound    =
            m_pdNext         =
            m_pdNextGalactic =
            m_pdNextGlobal   = DescriptorBase();

#ifdef _DEBUG
            m_pdNext->pbImage= PWCHAR(-1);
#endif // _DEBUG

            m_clsfTokens.Initial();

            ZeroMemory(m_pulstate, sizeof(UnlinkedState));

            // The first descriptor's display image begins at the start of the
            // display buffer.

            m_pdNext->pwDisplay = m_pwDispNext;

            m_pbNextGalactic= m_pbNextGlobal= m_pbNext;

            m_pwDispNextGalactic= m_pwDispNextGlobal= m_pwDispNext;

            m_pdNextGalactic= m_pdNextGlobal= m_pdNextBound= m_pdNext;

            m_iSerialNumberNext= 0;

            if (FVectorSearch())
            {
                m_pDict = CDictionary::NewDictionary();
                m_pColl = CCollection::NewCollection();
            }
        }
        __finally
        {
            if (_abnormal_termination())
            {
                // Release what was built and clear each member, so that the
                // destructor does not release the same resource a second time.
                // (m_puioRefTemp and m_pulstate are left for the destructor.)

                if (vbTokenStream.Base     ) { FreeVirtualBuffer(&vbTokenStream     ); vbTokenStream.Base      = NULL; }
                if (vbTokenImages.Base     ) { FreeVirtualBuffer(&vbTokenImages     ); vbTokenImages.Base      = NULL; }
                if (vbDisplayImages.Base   ) { FreeVirtualBuffer(&vbDisplayImages   ); vbDisplayImages.Base    = NULL; }
                if (vbImageDescriptors.Base) { FreeVirtualBuffer(&vbImageDescriptors); vbImageDescriptors.Base = NULL; }

                if (m_pshtGalactic) { delete m_pshtGalactic; m_pshtGalactic = NULL; }
                if (m_pshtGlobal  ) { delete m_pshtGlobal;   m_pshtGlobal   = NULL; }
                if (m_pDict       ) { delete m_pDict;        m_pDict        = NULL; }
                if (m_pColl       ) { delete m_pColl;        m_pColl        = NULL; }
            }
        }
    }

#ifdef _DEBUG

    m_fInitialized= TRUE;

#endif
}
|
|
|
|
|
|
// Destructor. Releases everything the instance owns.
//
// The first section covers resources that this routine releases only when the
// database was not attached to a file image. The second section covers what is
// released in either case.

CTextDatabase::~CTextDatabase()
{
    if (!m_fFromFileImage)
    {
        FreeVirtualBuffer(&vbTokenStream);

        if (m_pwHash ) { delete [] m_pwHash;  }
        if (m_pbType ) { delete [] m_pbType;  }
        if (m_paStart) { delete [] m_paStart; }
        if (m_paEnd  ) { delete [] m_paEnd;   }

        if (m_pulstate)
        {
            // Release the local dictionaries from the highest index down to
            // m_iLocalDictBase.

            int iDict= m_cLocalDicts;

            while (iDict-- > m_iLocalDictBase)
            {
                if (!m_pulstate->m_apLocalDict[iDict]) continue;

                VFree(m_pulstate->m_apLocalDict[iDict]);

                // pld may alias one of the table entries; forget it so that it
                // is not freed a second time below.

                if (m_pulstate->pld == m_pulstate->m_apLocalDict[iDict])
                    m_pulstate->pld = NULL;
            }

            if (m_pulstate->pld) { VFree(m_pulstate->pld); }

            VFree(m_pulstate);
        }

        if (m_pafClassifications) { VFree(m_pafClassifications); }
        if (m_pTermRanks        ) { VFree(m_pTermRanks        ); }
        if (m_prldTokenRefs     ) { VFree(m_prldTokenRefs     ); }
        if (m_prldArticleRefs   ) { VFree(m_prldArticleRefs   ); }
        if (m_prldVocabularyRefs) { VFree(m_prldVocabularyRefs); }

        if (m_puioCompressedArticleRefs   ) { delete m_puioCompressedArticleRefs;    }
        if (m_puioCompressedVocabularyRefs) { delete m_puioCompressedVocabularyRefs; }
        if (m_puioRefTemp                 ) { delete m_puioRefTemp;                  }
        if (m_puioCompressedRefs          ) { delete m_puioCompressedRefs;           }
    }

    // Released regardless of where the database came from.

    FreeVirtualBuffer(&vbTokenImages);
    FreeVirtualBuffer(&vbDisplayImages);
    FreeVirtualBuffer(&vbImageDescriptors);

    if (m_pDict       ) { delete m_pDict;        }
    if (m_pColl       ) { delete m_pColl;        }
    if (m_pshtGalactic) { delete m_pshtGalactic; }
    if (m_pshtGlobal  ) { delete m_pshtGlobal;   }

    if (m_ppdSorted    ) { VFree(m_ppdSorted    ); }
    if (m_ppdTailSorted) { VFree(m_ppdTailSorted); }

    if (m_pisSymbols) { DetachRef(m_pisSymbols); }

    m_clsfTokens.FinalCleanUp();
}
|
|
|
|
typedef struct _TextDatabaseHeader
|
|
{
|
|
UINT fdwOptions;
|
|
UINT cbScanned;
|
|
UINT cTokens;
|
|
UINT cLocalDicts;
|
|
UINT offTokens;
|
|
UINT cbTokenImages;
|
|
// UINT offTokenImages;
|
|
UINT cUniqueTokens;
|
|
UINT cwMaxUniqueToken;
|
|
// UINT offTokenDescriptors;
|
|
|
|
UINT offReferenceCounts;
|
|
UINT offDescriptorFlags;
|
|
|
|
UINT offppdSorted;
|
|
UINT offppdTailSorted;
|
|
UINT offpafClassifications;
|
|
UINT offClassifier;
|
|
UINT offTermMap;
|
|
UINT offArticleRefDescr;
|
|
UINT cdwArticleRefs;
|
|
UINT offArticleRefs;
|
|
UINT offVocabularyDescr;
|
|
UINT cdwVocabularyRefs;
|
|
UINT offVocabularyRefs;
|
|
UINT offTokenRefDescr;
|
|
UINT cdwRefs;
|
|
UINT offRefs;
|
|
UINT offaiBaseToken;
|
|
UINT offaiBaseCByte;
|
|
UINT cwDispImages;
|
|
UINT offDispImages;
|
|
UINT cnTokenSortKeys;
|
|
UINT cnDispSortKeys;
|
|
LCID lcid;
|
|
|
|
PDESCRIPTOR pdOld;
|
|
|
|
} TextDatabaseHeader;
|
|
|
|
// Writes the database to pDiskImage: a TextDatabaseHeader followed by the
// sections the header's off* fields refer to, then the images of the symbol
// set and (when vector search is enabled) the dictionary and the collection.
//
//   pDiskImage  persistence object receiving the data
//
// Two temporaries are used: a scratch vector (pcRefs) which is refilled for
// the reference counts, the descriptor flag bytes and the display-image
// offsets in turn, and a compressed set (pcsOffsets) built from those offsets.
// Both are released on every path by the __finally handler.
//
// Fix: the handler now disposes of pcsOffsets with delete, matching the normal
// path. It used to pass the object to VFree.

void CTextDatabase::StoreImage(CPersist *pDiskImage)
{
    PUINT           pcRefs     = NULL;
    CCompressedSet *pcsOffsets = NULL;

    __try
    {
        AppendText(NULL, 0, 0);  // To bring the database to queriable form

        TextDatabaseHeader *ptdbh= (TextDatabaseHeader *) (pDiskImage->ReserveTableSpace(sizeof(TextDatabaseHeader)));

        UINT cbImage     = m_pbNextGalactic     - ImageBase();
        UINT cwDispImage = m_pwDispNextGalactic - DisplayBase();
        UINT cdUnique    = m_pdNextGalactic     - DescriptorBase();
        UINT cPartitions = ArticleCount();

        ptdbh->fdwOptions        = m_fdwOptions;
        ptdbh->cbScanned         = m_cbScanned;
        ptdbh->cTokens           = TokenCount();
        ptdbh->cLocalDicts       = m_cLocalDicts;
        ptdbh->cbTokenImages     = cbImage;
        ptdbh->cwDispImages      = cwDispImage;
        ptdbh->cUniqueTokens     = cdUnique;
        ptdbh->cwMaxUniqueToken  = m_cwDisplayMax;
        ptdbh->cdwArticleRefs    = m_cdwArticleRefs;
        ptdbh->cdwVocabularyRefs = m_cdwVocabularyRefs;
        ptdbh->cdwRefs           = m_cdwCompressedRefs;

        ASSERT(m_lcidSorting != LCID(-1));

        ptdbh->lcid = m_lcidSorting;

        // Token reference lists: present only when phrase search is enabled.

        if (FPhrases())
        {
            if (m_cdwCompressedRefs)
            {
                ptdbh->offRefs = pDiskImage->NextOffset();  pDiskImage->WriteDWords(m_pdwCompressedRefs, m_cdwCompressedRefs);
            }
            else ptdbh->offRefs = 0;

            ptdbh->offTokenRefDescr = pDiskImage->NextOffset();  pDiskImage->SaveData(PBYTE(m_prldTokenRefs), sizeof(RefListDescriptor) * cdUnique);
        }
        else
        {
            ptdbh->offRefs          = 0;
            ptdbh->offTokenRefDescr = 0;
        }

        // Token stream: present only when phrase feedback is enabled.

        if (FPhraseFeedback())
        {
            ptdbh->offTokens = pDiskImage->NextOffset();  pDiskImage->WriteDWords(TokenBase(), TokenCount());
        }
        else ptdbh->offTokens= 0;

//      ptdbh->offTokenImages = pDiskImage->NextOffset();  ptdbh->cnTokenSortKeys = pDiskImage->Encode((PBYTE)ImageBase(), cbImage * sizeof(WCHAR));
        ptdbh->offDispImages  = pDiskImage->NextOffset();  ptdbh->cnDispSortKeys  = pDiskImage->Encode((PBYTE)DisplayBase(), cwDispImage * sizeof(WCHAR));

        ValidateHeap();

        pcRefs= PUINT(VAlloc(FALSE, sizeof(UINT) * cdUnique));

        UINT        c;
        PUINT       pui;
        PDESCRIPTOR pd;
        PWCHAR      pwcBase= DisplayBase();
        PBYTE       pb;

        // Pass 1 over the descriptors: reference counts.

        for (pd= DescriptorBase(), c= cdUnique, pui= pcRefs; c--; ) *pui++ = (pd++)->cReferences;

        ValidateHeap();

        ptdbh->offReferenceCounts = pDiskImage->NextOffset();  pDiskImage->WriteDWords(pcRefs, cdUnique);

        ValidateHeap();

        // Pass 2: two flag bytes per descriptor. The same scratch vector is
        // reused as a byte array; 2 * cdUnique bytes fit within it.

        for (pd= DescriptorBase(), c= cdUnique, pb= PBYTE(pcRefs); c--; pd++)
        {
            *pb++ = pd->bCharset;
            *pb++ = pd->fImageFlags;
        }

        ValidateHeap();

        ptdbh->offDescriptorFlags = pDiskImage->NextOffset();  pDiskImage->WriteBytes(PBYTE(pcRefs), cdUnique * 2);

        ValidateHeap();

        // Pass 3: display-image pointers turned into offsets from the base of
        // the display buffer, then stored as a compressed set.

        for (pd= DescriptorBase(), c= cdUnique, pui= pcRefs; c--; ) *pui++ = (pd++)->pwDisplay - pwcBase;

        ValidateHeap();

        pcsOffsets= CCompressedSet::NewCompressedSet(pcRefs, cdUnique, cwDispImage);

        ValidateHeap();

        VFree(pcRefs);  pcRefs= NULL;

        ValidateHeap();

        pcsOffsets->StoreImage(pDiskImage);

        delete pcsOffsets;  pcsOffsets= NULL;

//      ptdbh->offTokenDescriptors   = pDiskImage->NextOffset();  pDiskImage->SaveData(PBYTE(DescriptorBase()), sizeof(DESCRIPTOR) * (1 + cdUnique));

        ptdbh->offppdSorted          = pDiskImage->NextOffset();  pDiskImage->WriteDWords(PUINT(m_ppdSorted         ), cdUnique);
        ptdbh->offppdTailSorted      = pDiskImage->NextOffset();  pDiskImage->WriteDWords(PUINT(m_ppdTailSorted     ), cdUnique);
        ptdbh->offpafClassifications = pDiskImage->NextOffset();  pDiskImage->WriteDWords(PUINT(m_pafClassifications), cdUnique);
        ptdbh->offTermMap            = pDiskImage->NextOffset();  pDiskImage->WriteDWords(      TermRanks()          , cdUnique);
        ptdbh->offClassifier         = pDiskImage->NextOffset();  pDiskImage->SaveData(PBYTE(&m_clsfTokens), sizeof(m_clsfTokens));
        ptdbh->offArticleRefDescr    = pDiskImage->NextOffset();  pDiskImage->SaveData(PBYTE(m_prldArticleRefs), sizeof(RefListDescriptor) * cdUnique);

        if (m_cdwArticleRefs)
        {
            ptdbh->offArticleRefs = pDiskImage->NextOffset();  pDiskImage->WriteDWords(m_pdwArticleRefs, m_cdwArticleRefs );
        }
        else ptdbh->offArticleRefs = 0;

        ptdbh->offVocabularyDescr = pDiskImage->NextOffset();  pDiskImage->SaveData(PBYTE(m_prldVocabularyRefs), sizeof(RefListDescriptor) * cPartitions);

        if (m_cdwVocabularyRefs)
        {
            ptdbh->offVocabularyRefs = pDiskImage->NextOffset();  pDiskImage->WriteDWords(m_pdwVocabularyRefs, m_cdwVocabularyRefs);
        }
        else ptdbh->offVocabularyRefs = 0;

        ptdbh->offaiBaseToken = pDiskImage->NextOffset();  pDiskImage->WriteDWords(m_pulstate->m_aiBaseToken, m_cLocalDicts);
        ptdbh->offaiBaseCByte = pDiskImage->NextOffset();  pDiskImage->WriteDWords(m_pulstate->m_aiBaseCByte, m_cLocalDicts);

        // The sorted vectors above were written as raw descriptor addresses;
        // the descriptor base they refer to is recorded alongside them.

        ptdbh->pdOld= DescriptorBase();

        m_pisSymbols->StoreImage(pDiskImage);

        if (FVectorSearch())
        {
            m_pDict->StoreImage(pDiskImage);

            // Let the collection know about the # of unique concepts in the dictionary

            m_pColl->SetNumberOfConcepts(m_pDict->GetConceptCount());

            m_pColl->StoreImage(pDiskImage);
        }
    }
    __finally
    {
        if (pcRefs    ) { VFree(pcRefs);     pcRefs     = NULL; }
        if (pcsOffsets) { delete pcsOffsets; pcsOffsets = NULL; }
    }
}
|
|
|
|
// Placeholder callback which must never be invoked; all three parameters are
// ignored.
//
// Fix: the guard was ASSERT(TRUE), which can never fire. ASSERT(FALSE) makes a
// debug build actually report an unexpected call.

void MergeSerial(UINT iValue, PVOID pvTag, PVOID pvEnvironment)
{
    ASSERT(FALSE);  // Shouldn't ever call this routine!!
}
|
|
|
|
// Callback which stores iValue into the UINT that pvTag points to.
// pvEnvironment is not used.

void AddSerial(UINT iValue, PVOID pvTag, PVOID pvEnvironment)
{
    PUINT puiTag = PUINT(pvTag);

    *puiTag = iValue;
}
|
|
|
|
void CTextDatabase::ConnectImage(CPersist *pDiskImage, BOOL fUnpackDisplayForm)
{
    // Connects this text database to a stored image.
    //
    //   pDiskImage         -- the persistent image.  Its TextDatabaseHeader
    //                         supplies every count and offset consumed below.
    //   fUnpackDisplayForm -- when TRUE the display images, token descriptors,
    //                         sort keys and the two sorted descriptor vectors
    //                         are rebuilt in memory; when FALSE the compressed
    //                         offset set is skipped instead.
    //
    // NOTE(review): the CreateImage/SkipImage calls presumably have to occur
    // in the same order the image was stored in -- confirm against the
    // routine that writes the image.

    TextDatabaseHeader *ptdbh = (TextDatabaseHeader *) (pDiskImage->ReserveTableSpace(sizeof(TextDatabaseHeader)));

    m_fdwOptions  = ptdbh->fdwOptions;
    m_cbScanned   = ptdbh->cbScanned;
    m_cLocalDicts = ptdbh->cLocalDicts;

    // Attach the compressed token reference lists.

    if (FPhrases())
    {
        m_cdwCompressedRefs = ptdbh->cdwRefs;
        m_pdwCompressedRefs = (m_cdwCompressedRefs)? PUINT(pDiskImage->LocationOf(ptdbh->offRefs)) : NULL;
        m_prldTokenRefs     = PRefListDescriptor(pDiskImage->LocationOf(ptdbh->offTokenRefDescr));
    }
    else
    {
        m_cdwCompressedRefs = 0;
        m_pdwCompressedRefs = NULL;
        m_prldTokenRefs     = NULL;
    }

    // Attach the token stream.  Without phrase feedback there is no stream,
    // but the "next token" pointers still encode the token count.

    if (FPhraseFeedback())
    {
        vbTokenStream.Base = LPVOID(pDiskImage->LocationOf(ptdbh->offTokens));
        m_puiTokenNext     = TokenBase() + ptdbh->cTokens;
    }
    else
    {
        vbTokenStream.Base = NULL;
        m_puiTokenNext     = PUINT(0) + ptdbh->cTokens;
    }

    m_pltNext = PLocalToken(m_puiTokenNext);

    m_lcidSorting = ptdbh->lcid;

    if (fUnpackDisplayForm)
    {
        // Decode the UNICODE display images into their own virtual buffer.

        UINT cbImages = ptdbh->cwDispImages * sizeof(WCHAR);

        CreateVirtualBuffer(&vbDisplayImages, cbImages, cbImages);

        PWCHAR pwDispLimit = DisplayBase() + (Decode((PUINT)pDiskImage->LocationOf(ptdbh->offDispImages),
                                                     ptdbh->cnDispSortKeys, (PBYTE)DisplayBase()) >> 1);

        m_pwDispNextGalactic = pwDispLimit;
        m_pwDispNextGlobal   = pwDispLimit;
        m_pwDispNext         = pwDispLimit;
        m_pwDispLastGalactic = DisplayBase();

        // Reconstruct the token descriptors.
        //
        // BugBug! This isn't just an address attachment because we have to
        //         fix up the image addresses.  Changing those pointers to
        //         offsets will eliminate this work.

        const UINT cUnique       = ptdbh->cUniqueTokens;
        UINT       cbDescriptors = (cUnique + 1) * sizeof(DESCRIPTOR);   // one extra sentinel descriptor
        PWCHAR     pwcBase       = DisplayBase();

        CreateVirtualBuffer(&vbImageDescriptors, cbDescriptors, cbDescriptors);

        PDESCRIPTOR pd = DescriptorBase();
        UINT        c;

        // The sentinel's display pointer marks the end of the last image.

        pd[cUnique].pwDisplay = pwcBase + ptdbh->cwDispImages;

        PUINT pcRefs = PUINT(pDiskImage->LocationOf(ptdbh->offReferenceCounts));

        for (c = 0; c < cUnique; ++c) pd[c].cReferences = pcRefs[c];

        // The flags are stored as (charset, image flags) byte pairs.

        PBYTE pbFlags = PBYTE(pDiskImage->LocationOf(ptdbh->offDescriptorFlags));

        for (c = 0; c < cUnique; ++c)
        {
            pd[c].bCharset    = *pbFlags++;
            pd[c].fImageFlags = *pbFlags++;
        }

        // The display offsets come from a compressed set, delivered in chunks.

        CCompressedSet* pcsOffsets  = NULL;
        CCmpEnumerator* pEnumerator = NULL;

        __try
        {
            AttachRef(pcsOffsets, CCompressedSet::CreateImage(pDiskImage));

            pEnumerator = CCmpEnumerator::NewEnumerator(pcsOffsets);

            PDESCRIPTOR pdFill     = DescriptorBase();
            UINT        cRemaining = cUnique;

            while (cRemaining)
            {
                UINT cChunk = cRemaining;

                const UINT *puiOffset = pEnumerator->NextDWordsIn(&cChunk);

                cRemaining -= cChunk;

                while (cChunk--) (pdFill++)->pwDisplay = pwcBase + *puiOffset++;
            }
        }
        __finally
        {
            if (pEnumerator) { delete pEnumerator;  pEnumerator = NULL; }
            if (pcsOffsets ) DetachRef(pcsOffsets);
        }

        // Each image length is the distance to the next descriptor's image.

        for (c = 0; c < cUnique; ++c)
            pd[c].cwDisplay = pd[c + 1].pwDisplay - pd[c].pwDisplay;

        pd += cUnique;   // pd now addresses the sentinel descriptor

        // CopyMemory(pd, pDiskImage->LocationOf(ptdbh->offTokenDescriptors), sizeof(DESCRIPTOR) * c);

        // Now we can build the token sort keys.

        ASSERT(!ImageBase());

        m_cwDisplayMax = ptdbh->cwMaxUniqueToken;

        UINT cbSortKeys = ptdbh->cbTokenImages * sizeof(WCHAR);

#if 1

        // The sort keys are regenerated from the display images rather than
        // decoded from the stored image.

        CreateVirtualBuffer(&vbTokenImages, cbSortKeys, cbSortKeys);

        m_pbNext         = ImageBase();
        m_pbLastGalactic = m_pbNext;

        __try
        {
            pd = DescriptorBase();

            for (PDESCRIPTOR pdLimit = pd + cUnique; pd != pdLimit; ++pd)
            {
                pd->pbImage = m_pbNext;

                m_pbNext += LCSortKeyW(m_lcidSorting, 0, pd->pwDisplay, pd->cwDisplay, m_pbNext, MaxSortKeyBytes(pd->cwDisplay));
            }
        }
        __except (ExceptionFilter(GetExceptionCode(), GetExceptionInformation()))
        {
            RaiseException(STATUS_NO_MEMORY, EXCEPTION_NONCONTINUABLE, 0, NULL);
        }

        // pd addresses the sentinel again; it marks the end of the key images.

        m_pbNextGlobal   = m_pbNext;
        m_pbNextGalactic = m_pbNext;
        pd->pbImage      = m_pbNext;

#else // 1

        CreateVirtualBuffer(&vbTokenImages, cbSortKeys, cbSortKeys);

        m_pbNext=
        m_pbNextGlobal=
        m_pbNextGalactic= ImageBase() + (Decode((PUINT)pDiskImage->LocationOf(ptdbh->offTokenImages),
                                                ptdbh->cnTokenSortKeys, (PBYTE)ImageBase()) >> 1);
        m_pbLastGalactic= ImageBase();

        int ibDelta = ImageBase () - pd->pbImage;
        // int ibDispDelta = DisplayBase() - pd->pwDisplay;

        for ( ; c--; pd++)
        {
            // pd->pwDisplay += ibDispDelta;
            pd->pbImage += ibDelta;
        }

#endif // 1

        // Rebuild the two sorted descriptor vectors.  The stored pointers are
        // relative to the descriptor base recorded in the header (pdOld), so
        // each one is rebased onto the current descriptor buffer.

        m_cdSorted = ptdbh->cUniqueTokens;

        ASSERT(!m_ppdSorted && !m_ppdTailSorted);

        int cbDelta = PBYTE(DescriptorBase()) - PBYTE(ptdbh->pdOld);

        m_ppdSorted = (PDESCRIPTOR *) VAlloc(FALSE, m_cdSorted * sizeof(PDESCRIPTOR));

        PDESCRIPTOR *ppdSrc = (PDESCRIPTOR *) (pDiskImage->LocationOf(ptdbh->offppdSorted));

        for (c = 0; c < m_cdSorted; ++c)
            m_ppdSorted[c] = (PDESCRIPTOR) (PBYTE(ppdSrc[c]) + cbDelta);

        m_ppdTailSorted = (PDESCRIPTOR *) VAlloc(FALSE, m_cdSorted * sizeof(PDESCRIPTOR));

        ppdSrc = (PDESCRIPTOR *) (pDiskImage->LocationOf(ptdbh->offppdTailSorted));

        for (c = 0; c < m_cdSorted; ++c)
            m_ppdTailSorted[c] = (PDESCRIPTOR) (PBYTE(ppdSrc[c]) + cbDelta);

        ASSERT(pDiskImage->IsFTSFile());

        // Images at the minimum file version get their vectors re-sorted.

        if (pDiskImage->VersionIndex() == FTSVERSION_MIN)
        {
            qsort(m_ppdSorted    , m_cdSorted, sizeof(PDESCRIPTOR), CompareImagesLR);
            qsort(m_ppdTailSorted, m_cdSorted, sizeof(PDESCRIPTOR), CompareImagesRL);
        }
    }
    else CCompressedSet::SkipImage(pDiskImage);

    // Now we can connect the descriptor limit pointers.

    PDESCRIPTOR pdEnd = DescriptorBase() + ptdbh->cUniqueTokens;

    m_pdNextBound    = pdEnd;
    m_pdNextGalactic = pdEnd;
    m_pdNextGlobal   = pdEnd;
    m_pdNext         = pdEnd;

#if 0
    // We'll reconstruct the galactic hash table based on the contents
    // of the unique descriptor set.

    CAValRef *pavr= NULL;

    __try
    {
        m_pshtGalactic= CSegHashTable::NewSegHashTable(sizeof(TermTagGalactic), sizeof(UINT));

        m_pshtGlobal= CSegHashTable::NewSegHashTable(sizeof(TermTagGlobal), sizeof(UINT));

        pavr= DescriptorList(DescriptorBase(), ptdbh->cUniqueTokens);

        m_pshtGalactic->Assimilate(pavr, NULL, MergeSerial, AddSerial);
    }
    __finally
    {
        if (pavr) { delete pavr;  pavr= NULL; }
    }
#endif // 0

    m_iSerialNumberNext = ptdbh->cUniqueTokens;

    // Attach the classification data for the unique tokens.

    ASSERT(!m_pafClassifications);

    m_pafClassifications = PUINT(pDiskImage->LocationOf(ptdbh->offpafClassifications));

    CopyMemory(&m_clsfTokens, PBYTE(pDiskImage->LocationOf(ptdbh->offClassifier)), sizeof(m_clsfTokens));

    // Reconstruct (or step over) the Symbols indicator set.

    if (FPhrases())
         AttachRef(m_pisSymbols, CIndicatorSet::CreateImage(pDiskImage));
    else CIndicatorSet::SkipImage(pDiskImage);

    // Connect the compressed article and vocabulary reference lists.

    m_cdwArticleRefs  = ptdbh->cdwArticleRefs;
    m_pdwArticleRefs  = (m_cdwArticleRefs)? PUINT(pDiskImage->LocationOf(ptdbh->offArticleRefs)) : NULL;
    m_prldArticleRefs = PRefListDescriptor(pDiskImage->LocationOf(ptdbh->offArticleRefDescr));

    m_cdwVocabularyRefs  = ptdbh->cdwVocabularyRefs;
    m_pdwVocabularyRefs  = (m_cdwVocabularyRefs)? PUINT(pDiskImage->LocationOf(ptdbh->offVocabularyRefs )) : NULL;
    m_prldVocabularyRefs = PRefListDescriptor(pDiskImage->LocationOf(ptdbh->offVocabularyDescr));

    // The per-local-dict vectors are currently NOT copied back:

    //  UINT c= (ptdbh->cLocalDicts + 1) * sizeof(UINT);

    //  ZeroMemory(m_pulstate->m_apLocalDict, c);

    //  CopyMemory(m_pulstate->m_aiBaseToken, PUINT(pDiskImage->LocationOf(ptdbh->offaiBaseToken)), c);
    //  CopyMemory(m_pulstate->m_aiBaseCByte, PUINT(pDiskImage->LocationOf(ptdbh->offaiBaseCByte)), c);

    if (FVectorSearch())
    {
        m_pDict = CDictionary::CreateImage(pDiskImage);
        m_pColl = CCollection::CreateImage(pDiskImage);
    }
}
|
|
|
|
int CTextDatabase::AppendText(PWCHAR pwText, int cwText, BOOL fArticleEnd, UINT iCharset, UINT lcid)
{
    // AppendText adds text to a text database.  Calling AppendText with
    // pwText == NULL or cwText == 0 is a signal to synchronize the access
    // data structure with the current text (SyncForQueries).  This is
    // necessary before any queries are made against the database.
    //
    // Large text streams may be given to AppendText in segments.  We
    // assume that no tokens are broken across segments.  The easiest
    // way to insure that is to split segments at linebreak boundaries.
    //
    // Note: The AppendSlave code assumes that it can store a trailing zero
    //
    //          *(pwText+cwText)= 0
    //
    //       without harm.
    //
    // Returns the number of characters left unconsumed (0 for the
    // synchronize-only call).

    if (!pwText || !cwText)
    {
        SyncForQueries();

        return 0;
    }

    __try
    {
        while (cwText)
        {
            int cwScanned = AppendSlave(pwText, cwText, fArticleEnd, iCharset, lcid);

            if (cwScanned < 0)
            {
                // A negative count means AppendSlave reached the end of the
                // passed buffer; its magnitude is the amount consumed.

                pwText -= cwScanned;
                cwText += cwScanned;

                break;
            }

            // Otherwise keep passing the remaining part of the buffer.

            pwText += cwScanned;
            cwText -= cwScanned;
        }
    }
    __except (ExceptionFilter(GetExceptionCode(), GetExceptionInformation()))
    {
        RaiseException(STATUS_NO_MEMORY, EXCEPTION_NONCONTINUABLE, 0, NULL);
    }

    return cwText;
}
|
|
|
|
int CTextDatabase::ExceptionFilter
|
|
( IN DWORD ExceptionCode,
|
|
IN PEXCEPTION_POINTERS ExceptionInfo
|
|
)
|
|
{
|
|
// Routine Description:
|
|
//
|
|
// This function is an exception filter that handles exceptions that
|
|
// referenced uncommitted but reserved memory controlled by *ptdbc.
|
|
// It this filter routine is able to commit the additional pages needed
|
|
// to allow the memory reference to succeed, then it will re-execute the
|
|
// faulting instruction. If it is unable to commit the pages, it will
|
|
// execute the callers exception handler.
|
|
//
|
|
// If the exception is not an access violation or is an access
|
|
// violation but does not reference memory contained in the reserved
|
|
// areas used by *ptdbc, then this filter passes the exception
|
|
// on up the exception chain.
|
|
//
|
|
// Arguments:
|
|
//
|
|
// ExceptionCode - Reason for the exception.
|
|
//
|
|
// ExceptionInfo - Information about the exception and the context
|
|
// that it occurred in.
|
|
//
|
|
// Return Value:
|
|
//
|
|
// Exception disposition code that tells the exception dispatcher what
|
|
// to do with this exception. One of three values is returned:
|
|
//
|
|
// EXCEPTION_EXECUTE_HANDLER - execute the exception handler
|
|
// associated with the exception clause that called this filter
|
|
// procedure.
|
|
//
|
|
// EXCEPTION_CONTINUE_SEARCH - Continue searching for an exception
|
|
// handler to handle this exception.
|
|
//
|
|
// EXCEPTION_CONTINUE_EXECUTION - Dismiss this exception and return
|
|
// control to the instruction that caused the exception.
|
|
|
|
UINT FaultingAddress;
|
|
|
|
// If this is an access violation touching memory within
|
|
// our reserved buffer, but outside of the committed portion
|
|
// of the buffer, then we are going to take this exception.
|
|
|
|
if (ExceptionCode != STATUS_ACCESS_VIOLATION) return EXCEPTION_CONTINUE_SEARCH;
|
|
|
|
// We pass all other exceptions up the chain.
|
|
|
|
// Get the virtual address that caused the access violation
|
|
// from the exception record.
|
|
|
|
FaultingAddress= (UINT )(ExceptionInfo->ExceptionRecord
|
|
->ExceptionInformation[1]);
|
|
|
|
UINT cbPageSize= vbTokenStream.PageSize;
|
|
|
|
int i;
|
|
|
|
for (i=0; i < COUNT_OF_VIRTUAL_BUFFERS; i++)
|
|
{
|
|
MY_VIRTUAL_BUFFER *pvb= &(m_avb[i]);
|
|
|
|
if ( FaultingAddress < (UINT ) pvb->CommitLimit
|
|
|| FaultingAddress > (UINT ) ((PBYTE) pvb->ReserveLimit - cbPageSize)
|
|
) continue;
|
|
|
|
// This is our exception. Try to extend the buffer
|
|
// to including the faulting address.
|
|
|
|
FaultingAddress+= BUFFER_INCREMENT;
|
|
|
|
if (FaultingAddress >= (UINT ) ((PBYTE) pvb->ReserveLimit - cbPageSize))
|
|
FaultingAddress = (UINT ) ((PBYTE) pvb->ReserveLimit - cbPageSize - 1);
|
|
|
|
if (ExtendVirtualBuffer(pvb, (PBYTE) FaultingAddress))
|
|
return EXCEPTION_CONTINUE_EXECUTION;
|
|
else
|
|
return EXCEPTION_CONTINUE_SEARCH;
|
|
}
|
|
|
|
return EXCEPTION_CONTINUE_SEARCH;
|
|
}
|
|
|
|
PLocalDictionary CTextDatabase::AllocateLocalDictionary()
{
    // Allocates an empty local dictionary, makes it the current one
    // (m_pulstate->pld) and clears the per-dictionary hashing state.
    // There must be no current local dictionary when this is called.
    //
    // Returns the new dictionary.

    ASSERT( m_pulstate);
    ASSERT(!(m_pulstate->pld));

    PLocalDictionary pldNew = (PLocalDictionary) VAlloc(FALSE, sizeof(LocalDictionary));

    // The dictionary's tokens begin at the current end of the token stream.

    pldNew->pltFirst = m_pltNext;
    pldNew->clt      = 0;
    pldNew->ppdNext  = pldNew->apdLocal;

    // NOTE(review): the first three slots are preset although ppdNext still
    // addresses slot 0 -- confirm the intent.

    PDESCRIPTOR pdBase = DescriptorBase();

    pldNew->apdLocal[0] = pdBase;
    pldNew->apdLocal[1] = pdBase + 1;
    pldNew->apdLocal[2] = pdBase + 2;

    ZeroMemory(pldNew->aiTokenInstFirst, ENTRIES_PER_LOCAL_DICT * sizeof(USHORT));

    m_pulstate->pld = pldNew;

#ifdef _DEBUG
    m_pulstate->cCollisions = 0;
#endif // _DEBUG

    // Reset the hash classes, the collision chains and the reference counts.

    ZeroMemory(&(m_pulstate->appdLocalClasses   ), sizeof(PDESCRIPTOR *) * LOCAL_HASH_CLASSES    );
    ZeroMemory(&(m_pulstate->appdCollisionChains), sizeof(PDESCRIPTOR *) * ENTRIES_PER_LOCAL_DICT);
    ZeroMemory(&(m_pulstate->cReferences        ), sizeof(UINT         ) * ENTRIES_PER_LOCAL_DICT);

    return pldNew;
}
|
|
|
|
int CTextDatabase::AppendSlave(PWCHAR pwText, int cwText, BOOL fArticleEnd, UINT iCharset, UINT lcid)
{
    // Tokenizes up to MAX_TOKENS tokens from the start of pwText and enters
    // each one into the current local dictionary (and, for vector searching,
    // into the concept dictionary and collection).
    //
    //   pwText, cwText -- the text segment and its length in characters.
    //   fArticleEnd    -- TRUE when the segment completes an article; when it
    //                     does not, the last token is held back and
    //                     reprocessed with the text that follows.
    //   iCharset, lcid -- recorded with any token added to the dictionary.
    //
    // Returns the number of characters consumed.  The count is negated when
    // the word breaker reports nothing left to scan, which tells AppendText
    // that the passed buffer is fully processed.

    CAbortSearch::CheckContinueState();

#define MAX_TOKENS 1000

    int   n, nChar, nTokens;
    int   nMore = cwText;
    DWORD dwConId;

    m_pulstate->pbBuffer      = pwText;
    m_pulstate->pbCurrentLine = pwText;

    if (!m_pulstate->pld) AllocateLocalDictionary();  // Sets m_pulstate->pld as a side effect...

    // The word breaker's work arrays are allocated on first use and kept.

    if (!m_pwHash)  m_pwHash  = New UINT[MAX_TOKENS];
    if (!m_pbType)  m_pbType  = New BYTE[MAX_TOKENS];
    if (!m_paStart) m_paStart = New PWCHAR[MAX_TOKENS];
    if (!m_paEnd)   m_paEnd   = New PWCHAR[MAX_TOKENS];

    if (!(pwText && m_pwHash && m_paStart && m_pbType && m_paEnd))
    {
        // Nothing can be tokenized; account for the text and move on.

        nChar = cwText;
        m_cbScanned += nChar;

        return nChar;
    }

    PWCHAR pwTextStart = pwText;

    nTokens = WordBreakW(&pwText, &nMore, m_paStart, m_paEnd, m_pbType, m_pwHash, MAX_TOKENS, REMOVE_SPACE_CHARS);

    if (nTokens > 1 && (nMore || !fArticleEnd)) // exhausted token space OR more article
    {
        if (nTokens > 2 && !(m_pbType[nTokens-1] & WORD_TYPE))
            nTokens--;                                  // break at word starts (punc not to span)

        nChar = m_paStart[--nTokens] - pwTextStart;     // reprocess last token
    }
    else
        nChar = cwText;                                 // processed entire buffer

    for (n = 0; n < nTokens; n++)
    {
        // SearchLocalTable may start a new local dictionary, and that path
        // (MoveToNextLocalDict -> BindToGlobalDict) assigns m_pltNext.  The
        // lookup must therefore complete before m_pltNext is read and
        // advanced; a single assignment expression leaves that order
        // unspecified before C++17.

        USHORT iLocalEntry = SearchLocalTable(m_paStart[n], m_paEnd[n] - m_paStart[n], m_pwHash[n], m_pbType[n], iCharset, lcid);

        ((m_pltNext)++)->iLocalDescriptorEntry = iLocalEntry;

        if (FVectorSearch())
        {
            // Enter only the word type tokens into the dictionary

            if (m_pbType[n] & WORD_TYPE)
            {
                dwConId = m_pDict->EnterWord(m_paStart[n], m_paEnd[n] - m_paStart[n]);

                if (dwConId != EOL && dwConId != STOPWORD)
                    m_pColl->RecordConcept(dwConId);
            }
        }

        // A local dictionary holds at most MAX_REFS_PER_LDICT token references.

        if (++(m_pulstate->pld->clt) == MAX_REFS_PER_LDICT)
            MoveToNextLocalDict(m_paEnd[n]);
    }

    m_cbScanned += nChar;

    if (!nMore)
        nChar = -nChar;     // marks that text buffer is fully processed

    return nChar;
}
|
|
|
|
USHORT CTextDatabase::SearchLocalTable(PWCHAR pbToken, UINT cbToken, UINT hv, BYTE bType, UINT iCharset, UINT lcid)
{
    // Looks the token up in the current local dictionary through its hash
    // classes and collision chains, adding it when it is not present.
    //
    //   pbToken, cbToken -- the token text and its length in characters.
    //   hv               -- the token's hash value.
    //   bType            -- token type flags; WORD_TYPE marks the new
    //                       descriptor as LETTER_CHAR.
    //   iCharset         -- stored in a newly created descriptor.
    //   lcid             -- only passed along on the recursive call.
    //
    // Returns the token's index within the local dictionary.  The token's
    // entry in m_pulstate->cReferences is incremented in every case.

    PLocalDictionary pld = m_pulstate->pld;

    // pppdLink always addresses the link that leads to ppdEntry: first the
    // hash class head, afterwards the collision chain slot of the previous
    // entry.

    PDESCRIPTOR **pppdLink = &(m_pulstate->appdLocalClasses[(hv ^ (hv >> 16)) & LOCAL_HASH_MASK]);
    PDESCRIPTOR  *ppdEntry = *pppdLink;

    while (ppdEntry)
    {
        PDESCRIPTOR pdCandidate = *ppdEntry;

        if (   CwDisplay(pdCandidate) == cbToken
            && !wcsncmp(pdCandidate->pwDisplay, pbToken, cbToken)
           )
        {
            USHORT iFound = (USHORT) (ppdEntry - &(pld->apdLocal[0]));

            ++(m_pulstate->cReferences[iFound]);

            return iFound;
        }

        pppdLink = m_pulstate->appdCollisionChains + (ppdEntry - &(pld->apdLocal[0]));
        ppdEntry = *pppdLink;
    }

    // This token doesn't match anything in the local token set.
    // So we must add it to the local set.

    if (pld->ppdNext == &pld->apdLocal[ENTRIES_PER_LOCAL_DICT])
    {
        // When the local dictionary is full, we create a new
        // dictionary and call ourselves recursively to add the
        // new token.

        MoveToNextLocalDict(pbToken);

        return SearchLocalTable(pbToken, cbToken, hv, bType, iCharset, lcid);
    }

    ppdEntry = (pld->ppdNext)++;

    PDESCRIPTOR pdNew = (m_pdNext)++;

#ifdef _DEBUG
    m_pdNext->pbImage = PWCHAR(-1);
#endif // _DEBUG

    // Copy the token text to the end of the display images.  The following
    // descriptor's display pointer marks where the new image ends.

    PWCHAR pwImage = m_pwDispNext;

    m_pwDispNext       += cbToken;
    m_pdNext->pwDisplay = m_pwDispNext;

    memcpy(pwImage, pbToken, cbToken * sizeof(WCHAR));

    pdNew->bCharset    = iCharset;
    pdNew->fImageFlags = (bType & WORD_TYPE) ? LETTER_CHAR : 0;
    pdNew->cReferences = 0; // Necessary initialing for the FlattenAndMerge routine...

    *ppdEntry = pdNew;

    ASSERT(m_pulstate->appdCollisionChains > m_pulstate->appdLocalClasses); // Necessary for following test.

#ifdef _DEBUG
    if (pppdLink >= m_pulstate->appdCollisionChains) ++(m_pulstate->cCollisions);
#endif // _DEBUG

    // Hook the new entry onto the end of the chain we just searched.

    *pppdLink = ppdEntry;

    USHORT iAdded = (USHORT) (ppdEntry - &(pld->apdLocal[0]));

    ++(m_pulstate->cReferences[iAdded]);

    return iAdded;
}
|
|
|
|
PLocalDictionary CTextDatabase::MoveToNextLocalDict(PWCHAR pbScanLimit)
{
    // Closes the current local dictionary by binding it to the global
    // dictionary, merges reference lists when the memory estimate is over
    // budget, and starts a new local dictionary.
    //
    //   pbScanLimit -- position in the current text buffer up to which text
    //                  has been scanned; handed to BindToGlobalDict.
    //
    // Returns the newly allocated local dictionary.

    ASSERT(m_pulstate);
    ASSERT(m_pulstate->pld);

    BindToGlobalDict(pbScanLimit);

    // Whenever we complete a local dictionary, we check the space we've
    // consumed against a pair of thresholds. If the number of tokens is
    // over threshold #1, we flatten the reference lists, merge them with
    // the global reference lists, and restart our linked lists. If the
    // number of global dictionaries exceeds threshold #2, we merge the
    // global dictionary with the galactic dictionary and restart with an
    // empty global dictionary.
    //
    // The two thresholds are calculated based on the available memory
    // on the current machine. Threshold #1 is set to guarantee that we
    // can flatten the link lists in RAM. Threshold #2 guarantees that
    // all the global dictionary entries fit within memory when we're
    // binding a local dictionary to them.
    //
    // Only the simplified estimate below is active; the detailed
    // calculations are retained in the disabled blocks.

    int cTokens      = m_pulstate->m_aiBaseToken[m_cLocalDicts] - m_pulstate->m_aiBaseToken[m_iLocalDictBase];
    int cActiveDicts = m_cLocalDicts - m_iLocalDictBase;
    int cTokenRefs   = cTokens;
    int cGlobalTerms = m_pdNextGlobal - m_pdNextGalactic;

#if 0

    UINT cbUsage1=   sizeof(CompressionState)    * cGlobalTerms
                   + sizeof(ReferenceDescriptor) * cGlobalTerms
                   + sizeof(UINT) * (m_iSerialNumberNext + cGlobalTerms * 3
                                                         + cTokenRefs * 2
                                                         + MAX_GLOBAL_TOKENS * 2
                                    )
                   + sizeof(LocalDictionary) * cActiveDicts
                   + sizeof(*this);

#endif // 0

    UINT cbUsage1 = cActiveDicts * 11 * 65536;

    const BOOL fOverBudget = (cbUsage1 > cbAvailableMemory);

    if (fOverBudget)
    {
        FlattenAndMergeLinks();
        GalacticMerge();
    }

#if 0

    UINT cbUsage2=   sizeof(DESCRIPTOR) * cGlobalTerms
                   + sizeof(UINT      ) * cGlobalTerms
                   + ((PBYTE)m_pbNextGlobal     - (PBYTE)m_pbNextGalactic)
                   + ((PBYTE)m_pwDispNextGlobal - (PBYTE)m_pwDispNextGalactic)
                   + (PBYTE(m_pdNextGalactic) - (PBYTE) DescriptorBase());

    UINT cbUsage3= sizeof(LocalDictionary) + m_pshtGlobal->CbMemorySize() + sizeof(this);

    if (cbUsage2 > cbAvailableMemory || cbUsage3 > cbAvailableMemory) GalacticMerge();

#endif // 0

    return AllocateLocalDictionary();
}
|
|
|
|
CAValRef *CTextDatabase::DescriptorList(PDESCRIPTOR pd, UINT cd)
{
    // Builds a value reference list holding the display image (pointer and
    // length) of each of the cd descriptors starting at pd.
    //
    // Returns the new list.  If an exception unwinds through this routine
    // the partially built list is deleted first.

    CAbortSearch::CheckContinueState();

    CAValRef *pavrList = NULL;

    __try
    {
        pavrList = CAValRef::NewValRef(cd);

        PDESCRIPTOR pdLimit = pd + cd;

        while (pd != pdLimit)
        {
            pavrList->AddWCRef(pd->pwDisplay, CwDisplay(pd));

            ++pd;
        }
    }
    __finally
    {
        if (_abnormal_termination() && pavrList)
        {
            delete pavrList;

            pavrList = NULL;
        }
    }

    return pavrList;
}
|
|
|
|
void AddLocalEntries(UINT iValue, PVOID pvTag, PVOID pvEnvironment)
{
    // "Add" callback used by BindToGlobalDict when a local dictionary entry
    // is new to the global hash table.
    //
    //   iValue        -- index of the entry within the current local dictionary.
    //   pvTag         -- the TermTagGlobal created for the entry.
    //   pvEnvironment -- the LOCAL_CONTEXT_1 set up by the caller.
    //
    // Typed locals replace the former function-local macros, whose
    // mismatched #undef left one of them defined after this routine.

    LOCAL_CONTEXT_1 *pContext   = (LOCAL_CONTEXT_1 *) pvEnvironment;
    TermTagGlobal   *pGlobalTag = (TermTagGlobal   *) pvTag;

    (pContext->cAdded)++;

    CTextDatabase *ptdb = pContext->ptdb;

    DESCRIPTOR *pdGlobal = ptdb->m_pulstate->pld->apdLocal[iValue];

    // Record the descriptor's index and remember it in the remap vector so
    // the caller can tell which local entries were added.

    pGlobalTag->iGlobalDesc   = pdGlobal - ptdb->DescriptorBase();
    (pContext->ppde)[iValue]  = pdGlobal;

    pGlobalTag->cRefsNew      = ptdb->m_pulstate->cReferences[iValue];
//  pGlobalTag->iNewRefFirst  = pContext->iLTBase + pld->aiTokenInstFirst[iValue];
//  pGlobalTag->iNewRefLast   = pContext->iLTBase + ptdb->m_pulstate->aiTokenInstLast[iValue];
}
|
|
|
|
void MergeLocalEntries(UINT iValue, PVOID pvTag, PVOID pvEnvironment)
{
    // "Merge" callback used by BindToGlobalDict when a local dictionary
    // entry already exists in the global hash table.
    //
    //   iValue        -- index of the entry within the current local dictionary.
    //   pvTag         -- the existing TermTagGlobal for the term.
    //   pvEnvironment -- the LOCAL_CONTEXT_1 set up by the caller.
    //
    // Typed locals replace the former function-local macros, whose
    // mismatched #undef left one of them defined after this routine.

    LOCAL_CONTEXT_1 *pContext   = (LOCAL_CONTEXT_1 *) pvEnvironment;
    TermTagGlobal   *pGlobalTag = (TermTagGlobal   *) pvTag;

    CTextDatabase    *ptdb = pContext->ptdb;
    PLocalDictionary  pld  = ptdb->m_pulstate->pld;

    ASSERT(pGlobalTag->iGlobalDesc < pContext->iDescLimit); // Fails when we have duplicates in the
                                                            // list we're adding to the global dict.

    DESCRIPTOR *pdGlobal = ptdb->DescriptorBase() + pGlobalTag->iGlobalDesc;

//  if (!(pGlobalTag->cRefsNew)) pGlobalTag->iNewRefFirst= pContext->iLTBase + pld->aiTokenInstFirst[iValue];

    pGlobalTag->cRefsNew += ptdb->m_pulstate->cReferences[iValue];
//  pGlobalTag->iNewRefLast = pContext->iLTBase + ptdb->m_pulstate->aiTokenInstLast[iValue];

    // Point the local entry at the existing global descriptor.

    pld->apdLocal[iValue] = pdGlobal;
}
|
|
|
|
void FixupDescriptorIndex(UINT iValue, PVOID pvTag, PVOID pvEnvironment)
{
    // Merge callback that replaces a global tag's descriptor index.
    //
    //   iValue        -- position of the term in the assimilated list.
    //   pvTag         -- the term's TermTagGlobal.
    //   pvEnvironment -- vector of UINT descriptor indices, indexed by iValue.
    //
    // Typed locals replace the former function-local macros; neither of the
    // old #undef lines named a macro that had been defined, so both macros
    // stayed defined after this routine.

    PUINT          paiNewIndex = (PUINT          ) pvEnvironment;
    TermTagGlobal *pGlobalTag  = (TermTagGlobal *) pvTag;

    pGlobalTag->iGlobalDesc = paiNewIndex[iValue];
}
|
|
|
|
void CTextDatabase::BindToGlobalDict(PWCHAR pbScanLimit)
{
    // Binds the current local dictionary to the global dictionary:
    //
    //   1. threads the local token references into per-entry chains,
    //   2. records the dictionary and its cumulative byte and token counts,
    //   3. assimilates the local entries into the global hash table,
    //   4. compacts the descriptors and display images of the entries that
    //      were new, and fixes the hash table up to the compacted positions.
    //
    //   pbScanLimit -- position in the current text buffer up to which text
    //                  has been scanned; used for the cumulative byte count.
    //
    // Does nothing when there is no current local dictionary or when no
    // tokens were added since it was last bound.  When the dictionary is
    // full (references or entries) m_pulstate->pld is set to NULL.

    CAbortSearch::CheckContinueState();

    CAValRef    *pavr     = NULL;
    PDESCRIPTOR *ppdRemap = NULL;
    CAValRef    *pavrNew  = NULL;

    __try
    {
        PLocalDictionary pld;
        PLocalToken      pltBase, pltLimit;
        USHORT           iLocalDict;

        pld = m_pulstate->pld;

        if (!pld) return;

        if (m_pulstate->m_aiBaseToken[m_cLocalDicts] == UINT ((pld->pltFirst + pld->clt) - (PLocalToken) TokenBase()))
            return;

        // First we traverse the set of token references for this local
        // dictionary.  As we move along we construct the references
        // chains for the set of tokens, and we accumulate reference
        // counts.  Walking backwards leaves each chain in ascending order.

        pltBase = pld->pltFirst;

        m_pltNext = pltLimit = pltBase + pld->clt;

        ZeroMemory(           pld->aiTokenInstFirst, sizeof(USHORT) * ENTRIES_PER_LOCAL_DICT);
//      ZeroMemory(m_pulstate->aiTokenInstLast , sizeof(USHORT) * ENTRIES_PER_LOCAL_DICT);

        do
        {
            USHORT iDescriptor = (--pltLimit)->iLocalDescriptorEntry;
            USHORT iToken      = pltLimit - pltBase;

            pltLimit->iLocalReferenceNext = pld->aiTokenInstFirst[iDescriptor];

//          if (!(pltLimit->iLocalReferenceNext= pld->aiTokenInstFirst[iDescriptor]))
//              m_pulstate->aiTokenInstLast[iDescriptor]= iToken;

            pld->aiTokenInstFirst[iDescriptor] = iToken;

        } while (pltBase != pltLimit);

        // Then we add this local dictionary to the list and record
        // the cumulative text size, line count, and token count information
        // corresponding to it.  A dictionary that is already the last one
        // in the list keeps its slot.

        if (   !(iLocalDict = m_cLocalDicts)
            || pld != m_pulstate->m_apLocalDict[--iLocalDict]
           )
        {
            iLocalDict = m_cLocalDicts++;

            m_pulstate->m_apLocalDict[iLocalDict] = pld;
        }

        m_pulstate->m_aiBaseCByte[iLocalDict+1] = m_cbScanned + (pbScanLimit - m_pulstate->pbBuffer);
        m_pulstate->m_aiBaseToken[iLocalDict+1] = (pld->pltFirst + pld->clt) - (PLocalToken) TokenBase();

#ifdef _DEBUG
        m_pulstate->m_acLocalCollisions[iLocalDict] = m_pulstate->cCollisions;
#endif // _DEBUG

        // Describe every local entry to the global hash table.

        USHORT cLocalEntries = pld->ppdNext - pld->apdLocal;

        pavr = CAValRef::NewValRef(cLocalEntries);

        PDESCRIPTOR *ppd = pld->apdLocal;
        USHORT       c   = cLocalEntries;

        for (; c--; )
        {
            PDESCRIPTOR pd = *ppd++;

            pavr->AddWCRef(pd->pwDisplay, CwDisplay(pd));
        }

        // AddLocalEntries stores a descriptor pointer in ppdRemap for each
        // entry that was new; merged entries leave their slot NULL.

        ppdRemap = (PDESCRIPTOR *) VAlloc(TRUE, cLocalEntries * sizeof(PDESCRIPTOR));

        LOCAL_CONTEXT_1 lc;

        lc.ptdb       = NULL;  AttachRef(lc.ptdb, this);
        lc.ild        = iLocalDict;
        lc.ppde       = ppdRemap;
        lc.iDescLimit = m_pdNextGlobal - DescriptorBase();
        lc.iLTBase    = pld->pltFirst - (PLocalToken) TokenBase();
        lc.cAdded     = 0;

        m_pshtGlobal->Assimilate(pavr, &lc, MergeLocalEntries, AddLocalEntries);

        DetachRef(lc.ptdb);

        // Now we'll compact the set of token images by removing images
        // already in the global dictionary.

        if (lc.cAdded)
        {
            // Note! The code below overwrites *ppdRemap with UINT values.
            //       The packed UINT writes never run ahead of the slot being
            //       read as long as a UINT is no larger than a pointer.

            ASSERT(sizeof(UINT) <= sizeof(PDESCRIPTOR));

            PUINT        pui        = (PUINT) ppdRemap;
            PWCHAR       pwDispDest = m_pwDispNextGlobal;
            PDESCRIPTOR  pdDest     = m_pdNextGlobal;
            PDESCRIPTOR *ppdScan;
            UINT         iDesc      = pdDest - DescriptorBase();
            USHORT       cScan;

            for (cScan = cLocalEntries, ppdScan = ppdRemap; cScan--; ppdScan++)
            {
                PDESCRIPTOR pd = *ppdScan;

                if (!pd) continue;      // merged entry; nothing to move

                UINT iEntry = ppdScan - ppdRemap;

                *pui++ = iDesc++;

                pld->apdLocal[iEntry] = pdDest;

                if (pdDest != pd)
                {
                    // Slide the image and its descriptor down over the gap
                    // left by entries that were already global.

                    UINT cb = CwDisplay(pd);

                    MoveMemory(pwDispDest, pd->pwDisplay, cb * sizeof(WCHAR));

                    pd->pwDisplay = pwDispDest;

                    pwDispDest += cb;

                    *pdDest++ = *pd;
                }
                else
                {
                    pwDispDest += CwDisplay(pd);
                    ++pdDest;
                }
            }

            // The descriptor after the last one marks the end of its image.

            pdDest->pwDisplay = pwDispDest;

#if _DEBUG

            {
                UINT i;

                for (i = 0; i < cLocalEntries; ++i)
                    ASSERT(pld->apdLocal[i] < pdDest);

                UINT iLimit = pdDest - DescriptorBase();

                for (i = 0; i < lc.cAdded; ++i)
                    ASSERT(iLimit > ((PUINT)ppdRemap)[i]);
            }

#endif // _DEBUG

            // Now we'll fixup the global hash table entries
            // to reference the new descriptor locations.

            pavrNew = DescriptorList(m_pdNextGlobal, lc.cAdded);

            m_pshtGlobal->Assimilate(pavrNew, ppdRemap, FixupDescriptorIndex, NULL);

            delete pavrNew;  pavrNew = NULL;

            m_pwDispNextGlobal = pwDispDest;
            m_pdNextGlobal     = pdDest;
        }

        // Local allocation restarts at the end of the global data.

        m_pbNext     = m_pbNextGlobal;
        m_pwDispNext = m_pwDispNextGlobal;
        m_pdNext     = m_pdNextGlobal;

        ASSERT(256 > (m_pdNext->pwDisplay - (m_pdNext-1)->pwDisplay));

        delete pavr;  pavr = NULL;

        VFree(ppdRemap);  ppdRemap = NULL;

        if (   pld->clt     == MAX_REFS_PER_LDICT
            || pld->ppdNext == &pld->apdLocal[ENTRIES_PER_LOCAL_DICT]
           )
            m_pulstate->pld = NULL;
        else
        {
            // The dictionary stays current; restart its reference counts.

            ZeroMemory(m_pulstate->cReferences,
                       sizeof(int   ) * (ENTRIES_PER_LOCAL_DICT));
        }
    }
    __finally
    {
        if (pavrNew ) { delete pavrNew;   pavrNew  = NULL; }
        if (pavr    ) { delete pavr;      pavr     = NULL; }
        if (ppdRemap) { VFree(ppdRemap);  ppdRemap = NULL; }
    }
}
|
|
|
|
void GetSerial(UINT iValue, PVOID pvTag, PVOID pvEnv)
{
    // Merge callback for the galactic hash table: copies the serial number
    // already held by the term's tag into slot iValue of the context's
    // serial vector.

    LOCAL_CONTEXT_2 *pContext     = (LOCAL_CONTEXT_2 *) pvEnv;
    TermTagGalactic *pGalacticTag = (TermTagGalactic *) pvTag;

    pContext->paiSerial[iValue] = pGalacticTag->iGalacticDesc;
}
|
|
|
|
void NewSerial(UINT iValue, PVOID pvTag, PVOID pvEnv)
{
    // Add callback for the galactic hash table: gives the new term the next
    // serial number from the context, stores it in the term's tag and copies
    // it into slot iValue of the context's serial vector.

    LOCAL_CONTEXT_2 *pContext     = (LOCAL_CONTEXT_2 *) pvEnv;
    TermTagGalactic *pGalacticTag = (TermTagGalactic *) pvTag;

    pGalacticTag->iGalacticDesc = (pContext->iSerialNext)++;

    // Copy back from the tag so the vector holds exactly what the tag holds.

    pContext->paiSerial[iValue] = pGalacticTag->iGalacticDesc;
}
|
|
|
|
void RecordSerial(UINT iValue, PVOID pvTag, PVOID pvEnv)
{
    // Merge callback for the global hash table: stores serial number iValue
    // from the vector passed as pvEnv into the term's global tag.

    const UINT    *paiSerialNumbers = (UINT *) pvEnv;
    TermTagGlobal *pGlobalTag       = (TermTagGlobal *) pvTag;

    pGlobalTag->iGalacticDesc = paiSerialNumbers[iValue];
}
|
|
|
|
void ExtractStatistics(PVOID pvTag, PVOID pvEnv)
{
    // Per-term callback (passed to m_pshtGlobal->ForEach by
    // FlattenAndMergeLinks).  For one global term it
    //
    //   - records the galactic-to-global mapping in puiMap (origin 1),
    //   - moves the term's new reference count into its CompressionState,
    //   - and, when there are new references, adds them to the term's global
    //     count and to the context's list and dword totals.
    //
    //   pvTag -- the term's TermTagGlobal.
    //   pvEnv -- the LOCAL_CONTEXT_3 accumulating the statistics.

    TermTagGlobal   *pGlobalTag = (TermTagGlobal   *) pvTag;
    LOCAL_CONTEXT_3 *pStats     = (LOCAL_CONTEXT_3 *) pvEnv;

    ASSERT(pStats->puiMap[pGlobalTag->iGalacticDesc] == 0);

    pStats->puiMap[pGlobalTag->iGalacticDesc] = pGlobalTag->iGlobalDesc + 1; // We store the map with origin == 1.

    UINT iGlobal = pGlobalTag->iGlobalDesc - pStats->idBase;

    CompressionState *pcs = &(pStats->paCS[iGlobal]);

    UINT cNewRefs = pcs->cRefs = pGlobalTag->cRefsNew;

    pGlobalTag->cRefsNew = 0;

    if (cNewRefs)
    {
        pGlobalTag->cRefsGlobal += cNewRefs;

        ++(pStats->cNewRefLists);

        pStats->cdw += cNewRefs + 2; // We'll be storing: iGalactic, cRefs, Ref1, ..., RefN
    }

#if 0

    // Disabled: size estimate for a compressed reference encoding.  It sat
    // after the early return for terms without new references.

    if (cNewRefs > 3)
    {
        UINT iFirst= pcs->iRef = pGlobalTag->iNewRefFirst;
        UINT span  = pGlobalTag->iNewRefLast - iFirst;

        --cNewRefs;

        UINT cbitsBasis = pcs->cbitsBasis = CBitsToRepresent((span - 1) / cNewRefs);
        UINT basis      = 1 << cbitsBasis;

        pcs->cbits= CBITS_BASIS_MASK + 8 * 3 * sizeof(UINT)
                                     + cNewRefs * (1+cbitsBasis)
                                     + (span + basis - cNewRefs - 1) / basis;
    }
    else pcs->cbits= cNewRefs? 8 * (cNewRefs + 1) * sizeof(UINT)
                             : 0;

    pStats->cdw += (31 + pcs->cbits) >> 5;

#endif // 0
}
|
|
|
|
void CTextDatabase::FlattenAndMergeLinks()
|
|
{
|
|
// This routine coalesces global reference lists. When it begins, each
|
|
// global term has two sets of references. The first set is a sequence of
|
|
// zero or more compressed index vectors. The second set are a collection
|
|
// of linked lists which tie together term references associated with a
|
|
// particular local dictionary.
|
|
//
|
|
// We combine the two sets by flattening the linked list set to construct
|
|
// reference vectors, and then we compress those vectors and merge them
|
|
// with first vector set.
|
|
//
|
|
// During the merging process we maintain an ordering dictated by the
|
|
// galactic hash table. That is, the reference vectors for each term
|
|
// is stored in the same order as the term would be stored in the
|
|
// galactic descriptor vector.
|
|
|
|
#ifdef _DEBUG
|
|
|
|
ASSERT( !m_pulstate->pld
|
|
|| m_pulstate->m_aiBaseToken[m_cLocalDicts]
|
|
== UINT((m_pulstate->pld->pltFirst + m_pulstate->pld->clt) - (PLocalToken) TokenBase())
|
|
);
|
|
|
|
#endif
|
|
|
|
m_pulstate->pld= 0;
|
|
|
|
if (m_puiTokenNext == PUINT(m_pltNext)) return;
|
|
|
|
// First we get galactic serial numbers for any new global terms.
|
|
// The member variable m_pdNextBound points to the first global
|
|
// descriptor which does not have a galactic serial number.
|
|
//
|
|
// Note: A galactic serial number is an ordering value defined in
|
|
// the galactic hash table.
|
|
|
|
UINT cOldTerms= m_pdNextBound - m_pdNextGalactic;
|
|
|
|
if (m_pdNextBound < m_pdNextGlobal)
|
|
{
|
|
PUINT paiSerial = NULL;
|
|
CAValRef *pavr = NULL;
|
|
|
|
__try
|
|
{
|
|
UINT cSlots= m_pdNextGlobal - m_pdNextBound;
|
|
|
|
paiSerial= (PUINT) VAlloc(FALSE, cSlots * sizeof(UINT));
|
|
|
|
pavr= DescriptorList(m_pdNextBound, cSlots);
|
|
|
|
LOCAL_CONTEXT_2 lc;
|
|
|
|
lc.iSerialNext = m_iSerialNumberNext;
|
|
lc.paiSerial = paiSerial;
|
|
|
|
m_pshtGalactic->Assimilate(pavr, &lc, &GetSerial, &NewSerial);
|
|
|
|
m_iSerialNumberNext= lc.iSerialNext;
|
|
|
|
m_pshtGlobal->Assimilate(pavr, paiSerial, RecordSerial, NULL);
|
|
|
|
PDESCRIPTOR pd= m_pdNextGlobal;
|
|
|
|
for (paiSerial += cSlots; cSlots--;) (--pd)->iGalactic= *--paiSerial;
|
|
|
|
m_pdNextBound= m_pdNextGlobal;
|
|
}
|
|
__finally
|
|
{
|
|
if (paiSerial) { VFree(paiSerial); paiSerial = NULL; }
|
|
if (pavr ) { delete pavr; pavr = NULL; }
|
|
}
|
|
}
|
|
|
|
LOCAL_CONTEXT_3 lc3;
|
|
|
|
UINT cGlobalTerms= m_pdNextGlobal - m_pdNextGalactic;
|
|
|
|
lc3.cdw = 0;
|
|
lc3.cNewRefLists = 0;
|
|
lc3.idBase = m_pdNextGalactic - DescriptorBase();
|
|
|
|
lc3.puiMap = NULL;
|
|
lc3.paCS = NULL;
|
|
|
|
PUINT paiGlobalToRefList = NULL;
|
|
PUINT padwRefs = NULL;
|
|
PUINT *papdwRefs = NULL;
|
|
CIndicatorSet *pisSymbols = NULL;
|
|
PLocalDictionary pld = NULL;
|
|
|
|
__try
|
|
{
|
|
lc3.puiMap = (PUINT) VAlloc(TRUE , m_iSerialNumberNext * sizeof(UINT));
|
|
lc3.paCS = (CompressionState *) VAlloc(FALSE,
|
|
cGlobalTerms
|
|
* sizeof(CompressionState)
|
|
);
|
|
|
|
#ifdef _DEBUG
|
|
|
|
FillMemory(lc3.paCS , cGlobalTerms * sizeof(CompressionState), UCHAR(-1));
|
|
|
|
#endif // _DEBUG
|
|
|
|
m_pshtGlobal->ForEach(&lc3, ExtractStatistics);
|
|
|
|
paiGlobalToRefList = (PUINT) VAlloc(FALSE, cGlobalTerms * sizeof(UINT));
|
|
|
|
UINT cbBuffer= lc3.cdw * sizeof(UINT);
|
|
|
|
padwRefs = (PUINT )(m_puioRefTemp->GetBuffer(&cbBuffer));
|
|
papdwRefs = (PUINT *) VAlloc(FALSE, lc3.cNewRefLists * sizeof(PUINT));
|
|
|
|
#ifdef _DEBUG
|
|
|
|
FillMemory(paiGlobalToRefList, cGlobalTerms * sizeof( UINT), UCHAR(-1));
|
|
FillMemory(padwRefs , cbBuffer , UCHAR(-1));
|
|
FillMemory(papdwRefs , lc3.cNewRefLists * sizeof(PUINT), UCHAR(-1));
|
|
|
|
#endif // _DEBUG
|
|
|
|
RefClusterDescriptor rcd;
|
|
|
|
rcd.iFilePosLow = m_ibNextFileBlockLow;
|
|
rcd.iFilePosHigh = m_ibNextFileBlockHigh;
|
|
rcd.cdw = lc3.cdw;
|
|
rcd.cTerms = lc3.cNewRefLists;
|
|
|
|
UINT cbClusterSet= RoundUp(cbBuffer, m_cbBlockSize);
|
|
|
|
UINT ibNewLow= m_ibNextFileBlockLow + cbClusterSet;
|
|
|
|
if (ibNewLow < m_ibNextFileBlockLow) ++m_ibNextFileBlockHigh;
|
|
|
|
m_ibNextFileBlockLow= ibNewLow;
|
|
|
|
PUINT *ppdwRef= papdwRefs;
|
|
|
|
PUINT puiSrc = lc3.puiMap,
|
|
pdwRef = padwRefs;
|
|
UINT c, i;
|
|
|
|
// The loop below sets up two index mappings --
|
|
//
|
|
// paiGlobalToRefList goes from global sequence to relative galactic sequence
|
|
// paiRefListToGlobal goes from relative galactic sequence to global sequence
|
|
//
|
|
// It also partitions the pdwRefs vector into space for each reference list.
|
|
|
|
lc3.idBase++; // To adjust for the origin-1 indices in lc3.puiMap.
|
|
|
|
CAbortSearch::CheckContinueState();
|
|
|
|
for (i= 0, c= m_iSerialNumberNext; c--; puiSrc++)
|
|
{
|
|
UINT iGlobal= *puiSrc;
|
|
|
|
if (!iGlobal) continue;
|
|
|
|
iGlobal -= lc3.idBase;
|
|
|
|
CompressionState *pcs= lc3.paCS + iGlobal;
|
|
|
|
if (!pcs->cRefs) continue;
|
|
|
|
pdwRef[0] = puiSrc - lc3.puiMap;
|
|
pdwRef[1] = pcs->cRefs;
|
|
|
|
*ppdwRef++= pdwRef + 2;
|
|
|
|
pdwRef += 2 + pcs->cRefs;
|
|
|
|
m_pdNextGalactic[iGlobal].cReferences += pcs->cRefs;
|
|
|
|
paiGlobalToRefList[iGlobal] = i++;
|
|
|
|
#if 0
|
|
|
|
// If we have more than three references, we store them as
|
|
// a compressed bit stream with this layout:
|
|
//
|
|
// cRefs, cBits, iRefFirst, { basis, delta1, ... , deltaN }
|
|
//
|
|
// where
|
|
//
|
|
// cRefs is a UINT which gives the number of encoded references
|
|
//
|
|
// cBits is a UINT which gives the length of the layout in bits.
|
|
//
|
|
// iRefFirst is a UINT which gives the index position of the first
|
|
// reference
|
|
//
|
|
// basis is a 5-bit value which defines the numerical basis for
|
|
// the trailing delta values
|
|
//
|
|
// delta1, ... , deltaN are variable length encoded values which
|
|
// give the distances between successive
|
|
// reference indices
|
|
//
|
|
// Each deltaI is a stream of K 1 bits followed by a zero bit and then
|
|
// a residue value. Residue values are log2(basis) bits long. The original
|
|
// distance value is (basis * K) + residue + 1. Note that K can be zero.
|
|
|
|
// When we have three or fewer references, we store them uncompressed
|
|
// immediately after the reference count.
|
|
|
|
UINT cdw= (31 + pcs->cbits) >> 5;
|
|
|
|
if (3 < (pdwRef[0]= pcs->cRefs))
|
|
{
|
|
pdwRef[1]= pcs->cbits; pcs->ibitNext= 96 + CBITS_BASIS_MASK;
|
|
|
|
ASSERT(pcs->cbitsBasis <= BASIS_MASK);
|
|
|
|
pdwRef[2]= pcs->iRef;
|
|
pdwRef[3]= pcs->cbitsBasis;
|
|
}
|
|
else pcs->ibitNext= 32;
|
|
|
|
prd->iSerialGalactic = puiSrc - lc3.puiMap;
|
|
prd->idwRefList = pdwRef - pdwRefs; pdwRef += cdw;
|
|
prd->cdwRefs = cdw;
|
|
prd->iLastRef = pcs->iRef;
|
|
|
|
++prd;
|
|
|
|
#endif // 0
|
|
|
|
}
|
|
|
|
VFree(lc3.puiMap); lc3.puiMap= NULL;
|
|
|
|
if (m_pisSymbols)
|
|
AttachRef(pisSymbols, m_pisSymbols->TakeIndicators(m_pltNext - (PLocalToken) TokenBase()));
|
|
else AttachRef(pisSymbols, CIndicatorSet::NewIndicatorSet(m_pltNext - (PLocalToken) TokenBase()));
|
|
|
|
ChangeRef(m_pisSymbols, pisSymbols);
|
|
DetachRef(pisSymbols);
|
|
|
|
// Now we're ready to process the local dictionaries.
|
|
// Note that we destroy and deallocate the local dictionaries as we
|
|
// process them.
|
|
|
|
for (; m_iLocalDictBase < m_cLocalDicts; ++m_iLocalDictBase)
|
|
{
|
|
CAbortSearch::CheckContinueState();
|
|
|
|
pld= m_pulstate->m_apLocalDict[m_iLocalDictBase];
|
|
|
|
m_pulstate->m_apLocalDict[m_iLocalDictBase]= NULL;
|
|
|
|
PLocalToken pltBase= pld->pltFirst;
|
|
|
|
UINT iltBase= pltBase - (PLocalToken) TokenBase();
|
|
|
|
UINT clt;
|
|
PLocalToken plt;
|
|
|
|
for (plt=pltBase, clt= pld->clt, i= iltBase; clt--; plt++)
|
|
if ((pld->apdLocal[plt->iLocalDescriptorEntry]->fImageFlags & LETTER_CHAR))
|
|
m_pisSymbols->RawSetBit(iltBase + (plt - pltBase));
|
|
|
|
UINT cLocalEntries= pld->ppdNext - pld->apdLocal;
|
|
|
|
// Now we're extracting the references for each token in the
|
|
// local dictionary.
|
|
|
|
UINT iEntry;
|
|
|
|
for (iEntry= 0; iEntry < cLocalEntries; iEntry++)
|
|
{
|
|
USHORT usLink = pld->aiTokenInstFirst[iEntry];
|
|
UINT iGlobal = pld->apdLocal [iEntry] - m_pdNextGalactic;
|
|
|
|
PUINT *ppdwRef= papdwRefs + paiGlobalToRefList[iGlobal];
|
|
PUINT pdwRef= *ppdwRef;
|
|
|
|
do
|
|
{
|
|
*pdwRef++= iltBase +usLink;
|
|
|
|
usLink= (pltBase+usLink)->iLocalReferenceNext;
|
|
|
|
} while (usLink);
|
|
|
|
*ppdwRef= pdwRef;
|
|
|
|
#if 0
|
|
|
|
CompressionState *pcs = lc3.paCS + iGlobal;
|
|
PUINT pdwBase = pdwRefs + pard[paiGlobalToRefList[iGlobal]].idwRefList;
|
|
UINT ibitNext = pcs->ibitNext;
|
|
|
|
if (pcs->cRefs > 3)
|
|
{
|
|
// For reference lists longer than three elements we use a compression
|
|
// strategy described by Alistair Moffat in Computing Systems, Vol. 5.
|
|
// No. 2, Page 125.
|
|
|
|
UINT cbitsBasis = pcs->cbitsBasis;
|
|
UINT fBasisMask = ~((~0) << cbitsBasis);
|
|
PUINT pui = pdwBase + (ibitNext >> 5);
|
|
UINT ibitBase = ibitNext & 31;
|
|
UINT ui = *pui & ~((~0) << ibitBase);
|
|
|
|
do
|
|
{
|
|
UINT iRef= iltBase +usLink;
|
|
|
|
ASSERT(iRef >= pcs->iRef);
|
|
|
|
usLink= (pltBase+usLink)->iLocalReferenceNext;
|
|
|
|
UINT delta= iRef - pcs->iRef;
|
|
|
|
if (delta--)
|
|
{
|
|
pcs->iRef= iRef;
|
|
|
|
UINT cOneBits, fraction, cbits, cbitsFraction;
|
|
|
|
for (cOneBits= delta >> cbitsBasis; cOneBits; cOneBits-= cbits)
|
|
{
|
|
cbits= (cOneBits <= 32 - ibitBase)? cOneBits : 32 - ibitBase;
|
|
|
|
ui |= (~((~0) << cbits)) << ibitBase;
|
|
|
|
ibitBase += cbits;
|
|
|
|
if (32 == ibitBase) { *pui++= ui; ui= 0; ibitBase= 0; }
|
|
}
|
|
|
|
fraction= (delta & fBasisMask) << 1;
|
|
|
|
for (cbitsFraction= cbitsBasis + 1; cbitsFraction; cbitsFraction-= cbits)
|
|
{
|
|
cbits= (cbitsFraction <= UINT(32 - ibitBase))? cbitsFraction : 32 - ibitBase;
|
|
|
|
ui |= fraction << ibitBase;
|
|
|
|
fraction >>= cbits;
|
|
ibitBase += cbits;
|
|
|
|
if (32 == ibitBase) { *pui++= ui; ui= 0; ibitBase= 0; }
|
|
}
|
|
}
|
|
|
|
} while (usLink);
|
|
|
|
if (ibitBase) *pui= ui;
|
|
|
|
pcs->ibitNext= ibitBase + ((pui - pdwBase) << 5);
|
|
}
|
|
else // For lists with three or fewer elements, we store the indices
|
|
// uncompressed.
|
|
{
|
|
do
|
|
{
|
|
ASSERT(!(ibitNext & 31));
|
|
|
|
pdwBase[ibitNext >> 5]= iltBase +usLink;
|
|
|
|
ibitNext += 32;
|
|
|
|
usLink= (pltBase+usLink)->iLocalReferenceNext;
|
|
|
|
} while (usLink);
|
|
|
|
pcs->ibitNext= ibitNext;
|
|
}
|
|
|
|
#endif // 0
|
|
|
|
// The line below overwrites apdLocal[i] via a union declaration to prepare for
|
|
// the token processing code following this loop.
|
|
|
|
pld->aiGalactic[iEntry]= m_pdNextGalactic[iGlobal].iGalactic;
|
|
}
|
|
|
|
// Now we'll convert the local tokens into galactic indices.
|
|
|
|
ASSERT(sizeof(UINT) == sizeof(LocalToken));
|
|
|
|
UINT cLocalTokens= pld->clt;
|
|
PUINT pui = (PUINT) pltBase;
|
|
|
|
CAbortSearch::CheckContinueState();
|
|
|
|
for (; cLocalTokens--; ) *pui++ = pld->aiGalactic[(pltBase++)->iLocalDescriptorEntry];
|
|
|
|
// We've finished with this local dictionary. Now we'll delete it to recover
|
|
// memory space.
|
|
|
|
VFree(pld); pld= NULL;
|
|
}
|
|
|
|
m_pisSymbols ->InvalidateCache();
|
|
|
|
VFree(papdwRefs); papdwRefs = NULL;
|
|
VFree(lc3.paCS ); lc3.paCS = NULL;
|
|
|
|
WriteLargeBuff(padwRefs, rcd.iFilePosLow, rcd.iFilePosHigh, cbBuffer);
|
|
|
|
m_puioRefTemp->FreeBuffer(padwRefs); padwRefs= NULL;
|
|
|
|
// Now we adjust m_puiTokenNext to account for the new tokens we've processed.
|
|
|
|
m_puiTokenNext= PUINT(m_pltNext);
|
|
|
|
#if 0
|
|
|
|
// Now we'll put the actual list sizes into the reference descriptors.
|
|
|
|
CompressionState *pcs= lc3.paCS;
|
|
|
|
for (c= cGlobalTerms, rsd.cdwRefs= 0; c--; pcs++)
|
|
{
|
|
if (pcs->cRefs)
|
|
rsd.cdwRefs += pard[paiGlobalToRefList[pcs - lc3.paCS]].cdwRefs= (31 + pcs->ibitNext) >> 5;
|
|
}
|
|
|
|
#endif // 0
|
|
|
|
if (m_paiGlobalToRefList) VFree(m_paiGlobalToRefList);
|
|
|
|
m_paiGlobalToRefList = paiGlobalToRefList; paiGlobalToRefList= NULL;
|
|
|
|
m_pulstate->m_rcd[m_iNextRefSet++]= rcd;
|
|
|
|
#if 0
|
|
|
|
// Finally we'll meld this reference set with previously accumulated reference sets.
|
|
|
|
if (m_iNextRefSet < MAX_REF_SETS && (!m_iNextRefSet || m_cMerges[m_iNextRefSet-1]))
|
|
{
|
|
m_rsd[m_iNextRefSet ]= rsd;
|
|
m_cMerges[m_iNextRefSet++]= 0;
|
|
}
|
|
else CoalesceReferenceLists(&rsd);
|
|
|
|
#endif // 0
|
|
}
|
|
__finally
|
|
{
|
|
if (pisSymbols) DetachRef(pisSymbols);
|
|
|
|
if (padwRefs) { m_puioRefTemp->FreeBuffer(padwRefs); padwRefs = NULL; }
|
|
|
|
if (pld ) { VFree(pld ); pld = NULL; }
|
|
if (lc3.puiMap ) { VFree(lc3.puiMap ); lc3.puiMap = NULL; }
|
|
if (lc3.paCS ) { VFree(lc3.paCS ); lc3.paCS = NULL; }
|
|
if (papdwRefs ) { VFree(papdwRefs ); papdwRefs = NULL; }
|
|
if (paiGlobalToRefList) { VFree(paiGlobalToRefList); paiGlobalToRefList = NULL; }
|
|
}
|
|
}
|
|
|
|
void CTextDatabase::WriteLargeBuff(PVOID pvBuffer, UINT iPosLow, UINT iPosHigh, UINT cbBuffer)
{
    // Writes cbBuffer bytes from pvBuffer through m_puioRefTemp, starting at the
    // file position given by the (iPosHigh, iPosLow) pair. The data goes out in
    // pieces of at most m_cbTransactionLimit bytes.
    //
    // A piece that fails with ERROR_DISK_FULL is retried for as long as
    // AskForDiskSpace() returns nonzero; otherwise STATUS_NO_DISK_SPACE is raised.
    // Any other nonzero completion code raises STATUS_DISK_WRITE_ERROR.

    PBYTE pbNext = (PBYTE) pvBuffer;

    while (cbBuffer)
    {
        UINT cbPiece = m_cbTransactionLimit;

        if (cbPiece > cbBuffer) cbPiece = cbBuffer;

        UINT uiStatus;

        do
        {
            m_puioRefTemp->Write(pbNext, iPosLow, iPosHigh, cbPiece, &uiStatus);

            if (uiStatus == ERROR_DISK_FULL && !m_puioRefTemp->AskForDiskSpace())
                RaiseException(STATUS_NO_DISK_SPACE, EXCEPTION_NONCONTINUABLE, 0, NULL);

        } while (uiStatus == ERROR_DISK_FULL);

        if (uiStatus)
            RaiseException(STATUS_DISK_WRITE_ERROR, EXCEPTION_NONCONTINUABLE, 0, NULL);

        // Advance the source pointer and the two-word file position,
        // carrying into the high word when the low word wraps.

        pbNext += cbPiece;

        UINT iPosLowNext = iPosLow + cbPiece;

        if (iPosLowNext < iPosLow) ++iPosHigh;

        iPosLow   = iPosLowNext;
        cbBuffer -= cbPiece;
    }
}
|
|
|
|
void CTextDatabase::GalacticMerge()
|
|
{
|
|
// This routine coalesces global reference list information with the galactic
|
|
// reference lists.
|
|
|
|
ASSERT(!m_pulstate->pld);
|
|
|
|
if (m_pdNextBound != m_pdNextGlobal) FlattenAndMergeLinks();
|
|
|
|
ASSERT(m_pdNextBound == m_pdNextGlobal);
|
|
|
|
if (m_pdNextGalactic == m_pdNextGlobal) return;
|
|
|
|
ASSERT(m_pshtGlobal);
|
|
|
|
delete m_pshtGlobal; m_pshtGlobal= NULL;
|
|
|
|
PDESCRIPTOR pdGlobal = NULL;
|
|
PWCHAR pwDispGlobal = NULL;
|
|
|
|
__try
|
|
{
|
|
// First we must coalesce the global descriptors and image strings into the
|
|
// galactic sets.
|
|
|
|
ASSERT( m_pdNextGalactic == DescriptorBase()
|
|
|| 256 > (m_pdNextGalactic->pwDisplay - (m_pdNextGalactic-1)->pwDisplay)
|
|
);
|
|
|
|
UINT cGlobalTerms = m_pdNextGlobal - m_pdNextGalactic;
|
|
UINT cbGlobalImages = m_pbNextGlobal - m_pbNextGalactic;
|
|
UINT cwDispGlobalImages = m_pwDispNextGlobal - m_pwDispNextGalactic;
|
|
|
|
// Since the galactic descriptors and images will overwrite the memory currently used
|
|
// for global information, we must copy that information to temporary buffers.
|
|
|
|
pdGlobal = (PDESCRIPTOR) VAlloc(FALSE, (cGlobalTerms+1) * sizeof(DESCRIPTOR));
|
|
pwDispGlobal = (PWCHAR ) VAlloc(FALSE, cwDispGlobalImages * sizeof(WCHAR));
|
|
|
|
// When we copy the image strings, all their base addresses will change
|
|
// by the same amount.
|
|
|
|
UINT deltaDispAddr= pwDispGlobal - m_pwDispNextGalactic;
|
|
|
|
CopyMemory(pdGlobal, m_pdNextGalactic, (cGlobalTerms+1) * sizeof(DESCRIPTOR));
|
|
CopyMemory(pwDispGlobal, m_pwDispNextGalactic, cwDispGlobalImages * sizeof(WCHAR));
|
|
|
|
UINT c;
|
|
PDESCRIPTOR pd = pdGlobal;
|
|
PDESCRIPTOR pdDest = DescriptorBase();
|
|
UINT cGalacticTerms = m_pdNextGalactic - DescriptorBase();
|
|
|
|
// The loop below merges the global term descriptors with the
|
|
// galactic set.
|
|
|
|
for (pd= pdGlobal, c= cGlobalTerms; c--;pd++)
|
|
{
|
|
// Note the union overlap of iGalactic and pbImage within
|
|
// the DESCRIPTOR structure.
|
|
|
|
UINT iGalactic= pd->iGalactic;
|
|
|
|
pd->cwDisplay = CwDisplay(pd);
|
|
pd->pwDisplay += deltaDispAddr;
|
|
|
|
if (iGalactic < cGalacticTerms)
|
|
pdDest[iGalactic].cReferences += pd->cReferences;
|
|
else pdDest[iGalactic]= *pd;
|
|
}
|
|
|
|
VFree(pdGlobal); pdGlobal = NULL;
|
|
|
|
UINT cTermsNew = m_iSerialNumberNext - cGalacticTerms;
|
|
PWCHAR pb = m_pbNextGalactic;
|
|
PWCHAR pwDisp = m_pwDispNextGalactic;
|
|
LCID lcid = GetUserDefaultLCID();
|
|
|
|
// Now we copy the global image strings into the galactic image space.
|
|
|
|
__try
|
|
{
|
|
for (pd= m_pdNextGalactic, c= cTermsNew; c--; pd++)
|
|
{
|
|
int cwDisp = pd->cwDisplay;
|
|
|
|
CopyMemory(pwDisp, pd->pwDisplay, cwDisp * sizeof(WCHAR));
|
|
|
|
|
|
int cb = LCSortKeyW(lcid, 0, pwDisp, cwDisp, pb, MaxSortKeyBytes(cwDisp));
|
|
|
|
pd->pbImage = pb;
|
|
|
|
pb += cb;
|
|
|
|
|
|
pd->pwDisplay = pwDisp;
|
|
|
|
pwDisp += cwDisp;
|
|
}
|
|
}
|
|
__except (ExceptionFilter(GetExceptionCode(), GetExceptionInformation()))
|
|
{
|
|
RaiseException(STATUS_NO_MEMORY, EXCEPTION_NONCONTINUABLE, 0, NULL);
|
|
}
|
|
|
|
VFree(pwDispGlobal); pwDispGlobal = NULL;
|
|
|
|
pd->pbImage= pb;
|
|
pd->pwDisplay = pwDisp;
|
|
|
|
m_pbNextGalactic= m_pbNextGlobal= m_pbNext= pb;
|
|
m_pwDispNextGalactic= m_pwDispNextGlobal= m_pwDispNext= pwDisp;
|
|
|
|
m_pdNextBound= m_pdNext= m_pdNextGlobal= m_pdNextGalactic += cTermsNew;
|
|
}
|
|
__finally
|
|
{
|
|
if (pdGlobal ) { VFree(pdGlobal ); pdGlobal = NULL; }
|
|
if (pwDispGlobal) { VFree(pwDispGlobal); pwDispGlobal = NULL; }
|
|
|
|
// Now we no longer need m_paiGlobalToRefList
|
|
|
|
if (m_paiGlobalToRefList) { VFree(m_paiGlobalToRefList); m_paiGlobalToRefList = NULL; }
|
|
}
|
|
|
|
// Finally we set up an empty global hash table.
|
|
|
|
m_pshtGlobal= CSegHashTable::NewSegHashTable(sizeof(TermTagGlobal), sizeof(UINT ));
|
|
}
|
|
|
|
VOID CTextDatabase::GetTextMatrix(int iRowStart, int iColStart,
                                  int cRows, int cCols, PWCHAR pbDest)
{
    // This routine extracts a rectangular sub-image from the text database.
    // It assumes that image rows are delimited by line break tokens.
    //
    // The top left corner of the image rectangle is denoted by iRowStart and
    // iColStart. The dimensions of the image rectangle are given by cRows and
    // cCols. The resulting character image will be stored in the array
    // referenced by pbDest. That array is assumed to be in row-major order
    // and to hold at least cRows*cCols characters.
    //
    // Where the image rectangle lies outside the data -- either beyond the last
    // row or beyond the rightmost column of a particular row -- blanks will
    // be stored in the destination array.
    //
    // In this version the database is treated as a single line (cLines is
    // fixed at 1), so only row zero can ever contain text.

    ASSERT(FPhraseFeedback());

    ASSERT(iRowStart >= 0 && iColStart >= 0 && cRows >= 0 && cCols >= 0);

    // First we preclear the destination area to all blanks. This allows easier
    // treatment of the various boundary conditions later in the code.

    for (int i = 0; i < cRows*cCols; i++)
        pbDest[i] = UNICODE_SPACE_CHAR;

    PWCHAR pbLine= pbDest;

    SyncForQueries();

    int cTokens = m_pisSymbols->ItemCount();
    int cLines  = 1;

    // If the starting row is beyond the last database row, we return all
    // blanks.

    if (iRowStart >= cLines) return;

    // If the last row of the image is beyond the end of the database, we
    // adjust cRows to go just up to the end of the database.

    if (iRowStart+cRows > cLines) cRows= cLines - iRowStart;

    if (!cRows || !cCols) return;

    // With no tokens there is nothing to format, and the line-length
    // computation below would underflow its unsigned token count.

    if (cTokens <= 0) return;

    PUINT paiLineStarts= (PUINT) _alloca((cRows+1) * sizeof(UINT));

    // paiLineStarts[k] holds the index of the delimiter preceding row k, and
    // paiLineStarts[cRows] the delimiter ending the last row. Every entry is
    // incremented below so that it addresses the first token of its row.

    if (!iRowStart)
    {
        *paiLineStarts= UINT(-1);

        // The terminating entry must always be written: it is read when the
        // length of the last row is computed. It used to be guarded by
        // "cRows > 1", which left it uninitialized in the only reachable
        // case (cRows == 1).
        //
        // NOTE(review): with this sentinel the row length works out to
        // cTokens-1, i.e. the final token is treated as a line delimiter --
        // confirm that this is the intended convention.

        paiLineStarts[cRows]= cTokens-1;
    }
    else
        // NOTE(review): unreachable while cLines is 1 (a nonzero iRowStart
        // returns above); kept as in the original.
        if (cRows >= 1)
            paiLineStarts[cRows-1]= cTokens-1;

    PUINT piLineStart;
    UINT  c;

    for (piLineStart= paiLineStarts, c= cRows+1; c--; ) *piLineStart++ += 1;

    int iColLimit= iColStart+cCols;

    for (piLineStart= paiLineStarts; cRows--; pbLine += cCols, ++piLineStart)
    {
        // The database elides single spaces between symbol tokens. To properly
        // account for those elided tokens, we must keep track of whether the last
        // token was a symbol. At the beginning of each line we set the flag to
        // FALSE because the preceding symbol is either a line break, or we started
        // at token zero.

        BOOL fPrecedingSymbol= FALSE;

        PUINT pToken= TokenBase() + *piLineStart;

        UINT cLineTokens = (*(piLineStart+1) - *piLineStart) - 1;

        // Now we process the tokens for this line until we reach the end of
        // the line.

        int         cbOffset;
        PDESCRIPTOR pd;

        for (cbOffset= 0;
             cLineTokens--;
             fPrecedingSymbol= pd->fImageFlags & LETTER_CHAR
            )
        {
            pd= DescriptorBase() + *pToken++;

            // In this token loop cbOffset tracks the offset location of the
            // current token within the full line image.

            // If this token is a symbol and the preceding token was a symbol,
            // we adjust cbOffset to account for the elided space character.

            if ((pd->fImageFlags & LETTER_CHAR) && fPrecedingSymbol) ++cbOffset;

            cbOffset= FormatAToken(pd, cbOffset, iColStart, iColLimit, pbLine);

            // Stop as soon as the row has been filled up to its right edge.

            if (cbOffset >= iColLimit) break;
        }
    }
}
|
|
|
|
void FindDescriptor(UINT iValue, PVOID pvTag, PVOID pvEnvironment)
{
    // Callback handed to Assimilate by FindTokens. pvEnvironment is a
    // LOCAL_CONTEXT_4 and pvTag a galactic term tag; the descriptor named by
    // the tag's iGalacticDesc is stored in result slot iValue.

    LOCAL_CONTEXT_4  *pContext = (LOCAL_CONTEXT_4 *) (pvEnvironment);
    PTermTagGalactic  pTag     = PTermTagGalactic(pvTag);

    pContext->ppd[iValue]= pContext->pdBase + pTag->iGalacticDesc;
}
|
|
|
|
PDESCRIPTOR *CTextDatabase::FindTokens(CTokenList *ptl, PUINT pcd)
{
    // Looks up every token of *ptl in the galactic hash table by its display
    // string and returns a newly allocated array with one descriptor slot per
    // token, filled in by the FindDescriptor callback. The caller owns the array
    // and must VFree it. If pcd is non-NULL it receives the number of slots.
    //
    // If an exception unwinds through this routine the result array is released.

    PDESCRIPTOR *ppdFound = NULL;
    CAValRef    *pavrKeys = NULL;
    UINT         cTokens  = 0;

    __try
    {
        PDESCRIPTOR *ppdSource = ptl->m_ppdSorted;

        cTokens  = ptl->m_cd;
        pavrKeys = CAValRef::NewValRef(cTokens);

        // Register the display string of every token as a lookup key.

        for (UINT iToken = 0; iToken < cTokens; iToken++)
        {
            PDESCRIPTOR pdToken = ppdSource[iToken];

            pavrKeys->AddWCRef(pdToken->pwDisplay, CwDisplay(pdToken));
        }

        ppdFound = (PDESCRIPTOR *) VAlloc(FALSE, cTokens * sizeof(PDESCRIPTOR));

        LOCAL_CONTEXT_4 context;

        context.ppd    = ppdFound;
        context.pdBase = DescriptorBase();

        m_pshtGalactic->Assimilate(pavrKeys, &context, FindDescriptor, NULL);
    }
    __finally
    {
        if (pavrKeys) { delete pavrKeys;  pavrKeys = NULL; }

        if (_abnormal_termination() && ppdFound)
        {
            VFree(ppdFound);  ppdFound = NULL;
        }
    }

    if (pcd) *pcd = cTokens;

    return ppdFound;
}
|
|
|
|
CIndicatorSet *CTextDatabase::TokenInstancesFor(CTokenList *ptl)
{
    // Returns an indicator set over all tokens (TokenCount() items) in which
    // every reference to any term of *ptl is marked.
    //
    // Token lists constructed as TDB_FULL_REF or TDB_PARTIAL_REF have their
    // descriptor pointers used directly. For any other list the terms are
    // looked up via FindTokens, which allocates a descriptor array that this
    // routine owns and releases.
    //
    // The result is passed through ForgetRef before being returned. If an
    // exception unwinds through this routine the partial result is detached.

    SyncForQueries();

    ASSERT(m_fFromFileImage || m_puioCompressedRefs);
    ASSERT(m_pdwCompressedRefs || !m_cdwCompressedRefs);

    PDESCRIPTOR *ppdSorted    = ptl->m_ppdSorted;
    UINT         cDescriptors = ptl->m_cd;

    BOOL fDirectRef= (   ptl->m_How_Constructed == CTokenList::TDB_FULL_REF
                      || ptl->m_How_Constructed == CTokenList::TDB_PARTIAL_REF
                     );

    // ppdLookup remembers the base of the array allocated by FindTokens.
    // ppdSorted is advanced while the descriptors are scanned, so it must not
    // be the pointer handed to VFree.

    PDESCRIPTOR *ppdLookup = NULL;

    if (!fDirectRef) ppdSorted= ppdLookup= FindTokens(ptl, &cDescriptors);

    ASSERT(ppdSorted);

    CIndicatorSet *pisDesc   = NULL;
    CIndicatorSet *pisTokens = NULL;
    PUINT          paiDesc   = NULL;

    __try
    {
        // Collapse the descriptor list into a set so each term is processed once.

        AttachRef(pisDesc, CIndicatorSet::NewIndicatorSet(DescriptorCount()));

        for (; cDescriptors--; ppdSorted++)
            pisDesc->RawSetBit(*ppdSorted - DescriptorBase());

        pisDesc->InvalidateCache();

        cDescriptors= pisDesc->SelectionCount();

        paiDesc= (PUINT) VAlloc(FALSE, cDescriptors * sizeof(UINT));

#ifdef _DEBUG
        UINT cbResult=
#endif // _DEBUG

        pisDesc->MarkedItems(0, (int *)paiDesc, cDescriptors);

        ASSERT(cbResult == cDescriptors);

        AttachRef(pisTokens, CIndicatorSet::NewIndicatorSet(TokenCount()));

        PUINT pi= paiDesc;

        // Mark the references of each distinct term in the result set.

        for (; cDescriptors--; )
        {
            CAbortSearch::CheckContinueState();

            IndicateRefs(m_prldTokenRefs + *pi++, m_pdwCompressedRefs, pisTokens, FALSE);
        }
    }
    __finally
    {
        if (paiDesc) VFree(paiDesc);
        if (pisDesc) DetachRef(pisDesc);

        // Release the lookup array through its original base address.

        if (ppdLookup) { VFree(ppdLookup);  ppdLookup= NULL; }

        if (_abnormal_termination() && pisTokens) DetachRef(pisTokens);
    }

    ForgetRef(pisTokens);

    return pisTokens;
}
|
|
|
|
CIndicatorSet *CTextDatabase::VocabularyFor(CIndicatorSet *pisArticles, BOOL fRemovePervasiveTerms)
{
    // Builds an indicator set over the descriptors (DescriptorCount() items)
    // marking every term referenced by the vocabulary lists of the articles
    // selected in pisArticles.
    //
    // When fRemovePervasiveTerms is set, a per-term counter array is handed to
    // IndicateRefs, and any term whose count equals the number of selected
    // articles is cleared from the result.
    //
    // With no articles selected an empty set is returned. On the normal path
    // the result is passed through ForgetRef before being returned.

    ASSERT(pisArticles->ItemCount() == ArticleCount());

    UINT cPartitions= pisArticles->SelectionCount();

    if (!cPartitions) return CIndicatorSet::NewIndicatorSet(DescriptorCount());

    CIndicatorSet *pisVocabulary  = NULL;
    PUINT          paiSelected    = NULL;   // Indices of the selected articles
    PUINT          paiMarkedTerms = NULL;   // Indices of the terms marked in the result
    PUINT          pacTermHits    = NULL;   // Per-term counters (pervasive-term removal only)

    UINT i;

    __try
    {
        if (fRemovePervasiveTerms)
            pacTermHits= PUINT(VAlloc(TRUE, sizeof(UINT) * DescriptorCount()));

        const UINT *paiMap;

        GetPartitionInfo(NULL, NULL, &paiMap);

        paiSelected= PUINT(VAlloc(FALSE, cPartitions * sizeof(UINT)));

        ASSERT(paiSelected);

        pisArticles->MarkedItems(0, PINT(paiSelected), cPartitions);

        AttachRef(pisVocabulary, CIndicatorSet::NewIndicatorSet(DescriptorCount()));

        // Accumulate the vocabulary of each selected article in turn.

        for (i= 0; i < cPartitions; i++)
        {
            CAbortSearch::CheckContinueState();

            IndicateRefs(m_prldVocabularyRefs + paiMap[paiSelected[i]],
                         m_pdwVocabularyRefs, pisVocabulary, FALSE, pacTermHits
                        );
        }

        VFree(paiSelected);  paiSelected= NULL;

        if (pacTermHits)
        {
            UINT cTerms= pisVocabulary->SelectionCount();

            ASSERT(cTerms);

            paiMarkedTerms= PUINT(VAlloc(FALSE, cTerms * sizeof(UINT)));

            pisVocabulary->MarkedItems(0, PINT(paiMarkedTerms), cTerms);

            // Drop every term that was counted once for each selected article.

            for (i= 0; i < cTerms; i++)
            {
                UINT iTerm= paiMarkedTerms[i];

                if (pacTermHits[iTerm] == cPartitions) pisVocabulary->RawClearBit(iTerm);
            }

            pisVocabulary->InvalidateCache();
        }
    }
    __finally
    {
        if (paiSelected   ) { VFree(paiSelected   );  paiSelected    = NULL; }
        if (paiMarkedTerms) { VFree(paiMarkedTerms);  paiMarkedTerms = NULL; }
        if (pacTermHits   ) { VFree(pacTermHits   );  pacTermHits    = NULL; }

        if (_abnormal_termination() && pisVocabulary) DetachRef(pisVocabulary);
    }

    ForgetRef(pisVocabulary);

    return pisVocabulary;
}
|
|
|
|
CIndicatorSet *CTextDatabase::TopicInstancesFor(CTokenList *ptl)
{
    // Returns an indicator set over the articles (ArticleCount() items) in
    // which every article referenced by any term of *ptl is marked.
    //
    // Token lists constructed as TDB_FULL_REF or TDB_PARTIAL_REF have their
    // descriptor pointers used directly. For any other list the terms are
    // looked up via FindTokens, which allocates a descriptor array that this
    // routine owns and releases.
    //
    // The result is passed through ForgetRef before being returned. If an
    // exception unwinds through this routine the partial result is detached.

    SyncForQueries();

    ASSERT(m_fFromFileImage || m_puioCompressedArticleRefs);
    ASSERT(m_pdwArticleRefs || !m_cdwArticleRefs);

    PDESCRIPTOR *ppdSorted    = ptl->m_ppdSorted;
    UINT         cDescriptors = ptl->m_cd;

    BOOL fDirectRef= (   ptl->m_How_Constructed == CTokenList::TDB_FULL_REF
                      || ptl->m_How_Constructed == CTokenList::TDB_PARTIAL_REF
                     );

    // ppdLookup remembers the base of the array allocated by FindTokens.
    // ppdSorted is advanced while the descriptors are scanned, so it must not
    // be the pointer handed to VFree.

    PDESCRIPTOR *ppdLookup = NULL;

    if (!fDirectRef) ppdSorted= ppdLookup= FindTokens(ptl, &cDescriptors);

    ASSERT(ppdSorted);

    CIndicatorSet *pisDesc     = NULL;
    CIndicatorSet *pisArticles = NULL;
    PUINT          paiDesc     = NULL;

    __try
    {
        // Collapse the descriptor list into a set so each term is processed once.

        AttachRef(pisDesc, CIndicatorSet::NewIndicatorSet(DescriptorCount()));

        for (; cDescriptors--; ppdSorted++)
            pisDesc->RawSetBit(*ppdSorted - DescriptorBase());

        pisDesc->InvalidateCache();

        cDescriptors= pisDesc->SelectionCount();

        paiDesc= (PUINT) VAlloc(FALSE, cDescriptors * sizeof(UINT));

#ifdef _DEBUG
        UINT cbResult=
#endif // _DEBUG

        pisDesc->MarkedItems(0, (int *)paiDesc, cDescriptors);

        ASSERT(cbResult == cDescriptors);

        AttachRef(pisArticles, CIndicatorSet::NewIndicatorSet(ArticleCount()));

        PUINT pi= paiDesc;

        // Mark the article references of each distinct term in the result set.

        for (; cDescriptors--; )
        {
            CAbortSearch::CheckContinueState();

            IndicateRefs(m_prldArticleRefs + *pi++, m_pdwArticleRefs, pisArticles, FALSE);
        }
    }
    __finally
    {
        if (pisDesc) DetachRef(pisDesc);

        if (paiDesc) { VFree(paiDesc);  paiDesc = NULL; }

        // Release the lookup array through its original base address.

        if (ppdLookup) { VFree(ppdLookup);  ppdLookup = NULL; }

        if (_abnormal_termination() && pisArticles) DetachRef(pisArticles);
    }

    ForgetRef(pisArticles);

    return pisArticles;
}
|
|
|
|
UINT CTextDatabase::TokenInstanceCountFor(CTokenList *ptl)
{
    // Returns the total number of token references for the distinct terms of
    // *ptl, as reported by IndicateRefs in count-only mode.
    //
    // Token lists constructed as TDB_FULL_REF or TDB_PARTIAL_REF have their
    // descriptor pointers used directly. For any other list the terms are
    // looked up via FindTokens, which allocates a descriptor array that this
    // routine owns and releases.

    SyncForQueries();

    ASSERT(m_fFromFileImage || m_puioCompressedRefs);
    ASSERT(m_pdwCompressedRefs || !m_cdwCompressedRefs);

    PDESCRIPTOR *ppdSorted    = ptl->m_ppdSorted;
    UINT         cDescriptors = ptl->m_cd;
    int          cRefs        = 0;

    BOOL fDirectRef= (   ptl->m_How_Constructed == CTokenList::TDB_FULL_REF
                      || ptl->m_How_Constructed == CTokenList::TDB_PARTIAL_REF
                     );

    // ppdLookup remembers the base of the array allocated by FindTokens.
    // ppdSorted is advanced while the descriptors are scanned, so it must not
    // be the pointer handed to VFree.

    PDESCRIPTOR *ppdLookup = NULL;

    if (!fDirectRef) ppdSorted= ppdLookup= FindTokens(ptl, &cDescriptors);

    ASSERT(ppdSorted);

    CIndicatorSet *pisDesc = NULL;
    PUINT          paiDesc = NULL;

    __try
    {
        // Collapse the descriptor list into a set so each term is counted once.

        AttachRef(pisDesc, CIndicatorSet::NewIndicatorSet(DescriptorCount()));

        for (; cDescriptors--; ppdSorted++)
            pisDesc->RawSetBit(*ppdSorted - DescriptorBase());

        pisDesc->InvalidateCache();

        cDescriptors= pisDesc->SelectionCount();

        paiDesc= (PUINT) VAlloc(FALSE, cDescriptors * sizeof(UINT));

#ifdef _DEBUG
        UINT cbResult=
#endif // _DEBUG

        pisDesc->MarkedItems(0, (int *)paiDesc, cDescriptors);

        ASSERT(cbResult == cDescriptors);

        PUINT pi= paiDesc;

        for (; cDescriptors--; )
        {
            CAbortSearch::CheckContinueState();

            cRefs += IndicateRefs(m_prldTokenRefs + *pi++, m_pdwCompressedRefs, NULL, TRUE);
        }
    }
    __finally
    {
        if (pisDesc) DetachRef(pisDesc);

        if (paiDesc) { VFree(paiDesc);  paiDesc = NULL; }

        // Release the lookup array through its original base address.

        if (ppdLookup) { VFree(ppdLookup);  ppdLookup = NULL; }
    }

    return cRefs;
}
|
|
|
|
void CTextDatabase::IndicateVocabularyRefs(CIndicatorSet *pisVocabulary, CIndicatorSet *pisTokens, const UINT *piMap)
{
    // For every token position marked in pisTokens, looks up the token value
    // stored at that position, maps it through piMap, and sets the resulting
    // bit in pisVocabulary. The marked positions are fetched in batches of
    // up to 16384 entries. Nothing is changed when pisTokens has no marks.

    SyncForQueries();

    ASSERT(FPhraseFeedback());

    ASSERT(pisTokens->ItemCount() == TokenCount());

    UINT cPending= pisTokens->SelectionCount();

    if (!cPending) return;

    PUINT paiBatch = NULL;

    __try
    {
        PUINT paiTokens = TokenBase();
        UINT  cdwBatch  = 16384;

        paiBatch= (PUINT) VAlloc(FALSE, cdwBatch * sizeof(UINT));

        UINT iFirst = 0;     // Rank of the first marked item in the next batch

        while (cPending)
        {
            UINT cFetched= pisTokens->MarkedItems(iFirst, (int *) paiBatch, cdwBatch);

            UINT  cLeft  = cFetched;
            UINT *piItem = paiBatch;

            while (cLeft--)
            {
                pisVocabulary->RawSetBit(piMap[paiTokens[*piItem]]);

                ++piItem;
            }

            cPending -= cFetched;
            iFirst   += cFetched;
        }

        pisVocabulary->InvalidateCache();
    }
    __finally
    {
        if (paiBatch) { VFree(paiBatch);  paiBatch= NULL; }
    }
}
|
|
|
|
void CTextDatabase::IndicateVocabularyRefs(CIndicatorSet *pisVocabulary, UINT iPartition, const UINT *piMap)
{
    // Marks in pisVocabulary the piMap image of every entry in the vocabulary
    // reference list of partition iPartition.

    IndicateMappedRefs(&m_prldVocabularyRefs[iPartition],
                       m_pdwVocabularyRefs,
                       pisVocabulary,
                       piMap
                      );
}
|
|
|
|
void CTextDatabase::IndicateMappedRefs(PRefListDescriptor prld, PUINT pdwRefBase, CIndicatorSet *pisArticles, const UINT *piMap)
{
    // Decodes the reference list described by *prld and, for each reference
    // index, sets bit piMap[index] in pisArticles.
    //
    // Short lists (a reference pair, or fewer than three references) are held
    // directly in the descriptor. Longer lists are a bit stream within
    // pdwRefBase starting at bit prld->ibitRefListBase. Each entry is a run of
    // one bits, a zero bit, and a cbitsBasis-bit residue; the distance to the
    // next reference is (run length << cbitsBasis) + residue + 1.

    CAbortSearch::CheckContinueState();

    if (prld->rb.fRefPair || prld->rb.cReferences < 3)
    {
        pisArticles->RawSetBit(piMap[prld->iRefFirst]);

        // The second index of a pair is stored complemented.

        if (prld->rb.fRefPair) pisArticles->RawSetBit(piMap[~(prld->iRefSecond)]);

        pisArticles->InvalidateCache();

        return;
    }

    int cRefsLeft= prld->rb.cReferences;

    UINT ibitStart= prld->ibitRefListBase;

    PUINT pdwNext    = pdwRefBase + (ibitStart >> 5);   // Next dword to fetch
    UINT  cbitsBasis = prld->rb.cbitsBasis;
    UINT  basis      = 1 << cbitsBasis;
    UINT  fBasisMask = basis - 1;
    UINT  ibitInWord = ibitStart & 31;                  // Bits of the current dword already consumed
    UINT  dwBits     = (*pdwNext++) >> ibitInWord;      // Unconsumed bits, low order first
    UINT  iRef       = UINT(-1);                        // So the first delta yields an origin-0 index

    while (cRefsLeft--)
    {
        if (ibitInWord == 32)
        {
            dwBits= *pdwNext++;  ibitInWord= 0;
        }

        // Count the run of one bits, up to a byte at a time, refilling
        // whenever the current dword is exhausted.

        UINT cOnesLeading= 0;

        for (;;)
        {
            UINT cRun= acLeadingZeroes[(~dwBits) & 0xFF];

            cOnesLeading += cRun;
            ibitInWord   += cRun;
            dwBits      >>= cRun;

            if (cRun < 8 && ibitInWord < 32) break;

            if (ibitInWord == 32)
            {
                dwBits= *pdwNext++;  ibitInWord= 0;
            }
        }

        // Skip the terminating zero bit and pick up the residue.

        UINT iDelta= (dwBits >> 1) & fBasisMask;

        dwBits    >>= cbitsBasis+1;
        ibitInWord += cbitsBasis+1;

        if (ibitInWord > 32)
        {
            // The residue straddles a dword boundary; take its high-order
            // bits from the next dword.

            dwBits= *pdwNext++;

            ibitInWord -= 32;

            iDelta |= fBasisMask & (dwBits << (cbitsBasis - ibitInWord));

            dwBits >>= ibitInWord;
        }

        iRef += iDelta + 1 + (cOnesLeading << cbitsBasis);

        pisArticles->RawSetBit(piMap[iRef]);
    }

    pisArticles->InvalidateCache();
}
|
|
|
|
|
|
int CTextDatabase::IndicateRefs(PRefListDescriptor prld, PUINT pdwRefLists, CIndicatorSet *pis, BOOL fCountOnly, PUINT paiCountArray)
{
    // Decodes the reference list described by *prld, sets the bit for each
    // reference index in pis and, when paiCountArray is non-NULL, increments
    // paiCountArray[index]. Returns the number of references processed.
    //
    // With fCountOnly set nothing is decoded or marked: the result is 2 for a
    // reference pair and prld->rb.cReferences otherwise.
    //
    // Short lists (a reference pair, or fewer than three references) are held
    // directly in the descriptor. Longer lists are a bit stream within
    // pdwRefLists starting at bit prld->ibitRefListBase. Each entry is a run of
    // one bits, a zero bit, and a cbitsBasis-bit residue; the distance to the
    // next reference is (run length << cbitsBasis) + residue + 1.

    if (fCountOnly)
    {
        if (prld->rb.fRefPair) return 2;

        return prld->rb.cReferences;
    }

    int cRefs;

    CAbortSearch::CheckContinueState();

    if (prld->rb.fRefPair || prld->rb.cReferences < 3)
    {
        pis->RawSetBit(prld->iRefFirst);

        if (paiCountArray) ++paiCountArray[prld->iRefFirst];

        cRefs= 1;

        if (prld->rb.fRefPair)
        {
            // The second index of a pair is stored complemented.

            pis->RawSetBit(~(prld->iRefSecond));

            if (paiCountArray) ++paiCountArray[~(prld->iRefSecond)];

            cRefs= 2;
        }

        pis->InvalidateCache();

        return cRefs;
    }

    int cRefsLeft;

    cRefs= cRefsLeft= prld->rb.cReferences;

    UINT ibitStart= prld->ibitRefListBase;

    PUINT pdwNext    = pdwRefLists + (ibitStart >> 5);  // Next dword to fetch
    UINT  cbitsBasis = prld->rb.cbitsBasis;
    UINT  basis      = 1 << cbitsBasis;
    UINT  fBasisMask = basis - 1;
    UINT  ibitInWord = ibitStart & 31;                  // Bits of the current dword already consumed
    UINT  dwBits     = (*pdwNext++) >> ibitInWord;      // Unconsumed bits, low order first
    UINT  iRef       = UINT(-1);                        // So the first delta yields an origin-0 index

    while (cRefsLeft--)
    {
        if (ibitInWord == 32)
        {
            dwBits= *pdwNext++;  ibitInWord= 0;
        }

        // Count the run of one bits, up to a byte at a time, refilling
        // whenever the current dword is exhausted.

        UINT cOnesLeading= 0;

        for (;;)
        {
            UINT cRun= acLeadingZeroes[(~dwBits) & 0xFF];

            cOnesLeading += cRun;
            ibitInWord   += cRun;
            dwBits      >>= cRun;

            if (cRun < 8 && ibitInWord < 32) break;

            if (ibitInWord == 32)
            {
                dwBits= *pdwNext++;  ibitInWord= 0;
            }
        }

        // Skip the terminating zero bit and pick up the residue.

        UINT iDelta= (dwBits >> 1) & fBasisMask;

        dwBits    >>= cbitsBasis+1;
        ibitInWord += cbitsBasis+1;

        if (ibitInWord > 32)
        {
            // The residue straddles a dword boundary; take its high-order
            // bits from the next dword.

            dwBits= *pdwNext++;

            ibitInWord -= 32;

            iDelta |= fBasisMask & (dwBits << (cbitsBasis - ibitInWord));

            dwBits >>= ibitInWord;
        }

        iRef += iDelta + 1 + (cOnesLeading << cbitsBasis);

        pis->RawSetBit(iRef);

        if (paiCountArray) ++paiCountArray[iRef];
    }

    pis->InvalidateCache();

    return cRefs;
}
|
|
|
|
void CTextDatabase::ExtendClassifications(PDESCRIPTOR pdSuffix)
{
    // Rebuilds m_pafClassifications, the array of classification values that
    // parallels m_ppdSorted, after new descriptors (those at or beyond
    // pdSuffix) have been added to the sorted list.
    //
    // The image bytes added since the last call are first fed to
    // m_clsfTokens.ScanAndRankData. If that reports a change, every term is
    // reclassified. Otherwise descriptors below pdSuffix take over the old
    // values in order and only the others are classified.

    ASSERT(m_cdSorted == UINT(m_pdNextGalactic - DescriptorBase()));

    PUINT pafFresh = NULL;

    __try
    {
        UINT cdTotal = m_cdSorted;

        CAbortSearch::CheckContinueState();

        BOOL fPartitionChanged= m_clsfTokens.ScanAndRankData
                                    (m_pbLastGalactic, m_pbNextGalactic - m_pbLastGalactic);

        m_pbLastGalactic= m_pbNextGalactic;

        pafFresh= (PUINT) VAlloc(FALSE, sizeof(UINT)*cdTotal);

        PDESCRIPTOR *ppdSorted = m_ppdSorted;
        PUINT        pfOld     = m_pafClassifications;  // Consumed only for carried-over terms

        CAbortSearch::CheckContinueState();

        for (UINT iTerm = 0; iTerm < cdTotal; iTerm++)
        {
            PDESCRIPTOR pd= ppdSorted[iTerm];

            if (!fPartitionChanged && pd < pdSuffix)
                 pafFresh[iTerm]= *pfOld++;
            else pafFresh[iTerm]= m_clsfTokens.ClassifyData(pd->pbImage, CbImage(pd));
        }

        // Install the new array in place of the old one.

        if (m_pafClassifications) VFree(m_pafClassifications);

        m_pafClassifications= pafFresh;  pafFresh= NULL;
    }
    __finally
    {
        if (pafFresh) { VFree(pafFresh);  pafFresh = NULL; }
    }
}
|
|
|
|
void CTextDatabase::SyncForQueries()
|
|
{
|
|
if (m_fFromFileImage) return;
|
|
|
|
if (m_pulstate->pld) BindToGlobalDict(m_pulstate->pbBuffer);
|
|
|
|
if (!TokenCount() || m_cTokensIndexed == TokenCount()) return;
|
|
|
|
m_cTokensIndexed= TokenCount();
|
|
|
|
FlattenAndMergeLinks();
|
|
|
|
if (m_pdNextGlobal > m_pdNextGalactic) GalacticMerge();
|
|
|
|
if (m_iNextRefSet) CoalesceReferenceLists();
|
|
|
|
if (m_cdSorted < UINT(m_pdNextGalactic - DescriptorBase()))
|
|
{
|
|
PDESCRIPTOR pdBase = DescriptorBase() + m_cdSorted;
|
|
PDESCRIPTOR pd = pdBase;
|
|
UINT c = m_pdNextGalactic - pdBase;
|
|
|
|
for (; c--; pd++)
|
|
{
|
|
UINT cw= CwDisplay(pd);
|
|
|
|
if (cw > m_cwDisplayMax) m_cwDisplayMax= cw;
|
|
}
|
|
|
|
SortTokenImages(DescriptorBase(), &m_ppdSorted,
|
|
&m_ppdTailSorted,
|
|
&m_cdSorted,
|
|
m_pdNextGalactic - DescriptorBase()
|
|
);
|
|
|
|
ExtendClassifications(pdBase);
|
|
}
|
|
|
|
ConstructVocabularyLists();
|
|
}
|
|
|
|
// Copies cdw DWORDs from piolSource to piolDestination.
//
// Both lists hand out buffer space in chunks that may be smaller than the
// amount requested, so the copy runs as two nested loops: the outer one
// obtains an output chunk, the inner one fills it from one or more input
// chunks.
void CTextDatabase::CopyRefStreamSegment(CIOList *piolDestination, CIOList *piolSource, UINT cdw)
{
    while (cdw)
    {
        // Ask for everything that is left; the call reports what it granted.

        UINT  cdwGrantedOut = cdw;
        PUINT pdwOut        = piolDestination->NextDWordsOut(&cdwGrantedOut);

        UINT cdwToFill = cdwGrantedOut;

        while (cdwToFill)
        {
            UINT        cdwGrantedIn = cdwToFill;
            const UINT *pdwIn        = piolSource->NextDWordsIn(&cdwGrantedIn);

            CopyMemory(pdwOut, pdwIn, cdwGrantedIn * sizeof(UINT));

            pdwOut    += cdwGrantedIn;
            cdwToFill -= cdwGrantedIn;
        }

        cdw -= cdwGrantedOut;
    }
}
|
|
|
|
// Merges the cRefStreams reference streams in pars[] into *prsResult.
//
// More than two streams are handled by recursively merging each half and
// then merging the two intermediate results.  A single stream is returned
// by copying its descriptor.
//
// As read here, a stream is a sequence of records, each consisting of a term
// serial number, a DWORD count, and that many reference DWORDs.  Records from
// the two inputs are merged in ascending serial order; when both inputs hold
// the same serial, one output record is written with the combined count and
// the left references followed by the right references.
void CTextDatabase::MergeRefLists(PRefStream prsResult, PRefStream pars, UINT cRefStreams)
{
    if (cRefStreams == 1)
    {
        *prsResult = *pars;

        return;
    }

    CAbortSearch::CheckContinueState();

    RefStream rsLeft, rsRight;

    if (cRefStreams <= 2)
    {
        rsLeft  = pars[0];
        rsRight = pars[1];
    }
    else
    {
        UINT cLowerHalf = cRefStreams / 2;

        MergeRefLists(&rsLeft , pars             , cLowerHalf              );
        MergeRefLists(&rsRight, pars + cLowerHalf, cRefStreams - cLowerHalf);
    }

    ASSERT(rsLeft.cdw && rsRight.cdw);

    prsResult->cdw         = 0;
    prsResult->pFirstBlock = NULL;

    // The order of the AttachStream calls below is critical.
    // The result attach must come first to flush any queued
    // output.  That output may be part of rsLeft or rsRight!

    m_piolResult->AttachStream(prsResult, TRUE);
    m_piolLeft  ->AttachStream(&rsLeft );
    m_piolRight ->AttachStream(&rsRight);

    UINT iSerialLeft  = m_piolLeft ->GetDWordIn();
    UINT iSerialRight = m_piolRight->GetDWordIn();

    // Note: We reserve UINT(-1) to mark the end of a reference stream.

    UINT cTerms = 0;    // Counted but not otherwise used by this routine.

    while (iSerialLeft != UINT(-1) || iSerialRight != UINT(-1))
    {
        ASSERT(iSerialLeft  == UINT(-1) || iSerialLeft  < UINT(m_pdNextGalactic - DescriptorBase()));
        ASSERT(iSerialRight == UINT(-1) || iSerialRight < UINT(m_pdNextGalactic - DescriptorBase()));

        ++cTerms;

        // Decide which side(s) supply the next record.  Both flags are set
        // when the two serial numbers are equal.

        const BOOL fTakeLeft  = (iSerialLeft <= iSerialRight);
        const BOOL fTakeRight = (iSerialLeft >= iSerialRight);

        UINT cdwStreamLeft  = 0;
        UINT cdwStreamRight = 0;

        if (fTakeLeft ) cdwStreamLeft  = m_piolLeft ->GetDWordIn();
        if (fTakeRight) cdwStreamRight = m_piolRight->GetDWordIn();

        m_piolResult->PutDWordOut(fTakeLeft ? iSerialLeft : iSerialRight);
        m_piolResult->PutDWordOut(cdwStreamLeft + cdwStreamRight);

        if (cdwStreamLeft)
        {
            CopyRefStreamSegment(m_piolResult, m_piolLeft, cdwStreamLeft);

#ifdef _DEBUG
            UINT iLast = iSerialLeft;
#endif // _DEBUG

            if (m_piolLeft->Empty()) iSerialLeft = UINT(-1);
            else                     iSerialLeft = m_piolLeft->GetDWordIn();

            ASSERT(iSerialLeft > iLast);
        }

        if (cdwStreamRight)
        {
            CopyRefStreamSegment(m_piolResult, m_piolRight, cdwStreamRight);

#ifdef _DEBUG
            UINT iLast = iSerialRight;
#endif // _DEBUG

            if (m_piolRight->Empty()) iSerialRight = UINT(-1);
            else                      iSerialRight = m_piolRight->GetDWordIn();

            ASSERT(iSerialRight > iLast);
        }
    }
}
|
|
|
|
typedef struct _BufferCallbackControl
|
|
{
|
|
PUINT pdwBuffer;
|
|
UINT cdwBuffer;
|
|
|
|
} BufferCallbackControl, *PBufferCallbackControl;
|
|
|
|
// Input-side callback that feeds a consumer from the in-memory array
// described by a BufferCallbackControl.
//
//   pv         -- really a PBufferCallbackControl.
//   cbt        -- which transaction is being requested.
//   pdwLast    -- RequestInput: receives the address of the data handed out.
//   pcdwLast   -- RequestInput: receives the number of DWORDs handed out.
//                 QueryForEmptyRing: receives nonzero when no data remains.
//   cdwRequest -- RequestInput: the number of DWORDs wanted; the amount
//                 granted is clipped to what is left in the buffer.
//
// Output transactions (RequestOutput, Flush) and unknown transaction codes
// are not expected and trigger an ASSERT.  Disconnect requires no action.
void BufferCallback(PVOID pv, CallBackTransaction cbt, PUINT *pdwLast, PUINT pcdwLast, UINT cdwRequest)
{
    PBufferCallbackControl pbcc = PBufferCallbackControl(pv);

    switch (cbt)
    {
    case RequestInput:
        {
            UINT cdwGranted = (cdwRequest > pbcc->cdwBuffer) ? pbcc->cdwBuffer
                                                             : cdwRequest;

            *pdwLast         = pbcc->pdwBuffer;
            pbcc->pdwBuffer += cdwGranted;

            *pcdwLast        = cdwGranted;
            pbcc->cdwBuffer -= cdwGranted;
        }
        break;

    case QueryForEmptyRing:

        *pcdwLast = (pbcc->cdwBuffer == 0);
        break;

    case Disconnect:

        break;              // Don't have any disconnect actions to perform

    case RequestOutput:     // Shouldn't be called for output functions...
    case Flush:             // Shouldn't be called for output functions...
    default:                // Unknown transaction type

        ASSERT(FALSE);
        break;
    }
}
|
|
|
|
// Debugging aid: useful for stopping at a particular reference list.
// See the ASSERT below (in CompressArticleRefLists) which uses this variable.
UINT iLimitDebug = 1000000000;
|
|
|
|
// Builds the per-term article reference lists.
//
//   piolSource -- merged reference stream: for each term a serial number,
//                 a reference count, and that many token references.
//   cdw        -- not used by this routine.
//
// For every term the token references are reduced to the set of partitions
// that contain at least one of them.  The resulting partition indices are
// recorded in a RefListDescriptor indexed by term serial:
//
//   * fewer than three indices are stored inline (iRefFirst, and the
//     bitwise complement of the second index in iRefSecond);
//   * longer lists are written through a CCompressor to a temp file, with
//     the starting bit offset and basis width kept in the descriptor.
//
// On success m_prldArticleRefs, m_puioCompressedArticleRefs, m_cdwArticleRefs
// and (when any compressed data exists) m_pdwArticleRefs are set.  If an
// exception unwinds through here the descriptor array and the temp file
// object are released.
void CTextDatabase::CompressArticleRefLists(CIOList *piolSource, UINT cdw)
{
    PRefListDescriptor  prldArticleRefs = NULL;
    CIndicatorSet      *pisMarked       = NULL;
    CIOStream          *piosCompressed  = NULL;
    CCompressor        *pCompressor     = NULL;
    CCallbackQueue     *pcbq            = NULL;
    PUINT               paiArticles     = NULL;

    __try
    {
        const UINT *paiPartitions;
        const UINT *paiRanks;

        prldArticleRefs = PRefListDescriptor(VAlloc(TRUE, DescriptorCount() * sizeof(RefListDescriptor)));

        UINT cPartitions = GetPartitionInfo(&paiPartitions, &paiRanks);

        pisMarked = CIndicatorSet::NewIndicatorSet(cPartitions);

        ASSERT(pisMarked);

        ASSERT(!m_puioCompressedArticleRefs);

        m_puioCompressedArticleRefs = CUnbufferedIO::NewTempFile((PSZ)GetSourceName());

        piosCompressed = CIOStream::NewIOStream(m_puioCompressedArticleRefs);

        piosCompressed->AttachStream(TRUE);

        pCompressor = CCompressor::NewCompressor(piosCompressed);

        // paiArticles receives the marked items of pisMarked, which can hold
        // up to cPartitions entries.  It must therefore be sized by the
        // partition count, not by the number of terms: with more partitions
        // than terms, a term present in every partition would overrun a
        // DescriptorCount()-sized buffer.

        paiArticles = PUINT(VAlloc(FALSE, cPartitions * sizeof(UINT)));

        UINT ibitBase = 0;

        UINT iTerm = UINT(-1);

        while (!(piolSource->Empty()))
        {
            iTerm         = piolSource->GetDWordIn();
            UINT cIndices = piolSource->GetDWordIn();

            ASSERT(iTerm < iLimitDebug); // For stopping at a particular reference list.
                                         // Very useful sometimes...

            PDESCRIPTOR pd = DescriptorBase() + iTerm;

            ASSERT(cIndices == pd->cReferences);

            pisMarked->ClearAll();

            // Walk the references and the partition boundaries in step.
            // iLimit is the first boundary not yet passed; a reference below
            // it lies in a partition that has already been marked.

            const UINT *piPartitionNext = paiPartitions;

            UINT iLimit = *piPartitionNext++;

            for (; cIndices; )
            {
                UINT cIndexBlock = cIndices;

                const UINT *pi = piolSource->NextDWordsIn(&cIndexBlock);

                ASSERT(pi && cIndexBlock);

                cIndices -= cIndexBlock;

                for (; cIndexBlock--; )
                {
                    UINT iRef = *pi++;

                    if (iRef < iLimit) continue;

                    do iLimit = *piPartitionNext++;
                    while (iRef >= iLimit);

                    // The line below has been adjusted to allow multiple indices to be
                    // simultaneously searched.  Previously the paiRanks mapping was
                    // necessary to convert from a partition index to a title index
                    // (the bit set was paiRanks[(piPartitionNext-paiPartitions)-2]).
                    // The difference is that topics are put sequentially into
                    // partitions as they are encountered whereas their titles are
                    // sorted alphabetically.
                    //
                    // In the new structure we use CTitleCollection::UniversalTitleMap
                    // to map from a particular text set partition to the corresponding
                    // title in the combined title collection object.

                    pisMarked->RawSetBit((piPartitionNext - paiPartitions) - 2);
                }
            }

            pisMarked->InvalidateCache();

            // From here on cIndices counts distinct partitions, not tokens.

            cIndices = pisMarked->SelectionCount();

            PRefListDescriptor prld = prldArticleRefs + iTerm;

            prld->rb.cReferences = cIndices;

            pisMarked->MarkedItems(0, PINT(paiArticles), cIndices);

            if (cIndices < 3)
            {
                prld->iRefFirst = paiArticles[0];

                if (cIndices == 2) prld->iRefSecond = ~(paiArticles[1]);
            }
            else
            {
                prld->ibitRefListBase = ibitBase;

                BufferCallbackControl bcc;

                bcc.pdwBuffer = paiArticles;
                bcc.cdwBuffer = cIndices;

                pcbq = CCallbackQueue::NewInputCallQueue(BufferCallback, &bcc);

                UINT cbitsBasis;

                ibitBase = pCompressor->Compress(pcbq, cIndices, 0, cPartitions, &cbitsBasis);

                prld->rb.cbitsBasis = cbitsBasis;

                delete pcbq;  pcbq = NULL;
            }
        }

        ASSERT(iTerm == DescriptorCount() - 1);

        // Round the bit count up to whole DWORDs.

        m_cdwArticleRefs = (ibitBase + 31) >> 5;

        delete pCompressor;     pCompressor    = NULL;
        delete piosCompressed;  piosCompressed = NULL;

        ASSERT(!m_pdwArticleRefs);

        if (m_cdwArticleRefs)
            m_pdwArticleRefs = (PUINT) m_puioCompressedArticleRefs->MappedImage();
    }
    __finally
    {
        if (paiArticles   ) { VFree(paiArticles);     paiArticles    = NULL; }
        if (pcbq          ) { delete pcbq;            pcbq           = NULL; }
        if (pisMarked     ) { delete pisMarked;       pisMarked      = NULL; }
        if (pCompressor   ) { delete pCompressor;     pCompressor    = NULL; }
        if (piosCompressed) { delete piosCompressed;  piosCompressed = NULL; }

        if (_abnormal_termination())
        {
            if (prldArticleRefs) { VFree(prldArticleRefs);  prldArticleRefs = NULL; }

            if (m_puioCompressedArticleRefs)
            {
                delete m_puioCompressedArticleRefs;
                m_puioCompressedArticleRefs = NULL;
            }
        }
    }

    m_prldArticleRefs = prldArticleRefs;
}
|
|
|
|
void CTextDatabase::ConstructVocabularyLists()
|
|
{
|
|
// This routine constructs per-article vocabulary reference lists.
|
|
|
|
PRefListDescriptor prldBase = NULL;
|
|
PUINT paiTerms = NULL;
|
|
CIndicatorSet *pisMarked = NULL;
|
|
CIOStream *piosCompressed = NULL;
|
|
CCompressor *pCompressor = NULL;
|
|
CCallbackQueue *pcbq = NULL;
|
|
|
|
__try
|
|
{
|
|
const UINT *paiTermRanks= TermRanks();
|
|
const UINT *paiPartitions;
|
|
|
|
UINT cPartitions = GetPartitionInfo(&paiPartitions);
|
|
UINT cTerms = DescriptorCount();
|
|
|
|
prldBase= PRefListDescriptor(VAlloc(TRUE, cPartitions * sizeof(RefListDescriptor)));
|
|
|
|
pisMarked= CIndicatorSet::NewIndicatorSet(cTerms);
|
|
|
|
ASSERT(pisMarked);
|
|
|
|
paiTerms= PUINT(VAlloc(FALSE, cTerms * sizeof(UINT)));
|
|
|
|
ASSERT(paiTerms);
|
|
|
|
ASSERT(!m_puioCompressedVocabularyRefs);
|
|
|
|
m_puioCompressedVocabularyRefs= CUnbufferedIO::NewTempFile((PSZ)GetSourceName());
|
|
|
|
ASSERT(m_puioCompressedVocabularyRefs);
|
|
|
|
piosCompressed= CIOStream::NewIOStream(m_puioCompressedVocabularyRefs);
|
|
|
|
ASSERT(piosCompressed);
|
|
|
|
piosCompressed->AttachStream(TRUE);
|
|
|
|
pCompressor= CCompressor::NewCompressor(piosCompressed);
|
|
|
|
ASSERT(pCompressor);
|
|
|
|
UINT ibitBase= 0;
|
|
PUINT piToken= TokenBase();
|
|
|
|
for (PRefListDescriptor prld= prldBase; cPartitions--; ++prld)
|
|
{
|
|
UINT cTokens= paiPartitions[1] - paiPartitions[0]; ++paiPartitions;
|
|
|
|
pisMarked->ClearAll();
|
|
|
|
// The line below has been adjusted to allow multiple indices to be simultaneously
|
|
// searched. Previously the paiTermRanks mapping was used to convert from descriptor
|
|
// order to sorted order (sorted by term image).
|
|
//
|
|
// In the new structure we use CTokenCollection::UniversalTokenMap to perform that
|
|
// transformation relative to the current Token Collection object.
|
|
|
|
for (; cTokens--; ) pisMarked->RawSetBit(*piToken++);
|
|
// for (; cTokens--; ) pisMarked->RawSetBit(paiTermRanks[*piToken++]);
|
|
|
|
pisMarked->InvalidateCache();
|
|
|
|
UINT cIndices= pisMarked->SelectionCount();
|
|
|
|
prld->rb.cReferences= cIndices;
|
|
|
|
pisMarked->MarkedItems(0, PINT(paiTerms), cIndices);
|
|
|
|
if (cIndices < 3)
|
|
{
|
|
prld->iRefFirst= paiTerms[0];
|
|
|
|
if (cIndices == 2) prld->iRefSecond= ~(paiTerms[1]);
|
|
}
|
|
else
|
|
{
|
|
prld->ibitRefListBase= ibitBase;
|
|
|
|
BufferCallbackControl bcc;
|
|
|
|
bcc.pdwBuffer= paiTerms;
|
|
bcc.cdwBuffer= cIndices;
|
|
|
|
pcbq= CCallbackQueue::NewInputCallQueue(BufferCallback, &bcc);
|
|
|
|
UINT cbitsBasis;
|
|
|
|
ibitBase= pCompressor->Compress(pcbq, cIndices, 0, cTerms, &cbitsBasis);
|
|
|
|
prld->rb.cbitsBasis= cbitsBasis;
|
|
|
|
delete pcbq; pcbq= NULL;
|
|
}
|
|
}
|
|
|
|
m_cdwVocabularyRefs= (ibitBase + 31) >> 5;
|
|
|
|
delete pCompressor; pCompressor = NULL;
|
|
delete piosCompressed; piosCompressed = NULL;
|
|
|
|
ASSERT(!m_pdwVocabularyRefs);
|
|
|
|
if (m_cdwVocabularyRefs)
|
|
m_pdwVocabularyRefs= (PUINT) m_puioCompressedVocabularyRefs->MappedImage();
|
|
|
|
m_prldVocabularyRefs= prldBase; prldBase = NULL;
|
|
}
|
|
__finally
|
|
{
|
|
if (pcbq ) { delete pcbq; pcbq = NULL; }
|
|
if (paiTerms ) { VFree(paiTerms); paiTerms = NULL; }
|
|
if (pisMarked ) { delete pisMarked; pisMarked = NULL; }
|
|
if (pCompressor ) { delete pCompressor; pCompressor = NULL; }
|
|
if (piosCompressed) { delete piosCompressed; piosCompressed = NULL; }
|
|
|
|
if (_abnormal_termination())
|
|
{
|
|
if (prldBase) { VFree(prldBase); prldBase = NULL; }
|
|
|
|
if (m_puioCompressedVocabularyRefs)
|
|
{
|
|
m_pdwVocabularyRefs = NULL;
|
|
|
|
delete m_puioCompressedVocabularyRefs;
|
|
m_puioCompressedVocabularyRefs= NULL;
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
// Builds the per-term token reference lists from the merged reference stream.
//
//   piolSource -- merged reference stream: for each term a serial number,
//                 a reference count, and that many token references.
//   cdw        -- not used by this routine.
//
// Each term gets a RefListDescriptor in m_prldTokenRefs.  Lists with fewer
// than three references are stored inline (iRefFirst, and the bitwise
// complement of the second reference in iRefSecond).  Longer lists are
// compressed straight from piolSource into a temp file, with the starting
// bit offset and basis width recorded in the descriptor.
//
// If an exception unwinds through here the temp file object and the
// descriptor array are released.
void CTextDatabase::CompressRefLists(CIOList *piolSource, UINT cdw)
{
    CIOStream   *piosCompressed = NULL;
    CCompressor *pCompressor    = NULL;

    __try
    {
        ASSERT(!m_prldTokenRefs);

        m_prldTokenRefs = PRefListDescriptor(VAlloc(TRUE, sizeof(RefListDescriptor) * DescriptorCount()));

        ASSERT(!m_puioCompressedRefs);  // for now...  BugBug! Need to restore incremental indexing
                                        //             capability.

        m_puioCompressedRefs = CUnbufferedIO::NewTempFile((PSZ)GetSourceName());

        piosCompressed = CIOStream::NewIOStream(m_puioCompressedRefs);

        piosCompressed->AttachStream(TRUE);

        pCompressor = CCompressor::NewCompressor(piosCompressed);

        ASSERT(pCompressor);

        UINT cTokensTotal = TokenCount();
        UINT ibitBase     = 0;
        UINT iTerm        = UINT(-1);

        while (!(piolSource->Empty()))
        {
            iTerm = piolSource->GetDWordIn();

            UINT cIndices = piolSource->GetDWordIn();

            PRefListDescriptor prld = m_prldTokenRefs + iTerm;

            ASSERT(cIndices == (DescriptorBase() + iTerm)->cReferences);

            prld->rb.cReferences = cIndices;

            if (cIndices >= 3)
            {
                // Long list: compress it directly out of the source stream.

                prld->ibitRefListBase = ibitBase;

                UINT cbitsBasis;

                ibitBase = pCompressor->Compress(piolSource, cIndices, 0, cTokensTotal, &cbitsBasis);

                prld->rb.cbitsBasis = cbitsBasis;
            }
            else
            {
                // Short list: keep the reference(s) in the descriptor itself.

                prld->iRefFirst = piolSource->GetDWordIn();

                if (cIndices == 2)
                    prld->iRefSecond = ~(piolSource->GetDWordIn());
            }
        }

        ASSERT(iTerm == DescriptorCount() - 1);

        // Round the bit count up to whole DWORDs.

        m_cdwCompressedRefs = (ibitBase + 31) >> 5;

        delete pCompressor;
        pCompressor = NULL;

        delete piosCompressed;
        piosCompressed = NULL;

        ASSERT(!m_pdwCompressedRefs);

        if (m_cdwCompressedRefs)
            m_pdwCompressedRefs = (PUINT) m_puioCompressedRefs->MappedImage();
    }
    __finally
    {
        if (pCompressor)
        {
            delete pCompressor;
            pCompressor = NULL;
        }

        if (piosCompressed)
        {
            delete piosCompressed;
            piosCompressed = NULL;
        }

        if (_abnormal_termination())
        {
            if (m_puioCompressedRefs)
            {
                delete m_puioCompressedRefs;
                m_puioCompressedRefs = NULL;
            }

            if (m_prldTokenRefs)
            {
                VFree(m_prldTokenRefs);
                m_prldTokenRefs = NULL;
            }
        }
    }
}
|
|
|
|
// Callback handed to Assimilate by CTextDatabase::ValidTokens.
//
//   iValue        -- index of the bit to set.
//   pvTag         -- not used.
//   pvEnvironment -- really a CIndicatorSet *; receives the bit.
//
// The bit is set with RawSetBit, so the caller is responsible for calling
// InvalidateCache on the set afterwards.
void FoundValidToken(UINT iValue, PVOID pvTag, PVOID pvEnvironment)
{
    CIndicatorSet *pisTarget = (CIndicatorSet *) pvEnvironment;

    pisTarget->RawSetBit(iValue);
}
|
|
|
|
// Returns an indicator set with one bit per row of *ptl.  Bits are set by
// the FoundValidToken callback, which m_pshtGalactic->Assimilate is given
// together with the display images of the tokens in ptl.
//
//   ptl -- token list whose sorted descriptors are to be checked.
//
// The set is created with AttachRef and handed back after ForgetRef.  If an
// exception unwinds through here the set is released with DetachRef.
CIndicatorSet *CTextDatabase::ValidTokens(CTokenList *ptl)
{
    CIndicatorSet *pisTokens = NULL;
    CAValRef      *pavr      = NULL;

    __try
    {
        SyncForQueries();

        UINT cTokens = ptl->RowCount();

        AttachRef(pisTokens, CIndicatorSet::NewIndicatorSet(cTokens));

        pavr = CAValRef::NewValRef(cTokens);

        UINT c = cTokens;

        PDESCRIPTOR *ppd = ptl->m_ppdSorted;

        for (; c--; )
        {
            PDESCRIPTOR pd = *ppd++;

            pavr->AddWCRef(pd->pwDisplay, CwDisplay(pd));
        }

        m_pshtGalactic->Assimilate(pavr, pisTokens, FoundValidToken, NULL);

        // FoundValidToken uses RawSetBit, so the cached state must be reset.

        pisTokens->InvalidateCache();

        // The pointer must be cleared after the delete: the __finally handler
        // below runs on the normal path too and would otherwise delete the
        // same object a second time.

        delete pavr;  pavr = NULL;
    }
    __finally
    {
        if (pavr) { delete pavr;  pavr = NULL; }

        if (_abnormal_termination() && pisTokens) DetachRef(pisTokens);
    }

    ForgetRef(pisTokens);

    return pisTokens;
}
|
|
|
|
void CTextDatabase::CoalesceReferenceLists()
|
|
{
|
|
PRefStream pars= NULL;
|
|
|
|
__try
|
|
{
|
|
ASSERT(!m_ibNextFileBlockHigh);
|
|
|
|
UINT cFileBlocksSpare= SPARE_FILE_BLOCKS + CIOQueue::C_BLOCKS * 3;
|
|
|
|
UINT cFileBlocksUsed = BlocksFor(m_ibNextFileBlockLow, m_cbBlockSize);
|
|
UINT cFileBlockSlots = cFileBlocksSpare + cFileBlocksUsed;
|
|
|
|
ASSERT(!m_papFileBlockLinks);
|
|
|
|
m_papFileBlockLinks= (PFileBlockLink) VAlloc(TRUE, cFileBlockSlots * sizeof(FileBlockLink));
|
|
|
|
UINT c;
|
|
|
|
for (m_pFirstFreeFileBlock= NULL, c= cFileBlocksSpare; c--; )
|
|
{
|
|
PFileBlockLink pfbl= m_papFileBlockLinks + cFileBlocksUsed + c;
|
|
|
|
pfbl->pNextBlock= m_pFirstFreeFileBlock;
|
|
|
|
m_pFirstFreeFileBlock= pfbl;
|
|
}
|
|
|
|
pars= (PRefStream) VAlloc(FALSE, m_iNextRefSet * sizeof(RefStream));
|
|
|
|
PRefClusterDescriptor prcd;
|
|
PRefStream prs;
|
|
|
|
for (c= m_iNextRefSet, prcd= m_pulstate->m_rcd, prs= pars; c--; prcd++, prs++)
|
|
{
|
|
UINT cdw= prs->cdw= prcd->cdw;
|
|
|
|
UINT cBlocks= BlocksFor(cdw * sizeof(UINT), m_cbBlockSize);
|
|
|
|
ASSERT(!(prcd->iFilePosHigh));
|
|
ASSERT(!(prcd->iFilePosLow % m_cbBlockSize));
|
|
|
|
PFileBlockLink pBlock=
|
|
prs->pFirstBlock= m_papFileBlockLinks
|
|
+ (prcd->iFilePosLow / m_cbBlockSize);
|
|
|
|
for (; cBlocks--; pBlock++)
|
|
pBlock->pNextBlock= cBlocks? pBlock + 1 : NULL;
|
|
}
|
|
|
|
ASSERT(!m_piolLeft );
|
|
ASSERT(!m_piolRight );
|
|
ASSERT(!m_piolResult);
|
|
|
|
m_piolResult= CIOList::NewIOList(m_puioRefTemp, m_papFileBlockLinks, &m_pFirstFreeFileBlock);
|
|
|
|
if (!m_piolResult)
|
|
RaiseException(STATUS_NO_MEMORY, EXCEPTION_NONCONTINUABLE, 0, NULL);
|
|
|
|
RefStream rsResult;
|
|
|
|
if (m_iNextRefSet == 1) rsResult= *pars;
|
|
else
|
|
{
|
|
m_piolLeft= CIOList::NewIOList(m_puioRefTemp, m_papFileBlockLinks, &m_pFirstFreeFileBlock);
|
|
|
|
if (!m_piolLeft)
|
|
RaiseException(STATUS_NO_MEMORY, EXCEPTION_NONCONTINUABLE, 0, NULL);
|
|
|
|
m_piolRight= CIOList::NewIOList(m_puioRefTemp, m_papFileBlockLinks, &m_pFirstFreeFileBlock);
|
|
|
|
if (!m_piolRight)
|
|
RaiseException(STATUS_NO_MEMORY, EXCEPTION_NONCONTINUABLE, 0, NULL);
|
|
|
|
MergeRefLists(&rsResult, pars, m_iNextRefSet);
|
|
|
|
delete m_piolRight; m_piolRight = NULL;
|
|
delete m_piolLeft; m_piolLeft = NULL;
|
|
}
|
|
|
|
if (m_piolResult->Writable()) m_piolResult->FlushOutput(TRUE);
|
|
|
|
RefStream rsAux= rsResult;
|
|
|
|
m_piolResult->AttachStream(&rsAux, FALSE, FALSE);
|
|
|
|
CompressArticleRefLists(m_piolResult, rsAux.cdw);
|
|
|
|
if (FPhrases())
|
|
{
|
|
m_piolResult->AttachStream(&rsResult);
|
|
|
|
CompressRefLists(m_piolResult, rsResult.cdw);
|
|
}
|
|
|
|
delete m_piolResult; m_piolResult = NULL;
|
|
delete m_puioRefTemp; m_puioRefTemp= CUnbufferedIO::NewTempFile((PSZ)GetSourceName());
|
|
|
|
if (!m_puioRefTemp)
|
|
RaiseException(STATUS_NO_MEMORY, EXCEPTION_NONCONTINUABLE, 0, NULL);
|
|
|
|
m_iNextRefSet= 0;
|
|
}
|
|
__finally
|
|
{
|
|
if (_abnormal_termination())
|
|
{
|
|
if (m_piolRight ) { m_piolRight ->ExceptionDestructor(); m_piolRight = NULL; }
|
|
if (m_piolLeft ) { m_piolLeft ->ExceptionDestructor(); m_piolLeft = NULL; }
|
|
if (m_piolResult) { m_piolResult->ExceptionDestructor(); m_piolResult = NULL; }
|
|
}
|
|
else
|
|
{
|
|
if (m_piolRight ) { delete m_piolRight; m_piolRight = NULL; }
|
|
if (m_piolLeft ) { delete m_piolLeft; m_piolLeft = NULL; }
|
|
if (m_piolResult) { delete m_piolResult; m_piolResult = NULL; }
|
|
}
|
|
|
|
if (pars ) { VFree(pars); pars = NULL; }
|
|
if (m_papFileBlockLinks) { VFree(m_papFileBlockLinks); m_papFileBlockLinks = NULL; }
|
|
|
|
if (_abnormal_termination())
|
|
{
|
|
if (m_piolResult ) { delete m_piolResult; m_piolResult = NULL; }
|
|
if (m_papFileBlockLinks) { VFree(m_papFileBlockLinks); m_papFileBlockLinks = NULL; }
|
|
|
|
if (m_puioRefTemp)
|
|
{
|
|
delete m_puioRefTemp; m_puioRefTemp= NULL;
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
// Returns a table that maps a descriptor's index (its offset from
// DescriptorBase()) to its position in m_ppdSorted.
//
// The table is cached in m_pTermRanks / m_cTermRanks and is reused for as
// long as the descriptor count matches the cached count.
//
// NOTE(review): when the count changes, the previous table pointer is simply
// dropped here without being freed -- confirm who owns that memory.
const UINT *CTextDatabase::TermRanks()
{
    SyncForQueries();

    UINT cTerms = DescriptorCount();

    if (m_cTermRanks == cTerms)
        return m_pTermRanks;

    if (m_cTermRanks)
    {
        m_cTermRanks = 0;
        m_pTermRanks = NULL;
    }

    PUINT piRemap = NULL;

    __try
    {
        piRemap = (PUINT) VAlloc(FALSE, cTerms * sizeof(UINT));

        PDESCRIPTOR  pdBase    = DescriptorBase();
        PDESCRIPTOR *ppdSorted = m_ppdSorted;

        // Invert the sorted-pointer array: rank -> descriptor becomes
        // descriptor index -> rank.

        for (UINT iRank = 0; iRank < cTerms; ++iRank)
            piRemap[ppdSorted[iRank] - pdBase] = iRank;
    }
    __finally
    {
        if (_abnormal_termination() && piRemap)
        {
            VFree(piRemap);
            piRemap = NULL;
        }
    }

    m_cTermRanks = cTerms;
    m_pTermRanks = piRemap;

    return piRemap;
}
|
|
|
|
// Computes a length for the text spanned by tokens [iTokenStart, iTokenLimit).
//
//   ppdSorted   -- descriptor pointers, indexed through puiTokenMap.
//   puiTokenMap -- maps a stored token value to an index into ppdSorted.
//   iTokenStart -- first token position (inclusive).
//   iTokenLimit -- last token position (exclusive).
//
// The result is the sum of CbImage() over the tokens, plus one for every
// token flagged LETTER_CHAR that directly follows another such token (the
// separator that would go between them).
//
// NOTE(review): this sums CbImage() whereas CopyText copies CwDisplay()
// characters per token -- confirm the two are meant to agree.
UINT CTextDatabase::TextLength(PDESCRIPTOR *ppdSorted, PUINT puiTokenMap, UINT iTokenStart, UINT iTokenLimit)
{
    SyncForQueries();

    ASSERT(FPhraseFeedback());

    ASSERT(iTokenStart <= iTokenLimit);
    ASSERT(iTokenLimit <= TokenCount());

    PUINT piToken     = TokenBase() + iTokenStart;
    UINT  cbTotal     = 0;
    BOOL  fPrevLetter = FALSE;

    for (UINT cRemaining = iTokenLimit - iTokenStart; cRemaining; --cRemaining)
    {
        PDESCRIPTOR pd = ppdSorted[puiTokenMap[*piToken++]];

        BOOL fLetter = (pd->fImageFlags & LETTER_CHAR) ? TRUE : FALSE;

        // Two letter tokens in a row need a separator between them.

        if (fLetter && fPrevLetter)
            ++cbTotal;

        fPrevLetter = fLetter;

        cbTotal += CbImage(pd);
    }

    return cbTotal;
}
|
|
|
|
// Copies the display text of tokens [iTokenStart, iTokenLimit) into pbBuffer.
//
//   ppdSorted   -- descriptor pointers, indexed through puiTokenMap.
//   puiTokenMap -- maps a stored token value to an index into ppdSorted.
//   iTokenStart -- first token position (inclusive).
//   iTokenLimit -- last token position (exclusive).
//   pbBuffer    -- destination for the WCHARs.
//   cbBuffer    -- capacity of pbBuffer, counted in WCHARs.
//
// A UNICODE_SPACE_CHAR is inserted between two consecutive tokens that are
// both flagged LETTER_CHAR.  Copying stops when the tokens or the buffer run
// out; the last token is truncated if it does not fit.  No terminator is
// written.
//
// Returns the number of WCHARs stored.
UINT CTextDatabase::CopyText(PDESCRIPTOR *ppdSorted, PUINT puiTokenMap, UINT iTokenStart, UINT iTokenLimit, PWCHAR pbBuffer, UINT cbBuffer)
{
    SyncForQueries();

    ASSERT(FPhraseFeedback());

    ASSERT(iTokenStart <= iTokenLimit);
    ASSERT(iTokenLimit <= TokenCount());

    PUINT piToken     = TokenBase() + iTokenStart;
    UINT  cwWritten   = 0;
    BOOL  fPrevLetter = FALSE;

    for (UINT cRemaining = iTokenLimit - iTokenStart; cbBuffer && cRemaining; --cRemaining)
    {
        PDESCRIPTOR pd = ppdSorted[puiTokenMap[*piToken++]];

        BOOL fLetter = (pd->fImageFlags & LETTER_CHAR) ? TRUE : FALSE;

        if (fLetter && fPrevLetter)
        {
            // Separate adjacent letter tokens with a single space.

            ++cwWritten;

            *pbBuffer++ = UNICODE_SPACE_CHAR;

            if (!--cbBuffer)
                break;
        }

        fPrevLetter = fLetter;

        // Clip the token to the room that is left.

        UINT cwToken = CwDisplay(pd);

        if (cwToken > cbBuffer)
            cwToken = cbBuffer;

        CopyMemory(pbBuffer, pd->pwDisplay, cwToken * sizeof(WCHAR));

        pbBuffer  += cwToken;
        cbBuffer  -= cwToken;
        cwWritten += cwToken;
    }

    return cwWritten;
}
|