232 lines
5.2 KiB
C++
232 lines
5.2 KiB
C++
/*****************************************************************************
|
|
* *
|
|
* CPHRASE.CPP *
|
|
* *
|
|
* Copyright (C) Microsoft Corporation 1993-1994 *
|
|
* All Rights reserved. *
|
|
* *
|
|
*****************************************************************************/
|
|
|
|
#include "stdafx.h"
|
|
|
|
#include "cphrase.h"
|
|
#include "ftsrch.h"
|
|
|
|
// These must be identical to ..\common\ctable.cpp
|
|
|
|
const int TABLE_ALLOC_SIZE = 4096; // allocate in page increments
|
|
const int MAX_POINTERS = (2048 * 1024); // 2 meg, 524,288 strings
|
|
const int MAX_STRINGS = (10 * 1024 * 1024) - 4096L; // 10 megs
|
|
|
|
#ifdef _DEBUG
|
|
#undef THIS_FILE
|
|
static char THIS_FILE[] = __FILE__;
|
|
#endif
|
|
|
|
static HASH FASTCALL ConvertToHash(PCSTR szKey);
|
|
|
|
CPhrase* pphrase;
|
|
|
|
CPhrase::CPhrase()
|
|
: CTable()
|
|
{
|
|
fWordPhrasing = FALSE;
|
|
}
|
|
|
|
/***************************************************************************
|
|
|
|
FUNCTION: CPhrase::AddPhrase
|
|
|
|
PURPOSE: Either adds the string, or updates its count if already added
|
|
|
|
PARAMETERS:
|
|
pszString
|
|
|
|
RETURNS:
|
|
|
|
COMMENTS:
|
|
We use a hash algorithm that is fast, but will generate collisions.
|
|
We ignore any collisions -- the effect on the phrase generation file
|
|
will be negligible.
|
|
|
|
MODIFICATION DATES:
|
|
24-Feb-1994 [ralphw]
|
|
|
|
***************************************************************************/
|
|
|
|
BOOL STDCALL CPhrase::AddPhrase(PSTR pszString)
|
|
{
|
|
static BOOL fWarned = FALSE;
|
|
#ifdef _DEBUG
|
|
// ptblCheck->AddString(pszString);
|
|
#endif
|
|
|
|
#ifdef CHECK_HALL
|
|
poutPhrase->outstring_eol(pszString);
|
|
#endif
|
|
|
|
if (options.fsCompress & COMPRESS_TEXT_HALL) {
|
|
return (pScanText(hCompressor, (PBYTE) pszString, strlen(pszString),
|
|
defCharSet) >= 0);
|
|
}
|
|
if (*pszString == ' ')
|
|
pszString = FirstNonSpace(pszString, options.fDBCS);
|
|
else if (!*pszString)
|
|
return TRUE;
|
|
|
|
// REVIEW: because old compiler thought it would see these
|
|
|
|
ASSERT(!strpbrk(pszString, "\001\021\013\n\r\032"));
|
|
|
|
while (strlen(pszString) >= CCH_MAX_PHRASE) {
|
|
int cb = CCH_MAX_PHRASE - 1;
|
|
|
|
// REVIEW: needs work for DBCS enabling
|
|
|
|
while (pszString[cb] == ' ' && cb)
|
|
cb--;
|
|
if (cb == 0)
|
|
return TRUE; // too long to add without any spaces
|
|
|
|
pszString[cb] = '\0';
|
|
RemoveTrailingSpaces(pszString);
|
|
AddPhrase(pszString);
|
|
pszString += cb + 1;
|
|
}
|
|
|
|
HASH hash = ConvertToHash(pszString);
|
|
int pos;
|
|
for (pos = 1; pos < endpos; pos++) {
|
|
if (((HASH_PHRASE*) ppszTable[pos])->hash == hash)
|
|
break;
|
|
}
|
|
if (pos < endpos)
|
|
((HASH_PHRASE *) ppszTable[pos])->count++;
|
|
|
|
else if (!fWarned) {
|
|
if (endpos >= maxpos) {
|
|
if (cbStrings + TABLE_ALLOC_SIZE >= MAX_STRINGS ||
|
|
cbPointers + TABLE_ALLOC_SIZE >= MAX_POINTERS) {
|
|
#ifdef _DEBUG
|
|
SendStringToParent("Phrase limit reached\r\n");
|
|
#endif
|
|
fWarned = TRUE;
|
|
}
|
|
else
|
|
IncreaseTableBuffer();
|
|
}
|
|
|
|
if ((ppszTable[endpos] = TableMalloc(strlen(pszString) + 1 +
|
|
sizeof(HASH) + sizeof(DWORD))) == NULL) {
|
|
#ifdef _DEBUG
|
|
SendStringToParent("Phrase limit reached in TableMalloc\r\n");
|
|
#endif
|
|
fWarned = TRUE;
|
|
return FALSE;
|
|
}
|
|
|
|
HASH_PHRASE* pphr = (HASH_PHRASE*) ppszTable[endpos];
|
|
pphr->hash = hash;
|
|
pphr->count = 1;
|
|
strcpy(pphr->sz, pszString);
|
|
endpos++;
|
|
}
|
|
|
|
if (fWordPhrasing)
|
|
return TRUE;
|
|
else {
|
|
fWordPhrasing = TRUE;
|
|
|
|
// Now add each word in the phrase
|
|
|
|
StrToken(pszString, ' '); // Ignore first word
|
|
PSTR pszWord;
|
|
|
|
// Write out words:
|
|
|
|
while ((pszWord = StrToken(NULL, ' ')) != NULL)
|
|
AddPhrase(pszWord);
|
|
|
|
fWordPhrasing = FALSE;
|
|
}
|
|
|
|
return TRUE;
|
|
}
|
|
|
|
/***************************************************************************
|
|
|
|
FUNCTION: CPhrase::GetPhrase
|
|
|
|
PURPOSE: Return string, once for each time it occurred.
|
|
|
|
PARAMETERS:
|
|
pszDst
|
|
|
|
RETURNS:
|
|
|
|
COMMENTS:
|
|
WARNING! This is a destructive read. You can only read this
|
|
table once!
|
|
|
|
MODIFICATION DATES:
|
|
25-Feb-1994 [ralphw]
|
|
|
|
***************************************************************************/
|
|
|
|
DWORD STDCALL CPhrase::GetPhrase(PSTR pszDst)
|
|
{
|
|
*pszDst = 0; // clear the line no matter what happens
|
|
|
|
if (curpos >= endpos)
|
|
return (DWORD) -1;
|
|
HASH_PHRASE* pphr = (HASH_PHRASE*) ppszTable[curpos++];
|
|
strcpy(pszDst, pphr->sz);
|
|
return pphr->count;
|
|
}
|
|
|
|
static const HASH MAX_CHARS = 43L;
|
|
|
|
static HASH FASTCALL ConvertToHash(PCSTR szKey)
|
|
{
|
|
int ich;
|
|
HASH hash = 0L;
|
|
|
|
// REVIEW: 14-Oct-1993 [ralphw] -- Note lack of check for a hash collision.
|
|
|
|
/*
|
|
* REVIEW: This is the hash algorithm used for context strings. That
|
|
* may not make much sense here where we must also deal with
|
|
* international character sets including DBCS.
|
|
*/
|
|
|
|
for (ich = 0; szKey[ich]; ++ich) {
|
|
if (szKey[ich] == '!')
|
|
hash = (hash * MAX_CHARS) + 11;
|
|
else if (szKey[ich] == '.')
|
|
hash = (hash * MAX_CHARS) + 12;
|
|
else if (szKey[ich] == '_')
|
|
hash = (hash * MAX_CHARS) + 13;
|
|
else if (szKey[ich] == '0')
|
|
hash = (hash * MAX_CHARS) + 10;
|
|
else if (szKey[ich] <= 'Z')
|
|
hash = (hash * MAX_CHARS) + (szKey[ich] - '0');
|
|
else
|
|
hash = (hash * MAX_CHARS) + (szKey[ich] - '0' - ('a' - 'A'));
|
|
}
|
|
|
|
/*
|
|
* Since the value 0 is reserved as a nil value, if any string
|
|
* actually hashes to this value, we just move it.
|
|
*/
|
|
|
|
return (hash == 0 ? 0 + 1 : hash);
|
|
}
|
|
|
|
void CPhrase::SortTable(void)
|
|
{
|
|
if (endpos < 3) // don't sort one entry
|
|
return;
|
|
SetTableSortColumn(sizeof(HASH) + sizeof(DWORD));
|
|
doSort(1, (int) endpos - 1);
|
|
}
|