2020-09-30 17:12:29 +02:00

232 lines
5.2 KiB
C++

/*****************************************************************************
* *
* CPHRASE.CPP *
* *
* Copyright (C) Microsoft Corporation 1993-1994 *
* All Rights reserved. *
* *
*****************************************************************************/
#include "stdafx.h"
#include "cphrase.h"
#include "ftsrch.h"
// Table sizing limits. These must be identical to ..\common\ctable.cpp
// AddPhrase compares cbStrings/cbPointers (plus one TABLE_ALLOC_SIZE step)
// against MAX_STRINGS/MAX_POINTERS before asking the table to grow.
const int TABLE_ALLOC_SIZE = 4096; // growth step, in bytes: allocate in page increments
const int MAX_POINTERS = (2048 * 1024); // 2 meg of pointer table; 524,288 strings at 4 bytes per pointer
const int MAX_STRINGS = (10 * 1024 * 1024) - 4096L; // 10 megs of string data, less one 4096-byte step
#ifdef _DEBUG
// Debug builds: replace any earlier THIS_FILE with this file's name.
// NOTE(review): presumably consumed by the ASSERT macro -- confirm.
#undef THIS_FILE
static char THIS_FILE[] = __FILE__;
#endif
// Forward declaration; the definition follows CPhrase::GetPhrase below.
static HASH FASTCALL ConvertToHash(PCSTR szKey);
// Global phrase-table pointer. It is not assigned anywhere in this file.
CPhrase* pphrase;
/***************************************************************************
    FUNCTION:   CPhrase::CPhrase
    PURPOSE:    Default-construct the CTable base and clear the
                word-phrasing flag.
    COMMENTS:
        fWordPhrasing is TRUE only while AddPhrase is feeding the
        individual words of a phrase back through itself; it must start
        out FALSE so the first AddPhrase call does that word pass.
***************************************************************************/
CPhrase::CPhrase()
    : CTable()
{
    this->fWordPhrasing = FALSE;
}
/***************************************************************************
    FUNCTION:   CPhrase::AddPhrase
    PURPOSE:    Either adds the string, or updates its count if already
                added. Unless called for a single word, it then adds each
                word after the first as a phrase of its own.
    PARAMETERS:
        pszString   -- phrase text. MODIFIED IN PLACE: over-long phrases
                       are cut by writing NULs into the buffer, and the
                       word pass tokenizes it with StrToken.
    RETURNS:
        TRUE if the phrase was counted, added, or deliberately skipped
        (empty, over-long with no space to split at, or the table is
        already known to be full).
        FALSE on the call that discovers the table cannot take another
        entry (size limit reached, or TableMalloc failed).
        When COMPRESS_TEXT_HALL is set, the string is handed to pScanText
        instead and the result is (pScanText(...) >= 0).
    COMMENTS:
        We use a hash algorithm that is fast, but will generate collisions.
        We ignore any collisions -- the effect on the phrase generation file
        will be negligible. Lookup is by hash only; the text is never
        compared.

        fWarned is a function-level static, so once the limit is hit no
        further phrases are added by any CPhrase object in the process.
    MODIFICATION DATES:
        24-Feb-1994 [ralphw]
***************************************************************************/
BOOL STDCALL CPhrase::AddPhrase(PSTR pszString)
{
    static BOOL fWarned = FALSE;

#ifdef _DEBUG
//  ptblCheck->AddString(pszString);
#endif

#ifdef CHECK_HALL
    poutPhrase->outstring_eol(pszString);
#endif

    if (options.fsCompress & COMPRESS_TEXT_HALL) {
        return (pScanText(hCompressor, (PBYTE) pszString, strlen(pszString),
            defCharSet) >= 0);
    }

    // Skip leading spaces, then re-test for an empty string: a phrase made
    // only of spaces must not be hashed and stored as an empty entry.
    // NOTE(review): assumes FirstNonSpace returns a pointer to the
    // terminating NUL when nothing but spaces remain -- confirm.
    if (*pszString == ' ')
        pszString = FirstNonSpace(pszString, options.fDBCS);
    if (!*pszString)
        return TRUE;

    // REVIEW: because old compiler thought it would see these
    ASSERT(!strpbrk(pszString, "\001\021\013\n\r\032"));

    // Break an over-long phrase into pieces shorter than CCH_MAX_PHRASE,
    // cutting at the last space that falls inside the limit.
    while (strlen(pszString) >= CCH_MAX_PHRASE) {
        int cb = CCH_MAX_PHRASE - 1;

        // REVIEW: needs work for DBCS enabling
        // Walk back to the nearest space. (The test must be "!= ' '":
        // stepping back only while the byte IS a space never finds a split
        // point and overwrites a real character with the NUL below.)
        while (cb && pszString[cb] != ' ')
            cb--;
        if (cb == 0)
            return TRUE; // too long to add without any spaces

        pszString[cb] = '\0';
        RemoveTrailingSpaces(pszString);
        AddPhrase(pszString);

        // Continue with whatever follows the split point, again ignoring
        // leading spaces and stopping if nothing is left.
        pszString += cb + 1;
        if (*pszString == ' ')
            pszString = FirstNonSpace(pszString, options.fDBCS);
        if (!*pszString)
            return TRUE;
    }

    HASH hash = ConvertToHash(pszString);

    // Entries start at slot 1; look for one with the same hash.
    int pos;
    for (pos = 1; pos < endpos; pos++) {
        if (((HASH_PHRASE*) ppszTable[pos])->hash == hash)
            break;
    }

    if (pos < endpos)
        ((HASH_PHRASE *) ppszTable[pos])->count++;
    else if (!fWarned) {
        if (endpos >= maxpos) {
            if (cbStrings + TABLE_ALLOC_SIZE >= MAX_STRINGS ||
                    cbPointers + TABLE_ALLOC_SIZE >= MAX_POINTERS) {
#ifdef _DEBUG
                SendStringToParent("Phrase limit reached\r\n");
#endif
                fWarned = TRUE;

                // The table could not be grown, so slot endpos does not
                // exist. Bail out instead of writing past the table; this
                // mirrors the TableMalloc failure path below.
                return FALSE;
            }
            IncreaseTableBuffer();
        }

        // Each entry holds the hash, the occurrence count, and the text.
        if ((ppszTable[endpos] = TableMalloc(strlen(pszString) + 1 +
                sizeof(HASH) + sizeof(DWORD))) == NULL) {
#ifdef _DEBUG
            SendStringToParent("Phrase limit reached in TableMalloc\r\n");
#endif
            fWarned = TRUE;
            return FALSE;
        }

        HASH_PHRASE* pphr = (HASH_PHRASE*) ppszTable[endpos];
        pphr->hash = hash;
        pphr->count = 1;
        strcpy(pphr->sz, pszString);
        endpos++;
    }

    // A call made from the word pass below stops here, so single words are
    // not tokenized again.
    if (fWordPhrasing)
        return TRUE;

    fWordPhrasing = TRUE;

    // Now add each word in the phrase
    // NOTE(review): StrToken is presumably strtok-like (keeps its position
    // between calls and writes NULs into pszString) -- confirm.
    StrToken(pszString, ' '); // Ignore first word

    PSTR pszWord;
    while ((pszWord = StrToken(NULL, ' ')) != NULL)
        AddPhrase(pszWord);

    fWordPhrasing = FALSE;
    return TRUE;
}
/***************************************************************************
    FUNCTION:   CPhrase::GetPhrase
    PURPOSE:    Copy the phrase at the current read position into the
                caller's buffer and report how many times it was added.
    PARAMETERS:
        pszDst  -- receives the NUL-terminated phrase. It is set to an
                   empty string first, so it is valid even at end of table.
                   No size is passed; the text is copied with strcpy, so
                   the caller must supply a buffer large enough for any
                   stored phrase.
    RETURNS:
        The phrase's occurrence count, or (DWORD) -1 once curpos has
        reached endpos.
    COMMENTS:
        WARNING! Every successful call advances curpos and nothing here
        rewinds it, so the table can only be read through once.
    MODIFICATION DATES:
        25-Feb-1994 [ralphw]
***************************************************************************/
DWORD STDCALL CPhrase::GetPhrase(PSTR pszDst)
{
    pszDst[0] = '\0'; // clear the line no matter what happens

    if (curpos < endpos) {
        HASH_PHRASE* pEntry = (HASH_PHRASE*) ppszTable[curpos];
        curpos++;
        strcpy(pszDst, pEntry->sz);
        return pEntry->count;
    }

    return (DWORD) -1; // nothing left to read
}
// Multiplier applied to the running hash for every character consumed.
static const HASH MAX_CHARS = 43L;

/***************************************************************************
    FUNCTION:   ConvertToHash
    PURPOSE:    Fold a NUL-terminated string into a HASH value.
    PARAMETERS:
        szKey   -- string to hash
    RETURNS:
        The hash; never 0, because 0 is reserved as a nil value.
    COMMENTS:
        For each character the running hash is multiplied by MAX_CHARS
        and a per-character value is added:
            '!' -> 11, '.' -> 12, '_' -> 13, '0' -> 10
            anything else up to 'Z'  -> ch - '0'
            anything above 'Z'       -> ch - '0' - ('a' - 'A')
        so an ASCII lowercase letter hashes the same as its uppercase form.

        REVIEW: 14-Oct-1993 [ralphw] -- Note lack of check for a hash
        collision.

        REVIEW: This is the hash algorithm used for context strings. That
        may not make much sense here where we must also deal with
        international character sets including DBCS.
***************************************************************************/
static HASH FASTCALL ConvertToHash(PCSTR szKey)
{
    HASH hashAccum = 0L;

    for (PCSTR pch = szKey; *pch; ++pch) {
        int value;

        switch (*pch) {
            case '!':
                value = 11;
                break;

            case '.':
                value = 12;
                break;

            case '_':
                value = 13;
                break;

            case '0':
                value = 10;
                break;

            default:
                value = *pch - '0';
                if (*pch > 'Z')
                    value -= ('a' - 'A'); // fold onto the uppercase range
                break;
        }

        hashAccum = (hashAccum * MAX_CHARS) + value;
    }

    /*
     * Since the value 0 is reserved as a nil value, if any string
     * actually hashes to this value, we just move it.
     */
    return (hashAccum != 0 ? hashAccum : 0 + 1);
}
/***************************************************************************
    FUNCTION:   CPhrase::SortTable
    PURPOSE:    Sort the stored entries, slots 1 through endpos - 1.
    COMMENTS:
        Does nothing when endpos is below 3 (at most one entry, since
        slots are used from 1 upward).
        The sort column is set to sizeof(HASH) + sizeof(DWORD), the
        combined size of the two fields AddPhrase allocates ahead of the
        text in each entry.
        NOTE(review): presumably this makes doSort compare the phrase
        text rather than the hash/count -- confirm against CTable.
***************************************************************************/
void CPhrase::SortTable(void)
{
    if (endpos >= 3) { // one entry (or none) needs no sorting
        SetTableSortColumn(sizeof(HASH) + sizeof(DWORD));
        doSort(1, (int) endpos - 1);
    }
}