Windows2003-3790/enduser/speech/common/include/stringblob.h
2020-09-30 16:53:55 +02:00

442 lines
12 KiB
C++

/*******************************************************************************
* StringBlob.h *
*--------------*
* Description:
* This is the header file for the CStringBlob class used internally by SAPI.
*
* Copyright 1998-2000 Microsoft Corporation All Rights Reserved.
*
*******************************************************************************/
#ifndef _STRINGBLOB_H_
#define _STRINGBLOB_H_ 1
#ifndef SPDebug_h
#include <SPDebug.h>
#endif
#include <math.h>
template <class XCHAR>
class CStringBlobT
{
XCHAR * m_pData; // List of words, end-to-end
ULONG m_cchAllocated; // Size of m_pData
ULONG * m_aichWords; // Word index => offset in m_pData [1] is index of start of second word
ULONG m_cwords; // Number of words
ULONG m_cwordsAllocated; // Size of m_aichWords
ULONG * m_aulBuckets; // Hash table containing indices of words or 0 for empty buckets
ULONG m_cBuckets; // Number of buckets in hash table
public:
CStringBlobT()
{
m_pData = NULL;
m_cchAllocated = 0;
m_aichWords = NULL;
m_cwords = 0;
m_cwordsAllocated = 0;
m_aulBuckets = NULL;
m_cBuckets = 0;
}
~CStringBlobT()
{
Clear();
}
void Detach(XCHAR **ppszWordList, ULONG *pulSize)
{
*ppszWordList = NULL;
if (m_pData)
{
ULONG cchDesired = StringSize();
ULONG cbSize = SerializeSize(); // byte count, ULONG multiple
*ppszWordList = (XCHAR*)::CoTaskMemRealloc(m_pData, cbSize);
if (*ppszWordList == NULL)
{
*ppszWordList = m_pData;
cbSize = m_cchAllocated * sizeof(XCHAR);
}
m_pData = NULL;
Clear();
if (pulSize)
{
*pulSize = cbSize;
}
}
}
void Clear()
{
if (m_pData)
{
::CoTaskMemFree(m_pData);
m_pData = NULL;
}
m_cchAllocated = 0;
free(m_aichWords);
m_aichWords = NULL;
m_cwordsAllocated = 0;
m_cwords = 0;
free(m_aulBuckets);
m_aulBuckets = NULL;
m_cBuckets = 0;
}
HRESULT InitFrom(const XCHAR * pszStringArray, ULONG cch)
{
SPDBG_ASSERT(m_pData == NULL);
if (cch)
{
ULONG cbSize = (cch * sizeof(XCHAR) + 3) & ~3;
m_pData = (XCHAR *)::CoTaskMemAlloc(cbSize);
if (m_pData == NULL)
return E_OUTOFMEMORY;
m_cchAllocated = cch;
SPDBG_ASSERT(pszStringArray[0] == 0); // First string is always empty.
// First pass to copy data and count strings.
const XCHAR * pszPastEnd = pszStringArray + cch;
const XCHAR * psz = pszStringArray;
XCHAR * pszOut = m_pData;
ULONG cwords = 0;
while (psz < pszPastEnd)
{
if ((*pszOut++ = *psz++) == 0)
++cwords;
}
m_aichWords = (ULONG *) malloc(sizeof(ULONG) * cwords);
if (m_aichWords == NULL)
return E_OUTOFMEMORY;
m_cwordsAllocated = cwords;
m_cwords = cwords - 1; // Doesn't count leading 0
HRESULT hr = SetHashSize(cwords * 2 + 1);
if (FAILED(hr))
return hr;
// Second pass to fill in indices and hash table.
psz = pszStringArray + 1;
const WCHAR * pszWordStart = psz;
ULONG ulID = 1;
m_aichWords[0] = 1;
while (psz < pszPastEnd)
{
if (*(psz++) == 0)
{
SPDBG_ASSERT(ulID < m_cwordsAllocated);
m_aichWords[ulID] = (ULONG)(psz - pszStringArray); // can't have more than 4 million chars!
m_aulBuckets[FindIndex(pszWordStart)] = ulID;
pszWordStart = psz;
++ulID;
}
}
}
return S_OK;
}
ULONG HashKey(const XCHAR * pszString, ULONG * pcchIncNull = NULL)
{
ULONG hash = 0;
ULONG cchIncNull = 1; // one for the NULL
for (const XCHAR * pch = pszString; *pch; ++pch, ++cchIncNull)
hash = hash * 65599 + *pch;
if (pcchIncNull)
*pcchIncNull = cchIncNull;
return hash;
}
// find index for string -- returns 0 if not found
ULONG FindIndex(const XCHAR * psz)
{
SPDBG_ASSERT(psz);
ULONG cchIncNull;
ULONG start = HashKey(psz, &cchIncNull) % m_cBuckets;
ULONG index = start;
do
{
// Not in table; return index where it should be placed.
if (m_aulBuckets[index] == 0)
return index;
// Compare length and if it matches compare full string.
if (m_aichWords[m_aulBuckets[index]] - m_aichWords[m_aulBuckets[index] - 1] == cchIncNull &&
IsEqual(m_aichWords[m_aulBuckets[index] - 1], psz))
{
// Found this word already in the table.
return index;
}
if (++index >= m_cBuckets)
index -= m_cBuckets;
} while (index != start);
SPDBG_ASSERT(m_cwords == m_cBuckets); // Shouldn't ever get here
return (ULONG) -1;
}
// Returns ID; use IndexFromId to recover string offset
ULONG Find(const XCHAR * psz)
{
if (psz == NULL || m_cwords == 0)
return 0;
// Should always succeed in finding a bucket, since hash table is >2x larger than # of elements.
ULONG ibucket = FindIndex(psz);
return m_aulBuckets[ibucket]; // May be 0 if not in table
}
ULONG primeNext(ULONG val)
{
if (val < 2)
val = 2; /* the smallest prime number */
for (;;)
{
/* Is val a prime number? */
ULONG maxFactor = (ULONG) sqrt ((double) val);
/* Is i a factor of val? */
for (ULONG i = 2; i <= maxFactor; i++)
if (val % i == 0)
break;
if (i > maxFactor)
return (val);
val++;
}
}
HRESULT SetHashSize(ULONG cbuckets)
{
if (cbuckets > m_cBuckets)
{
ULONG * oldtable = m_aulBuckets;
ULONG oldentry = m_cBuckets;
ULONG prime = primeNext(cbuckets);
// Alloc new table.
m_aulBuckets = (ULONG *) malloc(prime * sizeof(ULONG));
if (m_aulBuckets == NULL)
{
m_aulBuckets = oldtable;
return E_OUTOFMEMORY;
}
for (ULONG i=0; i < prime; i++)
{
m_aulBuckets[i] = 0;
}
m_cBuckets = prime;
for (i = 0; i < oldentry; i++)
{
if (oldtable[i] != 0)
{
ULONG ibucket = FindIndex(m_pData + m_aichWords[oldtable[i] - 1]);
m_aulBuckets[ibucket] = oldtable[i];
}
}
free(oldtable);
}
return S_OK;
}
//
// The ID for a NULL string is always 0, the ID for subsequent strings is the
// index of the string + 1;
//
HRESULT Add(const XCHAR * psz, ULONG * pichOffset, ULONG *pulID = NULL)
{
ULONG ID = 0;
if (psz)
{
// Grow if we're more than half full.
if (m_cwords * 2 >= m_cBuckets)
{
HRESULT hr = SetHashSize(m_cwords * 3 + 17);
if (FAILED(hr))
return hr;
}
// Find out where this element should end up in hash table.
ULONG ibucket = FindIndex(psz);
if (m_aulBuckets[ibucket] == 0)
{
// Not found in hash table. Append it to the end.
// Grow ID=>index mapping array if necessary.
if (m_cwords + 1 >= m_cwordsAllocated) // 1 extra for init. zero
{
void * pvNew = realloc(m_aichWords, sizeof(*m_aichWords) * (m_cwords + 100));
if (pvNew == NULL)
return E_OUTOFMEMORY;
m_aichWords = (ULONG *)pvNew;
m_cwordsAllocated = m_cwords + 100;
m_aichWords[0] = 1;
}
// Grow string storage if necessary.
ULONG cchIncNull = xcslen(psz);
if (m_aichWords[m_cwords] + cchIncNull > m_cchAllocated)
{
ULONG cbDesired = ((m_cchAllocated + cchIncNull) * sizeof(XCHAR) + 0x2003) & ~3;
void * pvNew = ::CoTaskMemRealloc(m_pData, cbDesired);
if (pvNew == NULL)
{
return E_OUTOFMEMORY;
}
m_pData = (XCHAR *)pvNew;
m_pData[0] = 0;
m_cchAllocated = cbDesired / sizeof(XCHAR);
}
memcpy(m_pData + m_aichWords[m_cwords], psz, cchIncNull * sizeof(XCHAR));
++m_cwords;
m_aichWords[m_cwords] = m_aichWords[m_cwords - 1] + cchIncNull;
// Fill in hash table entry with index of string.
m_aulBuckets[ibucket] = m_cwords;
ID = m_cwords;
}
else
{
// It was already there.
ID = m_aulBuckets[ibucket];
}
}
*pichOffset = ID ? m_aichWords[ID - 1] : 0;
if (pulID)
{
*pulID = ID;
}
return S_OK;
}
const ULONG GetNumItems() const
{
return m_cwords;
}
const XCHAR * String(ULONG ichOffset) const
{
return ichOffset ? m_pData + ichOffset : NULL;
}
static int xcscmp(const WCHAR * p0, const WCHAR * p1)
{
return wcscmp(p0, p1);
}
static int xcscmp(const char * p0, const char * p1)
{
return strcmp(p0, p1);
}
static int xcslen(const WCHAR * p)
{
return wcslen(p) + 1;
}
static int xcslen(const char * p)
{
return strlen(p) + 1;
}
BOOL IsEqual(ULONG ichOffset, const XCHAR * psz)
{
if (ichOffset)
{
return (psz ? (xcscmp(m_pData + ichOffset, psz) == 0) : FALSE);
}
else
{
return (psz == NULL);
}
}
ULONG StringSize(void) const
{
return m_cwords ? m_aichWords[m_cwords] : 0;
}
ULONG IndexFromId(ULONG ulID) const
{
SPDBG_ASSERT(ulID <= m_cwords);
if (ulID > 0)
{
return m_aichWords[ulID - 1];
}
return 0;
}
const XCHAR * Item(ULONG ulID) const
{
SPDBG_ASSERT(ulID <= m_cwords);
if ((ulID < 1) || m_pData == NULL)
{
return NULL;
}
return m_pData + IndexFromId(ulID);
}
ULONG SerializeSize() const
{
return (StringSize() * sizeof(XCHAR) + 3) & ~3;
}
const XCHAR * SerializeData()
{
ULONG cchWrite = StringSize();
if (cchWrite)
{
const ULONG cb = cchWrite * sizeof(XCHAR);
if (cb % 4) // We know there's room since data is always DWORD aligned by
{
memset(m_pData + cchWrite, 0xcc, 4 - (cb & 3)); // Junk data so make sure it's not null
}
}
return m_pData;
}
};
typedef class CStringBlobT<WCHAR> CStringBlob;
typedef class CStringBlobT<WCHAR> CStringBlobW;
typedef class CStringBlobT<char> CStringBlobA;
#endif // _STRINGBLOB_H_