Windows2003-3790/enduser/speech/common/include/stringblob.h

/*******************************************************************************
* StringBlob.h *
*--------------*
*   Description:
*       This is the header file for the CStringBlob class used internally by SAPI.
*
*   Copyright 1998-2000 Microsoft Corporation All Rights Reserved.
*
*******************************************************************************/

#ifndef _STRINGBLOB_H_
#define _STRINGBLOB_H_ 1

#ifndef SPDebug_h
#include <SPDebug.h>
#endif

#include <math.h>

template <class XCHAR>
class CStringBlobT
{
    XCHAR *     m_pData;            // List of words, end-to-end
    ULONG       m_cchAllocated;     // Size of m_pData
    ULONG *     m_aichWords;        // Word index => offset in m_pData  [1] is index of start of second word
    ULONG       m_cwords;           // Number of words
    ULONG       m_cwordsAllocated;  // Size of m_aichWords
    ULONG *     m_aulBuckets;       // Hash table containing indices of words or 0 for empty buckets
    ULONG       m_cBuckets;         // Number of buckets in hash table

public:
    CStringBlobT()
    {
        m_pData = NULL;
        m_cchAllocated = 0;
        m_aichWords = NULL;
        m_cwords = 0;
        m_cwordsAllocated = 0;
        m_aulBuckets = NULL;
        m_cBuckets = 0;
    }

    ~CStringBlobT()
    {
        Clear();
    }

    void Detach(XCHAR **ppszWordList, ULONG *pulSize)
    {
        *ppszWordList = NULL;
        if (m_pData)
        {
            ULONG cchDesired = StringSize();
            ULONG cbSize = SerializeSize(); // byte count, ULONG multiple

            *ppszWordList = (XCHAR*)::CoTaskMemRealloc(m_pData, cbSize);
            if (*ppszWordList == NULL)
            {
                *ppszWordList = m_pData;
                cbSize = m_cchAllocated * sizeof(XCHAR);
            }
            m_pData = NULL;

            Clear();

            if (pulSize)
            {
                *pulSize = cbSize;
            }
        }
    }

    void Clear()
    {
        if (m_pData)
        {
            ::CoTaskMemFree(m_pData);
            m_pData = NULL;
        }
        m_cchAllocated = 0;

        free(m_aichWords);
        m_aichWords = NULL;
        m_cwordsAllocated = 0;
        m_cwords = 0;

        free(m_aulBuckets);
        m_aulBuckets = NULL;
        m_cBuckets = 0;
    }

    HRESULT InitFrom(const XCHAR * pszStringArray, ULONG cch)
    {
        SPDBG_ASSERT(m_pData == NULL);

        if (cch)
        {
            ULONG cbSize = (cch * sizeof(XCHAR) + 3) & ~3;
            m_pData = (XCHAR *)::CoTaskMemAlloc(cbSize);
            if (m_pData == NULL)
                return E_OUTOFMEMORY;
            m_cchAllocated = cch;

            SPDBG_ASSERT(pszStringArray[0] == 0);   // First string is always empty.

            // First pass to copy data and count strings.
            const XCHAR * pszPastEnd = pszStringArray + cch;
            const XCHAR * psz = pszStringArray;
            XCHAR * pszOut = m_pData;
            ULONG cwords = 0;

            while (psz < pszPastEnd)
            {
                if ((*pszOut++ = *psz++) == 0)
                    ++cwords;
            }

            m_aichWords = (ULONG *) malloc(sizeof(ULONG) * cwords);
            if (m_aichWords == NULL)
                return E_OUTOFMEMORY;
            m_cwordsAllocated = cwords;
            m_cwords = cwords - 1;  // Doesn't count leading 0

            HRESULT hr = SetHashSize(cwords * 2 + 1);
            if (FAILED(hr))
                return hr;

            // Second pass to fill in indices and hash table.
            psz = pszStringArray + 1;
            const WCHAR * pszWordStart = psz;
            ULONG ulID = 1;
            m_aichWords[0] = 1;
            while (psz < pszPastEnd)
            {
                if (*(psz++) == 0)
                {
                    SPDBG_ASSERT(ulID < m_cwordsAllocated);

                    m_aichWords[ulID] = (ULONG)(psz - pszStringArray); // can't have more than 4 million chars!

                    m_aulBuckets[FindIndex(pszWordStart)] = ulID;

                    pszWordStart = psz;
                    ++ulID;
                }
            }
        }

        return S_OK;
    }

    ULONG HashKey(const XCHAR * pszString, ULONG * pcchIncNull = NULL)
    {
        ULONG hash = 0;
        ULONG cchIncNull = 1;   // one for the NULL

	    for (const XCHAR * pch = pszString; *pch; ++pch, ++cchIncNull)
            hash = hash * 65599 + *pch;

        if (pcchIncNull)
            *pcchIncNull = cchIncNull;
        return hash;
    }

    // find index for string -- returns 0 if not found
    ULONG FindIndex(const XCHAR * psz)
    {
        SPDBG_ASSERT(psz);
        ULONG cchIncNull;
        ULONG start = HashKey(psz, &cchIncNull) % m_cBuckets;
        ULONG index = start;

        do
        {
            // Not in table; return index where it should be placed.
            if (m_aulBuckets[index] == 0)
                return index;

            // Compare length and if it matches compare full string.
            if (m_aichWords[m_aulBuckets[index]] - m_aichWords[m_aulBuckets[index] - 1] == cchIncNull &&
                IsEqual(m_aichWords[m_aulBuckets[index] - 1], psz))
            {
                // Found this word already in the table.
                return index;
            }

            if (++index >= m_cBuckets)
                index -= m_cBuckets;
        } while (index != start);

        SPDBG_ASSERT(m_cwords == m_cBuckets);   // Shouldn't ever get here

        return (ULONG) -1;
    }


    // Returns ID; use IndexFromId to recover string offset
    ULONG Find(const XCHAR * psz)
    {
        if (psz == NULL || m_cwords == 0)
            return 0;

        // Should always succeed in finding a bucket, since hash table is >2x larger than # of elements.
        ULONG   ibucket = FindIndex(psz);
        return m_aulBuckets[ibucket];    // May be 0 if not in table
    }


    ULONG primeNext(ULONG val)
    {
        if (val < 2)
            val = 2; /* the smallest prime number */

        for (;;)
        {
            /* Is val a prime number? */
            ULONG maxFactor = (ULONG) sqrt ((double) val);

            /* Is i a factor of val? */
            for (ULONG i = 2; i <= maxFactor; i++)
                if (val % i == 0)
                    break;

            if (i > maxFactor)
                return (val);

            val++;
        }
    }


    HRESULT SetHashSize(ULONG cbuckets)
    {
        if (cbuckets > m_cBuckets)
        {
            ULONG * oldtable = m_aulBuckets;
            ULONG oldentry = m_cBuckets;
            ULONG prime = primeNext(cbuckets);

            // Alloc new table.
            m_aulBuckets = (ULONG *) malloc(prime * sizeof(ULONG));
            if (m_aulBuckets == NULL)
            {
                m_aulBuckets = oldtable;
                return E_OUTOFMEMORY;
            }

            for (ULONG i=0; i < prime; i++)
            {
                m_aulBuckets[i] = 0;
            }

            m_cBuckets = prime;

            for (i = 0; i < oldentry; i++)
            {
                if (oldtable[i] != 0)
                {
                    ULONG ibucket = FindIndex(m_pData + m_aichWords[oldtable[i] - 1]);
                    m_aulBuckets[ibucket] = oldtable[i];
                }
            }

            free(oldtable);
        }

        return S_OK;
    }


    //
    //  The ID for a NULL string is always 0, the ID for subsequent strings is the
    //  index of the string + 1;
    //
    HRESULT Add(const XCHAR * psz, ULONG * pichOffset, ULONG *pulID = NULL)
    {
        ULONG   ID = 0;

        if (psz)
        {
            // Grow if we're more than half full.
            if (m_cwords * 2 >= m_cBuckets)
            {
                HRESULT hr = SetHashSize(m_cwords * 3 + 17);
                if (FAILED(hr))
                    return hr;
            }

            // Find out where this element should end up in hash table.
            ULONG ibucket = FindIndex(psz);

            if (m_aulBuckets[ibucket] == 0)
            {
                // Not found in hash table.  Append it to the end.

                // Grow ID=>index mapping array if necessary.
                if (m_cwords + 1 >= m_cwordsAllocated)  // 1 extra for init. zero
                {
                    void * pvNew = realloc(m_aichWords, sizeof(*m_aichWords) * (m_cwords + 100));
                    if (pvNew == NULL)
                        return E_OUTOFMEMORY;
                    m_aichWords = (ULONG *)pvNew;
                    m_cwordsAllocated = m_cwords + 100;
                    m_aichWords[0] = 1;
                }

                // Grow string storage if necessary.
                ULONG   cchIncNull = xcslen(psz);
                if (m_aichWords[m_cwords] + cchIncNull > m_cchAllocated)
                {
                    ULONG cbDesired = ((m_cchAllocated + cchIncNull) * sizeof(XCHAR) + 0x2003) & ~3;
                    void * pvNew = ::CoTaskMemRealloc(m_pData, cbDesired);
                    if (pvNew == NULL)
                    {
                        return E_OUTOFMEMORY;
                    }
                    m_pData = (XCHAR *)pvNew;

                    m_pData[0] = 0;
                    m_cchAllocated = cbDesired / sizeof(XCHAR);
                }
                memcpy(m_pData + m_aichWords[m_cwords], psz, cchIncNull * sizeof(XCHAR));

                ++m_cwords;

                m_aichWords[m_cwords] = m_aichWords[m_cwords - 1] + cchIncNull;

                // Fill in hash table entry with index of string.
                m_aulBuckets[ibucket] = m_cwords;

                ID = m_cwords;
            }
            else
            {
                // It was already there.
                ID = m_aulBuckets[ibucket];
            }
        }

        *pichOffset = ID ? m_aichWords[ID - 1] : 0;
        if (pulID)
        {
            *pulID = ID;
        }
        return S_OK;
    }

    const ULONG GetNumItems() const
    {
        return m_cwords;
    }

    const XCHAR * String(ULONG ichOffset) const
    {
        return ichOffset ? m_pData + ichOffset : NULL;
    }

    static int xcscmp(const WCHAR * p0, const WCHAR * p1)
    {
        return wcscmp(p0, p1);
    }

    static int xcscmp(const char * p0, const char * p1)
    {
        return strcmp(p0, p1);
    }

    static int xcslen(const WCHAR * p)
    {
        return wcslen(p) + 1;
    }

    static int xcslen(const char * p)
    {
        return strlen(p) + 1;
    }

    BOOL IsEqual(ULONG ichOffset, const XCHAR * psz)
    {
        if (ichOffset)
        {
            return (psz ? (xcscmp(m_pData + ichOffset, psz) == 0) : FALSE);
        }
        else
        {
            return (psz == NULL);
        }
    }

    ULONG StringSize(void) const
    {
        return m_cwords ? m_aichWords[m_cwords] : 0;
    }

    ULONG IndexFromId(ULONG ulID) const
    {
        SPDBG_ASSERT(ulID <= m_cwords);
        if (ulID > 0)
        {
            return m_aichWords[ulID - 1];
        }
        return 0;
    }

    const XCHAR * Item(ULONG ulID) const
    {
        SPDBG_ASSERT(ulID <= m_cwords);
        if ((ulID < 1) || m_pData == NULL)
        {
            return NULL;
        }

        return m_pData + IndexFromId(ulID);
    }

    ULONG SerializeSize() const
    {
        return (StringSize() * sizeof(XCHAR) + 3) & ~3;
    }

    const XCHAR * SerializeData()
    {
        ULONG cchWrite = StringSize();
        if (cchWrite)
        {
            const ULONG cb = cchWrite * sizeof(XCHAR);

            if (cb % 4)  // We know there's room since data is always DWORD aligned by
            {
                memset(m_pData + cchWrite, 0xcc, 4 - (cb & 3)); // Junk data so make sure it's not null
            }
        }
        return m_pData;
    }
};


typedef class CStringBlobT<WCHAR> CStringBlob;
typedef class CStringBlobT<WCHAR> CStringBlobW;
typedef class CStringBlobT<char>  CStringBlobA;

#endif  // _STRINGBLOB_H_