886 lines
28 KiB
C++
886 lines
28 KiB
C++
|
/*********************************************************************
Silence.Cpp - Code for detecting silence on an incoming audio stream

begun 5/14/94 by Mike Rozak
Modified 12/10/96 by John Merrill to fix up alignment problems
*/
|
||
|
|
||
|
|
||
|
#include "stdafx.h"
|
||
|
#include <malloc.h>
|
||
|
#include "silence.h"
|
||
|
|
||
|
// temporary
|
||
|
#pragma warning(disable: 4100 4244)
|
||
|
|
||
|
/*********************************************************************
|
||
|
LowPassFilter - This low-pass filters 16-bit mono PCM data from one
|
||
|
buffer into another.
|
||
|
|
||
|
inputs
|
||
|
short *lpSrc - Source buffer
|
||
|
DWORD dwNumSamples - Number of samples in the source buffer
|
||
|
short *lpDst - Destination buffer. This will be filled in
|
||
|
with a low-passed version. It will have about an 8
|
||
|
sample lag. This must be as large as lpSrc.
|
||
|
short *psMax - Filled in with the new maximum.
|
||
|
If NULL then nothing is copied.
|
||
|
short *psMin - Filled in with the new minimum
|
||
|
If NULL then nothing is copied.
|
||
|
short *psAvg - Filled in with the new average
|
||
|
If NULL then nothing is copied.
|
||
|
DWORD dwSamplesPerSec
|
||
|
returns
|
||
|
DWORD - Number of samples returned. This will be <= dwNumSamples,
|
||
|
possible dwNumSamples - 7.
|
||
|
*/
|
||
|
DWORD LowPassFilter (short *lpSrc, DWORD dwNumSamples, short *lpDst,
|
||
|
short *psMax, short *psMin, short *psAvg, DWORD dwSamplesPerSec)
|
||
|
{
|
||
|
SPDBG_FUNC( "LowPassFilter" );
|
||
|
DWORD i;
|
||
|
long lSum;
|
||
|
short sSum, sMax, sMin;
|
||
|
short *lpLag;
|
||
|
BOOL fLow = (dwSamplesPerSec < 13000);
|
||
|
|
||
|
#define SHIFTRIGHT (fLow ? 3 : 4) // # bits to shift right.
|
||
|
#define WINDOWSIZE (1 << SHIFTRIGHT) // # samples
|
||
|
|
||
|
if (dwNumSamples < (DWORD) (WINDOWSIZE+1))
|
||
|
return 0;
|
||
|
|
||
|
// take the first 8 samples and average them together.
|
||
|
lSum = 0;
|
||
|
for (i = 0; i < (DWORD) WINDOWSIZE; i++)
|
||
|
lSum += lpSrc[i];
|
||
|
sSum = (short) (lSum >> SHIFTRIGHT);
|
||
|
|
||
|
//loop through the rest of the samples
|
||
|
lpLag = lpSrc;
|
||
|
lpSrc += WINDOWSIZE;
|
||
|
dwNumSamples -= WINDOWSIZE;
|
||
|
lSum = 0; // total
|
||
|
sMax = -32768;
|
||
|
sMin = 32767;
|
||
|
for (i = 0;dwNumSamples; lpSrc++, lpDst++, lpLag++, i++, dwNumSamples--) {
|
||
|
sSum = sSum - (*lpLag >> SHIFTRIGHT) + (*lpSrc >> SHIFTRIGHT);
|
||
|
// sSum = *lpSrc; // Dont do any filtering at all
|
||
|
*lpDst = sSum;
|
||
|
lSum += sSum;
|
||
|
if (sSum > sMax)
|
||
|
sMax = sSum;
|
||
|
if (sSum < sMin)
|
||
|
sMin = sSum;
|
||
|
};
|
||
|
|
||
|
// whow much did we do
|
||
|
if (psMax)
|
||
|
*psMax = sMax;
|
||
|
if (psMin)
|
||
|
*psMin = sMin;
|
||
|
if (psAvg && i)
|
||
|
*psAvg = (short) (lSum / (long) i);
|
||
|
return i;
|
||
|
}
|
||
|
|
||
|
|
||
|
/*********************************************************************
|
||
|
QuantSamples - This quantizes the samples to +1, 0, or -1 (in place),
|
||
|
depedning if the given value is:
|
||
|
> sPositive then +1
|
||
|
< sNegative then -1
|
||
|
else 0
|
||
|
|
||
|
inputs
|
||
|
short *pSamples - Samples
|
||
|
DWORD dwNumSamples - Number of samples
|
||
|
short sPositive - Positive threshhold
|
||
|
short sNegative - Negative threshhold
|
||
|
returns
|
||
|
none
|
||
|
*/
|
||
|
void QuantSamples (short *pSamples, DWORD dwNumSamples,
|
||
|
short sPositive, short sNegative)
|
||
|
{
|
||
|
SPDBG_FUNC( "QuantSamples" );
|
||
|
while (dwNumSamples) {
|
||
|
if (*pSamples > sPositive)
|
||
|
*pSamples = 1;
|
||
|
else if (*pSamples < sNegative)
|
||
|
*pSamples = -1;
|
||
|
else
|
||
|
*pSamples = 0;
|
||
|
pSamples++;
|
||
|
dwNumSamples--;
|
||
|
};
|
||
|
}
|
||
|
|
||
|
/*********************************************************************
|
||
|
FindZC - This searches through the samples for the first zero crossing.
|
||
|
The returned point will have its previous sample at <= 0, and the
|
||
|
new one at >0.
|
||
|
|
||
|
inputs
|
||
|
short *pSamples - Samples;
|
||
|
DWORD dwNumSamples - Number of samples
|
||
|
returns
|
||
|
DWORD - first sampe number which is positive, or 0 if cant find
|
||
|
*/
|
||
|
DWORD FindZC (short *pSamples, DWORD dwNumSamples)
|
||
|
{
|
||
|
SPDBG_FUNC( "FindZC" );
|
||
|
DWORD i;
|
||
|
|
||
|
for (i = 1; i < dwNumSamples; i++)
|
||
|
if ((pSamples[i] > 0) && (pSamples[i-1] <= 0))
|
||
|
return i;
|
||
|
|
||
|
// else cant find
|
||
|
return 0;
|
||
|
}
|
||
|
|
||
|
|
||
|
/*********************************************************************
|
||
|
CompareSegments - This compares two wave segments and sees how much
|
||
|
alike they are, returning a confidence that they are the same.
|
||
|
|
||
|
inputs
|
||
|
short *pA - Samples. This assumes that the samples
|
||
|
are -1, 0, or +1.
|
||
|
short *pB - Samples for B. Should be -1, 0, or +1
|
||
|
DWORD dwNumSamples - Number of samples in each of them
|
||
|
returns
|
||
|
WORD - Confidence from 0 to 0xffff (highest confidence)
|
||
|
|
||
|
Notes about the algo: Each sample will score a "similarity point"
|
||
|
for like signs, or if one of the values is a 0.
|
||
|
*/
|
||
|
WORD CompareSegments (short *pA, short *pB, DWORD dwNumSamples)
|
||
|
{
|
||
|
SPDBG_FUNC( "CompareSegments" );
|
||
|
DWORD dwSimilar = 0;
|
||
|
DWORD dwLeft;
|
||
|
|
||
|
for (dwLeft = dwNumSamples; dwLeft; pA++, pB++, dwLeft--)
|
||
|
if ((*pA == *pB) || (*pA == 0) || (*pB == 0))
|
||
|
dwSimilar++;
|
||
|
|
||
|
return (WORD) ((dwSimilar * 0xffff) / dwNumSamples);
|
||
|
}
|
||
|
|
||
|
|
||
|
/*********************************************************************
|
||
|
FindMostLikelyWaveLen - This Searches through wave data and finds the
|
||
|
most likeley wavelength for voiced audio. it returns a condifence
|
||
|
score from 0 to ffff (ffff is 100% positive).
|
||
|
|
||
|
inputs
|
||
|
short *pSamples - Samples
|
||
|
DWORD dwNumSamples - Number of samples
|
||
|
DWORD dwMinWaveLen - Minimum accepatble wavelength
|
||
|
DWORD dwMaxWaveLen - Maximum acceptable wavelength
|
||
|
WORD *pwConfidence - Filled in with confidence rating.
|
||
|
returns
|
||
|
DWORD - Wavelength found. 0 if can't deteermine anything
|
||
|
*/
|
||
|
DWORD FindMostLikelyWaveLen (short *pSamples, DWORD dwNumSamples,
|
||
|
DWORD dwMinWaveLen, DWORD dwMaxWaveLen, WORD *pwConfidence)
|
||
|
{
|
||
|
SPDBG_FUNC( "FindMostLikelyWaveLen" );
|
||
|
#define NUMCOMP (3)
|
||
|
DWORD dwFirstZC, i;
|
||
|
DWORD dwBestWaveLen;
|
||
|
WORD wBestConfidence;
|
||
|
DWORD dwCurZC, dwCurWaveLen, dwTemp;
|
||
|
WORD wConf, wTemp;
|
||
|
|
||
|
// Step one, find the first zero crossing
|
||
|
dwFirstZC = FindZC (pSamples, dwNumSamples);
|
||
|
if (!dwFirstZC)
|
||
|
return 0; // error
|
||
|
|
||
|
// Start at a minimum-wavelength away and start finding a wave
|
||
|
// which repeats three times and compares well.
|
||
|
dwBestWaveLen = 0; // best wavelength found so far
|
||
|
wBestConfidence = 0; // confidence of the best wavelength
|
||
|
dwCurWaveLen = dwMinWaveLen;
|
||
|
while (dwCurWaveLen <= dwMaxWaveLen) {
|
||
|
// Try the first comparison
|
||
|
dwCurZC = dwFirstZC + dwCurWaveLen;
|
||
|
if (dwCurZC >= dwNumSamples)
|
||
|
break; // no more samples left
|
||
|
|
||
|
// find first zero crossing from the current wavelen
|
||
|
dwTemp = FindZC (pSamples + dwCurZC, dwNumSamples - dwCurZC);
|
||
|
if (!dwTemp)
|
||
|
break; // no more samples left
|
||
|
dwCurZC += dwTemp;
|
||
|
dwCurWaveLen += dwTemp;
|
||
|
|
||
|
// Make sure that we have three wavelength's worth
|
||
|
if ((dwFirstZC + (NUMCOMP+1)*dwCurWaveLen) >= dwNumSamples)
|
||
|
break; // cant compare this
|
||
|
|
||
|
// Do two confidence tests and multiply them toegther to
|
||
|
// get the confidence for this wavelength
|
||
|
wConf = 0xffff;
|
||
|
for (i = 0; i < NUMCOMP; i++) {
|
||
|
wTemp = CompareSegments (pSamples + dwFirstZC /* + i * dwCurWaveLen */,
|
||
|
pSamples + (dwFirstZC + (i+1) * dwCurWaveLen), dwCurWaveLen);
|
||
|
wConf = (WORD) (((DWORD) wConf * (DWORD) wTemp) >> 16);
|
||
|
};
|
||
|
|
||
|
// If we're more confident about this one than others then use it
|
||
|
if (wConf >= wBestConfidence) {
|
||
|
wBestConfidence = wConf;
|
||
|
dwBestWaveLen = dwCurWaveLen;
|
||
|
};
|
||
|
|
||
|
// Up the current wavelength just a tad
|
||
|
dwCurWaveLen++;
|
||
|
};
|
||
|
|
||
|
*pwConfidence = wBestConfidence;
|
||
|
return dwBestWaveLen;
|
||
|
}
|
||
|
|
||
|
/*********************************************************************
|
||
|
IsSegmentVoiced - This detects if the segment if voiced or not.
|
||
|
|
||
|
inputs
|
||
|
short *pSamples - Sample data
|
||
|
DWORD dwNumSamples - number of samples
|
||
|
DWORD dwSamplesPerSec - Number of sample sper second
|
||
|
WORD wMinConfidence - Minimum condifence
|
||
|
returns
|
||
|
BOOL - TRUE if its definately voiced, FALSE if not or cant tell
|
||
|
*/
|
||
|
|
||
|
BOOL CSilence::IsSegmentVoiced (short *pSamples, DWORD dwNumSamples,
|
||
|
DWORD dwSamplesPerSec, WORD wMinConfidence, short *asFiltered)
|
||
|
{
|
||
|
SPDBG_FUNC( "CSilence::IsSegmentVoiced" );
|
||
|
//#define FILTERNUM (1024) // max # samples i nthe filter
|
||
|
//#define MAXVOICEHZ (300) // maximum voicce pitchm in hz
|
||
|
//#define MINVOICEHZ (50) // minimum voice pitch in hz
|
||
|
// #define MINCONFIDENCE (0x6000) // minimum confidence
|
||
|
// This means that 70% of the samples line up from one wavelength
|
||
|
// to another
|
||
|
|
||
|
DWORD dwNumFilter;
|
||
|
//short asFiltered[FILTERNUM];
|
||
|
short sMax, sMin, sAvg;
|
||
|
DWORD dwWaveLen;
|
||
|
WORD wConfidence;
|
||
|
short sPositive, sNegative;
|
||
|
|
||
|
// Filter it first so we just get the voiced audio range
|
||
|
if (dwNumSamples > FILTERNUM)
|
||
|
dwNumSamples = FILTERNUM;
|
||
|
dwNumFilter = LowPassFilter (pSamples, dwNumSamples, asFiltered,
|
||
|
&sMax, &sMin, &sAvg, m_dwSamplesPerSec);
|
||
|
|
||
|
// Truncate the wave samples to +1, 0, -1
|
||
|
sPositive = sAvg;
|
||
|
sNegative = sAvg;
|
||
|
QuantSamples (asFiltered, dwNumFilter, sPositive, sNegative);
|
||
|
|
||
|
// look through the voiced wavelengths for a frequency
|
||
|
dwWaveLen = FindMostLikelyWaveLen (asFiltered, dwNumFilter,
|
||
|
dwSamplesPerSec / m_dwHighFreq, dwSamplesPerSec / MINVOICEHZ,
|
||
|
&wConfidence);
|
||
|
|
||
|
return (dwWaveLen && (wConfidence >= wMinConfidence));
|
||
|
}
|
||
|
|
||
|
|
||
|
|
||
|
|
||
|
/*********************************************************************
|
||
|
TrimMaxAmp - This extracts the maximum amplitude range of the wave file
|
||
|
segment.
|
||
|
|
||
|
inputs
|
||
|
short * lpS - samples to look through
|
||
|
WORD dwNum - number of samples
|
||
|
returns
|
||
|
WORD - maximum amplitude range
|
||
|
*/
|
||
|
WORD NEAR PASCAL TrimMaxAmp (short * lpS, DWORD dwNum)
|
||
|
{
|
||
|
SPDBG_FUNC( "TrimMaxAmp" );
|
||
|
DWORD i;
|
||
|
short sMin, sMax, sTemp;
|
||
|
|
||
|
sMin = 32767;
|
||
|
sMax = (short) -32768;
|
||
|
for (i = dwNum; i; i--) {
|
||
|
sTemp = *(lpS++);
|
||
|
if (sTemp < sMin)
|
||
|
sMin = sTemp;
|
||
|
if (sTemp > sMax)
|
||
|
sMax = sTemp;
|
||
|
};
|
||
|
|
||
|
// If we're clipping at all then claim that we've maxed out.
|
||
|
// Some sound cards have bad DC offsets
|
||
|
if ((sMax >= 0x7f00) || (sMin <= -0x7f00))
|
||
|
return 0xffff;
|
||
|
|
||
|
return (WORD) (sMax - sMin);
|
||
|
}
|
||
|
|
||
|
/********************************************************************
|
||
|
TrimMaxAmpDelta - This extracts the maximum amplitude range and
|
||
|
calculates the maximum delta of the wave file
|
||
|
segment.
|
||
|
|
||
|
inputs
|
||
|
PBLOCKCHAR pBlockChar - Pointer to a block characteristic
|
||
|
structure which is filled in.
|
||
|
short * lpS - deltas to look through
|
||
|
WORD dwNum - number of samples
|
||
|
returns
|
||
|
nothing
|
||
|
*/
|
||
|
void TrimMaxAmpDelta(PBLOCKCHAR pBlockChar, short *lpS, DWORD dwNum)
|
||
|
{
|
||
|
SPDBG_FUNC( "TrimMaxAmpDelta" );
|
||
|
DWORD i;
|
||
|
WORD wMax = 0;
|
||
|
WORD wTemp;
|
||
|
short sMin, sMax, sCur, sLast;
|
||
|
|
||
|
// BUGFIX: 4303 Merge TrimMaxAmp and TrimMaxDelta
|
||
|
sLast = sMin = sMax = *(lpS++);
|
||
|
for (i = dwNum - 1; i; i--, sLast = sCur) {
|
||
|
sCur = *(lpS++);
|
||
|
// TrimMaxAmp
|
||
|
if (sCur < sMin)
|
||
|
sMin = sCur;
|
||
|
if (sCur > sMax)
|
||
|
sMax = sCur;
|
||
|
|
||
|
// TrimMaxDelta
|
||
|
wTemp = sCur > sLast ? (WORD) (sCur - sLast) : (WORD) (sLast - sCur);
|
||
|
if (wTemp > wMax)
|
||
|
wMax = wTemp;
|
||
|
|
||
|
}
|
||
|
// If we're clipping at all then claim that we've maxed out.
|
||
|
// Some sound cards have bad DC offsets
|
||
|
pBlockChar->wMaxLevel = ((sMax >= 0x7F00) || (sMin <= -0x7F00)) ? 0xFFFF : (WORD) (sMax - sMin);
|
||
|
pBlockChar->wMaxDelta = wMax;
|
||
|
} /* End of TrimMaxAmpDelta() */
|
||
|
|
||
|
|
||
|
/*********************************************************************
|
||
|
GetBlockChar - This gets the characteristics of a block of audio.
|
||
|
This characteristics can then be used to determine if the block
|
||
|
is silent or not.
|
||
|
|
||
|
inputs
|
||
|
short *lpS - sample data
|
||
|
DWORD dwNum - number of samples
|
||
|
PBLOCKCHAR pBlockChar - Pointer to a block characteristic
|
||
|
structure which is filled in.
|
||
|
BOOL fTestVoiced - Voicce testing will only be done if
|
||
|
this is TTRUE (in order to save processor).
|
||
|
returns
|
||
|
none
|
||
|
*/
|
||
|
void GetBlockChar(short *lpS, DWORD dwNum, PBLOCKCHAR pBlockChar, BOOL fTestVoiced)
|
||
|
{
|
||
|
SPDBG_FUNC( "GetBlockChar" );
|
||
|
// BUGFIX: 4303 Merge TrimMaxAmp and TrimMaxDelta
|
||
|
TrimMaxAmpDelta(pBlockChar, lpS, dwNum);
|
||
|
pBlockChar->bIsVoiced = pBlockChar->bHighLevel =
|
||
|
pBlockChar->bHighDelta = SIL_UNKNOWN;
|
||
|
}
|
||
|
|
||
|
|
||
|
/*********************************************************************
|
||
|
IsBlockSound - This detects whether the block is silent or not.
|
||
|
|
||
|
inputs
|
||
|
PBLOCKCHAR pBlockInQuestion - Block in question. This has the
|
||
|
bHighLevel and bHighDelta flags modified
|
||
|
PBLOCKCHAR pBlockSilence - Silent block
|
||
|
BOOL fInUtterance - TRUE if we're in an utterance (which
|
||
|
means be more sensative), FALSE if we're not
|
||
|
returns
|
||
|
BOOL - TTRUE if has sound, FALSE if it is silent
|
||
|
*/
|
||
|
BOOL IsBlockSound (PBLOCKCHAR pBlockInQuestion, PBLOCKCHAR pBlockSilence,
|
||
|
BOOL fInUtterance)
|
||
|
{
|
||
|
SPDBG_FUNC( "IsBlockSound" );
|
||
|
#ifdef SOFTEND // Use so that catches a soft ending to phrases
|
||
|
#define SENSINV_THRESHHOLD_LEVEL(x) (((x)/4)*3)
|
||
|
#define SENSINV_THRESHHOLD_DELTA(x) (((x)/4)*3)
|
||
|
#else
|
||
|
#define SENSINV_THRESHHOLD_LEVEL(x) ((x)/2)
|
||
|
#define SENSINV_THRESHHOLD_DELTA(x) ((x)/2)
|
||
|
#endif
|
||
|
#define NORMINV_THRESHHOLD_LEVEL(x) ((x)/2)
|
||
|
#define NORMINV_THRESHHOLD_DELTA(x) ((x)/2)
|
||
|
|
||
|
if (fInUtterance) {
|
||
|
pBlockInQuestion->bHighLevel =
|
||
|
SENSINV_THRESHHOLD_LEVEL(pBlockInQuestion->wMaxLevel) >= pBlockSilence->wMaxLevel;
|
||
|
pBlockInQuestion->bHighDelta =
|
||
|
SENSINV_THRESHHOLD_DELTA(pBlockInQuestion->wMaxDelta) >= pBlockSilence->wMaxDelta;
|
||
|
}
|
||
|
else {
|
||
|
pBlockInQuestion->bHighLevel =
|
||
|
NORMINV_THRESHHOLD_LEVEL(pBlockInQuestion->wMaxLevel) >= pBlockSilence->wMaxLevel;
|
||
|
pBlockInQuestion->bHighDelta =
|
||
|
NORMINV_THRESHHOLD_DELTA(pBlockInQuestion->wMaxDelta) >= pBlockSilence->wMaxDelta;
|
||
|
};
|
||
|
|
||
|
|
||
|
return pBlockInQuestion->bHighLevel || pBlockInQuestion->bHighDelta;
|
||
|
}
|
||
|
|
||
|
|
||
|
/*********************************************************************
|
||
|
ReEvaluateSilence - This takes the values used for silence and re-evaluates
|
||
|
them based upon new data which indicates what silence is. It
|
||
|
automatically adjusts to the noise level in the room over a few seconds.
|
||
|
NOTE: This should not be called when an utterance is happening, or
|
||
|
when it might be starting.
|
||
|
|
||
|
inputs
|
||
|
PBLOCKCHAR pSilence - This is the silence block, and should
|
||
|
start out with values in it. It will be modified
|
||
|
so to incorporate the new silence information.
|
||
|
PBLOCKCHAR pNew - New block which is known to be silence.
|
||
|
BYTE bWeight - This is the weighting of the new block
|
||
|
in influencing the old block, in a value from 0 to 255.
|
||
|
256 means that the value of the new silence completely
|
||
|
overpowers the old one, 0 means that it doesnt have
|
||
|
any affect.
|
||
|
returns
|
||
|
none
|
||
|
*/
|
||
|
void ReEvaluateSilence (PBLOCKCHAR pSilence, PBLOCKCHAR pNew,
|
||
|
BYTE bWeight)
|
||
|
{
|
||
|
SPDBG_FUNC( "ReEvaluateSilence" );
|
||
|
#define ADJUST(wOrig,wNew,bWt) \
|
||
|
(WORD) (( \
|
||
|
((DWORD) (wOrig) * (DWORD) (256 - (bWt))) + \
|
||
|
((DWORD) (wNew) * (DWORD) (bWt)) \
|
||
|
) >> 8);
|
||
|
|
||
|
pSilence->wMaxLevel = ADJUST (pSilence->wMaxLevel,
|
||
|
pNew->wMaxLevel, bWeight);
|
||
|
pSilence->wMaxDelta = ADJUST (pSilence->wMaxDelta,
|
||
|
pNew->wMaxDelta, bWeight);
|
||
|
|
||
|
// If it's way too silence (and too good to be true) then assume
|
||
|
// a default silece
|
||
|
// if (!pNew->wMaxLevel && !pNew->wMaxDelta) {
|
||
|
// if (pSilence->wMaxLevel < 2500)
|
||
|
// pSilence->wMaxLevel = 2500;
|
||
|
// if (pSilence->wMaxDelta < 400)
|
||
|
// pSilence->wMaxDelta = 400;
|
||
|
// }
|
||
|
}
|
||
|
|
||
|
/*********************************************************************
|
||
|
WhatsTheNewState - This takes in a stream of bit-field indicating which
|
||
|
of the last 32 blocks were detected as having sound, and what our
|
||
|
state was the last time this was called (utterance or not). It then
|
||
|
figureous out if we're still in an utterance, or we just entered one.
|
||
|
It also says how many buffers ago that was.
|
||
|
|
||
|
inputs
|
||
|
DWORD dwSoundBits - This is a bit-field of the last 32
|
||
|
audio blocks. A 1 in the field indicates that there was
|
||
|
sound there, a 0 indicates no sound. The low bit
|
||
|
corresponds to the most recent block, and high bit
|
||
|
the oldest.
|
||
|
DWORD dwVoicedBits - Just like sound bits except that it indicates
|
||
|
voiced sections of sound.
|
||
|
BOOL fWasInUtterance - This is true is we had an utterance
|
||
|
the last time this called, FALSE if there was silence
|
||
|
BOOL fLongUtterance - If this is a long utterance then dont
|
||
|
react for 1/4 second, otherwise use 1/8 second for
|
||
|
short utterance
|
||
|
WORD wBlocksPerSec - How many of the above-mentioned blocks
|
||
|
fit into a second.
|
||
|
WORD *wStarted - If a transition occurs from no utterance to
|
||
|
an utterance, then this fills in the number of of blocks
|
||
|
ago that the utterance started, into *wStarted. Otherwise
|
||
|
it is not changed.
|
||
|
WORD wReaction - Reaction time (in blocks) after an utterance is
|
||
|
finished
|
||
|
returns
|
||
|
BOOL - TRUE if we're in an utterance now, FALSE if we're in silence
|
||
|
*/
|
||
|
|
||
|
BOOL CSilence::WhatsTheNewState (DWORD dwSoundBits, DWORD dwVoicedBits,
|
||
|
BOOL fWasInUtterance, BOOL fLongUtterance,
|
||
|
WORD wBlocksPerSec, WORD *wStarted, WORD wReaction)
|
||
|
{
|
||
|
SPDBG_FUNC( "CSilence::WhatsTheNewState" );
|
||
|
WORD wCount, wOneBits;
|
||
|
WORD wTimeToCheck;
|
||
|
DWORD dwTemp, dwMask;
|
||
|
|
||
|
if (fWasInUtterance)
|
||
|
wTimeToCheck = wReaction;
|
||
|
else
|
||
|
wTimeToCheck = (wBlocksPerSec/4); // 1/4 second
|
||
|
if (!wTimeToCheck)
|
||
|
wTimeToCheck = 1;
|
||
|
|
||
|
|
||
|
for (wOneBits = 0, wCount = wTimeToCheck, dwTemp = dwSoundBits;
|
||
|
wCount;
|
||
|
dwTemp /= 2, wCount--)
|
||
|
if (dwTemp & 0x01)
|
||
|
wOneBits++;
|
||
|
|
||
|
if (fWasInUtterance) {
|
||
|
// If we were in an utterance, then we still are in an utterance
|
||
|
// UNLESS the number of bits which are turned on for the last
|
||
|
// 0.5 seconds is less that 1/4 of what should be turned on.
|
||
|
if ( (wOneBits >= 1))
|
||
|
return TRUE;
|
||
|
else
|
||
|
return FALSE;
|
||
|
}
|
||
|
else {
|
||
|
// We are in silence. We cannot possible go into an utterance
|
||
|
// until the current block is voicced
|
||
|
if (!(dwVoicedBits & 0x01))
|
||
|
return FALSE;
|
||
|
|
||
|
// If we were in silence then we're still in silence
|
||
|
// UNLESS the number of bits which are turned on for the last
|
||
|
// 0.5 seconds is more than 1/2 of what should be turned on.
|
||
|
// If so, then start the utterance 0.75 seconds ago.
|
||
|
if (wOneBits >= (wTimeToCheck / 2)) {
|
||
|
// we're not in an utterance
|
||
|
|
||
|
// Look back until get 1/8 second of silence, and include
|
||
|
// that in the data returned
|
||
|
dwTemp = dwSoundBits;
|
||
|
// dwMask = (1 << (wBlocksPerSec / 8)) - 1;
|
||
|
// for (wCount = wBlocksPerSec/8; dwTemp & dwMask; dwTemp >>= 1, wCount++);
|
||
|
dwMask = (1 << (wBlocksPerSec / m_wAddSilenceDiv)) - 1;
|
||
|
for (wCount = wBlocksPerSec/m_wAddSilenceDiv; dwTemp & dwMask; dwTemp >>= 1, wCount++);
|
||
|
|
||
|
*wStarted = wCount;
|
||
|
|
||
|
return TRUE;
|
||
|
}
|
||
|
else
|
||
|
return FALSE;
|
||
|
};
|
||
|
|
||
|
}
|
||
|
|
||
|
|
||
|
/*********************************************************************
|
||
|
CSilence::CSilence - This creates the silence class.
|
||
|
|
||
|
inputs
|
||
|
WORD wBlocksPerSec - Number of blocks per second. The blocks
|
||
|
will be passed down through AddBlock().
|
||
|
returns
|
||
|
class
|
||
|
*/
|
||
|
CSilence::CSilence (WORD wBlocksPerSec)
|
||
|
{
|
||
|
SPDBG_FUNC( "CSilence::CSilence" );
|
||
|
m_wBlocksPerSec = min(wBlocksPerSec, 32); // no more than the # bits in a DWORD
|
||
|
m_wBlocksInQueue = m_wBlocksPerSec; // 1 second worth.
|
||
|
m_wLatestBlock = 0;
|
||
|
m_paBlockInfo = NULL;
|
||
|
m_dwSoundBits = m_dwVoicedBits = 0;
|
||
|
m_fFirstBlock = TRUE;
|
||
|
m_fInUtterance = FALSE;
|
||
|
m_dwUtteranceLength = 0;
|
||
|
m_dwSamplesPerSec = 11025;
|
||
|
}
|
||
|
|
||
|
/*********************************************************************
|
||
|
CSilence::~CSilence - Free up everything.
|
||
|
*/
|
||
|
CSilence::~CSilence (void)
|
||
|
{
|
||
|
SPDBG_FUNC( "CSilence::~CSilence" );
|
||
|
WORD i;
|
||
|
|
||
|
if (m_paBlockInfo) {
|
||
|
for (i = 0; i < m_wBlocksInQueue; i++)
|
||
|
if (m_paBlockInfo[i].pSamples)
|
||
|
free(m_paBlockInfo[i].pSamples);
|
||
|
free(m_paBlockInfo);
|
||
|
}
|
||
|
|
||
|
if (m_pASFiltered)
|
||
|
free(m_pASFiltered);
|
||
|
}
|
||
|
|
||
|
/*********************************************************************
|
||
|
CSilence::Init - This initializes the silence code. It basically
|
||
|
allocates memory. It should be called immediately after the object
|
||
|
is created and then not again.
|
||
|
|
||
|
inputs
|
||
|
none
|
||
|
returns
|
||
|
BOOL - TRUE if succeded, else out of memory
|
||
|
*/
|
||
|
BOOL CSilence::Init(BOOL fPhoneOptimized, DWORD dwSamplesPerSec)
|
||
|
{
|
||
|
SPDBG_FUNC( "CSilence::Init" );
|
||
|
m_dwSamplesPerSec = dwSamplesPerSec;
|
||
|
if (fPhoneOptimized) {
|
||
|
m_wAddSilenceDiv = (WORD) PHADD_BEGIN_SILENCE;
|
||
|
m_dwHighFreq = PHMAXVOICEHZ;
|
||
|
}
|
||
|
else {
|
||
|
m_wAddSilenceDiv = (WORD) PCADD_BEGIN_SILENCE;
|
||
|
m_dwHighFreq = PCMAXVOICEHZ;
|
||
|
}
|
||
|
if ((m_pASFiltered = (short *) malloc((sizeof(short)) * FILTERNUM)) == NULL)
|
||
|
return (FALSE);
|
||
|
|
||
|
// Initialize memory for the blocks and clear it.
|
||
|
if (m_paBlockInfo)
|
||
|
return (TRUE);
|
||
|
m_paBlockInfo = (PBINFO) malloc(m_wBlocksInQueue * sizeof(BINFO));
|
||
|
if (!m_paBlockInfo)
|
||
|
return (FALSE);
|
||
|
if (m_wBlocksInQueue && m_paBlockInfo)
|
||
|
memset(m_paBlockInfo, 0, m_wBlocksInQueue * sizeof(BINFO));
|
||
|
return (TRUE);
|
||
|
} /* End of Init() */
|
||
|
|
||
|
/*********************************************************************
|
||
|
CSilence::AddBlock - This does the following:
|
||
|
- Add the block the the queue. Free up an old block if needed.
|
||
|
The block should be 1/wBlocksPerSec long (about).
|
||
|
- Analyze the block to see if its got sound or is quiet.
|
||
|
- Fill in *wVU with a VU level.
|
||
|
- Return TRUE if we're in an utterance, FALSE if its silence now.
|
||
|
If TRUE then app should call GetBlock() until no more blocks left,
|
||
|
and pass them to the SR engine.
|
||
|
|
||
|
inputs
|
||
|
short *pSamples - Pointer to samples. This memory should
|
||
|
be allocaed with malloc(), and may be freed by the
|
||
|
object.
|
||
|
DWORD dwNumSamples - Number of samples
|
||
|
WORD *wVU - This is fille in with the VU meter for the block
|
||
|
QWORD qwTimeStamp - Time stamp for this buffer.
|
||
|
returns
|
||
|
BOOL - TRUE if an utterance is taking place, FALSE if its silent
|
||
|
*/
|
||
|
BOOL CSilence::AddBlock (short *pSamples, DWORD dwNumSamples,
|
||
|
WORD *wVU, QWORD qwTimeStamp)
|
||
|
{
|
||
|
SPDBG_FUNC( "CSilence::AddBlock" );
|
||
|
BLOCKCHAR bcNew;
|
||
|
BOOL fSound, fUtt;
|
||
|
PBINFO pbInfo;
|
||
|
WORD wUttStart, i;
|
||
|
|
||
|
// Dont add empty blocks
|
||
|
if (!dwNumSamples) {
|
||
|
if (pSamples)
|
||
|
free (pSamples);
|
||
|
return m_fInUtterance;
|
||
|
};
|
||
|
|
||
|
// Analyze the block for characteristics.
|
||
|
GetBlockChar (pSamples, dwNumSamples, &bcNew, !m_fInUtterance);
|
||
|
|
||
|
// fill in the vu
|
||
|
*wVU = bcNew.wMaxLevel;
|
||
|
|
||
|
// see if it's silent or not
|
||
|
if (m_fFirstBlock) {
|
||
|
// first block, so of course its silent
|
||
|
m_bcSilence = bcNew;
|
||
|
m_fFirstBlock = FALSE;
|
||
|
fSound = FALSE;
|
||
|
|
||
|
// BUGFIX 2466 - If it's way too silence (and too good to be true) then assume
|
||
|
// a default silece
|
||
|
if ((m_bcSilence.wMaxLevel < 500) || (m_bcSilence.wMaxDelta < 100)) {
|
||
|
m_bcSilence.wMaxLevel = 2500;
|
||
|
m_bcSilence.wMaxDelta = 400;
|
||
|
};
|
||
|
|
||
|
// If it's way too loud then cut down
|
||
|
if ((m_bcSilence.wMaxLevel > 2500) || (m_bcSilence.wMaxDelta > 1500)) {
|
||
|
m_bcSilence.wMaxLevel = min (m_bcSilence.wMaxLevel, 2500);
|
||
|
m_bcSilence.wMaxDelta = min (m_bcSilence.wMaxDelta, 1500);
|
||
|
};
|
||
|
}
|
||
|
else {
|
||
|
fSound = IsBlockSound (&bcNew, &m_bcSilence, m_fInUtterance);
|
||
|
};
|
||
|
|
||
|
// Test to see if the block is voiced if:
|
||
|
// - The amplitude level is more than background sound
|
||
|
// - We're not yet in an utterance (to save processor)
|
||
|
if (bcNew.bHighLevel && !m_fInUtterance) {
|
||
|
WORD wNoise;
|
||
|
wNoise = (m_dwSamplesPerSec <= 13000) ?
|
||
|
m_wNoiseThresh :
|
||
|
((m_wNoiseThresh / 3) * 2);
|
||
|
|
||
|
bcNew.bIsVoiced = this->IsSegmentVoiced (pSamples, dwNumSamples, m_dwSamplesPerSec, wNoise, m_pASFiltered) ?
|
||
|
SIL_YES : SIL_NO;
|
||
|
}
|
||
|
|
||
|
// add the block
|
||
|
m_dwVoicedBits = (m_dwVoicedBits << 1) |
|
||
|
( (bcNew.bIsVoiced == SIL_YES) ? 1 : 0 );
|
||
|
m_dwSoundBits = (m_dwSoundBits << 1) | (fSound ? 1 : 0);
|
||
|
m_wLatestBlock++;
|
||
|
if (m_wLatestBlock >= m_wBlocksInQueue)
|
||
|
m_wLatestBlock = 0;
|
||
|
pbInfo = m_paBlockInfo + m_wLatestBlock;
|
||
|
if (pbInfo->pSamples)
|
||
|
free (pbInfo->pSamples);
|
||
|
pbInfo->pSamples = pSamples;
|
||
|
pbInfo->dwNumSamples = dwNumSamples;
|
||
|
|
||
|
// BUGFIX: Alignment code. We need to store the timestamp for
|
||
|
// the BEGINNING of the block, not the end!
|
||
|
|
||
|
pbInfo->qwTimeStamp = qwTimeStamp - dwNumSamples * sizeof(WORD);
|
||
|
|
||
|
// What's our utterance state?
|
||
|
fUtt = this->WhatsTheNewState (m_dwSoundBits, m_dwVoicedBits, m_fInUtterance,
|
||
|
m_dwUtteranceLength >= m_wBlocksPerSec,
|
||
|
m_wBlocksPerSec, &wUttStart, m_wReaction);
|
||
|
if (fUtt && !m_fInUtterance) {
|
||
|
// We just entered an utterance, so wUttStart has a valid teerm
|
||
|
// in it. Go through the buffer queue and free all buffers which
|
||
|
// are older than wUttStart. Remembeer, this is a circular buffer
|
||
|
for (i = 0; i < (m_wBlocksInQueue - wUttStart); i++) {
|
||
|
pbInfo = m_paBlockInfo +
|
||
|
( (m_wLatestBlock + i + 1) % m_wBlocksInQueue);
|
||
|
if (pbInfo->pSamples)
|
||
|
free (pbInfo->pSamples);
|
||
|
pbInfo->pSamples = NULL;
|
||
|
};
|
||
|
|
||
|
// Since we just entered an utterance clear the utterance length counter
|
||
|
m_dwUtteranceLength = 0;
|
||
|
};
|
||
|
m_fInUtterance = fUtt;
|
||
|
|
||
|
// Remember how long this utterance has done on. Long utterances
|
||
|
// deserve more patience as far as silence goes
|
||
|
m_dwUtteranceLength++;
|
||
|
|
||
|
// Adjust the silence level if we're not in an utterance
|
||
|
// Requiring !fSound so that we dont accidentally indclude any
|
||
|
// utterance sections in the sound calculations
|
||
|
if (!m_fInUtterance /* && !fSound */) {
|
||
|
ReEvaluateSilence (&m_bcSilence, &bcNew,
|
||
|
255 / m_wBlocksPerSec);
|
||
|
}
|
||
|
else if (m_dwUtteranceLength >= ((DWORD)m_wBlocksPerSec * 30))
|
||
|
// if we have a very long utterance (> 30 second) then it's not
|
||
|
ReEvaluateSilence (&m_bcSilence, &bcNew, 255 / m_wBlocksPerSec);
|
||
|
|
||
|
// done
|
||
|
return m_fInUtterance;
|
||
|
}
|
||
|
|
||
|
/*********************************************************************
|
||
|
CSilence::ExpectNoiseChange - Sent to the silence detection algorithm
|
||
|
when it should expect the noise floor to go up/down.
|
||
|
|
||
|
inputs
|
||
|
WORD wValue - Amount that noise floor should change.
|
||
|
0x100 = no change. > 0x100 => louder, < 0x100 => quieter
|
||
|
returns
|
||
|
*/
|
||
|
void CSilence::ExpectNoiseChange (WORD wValue)
|
||
|
{
|
||
|
SPDBG_FUNC( "CSilence::ExpectNoiseChange" );
|
||
|
DWORD dwTemp;
|
||
|
|
||
|
dwTemp = ((DWORD) m_bcSilence.wMaxLevel * wValue) >> 8;
|
||
|
if (dwTemp > 0xffff)
|
||
|
dwTemp = 0xffff;
|
||
|
m_bcSilence.wMaxLevel = (WORD) dwTemp;
|
||
|
|
||
|
dwTemp = ((DWORD) m_bcSilence.wMaxDelta * wValue) >> 8;
|
||
|
if (dwTemp > 0xffff)
|
||
|
dwTemp = 0xffff;
|
||
|
m_bcSilence.wMaxDelta = (WORD) dwTemp;
|
||
|
}
|
||
|
|
||
|
/*********************************************************************
|
||
|
CSilence::GetBlock - This gets a block from the queue. This will fail
|
||
|
if there are no more blocks left to get OR if there's not utterance.
|
||
|
|
||
|
inputs
|
||
|
DWORD *pdwNumSamples - If a block is returned then this
|
||
|
will be filled in with the number of samples in the block.
|
||
|
QWORD *pqwTimeStamp - Filled in woth the time-stamp for the
|
||
|
buffer.
|
||
|
returns
|
||
|
short * - Pointer to a block of samples. This memory is the
|
||
|
caller's property and can be freed with free().
|
||
|
*/
|
||
|
short * CSilence::GetBlock (DWORD *pdwNumSamples, QWORD * pqwTimeStamp)
|
||
|
{
|
||
|
SPDBG_FUNC( "CSilence::GetBlock" );
|
||
|
PBINFO pbInfo;
|
||
|
WORD i, wCount;
|
||
|
short *pSamples;
|
||
|
|
||
|
if (!m_fInUtterance)
|
||
|
return NULL;
|
||
|
|
||
|
// find the first occurance
|
||
|
i = (m_wLatestBlock + 1) % m_wBlocksInQueue;
|
||
|
for (wCount = m_wBlocksInQueue; wCount;
|
||
|
i = ((i < (m_wBlocksInQueue-1)) ? (i+1) : 0), wCount-- ) {
|
||
|
pbInfo = m_paBlockInfo + i;
|
||
|
if (pbInfo->pSamples) {
|
||
|
*pdwNumSamples = pbInfo->dwNumSamples;
|
||
|
*pqwTimeStamp = pbInfo->qwTimeStamp;
|
||
|
pSamples = pbInfo->pSamples;
|
||
|
pbInfo->pSamples = NULL;
|
||
|
|
||
|
return pSamples;
|
||
|
};
|
||
|
};
|
||
|
|
||
|
// if got here then couldnt find anything
|
||
|
return NULL;
|
||
|
}
|
||
|
|
||
|
/*********************************************************************
|
||
|
CSilence::KillUtterance - Kills an exitsing utterance.
|
||
|
|
||
|
inputs
|
||
|
none
|
||
|
returns
|
||
|
none
|
||
|
*/
|
||
|
void CSilence::KillUtterance (void)
|
||
|
{
|
||
|
SPDBG_FUNC( "CSilence::KillUtterance" );
|
||
|
m_fInUtterance = FALSE;
|
||
|
m_dwSoundBits = 0;
|
||
|
m_dwVoicedBits = 0;
|
||
|
}
|
||
|
|