/***************************************************************************** * * * MAKEPHR.CPP * * * * Copyright (C) Microsoft Corporation 1990-1994 * * All Rights reserved. * * * ****************************************************************************** * * * Module Intent * * * * Main processing code that parcels out the work to all the other units * * * *****************************************************************************/ #include "stdafx.h" #include "..\common\coutput.h" #include #include "cphrase.h" #ifdef _DEBUG #undef THIS_FILE static char THIS_FILE[] = __FILE__; #endif /***************************************************************************** * * * Defines * * * *****************************************************************************/ const int CB_SAVEMIN_DEFAULT = 40; // REVIEW: if WinHelp 4.0 ends up being 32-bit, then this limit could // be raised. const int MAX_PHRASE_FILE = 0xFFF0; const int CCHBUF = 2048; // Generic buffer size const int MAX_LOOP = 35; const int CNIL = -1; // Nil count (used for invalid cMaxNKeyPh) // Maximum size of a record you can pass to the sorter. const int cbMAX_REC_SIZE = 520; // must be at least as large as CCH_MAX_PHRASE const int DB_SIZE = 8000; typedef struct { PSTR sz; // Pointer to phrase string int coc; // Number of occurances of phrase int cch; // Length of phrase int cbSavings; // Number of bytes saved by suppressing phrase } FR_INFO; static PSTR pszClubBuffer; // buffer for club's prefix static PSTR pszCandBuffer; // buffer for candidate static PSTR pszInputBuffer; // buffer for current line static int lTotal; // total savings over all clubs static INLINE int STDCALL CbPrefixPch2(PSTR pch1, PSTR pch2); static INLINE BOOL STDCALL FSlice(PSTR szInFile, PSTR szOutFile); static INLINE RC_TYPE STDCALL RcRemoveDuplicates(PSTR szInput, PSTR szOutput); static INLINE PSTR STDCALL SzGetPhrase(PSTR szBuf, int cchMax, FILE* pfile); static INLINE VOID STDCALL GenerateKeyPhrases(CTable* ptbl, int cMaxNKeyPh); static int STDCALL club(int cbSaveMin, CTable* ptbl, int* plcbFileSave); INLINE static void STDCALL SelectPhrase(FR_INFO fr, CTable* ptbl); static BOOL STDCALL FGetNextCandidate(FR_INFO* pfrCand); static int FASTCALL DcbConverge(int cInit, int cTarget, int cbSize); /*************************************************************************** FUNCTION: CbSaveCchC PURPOSE: To calculate the savings expected from compressing out a string given its length and number of occurences. PARAMETERS: cch c RETURNS: COMMENTS: Each phrase, when compressed is represented in the keyphrase table (with an extra length byte) and is replaced in the plaintext by two bytes. Hence, the formula for calculating savings is as follows: c = count of occurences cch = length of string 1 = one byte for index into keyphrase array 2 = two bytes per keyphrase magic cookie cost of keyphrase = cch + 2*c + 1 benefit of keyphrase = cch * c savings = benefit - cost = cch * (c - 1) - 2c - 1 Note: keyphrases may be a maximum of 255 bytes long, due to storage of their lengths in a byte. MODIFICATION DATES: 12-Jun-1994 [ralphw] ***************************************************************************/ INLINE static int STDCALL CbSaveCchC(int cch, int c) { if (cch < 2) return 0; else return (cch * (c - 1) - 2 * c - 1); }; /*************************************************************************** FUNCTION: RcMakePhr PURPOSE: PARAMETERS: szInputFile szOutputFile cMaxNKeyPh RETURNS: COMMENTS: MODIFICATION DATES: 19-Jul-1993 [ralphw] ***************************************************************************/ RC_TYPE STDCALL RcMakePhr(PSTR szOutputFile, int cMaxNKeyPh) { CTable tbl; ASSERT(szOutputFile != NULL); { CMem memClubBuffer(CCHBUF); // buffer for club's prefix CMem memCandBuffer(CCHBUF); // buffer for candidate CMem memInputBuffer(CCHBUF); // buffer for current line pszClubBuffer = memClubBuffer.psz; pszCandBuffer = memCandBuffer.psz; pszInputBuffer = memInputBuffer.psz; GenerateKeyPhrases(&tbl, cMaxNKeyPh); } tbl.SortTable(); COutput output(szOutputFile); if (!output.fInitialized) { // REVIEW: can we get more useful information about the problem? errHpj.ep = epNoFile; VReportError(HCERR_CANNOT_OPEN, &errHpj, szOutputFile); return RC_Failure; } #ifdef _DEBUG DWORD cb = 0; #endif for (int pos = 1; pos <= tbl.CountStrings(); pos++) { #ifdef _DEBUG /* * We only check during debug, because this should have already * been prevented -- we just want to make absolutely certain. */ // add 2 for CR/LF cb += strlen(tbl.GetPointer(pos)) + 2; ASSERT(cb < MAX_PHRASE_FILE + 15); #endif output.outstring_eol(tbl.GetPointer(pos)); } return RC_Success; } /*************************************************************************** FUNCTION: FGetNextCandidate PURPOSE: This function reads in the next candidate phrase from pfile. A "candidate" is the next string in the file, plus successive occurances of the same string. The return value, put into pfrCand, contains the string, its length, number of occurances, and bytes saved if it is suppressed. The static variable szInput is a "put back" buffer which, if full, contains the string read which didn't fit into the previous candidate. In addition to values in pfrCand, returns TRUE if candidate found, and FALSE if end-of-file has been reached. PARAMETERS: pfrCand pfile RETURNS: COMMENTS: MODIFICATION DATES: 19-Jul-1993 [ralphw] ***************************************************************************/ static BOOL STDCALL FGetNextCandidate(FR_INFO* pfrCand) { DWORD count = pphrase->GetPhrase(pfrCand->sz); if (count == (DWORD) -1) return FALSE; pfrCand->coc = count; pfrCand->cch = strlen(pfrCand->sz); pfrCand->cbSavings = CbSaveCchC(pfrCand->cch, pfrCand->coc); return TRUE; } /*************************************************************************** FUNCTION: CbPrefixPch2 PURPOSE: To calculate the number of characters shared as a prefix by the two given strings. PARAMETERS: pch1 pch2 RETURNS: length of the prefix COMMENTS: MODIFICATION DATES: 19-Jul-1993 [ralphw] ***************************************************************************/ static INLINE int STDCALL CbPrefixPch2(PSTR pch1, PSTR pch2) { int cch; for (cch = 0; pch1[cch] == pch2[cch] && pch1[cch] != '\0'; cch++) ; if (cch > 1 && options.fDBCS) { /* * If what doesn't match is the second character of a DBCS word, * then we must back up so as not to include the DBCS lead byte. */ if (IsFirstByte(pch1[cch - 1])) --cch; } return cch; } /*----------------------------------------------------------------------- | Name: club() | | Purpose: | Usage: | | Assumptions: We have sorted the phrases in input file. | | We only care about phrases saving more than | | cbSaveMin bytes. | | | | Method: | | We are looking for the phrases that save the most space.| | We could just look for multiple occurences of a single phrase. | | (In fact, this was our old method). Unfortunately, this means | | that if we have two 43 character phrases that differ in the 42nd| | character, we are probably wasting space by not using the common| | prefix. The following algorithm attempts to deal with common | | prefixes in an intelligent way. | | A group of phrases that effectively share a common | | prefix is called a "club." As we scan through the input, we are| | looking for new members to add to the current club. For example| | the current club may share the prefix "It's faster to." When we| | get the string "It's faster not to," we must decide whether | | or not to accept it as a club member. Accepting it would mean | | that we must now lower our standards to use the prefix "It's | | faster" in order to increase membership. Rejecting a string | | causes us to close the club to future members (it's in the by- | | laws). | | | | Our membership decision is based on two criteria. Both | | the club and the candidate must profit from the transaction in | | order for us to grow the club. | | 1) The candidate must earn more compression by joining | | the club, and using only the prefix he shares | | with the club than he would earn by retaining | | his suffix but having to pay the overhead of | | being a separate keyphrase. | | 2) The profit earned by the club by growing its member- | | ship must outweigh the loss of any shortenning | | of its prefix. | | | | Once we have closed membership to a club, we decide if | | the club saved enough to be worth keeping, and if so we write it| | to stdout. We then start the next club by having it consist | | solely of the candidate whose rejection closed the previous club| | Returns: count of phrases selected. | -----------------------------------------------------------------------*/ const int cbClubMin = 1; static int STDCALL club(int cbSaveMin, CTable* ptbl, int* plcbFileSave) { CTable* pfileSave = ptbl; FR_INFO frClub; // Current club phrase FR_INFO frCand; // Current candidate for club int cClub; // count of clubs int cchT; int cocT; int cbSavingsT; pfileSave->Empty(); frCand.sz = pszCandBuffer; frClub.sz = pszClubBuffer; //----------open-files-for-reading-and-writing-------------*/ //---initialize-1st-club,-1st-Candidate,-and-variables-------*/ frClub.sz[0] = '\0'; frClub.cch = 0; frClub.coc = 0; frClub.cbSavings = 0; cClub = 0; lTotal = 0; pphrase->SetPosition(); *plcbFileSave = 0; while (FGetNextCandidate(&frCand)) { // Get temporary values for combination of frCand and frClub cchT = CbPrefixPch2(frClub.sz, frCand.sz); cocT = frClub.coc + frCand.coc; cbSavingsT = CbSaveCchC(cchT, cocT); //---------IF-it-is-worthwhile-to-admit-candidate-----------*/ if (cchT > 2 && cbSavingsT > frCand.cbSavings + frClub.cbSavings + cbClubMin) { // Admit candidate into club: frClub.cch = cchT; frClub.coc = cocT; frClub.cbSavings = cbSavingsT; frClub.sz[cchT] = '\0'; } else //---------ELSE-the-club-is-closed-------------------------*/ { //---------IF-Club-saves-enuf-make-it-a-keyphrase---------*/ if (frClub.cbSavings >= cbSaveMin) { SelectPhrase(frClub, pfileSave); // add 2 for CR/LF *plcbFileSave += strlen(frClub.sz) + 2; cClub++; ASSERT(cClub > 0); lTotal += frClub.cbSavings; } // Make candidate start of next club frClub = frCand; frClub.sz = pszClubBuffer; // Keep buffers separate strcpy(frClub.sz, frCand.sz); } } // End while (more candidates) // Check last club for worthwhile savings if (frClub.cbSavings >= cbSaveMin) { SelectPhrase(frClub, pfileSave); *plcbFileSave += strlen(frClub.sz); cClub++; lTotal += frClub.cbSavings; } //--------------------print-data---------------------*/ #ifdef REPORT { wsprintf(szParentString, "DEBUG: cbSaveMin %d yields %s keyphrases, saving %s bytes\r\n", cbSaveMin, FormatNumber(cClub), FormatNumber(lTotal)); SendStringToParent(szParentString); } #endif //---------------close-all-files---------------------------*/ // Return number of phrases generated: return (cClub); } /*************************************************************************** * - Name: DcbConverge - * Purpose: * This function is used to compute the delta to the cbSaveMin * parameter to get cInit to converge to cTarget. Lots of hocus pocus. * * Arguments: * cInit - Current count of phrases * cTarget - Limit that count of phrases violates * cbSize - Current cbSaveMin, used to scale size of delta * * Returns: * Value to be added to cbSaveMin for next club attempt. This value * will be negative if cbSaveMin should be reduced. * ***************************************************************************/ static int FASTCALL DcbConverge(int cInit, int cTarget, int cbSize) { int dcb; dcb = ((cInit - cTarget) / 50); if (dcb == 0) dcb = (cInit > cTarget ? 1 : -1); return dcb * (1 + cbSize / 32); } /*************************************************************************** FUNCTION: GenerateKeyPhrases PURPOSE: Generates a key phrase list from list of candidates in szInput, using the club method, and writes them out to szOutput using SelectPhrase(). Gets numbers to use from szData (if it exists -- otherwise use defaults) except for when a valid phrase count is passed in cMaxNKeyPh. Calls club() repeatedly with different values for cbClubMin and cbSaveMin until club() returns the correct number of phrases generated. If we get into an infinite loop (by looping more than MAX_LOOP times), just use the best values received so far. PARAMETERS: szInput szData szOutput cMaxNKeyPh RETURNS: COMMENTS: MODIFICATION DATES: 19-Jul-1993 [ralphw] ***************************************************************************/ const int MIN_SAVINGS = 1; #define GRIND_TIME 4 static INLINE void STDCALL GenerateKeyPhrases(CTable* ptbl, int cMaxNKeyPh) { int cPhrases; int cMinNKeyPh; int dcbSaveMin, dcbSaveMinOld; int lcbFile = 0; int cbSaveMin = CB_SAVEMIN_DEFAULT; if (cMaxNKeyPh == CNIL) cMaxNKeyPh = MAX_PHRASES; dcbSaveMin = 0; cMinNKeyPh = cMaxNKeyPh - (cMaxNKeyPh >> 4); cGrind = 0; /* * While we don't have the right amount of phrases, or while the * output file is to big, we will do this loop and adjust the phrase * file. */ while ((cPhrases = club(cbSaveMin, ptbl, &lcbFile)) > cMaxNKeyPh || cPhrases < cMinNKeyPh || lcbFile >= MAX_PHRASE_FILE) { if (++cGrind == GRIND_TIME) { doGrind(); cGrind = 0; } dcbSaveMinOld = dcbSaveMin; // Check for output file > 64K -- WinHelp limitation if (lcbFile >= MAX_PHRASE_FILE && cPhrases <= cMaxNKeyPh) { ASSERT(cMaxNKeyPh > ((cPhrases * MAX_PHRASE_FILE) / lcbFile)); cMaxNKeyPh = (cPhrases * MAX_PHRASE_FILE) / lcbFile; cMinNKeyPh = cMaxNKeyPh - (cMaxNKeyPh >> 4); dcbSaveMin = DcbConverge(cPhrases, cMaxNKeyPh, cbSaveMin); cbSaveMin += dcbSaveMin; continue; } if (cPhrases <= cMaxNKeyPh && cPhrases >= cMinNKeyPh) break; // Can't go beyond 1,1: if (cbSaveMin == MIN_SAVINGS && cPhrases < cMinNKeyPh) break; // Adjust cbSaveMin if (cPhrases > cMaxNKeyPh) { dcbSaveMin = DcbConverge(cPhrases, cMaxNKeyPh, cbSaveMin); ASSERT(dcbSaveMin > 0); } else { ASSERT(cPhrases < cMinNKeyPh); dcbSaveMin = DcbConverge(cPhrases, cMinNKeyPh, cbSaveMin); ASSERT(dcbSaveMin < 0); // If we are not converging, then break if (dcbSaveMinOld > 0 && (-dcbSaveMin >= dcbSaveMinOld)) break; } cbSaveMin = max(cbSaveMin + dcbSaveMin, MIN_SAVINGS); } if (cbSaveMin == 1) return; // Can't do any better then this // Try to squeeze a bit more out of the phrase file. This will increase // compile time, but can generate a bit better compression. #ifdef _DEBUG int cOrgTotal = lTotal; #endif int cBestTotal = lTotal; int cbBestSaveMin = cbSaveMin; while (lcbFile < MAX_PHRASE_FILE && cbSaveMin > MIN_SAVINGS) { if (club(--cbSaveMin, ptbl, &lcbFile) >= MAX_PHRASES) break; if (++cGrind == GRIND_TIME) { doGrind(); cGrind = 0; } if (lcbFile < MAX_PHRASE_FILE && lTotal > cBestTotal) { // Keep track of best compression to date cBestTotal = lTotal; cbBestSaveMin = cbSaveMin; } else break; } club(cbBestSaveMin, ptbl, &lcbFile); } /*************************************************************************** FUNCTION: SelectPhrase PURPOSE: This function puts the given phrase into the list of phrases to suppress. The file argument is the file to save the phrases in. First, one trailing space is removed, if present, from the end of the string. This is because, in the phrase replacement algorithm, phrases from the list with trailing spaces added are available for free. PARAMETERS: fr pfile RETURNS: COMMENTS: MODIFICATION DATES: 19-Jul-1993 [ralphw] ***************************************************************************/ INLINE static void STDCALL SelectPhrase(FR_INFO fr, CTable* ptbl) { /* * If phrase contains a trailing space, we remove it, but only if the * resulting phrase is longer than 2 characters. */ if (fr.sz[fr.cch - 1] == ' ' && fr.cch > 3) fr.sz[--fr.cch] = '\0'; if (!ptbl->IsCSStringInTable(fr.sz)) ptbl->AddString(fr.sz); }