2020-09-30 17:12:32 +02:00

479 lines
12 KiB
C

/*
* optfmtch.c
* Match finder for the optimal parser
*/
#include <string.h>
#include <stdio.h>
#include <crtdbg.h>
#include "deflate.h"
#define VERIFY_SEARCH_CODE(routine_name) \
{ \
int debug_search; \
for (debug_search = 0; debug_search < clen; debug_search++) \
{ \
if (window[ptr+debug_search] != window[BufPos+debug_search]) \
{ \
_RPT2( \
_CRT_WARN, \
routine_name \
" char mismatch @%3d (clen=%d)\n", \
debug_search, clen); \
\
_RPT3( \
_CRT_WARN, \
" ptr=%8d, bufpos=%8d, end_pos=%8d\n\n", \
ptr, BufPos, end_pos); \
_ASSERT(0); \
} \
} \
}
#define VERIFY_MULTI_TREE_SEARCH_CODE(routine_name) \
_ASSERT(window[BufPos] == window[ptr]); \
_ASSERT(window[BufPos+1] == window[ptr+1]);
/*
* Finds the closest matches of all possible lengths, MIN_MATCH <= x <= MAX_MATCH,
* at position BufPos.
* The positions of each match location are stored in context->matchpos_table[]
* Returns the longest such match length found, or zero if no matches found.
*/
int optimal_find_match(t_encoder_context *context, long BufPos)
{
ULONG ptr;
ULONG a, b;
t_search_node *small_ptr, *big_ptr;
t_search_node *left = context->optimal_encoder->search_left;
t_search_node *right = context->optimal_encoder->search_right;
t_match_pos *matchpos_table = context->optimal_encoder->matchpos_table;
BYTE *window = context->optimal_encoder->window;
ULONG end_pos;
int val; /* must be signed */
int clen;
int same;
int match_length;
int small_len, big_len;
USHORT tree_to_use;
/*
* Retrieve root node of tree to search, and insert current node at
* the root.
*/
tree_to_use = *((USHORT UNALIGNED *) &window[BufPos]);
ptr = context->optimal_encoder->search_tree_root[tree_to_use];
context->optimal_encoder->search_tree_root[tree_to_use] = (t_search_node) BufPos;
/*
* end_pos is the furthest location back we will search for matches
* Remember that our window size is reduced by 3 bytes because of
* our repeated offset codes.
* Since BufPos starts at WINDOW_SIZE when compression begins,
* end_pos will never become negative.
*/
end_pos = BufPos - (WINDOW_SIZE-4);
/*
* Root node is either NULL, or points to a really distant position.
*/
if (ptr <= end_pos)
{
left[BufPos] = right[BufPos] = 0;
return 0;
}
/*
* confirmed length (no need to check the first clen chars in a search)
* note: clen is always equal to min(small_len, big_len)
*/
clen = 2;
/*
* current best match length
*/
match_length = 2;
/*
* longest match which is < our string
*/
small_len = 2;
/*
* longest match which is > our string
*/
big_len = 2;
#ifdef _DEBUG
VERIFY_MULTI_TREE_SEARCH_CODE("binary_search_findmatch()");
#endif
/*
* pointers to nodes to check
*/
small_ptr = &left[BufPos];
big_ptr = &right[BufPos];
do
{
/* compare bytes at current node */
same = clen;
#ifdef _DEBUG
VERIFY_SEARCH_CODE("optimal_findmatch()")
#endif
/* don't need to check first clen characters */
a = ptr + clen;
b = BufPos + clen;
while ((val = ((int) window[a++]) - ((int) window[b++])) == 0)
{
/* don't exceed MAX_MATCH */
if (++same >= MAX_MATCH)
goto long_match;
}
if (val < 0)
{
if (same > big_len)
{
if (same > match_length)
{
long_match:
do
{
matchpos_table[++match_length] = BufPos-ptr-1;
} while (match_length < same);
if (same >= BREAK_LENGTH)
{
*small_ptr = left[ptr];
*big_ptr = right[ptr];
goto end_bsearch;
}
}
big_len = same;
clen = min(small_len, big_len);
}
*big_ptr = (t_search_node) ptr;
big_ptr = &left[ptr];
ptr = *big_ptr;
}
else
{
if (same > small_len)
{
if (same > match_length)
{
do
{
matchpos_table[++match_length] = BufPos-ptr-1;
} while (match_length < same);
if (same >= BREAK_LENGTH)
{
*small_ptr = left[ptr];
*big_ptr = right[ptr];
goto end_bsearch;
}
}
small_len = same;
clen = min(small_len, big_len);
}
*small_ptr = (t_search_node) ptr;
small_ptr = &right[ptr];
ptr = *small_ptr;
}
} while (ptr > end_pos); /* while we don't go too far backwards */
*small_ptr = 0;
*big_ptr = 0;
end_bsearch:
/*
* If we have multiple search trees, we are already guaranteed
* a minimum match length of 2 when we reach here.
* If we only have one tree, then we're not guaranteed anything.
*/
if (match_length < MIN_MATCH)
return 0;
else
return (long) match_length;
}
/*
* Inserts the string at the current BufPos into the tree.
* Does not record all the best match lengths or otherwise attempt
* to search for matches
* Similar to the above function.
*/
void optimal_insert(t_encoder_context *context, long BufPos, long end_pos)
{
long ptr;
ULONG a,b;
t_search_node *small_ptr, *big_ptr;
t_search_node *left = context->optimal_encoder->search_left;
t_search_node *right = context->optimal_encoder->search_right;
BYTE *window = context->optimal_encoder->window;
int val;
int small_len, big_len;
int same;
int clen;
USHORT tree_to_use;
tree_to_use = *((USHORT UNALIGNED *) &window[BufPos]);
ptr = context->optimal_encoder->search_tree_root[tree_to_use];
context->optimal_encoder->search_tree_root[tree_to_use] = (t_search_node) BufPos;
if (ptr <= end_pos)
{
left[BufPos] = right[BufPos] = 0;
return;
}
clen = 2;
small_len = 2;
big_len = 2;
#ifdef _DEBUG
VERIFY_MULTI_TREE_SEARCH_CODE("quick_insert_bsearch_findmatch()");
#endif
small_ptr = &left[BufPos];
big_ptr = &right[BufPos];
do
{
same = clen;
a = ptr+clen;
b = BufPos+clen;
#ifdef _DEBUG
VERIFY_SEARCH_CODE("quick_insert_bsearch_findmatch()")
#endif
while ((val = ((int) window[a++]) - ((int) window[b++])) == 0)
{
/*
* Here we break on BREAK_LENGTH, not MAX_MATCH
*/
if (++same >= BREAK_LENGTH)
break;
}
if (val < 0)
{
if (same > big_len)
{
if (same >= BREAK_LENGTH)
{
*small_ptr = left[ptr];
*big_ptr = right[ptr];
return;
}
big_len = same;
clen = min(small_len, big_len);
}
*big_ptr = (t_search_node) ptr;
big_ptr = &left[ptr];
ptr = *big_ptr;
}
else
{
if (same > small_len)
{
if (same >= BREAK_LENGTH)
{
*small_ptr = left[ptr];
*big_ptr = right[ptr];
return;
}
small_len = same;
clen = min(small_len, big_len);
}
*small_ptr = (t_search_node) ptr;
small_ptr = &right[ptr];
ptr = *small_ptr;
}
} while (ptr > end_pos);
*small_ptr = 0;
*big_ptr = 0;
}
/*
* Remove a node from the search tree; this is ONLY done for the last
* BREAK_LENGTH symbols (see optenc.c). This is because we will have
* inserted strings that contain undefined data (e.g. we're at the 4th
* last byte from the file and binary_search_findmatch() a string into
* the tree - everything from the 4th symbol onwards is invalid, and
* would cause problems if it remained in the tree, so we have to
* remove it).
*/
void optimal_remove_node(t_encoder_context *context, long BufPos, ULONG end_pos)
{
ULONG ptr;
ULONG left_node_pos;
ULONG right_node_pos;
USHORT tree_to_use;
t_search_node *link;
t_search_node *left = context->optimal_encoder->search_left;
t_search_node *right = context->optimal_encoder->search_right;
BYTE *window = context->optimal_encoder->window;
/*
* The root node of tree_to_use should equal BufPos, since that is
* the most recent insertion into that tree - but if we never
* inserted this string (because it was a near match or a long
* string of zeroes), then we can't remove it.
*/
tree_to_use = *((USHORT UNALIGNED *) &window[BufPos]);
/*
* If we never inserted this string, do not attempt to remove it
*/
if (context->optimal_encoder->search_tree_root[tree_to_use] != BufPos)
return;
link = &context->optimal_encoder->search_tree_root[tree_to_use];
/*
* If the last occurence was too far away
*/
if (*link <= end_pos)
{
*link = 0;
left[BufPos] = right[BufPos] = 0;
return;
}
/*
* Most recent location of these chars
*/
ptr = BufPos;
/*
* Most recent location of a string which is "less than" it
*/
left_node_pos = left[ptr];
if (left_node_pos <= end_pos)
left_node_pos = left[ptr] = 0;
/*
* Most recent location of a string which is "greater than" it
*/
right_node_pos = right[ptr];
if (right_node_pos <= end_pos)
right_node_pos = right[ptr] = 0;
while (1)
{
/*
* If left node position is greater than right node position
* then follow the left node, since that is the more recent
* insertion into the tree. Otherwise follow the right node.
*/
if (left_node_pos > right_node_pos)
{
/*
* If it's too far away, then store that it never happened
*/
if (left_node_pos <= end_pos)
left_node_pos = 0;
ptr = *link = (t_search_node) left_node_pos;
if (!ptr)
break;
left_node_pos = right[ptr];
link = &right[ptr];
}
else
{
/*
* If it's too far away, then store that it never happened
*/
if (right_node_pos <= end_pos)
right_node_pos = 0;
ptr = *link = (t_search_node) right_node_pos;
if (!ptr)
break;
right_node_pos = left[ptr];
link = &left[ptr];
}
}
}
void removeNodes(t_encoder_context *context)
{
long i;
// remove the most recent insertions into the hash table, since we had invalid data
// sitting at the end of the window
for (i = 0; i <= BREAK_LENGTH; i++)
{
if (context->bufpos-i-1 < WINDOW_SIZE)
break;
optimal_remove_node(context, context->bufpos-i-1, context->bufpos-WINDOW_SIZE+BREAK_LENGTH);
}
}
// Reinsert the tree nodes we removed previously
void reinsertRemovedNodes(t_encoder_context *context)
{
long j;
for (j = BREAK_LENGTH; j > 0; j--)
{
if (context->bufpos - j > WINDOW_SIZE)
{
optimal_insert(
context,
context->bufpos - j,
context->bufpos - j - WINDOW_SIZE + 4
);
}
}
}