/*
 * function to compress resource files
 * the algorithm is fairly straightforward:
 * (1) first, we find the least frequently occurring bytes
 * (2) next, we use 8 of those bytes, as follows:
 *     (a) to mark 0000
 *     (b) to mark ffff
 *     (c) to mark a run of 0s (next byte gives the run length)
 *     (d) to mark a run of ffs (next byte gives the run length)
 *     (e) to mark a run of arbitrary characters (next byte is the char,
 *         and the one following that is the run length)
 *     (f) to mark 0000 0000
 *     (g) to mark ffff ffff
 *     (h) to mark tokenized strings
 *
 * The run character is also used as an escape character, i.e. to output
 * one of the reserved bytes, we output a 1-length run.
 *
 * The output file contains a header giving:
 * (1) the magic number 0xab08
 * (2) the size of the uncompressed data (as a LONG),
 * (3) followed by the 8 special bytes above
 * (4) followed by the number of tokens (a byte)
 * (5) the tokens themselves, as Pascal format strings (i.e.
 *     a 1 byte length, followed by a 0-255 character string)
 * (6) finally, the compressed data
 */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <stddef.h>
#include <signal.h>

#define MAXESC 8

static int zeroc, ffffc, zerorunc, ffffrunc, runc;
static int longzeroc, longffffc;
static int tokenesc;

/* character frequency data */
static struct cdata {
    int ch;
    long frequency;
    long runs1;
    long runs2;
} d[256];

/* comparison function for qsort(): sort by ascending frequency,
   breaking ties by byte value */
static int cmp(const void *pa, const void *pb)
{
    const struct cdata *a = pa;
    const struct cdata *b = pb;

    if (a->frequency != b->frequency)
        return (a->frequency < b->frequency) ? -1 : 1;
    return a->ch - b->ch;
}

#define output(c) outbuf[outsize++] = c

/*
 * the compression is done in 2 passes. First, we do the run length
 * encoding and crunching out of 0's and FF's. Then, we take that output
 * and run it through the tokenizer.
 */
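/*
 * For illustration only: a minimal sketch of how a decoder might expand
 * the run-length markers described above. This function is not part of
 * the original compressor; rle_expand is a hypothetical name. It assumes
 * the count byte stores the run length minus one (which is what crunch()
 * below emits), and that token expansion has already been done, so the
 * input here is the pure RLE stream.
 */
static long rle_expand(unsigned char *in, long insize, unsigned char *out)
{
    long i = 0, o = 0;
    int c, n;

    while (i < insize) {
        c = in[i++];
        if (c == zeroc) {                  /* 0000: two zero bytes */
            out[o++] = 0; out[o++] = 0;
        } else if (c == ffffc) {           /* ffff: two 0xff bytes */
            out[o++] = 0xff; out[o++] = 0xff;
        } else if (c == longzeroc) {       /* 0000 0000: four zero bytes */
            for (n = 0; n < 4; n++) out[o++] = 0;
        } else if (c == longffffc) {       /* ffff ffff: four 0xff bytes */
            for (n = 0; n < 4; n++) out[o++] = 0xff;
        } else if (c == zerorunc) {        /* run of zeros: count+1 bytes */
            for (n = in[i++]; n >= 0; n--) out[o++] = 0;
        } else if (c == ffffrunc) {        /* run of 0xff: count+1 bytes */
            for (n = in[i++]; n >= 0; n--) out[o++] = 0xff;
        } else if (c == runc) {            /* arbitrary run (also escapes) */
            c = in[i++];
            for (n = in[i++]; n >= 0; n--) out[o++] = c;
        } else {
            out[o++] = c;                  /* ordinary literal byte */
        }
    }
    return o;
}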
/*
 * here is the tokenizer function
 * How it works:
 * We use a hash table; each n byte string in the file is
 * kept somewhere in the table, along with a count of how many times
 * it occurs.
 * After we've collected all the strings, we find the 255 most commonly
 * occurring strings and put them into a table. We then go through and
 * tokenize the file.
 */
static int tokensused;

#define HASHSIZE 15000

typedef unsigned char byte;

typedef struct hashentry {
    byte *str;                  /* pointer to the actual string */
    int frequency;              /* how often this string appears */
    int hash;                   /* what this string hashes to */
    struct hashentry *next;     /* link to next hash entry */
} hentry;

static hentry *hashtable[HASHSIZE];

#define TOKENSIZE 256
static hentry *tokens[TOKENSIZE];
static int numtokens;

static inline unsigned hashfunc(byte *str, int len)
{
    int i;
    unsigned hash = *str++;

    for (i = 1; i < len; i++)
        hash = i + (hash << 2) + *str++;
    return hash % HASHSIZE;
}

static inline unsigned match(byte *str1, byte *str2, int len)
{
    int i;

    for (i = 0; i < len; i++) {
        if (*str1 != *str2)
            return 0;
        str1++;
        str2++;
    }
    return 1;
}

static inline hentry *newhash(void)
{
    return malloc(sizeof(hentry));
}

static inline void update_table(unsigned hash, byte *str, int len)
{
    hentry *h, **lastnext, *y;
    long lastfrequency = 0x7fffffffL;
    hentry temp;

    h = hashtable[hash];
    lastnext = &hashtable[hash];
    while (h) {
        if (match(str, h->str, len)) {
            h->frequency++;
            /* see if we moved up in frequency */
            if (h->frequency > lastfrequency) {
                /* if so, swap with the previous entry */
                for (y = hashtable[hash]; y; y = y->next) {
                    if (y->next == h) {
                        temp = *y;
                        y->frequency = h->frequency;
                        y->str = h->str;
                        h->frequency = temp.frequency;
                        h->str = temp.str;
                        return;
                    }
                }
            }
            return;
        }
        lastnext = &h->next;
        lastfrequency = h->frequency;
        h = h->next;
    }
    h = newhash();
    if (!h) {
        fprintf(stderr, "out of memory\n");
        exit(1);
    }
    h->str = str;
    h->frequency = 1;
    h->next = 0;
    h->hash = hash;
    *lastnext = h;
}

extern int gl_matchsize;
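/*
 * A small usage sketch (not in the original): feed every overlapping
 * matchsize-byte window of a buffer into the hash table, essentially
 * what the analysis pass of tokenize() below does. count_windows is a
 * hypothetical name for illustration.
 */
static void count_windows(byte *buf, long len, int matchsize)
{
    long i;

    for (i = 0; i + matchsize <= len; i++)
        update_table(hashfunc(buf + i, matchsize), buf + i, matchsize);
}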
/*
 * return the highest-frequency string remaining in the table, and delete
 * it from the table
 *
 * In update_table we kept the highest frequency strings at the beginning
 * of the table, so we only need to check the first hash table entry for
 * each hash value.
 *
 * returns NULL if no string occurs more than once
 */
static hentry *reporthigh(void)
{
    long i;
    hentry *h;
    long maxfrequency = 1;
    unsigned maxhash = 0;

    for (i = 0; i < HASHSIZE; i++) {
        h = hashtable[i];
        if (h && h->frequency > maxfrequency) {
            maxhash = i;
            maxfrequency = h->frequency;
        }
    }
    if (maxfrequency == 1)
        return 0;
    h = hashtable[maxhash];
    hashtable[maxhash] = h->next;
    return h;
}

static int no_more_tokens = 0;

static void ctrlc(int sig)
{
    (void)sig;
    no_more_tokens = 1;
}

/*
 * long tokenize(byte *input, long inpsize, byte *output):
 * input points to the input buffer
 * inpsize is the number of bytes in that buffer
 * output points to the output buffer
 * returns: number of bytes written to the output buffer
 */
static long tokenize(byte *input, long inpsize, byte *output)
{
    long i;
    long outsize;
    int matchsize;
    unsigned hash;
    hentry *h;
    byte *ptr, *outptr;
    int c, j;

    signal(SIGINT, ctrlc);
    matchsize = gl_matchsize;
    if (matchsize < 4) {
        fprintf(stderr, "strings must be at least 4 bytes long\n");
        return 0;
    }
    ptr = input;
    printf("analyzing for tokens...\n");
    for (i = 0; i < inpsize - matchsize; i++) {
        hash = hashfunc(ptr, matchsize);
        update_table(hash, ptr, matchsize);
        ptr++;
    }
    printf("getting tokens...\n");
    /*
     * whenever a hash table entry is taken from the hash table and
     * put into the tokens table, its frequency changes meaning:
     * after this point, it means the number of times the token
     * is used
     */
    numtokens = 1;      /* token 0 is reserved: tokenesc, 0 escapes a literal */
moretokens:
    outsize = 0L;
    outptr = output;
    while (!no_more_tokens && numtokens < TOKENSIZE) {
        h = reporthigh();
        if (!h || h->frequency < 3) {
            no_more_tokens = 1;
            break;
        }
        tokens[numtokens] = h;
        h->frequency = 0;
        numtokens++;
    }
    printf("compressing...\n");
    tokensused = 1;
    ptr = input;
    for (i = 0; i < inpsize; i++) {
        c = *ptr;
        /* only try to match a token if a full match window remains */
        if (i + matchsize <= inpsize) {
            hash = hashfunc(ptr, matchsize);
            for (j = 1; j < numtokens; j++) {
                if (hash == tokens[j]->hash &&
                    match(ptr, tokens[j]->str, matchsize)) {
                    if (!tokens[j]->frequency) {
                        tokens[j]->frequency++;
                        tokensused++;
                    }
                    *outptr++ = tokenesc;
                    *outptr++ = j;
                    outsize += 2;
                    ptr += matchsize;
                    i += (matchsize - 1);
                    goto foundtoken;
                }
            }
        }
        *outptr++ = c;
        outsize++;
        if (c == tokenesc) {
            *outptr++ = 0;
            outsize++;
        }
        ptr++;
foundtoken: ;
    }
    j = 1;
    for (i = 1; i < numtokens; i++) {
        if (tokens[i]->frequency) {
            tokens[i]->frequency = 0;
            tokens[j] = tokens[i];
            j++;
        }
    }
    if (tokensused < numtokens) {
        printf("only %d tokens used out of %d\n", tokensused, numtokens);
        printf("searching for more tokens\n");
        numtokens = j;
        goto moretokens;
    }
    numtokens = j;
    printf("tokenizing stage: input size: %ld\n", inpsize);
    i = 4L + (long)matchsize * numtokens;
    printf("output size: header: %ld body: %ld total: %ld\n",
           i, outsize, i + outsize);
    return outsize;
}
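/*
 * For illustration only: the inverse of tokenize(), assuming a token
 * table toktab[] recovered from the file header (detokenize and toktab
 * are hypothetical names, not part of the original). tokenesc followed
 * by 0 is an escaped literal tokenesc byte; tokenesc followed by j > 0
 * expands to token j. Assumes well-formed input (no trailing half pair).
 */
static long detokenize(byte *in, long insize, byte *out,
                       byte **toktab, int matchsize)
{
    long i = 0, o = 0;
    int j, k;

    while (i < insize) {
        if (in[i] == tokenesc) {
            j = in[i + 1];
            i += 2;
            if (j == 0) {
                out[o++] = tokenesc;        /* escaped literal */
            } else {
                for (k = 0; k < matchsize; k++)
                    out[o++] = toktab[j][k]; /* expand token j */
            }
        } else {
            out[o++] = in[i++];
        }
    }
    return o;
}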
long crunch(byte *inbuf, long insize, byte *outbuf)
{
    int j, c;
    int lastc;
    int currunlen = 0;
    long i;
    int mustescape = 0;
    byte *ptr;
    long headsize;
    long outsize = 0;

    for (c = 0; c < 256; c++)
        d[c].ch = c;

    /* first pass: gather byte frequencies and run statistics */
    lastc = -1;
    for (i = 0; i < insize; i++) {
        c = inbuf[i];
        d[c].frequency++;
        if (c == lastc) {
            currunlen++;
        } else {
            if (currunlen > 2)
                d[lastc].runs2++;
            else if (currunlen)
                d[lastc].runs1++;
            currunlen = 0;
        }
        lastc = c;
    }
    qsort(d, (size_t)256, sizeof(struct cdata), cmp);

    /* now, find our markers: the 8 least frequent bytes */
    zeroc = d[0].ch;
    ffffc = d[1].ch;
    zerorunc = d[2].ch;
    ffffrunc = d[3].ch;
    runc = d[4].ch;
    longzeroc = d[5].ch;
    longffffc = d[6].ch;
    tokenesc = d[7].ch;

    lastc = -1;
    currunlen = 0;
    mustescape = 0;
    for (i = 0; i <= insize; i++) {
        /* one extra pass with an out-of-band value flushes the final run */
        c = (i < insize) ? inbuf[i] : -1;
        if (c == lastc && currunlen < 255 && i != insize) {
            currunlen++;
        } else {
            if (mustescape || currunlen > 0) {
                if (lastc == 0) {
                    if (currunlen == 1) {
                        output(zeroc);
                    } else if (currunlen == 3) {
                        output(longzeroc);
                    } else {
                        output(zerorunc);
                        output(currunlen);
                    }
                } else if (lastc == 0xff) {
                    if (currunlen == 1) {
                        output(ffffc);
                    } else if (currunlen == 3) {
                        output(longffffc);
                    } else {
                        output(ffffrunc);
                        output(currunlen);
                    }
                } else {
                    output(runc);
                    output(lastc);
                    output(currunlen);
                }
            } else {
                if (lastc >= 0)
                    output(lastc);
            }
            /* check for special characters */
            /* note: the token escape character is escaped in the
               tokenizing code */
            mustescape = 0;
            for (j = 0; j < MAXESC - 1; j++) {
                if (c == d[j].ch)
                    mustescape = 1;
            }
            lastc = c;
            currunlen = 0;
        }
    }
    printf("before tokenizing: input size: %ld output size: %ld\n",
           insize, outsize);
    /*
     * ASSUMPTION: input buffer is big enough to handle the overflow if
     * the file is actually expanded by our RLE encoding
     */
    for (i = 0; i < outsize; i++)
        inbuf[i] = outbuf[i];
    outsize = tokenize(inbuf, outsize, outbuf);

    /*
     * now we must copy the data to leave room for the compression header;
     * that header will be 8 bytes (escape characters) + 1 byte (no. of
     * tokens) plus (number of tokens - 1) * (1 length byte + token size)
     */
    headsize = 9 + (numtokens - 1) * (1L + gl_matchsize);
    for (i = outsize - 1; i >= 0; --i)
        outbuf[i + headsize] = outbuf[i];

    /* output the escape characters */
    outbuf[0] = zeroc;
    outbuf[1] = ffffc;
    outbuf[2] = zerorunc;
    outbuf[3] = ffffrunc;
    outbuf[4] = runc;
    outbuf[5] = longzeroc;
    outbuf[6] = longffffc;
    outbuf[7] = tokenesc;
    outbuf[8] = numtokens - 1;

    /* output the token table, as Pascal format strings */
    c = 9;
    for (i = 1; i < numtokens; i++) {
        outbuf[c++] = gl_matchsize;
        ptr = tokens[i]->str;
        for (j = 0; j < gl_matchsize; j++)
            outbuf[c++] = *ptr++;
    }
    if (c != headsize)
        printf("WARNING: c == %ld headsize == %ld\n",
               (long)c, (long)headsize);
    return outsize + headsize;
}
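/*
 * A sketch of how a caller might use crunch(); not part of the original
 * file, and compress_file is a hypothetical name. Note that crunch()
 * emits only items (3)-(6) of the header described at the top of this
 * file; the magic number 0xab08 and the uncompressed size are assumed
 * to be written by the caller, here in big-endian order (an assumption,
 * since the original does not show this code).
 */
static int compress_file(FILE *in, FILE *out, long insize)
{
    byte *inbuf, *outbuf;
    long outsize;

    /* generous buffers: the RLE pass can expand pathological input,
       and crunch() assumes the input buffer absorbs that overflow */
    inbuf = malloc(insize * 2 + 16);
    outbuf = malloc(insize * 2 + 16);
    if (!inbuf || !outbuf)
        return -1;
    if (fread(inbuf, 1, insize, in) != (size_t)insize)
        return -1;

    outsize = crunch(inbuf, insize, outbuf);

    putc(0xab, out);                      /* magic number 0xab08 */
    putc(0x08, out);
    putc((insize >> 24) & 0xff, out);     /* uncompressed size (LONG) */
    putc((insize >> 16) & 0xff, out);
    putc((insize >> 8) & 0xff, out);
    putc(insize & 0xff, out);
    fwrite(outbuf, 1, outsize, out);

    free(inbuf);
    free(outbuf);
    return 0;
}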