From 6215084dc6fdb9a0bf9a740c96fe85bdb8175bf4 Mon Sep 17 00:00:00 2001 From: Richard Johnson Date: Mon, 6 May 2013 14:43:04 +0100 Subject: [PATCH 01/19] Start of trie compression implementation --- smaz.c | 226 ++++++++++++++++++++++++++++++++++++++++++++++++++++ smaz.h | 6 ++ smaz_test.c | 9 ++- 3 files changed, 239 insertions(+), 2 deletions(-) diff --git a/smaz.c b/smaz.c index aa674c3..602d856 100644 --- a/smaz.c +++ b/smaz.c @@ -1,4 +1,6 @@ +#include #include +#include /* Our compression codebook, used for compression */ static char *Smaz_cb[241] = { @@ -76,6 +78,230 @@ static char *Smaz_rcb[254] = { "e, ", " it", "whi", " ma", "ge", "x", "e c", "men", ".com" }; +const char startLetter = '\n'; +const char endLetter = 'z'; +const int noLetters = 'z' - '\n' +1; + +struct Branch { + int value; + struct Branch **children; + char *shortcut; + int shortcutLen; +}; + +char getKey(char ir) { + return ir - startLetter; +} + +void freeBranch(struct Branch *t) { + int x; + + for (x = 0; x < noLetters; x++) { + if (t->children[x] != NULL) { + freeBranch(t->children[x]); + } + free(t->children); + } + if (t->shortcut != NULL) { + free(t->shortcut); + } + free(t); +} + +struct Branch *newTrie() { + struct Branch *newBranch; + newBranch = (struct Branch *)calloc(sizeof(struct Branch), 1); + newBranch->value = -1; + return newBranch; +} + +struct Branch *buildTrie() { + struct Branch *trie; + int x; + + trie = newTrie(); + for (x = 0; x < 254; x++) { + addToBranch(trie, Smaz_rcb[x], x); + } + return trie; +} + +void addToBranch(struct Branch *t, char *remEntry, int value) { + int entryLen; + entryLen = strlen(remEntry); + + if (t->shortcut == NULL) { + t->shortcut = (char *)malloc(sizeof(char) * (entryLen + 1)); + memcpy(t->shortcut, remEntry, entryLen); + t->shortcut[entryLen] = '\0'; + t->value = value; + t->shortcutLen = entryLen; + t->children = NULL; + return; + } + + if (entryLen == 0 && t->shortcutLen == 0) { + t->value = value; + return; + } else { + int smallestLen = entryLen; + int x; + char *commonPrefix; + + if (smallestLen > t->shortcutLen) { + smallestLen = t->shortcutLen; + } + + for (x = 0; x < smallestLen && t->shortcut[x] == remEntry[x]; x++) { } + + commonPrefix = (char *)malloc(sizeof(char) * (x + 1)); + memcpy(commonPrefix, t->shortcut, x); + commonPrefix[x] = '\0'; + + if (x < t->shortcutLen) { + char *ttail; + int tkey; + struct Branch *newTBranch; + + ttail = (char *)malloc(sizeof(char) * (t->shortcutLen - x + 1)); + memcpy(ttail, &t->shortcut[x+1], (t->shortcutLen - x)); + ttail[(t->shortcutLen - x)] = '\0'; + + tkey = getKey(t->shortcut[x]); + + newTBranch = (struct Branch *)malloc(sizeof(struct Branch) * 1); + newTBranch->children = t->children; + newTBranch->value = t->value; + newTBranch->shortcut = ttail; + + if (t->children != NULL) { + free(t->children); + } + t->children = (struct Branch **)calloc(sizeof(struct Branch *), noLetters); + t->children[tkey] = newTBranch; + free(t->shortcut); + t->shortcut = commonPrefix; + t->value = -1; + } else { + /* the value of t remains */ + } + if (x < entryLen) { + /* we can assign the v to a child */ + int vkey; + char *vtail; + + vkey = getKey(remEntry[x]); + vtail = (char *)malloc(sizeof(char) * (entryLen - x + 1)); + memcpy(vtail, &remEntry[x+1], (entryLen - x)); + vtail[entryLen - x] = '\0'; + + if (t->children[vkey] == NULL) { + struct Branch *newVBranch; + newVBranch = (struct Branch *)calloc(sizeof(struct Branch), 1); + newVBranch->value = -1; + t->children[vkey] = newVBranch; + } + addToBranch(t->children[vkey], vtail, value); + } else { + /* the value of v now takes up the position */ + t->value = value; + } + } +} + +int smaz_compress_trie(struct Branch *trie, char *in, int inlen, char *out, int outlen) { + unsigned int h1,h2,h3=0; + int verblen = 0, _outlen = outlen; + char verb[256], *_out = out; + + while(inlen) { + int j = 7, needed; + char *flush = NULL; + char *slot; + + int length = 0; + struct Branch *branch; + + h1 = h2 = in[0]<<3; + if (inlen > 1) h2 += in[1]; + if (inlen > 2) h3 = h2^in[2]; + if (j > inlen) j = inlen; + + /* Try to lookup substrings into the hash table, starting from the + * longer to the shorter substrings */ + branch = trie; + while (1) { + /* see if there is something at the next branch */ + char nextChar; + nextChar = in[length]; + if (branch->children != NULL && branch->children[nextChar - startLetter] != NULL) { + struct Branch *tmpBranch; + tmpBranch = branch->children[nextChar - startLetter]; + length++; + if (tmpBranch->shortcut) { + /* attempt to get through the shortcut, probably need to bounds check in here... */ + if (memcmp(tmpBranch->shortcut, in+length, tmpBranch->shortcutLen) != 0) { + break; + } + length += tmpBranch->shortcutLen; + } + branch = tmpBranch; + } else { + break; + } + } + if (branch->value > 0) { + printf("comressed value: %d\n", branch->value); + /* Match found in the hash table, + * prepare a verbatim bytes flush if needed */ + if (verblen) { + needed = (verblen == 1) ? 2 : 2+verblen; + flush = out; + out += needed; + outlen -= needed; + } + /* Emit the byte */ + if (outlen <= 0) return _outlen+1; + out[0] = branch->value; + out++; + outlen--; + inlen -= length; + in += length; + goto out; + } + + /* Match not found - add the byte to the verbatim buffer */ + verb[verblen] = in[0]; + verblen++; + inlen--; + in++; +out: + /* Prepare a flush if we reached the flush length limit, and there + * is not already a pending flush operation. */ + if (!flush && (verblen == 256 || (verblen > 0 && inlen == 0))) { + needed = (verblen == 1) ? 2 : 2+verblen; + flush = out; + out += needed; + outlen -= needed; + if (outlen < 0) return _outlen+1; + } + /* Perform a verbatim flush if needed */ + if (flush) { + if (verblen == 1) { + flush[0] = (signed char)254; + flush[1] = verb[0]; + } else { + flush[0] = (signed char)255; + flush[1] = (signed char)(verblen-1); + memcpy(flush+2,verb,verblen); + } + flush = NULL; + verblen = 0; + } + } + return out-_out; +} + int smaz_compress(char *in, int inlen, char *out, int outlen) { unsigned int h1,h2,h3=0; int verblen = 0, _outlen = outlen; diff --git a/smaz.h b/smaz.h index ce9c35d..3b29839 100644 --- a/smaz.h +++ b/smaz.h @@ -1,7 +1,13 @@ #ifndef _SMAZ_H #define _SMAZ_H +struct Branch *newTrie(); +struct Branch *buildTrie(); +void freeBranch(struct Branch *t); +void addToBranch(struct Branch *t, char *remEntry, int value); + int smaz_compress(char *in, int inlen, char *out, int outlen); +int smaz_compress_trie(struct Branch *trie, char *in, int inlen, char *out, int outlen); int smaz_decompress(char *in, int inlen, char *out, int outlen); #endif diff --git a/smaz_test.c b/smaz_test.c index 47c02d6..ca8b823 100644 --- a/smaz_test.c +++ b/smaz_test.c @@ -11,6 +11,7 @@ int main(void) { int comprlen, decomprlen; int j, ranlen; int times = 1000000; + struct Branch *trie; char *strings[] = { "This is a small string", "foobar", @@ -33,10 +34,11 @@ int main(void) { }; j=0; + trie = buildTrie(); while(strings[j]) { int comprlevel; - comprlen = smaz_compress(strings[j],strlen(strings[j]),out,sizeof(out)); + comprlen = smaz_compress_trie(trie, strings[j],strlen(strings[j]),out,sizeof(out)); comprlevel = 100-((100*comprlen)/strlen(strings[j])); decomprlen = smaz_decompress(out,comprlen,d,sizeof(d)); if (strlen(strings[j]) != (unsigned)decomprlen || @@ -52,6 +54,7 @@ int main(void) { } j++; } + /* printf("Encrypting and decrypting %d test strings...\n", times); while(times--) { char charset[]="ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvxyz/. "; @@ -70,8 +73,10 @@ int main(void) { printf("Bug! TEST NOT PASSED\n"); exit(1); } - /* printf("%d -> %d\n", comprlen, decomprlen); */ } printf("TEST PASSED :)\n"); + */ + + getchar(); return 0; } From 4e1c8f6c13001d6c3216d12caba418744d6b862a Mon Sep 17 00:00:00 2001 From: Richard Johnson Date: Tue, 7 May 2013 23:44:28 +0100 Subject: [PATCH 02/19] Fix bug with tree creation --- smaz.c | 38 +++++++++++++++++--------------------- smaz.h | 6 ++++++ 2 files changed, 23 insertions(+), 21 deletions(-) diff --git a/smaz.c b/smaz.c index 602d856..7a5bf8f 100644 --- a/smaz.c +++ b/smaz.c @@ -2,6 +2,8 @@ #include #include +#include "smaz.h" + /* Our compression codebook, used for compression */ static char *Smaz_cb[241] = { "\002s,\266", "\003had\232\002leW", "\003on \216", "", "\001yS", @@ -82,13 +84,6 @@ const char startLetter = '\n'; const char endLetter = 'z'; const int noLetters = 'z' - '\n' +1; -struct Branch { - int value; - struct Branch **children; - char *shortcut; - int shortcutLen; -}; - char getKey(char ir) { return ir - startLetter; } @@ -110,7 +105,7 @@ void freeBranch(struct Branch *t) { struct Branch *newTrie() { struct Branch *newBranch; - newBranch = (struct Branch *)calloc(sizeof(struct Branch), 1); + newBranch = (struct Branch *)calloc(1, sizeof(struct Branch)); newBranch->value = -1; return newBranch; } @@ -135,12 +130,11 @@ void addToBranch(struct Branch *t, char *remEntry, int value) { memcpy(t->shortcut, remEntry, entryLen); t->shortcut[entryLen] = '\0'; t->value = value; - t->shortcutLen = entryLen; t->children = NULL; return; } - if (entryLen == 0 && t->shortcutLen == 0) { + if (entryLen == 0 && strlen(t->shortcut)) { t->value = value; return; } else { @@ -148,8 +142,8 @@ void addToBranch(struct Branch *t, char *remEntry, int value) { int x; char *commonPrefix; - if (smallestLen > t->shortcutLen) { - smallestLen = t->shortcutLen; + if (smallestLen > strlen(t->shortcut)) { + smallestLen = strlen(t->shortcut); } for (x = 0; x < smallestLen && t->shortcut[x] == remEntry[x]; x++) { } @@ -158,14 +152,14 @@ void addToBranch(struct Branch *t, char *remEntry, int value) { memcpy(commonPrefix, t->shortcut, x); commonPrefix[x] = '\0'; - if (x < t->shortcutLen) { + if (x < strlen(t->shortcut)) { char *ttail; int tkey; struct Branch *newTBranch; - ttail = (char *)malloc(sizeof(char) * (t->shortcutLen - x + 1)); - memcpy(ttail, &t->shortcut[x+1], (t->shortcutLen - x)); - ttail[(t->shortcutLen - x)] = '\0'; + ttail = (char *)malloc(sizeof(char) * (strlen(t->shortcut) - x + 1)); + memcpy(ttail, &t->shortcut[x+1], (strlen(t->shortcut) - x)); + ttail[(strlen(t->shortcut) - x)] = '\0'; tkey = getKey(t->shortcut[x]); @@ -177,7 +171,7 @@ void addToBranch(struct Branch *t, char *remEntry, int value) { if (t->children != NULL) { free(t->children); } - t->children = (struct Branch **)calloc(sizeof(struct Branch *), noLetters); + t->children = (struct Branch **)calloc(noLetters, sizeof(struct Branch *)); t->children[tkey] = newTBranch; free(t->shortcut); t->shortcut = commonPrefix; @@ -195,9 +189,12 @@ void addToBranch(struct Branch *t, char *remEntry, int value) { memcpy(vtail, &remEntry[x+1], (entryLen - x)); vtail[entryLen - x] = '\0'; + if (t->children == NULL) { + t->children = (struct Branch **)calloc(noLetters, sizeof(struct Branch *)); + } if (t->children[vkey] == NULL) { struct Branch *newVBranch; - newVBranch = (struct Branch *)calloc(sizeof(struct Branch), 1); + newVBranch = (struct Branch *)calloc(1, sizeof(struct Branch)); newVBranch->value = -1; t->children[vkey] = newVBranch; } @@ -240,10 +237,10 @@ int smaz_compress_trie(struct Branch *trie, char *in, int inlen, char *out, int length++; if (tmpBranch->shortcut) { /* attempt to get through the shortcut, probably need to bounds check in here... */ - if (memcmp(tmpBranch->shortcut, in+length, tmpBranch->shortcutLen) != 0) { + if (memcmp(tmpBranch->shortcut, in+length, strlen(tmpBranch->shortcut)) != 0) { break; } - length += tmpBranch->shortcutLen; + length += strlen(tmpBranch->shortcut); } branch = tmpBranch; } else { @@ -251,7 +248,6 @@ int smaz_compress_trie(struct Branch *trie, char *in, int inlen, char *out, int } } if (branch->value > 0) { - printf("comressed value: %d\n", branch->value); /* Match found in the hash table, * prepare a verbatim bytes flush if needed */ if (verblen) { diff --git a/smaz.h b/smaz.h index 3b29839..67a4b16 100644 --- a/smaz.h +++ b/smaz.h @@ -1,6 +1,12 @@ #ifndef _SMAZ_H #define _SMAZ_H +struct Branch { + int value; + struct Branch **children; + char *shortcut; +}; + struct Branch *newTrie(); struct Branch *buildTrie(); void freeBranch(struct Branch *t); From 0b01811de03aea49f63936cca0e97ddfbd57c3da Mon Sep 17 00:00:00 2001 From: Richard Johnson Date: Wed, 8 May 2013 22:25:10 +0100 Subject: [PATCH 03/19] it's a mess, but it works --- smaz.c | 25 +++++++++++++------ smaz_test.c | 70 ++++++++++++++++++++++++++++++++++++++++++++++++++--- 2 files changed, 84 insertions(+), 11 deletions(-) diff --git a/smaz.c b/smaz.c index 7a5bf8f..d49d7ff 100644 --- a/smaz.c +++ b/smaz.c @@ -125,6 +125,7 @@ void addToBranch(struct Branch *t, char *remEntry, int value) { int entryLen; entryLen = strlen(remEntry); + if (t->shortcut == NULL) { t->shortcut = (char *)malloc(sizeof(char) * (entryLen + 1)); memcpy(t->shortcut, remEntry, entryLen); @@ -134,7 +135,7 @@ void addToBranch(struct Branch *t, char *remEntry, int value) { return; } - if (entryLen == 0 && strlen(t->shortcut)) { + if (entryLen == 0 && strlen(t->shortcut) == 0) { t->value = value; return; } else { @@ -157,9 +158,9 @@ void addToBranch(struct Branch *t, char *remEntry, int value) { int tkey; struct Branch *newTBranch; - ttail = (char *)malloc(sizeof(char) * (strlen(t->shortcut) - x + 1)); + + ttail = (char *)calloc((strlen(t->shortcut) - x + 1), sizeof(char)); memcpy(ttail, &t->shortcut[x+1], (strlen(t->shortcut) - x)); - ttail[(strlen(t->shortcut) - x)] = '\0'; tkey = getKey(t->shortcut[x]); @@ -185,9 +186,8 @@ void addToBranch(struct Branch *t, char *remEntry, int value) { char *vtail; vkey = getKey(remEntry[x]); - vtail = (char *)malloc(sizeof(char) * (entryLen - x + 1)); + vtail = (char *)calloc((entryLen - x + 1), sizeof(char)); memcpy(vtail, &remEntry[x+1], (entryLen - x)); - vtail[entryLen - x] = '\0'; if (t->children == NULL) { t->children = (struct Branch **)calloc(noLetters, sizeof(struct Branch *)); @@ -196,6 +196,7 @@ void addToBranch(struct Branch *t, char *remEntry, int value) { struct Branch *newVBranch; newVBranch = (struct Branch *)calloc(1, sizeof(struct Branch)); newVBranch->value = -1; + /*printf("asdf: %c\n", vkey+'\n');*/ t->children[vkey] = newVBranch; } addToBranch(t->children[vkey], vtail, value); @@ -227,10 +228,13 @@ int smaz_compress_trie(struct Branch *trie, char *in, int inlen, char *out, int /* Try to lookup substrings into the hash table, starting from the * longer to the shorter substrings */ branch = trie; - while (1) { + while (length < inlen) { /* see if there is something at the next branch */ char nextChar; nextChar = in[length]; + if (nextChar < startLetter || nextChar > endLetter) { + break; + } if (branch->children != NULL && branch->children[nextChar - startLetter] != NULL) { struct Branch *tmpBranch; tmpBranch = branch->children[nextChar - startLetter]; @@ -238,25 +242,30 @@ int smaz_compress_trie(struct Branch *trie, char *in, int inlen, char *out, int if (tmpBranch->shortcut) { /* attempt to get through the shortcut, probably need to bounds check in here... */ if (memcmp(tmpBranch->shortcut, in+length, strlen(tmpBranch->shortcut)) != 0) { + /*printf("broke b '%s' != '%s' - %d\n", tmpBranch->shortcut, in+length, branch->value);*/ + length--; break; } length += strlen(tmpBranch->shortcut); } branch = tmpBranch; } else { + /*printf("broke a %d\n", branch->children);*/ break; } } - if (branch->value > 0) { + if (branch->value >= 0 && length <= inlen) { /* Match found in the hash table, * prepare a verbatim bytes flush if needed */ if (verblen) { needed = (verblen == 1) ? 2 : 2+verblen; + /*printf("Verb: %d\n", verblen);*/ flush = out; out += needed; outlen -= needed; } /* Emit the byte */ + /*printf("Value: %d\n", branch->value);*/ if (outlen <= 0) return _outlen+1; out[0] = branch->value; out++; @@ -327,6 +336,7 @@ int smaz_compress(char *in, int inlen, char *out, int outlen) { * prepare a verbatim bytes flush if needed */ if (verblen) { needed = (verblen == 1) ? 2 : 2+verblen; + /*printf("Verb good: %d\n", verblen);*/ flush = out; out += needed; outlen -= needed; @@ -334,6 +344,7 @@ int smaz_compress(char *in, int inlen, char *out, int outlen) { /* Emit the byte */ if (outlen <= 0) return _outlen+1; out[0] = slot[slot[0]+1]; + /*printf("Value: %d\n", *(unsigned char *)(&slot[slot[0]+1]));*/ out++; outlen--; inlen -= j; diff --git a/smaz_test.c b/smaz_test.c index ca8b823..48c6b88 100644 --- a/smaz_test.c +++ b/smaz_test.c @@ -4,15 +4,53 @@ #include "smaz.h" + +void hexDump (char *desc, void *addr, int len) { + int i; + unsigned char buff[17]; + unsigned char *pc = addr; + + if (desc != NULL) + printf ("%s:\n", desc); + + for (i = 0; i < len; i++) { + + if ((i % 16) == 0) { + if (i != 0) + printf (" %s\n", buff); + + printf (" %04x ", i); + } + + printf (" %02x", pc[i]); + + if ((pc[i] < 0x20) || (pc[i] > 0x7e)) + buff[i % 16] = '.'; + else + buff[i % 16] = pc[i]; + buff[(i % 16) + 1] = '\0'; + } + + while ((i % 16) != 0) { + printf (" "); + i++; + } + + printf (" %s\n", buff); +} + int main(void) { char in[512]; char out[4096]; char d[4096]; int comprlen, decomprlen; - int j, ranlen; + int j, ranlen, x; int times = 1000000; struct Branch *trie; char *strings[] = { + "nojQfTh", + "ht", + "QtZpZuMhlzfgHFEGA.Kja/hsIayllFSAMFDl.fQ/bJdzfzCvxdclaIbzzWyhbOhCj.nydSJSbmPUzhOHYqszMhvIBqqsSluQkxLbcUuRVXmhS.CrCIBPpKXEPbyhLDLJNn.pVGFEdFmKDC VLAk.LWDqLOlmhyvviIzBOBWsWGQpIPJjftiEd updeZIZjBVrOmDPGJmcZZ CziiEeAhtvkUnYdaFuvKGvdmQnmGaZVtWCpaxpVozEWjc/HyGQFMaiMqjzKYmgPGzSxsFPuCjP JcHUinZvLWVPTSarCUUYQmSGGyPYfeXCEunngaxFxPleyZjNtClHCRdYdsxWkiopaZqU.kaINJmZiUmp", "This is a small string", "foobar", "the end", @@ -35,10 +73,30 @@ int main(void) { j=0; trie = buildTrie(); + + /* + printf("here: %d\n", trie->children['9'-'\n']); + exit(0); + trie = newTrie(); + addToBranch(trie, "f", 1); + addToBranch(trie, "for", 3); + addToBranch(trie, "fo", 2); + exit(0); + for (x = 0; x < 'z' - '\n'; x++) { + printf("here: %c %d\n", x+'\n', trie->children['o'-'\n']->children[x]); + } + printf("here: '%s'\n", trie->children['f'-'\n']->children['o'-'\n']->shortcut); + */ + + while(strings[j]) { int comprlevel; + int comprlen2; comprlen = smaz_compress_trie(trie, strings[j],strlen(strings[j]),out,sizeof(out)); + /*hexDump("out bad", &out, 400);*/ + comprlen2 = smaz_compress(strings[j],strlen(strings[j]),out,sizeof(out)); + /*hexDump("out good", &out, 400);*/ comprlevel = 100-((100*comprlen)/strlen(strings[j])); decomprlen = smaz_decompress(out,comprlen,d,sizeof(d)); if (strlen(strings[j]) != (unsigned)decomprlen || @@ -54,11 +112,11 @@ int main(void) { } j++; } - /* printf("Encrypting and decrypting %d test strings...\n", times); while(times--) { char charset[]="ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvxyz/. "; ranlen = random() % 512; + printf("doing %d\n", times); for (j = 0; j < ranlen; j++) { if (times & 1) @@ -66,16 +124,20 @@ int main(void) { else in[j] = (char)(random() & 0xff); } - comprlen = smaz_compress(in,ranlen,out,sizeof(out)); + + comprlen = smaz_compress_trie(trie, in,ranlen,out,sizeof(out)); decomprlen = smaz_decompress(out,comprlen,d,sizeof(out)); if (ranlen != decomprlen || memcmp(in,d,ranlen)) { printf("Bug! TEST NOT PASSED\n"); + hexDump("in", &in, ranlen); + hexDump("out bad", &out, comprlen); + comprlen = smaz_compress(in,ranlen,out,sizeof(out)); + hexDump("out good", &out, comprlen); exit(1); } } printf("TEST PASSED :)\n"); - */ getchar(); return 0; From 7920690b74e98e7645d86d68ce448476a4e05f7b Mon Sep 17 00:00:00 2001 From: Richard Johnson Date: Wed, 8 May 2013 22:45:04 +0100 Subject: [PATCH 04/19] Appears to be about 2x as fast --- smaz.c | 11 +--------- smaz_test.c | 60 +++++++++++++++++++++++++++++++++-------------------- 2 files changed, 39 insertions(+), 32 deletions(-) diff --git a/smaz.c b/smaz.c index d49d7ff..c3c3ab3 100644 --- a/smaz.c +++ b/smaz.c @@ -208,25 +208,16 @@ void addToBranch(struct Branch *t, char *remEntry, int value) { } int smaz_compress_trie(struct Branch *trie, char *in, int inlen, char *out, int outlen) { - unsigned int h1,h2,h3=0; int verblen = 0, _outlen = outlen; char verb[256], *_out = out; while(inlen) { - int j = 7, needed; + int needed; char *flush = NULL; - char *slot; int length = 0; struct Branch *branch; - h1 = h2 = in[0]<<3; - if (inlen > 1) h2 += in[1]; - if (inlen > 2) h3 = h2^in[2]; - if (j > inlen) j = inlen; - - /* Try to lookup substrings into the hash table, starting from the - * longer to the shorter substrings */ branch = trie; while (length < inlen) { /* see if there is something at the next branch */ diff --git a/smaz_test.c b/smaz_test.c index 48c6b88..e098c22 100644 --- a/smaz_test.c +++ b/smaz_test.c @@ -1,6 +1,7 @@ #include #include #include +#include #include "smaz.h" @@ -113,31 +114,46 @@ int main(void) { j++; } printf("Encrypting and decrypting %d test strings...\n", times); - while(times--) { - char charset[]="ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvxyz/. "; - ranlen = random() % 512; - printf("doing %d\n", times); - - for (j = 0; j < ranlen; j++) { - if (times & 1) - in[j] = charset[random() % (sizeof(charset)-1)]; - else - in[j] = (char)(random() & 0xff); - } - - comprlen = smaz_compress_trie(trie, in,ranlen,out,sizeof(out)); - decomprlen = smaz_decompress(out,comprlen,d,sizeof(out)); - - if (ranlen != decomprlen || memcmp(in,d,ranlen)) { - printf("Bug! TEST NOT PASSED\n"); - hexDump("in", &in, ranlen); - hexDump("out bad", &out, comprlen); + { + struct timeval t1, t2; + double elapsedTime; + + gettimeofday(&t1, NULL); + while(times--) { + char charset[]="ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvxyz/. "; + ranlen = random() % 512; + /*printf("doing %d\n", times);*/ + + for (j = 0; j < ranlen; j++) { + if (times & 1) + in[j] = charset[random() % (sizeof(charset)-1)]; + else + in[j] = (char)(random() & 0xff); + } + + /*comprlen = smaz_compress_trie(trie, in,ranlen,out,sizeof(out));*/ comprlen = smaz_compress(in,ranlen,out,sizeof(out)); - hexDump("out good", &out, comprlen); - exit(1); + decomprlen = smaz_decompress(out,comprlen,d,sizeof(out)); + + if (ranlen != decomprlen || memcmp(in,d,ranlen)) { + printf("Bug! TEST NOT PASSED\n"); + hexDump("in", &in, ranlen); + hexDump("out bad", &out, comprlen); + comprlen = smaz_compress(in,ranlen,out,sizeof(out)); + hexDump("out good", &out, comprlen); + exit(1); + } } + + gettimeofday(&t2, NULL); + + /*elapsedTime = (t2.tv_sec - t1.tv_sec) * 1000.0; + elapsedTime += (t2.tv_usec - t1.tv_usec) / 1000.0;*/ + printf("time = %u.%06u\n", t1.tv_sec, t1.tv_usec); + printf("time = %u.%06u\n", t2.tv_sec, t2.tv_usec); + + printf("TEST PASSED :)\n"); } - printf("TEST PASSED :)\n"); getchar(); return 0; From 99779edae625b429a05c7c6cc03c4087f1671509 Mon Sep 17 00:00:00 2001 From: Richard Johnson Date: Sat, 11 May 2013 13:29:26 +0100 Subject: [PATCH 05/19] Cleaning up the code some --- smaz.c | 61 ++++++++++++++++++++++++----------------------------- smaz.h | 1 + smaz_test.c | 4 ++-- 3 files changed, 31 insertions(+), 35 deletions(-) diff --git a/smaz.c b/smaz.c index c3c3ab3..a66da37 100644 --- a/smaz.c +++ b/smaz.c @@ -80,13 +80,8 @@ static char *Smaz_rcb[254] = { "e, ", " it", "whi", " ma", "ge", "x", "e c", "men", ".com" }; -const char startLetter = '\n'; -const char endLetter = 'z'; -const int noLetters = 'z' - '\n' +1; - -char getKey(char ir) { - return ir - startLetter; -} +const unsigned char endLetter = 'z'; +const int noLetters = 'z' +1; void freeBranch(struct Branch *t) { int x; @@ -128,6 +123,7 @@ void addToBranch(struct Branch *t, char *remEntry, int value) { if (t->shortcut == NULL) { t->shortcut = (char *)malloc(sizeof(char) * (entryLen + 1)); + t->shortcut_length = entryLen; memcpy(t->shortcut, remEntry, entryLen); t->shortcut[entryLen] = '\0'; t->value = value; @@ -135,7 +131,7 @@ void addToBranch(struct Branch *t, char *remEntry, int value) { return; } - if (entryLen == 0 && strlen(t->shortcut) == 0) { + if (entryLen == 0 && t->shortcut_length == 0) { t->value = value; return; } else { @@ -143,8 +139,8 @@ void addToBranch(struct Branch *t, char *remEntry, int value) { int x; char *commonPrefix; - if (smallestLen > strlen(t->shortcut)) { - smallestLen = strlen(t->shortcut); + if (smallestLen > t->shortcut_length) { + smallestLen = t->shortcut_length; } for (x = 0; x < smallestLen && t->shortcut[x] == remEntry[x]; x++) { } @@ -153,21 +149,22 @@ void addToBranch(struct Branch *t, char *remEntry, int value) { memcpy(commonPrefix, t->shortcut, x); commonPrefix[x] = '\0'; - if (x < strlen(t->shortcut)) { + if (x < t->shortcut_length) { char *ttail; int tkey; struct Branch *newTBranch; - ttail = (char *)calloc((strlen(t->shortcut) - x + 1), sizeof(char)); - memcpy(ttail, &t->shortcut[x+1], (strlen(t->shortcut) - x)); + ttail = (char *)calloc((t->shortcut_length - x + 1), sizeof(char)); + memcpy(ttail, &t->shortcut[x+1], (t->shortcut_length - x)); - tkey = getKey(t->shortcut[x]); + tkey = t->shortcut[x]; newTBranch = (struct Branch *)malloc(sizeof(struct Branch) * 1); newTBranch->children = t->children; newTBranch->value = t->value; newTBranch->shortcut = ttail; + newTBranch->shortcut_length = strlen(ttail); if (t->children != NULL) { free(t->children); @@ -176,6 +173,7 @@ void addToBranch(struct Branch *t, char *remEntry, int value) { t->children[tkey] = newTBranch; free(t->shortcut); t->shortcut = commonPrefix; + t->shortcut_length = strlen(commonPrefix); t->value = -1; } else { /* the value of t remains */ @@ -185,7 +183,7 @@ void addToBranch(struct Branch *t, char *remEntry, int value) { int vkey; char *vtail; - vkey = getKey(remEntry[x]); + vkey = remEntry[x]; vtail = (char *)calloc((entryLen - x + 1), sizeof(char)); memcpy(vtail, &remEntry[x+1], (entryLen - x)); @@ -212,51 +210,48 @@ int smaz_compress_trie(struct Branch *trie, char *in, int inlen, char *out, int char verb[256], *_out = out; while(inlen) { - int needed; + int needed = 0; char *flush = NULL; - int length = 0; - struct Branch *branch; + struct Branch *branch = NULL; + int remaining_length = inlen; branch = trie; - while (length < inlen) { - /* see if there is something at the next branch */ - char nextChar; + while (remaining_length--) { + unsigned int nextChar; + struct Branch **children; nextChar = in[length]; - if (nextChar < startLetter || nextChar > endLetter) { + if (nextChar > endLetter) { break; } - if (branch->children != NULL && branch->children[nextChar - startLetter] != NULL) { + children = branch->children; + if (children && children[nextChar]) { struct Branch *tmpBranch; - tmpBranch = branch->children[nextChar - startLetter]; + tmpBranch = children[nextChar]; length++; if (tmpBranch->shortcut) { - /* attempt to get through the shortcut, probably need to bounds check in here... */ - if (memcmp(tmpBranch->shortcut, in+length, strlen(tmpBranch->shortcut)) != 0) { - /*printf("broke b '%s' != '%s' - %d\n", tmpBranch->shortcut, in+length, branch->value);*/ + if (length <= inlen && + memcmp(tmpBranch->shortcut, in+length, tmpBranch->shortcut_length)) { length--; break; } - length += strlen(tmpBranch->shortcut); + length += tmpBranch->shortcut_length; } branch = tmpBranch; - } else { - /*printf("broke a %d\n", branch->children);*/ - break; + continue; } + break; } if (branch->value >= 0 && length <= inlen) { /* Match found in the hash table, * prepare a verbatim bytes flush if needed */ if (verblen) { needed = (verblen == 1) ? 2 : 2+verblen; - /*printf("Verb: %d\n", verblen);*/ flush = out; out += needed; outlen -= needed; } /* Emit the byte */ - /*printf("Value: %d\n", branch->value);*/ if (outlen <= 0) return _outlen+1; out[0] = branch->value; out++; diff --git a/smaz.h b/smaz.h index 67a4b16..04a36ab 100644 --- a/smaz.h +++ b/smaz.h @@ -5,6 +5,7 @@ struct Branch { int value; struct Branch **children; char *shortcut; + int shortcut_length; }; struct Branch *newTrie(); diff --git a/smaz_test.c b/smaz_test.c index e098c22..7cb7c7b 100644 --- a/smaz_test.c +++ b/smaz_test.c @@ -131,8 +131,8 @@ int main(void) { in[j] = (char)(random() & 0xff); } - /*comprlen = smaz_compress_trie(trie, in,ranlen,out,sizeof(out));*/ - comprlen = smaz_compress(in,ranlen,out,sizeof(out)); + comprlen = smaz_compress_trie(trie, in,ranlen,out,sizeof(out)); + /*comprlen = smaz_compress(in,ranlen,out,sizeof(out));*/ decomprlen = smaz_decompress(out,comprlen,d,sizeof(out)); if (ranlen != decomprlen || memcmp(in,d,ranlen)) { From 9157cf82efc46e914119b40c9ad067742e05296d Mon Sep 17 00:00:00 2001 From: Richard Johnson Date: Sat, 11 May 2013 14:08:00 +0100 Subject: [PATCH 06/19] Fix memory leaks in trie creation --- smaz.c | 28 +++++++++++++++------------- smaz_test.c | 6 ++++++ 2 files changed, 21 insertions(+), 13 deletions(-) diff --git a/smaz.c b/smaz.c index a66da37..df4df61 100644 --- a/smaz.c +++ b/smaz.c @@ -80,15 +80,16 @@ static char *Smaz_rcb[254] = { "e, ", " it", "whi", " ma", "ge", "x", "e c", "men", ".com" }; -const unsigned char endLetter = 'z'; -const int noLetters = 'z' +1; +#define END_LETTER 'z' +#define LETTER_COUNT ('z'+1) void freeBranch(struct Branch *t) { - int x; - - for (x = 0; x < noLetters; x++) { - if (t->children[x] != NULL) { - freeBranch(t->children[x]); + if (t->children != NULL) { + int x = 0; + for (x = 0; x < LETTER_COUNT; x++) { + if (t->children[x] != NULL) { + freeBranch(t->children[x]); + } } free(t->children); } @@ -145,9 +146,8 @@ void addToBranch(struct Branch *t, char *remEntry, int value) { for (x = 0; x < smallestLen && t->shortcut[x] == remEntry[x]; x++) { } - commonPrefix = (char *)malloc(sizeof(char) * (x + 1)); + commonPrefix = (char *)calloc(x + 1, sizeof(char)); memcpy(commonPrefix, t->shortcut, x); - commonPrefix[x] = '\0'; if (x < t->shortcut_length) { char *ttail; @@ -160,7 +160,7 @@ void addToBranch(struct Branch *t, char *remEntry, int value) { tkey = t->shortcut[x]; - newTBranch = (struct Branch *)malloc(sizeof(struct Branch) * 1); + newTBranch = (struct Branch *)calloc(1, sizeof(struct Branch)); newTBranch->children = t->children; newTBranch->value = t->value; newTBranch->shortcut = ttail; @@ -169,7 +169,7 @@ void addToBranch(struct Branch *t, char *remEntry, int value) { if (t->children != NULL) { free(t->children); } - t->children = (struct Branch **)calloc(noLetters, sizeof(struct Branch *)); + t->children = (struct Branch **)calloc(LETTER_COUNT, sizeof(struct Branch *)); t->children[tkey] = newTBranch; free(t->shortcut); t->shortcut = commonPrefix; @@ -177,6 +177,7 @@ void addToBranch(struct Branch *t, char *remEntry, int value) { t->value = -1; } else { /* the value of t remains */ + free(commonPrefix); } if (x < entryLen) { /* we can assign the v to a child */ @@ -188,7 +189,7 @@ void addToBranch(struct Branch *t, char *remEntry, int value) { memcpy(vtail, &remEntry[x+1], (entryLen - x)); if (t->children == NULL) { - t->children = (struct Branch **)calloc(noLetters, sizeof(struct Branch *)); + t->children = (struct Branch **)calloc(LETTER_COUNT, sizeof(struct Branch *)); } if (t->children[vkey] == NULL) { struct Branch *newVBranch; @@ -198,6 +199,7 @@ void addToBranch(struct Branch *t, char *remEntry, int value) { t->children[vkey] = newVBranch; } addToBranch(t->children[vkey], vtail, value); + free(vtail); } else { /* the value of v now takes up the position */ t->value = value; @@ -221,7 +223,7 @@ int smaz_compress_trie(struct Branch *trie, char *in, int inlen, char *out, int unsigned int nextChar; struct Branch **children; nextChar = in[length]; - if (nextChar > endLetter) { + if (nextChar > END_LETTER) { break; } children = branch->children; diff --git a/smaz_test.c b/smaz_test.c index 7cb7c7b..6426f9d 100644 --- a/smaz_test.c +++ b/smaz_test.c @@ -72,6 +72,11 @@ int main(void) { NULL }; + j=0; + for (j = 0; j < 100000; j++) { + trie = buildTrie(); + freeBranch(trie); + } j=0; trie = buildTrie(); @@ -155,6 +160,7 @@ int main(void) { printf("TEST PASSED :)\n"); } + getchar(); return 0; } From ecedafca4f4f3c77d73d37756f0500170b00b56e Mon Sep 17 00:00:00 2001 From: Richard Johnson Date: Sat, 11 May 2013 14:54:46 +0100 Subject: [PATCH 07/19] Rename to something a little more sane, a couple more tweaks --- smaz.c | 72 ++++++++++++++++++++++++++--------------------------- smaz.h | 6 ++--- smaz_test.c | 9 ++----- 3 files changed, 40 insertions(+), 47 deletions(-) diff --git a/smaz.c b/smaz.c index df4df61..e48259e 100644 --- a/smaz.c +++ b/smaz.c @@ -80,15 +80,15 @@ static char *Smaz_rcb[254] = { "e, ", " it", "whi", " ma", "ge", "x", "e c", "men", ".com" }; -#define END_LETTER 'z' -#define LETTER_COUNT ('z'+1) +#define SMAZ_END_LETTER 'z' +#define SMAZ_LETTER_COUNT ('z'+1) -void freeBranch(struct Branch *t) { +void smaz_free_trie(struct Branch *t) { if (t->children != NULL) { int x = 0; - for (x = 0; x < LETTER_COUNT; x++) { + for (x = 0; x < SMAZ_LETTER_COUNT; x++) { if (t->children[x] != NULL) { - freeBranch(t->children[x]); + smaz_free_trie(t->children[x]); } } free(t->children); @@ -99,29 +99,10 @@ void freeBranch(struct Branch *t) { free(t); } -struct Branch *newTrie() { - struct Branch *newBranch; - newBranch = (struct Branch *)calloc(1, sizeof(struct Branch)); - newBranch->value = -1; - return newBranch; -} - -struct Branch *buildTrie() { - struct Branch *trie; - int x; - - trie = newTrie(); - for (x = 0; x < 254; x++) { - addToBranch(trie, Smaz_rcb[x], x); - } - return trie; -} - -void addToBranch(struct Branch *t, char *remEntry, int value) { +void smaz_add_to_branch(struct Branch *t, char *remEntry, int value) { int entryLen; entryLen = strlen(remEntry); - if (t->shortcut == NULL) { t->shortcut = (char *)malloc(sizeof(char) * (entryLen + 1)); t->shortcut_length = entryLen; @@ -169,7 +150,7 @@ void addToBranch(struct Branch *t, char *remEntry, int value) { if (t->children != NULL) { free(t->children); } - t->children = (struct Branch **)calloc(LETTER_COUNT, sizeof(struct Branch *)); + t->children = (struct Branch **)calloc(SMAZ_LETTER_COUNT, sizeof(struct Branch *)); t->children[tkey] = newTBranch; free(t->shortcut); t->shortcut = commonPrefix; @@ -189,7 +170,7 @@ void addToBranch(struct Branch *t, char *remEntry, int value) { memcpy(vtail, &remEntry[x+1], (entryLen - x)); if (t->children == NULL) { - t->children = (struct Branch **)calloc(LETTER_COUNT, sizeof(struct Branch *)); + t->children = (struct Branch **)calloc(SMAZ_LETTER_COUNT, sizeof(struct Branch *)); } if (t->children[vkey] == NULL) { struct Branch *newVBranch; @@ -198,7 +179,7 @@ void addToBranch(struct Branch *t, char *remEntry, int value) { /*printf("asdf: %c\n", vkey+'\n');*/ t->children[vkey] = newVBranch; } - addToBranch(t->children[vkey], vtail, value); + smaz_add_to_branch(t->children[vkey], vtail, value); free(vtail); } else { /* the value of v now takes up the position */ @@ -207,6 +188,23 @@ void addToBranch(struct Branch *t, char *remEntry, int value) { } } +struct Branch *smaz_build_custom_trie(char *codebook[254]) { + struct Branch *trie; + int x; + + trie = (struct Branch *)calloc(1, sizeof(struct Branch)); + trie->value = -1; + + for (x = 0; x < 254; x++) { + smaz_add_to_branch(trie, codebook[x], x); + } + return trie; +} + +struct Branch *smaz_build_trie() { + return smaz_build_custom_trie(Smaz_rcb); +} + int smaz_compress_trie(struct Branch *trie, char *in, int inlen, char *out, int outlen) { int verblen = 0, _outlen = outlen; char verb[256], *_out = out; @@ -220,24 +218,27 @@ int smaz_compress_trie(struct Branch *trie, char *in, int inlen, char *out, int branch = trie; while (remaining_length--) { - unsigned int nextChar; + unsigned char nextChar; struct Branch **children; nextChar = in[length]; - if (nextChar > END_LETTER) { + if (nextChar > SMAZ_END_LETTER) { break; } children = branch->children; if (children && children[nextChar]) { struct Branch *tmpBranch; + char *shortcut; + int shortcut_length; tmpBranch = children[nextChar]; + shortcut = tmpBranch->shortcut; + shortcut_length = tmpBranch->shortcut_length; length++; - if (tmpBranch->shortcut) { - if (length <= inlen && - memcmp(tmpBranch->shortcut, in+length, tmpBranch->shortcut_length)) { + if (shortcut) { + if (length <= inlen && memcmp(shortcut, in+length, shortcut_length)) { length--; break; } - length += tmpBranch->shortcut_length; + length += shortcut_length; } branch = tmpBranch; continue; @@ -245,8 +246,7 @@ int smaz_compress_trie(struct Branch *trie, char *in, int inlen, char *out, int break; } if (branch->value >= 0 && length <= inlen) { - /* Match found in the hash table, - * prepare a verbatim bytes flush if needed */ + /* Match found, prepare a verbatim bytes flush if needed */ if (verblen) { needed = (verblen == 1) ? 2 : 2+verblen; flush = out; diff --git a/smaz.h b/smaz.h index 04a36ab..b9ac015 100644 --- a/smaz.h +++ b/smaz.h @@ -8,10 +8,8 @@ struct Branch { int shortcut_length; }; -struct Branch *newTrie(); -struct Branch *buildTrie(); -void freeBranch(struct Branch *t); -void addToBranch(struct Branch *t, char *remEntry, int value); +struct Branch *smaz_build_trie(); +void smaz_free_trie(struct Branch *t); int smaz_compress(char *in, int inlen, char *out, int outlen); int smaz_compress_trie(struct Branch *trie, char *in, int inlen, char *out, int outlen); diff --git a/smaz_test.c b/smaz_test.c index 6426f9d..aef4ced 100644 --- a/smaz_test.c +++ b/smaz_test.c @@ -73,12 +73,7 @@ int main(void) { }; j=0; - for (j = 0; j < 100000; j++) { - trie = buildTrie(); - freeBranch(trie); - } - j=0; - trie = buildTrie(); + trie = smaz_build_trie(); /* printf("here: %d\n", trie->children['9'-'\n']); @@ -160,7 +155,7 @@ int main(void) { printf("TEST PASSED :)\n"); } - + smaz_free_trie(trie); getchar(); return 0; } From 85e017635d9f573428419c4669b1aaef750bd939 Mon Sep 17 00:00:00 2001 From: Richard Johnson Date: Sat, 11 May 2013 15:00:07 +0100 Subject: [PATCH 08/19] Rename struct --- smaz.c | 32 ++++++++++++++++---------------- smaz.h | 10 +++++----- smaz_test.c | 2 +- 3 files changed, 22 insertions(+), 22 deletions(-) diff --git a/smaz.c b/smaz.c index e48259e..96d61a0 100644 --- a/smaz.c +++ b/smaz.c @@ -83,7 +83,7 @@ static char *Smaz_rcb[254] = { #define SMAZ_END_LETTER 'z' #define SMAZ_LETTER_COUNT ('z'+1) -void smaz_free_trie(struct Branch *t) { +void smaz_free_trie(struct SmazBranch *t) { if (t->children != NULL) { int x = 0; for (x = 0; x < SMAZ_LETTER_COUNT; x++) { @@ -99,7 +99,7 @@ void smaz_free_trie(struct Branch *t) { free(t); } -void smaz_add_to_branch(struct Branch *t, char *remEntry, int value) { +void smaz_add_to_branch(struct SmazBranch *t, char *remEntry, int value) { int entryLen; entryLen = strlen(remEntry); @@ -133,7 +133,7 @@ void smaz_add_to_branch(struct Branch *t, char *remEntry, int value) { if (x < t->shortcut_length) { char *ttail; int tkey; - struct Branch *newTBranch; + struct SmazBranch *newTBranch; ttail = (char *)calloc((t->shortcut_length - x + 1), sizeof(char)); @@ -141,7 +141,7 @@ void smaz_add_to_branch(struct Branch *t, char *remEntry, int value) { tkey = t->shortcut[x]; - newTBranch = (struct Branch *)calloc(1, sizeof(struct Branch)); + newTBranch = (struct SmazBranch *)calloc(1, sizeof(struct SmazBranch)); newTBranch->children = t->children; newTBranch->value = t->value; newTBranch->shortcut = ttail; @@ -150,7 +150,7 @@ void smaz_add_to_branch(struct Branch *t, char *remEntry, int value) { if (t->children != NULL) { free(t->children); } - t->children = (struct Branch **)calloc(SMAZ_LETTER_COUNT, sizeof(struct Branch *)); + t->children = (struct SmazBranch **)calloc(SMAZ_LETTER_COUNT, sizeof(struct SmazBranch *)); t->children[tkey] = newTBranch; free(t->shortcut); t->shortcut = commonPrefix; @@ -170,11 +170,11 @@ void smaz_add_to_branch(struct Branch *t, char *remEntry, int value) { memcpy(vtail, &remEntry[x+1], (entryLen - x)); if (t->children == NULL) { - t->children = (struct Branch **)calloc(SMAZ_LETTER_COUNT, sizeof(struct Branch *)); + t->children = (struct SmazBranch **)calloc(SMAZ_LETTER_COUNT, sizeof(struct SmazBranch *)); } if (t->children[vkey] == NULL) { - struct Branch *newVBranch; - newVBranch = (struct Branch *)calloc(1, sizeof(struct Branch)); + struct SmazBranch *newVBranch; + newVBranch = (struct SmazBranch *)calloc(1, sizeof(struct SmazBranch)); newVBranch->value = -1; /*printf("asdf: %c\n", vkey+'\n');*/ t->children[vkey] = newVBranch; @@ -188,11 +188,11 @@ void smaz_add_to_branch(struct Branch *t, char *remEntry, int value) { } } -struct Branch *smaz_build_custom_trie(char *codebook[254]) { - struct Branch *trie; +struct SmazBranch *smaz_build_custom_trie(char *codebook[254]) { + struct SmazBranch *trie; int x; - trie = (struct Branch *)calloc(1, sizeof(struct Branch)); + trie = (struct SmazBranch *)calloc(1, sizeof(struct SmazBranch)); trie->value = -1; for (x = 0; x < 254; x++) { @@ -201,11 +201,11 @@ struct Branch *smaz_build_custom_trie(char *codebook[254]) { return trie; } -struct Branch *smaz_build_trie() { +struct SmazBranch *smaz_build_trie() { return smaz_build_custom_trie(Smaz_rcb); } -int smaz_compress_trie(struct Branch *trie, char *in, int inlen, char *out, int outlen) { +int smaz_compress_trie(struct SmazBranch *trie, char *in, int inlen, char *out, int outlen) { int verblen = 0, _outlen = outlen; char verb[256], *_out = out; @@ -213,20 +213,20 @@ int smaz_compress_trie(struct Branch *trie, char *in, int inlen, char *out, int int needed = 0; char *flush = NULL; int length = 0; - struct Branch *branch = NULL; + struct SmazBranch *branch = NULL; int remaining_length = inlen; branch = trie; while (remaining_length--) { unsigned char nextChar; - struct Branch **children; + struct SmazBranch **children; nextChar = in[length]; if (nextChar > SMAZ_END_LETTER) { break; } children = branch->children; if (children && children[nextChar]) { - struct Branch *tmpBranch; + struct SmazBranch *tmpBranch; char *shortcut; int shortcut_length; tmpBranch = children[nextChar]; diff --git a/smaz.h b/smaz.h index b9ac015..f8e55fa 100644 --- a/smaz.h +++ b/smaz.h @@ -1,18 +1,18 @@ #ifndef _SMAZ_H #define _SMAZ_H -struct Branch { +struct SmazBranch { int value; - struct Branch **children; + struct SmazBranch **children; char *shortcut; int shortcut_length; }; -struct Branch *smaz_build_trie(); -void smaz_free_trie(struct Branch *t); +struct SmazBranch *smaz_build_trie(); +void smaz_free_trie(struct SmazBranch *t); int smaz_compress(char *in, int inlen, char *out, int outlen); -int smaz_compress_trie(struct Branch *trie, char *in, int inlen, char *out, int outlen); +int smaz_compress_trie(struct SmazBranch *trie, char *in, int inlen, char *out, int outlen); int smaz_decompress(char *in, int inlen, char *out, int outlen); #endif diff --git a/smaz_test.c b/smaz_test.c index aef4ced..cf052c5 100644 --- a/smaz_test.c +++ b/smaz_test.c @@ -47,7 +47,7 @@ int main(void) { int comprlen, decomprlen; int j, ranlen, x; int times = 1000000; - struct Branch *trie; + struct SmazBranch *trie; char *strings[] = { "nojQfTh", "ht", From aa6770c259180f3efd6f3ebc39de1ef02de06d74 Mon Sep 17 00:00:00 2001 From: Richard Johnson Date: Thu, 27 Jun 2013 23:03:11 +0100 Subject: [PATCH 09/19] Some cleaning up and implementation of more tests --- README => README.md | 46 +++++++------- TODO | 4 -- smaz_test | Bin 0 -> 24101 bytes smaz_test.c | 149 +++++++++++++++++++++++++++++++++++--------- 4 files changed, 146 insertions(+), 53 deletions(-) rename README => README.md (59%) delete mode 100644 TODO create mode 100755 smaz_test diff --git a/README b/README.md similarity index 59% rename from README rename to README.md index 3f53ba2..43aebb7 100644 --- a/README +++ b/README.md @@ -1,5 +1,5 @@ SMAZ - compression for very small strings ------------------------------------------ +========================================= Smaz is a simple compression library suitable for compressing very short strings. General purpose compression libraries will build the state needed @@ -14,36 +14,37 @@ that Smaz is able to compress even strings of two or three bytes! For example the string "the" is compressed into a single byte. -To compare this with other libraries, think that like zlib will usually not be able to compress text shorter than 100 bytes. +To compare this with other libraries, think that like zlib will usually not be +able to compress text shorter than 100 bytes. -COMPRESSION EXAMPLES +Compression Examples -------------------- -'This is a small string' compressed by 50% -'foobar' compressed by 34% -'the end' compressed by 58% -'not-a-g00d-Exampl333' enlarged by 15% -'Smaz is a simple compression library' compressed by 39% -'Nothing is more difficult, and therefore more precious, than to be able to decide' compressed by 49% -'this is an example of what works very well with smaz' compressed by 49% -'1000 numbers 2000 will 10 20 30 compress very little' compressed by 10% +* 'This is a small string' compressed by 50% +* 'foobar' compressed by 34% +* 'the end' compressed by 58% +* 'not-a-g00d-Exampl333' enlarged by 15% +* 'Smaz is a simple compression library' compressed by 39% +* 'Nothing is more difficult, and therefore more precious, than to be able to decide' compressed by 49% +* 'this is an example of what works very well with smaz' compressed by 49% +* '1000 numbers 2000 will 10 20 30 compress very little' compressed by 10% In general, lowercase English will work very well. It will suck with a lot of numbers inside the strings. Other languages are compressed pretty well too, the following is Italian, not very similar to English but still compressible by smaz: -'Nel mezzo del cammin di nostra vita, mi ritrovai in una selva oscura' compressed by 33% -'Mi illumino di immenso' compressed by 37% -'L'autore di questa libreria vive in Sicilia' compressed by 28% +* 'Nel mezzo del cammin di nostra vita, mi ritrovai in una selva oscura' compressed by 33% +* 'Mi illumino di immenso' compressed by 37% +* 'L'autore di questa libreria vive in Sicilia' compressed by 28% It can compress URLS pretty well: -'http://google.com' compressed by 59% -'http://programming.reddit.com' compressed by 52% -'http://github.com/antirez/smaz/tree/master' compressed by 46% +* 'http://google.com' compressed by 59% +* 'http://programming.reddit.com' compressed by 52% +* 'http://github.com/antirez/smaz/tree/master' compressed by 46% -USAGE +Usage ----- The lib consists of just two functions: @@ -61,11 +62,14 @@ Decompress the buffer 'in' of length 'inlen' and put the decompressed data into 'out' of max length 'outlen' bytes. If the output buffer is too short to hold the whole decompressed string, outlen+1 is returned. Otherwise the length of the compressed string (less then or equal to outlen) is returned. This function will -not automatically put a nul-term at the end of the string if the original +not automatically put a null-term at the end of the string if the original compressed string didn't included a nulterm. -CREDITS +Credits ------- -Small was writte by Salvatore Sanfilippo and is released under the BSD license. Check the COPYING file for more information. +SMAZ was written by Salvatore Sanfilippo and is released under the BSD license. +Check the COPYING file for more information. + + diff --git a/TODO b/TODO deleted file mode 100644 index fe6a5ae..0000000 --- a/TODO +++ /dev/null @@ -1,4 +0,0 @@ -const-correct the source code -release the ruby script to build new specialized dictionaries -play well against currupted input in verbatim 253/254 codes memcpy() -play with some form of entropy coding like Huffman or range coding diff --git a/smaz_test b/smaz_test new file mode 100755 index 0000000000000000000000000000000000000000..7feeded31e10b9b8dc33acec371c1a026caa136d GIT binary patch literal 24101 zcmeHveRx#WwfCMg0U}B!Dy>mbkLjR9Lp}s;P}CWcNzT9kfrJn7V=|eUBmZtL67YGpI)Y&i@CnB{BnTXXCjlL zG#qNYZf0pXFg+ZKB-*F9S6ny!x|t>MXvsCQ-T5$tWNQ{IktUgbEC59|PsSg`0Oj9( z?+^CB`peSW4ip_Zw)nT}tF;5iUm$rK{-_M2U8>lCG+o4RUgD!F#=O3bob%3*OJRH_ ze(@OmrZM>B|1*`dXbgVk7<_7i76&DK4 z-GG!3X8Z`Rx4AVM@y2yuO!s;P!op8ca2+z*VxfrMz8;Ug!nGkMgKCuDaES zqGG;CAlfRLV!@!O@wn$ydao&&Q8IHlT|SZ=NnKNNEwxC&e^z;?6h$eag4u@d7SI%M zX3*&uhR&jy==;pu_O0(AOmoD)Fg-aGnnV%)pG+q%B6&qT0h+FZH#bP6*dF#J(x!iD zj-xmqz8noQIRSTy8S~1`;PX@?v+^_eR#O;Qkij?S4$4+D_?h$TvB zDl+(J@bKlx;7=UpqP-dX;i(t-wHf?#GUS(M@ZsR$t09A*H_S!(UGL_meuWmL=D_^a z>zIGEu3fo1O_B7qM=Dg2p878Cw#yty5#2)BLkH8iroKTmxqN7!q+cPLTs<@(>HS1o ziGE1Z&k;?o9ojDGr->$)4sDh6<3y7yhc-z1mqe2bhuS3luS8Qw3^hplpNS?{4%JHf z`$Urqha8f=muPa`P?@CfBAQ$_q=I%Xb1ro*an>)X(>A`lxlo9J)|)eR8nU%yx4j*V z@&Qj@*zWN3J#2R%x>Jv1YAzp0#lIq9HcHZZrzadL6sqh6TF)_Cw+-%iV!PHLMrj_k zYsn+!16tS4-2Q7z9pWi+TIyC9q4i!Z^M){uPk>noY+=HIgdT&TIkK}wL%A|Im`?JrA z25rf!Vf<%*8fn(0!_DeO5!|evB1k>%m}3+D2g|tG=;l2M>h=t0$p1`35EsmX|N?)(pvd(gv9^zc)d5ohhT^^~Aqt?x;x zr1@V$uco&B)#%dmk>%`q*W&E@YueWHAMEoo_Ls6y)* z(3SK)IdyN%yq0Lm!*wLM;l>TK5krZ2PdILBK);OZ1gUL2T0zv{msYLos70G~AZ|DO zs&x%mwB)!Y5K2$pEroi=)sV`p1F<*ujGKX^g{r&t_MJ^lBmJdy-I!mFdZ=FgyhjBt zvMu-ar!-WQK6v}iOzXC8&6t0?-fhv6>nO<9=W0D^TXz{vN_#P>%2Ilf@bu1{Fu&(h zTlbfUEqCvohTQK!kQ}B$8oIi@LW6L=r*}4lQO;H=%!LM6SS~6&sLw&dxx*z-%`EwI ziq5?`cC%m&4+r~jx&6p>o5tQNMXT(pA-m3AOXaG=<+|X9@*SD^O=`Oyca^^-$HCm*;w$kC(NU`lIRpCi@>l73t@X~v zGbyc~mAY5X$jHR=N89-20XP@)3ucX?6EyI@nNMQlQWq12XTbSDckefI-RNWY#=lDO z@25_FCb=wo92BOW)H{;99Y6oAkh-6x(w`c3BV>%4b7jrBhW5iWe`B_Hq+W&`x{qsq z7L+H6@y*(ZPD49iYUdl;ZmEsfO?^*ln|1d}?axwOtc~GeXctKB0z>=Hm|s&386sTw ze959g)wQc-WP%;K7JL~uH2&PlQ(E#f=Ms1F6S)xTdUqP0xdkZ|Li2l@^0nknJYDAZ zyk+ZNgVE75;Ot#mVCx=45LeCm!f9Lha?gI-mifqX+IG0a6W|MHMUGv+e29)OWIbA6 zr3Ja3acO78Ia6#slaYaPXT6eqef^t#-?V7{S2cf{MwCdM_=H066;JYQgeN^L^HZOI zMYYa4V%u^Hgo-eULbR{b0!wqDnc4{nJh*bTzPQD0T?3={y0K33?@v939CvSZzMOvh zRjAqea|%%Gl++fPg+WQv^&wBP5o+zZ&ff7!rMFz`8qDe2lsszg8wyGBd7IlBJdWsHV2qoth=r&rj746r+X)AT~FuqZIoubc=gY# zl^+{A42tUZ_`js;bIH2o9R=hftdj7crD<$G^r@TY?8&JRm|P0nd#mLn?@W##vcgr4 zM=Y}UQ};n~uORPfNh=vWw|Dlzp*uj2bW|!%#i3`0W`IBPd~?>zGM1DdtJ9KS8qao3 zitaLsv|Y9VO~t%B^JwyjC;5)nwcC>V4lRl{o;-z&#Nk_S_h3zZ8@!M8%Mw|fET^vg zRk<7c{6jKqH(S^{^Y!hH29bOP-on&vNzIjIeJRU2L{DHXnZiSu>&tnL}01!8U$O3AOHkh68Cq3fD;P)q*S**n2@_axMg2-!T2tVp|h zp4G2*c71Mn40Vgg>84L`_AOF0|8unRRg3EJEKh!+S)QYTqauJRXsYRkwp^uFC3km) z?G=`)qQi(ckLAtH3WkdX5hvws-Lx^K3T!?2bJXPPxjS-CSd#XDQj^Or~hs zp5{vK!P8Vr{>eovj2d*i+rP`UBgZnk)AsijMD{L2eDAk))3(R91C1?yTJGB}22D@U z-rEaY*eg=~`lT+c#J4<;q2=nMq0{xA#hpBU^vB#!uH?H%k7E4UcAPI;>h=#h`&!b@ zu0gD-c3L*q=Qx}jKc}M%{c;%Evk+ClUA8UNNKlI&JJ%!&OZCLXSf*1njSShH z5T1>7;?e!7aVSN$V~Fg_SPu@zWAYOkF#ky@SohQr{ zo}5F?c(m=X+WJpyuCmVFqvTb-g0Ww<%JPFZZ-kHdDPQ1yMX6>wH`PTMPTbQx%tOM^L!B z@k||}`fUBXs%E{PIOxU%h&fG5{yBB%JsMGo=V>_Q&|n$Ri@Vm@b1ZV8Jf!FoDWhuu zqh%lYKDVgKK3Pki&@6J?=3)so8Iio@E|@{p%7+bx%VbL>2EtSfnom5IFGr1-TwC`h zSR-q}Ga8EKkTE;K7&kpVMyTXoT}6xStWdG_e2rRuTDZ`9RE~qL;}#lY_xu*6Xnr)p zwxixYqwAOjOTnDs_{WMc#~c8OyD**PQ2gJC_!{wo90i$Kdc+pMZo_Gcjg{_8^^l;awUmnfoW_G8~QY zi4-PqEX8`{z{p47O~F=%8kRiiO#YWU`O?wFOkT-)8hfS|{J0!wyR(Z`6g`IeLPc*4#vwGFa8uj<9LfSY|P@t_Nvops7j{L|M^*q3d4_8wjBwJtM5*WYt}pKuXp=@r}ZUp zq;XtJVk*Zo!d3LwluGvXRa@mlL%UD2?0yt3PP;A7R~5Z)**O#j0e4NwtN^aV`M}5? z5@&bGmz?$P&|h>uD&xVw zdrwYI30g+eujW7Iob~#ioMLh3s!w;?lCLK&?s?VLLt%yO`$uL`*u&qC!mzHLJf(b~ z|4WnKb=a2bGBR?e^wl2R@WmZ#e}IFS%{h~g{*&RK{@E(qgZ@UXsXCJXbhj;Wn2v+q z=s#u_mYRhe3g!aL#X0iWByrVR#Fk;Z%oYaFRwdRPxZ{G*0Cb zcBipF>)U3htjFj^hpO+SR${%?bt1>sLu(e>j%{`;aKjh6t>-Z5qC~1nOMW=^Qk`r2 z%b6GJ$wP9zUi2YWE~(kl%F~QEj02*s6XOzPIK09XDMa@`qFPJ-CG{j85?XJ!om?>D zWH+2lUb+?jeS93ul-`wYg^N=u)5Y&0W$WK3Gy1n19q`<6?mnbaS4)xokLgrO3q+1S ziZaPT~^ z038d3>7+%c8Qs|HR+#JIk22;8vTyi23@2b3?9#eUr1cA*pr5&oeQMXyT%4oz;ymm` zI`ILmud%o`&e~GHz=TDdooGE@7IePe5)}AhO{jW{XbGx9jW&sB6Hryf+Ln;N1rk18 zto6l(x>f{n_eF#n5|Mx^g7Qza=z81rrKO@N8WZ&>F0MlWLV8rFEn;m5C5KT-Sm;rd z8H@#m+9={d5sc^}(yWpRp%{MG!;gDvlL$pr5eSM>IL?O2F`>4KUC@Ms*89r3MEBd75<!0`z+26;FjF8nPpC59^3z_Jdck=cSj zcmTc^Y8YmR;4M|Op&B$D?Ze#{N3kKe3`Y4yG!a98!9{vpbc$G9BqAcz3Q;uAj{<_? zrrBbRte8$F*ssv2)e}Ft5^enl6co8kC>?4I)YtkR!#x&! zQHjQ`63UH*KU1RmUxXE7ph}4*ZbhaZT}=0uNw|mfPlXlX@SjS|_jk|^eWc7T{Qwzj zTYe}MRh@~N1M1B}i3Kl5vKfxHA{6#o(aFEJ_;QpOMu-v*dg0k7wNO~mLmw-#XxxfO z*e(>^_l%`+g%S__L0B=GCn&M_>nKeB5LrPNczWms=tqAAy5(HlLUS$tDOQZGHp&bh zMrQn9aMR}t<)+zhLn1mEIkDhcVF?s~jO`SbiRJK?dM{e3>T8rW!KhFYk%PjDG4anr z@u@#n*82Wv3F0U9MC?1@`kq88{u3)k$^%I0yTJ2*DU`LrZ{gPRw6HW4qbaJsNmvo6 zZwgEE4Z<>M7R2JeregK`kr7pIL3+}}9|$F&UTSGMPgsf=X#c#q*&l5_s`cLEJRCu)&@EN#kuN9Wk-SAIby_@`}{=Gm}^iRSP{;JZV z-X)Y`)h=>q>RT!-t<&Kg|8d*`DPhG7sfiq#TmI1!IZG%dY6o({%dHr2pP;#+XrIy; zybZj_MOF;hcSzW$m%&56-&&&MQIjuz4YFekaf_GWreBH7papWljt%4F?69986blH& zAKVN}5GJ=H-SRyptUhdMn@xi@_#YTMLG^qk5L}6hgRw^;5lslCv17dw2yIk?zILGm z{V(Gdn1fsJsIbJ&Q6g#)i106I0IMM-7@2{gp*C3J6TyyM3^vBR5{zF5*9AWpIVJvR z>q$5~`kEE9)B$0|gg0o>@jgb2uTfYrqx==gV7{;ncU5Yp;A%bHeLtLnc{LgDv?5^8+|QW5ocN+@Cj8QqF7Rxhxub%GsJ*Wn&l?^HtZ zsmRnzaa;2^;#vLL1b%K+TH3$BROL$w%j`qKiif~9^ZBS^gsSQ^CE|aJS)*Wip+nNQ zGd44}+YNpz)2kUfv+_4EeODI$+f4r>;}02s%ILV#A?W&9p@TsDs?b3o%zB?LbPzdA zzg*}bQoLR0AQ&N%gFc(lEYHM?n12=HHH=Qig^Z?rJ=3=^db9FFOvf14 zG5#InX2vayKVp1<@!uGK$!OO5NEZE@EZUSa@pqj62gX;j7*Yc%1QL#?Ki|d&f;R;(QXL z$v5#l=3l@#g>fq5Rg9+m*O{Kp=*-HmW_lUpZHx_!O^jxF;Ve3qMR#V=>$7OH91}N< zk$)HS?_u0FhTQkF_&>>_f5!B$7=M$MznAGhGQP%mi1BU4oGXleIg8P>KR=5$`BSp^ zg-jPSUXzvo^(?xIX*c77tbDVcx-s}>{;gU04O#TcEIP*Y+ARJ?rZ+M6F@BeEJEK|e zPqXNsXVIqIL(G3TEB`U3cV_WTz2}*~e~f%n?iJ2|i*bnYBgPYqro7xIf>@W)p}Q5y}Dpwy;|$6t8-ON#J)>c z8?jj=Y>q~;Ys9;(>W%8uM9I{$>k<=%)G(yg>x(90w~UQ)w9yx%Z5-+kU}Ly)QBys( z-^M-~r8I|wv}nSvumf8M)z^&oU*o!(h=t=~en<_4!->{VB&r6GZpHd39yOKQV$tT9 zuNAq?C9z;25YlBq^;mAI_}{0->8)E8@8d#(w#L(a)0@l60@GdXzSg$zj2Sbq9&nMz zu2)^{Z9%^d7sP^kA{J5Oq0Zp-@KP}vsso2#^<;T>#2<~tprST~BEGQdZ}G*jspHC^ z=`U7;W|>p{6U}=5kMtHS@pxdOyj&4~tfNh*F2$xt4FreDsn}HDD2=O*mz0!{v8AoS zK*(3x5@;+hZB4{O{!(YSF&HVWM@P(|GXWG5iLQm0nu2TLcwac=gZbDd1|$Ap{Ccid z)V2CL*_$Ca4VxM>xX@a>wjA@tI!M;+P;3JdjnogNzK9-*1v^X83!SAp-efLq^~LE_ zKwh3JeF;5^jW7CjbprPLWNE=z$fvFetqEd}tJa15A($oQ%gf5j)JUSWF&K-h*ATH5 zp`n(Sflz0Z4V$H68_|KHbRFj~xM=es?T?^8sN1oDYr<9(8=1Ay*s8c8zcz@nXbVFz z(@A;@f?>5a*x8A|L&`71jBJfW5l>{I?kiSXLuxFf$D(U|2+oL_h#=;I;Wa)r8uuq~ zB17dwVF5Y~Rmvyu>E^Mh>SrX|8)XlAHY^`0=*|Jc(Wp%uBe#;v7oYkx2b>YOKRqdWe z|B}SwrOR7e>PjkOmF_vUZS$79Y8yLRJXM~#3nC?LOKYlKf$G+IRh8;ek8@RtXIa&1 z&%$tPOUIfuA$RATg>#n0m(?t4bJxyY*`$YD0X5MU2(ECiSUG2DY+-9vZOzL(inP}FRxH(*Rkr!s+m=QB`pme$tS`C^qzmfW(awXUY7qxP1j z;POgWA`)r#wO6;-hJzg|RxZ#h!&>Fyz%7Az`?6J`Xq#`v>Ln$seC`EvTUUgZ;Qf8e z#nv0>-$>BA*bnYcr|$!9{(U;V!74=VpVH|8qF+m=^DRO=`j>RN9hiGCoqiCw9Jmjd zdLx}yamripb~@cgXV5r0JpgRSk!=CWsleI!a^L|Rr0)Z^;5UI2K)Ub}v%D;wi-n~< z-*WLe6LRU_=-9!Xg1=nYbdV(S@~iU-=Gx9)o4Y~We9;Y8&9D~|nnjrt!|aimnM&dD|`4JFIyXU^(-qK?I^unUg19 z(ZJa?il9AX+pFpH6{E}3s61D8c@5TAEfddFUe(#kI%HIxR#6S!%@oD>`z!t`{+v$F z1Crf$D`?^-Yh|9=mD7<|;LOW+nPy3U6qC;(?LIrQay{sS|BXN9=54iF7bs_JfRpT`@_vHy2GG}qqy3TP-%9eYEbr)<9pa+$$iE+> zyzOtozd*yjsVWi22&wmrYHF*OrYp3;$9+^k} zwd1dz%EK5;0>q-b$88$#&2w~3xJTJy?aNV?%b22Zw+?bGkfSj9+x0gO{LKS@^T6Lc z@c*|5@T!Wu%vNO6CZ8Tjbm0vMd7Z;}aJ)mNCNZ6>8uTZp(otgFrx^4&B5*l!91_RN zJJtz`L#E6P0Hid_5bH!{(JT8ioQC*Az4>2UdLF? zxRkMpaSh`p#_urx3*!#PJ&b>3%&zx;O5enBVrPt8ZrGd8cn#yVoWGRmX-uzRR0|Eg zBU?C+A9oNV>x;ai>?Hkn<1LH<)}O`nWo3r`ai)tHH?!PrOgAvrG6oo%8KaD67aHYV z!sR~9<-|FE5$8Y0{G-fwFuj)PPR9F7jq$J)|6`^fU_9_O!=7qx=f5-mmyEw=e2no)#sS6~ zSnn-v*F#*-PR`%U_*=#o7++*u#PTmO{U^rP84ohP$@mWA5ytlzKVbZX@dV>ZM#1%1 z8OJkDWXxkckMXOF7cxG`m|&d3s506ar!ij6_*M2#G1L7VA7xC>WIV=L!MKaU~FV;Vhl01GPW_!<@V`Jw==F|+`xD* z%MEe+yP5y@jJ=Fo7{A5%ZN}?a{{yDK$M{c-HSCA&O#g)O0mk`^KWE&?*vfkU!St^f zA7Ol)v7hm2#?2h>dzgNf@p;A<8UMg|fbq|aZ!jKa{0`#~q=%! zFHcD3wM7OkL`l4(RrfUl^_YPzW(wa+3C7w)34U}5mN@6Qr|Z6ErkW#(lEy>`AKbwU zc_l=(_~I?1B+wB-c?RMQL79WI{5ZWFnMrt&7Yl}cq`;}Rur5kM5xl@xq6gb?$8Rzr z-KM5+#28{t!on2CVcW6J`2Z@g8Lfe)W;J5?QuUP9t7I zq+d4hBb08s>;D=rh9<406)vGdamZ8!Go~=#1d`3Xz6N3f{!IOP#Q_R&$P86dBe$H9 zq)e6Lo~f@Z4$0S9-!7?}8@R!=Z=$5BUtna5r9dnGOnq~o9F z;wt3R9?7K5ebO{msAc_Z`A2o45^hrv5?J zKS)7|Yg(28rk;^2&@@e znctx^G!kwLX_`!4}W&l=MmX1mS;?@aYKtT6O~4NAnAWIc`VuN4^g@ntxFyDbvO{m?Z8$$1y68##fH`AU%o+g4T5wak?L9yjsNR zeyEWA)BWHAUqL?(WYv4Cl)q5ax*L!h8MiBC`a-ds`Q~_R2cOC*o8&-hvi$?R4%x~a0;bn~Y9{8#(Vf5|arTl0fQBO(!8J?$p zyi1z#iCE!}>G(z=zMw(hZ>_9fxY+A)*VW_8v0iW0C~9tHMphu|Z4Tp&*sxa~J9vGG zc0q4chJ$$dwB)*LXOs)d@P-2IM9siI6{VAlK%%u3AB;8XaV@Bl)l@Hb&UblDimEZw zBelpg4p|Gm*yyDfMPV?d0d>TR`5TP{(CejB6t6E9^L2RXbzN`OEeo9U-Ib$r~xpr!Z8bHI{i+^HD=J^5vuO9W{J=2K41^?U@AKD$Q?+wb+8RIQq{3}sXGT%`( z$}tRq06xo`rJMC3-;vyz_&1;X1l=v literal 0 HcmV?d00001 diff --git a/smaz_test.c b/smaz_test.c index cf052c5..5f77319 100644 --- a/smaz_test.c +++ b/smaz_test.c @@ -5,30 +5,31 @@ #include "smaz.h" - void hexDump (char *desc, void *addr, int len) { int i; unsigned char buff[17]; unsigned char *pc = addr; - if (desc != NULL) + if (desc != NULL) { printf ("%s:\n", desc); + } for (i = 0; i < len; i++) { if ((i % 16) == 0) { - if (i != 0) + if (i != 0) { printf (" %s\n", buff); - + } printf (" %04x ", i); } printf (" %02x", pc[i]); - if ((pc[i] < 0x20) || (pc[i] > 0x7e)) + if ((pc[i] < 0x20) || (pc[i] > 0x7e)) { buff[i % 16] = '.'; - else + } else { buff[i % 16] = pc[i]; + } buff[(i % 16) + 1] = '\0'; } @@ -40,39 +41,131 @@ void hexDump (char *desc, void *addr, int len) { printf (" %s\n", buff); } -int main(void) { - char in[512]; - char out[4096]; - char d[4096]; - int comprlen, decomprlen; - int j, ranlen, x; - int times = 1000000; - struct SmazBranch *trie; - char *strings[] = { - "nojQfTh", +char *strings[] = { "ht", - "QtZpZuMhlzfgHFEGA.Kja/hsIayllFSAMFDl.fQ/bJdzfzCvxdclaIbzzWyhbOhCj.nydSJSbmPUzhOHYqszMhvIBqqsSluQkxLbcUuRVXmhS.CrCIBPpKXEPbyhLDLJNn.pVGFEdFmKDC VLAk.LWDqLOlmhyvviIzBOBWsWGQpIPJjftiEd updeZIZjBVrOmDPGJmcZZ CziiEeAhtvkUnYdaFuvKGvdmQnmGaZVtWCpaxpVozEWjc/HyGQFMaiMqjzKYmgPGzSxsFPuCjP JcHUinZvLWVPTSarCUUYQmSGGyPYfeXCEunngaxFxPleyZjNtClHCRdYdsxWkiopaZqU.kaINJmZiUmp", - "This is a small string", "foobar", "the end", + "nojQfTh", + "http://google.com", + "try it against urls", + "Mi illumino di immenso", + "http://programming.reddit.com", + "This is a small string", "not-a-g00d-Exampl333", + "/media/hdb1/music/Alben/The Bla", + "and now a few italian sentences:", "Smaz is a simple compression library", - "Nothing is more difficult, and therefore more precious, than to be able to decide", - "this is an example of what works very well with smaz", + "http://github.com/antirez/smaz/tree/master", + "L'autore di questa libreria vive in Sicilia", "1000 numbers 2000 will 10 20 30 compress very little", - "and now a few italian sentences:", + "this is an example of what works very well with smaz", "Nel mezzo del cammin di nostra vita, mi ritrovai in una selva oscura", - "Mi illumino di immenso", - "L'autore di questa libreria vive in Sicilia", - "try it against urls", - "http://google.com", - "http://programming.reddit.com", - "http://github.com/antirez/smaz/tree/master", - "/media/hdb1/music/Alben/The Bla", + "Nothing is more difficult, and therefore more precious, than to be able to decide", + "QtZpZuMhlzfgHFEGA.Kja/hsIayllFSAMFDl.fQ/bJdzfzCvxdclaIbzzWyhbOhCj.nydSJSbmPUzhOHYqszMhvIBqqsSluQkxLbcUuRVXmhS.CrCIBPpKXEPbyhLDLJNn.pVGFEdFmKDC VLAk.LWDqLOlmhyvviIzBOBWsWGQpIPJjftiEd updeZIZjBVrOmDPGJmcZZ CziiEeAhtvkUnYdaFuvKGvdmQnmGaZVtWCpaxpVozEWjc/HyGQFMaiMqjzKYmgPGzSxsFPuCjP JcHUinZvLWVPTSarCUUYQmSGGyPYfeXCEunngaxFxPleyZjNtClHCRdYdsxWkiopaZqU.kaINJmZiUmp", NULL }; +void test_compress_small_out_buff() { + char out[4096]; + struct SmazBranch *trie; + int comprlen = 0; + /* skip over the first test string that will give us only 1 byte */ + int j = 1; + + trie = smaz_build_trie(); + while(strings[j]) { + comprlen = smaz_compress_trie( + trie, + strings[j], + strlen(strings[j]), + out, + j + ); + if (comprlen != j+1) { + printf("Error: Expected return size: %d, got %d\n", j+1, comprlen); + exit(1); + } + j++; + } + + smaz_free_trie(trie); +} + +void test_null_term() { + char comp_out[256]; + char decomp_out[256]; + char no_null_str[4] = "test"; + char null_term_str[] = "test"; /* implicit null here */ + int comprlen = 0; + int decomprlen = 0; + struct SmazBranch *trie; + + trie = smaz_build_trie(); + comprlen = smaz_compress_trie( + trie, + no_null_str, + 4, + comp_out, + 256 + ); + decomprlen = smaz_decompress( + comp_out, + comprlen, + decomp_out, + 256 + ); + if (decomprlen != 4) { + printf("Error: Expected return size: %d, got %d\n", 4, decomprlen); + exit(1); + } + if (decomp_out[3] != 't') { + printf( + "Error: Incorrect final char on string: %c, expected %c\n", + decomp_out[3], + 't' + ); + exit(1); + } + comprlen = smaz_compress_trie( + trie, + null_term_str, + strlen(null_term_str)+1, /* include the null terminator this time. */ + comp_out, + 256 + ); + decomprlen = smaz_decompress( + comp_out, + comprlen, + decomp_out, + 256 + ); + if (decomprlen != 5) { + printf("Error: Expected return size: %d, got %d\n", 5, decomprlen); + exit(1); + } + if (decomp_out[4] != NULL) { + printf( "Error: Incorrect final char on string: %c, expected NULL", + decomp_out[4] + ); + exit(1); + } + + smaz_free_trie(trie); +} + +int main(void) { + char in[512]; + char out[4096]; + char d[4096]; + int comprlen, decomprlen; + int j, ranlen, x; + int times = 1000000; + struct SmazBranch *trie; + j=0; + + test_compress_small_out_buff(); + test_null_term(); trie = smaz_build_trie(); /* From f70686a8497f70f70472c99f7465779690f12a75 Mon Sep 17 00:00:00 2001 From: Richard Johnson Date: Fri, 28 Jun 2013 12:16:33 +0200 Subject: [PATCH 10/19] Expanded out tests and added better benchmarks --- README.md | 8 +- smaz_test.c | 317 +++++++++++++++++++++++++++++++++++++++++----------- 2 files changed, 257 insertions(+), 68 deletions(-) diff --git a/README.md b/README.md index 43aebb7..2b658cc 100644 --- a/README.md +++ b/README.md @@ -1,6 +1,9 @@ -SMAZ - compression for very small strings +Smaz ========================================= +Compression for very small strings +---------------------------------- + Smaz is a simple compression library suitable for compressing very short strings. General purpose compression libraries will build the state needed for compressing data dynamically, in order to be able to compress every kind @@ -69,7 +72,8 @@ compressed string didn't included a nulterm. Credits ------- -SMAZ was written by Salvatore Sanfilippo and is released under the BSD license. +Smaz was written by Salvatore Sanfilippo and is released under the BSD license. Check the COPYING file for more information. +Trie-based implementation by Richard Johnson, released under the same BSD license. diff --git a/smaz_test.c b/smaz_test.c index 5f77319..873544d 100644 --- a/smaz_test.c +++ b/smaz_test.c @@ -41,6 +41,13 @@ void hexDump (char *desc, void *addr, int len) { printf (" %s\n", buff); } +int g_seed = 0; + +int fastrand() { + g_seed = (214013 * g_seed + 2531011); + return (g_seed >> 16) & 0x7FFF; +} + char *strings[] = { "ht", "foobar", @@ -89,6 +96,8 @@ void test_compress_small_out_buff() { } smaz_free_trie(trie); + + printf("TEST PASSED :)\n"); } void test_null_term() { @@ -143,54 +152,137 @@ void test_null_term() { printf("Error: Expected return size: %d, got %d\n", 5, decomprlen); exit(1); } - if (decomp_out[4] != NULL) { - printf( "Error: Incorrect final char on string: %c, expected NULL", + if (decomp_out[4] != 0) { + printf( "Error: Incorrect final char on string: %c, expected \\0\n", decomp_out[4] ); exit(1); } smaz_free_trie(trie); + + printf("TEST PASSED :)\n"); } -int main(void) { - char in[512]; +void bench_trie_smaz() { + FILE *infile; + char *in; + char *comp_out; + char *de_comp_out; + long numbytes; + int num_loops = 1000; + + infile = fopen("war_of_the_worlds.txt", "r"); + if (infile == NULL) { + printf("Missing war of the worlds text, you can download the text here: http://www.gutenberg.org/ebooks/36 and save it as war_of_the_worlds.txt\n"); + exit(1); + } + + fseek(infile, 0L, SEEK_END); + numbytes = ftell(infile); + printf("Processing %d bytes, %d times\n", (int)numbytes, num_loops); + fseek(infile, 0L, SEEK_SET); + in = (char*)calloc(numbytes, sizeof(char)); + comp_out = (char*)calloc(numbytes, sizeof(char)); + de_comp_out = (char*)calloc(numbytes, sizeof(char)); + numbytes = fread(in, sizeof(char), numbytes, infile); + fclose(infile); + + { + struct timeval t1, t2; + int x; + struct SmazBranch *trie; + + trie = smaz_build_trie(); + + gettimeofday(&t1, NULL); + for (x = 0; x < num_loops; x++) { + smaz_compress_trie( + trie, + in, + numbytes, + comp_out, + numbytes + ); + } + gettimeofday(&t2, NULL); + printf("time = %u.%06u\n", (unsigned int)t1.tv_sec, (unsigned int)t1.tv_usec); + printf("time = %u.%06u\n", (unsigned int)t2.tv_sec, (unsigned int)t2.tv_usec); + + smaz_free_trie(trie); + } + + free(in); + free(comp_out); + free(de_comp_out); +} + +void bench_old_smaz() { + FILE *infile; + char *in; + char *comp_out; + char *de_comp_out; + long numbytes; + int num_loops = 1000; + + infile = fopen("war_of_the_worlds.txt", "r"); + if (infile == NULL) { + printf("Missing war of the worlds text, you can download the text here: http://www.gutenberg.org/ebooks/36 and save it as war_of_the_worlds.txt\n"); + exit(1); + } + + fseek(infile, 0L, SEEK_END); + numbytes = ftell(infile); + printf("Processing %d bytes, %d times\n", (int)numbytes, num_loops); + fseek(infile, 0L, SEEK_SET); + in = (char*)calloc(numbytes, sizeof(char)); + comp_out = (char*)calloc(numbytes, sizeof(char)); + de_comp_out = (char*)calloc(numbytes, sizeof(char)); + numbytes = fread(in, sizeof(char), numbytes, infile); + fclose(infile); + + { + struct timeval t1, t2; + int x; + + gettimeofday(&t1, NULL); + for (x = 0; x < num_loops; x++) { + smaz_compress( + in, + numbytes, + comp_out, + numbytes + ); + } + gettimeofday(&t2, NULL); + printf("time = %u.%06u\n", (unsigned int)t1.tv_sec, (unsigned int)t1.tv_usec); + printf("time = %u.%06u\n", (unsigned int)t2.tv_sec, (unsigned int)t2.tv_usec); + } + + free(in); + free(comp_out); + free(de_comp_out); +} + +void test_strings() { char out[4096]; char d[4096]; int comprlen, decomprlen; - int j, ranlen, x; - int times = 1000000; struct SmazBranch *trie; + int j = 0; - j=0; - - test_compress_small_out_buff(); - test_null_term(); trie = smaz_build_trie(); - /* - printf("here: %d\n", trie->children['9'-'\n']); - exit(0); - trie = newTrie(); - addToBranch(trie, "f", 1); - addToBranch(trie, "for", 3); - addToBranch(trie, "fo", 2); - exit(0); - for (x = 0; x < 'z' - '\n'; x++) { - printf("here: %c %d\n", x+'\n', trie->children['o'-'\n']->children[x]); - } - printf("here: '%s'\n", trie->children['f'-'\n']->children['o'-'\n']->shortcut); - */ - - while(strings[j]) { int comprlevel; - int comprlen2; - comprlen = smaz_compress_trie(trie, strings[j],strlen(strings[j]),out,sizeof(out)); - /*hexDump("out bad", &out, 400);*/ - comprlen2 = smaz_compress(strings[j],strlen(strings[j]),out,sizeof(out)); - /*hexDump("out good", &out, 400);*/ + comprlen = smaz_compress_trie( + trie, + strings[j], + strlen(strings[j]), + out, + sizeof(out) + ); comprlevel = 100-((100*comprlen)/strlen(strings[j])); decomprlen = smaz_decompress(out,comprlen,d,sizeof(d)); if (strlen(strings[j]) != (unsigned)decomprlen || @@ -206,49 +298,142 @@ int main(void) { } j++; } + + smaz_free_trie(trie); + + printf("TEST PASSED :)\n"); +} + +void test_random() { + char in[512]; + char out[4096]; + char d[4096]; + int comprlen, decomprlen; + int j, ranlen = 0; + int times = 1000000; + struct SmazBranch *trie; + + g_seed = 0; + trie = smaz_build_trie(); + + printf("Encrypting and decrypting %d test strings...\n", times); + while(times--) { + char charset[]="ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvxyz/. "; + ranlen = fastrand() % 512; + /*printf("doing %d\n", times);*/ + + for (j = 0; j < ranlen; j++) { + if (times & 1) + in[j] = charset[fastrand() % (sizeof(charset)-1)]; + else + in[j] = (char)(fastrand() & 0xff); + } + + comprlen = smaz_compress_trie(trie, in,ranlen,out,sizeof(out)); + /*comprlen = smaz_compress(in,ranlen,out,sizeof(out));*/ + decomprlen = smaz_decompress(out,comprlen,d,sizeof(out)); + + if (ranlen != decomprlen || memcmp(in,d,ranlen)) { + printf("Bug! TEST NOT PASSED\n"); + hexDump("in", &in, ranlen); + hexDump("out bad", &out, comprlen); + comprlen = smaz_compress(in,ranlen,out,sizeof(out)); + hexDump("out good", &out, comprlen); + exit(1); + } + } + + smaz_free_trie(trie); + + printf("TEST PASSED :)\n"); +} + +void bench_random_old_smaz() { + char in[512]; + char out[4096]; + int j, ranlen = 0; + int times = 1000000; + struct timeval t1, t2; + + g_seed = 0; + printf("Encrypting and decrypting %d test strings...\n", times); - { - struct timeval t1, t2; - double elapsedTime; + gettimeofday(&t1, NULL); - gettimeofday(&t1, NULL); - while(times--) { - char charset[]="ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvxyz/. "; - ranlen = random() % 512; - /*printf("doing %d\n", times);*/ - - for (j = 0; j < ranlen; j++) { - if (times & 1) - in[j] = charset[random() % (sizeof(charset)-1)]; - else - in[j] = (char)(random() & 0xff); - } + while(times--) { + char charset[]="ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvxyz/. "; + ranlen = fastrand() % 512; + /*printf("doing %d\n", times);*/ - comprlen = smaz_compress_trie(trie, in,ranlen,out,sizeof(out)); - /*comprlen = smaz_compress(in,ranlen,out,sizeof(out));*/ - decomprlen = smaz_decompress(out,comprlen,d,sizeof(out)); - - if (ranlen != decomprlen || memcmp(in,d,ranlen)) { - printf("Bug! TEST NOT PASSED\n"); - hexDump("in", &in, ranlen); - hexDump("out bad", &out, comprlen); - comprlen = smaz_compress(in,ranlen,out,sizeof(out)); - hexDump("out good", &out, comprlen); - exit(1); - } + for (j = 0; j < ranlen; j++) { + if (times & 1) + in[j] = charset[fastrand() % (sizeof(charset)-1)]; + else + in[j] = (char)(fastrand() & 0xff); } - - gettimeofday(&t2, NULL); - - /*elapsedTime = (t2.tv_sec - t1.tv_sec) * 1000.0; - elapsedTime += (t2.tv_usec - t1.tv_usec) / 1000.0;*/ - printf("time = %u.%06u\n", t1.tv_sec, t1.tv_usec); - printf("time = %u.%06u\n", t2.tv_sec, t2.tv_usec); + smaz_compress(in,ranlen,out,sizeof(out)); + } + gettimeofday(&t2, NULL); + + printf("time = %u.%06u\n", (unsigned int)t1.tv_sec, (unsigned int)t1.tv_usec); + printf("time = %u.%06u\n", (unsigned int)t2.tv_sec, (unsigned int)t2.tv_usec); +} + +void bench_random_trie() { + char in[512]; + char out[4096]; + int j, ranlen = 0; + int times = 1000000; + struct SmazBranch *trie; + struct timeval t1, t2; + + g_seed = 0; - printf("TEST PASSED :)\n"); + trie = smaz_build_trie(); + + printf("Encrypting and decrypting %d test strings...\n", times); + gettimeofday(&t1, NULL); + + while(times--) { + char charset[]="ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvxyz/. "; + ranlen = fastrand() % 512; + /*printf("doing %d\n", times);*/ + + for (j = 0; j < ranlen; j++) { + if (times & 1) + in[j] = charset[fastrand() % (sizeof(charset)-1)]; + else + in[j] = (char)(fastrand() & 0xff); + } + smaz_compress_trie(trie, in,ranlen,out,sizeof(out)); } + gettimeofday(&t2, NULL); + + printf("time = %u.%06u\n", (unsigned int)t1.tv_sec, (unsigned int)t1.tv_usec); + printf("time = %u.%06u\n", (unsigned int)t2.tv_sec, (unsigned int)t2.tv_usec); smaz_free_trie(trie); - getchar(); +} + +int main(void) { + + printf("Testing result when using too smaller buffer:\n-------------\n"); + test_compress_small_out_buff(); + printf("\n\nTesting null terminators stay there:\n-------------\n"); + test_null_term(); + printf("\n\nTesting a bunch of predefined strings:\n-------------\n"); + test_strings(); + printf("\n\nTesting a bunch of randomly generated strings:\n-------------\n"); + test_random(); + printf("\n\nBenchmarking old smaz on war of the worlds:\n-------------\n"); + bench_old_smaz(); + printf("\n\nBenchmarking new smaz on war of the worlds:\n-------------\n"); + bench_trie_smaz(); + printf("\n\nBenchmarking old smaz on random data:\n-------------\n"); + bench_random_old_smaz(); + printf("\n\nBenchmarking new smaz on random data:\n-------------\n"); + bench_random_trie(); + printf("\n\nDone.\n"); + return 0; } From c137ebd1c833265731ce8d039475cc56abb47d27 Mon Sep 17 00:00:00 2001 From: Richard Johnson Date: Fri, 28 Jun 2013 12:39:08 +0200 Subject: [PATCH 11/19] Start of moving towards having trie in contigious block --- smaz.c | 16 ++++------------ smaz.h | 4 +++- 2 files changed, 7 insertions(+), 13 deletions(-) diff --git a/smaz.c b/smaz.c index 96d61a0..1242392 100644 --- a/smaz.c +++ b/smaz.c @@ -81,7 +81,6 @@ static char *Smaz_rcb[254] = { }; #define SMAZ_END_LETTER 'z' -#define SMAZ_LETTER_COUNT ('z'+1) void smaz_free_trie(struct SmazBranch *t) { if (t->children != NULL) { @@ -91,7 +90,6 @@ void smaz_free_trie(struct SmazBranch *t) { smaz_free_trie(t->children[x]); } } - free(t->children); } if (t->shortcut != NULL) { free(t->shortcut); @@ -104,12 +102,11 @@ void smaz_add_to_branch(struct SmazBranch *t, char *remEntry, int value) { entryLen = strlen(remEntry); if (t->shortcut == NULL) { - t->shortcut = (char *)malloc(sizeof(char) * (entryLen + 1)); + t->shortcut = (char *)calloc(entryLen+1, sizeof(char)); t->shortcut_length = entryLen; memcpy(t->shortcut, remEntry, entryLen); t->shortcut[entryLen] = '\0'; t->value = value; - t->children = NULL; return; } @@ -142,15 +139,13 @@ void smaz_add_to_branch(struct SmazBranch *t, char *remEntry, int value) { tkey = t->shortcut[x]; newTBranch = (struct SmazBranch *)calloc(1, sizeof(struct SmazBranch)); - newTBranch->children = t->children; + /*newTBranch->children = t->children; */ + memcpy(newTBranch->children, t->children, SMAZ_LETTER_COUNT * sizeof(struct SmazBranch *)); + memset(t->children, 0, SMAZ_LETTER_COUNT * sizeof(struct SmazBranch *)); newTBranch->value = t->value; newTBranch->shortcut = ttail; newTBranch->shortcut_length = strlen(ttail); - if (t->children != NULL) { - free(t->children); - } - t->children = (struct SmazBranch **)calloc(SMAZ_LETTER_COUNT, sizeof(struct SmazBranch *)); t->children[tkey] = newTBranch; free(t->shortcut); t->shortcut = commonPrefix; @@ -169,9 +164,6 @@ void smaz_add_to_branch(struct SmazBranch *t, char *remEntry, int value) { vtail = (char *)calloc((entryLen - x + 1), sizeof(char)); memcpy(vtail, &remEntry[x+1], (entryLen - x)); - if (t->children == NULL) { - t->children = (struct SmazBranch **)calloc(SMAZ_LETTER_COUNT, sizeof(struct SmazBranch *)); - } if (t->children[vkey] == NULL) { struct SmazBranch *newVBranch; newVBranch = (struct SmazBranch *)calloc(1, sizeof(struct SmazBranch)); diff --git a/smaz.h b/smaz.h index f8e55fa..883ef55 100644 --- a/smaz.h +++ b/smaz.h @@ -1,9 +1,11 @@ #ifndef _SMAZ_H #define _SMAZ_H +#define SMAZ_LETTER_COUNT ('z'+1) + struct SmazBranch { int value; - struct SmazBranch **children; + struct SmazBranch *children[SMAZ_LETTER_COUNT]; char *shortcut; int shortcut_length; }; From 83afce23d1f2b786457ed88890e5a9a1e67d7068 Mon Sep 17 00:00:00 2001 From: Richard Johnson Date: Sat, 29 Jun 2013 12:11:24 +0200 Subject: [PATCH 12/19] Next step of move towards contigious block --- smaz.c | 21 +++++++++++++++------ smaz_test.c | 1 + 2 files changed, 16 insertions(+), 6 deletions(-) diff --git a/smaz.c b/smaz.c index 1242392..c7e7f82 100644 --- a/smaz.c +++ b/smaz.c @@ -83,6 +83,7 @@ static char *Smaz_rcb[254] = { #define SMAZ_END_LETTER 'z' void smaz_free_trie(struct SmazBranch *t) { + /* if (t->children != NULL) { int x = 0; for (x = 0; x < SMAZ_LETTER_COUNT; x++) { @@ -94,9 +95,13 @@ void smaz_free_trie(struct SmazBranch *t) { if (t->shortcut != NULL) { free(t->shortcut); } + */ free(t); } +int g_branch_counter = 0; +struct SmazBranch *g_trie; + void smaz_add_to_branch(struct SmazBranch *t, char *remEntry, int value) { int entryLen; entryLen = strlen(remEntry); @@ -138,7 +143,8 @@ void smaz_add_to_branch(struct SmazBranch *t, char *remEntry, int value) { tkey = t->shortcut[x]; - newTBranch = (struct SmazBranch *)calloc(1, sizeof(struct SmazBranch)); + /*newTBranch = (struct SmazBranch *)calloc(1, sizeof(struct SmazBranch));*/ + newTBranch = &g_trie[g_branch_counter++]; /*newTBranch->children = t->children; */ memcpy(newTBranch->children, t->children, SMAZ_LETTER_COUNT * sizeof(struct SmazBranch *)); memset(t->children, 0, SMAZ_LETTER_COUNT * sizeof(struct SmazBranch *)); @@ -166,7 +172,8 @@ void smaz_add_to_branch(struct SmazBranch *t, char *remEntry, int value) { if (t->children[vkey] == NULL) { struct SmazBranch *newVBranch; - newVBranch = (struct SmazBranch *)calloc(1, sizeof(struct SmazBranch)); + /*newVBranch = (struct SmazBranch *)calloc(1, sizeof(struct SmazBranch));*/ + newVBranch = &g_trie[g_branch_counter++]; newVBranch->value = -1; /*printf("asdf: %c\n", vkey+'\n');*/ t->children[vkey] = newVBranch; @@ -184,13 +191,15 @@ struct SmazBranch *smaz_build_custom_trie(char *codebook[254]) { struct SmazBranch *trie; int x; - trie = (struct SmazBranch *)calloc(1, sizeof(struct SmazBranch)); - trie->value = -1; + trie = (struct SmazBranch *)calloc(255, sizeof(struct SmazBranch)); + trie[0].value = -1; + g_branch_counter = 1; + g_trie = trie; for (x = 0; x < 254; x++) { - smaz_add_to_branch(trie, codebook[x], x); + smaz_add_to_branch(&trie[0], codebook[x], x); } - return trie; + return &trie[0]; } struct SmazBranch *smaz_build_trie() { diff --git a/smaz_test.c b/smaz_test.c index 873544d..2c8deb6 100644 --- a/smaz_test.c +++ b/smaz_test.c @@ -150,6 +150,7 @@ void test_null_term() { ); if (decomprlen != 5) { printf("Error: Expected return size: %d, got %d\n", 5, decomprlen); + hexDump("out", &decomp_out, decomprlen); exit(1); } if (decomp_out[4] != 0) { From 686c4d3104bab4f864146580bc1d12baabcb7d5a Mon Sep 17 00:00:00 2001 From: Richard Johnson Date: Sat, 29 Jun 2013 19:08:05 +0200 Subject: [PATCH 13/19] First attempt at not allocating strings --- smaz.c | 45 +++++++++++++++++++++++++++------------------ smaz.h | 5 +++-- smaz_test | Bin 24101 -> 28628 bytes smaz_test.c | 33 +++++++++++++++++++++++++++------ 4 files changed, 57 insertions(+), 26 deletions(-) diff --git a/smaz.c b/smaz.c index c7e7f82..ff9c9af 100644 --- a/smaz.c +++ b/smaz.c @@ -106,12 +106,12 @@ void smaz_add_to_branch(struct SmazBranch *t, char *remEntry, int value) { int entryLen; entryLen = strlen(remEntry); - if (t->shortcut == NULL) { - t->shortcut = (char *)calloc(entryLen+1, sizeof(char)); + if (t->use_shortcut == 0) { + /*t->shortcut = (char *)calloc(entryLen+1, sizeof(char));*/ t->shortcut_length = entryLen; memcpy(t->shortcut, remEntry, entryLen); - t->shortcut[entryLen] = '\0'; t->value = value; + t->use_shortcut = 1; return; } @@ -137,29 +137,38 @@ void smaz_add_to_branch(struct SmazBranch *t, char *remEntry, int value) { int tkey; struct SmazBranch *newTBranch; + + tkey = (int)t->shortcut[x]; - ttail = (char *)calloc((t->shortcut_length - x + 1), sizeof(char)); - memcpy(ttail, &t->shortcut[x+1], (t->shortcut_length - x)); - - tkey = t->shortcut[x]; - - /*newTBranch = (struct SmazBranch *)calloc(1, sizeof(struct SmazBranch));*/ newTBranch = &g_trie[g_branch_counter++]; - /*newTBranch->children = t->children; */ - memcpy(newTBranch->children, t->children, SMAZ_LETTER_COUNT * sizeof(struct SmazBranch *)); + memcpy( + newTBranch->children, + t->children, + SMAZ_LETTER_COUNT * sizeof(struct SmazBranch *) + ); memset(t->children, 0, SMAZ_LETTER_COUNT * sizeof(struct SmazBranch *)); + newTBranch->value = t->value; - newTBranch->shortcut = ttail; - newTBranch->shortcut_length = strlen(ttail); + + ttail = (char *)calloc((t->shortcut_length - x + 1), sizeof(char)); + memcpy(ttail, &t->shortcut[x+1], (t->shortcut_length - x)); + memcpy(&newTBranch->shortcut[0], &t->shortcut[x+1], (t->shortcut_length - x)); + if (strlen(ttail) != strlen(newTBranch->shortcut)) { + printf("WTF?\n"); + exit(1); + } + + newTBranch->shortcut_length = strlen(newTBranch->shortcut); t->children[tkey] = newTBranch; - free(t->shortcut); - t->shortcut = commonPrefix; - t->shortcut_length = strlen(commonPrefix); + /*free(t->shortcut);*/ + /*t->shortcut = commonPrefix;*/ + t->shortcut[x] = 0; + t->shortcut_length = strlen(t->shortcut); t->value = -1; } else { - /* the value of t remains */ - free(commonPrefix); + /* the value of t remains + free(commonPrefix);*/ } if (x < entryLen) { /* we can assign the v to a child */ diff --git a/smaz.h b/smaz.h index 883ef55..e3229e4 100644 --- a/smaz.h +++ b/smaz.h @@ -4,10 +4,11 @@ #define SMAZ_LETTER_COUNT ('z'+1) struct SmazBranch { - int value; struct SmazBranch *children[SMAZ_LETTER_COUNT]; - char *shortcut; + char shortcut[12]; + int use_shortcut; int shortcut_length; + int value; }; struct SmazBranch *smaz_build_trie(); diff --git a/smaz_test b/smaz_test index 7feeded31e10b9b8dc33acec371c1a026caa136d..5b299ebdc30742391f6a20b2b9eadc8e858cfe18 100755 GIT binary patch literal 28628 zcmeHwd3;;Nwf{)66Jv;FAVApKTb=?jO=Oz{nm_=_mYgd`PVCrOfjY7z+e)-J(p9{a zM-yzP5srh~(!#GRU4LoId!;Q6VQFx7pe+O_O9_-fS>z-@SQ5gL==VKy&$VSG?(cox z@AH0taTDL~oHOT~nK^T2?#$J_(%0SIDznKXjKeIh6GVMj;E*i+EX0~+l4RaYahh<5 zFNh1pIB@du=a4ysddk3KY}&vAma_o!@Hf+}GiI6{GGX9FI$IcM)R2(M8#x@D)bT#I zP8ukLA{nGdb)7O#lECFMVPM(mI=`0D(4*>+UM=g@vYvqhTrmTU_EFpDuYudwV0M5a z+Cd;(@;rhhOGoV+T7c5E4x$JhqwqdS31Oh2R|h?+=YRDu>bZc+8_O>*aQh5oQj~^6 zO;^n*4F{%$L(yc%w2qlqO}lD_JrT2CA=_PyijeG_c?)Hg3_nf+MRlHtKZ*g$-+aft zk+r3_I)D2e|M_h{U3YO|;Z@I&uJMQDfBCo*l4qSlIz9$}>lDrYa4ZoN&GDcwAXiF0j8U;%hlno({dh1a=6F37m5o=} zr`u-?r^`o@BdIIwSE6~s%>OLIDT?AeQ2CNWGA7Uz>-0y5c7EtInxTGxH1Y2E=65kH z^TZ~mCx${3DDwZnbn*g{H;X@krepv7^#m}B!r)2T7=a;f9KGn%MW@hn?Ifn8aS$s5j_)xR>g~MDlH;aEp z7Jo?=AC4S8mS^#chPgttX7R0A{I)E<(RY-sb{{H8kB2AI-=b5ELadzyI}Fv#l@F?l z$V`41XX_UoNDF z5lvk%G$85Qh$hz$t&#MtM3c*h+9cgYG`V_cxuoMnpGI`8q*oA4t{rknx`}8Ct)VhW zFC&^?Cyl&1Zw(mc_q8=YM!6`1fY8I7Pb4JNuBjV z!q#m4qDvsC22E-z|K<4zrir_xP+$I&pdb@}ZFl}dNSdgq8*bXs+}v!pez6DsCcSG1 zTquXi)y~=?S*TaRO^>K3D6{{j9Y)P`9J=9nCX=P_O$BWGy{WJ*?MZ#+O=-5n-qg*u zV`}OnHFcwHuZo-((41Ch>UmGNY#3wzQo&i zvu!zi-MXWBz}-7&?VW~LIrymTCs~cmP&+u1_Vhn2%Xg;q_N)HA-qbtk?|hcY809$k z^hkeEy${~T?d{9|1apbk|FSprW~Ftr-ej+T5Mkg=ZTD>3Uzo~&0TSxA*YehTZKYet zC7zPKUQ7OCU^|bJ+#YN1M`)1a=pM7R_jx^)XYG9esnoXer?(ARHsyVw{z5yg@7&aR z57L~n$*i4Wz5k4Jn^R|;r*>~QZ!&4)t()`CU7vTJ^XN{PeCt8o_km%%o?*Mw$nI0+ zgRpyaI}HCuE8HXJdbN8`-qG!5?X+g1L7BT>GijFH;|0YEg@qI0K|YtAS8~9aT0@i< zAy&}eg<|t;=)6hj-Sjn|Vp2(MQ~O*trH^>2C)JeKR_yKfPC^7c?M*%51wAQ!1q8k5 zV510cAGkIB162Q#1BT@DUNW4{C#w)B_M+1ZykwYk)zssCSHKT>{p&I(PMrAYxi1}_ zyatChl-li0K{wB~?VaLZty#l$wYqkz{c*Eubv-8r)$SK9YRTgic9YUGKA{5GtSm&G zwcXmAK&PwLtu9-sil9=QsR8mvA%@#`u+1m zePHyR+nt8|?z|KN6k5~4v)&ITbatDYCD%MeqZ`vjp3T$WVS)#2Cg*yw<$1Vm=pEEo z)_v$hzxMbEs3qUH;U;ftzw<_#u1Dk_xf=_O zp>;5&&->2B&PC3J&W433{sVRGp(Q9Xp!VfWo&p;xCKVNo^18Kn^5GhD+|0t8` z9n{AJXlu_nl;i?E)0_IhnR>!K>rH2$Y`ArU9a)$N*I!DN$+&wT)4uHN{@6qljd+kw z+IVMwomuriPGLT&qyd#d)S2|b0h?2MAG2=U4{bR{+$C?;_+Rf1+oqblrhl$S&mydP z;arR#YtJ5tk#%b?6=&V-vN?8-KjAKWZ2cUQyZ5NI=PIyeqFv@&W_MY?5k^w&hKq0o ziclJ3QL%0wRQslGS9{;K_RJ-v)JLw=d!E!@clqlscw(j}^$&M{*kP!C zg5xe9q~zmBf;}78u!BELUvUIufudpPbhOM^uMEd~>Ie{~ZmtXq$*ciE;^r`o%7*_Xz4KG_j=y4`p zC69ro0XZe~H1SHFM+J*%m`*BrpX#JIyHB8devi5%g`(-6ki4#W`=;&^r4DhhaG=Jz z_4WDv;mKyVb#qX-dtSHp3{iR3tvg+_4k!0}u;B301pbfonIF=CP5zboBTpZNQxWlb zCOHD{HEV@vKwkGChT&dvd_jrJHc?F-Q%zXLVT8MoJ(1Qk8`q#BR4}=;0K;&i2}=;y zWRt7ph}ZP#sPTw-p$GMmTE+iC34wJXoMxxqHQ`)yM<(!=9sE!rvv&c(fNCpTcBme+gqmYxw0$bpmX z1Zux^{kKtl@_GUJ0n!B`sY!nfxe=G-f+^^aXQ@Ai+ED>sk3$1X1ihbz8o^I}-21jO zg*7Ba{Ln&*X-rLds=omz9(vtrlQHzZ{XQ-t$Vcg7Xdg9LS&CB3!)^$eo!y7f5xdOJ^={#TPVc_t-(0CQdb=1a>w!pO+kLjDZLn^hcv1P_*6$nq?l-LIKj;~G7xmZfU-QXV+kd3`pIx6f@!$`2|7?8?ixb{6 zJ5g%7o8*t~vL@d^rBknM?Kh02L&%{3o`l&tPoJ$XA92YJj@N7z)N2Nmvs`b z!~4a){B=%6ga(gST15ViA6+U^`AIOg9V?!kL3i(vYmq-~TC!Z!%iS6#=t-Dmv)tNa zD*yvUlw$#b#nHjb$$!+f-Mb3XQ&E~T^@5suHMJcZAuNpG&h)tuowXxb>FtY%8jRkj zz)!D2Kfr508E)tr=&IN#ko}RiPQ$c8RnS9hM{1k&TI$8EVy1(tQ4KA9dAA&?^h58U zxg+gGRIEKT1#|wW7EA)8d=LxmVkER+LPHWa-DGlik1Y>Lk*>qVQ|-S`$~d>s%!5|R zU6kBx?npbNz#TTqdVo$CXlU0gEI_-DVISrM%ewgvn+3S$6V2K?0SU55MX9Ox`wHdO zW3AkJJoTkVjjhKvqqJAH4e6T?t81IuUrqatr?Brxy{d0L-p8^j{SaMd`%kLGec14H zAIneeN>*4mqeUpWXD}I7Q?H~yhMPwDy#OvFzmr!+{Qgz=og9Z74ykPq=EEb>;~cZ; zXWrJmL+yU#cj&{^fppARdZ(X7u665PDZI5+@2H=Er+3u-NTtt`B2Pb{k>xUo7?wMS zOyVG|akR=X#n`r6g^j4(w`7t%*O&yT)QMwZ&hJ1X; z@G;4s%EyOw{>eUm9vS3YtHgKH$Fo0$knY(nNEz$Z;q|bbr^<5Y zDKOC4doWX~ebdA>3PP7wGPI$&0&d6Et5k2F7xTZ8b$fG$tP-wdeN$h)8?(7oq1`Pa z+lk)kLG|<%Nk8t2V8X!+i@OA=?dcUL%-d&^TB%*s%q1j?DW(`Q50D_;9C<5Uj`gV8 zH(Lr~jgc-l#1xpK@YQACpP=YCUW{&qj7WPQ(2ZZKJu!aKKeWP;j}D2Ooi^y z2gow+Gtw$7eH7jqH3jBV-DwJh!-uCpYwvoLLen6)M#p0HfmL9Q9|7>%)_qwcayyz) z-FMM(>1}xU63|$-r?x-xr75_7Tn`V>GO;*y;E`icOIJt+8VZL#B2Z=AF-=bcQxB4F zL*48DbY7oLT0A(Ae)X+P2J@fu5l4lU=RW7d*CUN<+(V5OUpwvX$@kq!8>&mNH_HG1 zECrR%i6<3V<%8b-du(SSHyy*3;6bt`qiSny)`a*W3Zv_~tMB54u%vGFRL}L+%$r}U zKH#jVbh)eMsA}py+awrNAE{eV->`7e;w7FfOPBeY{DEMz=Yf{i(2AAeNYwLStZh|1 zp(R(VsSUPb$U8$H(CS0QZCC}Q7dJVkNZep6lX!=1CJ>io2cV;~tF#$u7t(fR?QUAL z;1XR0QlyTguDpLHRxihk*S?WKWLbM?$?EE_!Ohp#)c%KM6ghD-(wiEpF%2EO4)g6{ zCl))-wfkwg<4p~!AN^C^b|7Ev&s%`MLDe+qDH-(6dWhr9S?uCPB zBL;3(NI&RA6l|Sk3>;U`+2iPM0yK?dMR>@EBgh4ko%UXD+$_vAlz6=D|0sFmCjWKl%_ z-1a5-EPX9(>Qn!Y*C~KS-#Oofn!3dZT0~MUR4A(G?wW$My2XfXin9iA>Po zIXNU+A-pGl^9%(Z``00H8|M0)7{e6_;e94*qQ}aeBjsVf3&QBgoR&57VZxGqj(JbU z#_&w|ujf5y=u{)gk`E2vNEeHE(+M9h79$ja3O7NHb17vjn)qvoesQFH05r;eIibpFXJt{^gK zReS9zqo)YbLjj>rXgkvP@5judAMyiC`5~v;KdtgIMR@xs&nBRcJhMOVqz^m056$$J zyx~k8_oR-h+m2g2spr$5zJhq|eOtSze4oD0{luF(>cS!_Gw~&Pt6GP!k-8Tx`6#6I;cb0C(-PZ1%GjTU{4Mpk)S@$lehQps1;}hadmm=_Jc0d0idJIk8 zXT`_F1^A4)51$tw%Ou}}Df$Fh{~S7fJG?@i9b{QQOxRm4Zm7DxP-v||;fn@@qP2?F zpdyr5vxqeV6-BhShWxFN@M)smmk>(32;%IE3MC|>0YwDmpJ>&zwyR4^@nB9|G{9U! zg8+oIm{3|pdkB`pC?qVj7;FaPL7_AWJc1L9Y9iXAP!&RPJZlq+2&GwsqKXIvMJN`< z6Fblfi#Vzk4~ck8gcA5rMbU&0HiAA(z^HOuC=n5iqsDEho+6q$$zZ1lK*J|Oei4mn zA~6+4Nl{aTKPbXJ;e)6aiisGCiy}wyDPpSP6A2}Tr;kJ{9uCr4LkST?UT{Jo8s91u ze++a?B;kdCC0n`IG4p7^6A2>dt z#32s{B!s^eRf(g>)u>q~(#UMZA3Olx3nh$dhu|$mw4oR@9qq%}mw?$2T!u>dMJyRd zf5AmsLUf6ELL{Ri6oDw3=ZAryxNf#sEz74;1RiWsAc2ne!@ghO*{Rv3A`VBihv2)_ zA_$i$c-#s>01x}{s1&O0N1sH6IWcv!u)sy%GlzqVg#|v`C`_I8!W{AahdCU&6Rfy$ zuP}$hcUjQLM?@a=Zi_h*3=0ct_@Xd%?L#WsQYB2+{X>|efom;j@8d`X`(RnyYC*5B z0@>1FL2JA6$UD1XEcPAf#iA>b^xb0)cOF1RLJ6-05jW3*e)zSpz$4GXTC-w-8y^$q zSaUv@{@72<(b)YKwB;C);dd+uK%WKi{1udw7&KiH6c&WS?dGUfYeB^H3Ugax z%4tW11>x{JbKG|;v_l`5XP3T*jP}+a3$vokK*<5+dSQ+Szl3B99Bn};?6IJepE3FJ z%yEnmb0XLX&o*N@Y(WowXpYAc7DU3`!mRloH8m|WCqjP{7L4Zc=6K>|7}MTIR?rQe z7J35uu|I%rJrk!;waI^x1*5BtGJ|g*Gx1BDv>IW)ZuXmyh)qOJJlHNwfk`0aJA|pQ z9NtpCg;pxsYV+z~Oqi3={lbDV@l#>;DL*r}`<^oe@c>*h{#|f=4VAeQ(enbm%PjF_?%=?R5D66S#N z1rug))6_aR&38UdOVF4(yyU(OCs6NKUcVfF{tqb3NGn~-k(U*@oKpQ&v&4cg#;V(bK!v(16v z3KSfSZ-GQCDa=isUpEIrYt2DlhcE~I&*K!Rz$ti8nBr%cqe=;g@b75=DatxFmV-J7yMY{+5NG|aX38ok_EHW3&Mg4ZyO%`5aw22ldxb$`3%Wm zu`nesMIzLU5fIaQ%prd_IzFH{%%Q;B$cR0JkXGiJR!2}p~!2HiYiZ< zLs2Wp*Z{&CT&_!q{f8J}k~>hTiOuQI;Lc!=>ZqalBU=@X0v9Ct?k1S1X^zsOj^IGyoY zMnliQ3g*ulBj1plJ4SvT^B0eie*^OaIr%G?Zev`XlYa}-w=#Z%aRcLb7{AB(V@9LC zzhU|hIea633-kZPxSjFOjC&ak`Inh~jq$CV{6kD1XEaaK$H6$pGZ+nfXXVh7a%d%o zo}5D)b_|?4Mt&LduVlP>3^_**zdDDW&-7A8e@=dc>6;mEW&8%?2F71A{+7|G|K=Rp z;BU|2Kg#sej4$NmznnuKVESFg4|DR3ay}k|Z{(YJJ~rraIdpLjJ&EaybNH7rJ(Y1f z49_Ej1KHsCYN2&AsXEE4N9%EzTWLpt}ZFW z_K5bd@y1wlBNj%D?Xh?`kg#hV8a9EEpmMD;Icc9%2x2S2XxJBT!Ahm6Q<)r?Jh@P38|v8^70H$_DYA0& z<{K5f8cKztu+W7inY|F(Y3!t9O};qo^I$uGo#TqS<_7G~^^Gh4CS_XtR?P?Ah*RH4+a7uO&V&zdQ`!G zpOT1RSD@ep8c5LId75upOIcZ9n!CdnX$w!EK3(8&lYg#O+#PK}zlQe3gIY2kRT81D zAhl*HoTJHBkuVOH_eA}%cpNH9b13QyEB;nr9NRuF4VwO`O3<)5*;4n!-c=CWQSWvPs%*t+6r4tRS$c6)H5S1(wIvu0#<7e1SNat} zlos*DS5l>8VTeY2UDWp{b|IfVw~q6Wp>1>bV#Q{!lE5p^3dV?gYhfKS}X5q0&fVOonh@ zQa|dnWo^rnHLc;U<`%WeJ;!OEy8_*v@c25z;i`IPO_eKbZ>}qCst$BDcU7+L2>8Q3 zPg7Ud;?CBl`K^^J?9t9ZeRX|Pq;_Ff>wIuOKMs#S^laI$V?hquYUFu7pS zl1OX4y)s_usi@oYjFyT{JSsw``HNxU$XH(Y7eogUYqtXu&a~AvNs*#GWu9~{exmwUQ*Skaw z20DCUXS3SntZQfqCRTNNyJDsG_==9YuBOGyYP%LTtqMew{@{Y(isnUKr5$aqiss5W z3l|2$-lZ$+J2iW9!D@S;&R6YTY;USfRLAQUub5ZSG}l|z6!1oT$=Xn5z`aOqSu)qN z%-1!iJ|0>WUs0E^tM-sqqgGYBlD-we%2o5Vxy!1m1Dy*sw|iAn(+YdSURAf!*||gw ztWaZh9j?WzqgBl<6*b-^TGia@WvinLYC;Q}oh>!hO=@XFVo6s`{oFu!zPhr*SzWu> zRa#ot6iLqOtXu3<7kPrKwOH%I<^@faE49^XbA2+_wxX`SVR5*$xouIYHm|0#vVQS= z&%ByBZCdAoie>X+OG--F*S1fLsU!!^Bakb4~Sz*U3uM6DqptoYK zy&|#HS+PLVylSkpuGJM^Hm|;`x^`LJ0(-Q!+FiX`tyxlewUvp^mUzMyuJm+u zR@W@B&$lQ1HO=$<&2y8@)$yia^-{mn>uT;;uvDGfQ0t9E76u!P@npKta?O1hTIb+z z|L#m?FX5g{rXXL4fyXkLW1#N_7Mp-iWHJt5@spWM8&CnRC;I73hTbB2L{7tr(DHt^{?2nx^^0a_FlR~1dFww~Tz zutr>e!8Mmpw_QT2r0>Gtz@wQAnEUqGDT^`!no$o5DH6JW2 zn&K={oJOnXP%U-;UIhPb`5R(Ja`Y3~odNzJ_`8S&U-grJH&|e0U0xO0^%e~(=I$KB zPHC9>S%ZEpV*E*JKRR`t8Krky<`fO)S!Tg^)X~nO0;xd$|2gCjV=P36{pB6YU#_C% zmJ)Km>!j{<%kn8!KZLzvj3IgtgzCT1414_)9~;Oo>nJ|D$H6(4^I)pXB^@*z7P=j( z*KEYYl)q;(rLZ@4T*x?^Q?!4%<*rl5#c(?}z+M5y@{_~%)Y0vnGaR>dMFVcjzC>Jk26jz(do@obbNtvALoC@^b<@ArgyVmDdY2;PcN6z@ifzaWO;h{ zosJ6_pW*xxrk`VaGSm5NC&IXc`DUX$raxo7+n7Ga`JXa9o-xS$7g_#mO#g%F2UzZP z=D)@CzG-^B-(fj=?UW9U<<4cfH<`bb>5YujbE z#uQ^e;{f9wjCV4ApYcbGKV$qQLZ(IFH+PA>(%D?`C|Q@d?H~ zj1RN?(@ghrzdy(Hi;Ul9xqmRdf^mrB_f@9fU_8KB&G-)E`;2SY-chDMVSI-11fz-N zdKvQ=3mDH}v@)K}crN2%#tRs~z<3GcrHnKAIk0=V{g*R;8e=)*492f8Uco#y>MY$@nzmvy3k=?qhs~@t=%uF&<=mm+=F}A9H<+;JPH!+H9 zb$;u0I!dEB&8e)sTA8vCAI)pYQZGKzkH?f5_N(mErDYjKZFdco#wD z;IrHWy@ruZG$Jn^4EsodQ*B{Q*h5jgDqyE4D1;r45`>T!qmR^uJ=ogVj0aAFjjaLb z8kDg6>1hQNsKW|BGGWsf!5bTPJUt>$c4=F;(}dR@=#yYp#UxG&CVz&s>Q$Dg6U#q0n@`y~nDE^aXGqbO*q3j*OqVD!1Q|IjhPihmMfyhn4PNE|h0iTme;~&Jip(83Z9+0zf8_hl zb(#A{L}e_v9!T{io-mo=8yn#(e^M5s!{3G_Y{w`YTs|4`j0S z>6={XnE7ASe*&`E`YP+Ite;z6F8vJjDfW!|8~c3Y`|9a4Y&wkiH1v&jy@b5%@{RAG z-alWe4ad7qjl{oU=hYm2<33B8^#=?^MnliQ13CI@YL`pNHMP3J3x=Rh3Wd>t!$^>b z(f%b%m&>drOC_8npJPrOff%*FOcx(M8~T?IbMyqs(qZ74NE>C#3_jh;<)6rAB5mjy zY@&^^WAG$4=~M^bV7eG2U2BZ`8^emSPgQ;gO{_R>%!&Wp@^z_GpDn2EOta&pO!_`_ z<2sq0X+cel>)&X8-thgs(fs`3`+B4K9KB3FCS&nqN44 ze{VGZbYaZjqxolGK6YfYWZEJo2>Pz*+`}S@gfWk1=UIdm^Wv%Gi-&*bVYK|2!kEuS z^UuQkv^<+7(-v_y=ABdVCkkU;$yT!9DIH_J7|lO-_*2lpFp`7Lzl}F<%PrR2I#RNg?B9l1T4>Vp)r|`o}$v@c-Zt%_MM@LS% zH%R%B`WxeB1@o8lJY$T<4z_b|j-9dU-y`juCCYO8>zk5)rnuMbKx*XpF_I=cWt7{G zzX6}>-VFh` z(<$Og@4%Dg&w^u3xa)3QH>6pdrhtDI+I2U+lZFoYEC_H0^Otjbjk(js`~jPT#D#oj z1GtF!dnY-FE#$K{z@X$y)~FMG#jJu=TW&nx%69g1|K_vSH<^E!`SKYry$|p9%h%=1F2bk1s>y3*eusKIJ5zb-gg+#3A|Ti`Ja@sgril@0U2FY5A-Z zQZ3A1ewNOc&+dVrWd7bubw2Gw=;#HX;;n=2>s%pjV|n9up$z^vCEuV%@aHTqCOIT? z7iVo_{{HiHzQX*+!9P`;KgaUMb#^Dq?~{Bcvfv2I-+Qh@a^IvY-rnC5twat})~nN&YF;6@I*&ll45l zL`}z+bI=`}Kr#}cpUIKsxaYZKF;xqkHSR`(qGF8nNGbAOk1T~A@Nc9ST2Nt11L}zt z_c!SYU}GcQO=|STwRE1d##1>uNB&ljXq@AnU*YsN&abMfcQ-UPI4ivF zMnNSs>ThU3N&c*lKap&dcc$fJ!(sgF5B;hS-GS17 z7f4nbH?)F5T~)Tn7YHw4q!B(fyd|rZ^0r~4Kbg>C z5uMF3X7smov+R{d`t_vImh>NC8qK3%j`>IP8XL2mEq~-GH-FR*ML|3fYsA|c0laWR zWwdpU$m3U<8a?yTae+{@F_}OQv~VXi;(HG0CMkD`9x08(Ju}Mt*#RVPbLugvhoAg~ zsgV-NXEOGNyAGxhS9o8ru?fReG~*>4Dh)<)Lz81l{?=6X4_%EIq#vDX4R*Maco{<0 x3a_E?D?mD}_lgW-`NLQvs#KT8>eY=4f-NC>^QSV5FB8ER_|T7NmbkLjR9Lp}s;P}CWcNzT9kfrJn7V=|eUBmZtL67YGpI)Y&i@CnB{BnTXXCjlL zG#qNYZf0pXFg+ZKB-*F9S6ny!x|t>MXvsCQ-T5$tWNQ{IktUgbEC59|PsSg`0Oj9( z?+^CB`peSW4ip_Zw)nT}tF;5iUm$rK{-_M2U8>lCG+o4RUgD!F#=O3bob%3*OJRH_ ze(@OmrZM>B|1*`dXbgVk7<_7i76&DK4 z-GG!3X8Z`Rx4AVM@y2yuO!s;P!op8ca2+z*VxfrMz8;Ug!nGkMgKCuDaES zqGG;CAlfRLV!@!O@wn$ydao&&Q8IHlT|SZ=NnKNNEwxC&e^z;?6h$eag4u@d7SI%M zX3*&uhR&jy==;pu_O0(AOmoD)Fg-aGnnV%)pG+q%B6&qT0h+FZH#bP6*dF#J(x!iD zj-xmqz8noQIRSTy8S~1`;PX@?v+^_eR#O;Qkij?S4$4+D_?h$TvB zDl+(J@bKlx;7=UpqP-dX;i(t-wHf?#GUS(M@ZsR$t09A*H_S!(UGL_meuWmL=D_^a z>zIGEu3fo1O_B7qM=Dg2p878Cw#yty5#2)BLkH8iroKTmxqN7!q+cPLTs<@(>HS1o ziGE1Z&k;?o9ojDGr->$)4sDh6<3y7yhc-z1mqe2bhuS3luS8Qw3^hplpNS?{4%JHf z`$Urqha8f=muPa`P?@CfBAQ$_q=I%Xb1ro*an>)X(>A`lxlo9J)|)eR8nU%yx4j*V z@&Qj@*zWN3J#2R%x>Jv1YAzp0#lIq9HcHZZrzadL6sqh6TF)_Cw+-%iV!PHLMrj_k zYsn+!16tS4-2Q7z9pWi+TIyC9q4i!Z^M){uPk>noY+=HIgdT&TIkK}wL%A|Im`?JrA z25rf!Vf<%*8fn(0!_DeO5!|evB1k>%m}3+D2g|tG=;l2M>h=t0$p1`35EsmX|N?)(pvd(gv9^zc)d5ohhT^^~Aqt?x;x zr1@V$uco&B)#%dmk>%`q*W&E@YueWHAMEoo_Ls6y)* z(3SK)IdyN%yq0Lm!*wLM;l>TK5krZ2PdILBK);OZ1gUL2T0zv{msYLos70G~AZ|DO zs&x%mwB)!Y5K2$pEroi=)sV`p1F<*ujGKX^g{r&t_MJ^lBmJdy-I!mFdZ=FgyhjBt zvMu-ar!-WQK6v}iOzXC8&6t0?-fhv6>nO<9=W0D^TXz{vN_#P>%2Ilf@bu1{Fu&(h zTlbfUEqCvohTQK!kQ}B$8oIi@LW6L=r*}4lQO;H=%!LM6SS~6&sLw&dxx*z-%`EwI ziq5?`cC%m&4+r~jx&6p>o5tQNMXT(pA-m3AOXaG=<+|X9@*SD^O=`Oyca^^-$HCm*;w$kC(NU`lIRpCi@>l73t@X~v zGbyc~mAY5X$jHR=N89-20XP@)3ucX?6EyI@nNMQlQWq12XTbSDckefI-RNWY#=lDO z@25_FCb=wo92BOW)H{;99Y6oAkh-6x(w`c3BV>%4b7jrBhW5iWe`B_Hq+W&`x{qsq z7L+H6@y*(ZPD49iYUdl;ZmEsfO?^*ln|1d}?axwOtc~GeXctKB0z>=Hm|s&386sTw ze959g)wQc-WP%;K7JL~uH2&PlQ(E#f=Ms1F6S)xTdUqP0xdkZ|Li2l@^0nknJYDAZ zyk+ZNgVE75;Ot#mVCx=45LeCm!f9Lha?gI-mifqX+IG0a6W|MHMUGv+e29)OWIbA6 zr3Ja3acO78Ia6#slaYaPXT6eqef^t#-?V7{S2cf{MwCdM_=H066;JYQgeN^L^HZOI zMYYa4V%u^Hgo-eULbR{b0!wqDnc4{nJh*bTzPQD0T?3={y0K33?@v939CvSZzMOvh zRjAqea|%%Gl++fPg+WQv^&wBP5o+zZ&ff7!rMFz`8qDe2lsszg8wyGBd7IlBJdWsHV2qoth=r&rj746r+X)AT~FuqZIoubc=gY# zl^+{A42tUZ_`js;bIH2o9R=hftdj7crD<$G^r@TY?8&JRm|P0nd#mLn?@W##vcgr4 zM=Y}UQ};n~uORPfNh=vWw|Dlzp*uj2bW|!%#i3`0W`IBPd~?>zGM1DdtJ9KS8qao3 zitaLsv|Y9VO~t%B^JwyjC;5)nwcC>V4lRl{o;-z&#Nk_S_h3zZ8@!M8%Mw|fET^vg zRk<7c{6jKqH(S^{^Y!hH29bOP-on&vNzIjIeJRU2L{DHXnZiSu>&tnL}01!8U$O3AOHkh68Cq3fD;P)q*S**n2@_axMg2-!T2tVp|h zp4G2*c71Mn40Vgg>84L`_AOF0|8unRRg3EJEKh!+S)QYTqauJRXsYRkwp^uFC3km) z?G=`)qQi(ckLAtH3WkdX5hvws-Lx^K3T!?2bJXPPxjS-CSd#XDQj^Or~hs zp5{vK!P8Vr{>eovj2d*i+rP`UBgZnk)AsijMD{L2eDAk))3(R91C1?yTJGB}22D@U z-rEaY*eg=~`lT+c#J4<;q2=nMq0{xA#hpBU^vB#!uH?H%k7E4UcAPI;>h=#h`&!b@ zu0gD-c3L*q=Qx}jKc}M%{c;%Evk+ClUA8UNNKlI&JJ%!&OZCLXSf*1njSShH z5T1>7;?e!7aVSN$V~Fg_SPu@zWAYOkF#ky@SohQr{ zo}5F?c(m=X+WJpyuCmVFqvTb-g0Ww<%JPFZZ-kHdDPQ1yMX6>wH`PTMPTbQx%tOM^L!B z@k||}`fUBXs%E{PIOxU%h&fG5{yBB%JsMGo=V>_Q&|n$Ri@Vm@b1ZV8Jf!FoDWhuu zqh%lYKDVgKK3Pki&@6J?=3)so8Iio@E|@{p%7+bx%VbL>2EtSfnom5IFGr1-TwC`h zSR-q}Ga8EKkTE;K7&kpVMyTXoT}6xStWdG_e2rRuTDZ`9RE~qL;}#lY_xu*6Xnr)p zwxixYqwAOjOTnDs_{WMc#~c8OyD**PQ2gJC_!{wo90i$Kdc+pMZo_Gcjg{_8^^l;awUmnfoW_G8~QY zi4-PqEX8`{z{p47O~F=%8kRiiO#YWU`O?wFOkT-)8hfS|{J0!wyR(Z`6g`IeLPc*4#vwGFa8uj<9LfSY|P@t_Nvops7j{L|M^*q3d4_8wjBwJtM5*WYt}pKuXp=@r}ZUp zq;XtJVk*Zo!d3LwluGvXRa@mlL%UD2?0yt3PP;A7R~5Z)**O#j0e4NwtN^aV`M}5? z5@&bGmz?$P&|h>uD&xVw zdrwYI30g+eujW7Iob~#ioMLh3s!w;?lCLK&?s?VLLt%yO`$uL`*u&qC!mzHLJf(b~ z|4WnKb=a2bGBR?e^wl2R@WmZ#e}IFS%{h~g{*&RK{@E(qgZ@UXsXCJXbhj;Wn2v+q z=s#u_mYRhe3g!aL#X0iWByrVR#Fk;Z%oYaFRwdRPxZ{G*0Cb zcBipF>)U3htjFj^hpO+SR${%?bt1>sLu(e>j%{`;aKjh6t>-Z5qC~1nOMW=^Qk`r2 z%b6GJ$wP9zUi2YWE~(kl%F~QEj02*s6XOzPIK09XDMa@`qFPJ-CG{j85?XJ!om?>D zWH+2lUb+?jeS93ul-`wYg^N=u)5Y&0W$WK3Gy1n19q`<6?mnbaS4)xokLgrO3q+1S ziZaPT~^ z038d3>7+%c8Qs|HR+#JIk22;8vTyi23@2b3?9#eUr1cA*pr5&oeQMXyT%4oz;ymm` zI`ILmud%o`&e~GHz=TDdooGE@7IePe5)}AhO{jW{XbGx9jW&sB6Hryf+Ln;N1rk18 zto6l(x>f{n_eF#n5|Mx^g7Qza=z81rrKO@N8WZ&>F0MlWLV8rFEn;m5C5KT-Sm;rd z8H@#m+9={d5sc^}(yWpRp%{MG!;gDvlL$pr5eSM>IL?O2F`>4KUC@Ms*89r3MEBd75<!0`z+26;FjF8nPpC59^3z_Jdck=cSj zcmTc^Y8YmR;4M|Op&B$D?Ze#{N3kKe3`Y4yG!a98!9{vpbc$G9BqAcz3Q;uAj{<_? zrrBbRte8$F*ssv2)e}Ft5^enl6co8kC>?4I)YtkR!#x&! zQHjQ`63UH*KU1RmUxXE7ph}4*ZbhaZT}=0uNw|mfPlXlX@SjS|_jk|^eWc7T{Qwzj zTYe}MRh@~N1M1B}i3Kl5vKfxHA{6#o(aFEJ_;QpOMu-v*dg0k7wNO~mLmw-#XxxfO z*e(>^_l%`+g%S__L0B=GCn&M_>nKeB5LrPNczWms=tqAAy5(HlLUS$tDOQZGHp&bh zMrQn9aMR}t<)+zhLn1mEIkDhcVF?s~jO`SbiRJK?dM{e3>T8rW!KhFYk%PjDG4anr z@u@#n*82Wv3F0U9MC?1@`kq88{u3)k$^%I0yTJ2*DU`LrZ{gPRw6HW4qbaJsNmvo6 zZwgEE4Z<>M7R2JeregK`kr7pIL3+}}9|$F&UTSGMPgsf=X#c#q*&l5_s`cLEJRCu)&@EN#kuN9Wk-SAIby_@`}{=Gm}^iRSP{;JZV z-X)Y`)h=>q>RT!-t<&Kg|8d*`DPhG7sfiq#TmI1!IZG%dY6o({%dHr2pP;#+XrIy; zybZj_MOF;hcSzW$m%&56-&&&MQIjuz4YFekaf_GWreBH7papWljt%4F?69986blH& zAKVN}5GJ=H-SRyptUhdMn@xi@_#YTMLG^qk5L}6hgRw^;5lslCv17dw2yIk?zILGm z{V(Gdn1fsJsIbJ&Q6g#)i106I0IMM-7@2{gp*C3J6TyyM3^vBR5{zF5*9AWpIVJvR z>q$5~`kEE9)B$0|gg0o>@jgb2uTfYrqx==gV7{;ncU5Yp;A%bHeLtLnc{LgDv?5^8+|QW5ocN+@Cj8QqF7Rxhxub%GsJ*Wn&l?^HtZ zsmRnzaa;2^;#vLL1b%K+TH3$BROL$w%j`qKiif~9^ZBS^gsSQ^CE|aJS)*Wip+nNQ zGd44}+YNpz)2kUfv+_4EeODI$+f4r>;}02s%ILV#A?W&9p@TsDs?b3o%zB?LbPzdA zzg*}bQoLR0AQ&N%gFc(lEYHM?n12=HHH=Qig^Z?rJ=3=^db9FFOvf14 zG5#InX2vayKVp1<@!uGK$!OO5NEZE@EZUSa@pqj62gX;j7*Yc%1QL#?Ki|d&f;R;(QXL z$v5#l=3l@#g>fq5Rg9+m*O{Kp=*-HmW_lUpZHx_!O^jxF;Ve3qMR#V=>$7OH91}N< zk$)HS?_u0FhTQkF_&>>_f5!B$7=M$MznAGhGQP%mi1BU4oGXleIg8P>KR=5$`BSp^ zg-jPSUXzvo^(?xIX*c77tbDVcx-s}>{;gU04O#TcEIP*Y+ARJ?rZ+M6F@BeEJEK|e zPqXNsXVIqIL(G3TEB`U3cV_WTz2}*~e~f%n?iJ2|i*bnYBgPYqro7xIf>@W)p}Q5y}Dpwy;|$6t8-ON#J)>c z8?jj=Y>q~;Ys9;(>W%8uM9I{$>k<=%)G(yg>x(90w~UQ)w9yx%Z5-+kU}Ly)QBys( z-^M-~r8I|wv}nSvumf8M)z^&oU*o!(h=t=~en<_4!->{VB&r6GZpHd39yOKQV$tT9 zuNAq?C9z;25YlBq^;mAI_}{0->8)E8@8d#(w#L(a)0@l60@GdXzSg$zj2Sbq9&nMz zu2)^{Z9%^d7sP^kA{J5Oq0Zp-@KP}vsso2#^<;T>#2<~tprST~BEGQdZ}G*jspHC^ z=`U7;W|>p{6U}=5kMtHS@pxdOyj&4~tfNh*F2$xt4FreDsn}HDD2=O*mz0!{v8AoS zK*(3x5@;+hZB4{O{!(YSF&HVWM@P(|GXWG5iLQm0nu2TLcwac=gZbDd1|$Ap{Ccid z)V2CL*_$Ca4VxM>xX@a>wjA@tI!M;+P;3JdjnogNzK9-*1v^X83!SAp-efLq^~LE_ zKwh3JeF;5^jW7CjbprPLWNE=z$fvFetqEd}tJa15A($oQ%gf5j)JUSWF&K-h*ATH5 zp`n(Sflz0Z4V$H68_|KHbRFj~xM=es?T?^8sN1oDYr<9(8=1Ay*s8c8zcz@nXbVFz z(@A;@f?>5a*x8A|L&`71jBJfW5l>{I?kiSXLuxFf$D(U|2+oL_h#=;I;Wa)r8uuq~ zB17dwVF5Y~Rmvyu>E^Mh>SrX|8)XlAHY^`0=*|Jc(Wp%uBe#;v7oYkx2b>YOKRqdWe z|B}SwrOR7e>PjkOmF_vUZS$79Y8yLRJXM~#3nC?LOKYlKf$G+IRh8;ek8@RtXIa&1 z&%$tPOUIfuA$RATg>#n0m(?t4bJxyY*`$YD0X5MU2(ECiSUG2DY+-9vZOzL(inP}FRxH(*Rkr!s+m=QB`pme$tS`C^qzmfW(awXUY7qxP1j z;POgWA`)r#wO6;-hJzg|RxZ#h!&>Fyz%7Az`?6J`Xq#`v>Ln$seC`EvTUUgZ;Qf8e z#nv0>-$>BA*bnYcr|$!9{(U;V!74=VpVH|8qF+m=^DRO=`j>RN9hiGCoqiCw9Jmjd zdLx}yamripb~@cgXV5r0JpgRSk!=CWsleI!a^L|Rr0)Z^;5UI2K)Ub}v%D;wi-n~< z-*WLe6LRU_=-9!Xg1=nYbdV(S@~iU-=Gx9)o4Y~We9;Y8&9D~|nnjrt!|aimnM&dD|`4JFIyXU^(-qK?I^unUg19 z(ZJa?il9AX+pFpH6{E}3s61D8c@5TAEfddFUe(#kI%HIxR#6S!%@oD>`z!t`{+v$F z1Crf$D`?^-Yh|9=mD7<|;LOW+nPy3U6qC;(?LIrQay{sS|BXN9=54iF7bs_JfRpT`@_vHy2GG}qqy3TP-%9eYEbr)<9pa+$$iE+> zyzOtozd*yjsVWi22&wmrYHF*OrYp3;$9+^k} zwd1dz%EK5;0>q-b$88$#&2w~3xJTJy?aNV?%b22Zw+?bGkfSj9+x0gO{LKS@^T6Lc z@c*|5@T!Wu%vNO6CZ8Tjbm0vMd7Z;}aJ)mNCNZ6>8uTZp(otgFrx^4&B5*l!91_RN zJJtz`L#E6P0Hid_5bH!{(JT8ioQC*Az4>2UdLF? zxRkMpaSh`p#_urx3*!#PJ&b>3%&zx;O5enBVrPt8ZrGd8cn#yVoWGRmX-uzRR0|Eg zBU?C+A9oNV>x;ai>?Hkn<1LH<)}O`nWo3r`ai)tHH?!PrOgAvrG6oo%8KaD67aHYV z!sR~9<-|FE5$8Y0{G-fwFuj)PPR9F7jq$J)|6`^fU_9_O!=7qx=f5-mmyEw=e2no)#sS6~ zSnn-v*F#*-PR`%U_*=#o7++*u#PTmO{U^rP84ohP$@mWA5ytlzKVbZX@dV>ZM#1%1 z8OJkDWXxkckMXOF7cxG`m|&d3s506ar!ij6_*M2#G1L7VA7xC>WIV=L!MKaU~FV;Vhl01GPW_!<@V`Jw==F|+`xD* z%MEe+yP5y@jJ=Fo7{A5%ZN}?a{{yDK$M{c-HSCA&O#g)O0mk`^KWE&?*vfkU!St^f zA7Ol)v7hm2#?2h>dzgNf@p;A<8UMg|fbq|aZ!jKa{0`#~q=%! zFHcD3wM7OkL`l4(RrfUl^_YPzW(wa+3C7w)34U}5mN@6Qr|Z6ErkW#(lEy>`AKbwU zc_l=(_~I?1B+wB-c?RMQL79WI{5ZWFnMrt&7Yl}cq`;}Rur5kM5xl@xq6gb?$8Rzr z-KM5+#28{t!on2CVcW6J`2Z@g8Lfe)W;J5?QuUP9t7I zq+d4hBb08s>;D=rh9<406)vGdamZ8!Go~=#1d`3Xz6N3f{!IOP#Q_R&$P86dBe$H9 zq)e6Lo~f@Z4$0S9-!7?}8@R!=Z=$5BUtna5r9dnGOnq~o9F z;wt3R9?7K5ebO{msAc_Z`A2o45^hrv5?J zKS)7|Yg(28rk;^2&@@e znctx^G!kwLX_`!4}W&l=MmX1mS;?@aYKtT6O~4NAnAWIc`VuN4^g@ntxFyDbvO{m?Z8$$1y68##fH`AU%o+g4T5wak?L9yjsNR zeyEWA)BWHAUqL?(WYv4Cl)q5ax*L!h8MiBC`a-ds`Q~_R2cOC*o8&-hvi$?R4%x~a0;bn~Y9{8#(Vf5|arTl0fQBO(!8J?$p zyi1z#iCE!}>G(z=zMw(hZ>_9fxY+A)*VW_8v0iW0C~9tHMphu|Z4Tp&*sxa~J9vGG zc0q4chJ$$dwB)*LXOs)d@P-2IM9siI6{VAlK%%u3AB;8XaV@Bl)l@Hb&UblDimEZw zBelpg4p|Gm*yyDfMPV?d0d>TR`5TP{(CejB6t6E9^L2RXbzN`OEeo9U-Ib$r~xpr!Z8bHI{i+^HD=J^5vuO9W{J=2K41^?U@AKD$Q?+wb+8RIQq{3}sXGT%`( z$}tRq06xo`rJMC3-;vyz_&1;X1l=v diff --git a/smaz_test.c b/smaz_test.c index 2c8deb6..3a5cb0a 100644 --- a/smaz_test.c +++ b/smaz_test.c @@ -49,6 +49,7 @@ int fastrand() { } char *strings[] = { + " ", "ht", "foobar", "the end", @@ -69,6 +70,7 @@ char *strings[] = { "Nel mezzo del cammin di nostra vita, mi ritrovai in una selva oscura", "Nothing is more difficult, and therefore more precious, than to be able to decide", "QtZpZuMhlzfgHFEGA.Kja/hsIayllFSAMFDl.fQ/bJdzfzCvxdclaIbzzWyhbOhCj.nydSJSbmPUzhOHYqszMhvIBqqsSluQkxLbcUuRVXmhS.CrCIBPpKXEPbyhLDLJNn.pVGFEdFmKDC VLAk.LWDqLOlmhyvviIzBOBWsWGQpIPJjftiEd updeZIZjBVrOmDPGJmcZZ CziiEeAhtvkUnYdaFuvKGvdmQnmGaZVtWCpaxpVozEWjc/HyGQFMaiMqjzKYmgPGzSxsFPuCjP JcHUinZvLWVPTSarCUUYQmSGGyPYfeXCEunngaxFxPleyZjNtClHCRdYdsxWkiopaZqU.kaINJmZiUmp", + "oTnBmdtIaFEFHFpgqkGlYdCtqIXFTKIPfJsdIotaZ/oUGWaKHmBzzMQyKteDKLXHedxalAfHzAQTgesqyLzo/.rjxQzbWZPzUbqdnuceRejfVz/xpDBfCGUUdlLYkSyt.uRv.dQaJEW.bPsJrQWjNBbKLFbdLmauPiCdEVHgXKIZazGSriVrjQs.H.itMHFJDuajeCqOtKZFJdyUtEEqbbj.s.FQkAyXHdjHoQxDWvnFfgBMLXtFKJZvnRMiUfAgMJbH/TsXzMSKdlOHkxAJPWD//QbmuNyQWAHVIevtohUfRbCktvHfSuopjQSTWl/fpV/tNMCCSWOINMGptyRBZNobtdL.KMzKqvnnu.A.jWgOMtLrrHpcCB.GLIREreLBK.BsYABRttLHo/QhDrZNSzJPZQR.nPJEJvHMX/sO/H.tksygrsDlCIzyJMR.O.scMfNcfKufJrbeJYcALDfxRYHKTPLmmUeTe", NULL }; @@ -267,6 +269,7 @@ void bench_old_smaz() { void test_strings() { char out[4096]; + char out_good[4096]; char d[4096]; int comprlen, decomprlen; struct SmazBranch *trie; @@ -275,7 +278,7 @@ void test_strings() { trie = smaz_build_trie(); while(strings[j]) { - int comprlevel; + int comprlevel, comprlen_good; comprlen = smaz_compress_trie( trie, @@ -284,12 +287,25 @@ void test_strings() { out, sizeof(out) ); + + comprlen_good = smaz_compress( + strings[j], + strlen(strings[j]), + out_good, + sizeof(out_good) + ); + comprlevel = 100-((100*comprlen)/strlen(strings[j])); decomprlen = smaz_decompress(out,comprlen,d,sizeof(d)); - if (strlen(strings[j]) != (unsigned)decomprlen || + + if (comprlen != comprlen_good || + strlen(strings[j]) != (unsigned)decomprlen || memcmp(strings[j],d,decomprlen)) { printf("BUG: error compressing '%s'\n", strings[j]); + hexDump("in", strings[j], strlen(strings[j])); + hexDump("out bad", &out, comprlen); + hexDump("out good", &out_good, comprlen_good); exit(1); } if (comprlevel < 0) { @@ -333,9 +349,8 @@ void test_random() { comprlen = smaz_compress_trie(trie, in,ranlen,out,sizeof(out)); /*comprlen = smaz_compress(in,ranlen,out,sizeof(out));*/ decomprlen = smaz_decompress(out,comprlen,d,sizeof(out)); - if (ranlen != decomprlen || memcmp(in,d,ranlen)) { - printf("Bug! TEST NOT PASSED\n"); + printf("Bug! TEST NOT PASSED: %d\n", 1000000-times); hexDump("in", &in, ranlen); hexDump("out bad", &out, comprlen); comprlen = smaz_compress(in,ranlen,out,sizeof(out)); @@ -417,9 +432,15 @@ void bench_random_trie() { } int main(void) { + /* + struct SmazBranch *trie; + trie = smaz_build_trie(); + printf("val: %d\n", trie[0].children[' ']->value); + exit(0); + */ - printf("Testing result when using too smaller buffer:\n-------------\n"); - test_compress_small_out_buff(); + /*printf("Testing result when using too smaller buffer:\n-------------\n"); + test_compress_small_out_buff();*/ printf("\n\nTesting null terminators stay there:\n-------------\n"); test_null_term(); printf("\n\nTesting a bunch of predefined strings:\n-------------\n"); From 41eb727c6b1b0313b8ff721ccf6b0d1cacabda01 Mon Sep 17 00:00:00 2001 From: Richard Johnson Date: Sat, 29 Jun 2013 19:23:02 +0200 Subject: [PATCH 14/19] Fix for not-working contigious block --- smaz.c | 21 ++------------------- 1 file changed, 2 insertions(+), 19 deletions(-) diff --git a/smaz.c b/smaz.c index ff9c9af..f5c19a0 100644 --- a/smaz.c +++ b/smaz.c @@ -107,7 +107,6 @@ void smaz_add_to_branch(struct SmazBranch *t, char *remEntry, int value) { entryLen = strlen(remEntry); if (t->use_shortcut == 0) { - /*t->shortcut = (char *)calloc(entryLen+1, sizeof(char));*/ t->shortcut_length = entryLen; memcpy(t->shortcut, remEntry, entryLen); t->value = value; @@ -121,7 +120,6 @@ void smaz_add_to_branch(struct SmazBranch *t, char *remEntry, int value) { } else { int smallestLen = entryLen; int x; - char *commonPrefix; if (smallestLen > t->shortcut_length) { smallestLen = t->shortcut_length; @@ -129,14 +127,9 @@ void smaz_add_to_branch(struct SmazBranch *t, char *remEntry, int value) { for (x = 0; x < smallestLen && t->shortcut[x] == remEntry[x]; x++) { } - commonPrefix = (char *)calloc(x + 1, sizeof(char)); - memcpy(commonPrefix, t->shortcut, x); - if (x < t->shortcut_length) { - char *ttail; int tkey; struct SmazBranch *newTBranch; - tkey = (int)t->shortcut[x]; @@ -150,25 +143,17 @@ void smaz_add_to_branch(struct SmazBranch *t, char *remEntry, int value) { newTBranch->value = t->value; - ttail = (char *)calloc((t->shortcut_length - x + 1), sizeof(char)); - memcpy(ttail, &t->shortcut[x+1], (t->shortcut_length - x)); memcpy(&newTBranch->shortcut[0], &t->shortcut[x+1], (t->shortcut_length - x)); - if (strlen(ttail) != strlen(newTBranch->shortcut)) { - printf("WTF?\n"); - exit(1); - } newTBranch->shortcut_length = strlen(newTBranch->shortcut); + newTBranch->use_shortcut = 1; t->children[tkey] = newTBranch; - /*free(t->shortcut);*/ - /*t->shortcut = commonPrefix;*/ t->shortcut[x] = 0; t->shortcut_length = strlen(t->shortcut); t->value = -1; } else { - /* the value of t remains - free(commonPrefix);*/ + /* the value of t remains */ } if (x < entryLen) { /* we can assign the v to a child */ @@ -181,10 +166,8 @@ void smaz_add_to_branch(struct SmazBranch *t, char *remEntry, int value) { if (t->children[vkey] == NULL) { struct SmazBranch *newVBranch; - /*newVBranch = (struct SmazBranch *)calloc(1, sizeof(struct SmazBranch));*/ newVBranch = &g_trie[g_branch_counter++]; newVBranch->value = -1; - /*printf("asdf: %c\n", vkey+'\n');*/ t->children[vkey] = newVBranch; } smaz_add_to_branch(t->children[vkey], vtail, value); From 4373a59166898347674b7308fdf75f3fe27b8903 Mon Sep 17 00:00:00 2001 From: Richard Johnson Date: Sat, 29 Jun 2013 19:23:46 +0200 Subject: [PATCH 15/19] kill accidently commited bin --- smaz_test | Bin 28628 -> 0 bytes 1 file changed, 0 insertions(+), 0 deletions(-) delete mode 100755 smaz_test diff --git a/smaz_test b/smaz_test deleted file mode 100755 index 5b299ebdc30742391f6a20b2b9eadc8e858cfe18..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 28628 zcmeHwd3;;Nwf{)66Jv;FAVApKTb=?jO=Oz{nm_=_mYgd`PVCrOfjY7z+e)-J(p9{a zM-yzP5srh~(!#GRU4LoId!;Q6VQFx7pe+O_O9_-fS>z-@SQ5gL==VKy&$VSG?(cox z@AH0taTDL~oHOT~nK^T2?#$J_(%0SIDznKXjKeIh6GVMj;E*i+EX0~+l4RaYahh<5 zFNh1pIB@du=a4ysddk3KY}&vAma_o!@Hf+}GiI6{GGX9FI$IcM)R2(M8#x@D)bT#I zP8ukLA{nGdb)7O#lECFMVPM(mI=`0D(4*>+UM=g@vYvqhTrmTU_EFpDuYudwV0M5a z+Cd;(@;rhhOGoV+T7c5E4x$JhqwqdS31Oh2R|h?+=YRDu>bZc+8_O>*aQh5oQj~^6 zO;^n*4F{%$L(yc%w2qlqO}lD_JrT2CA=_PyijeG_c?)Hg3_nf+MRlHtKZ*g$-+aft zk+r3_I)D2e|M_h{U3YO|;Z@I&uJMQDfBCo*l4qSlIz9$}>lDrYa4ZoN&GDcwAXiF0j8U;%hlno({dh1a=6F37m5o=} zr`u-?r^`o@BdIIwSE6~s%>OLIDT?AeQ2CNWGA7Uz>-0y5c7EtInxTGxH1Y2E=65kH z^TZ~mCx${3DDwZnbn*g{H;X@krepv7^#m}B!r)2T7=a;f9KGn%MW@hn?Ifn8aS$s5j_)xR>g~MDlH;aEp z7Jo?=AC4S8mS^#chPgttX7R0A{I)E<(RY-sb{{H8kB2AI-=b5ELadzyI}Fv#l@F?l z$V`41XX_UoNDF z5lvk%G$85Qh$hz$t&#MtM3c*h+9cgYG`V_cxuoMnpGI`8q*oA4t{rknx`}8Ct)VhW zFC&^?Cyl&1Zw(mc_q8=YM!6`1fY8I7Pb4JNuBjV z!q#m4qDvsC22E-z|K<4zrir_xP+$I&pdb@}ZFl}dNSdgq8*bXs+}v!pez6DsCcSG1 zTquXi)y~=?S*TaRO^>K3D6{{j9Y)P`9J=9nCX=P_O$BWGy{WJ*?MZ#+O=-5n-qg*u zV`}OnHFcwHuZo-((41Ch>UmGNY#3wzQo&i zvu!zi-MXWBz}-7&?VW~LIrymTCs~cmP&+u1_Vhn2%Xg;q_N)HA-qbtk?|hcY809$k z^hkeEy${~T?d{9|1apbk|FSprW~Ftr-ej+T5Mkg=ZTD>3Uzo~&0TSxA*YehTZKYet zC7zPKUQ7OCU^|bJ+#YN1M`)1a=pM7R_jx^)XYG9esnoXer?(ARHsyVw{z5yg@7&aR z57L~n$*i4Wz5k4Jn^R|;r*>~QZ!&4)t()`CU7vTJ^XN{PeCt8o_km%%o?*Mw$nI0+ zgRpyaI}HCuE8HXJdbN8`-qG!5?X+g1L7BT>GijFH;|0YEg@qI0K|YtAS8~9aT0@i< zAy&}eg<|t;=)6hj-Sjn|Vp2(MQ~O*trH^>2C)JeKR_yKfPC^7c?M*%51wAQ!1q8k5 zV510cAGkIB162Q#1BT@DUNW4{C#w)B_M+1ZykwYk)zssCSHKT>{p&I(PMrAYxi1}_ zyatChl-li0K{wB~?VaLZty#l$wYqkz{c*Eubv-8r)$SK9YRTgic9YUGKA{5GtSm&G zwcXmAK&PwLtu9-sil9=QsR8mvA%@#`u+1m zePHyR+nt8|?z|KN6k5~4v)&ITbatDYCD%MeqZ`vjp3T$WVS)#2Cg*yw<$1Vm=pEEo z)_v$hzxMbEs3qUH;U;ftzw<_#u1Dk_xf=_O zp>;5&&->2B&PC3J&W433{sVRGp(Q9Xp!VfWo&p;xCKVNo^18Kn^5GhD+|0t8` z9n{AJXlu_nl;i?E)0_IhnR>!K>rH2$Y`ArU9a)$N*I!DN$+&wT)4uHN{@6qljd+kw z+IVMwomuriPGLT&qyd#d)S2|b0h?2MAG2=U4{bR{+$C?;_+Rf1+oqblrhl$S&mydP z;arR#YtJ5tk#%b?6=&V-vN?8-KjAKWZ2cUQyZ5NI=PIyeqFv@&W_MY?5k^w&hKq0o ziclJ3QL%0wRQslGS9{;K_RJ-v)JLw=d!E!@clqlscw(j}^$&M{*kP!C zg5xe9q~zmBf;}78u!BELUvUIufudpPbhOM^uMEd~>Ie{~ZmtXq$*ciE;^r`o%7*_Xz4KG_j=y4`p zC69ro0XZe~H1SHFM+J*%m`*BrpX#JIyHB8devi5%g`(-6ki4#W`=;&^r4DhhaG=Jz z_4WDv;mKyVb#qX-dtSHp3{iR3tvg+_4k!0}u;B301pbfonIF=CP5zboBTpZNQxWlb zCOHD{HEV@vKwkGChT&dvd_jrJHc?F-Q%zXLVT8MoJ(1Qk8`q#BR4}=;0K;&i2}=;y zWRt7ph}ZP#sPTw-p$GMmTE+iC34wJXoMxxqHQ`)yM<(!=9sE!rvv&c(fNCpTcBme+gqmYxw0$bpmX z1Zux^{kKtl@_GUJ0n!B`sY!nfxe=G-f+^^aXQ@Ai+ED>sk3$1X1ihbz8o^I}-21jO zg*7Ba{Ln&*X-rLds=omz9(vtrlQHzZ{XQ-t$Vcg7Xdg9LS&CB3!)^$eo!y7f5xdOJ^={#TPVc_t-(0CQdb=1a>w!pO+kLjDZLn^hcv1P_*6$nq?l-LIKj;~G7xmZfU-QXV+kd3`pIx6f@!$`2|7?8?ixb{6 zJ5g%7o8*t~vL@d^rBknM?Kh02L&%{3o`l&tPoJ$XA92YJj@N7z)N2Nmvs`b z!~4a){B=%6ga(gST15ViA6+U^`AIOg9V?!kL3i(vYmq-~TC!Z!%iS6#=t-Dmv)tNa zD*yvUlw$#b#nHjb$$!+f-Mb3XQ&E~T^@5suHMJcZAuNpG&h)tuowXxb>FtY%8jRkj zz)!D2Kfr508E)tr=&IN#ko}RiPQ$c8RnS9hM{1k&TI$8EVy1(tQ4KA9dAA&?^h58U zxg+gGRIEKT1#|wW7EA)8d=LxmVkER+LPHWa-DGlik1Y>Lk*>qVQ|-S`$~d>s%!5|R zU6kBx?npbNz#TTqdVo$CXlU0gEI_-DVISrM%ewgvn+3S$6V2K?0SU55MX9Ox`wHdO zW3AkJJoTkVjjhKvqqJAH4e6T?t81IuUrqatr?Brxy{d0L-p8^j{SaMd`%kLGec14H zAIneeN>*4mqeUpWXD}I7Q?H~yhMPwDy#OvFzmr!+{Qgz=og9Z74ykPq=EEb>;~cZ; zXWrJmL+yU#cj&{^fppARdZ(X7u665PDZI5+@2H=Er+3u-NTtt`B2Pb{k>xUo7?wMS zOyVG|akR=X#n`r6g^j4(w`7t%*O&yT)QMwZ&hJ1X; z@G;4s%EyOw{>eUm9vS3YtHgKH$Fo0$knY(nNEz$Z;q|bbr^<5Y zDKOC4doWX~ebdA>3PP7wGPI$&0&d6Et5k2F7xTZ8b$fG$tP-wdeN$h)8?(7oq1`Pa z+lk)kLG|<%Nk8t2V8X!+i@OA=?dcUL%-d&^TB%*s%q1j?DW(`Q50D_;9C<5Uj`gV8 zH(Lr~jgc-l#1xpK@YQACpP=YCUW{&qj7WPQ(2ZZKJu!aKKeWP;j}D2Ooi^y z2gow+Gtw$7eH7jqH3jBV-DwJh!-uCpYwvoLLen6)M#p0HfmL9Q9|7>%)_qwcayyz) z-FMM(>1}xU63|$-r?x-xr75_7Tn`V>GO;*y;E`icOIJt+8VZL#B2Z=AF-=bcQxB4F zL*48DbY7oLT0A(Ae)X+P2J@fu5l4lU=RW7d*CUN<+(V5OUpwvX$@kq!8>&mNH_HG1 zECrR%i6<3V<%8b-du(SSHyy*3;6bt`qiSny)`a*W3Zv_~tMB54u%vGFRL}L+%$r}U zKH#jVbh)eMsA}py+awrNAE{eV->`7e;w7FfOPBeY{DEMz=Yf{i(2AAeNYwLStZh|1 zp(R(VsSUPb$U8$H(CS0QZCC}Q7dJVkNZep6lX!=1CJ>io2cV;~tF#$u7t(fR?QUAL z;1XR0QlyTguDpLHRxihk*S?WKWLbM?$?EE_!Ohp#)c%KM6ghD-(wiEpF%2EO4)g6{ zCl))-wfkwg<4p~!AN^C^b|7Ev&s%`MLDe+qDH-(6dWhr9S?uCPB zBL;3(NI&RA6l|Sk3>;U`+2iPM0yK?dMR>@EBgh4ko%UXD+$_vAlz6=D|0sFmCjWKl%_ z-1a5-EPX9(>Qn!Y*C~KS-#Oofn!3dZT0~MUR4A(G?wW$My2XfXin9iA>Po zIXNU+A-pGl^9%(Z``00H8|M0)7{e6_;e94*qQ}aeBjsVf3&QBgoR&57VZxGqj(JbU z#_&w|ujf5y=u{)gk`E2vNEeHE(+M9h79$ja3O7NHb17vjn)qvoesQFH05r;eIibpFXJt{^gK zReS9zqo)YbLjj>rXgkvP@5judAMyiC`5~v;KdtgIMR@xs&nBRcJhMOVqz^m056$$J zyx~k8_oR-h+m2g2spr$5zJhq|eOtSze4oD0{luF(>cS!_Gw~&Pt6GP!k-8Tx`6#6I;cb0C(-PZ1%GjTU{4Mpk)S@$lehQps1;}hadmm=_Jc0d0idJIk8 zXT`_F1^A4)51$tw%Ou}}Df$Fh{~S7fJG?@i9b{QQOxRm4Zm7DxP-v||;fn@@qP2?F zpdyr5vxqeV6-BhShWxFN@M)smmk>(32;%IE3MC|>0YwDmpJ>&zwyR4^@nB9|G{9U! zg8+oIm{3|pdkB`pC?qVj7;FaPL7_AWJc1L9Y9iXAP!&RPJZlq+2&GwsqKXIvMJN`< z6Fblfi#Vzk4~ck8gcA5rMbU&0HiAA(z^HOuC=n5iqsDEho+6q$$zZ1lK*J|Oei4mn zA~6+4Nl{aTKPbXJ;e)6aiisGCiy}wyDPpSP6A2}Tr;kJ{9uCr4LkST?UT{Jo8s91u ze++a?B;kdCC0n`IG4p7^6A2>dt z#32s{B!s^eRf(g>)u>q~(#UMZA3Olx3nh$dhu|$mw4oR@9qq%}mw?$2T!u>dMJyRd zf5AmsLUf6ELL{Ri6oDw3=ZAryxNf#sEz74;1RiWsAc2ne!@ghO*{Rv3A`VBihv2)_ zA_$i$c-#s>01x}{s1&O0N1sH6IWcv!u)sy%GlzqVg#|v`C`_I8!W{AahdCU&6Rfy$ zuP}$hcUjQLM?@a=Zi_h*3=0ct_@Xd%?L#WsQYB2+{X>|efom;j@8d`X`(RnyYC*5B z0@>1FL2JA6$UD1XEcPAf#iA>b^xb0)cOF1RLJ6-05jW3*e)zSpz$4GXTC-w-8y^$q zSaUv@{@72<(b)YKwB;C);dd+uK%WKi{1udw7&KiH6c&WS?dGUfYeB^H3Ugax z%4tW11>x{JbKG|;v_l`5XP3T*jP}+a3$vokK*<5+dSQ+Szl3B99Bn};?6IJepE3FJ z%yEnmb0XLX&o*N@Y(WowXpYAc7DU3`!mRloH8m|WCqjP{7L4Zc=6K>|7}MTIR?rQe z7J35uu|I%rJrk!;waI^x1*5BtGJ|g*Gx1BDv>IW)ZuXmyh)qOJJlHNwfk`0aJA|pQ z9NtpCg;pxsYV+z~Oqi3={lbDV@l#>;DL*r}`<^oe@c>*h{#|f=4VAeQ(enbm%PjF_?%=?R5D66S#N z1rug))6_aR&38UdOVF4(yyU(OCs6NKUcVfF{tqb3NGn~-k(U*@oKpQ&v&4cg#;V(bK!v(16v z3KSfSZ-GQCDa=isUpEIrYt2DlhcE~I&*K!Rz$ti8nBr%cqe=;g@b75=DatxFmV-J7yMY{+5NG|aX38ok_EHW3&Mg4ZyO%`5aw22ldxb$`3%Wm zu`nesMIzLU5fIaQ%prd_IzFH{%%Q;B$cR0JkXGiJR!2}p~!2HiYiZ< zLs2Wp*Z{&CT&_!q{f8J}k~>hTiOuQI;Lc!=>ZqalBU=@X0v9Ct?k1S1X^zsOj^IGyoY zMnliQ3g*ulBj1plJ4SvT^B0eie*^OaIr%G?Zev`XlYa}-w=#Z%aRcLb7{AB(V@9LC zzhU|hIea633-kZPxSjFOjC&ak`Inh~jq$CV{6kD1XEaaK$H6$pGZ+nfXXVh7a%d%o zo}5D)b_|?4Mt&LduVlP>3^_**zdDDW&-7A8e@=dc>6;mEW&8%?2F71A{+7|G|K=Rp z;BU|2Kg#sej4$NmznnuKVESFg4|DR3ay}k|Z{(YJJ~rraIdpLjJ&EaybNH7rJ(Y1f z49_Ej1KHsCYN2&AsXEE4N9%EzTWLpt}ZFW z_K5bd@y1wlBNj%D?Xh?`kg#hV8a9EEpmMD;Icc9%2x2S2XxJBT!Ahm6Q<)r?Jh@P38|v8^70H$_DYA0& z<{K5f8cKztu+W7inY|F(Y3!t9O};qo^I$uGo#TqS<_7G~^^Gh4CS_XtR?P?Ah*RH4+a7uO&V&zdQ`!G zpOT1RSD@ep8c5LId75upOIcZ9n!CdnX$w!EK3(8&lYg#O+#PK}zlQe3gIY2kRT81D zAhl*HoTJHBkuVOH_eA}%cpNH9b13QyEB;nr9NRuF4VwO`O3<)5*;4n!-c=CWQSWvPs%*t+6r4tRS$c6)H5S1(wIvu0#<7e1SNat} zlos*DS5l>8VTeY2UDWp{b|IfVw~q6Wp>1>bV#Q{!lE5p^3dV?gYhfKS}X5q0&fVOonh@ zQa|dnWo^rnHLc;U<`%WeJ;!OEy8_*v@c25z;i`IPO_eKbZ>}qCst$BDcU7+L2>8Q3 zPg7Ud;?CBl`K^^J?9t9ZeRX|Pq;_Ff>wIuOKMs#S^laI$V?hquYUFu7pS zl1OX4y)s_usi@oYjFyT{JSsw``HNxU$XH(Y7eogUYqtXu&a~AvNs*#GWu9~{exmwUQ*Skaw z20DCUXS3SntZQfqCRTNNyJDsG_==9YuBOGyYP%LTtqMew{@{Y(isnUKr5$aqiss5W z3l|2$-lZ$+J2iW9!D@S;&R6YTY;USfRLAQUub5ZSG}l|z6!1oT$=Xn5z`aOqSu)qN z%-1!iJ|0>WUs0E^tM-sqqgGYBlD-we%2o5Vxy!1m1Dy*sw|iAn(+YdSURAf!*||gw ztWaZh9j?WzqgBl<6*b-^TGia@WvinLYC;Q}oh>!hO=@XFVo6s`{oFu!zPhr*SzWu> zRa#ot6iLqOtXu3<7kPrKwOH%I<^@faE49^XbA2+_wxX`SVR5*$xouIYHm|0#vVQS= z&%ByBZCdAoie>X+OG--F*S1fLsU!!^Bakb4~Sz*U3uM6DqptoYK zy&|#HS+PLVylSkpuGJM^Hm|;`x^`LJ0(-Q!+FiX`tyxlewUvp^mUzMyuJm+u zR@W@B&$lQ1HO=$<&2y8@)$yia^-{mn>uT;;uvDGfQ0t9E76u!P@npKta?O1hTIb+z z|L#m?FX5g{rXXL4fyXkLW1#N_7Mp-iWHJt5@spWM8&CnRC;I73hTbB2L{7tr(DHt^{?2nx^^0a_FlR~1dFww~Tz zutr>e!8Mmpw_QT2r0>Gtz@wQAnEUqGDT^`!no$o5DH6JW2 zn&K={oJOnXP%U-;UIhPb`5R(Ja`Y3~odNzJ_`8S&U-grJH&|e0U0xO0^%e~(=I$KB zPHC9>S%ZEpV*E*JKRR`t8Krky<`fO)S!Tg^)X~nO0;xd$|2gCjV=P36{pB6YU#_C% zmJ)Km>!j{<%kn8!KZLzvj3IgtgzCT1414_)9~;Oo>nJ|D$H6(4^I)pXB^@*z7P=j( z*KEYYl)q;(rLZ@4T*x?^Q?!4%<*rl5#c(?}z+M5y@{_~%)Y0vnGaR>dMFVcjzC>Jk26jz(do@obbNtvALoC@^b<@ArgyVmDdY2;PcN6z@ifzaWO;h{ zosJ6_pW*xxrk`VaGSm5NC&IXc`DUX$raxo7+n7Ga`JXa9o-xS$7g_#mO#g%F2UzZP z=D)@CzG-^B-(fj=?UW9U<<4cfH<`bb>5YujbE z#uQ^e;{f9wjCV4ApYcbGKV$qQLZ(IFH+PA>(%D?`C|Q@d?H~ zj1RN?(@ghrzdy(Hi;Ul9xqmRdf^mrB_f@9fU_8KB&G-)E`;2SY-chDMVSI-11fz-N zdKvQ=3mDH}v@)K}crN2%#tRs~z<3GcrHnKAIk0=V{g*R;8e=)*492f8Uco#y>MY$@nzmvy3k=?qhs~@t=%uF&<=mm+=F}A9H<+;JPH!+H9 zb$;u0I!dEB&8e)sTA8vCAI)pYQZGKzkH?f5_N(mErDYjKZFdco#wD z;IrHWy@ruZG$Jn^4EsodQ*B{Q*h5jgDqyE4D1;r45`>T!qmR^uJ=ogVj0aAFjjaLb z8kDg6>1hQNsKW|BGGWsf!5bTPJUt>$c4=F;(}dR@=#yYp#UxG&CVz&s>Q$Dg6U#q0n@`y~nDE^aXGqbO*q3j*OqVD!1Q|IjhPihmMfyhn4PNE|h0iTme;~&Jip(83Z9+0zf8_hl zb(#A{L}e_v9!T{io-mo=8yn#(e^M5s!{3G_Y{w`YTs|4`j0S z>6={XnE7ASe*&`E`YP+Ite;z6F8vJjDfW!|8~c3Y`|9a4Y&wkiH1v&jy@b5%@{RAG z-alWe4ad7qjl{oU=hYm2<33B8^#=?^MnliQ13CI@YL`pNHMP3J3x=Rh3Wd>t!$^>b z(f%b%m&>drOC_8npJPrOff%*FOcx(M8~T?IbMyqs(qZ74NE>C#3_jh;<)6rAB5mjy zY@&^^WAG$4=~M^bV7eG2U2BZ`8^emSPgQ;gO{_R>%!&Wp@^z_GpDn2EOta&pO!_`_ z<2sq0X+cel>)&X8-thgs(fs`3`+B4K9KB3FCS&nqN44 ze{VGZbYaZjqxolGK6YfYWZEJo2>Pz*+`}S@gfWk1=UIdm^Wv%Gi-&*bVYK|2!kEuS z^UuQkv^<+7(-v_y=ABdVCkkU;$yT!9DIH_J7|lO-_*2lpFp`7Lzl}F<%PrR2I#RNg?B9l1T4>Vp)r|`o}$v@c-Zt%_MM@LS% zH%R%B`WxeB1@o8lJY$T<4z_b|j-9dU-y`juCCYO8>zk5)rnuMbKx*XpF_I=cWt7{G zzX6}>-VFh` z(<$Og@4%Dg&w^u3xa)3QH>6pdrhtDI+I2U+lZFoYEC_H0^Otjbjk(js`~jPT#D#oj z1GtF!dnY-FE#$K{z@X$y)~FMG#jJu=TW&nx%69g1|K_vSH<^E!`SKYry$|p9%h%=1F2bk1s>y3*eusKIJ5zb-gg+#3A|Ti`Ja@sgril@0U2FY5A-Z zQZ3A1ewNOc&+dVrWd7bubw2Gw=;#HX;;n=2>s%pjV|n9up$z^vCEuV%@aHTqCOIT? z7iVo_{{HiHzQX*+!9P`;KgaUMb#^Dq?~{Bcvfv2I-+Qh@a^IvY-rnC5twat})~nN&YF;6@I*&ll45l zL`}z+bI=`}Kr#}cpUIKsxaYZKF;xqkHSR`(qGF8nNGbAOk1T~A@Nc9ST2Nt11L}zt z_c!SYU}GcQO=|STwRE1d##1>uNB&ljXq@AnU*YsN&abMfcQ-UPI4ivF zMnNSs>ThU3N&c*lKap&dcc$fJ!(sgF5B;hS-GS17 z7f4nbH?)F5T~)Tn7YHw4q!B(fyd|rZ^0r~4Kbg>C z5uMF3X7smov+R{d`t_vImh>NC8qK3%j`>IP8XL2mEq~-GH-FR*ML|3fYsA|c0laWR zWwdpU$m3U<8a?yTae+{@F_}OQv~VXi;(HG0CMkD`9x08(Ju}Mt*#RVPbLugvhoAg~ zsgV-NXEOGNyAGxhS9o8ru?fReG~*>4Dh)<)Lz81l{?=6X4_%EIq#vDX4R*Maco{<0 x3a_E?D?mD}_lgW-`NLQvs#KT8>eY=4f-NC>^QSV5FB8ER_|T7N Date: Sun, 30 Jun 2013 12:22:41 +0200 Subject: [PATCH 16/19] Make loop a little nicer --- smaz.c | 35 ++++++++++++++++++----------------- 1 file changed, 18 insertions(+), 17 deletions(-) diff --git a/smaz.c b/smaz.c index f5c19a0..4205dfa 100644 --- a/smaz.c +++ b/smaz.c @@ -213,30 +213,31 @@ int smaz_compress_trie(struct SmazBranch *trie, char *in, int inlen, char *out, while (remaining_length--) { unsigned char nextChar; struct SmazBranch **children; + struct SmazBranch *tmpBranch; + char *shortcut; + int shortcut_length; + nextChar = in[length]; if (nextChar > SMAZ_END_LETTER) { break; } children = branch->children; - if (children && children[nextChar]) { - struct SmazBranch *tmpBranch; - char *shortcut; - int shortcut_length; - tmpBranch = children[nextChar]; - shortcut = tmpBranch->shortcut; - shortcut_length = tmpBranch->shortcut_length; - length++; - if (shortcut) { - if (length <= inlen && memcmp(shortcut, in+length, shortcut_length)) { - length--; - break; - } - length += shortcut_length; + if (!(children && children[nextChar])) { + break; + } + + tmpBranch = children[nextChar]; + shortcut = tmpBranch->shortcut; + shortcut_length = tmpBranch->shortcut_length; + length++; + if (shortcut) { + if (length <= inlen && memcmp(shortcut, in+length, shortcut_length)) { + length--; + break; } - branch = tmpBranch; - continue; + length += shortcut_length; } - break; + branch = tmpBranch; } if (branch->value >= 0 && length <= inlen) { /* Match found, prepare a verbatim bytes flush if needed */ From b15c80aaff5df631c594f4a5a9ffc5af6a1fbace Mon Sep 17 00:00:00 2001 From: Richard Johnson Date: Sun, 30 Jun 2013 13:59:28 +0200 Subject: [PATCH 17/19] Added benchmark for heavily compressible data --- smaz_test.c | 130 ++++++++++++++++++++++++++++++++++++++++++++++++---- 1 file changed, 122 insertions(+), 8 deletions(-) diff --git a/smaz_test.c b/smaz_test.c index 3a5cb0a..88b3197 100644 --- a/smaz_test.c +++ b/smaz_test.c @@ -49,7 +49,6 @@ int fastrand() { } char *strings[] = { - " ", "ht", "foobar", "the end", @@ -431,22 +430,137 @@ void bench_random_trie() { smaz_free_trie(trie); } -int main(void) { - /* +/* Reverse compression codebook, used for decompression */ +static char *Smaz_rcb[254] = { +" ", "the", "e", "t", "a", "of", "o", "and", "i", "n", "s", "e ", "r", " th", +" t", "in", "he", "th", "h", "he ", "to", "\r\n", "l", "s ", "d", " a", "an", +"er", "c", " o", "d ", "on", " of", "re", "of ", "t ", ", ", "is", "u", "at", +" ", "n ", "or", "which", "f", "m", "as", "it", "that", "\n", "was", "en", +" ", " w", "es", " an", " i", "\r", "f ", "g", "p", "nd", " s", "nd ", "ed ", +"w", "ed", "http://", "for", "te", "ing", "y ", "The", " c", "ti", "r ", "his", +"st", " in", "ar", "nt", ",", " to", "y", "ng", " h", "with", "le", "al", "to ", +"b", "ou", "be", "were", " b", "se", "o ", "ent", "ha", "ng ", "their", "\"", +"hi", "from", " f", "in ", "de", "ion", "me", "v", ".", "ve", "all", "re ", +"ri", "ro", "is ", "co", "f t", "are", "ea", ". ", "her", " m", "er ", " p", +"es ", "by", "they", "di", "ra", "ic", "not", "s, ", "d t", "at ", "ce", "la", +"h ", "ne", "as ", "tio", "on ", "n t", "io", "we", " a ", "om", ", a", "s o", +"ur", "li", "ll", "ch", "had", "this", "e t", "g ", "e\r\n", " wh", "ere", +" co", "e o", "a ", "us", " d", "ss", "\n\r\n", "\r\n\r", "=\"", " be", " e", +"s a", "ma", "one", "t t", "or ", "but", "el", "so", "l ", "e s", "s,", "no", +"ter", " wa", "iv", "ho", "e a", " r", "hat", "s t", "ns", "ch ", "wh", "tr", +"ut", "/", "have", "ly ", "ta", " ha", " on", "tha", "-", " l", "ati", "en ", +"pe", " re", "there", "ass", "si", " fo", "wa", "ec", "our", "who", "its", "z", +"fo", "rs", ">", "ot", "un", "<", "im", "th ", "nc", "ate", "><", "ver", "ad", +" we", "ly", "ee", " n", "id", " cl", "ac", "il", "value); - exit(0); - */ - /*printf("Testing result when using too smaller buffer:\n-------------\n"); - test_compress_small_out_buff();*/ + printf("Creating compressible strings\n"); + in = (char **)calloc(times+1, sizeof(char *)); + for (x = 0; x < times; x++) { + int charlen = 0; + in[x] = (char *)calloc(1024, sizeof(char)); + /* 7 being the longest possible string */ + while (charlen < (1024 - 7)) { + char *val = Smaz_rcb[fastrand() % 254]; + memcpy(in[x]+charlen, val, strlen(val)); + charlen += strlen(val); + } + } + + printf("Encrypting and decrypting %d test strings...\n", times); + gettimeofday(&t1, NULL); + + while (times--) { + smaz_compress_trie( + trie, + in[times], + strlen(in[times]), + out, + sizeof(out) + ); + } + gettimeofday(&t2, NULL); + + printf("time = %u.%06u\n", (unsigned int)t1.tv_sec, (unsigned int)t1.tv_usec); + printf("time = %u.%06u\n", (unsigned int)t2.tv_sec, (unsigned int)t2.tv_usec); + + for (x = 0; x < times; x++) { + free(in[x]); + } + free(in); + smaz_free_trie(trie); +} + +void bench_random_compressible_old() { + char **in; + char out[2048]; + int x = 0; + int times = 500000; + struct timeval t1, t2; + + g_seed = 0; + + printf("Creating compressible strings\n"); + in = (char **)calloc(times+1, sizeof(char *)); + for (x = 0; x < times; x++) { + int charlen = 0; + in[x] = (char *)calloc(1024, sizeof(char)); + /* 7 being the longest possible string */ + while (charlen < (1024 - 7)) { + char *val = Smaz_rcb[fastrand() % 254]; + memcpy(in[x]+charlen, val, strlen(val)); + charlen += strlen(val); + } + } + + printf("Encrypting and decrypting %d test strings...\n", times); + gettimeofday(&t1, NULL); + + while (times--) { + smaz_compress( + in[times], + strlen(in[times]), + out, + sizeof(out) + ); + } + gettimeofday(&t2, NULL); + + printf("time = %u.%06u\n", (unsigned int)t1.tv_sec, (unsigned int)t1.tv_usec); + printf("time = %u.%06u\n", (unsigned int)t2.tv_sec, (unsigned int)t2.tv_usec); + + for (x = 0; x < times; x++) { + free(in[x]); + } + free(in); +} + +int main(void) { + + printf("\n\nTesting result when using too smaller buffer:\n-------------\n"); + test_compress_small_out_buff(); printf("\n\nTesting null terminators stay there:\n-------------\n"); test_null_term(); printf("\n\nTesting a bunch of predefined strings:\n-------------\n"); test_strings(); printf("\n\nTesting a bunch of randomly generated strings:\n-------------\n"); test_random(); + printf("\n\nBenchmarking old smaz with very compressible data:\n-------------\n"); + bench_random_compressible_old(); + printf("\n\nBenchmarking new smaz with very compressible data:\n-------------\n"); + bench_random_compressible(); printf("\n\nBenchmarking old smaz on war of the worlds:\n-------------\n"); bench_old_smaz(); printf("\n\nBenchmarking new smaz on war of the worlds:\n-------------\n"); From 075fe05708d4fbdba3b12ffdd9a8f5eb06a0f4d5 Mon Sep 17 00:00:00 2001 From: Richard Johnson Date: Sun, 30 Jun 2013 16:58:12 +0200 Subject: [PATCH 18/19] Fix bug with exceeding trie array bounds, alter function signatures --- smaz.c | 42 +++++++++++++++++++++++++++--------------- smaz.h | 6 ++++-- smaz_test.c | 28 ++++++++++++++-------------- 3 files changed, 45 insertions(+), 31 deletions(-) diff --git a/smaz.c b/smaz.c index 4205dfa..9f18681 100644 --- a/smaz.c +++ b/smaz.c @@ -1,6 +1,7 @@ #include #include #include +#include #include "smaz.h" @@ -99,10 +100,8 @@ void smaz_free_trie(struct SmazBranch *t) { free(t); } -int g_branch_counter = 0; -struct SmazBranch *g_trie; -void smaz_add_to_branch(struct SmazBranch *t, char *remEntry, int value) { +void smaz_add_to_branch(struct SmazBranch *t, char *remEntry, int value, struct SmazBranch *g_trie, int *g_branch_counter) { int entryLen; entryLen = strlen(remEntry); @@ -133,7 +132,9 @@ void smaz_add_to_branch(struct SmazBranch *t, char *remEntry, int value) { tkey = (int)t->shortcut[x]; - newTBranch = &g_trie[g_branch_counter++]; + *g_branch_counter = *g_branch_counter+1; + assert(*g_branch_counter < 256); + newTBranch = &g_trie[*g_branch_counter]; memcpy( newTBranch->children, t->children, @@ -166,11 +167,13 @@ void smaz_add_to_branch(struct SmazBranch *t, char *remEntry, int value) { if (t->children[vkey] == NULL) { struct SmazBranch *newVBranch; - newVBranch = &g_trie[g_branch_counter++]; + *g_branch_counter = *g_branch_counter+1; + assert(*g_branch_counter < 256); + newVBranch = &g_trie[*g_branch_counter]; newVBranch->value = -1; t->children[vkey] = newVBranch; } - smaz_add_to_branch(t->children[vkey], vtail, value); + smaz_add_to_branch(t->children[vkey], vtail, value, g_trie, g_branch_counter); free(vtail); } else { /* the value of v now takes up the position */ @@ -183,22 +186,28 @@ struct SmazBranch *smaz_build_custom_trie(char *codebook[254]) { struct SmazBranch *trie; int x; - trie = (struct SmazBranch *)calloc(255, sizeof(struct SmazBranch)); - trie[0].value = -1; - g_branch_counter = 1; - g_trie = trie; + int *g_branch_counter = 0; + struct SmazBranch *g_trie; + + g_trie = (struct SmazBranch *)calloc(256, sizeof(struct SmazBranch)); + g_trie[0].value = -1; + g_branch_counter = (int *)calloc(1, sizeof(int)); + *g_branch_counter = 1; for (x = 0; x < 254; x++) { - smaz_add_to_branch(&trie[0], codebook[x], x); + smaz_add_to_branch(&g_trie[0], codebook[x], x, g_trie, g_branch_counter); } - return &trie[0]; + + free(g_branch_counter); + + return g_trie; } struct SmazBranch *smaz_build_trie() { return smaz_build_custom_trie(Smaz_rcb); } -int smaz_compress_trie(struct SmazBranch *trie, char *in, int inlen, char *out, int outlen) { +int smaz_compress(struct SmazBranch *trie, char *in, int inlen, char *out, int outlen) { int verblen = 0, _outlen = outlen; char verb[256], *_out = out; @@ -289,7 +298,7 @@ int smaz_compress_trie(struct SmazBranch *trie, char *in, int inlen, char *out, return out-_out; } -int smaz_compress(char *in, int inlen, char *out, int outlen) { +int smaz_compress_ref(char *in, int inlen, char *out, int outlen) { unsigned int h1,h2,h3=0; int verblen = 0, _outlen = outlen; char verb[256], *_out = out; @@ -370,6 +379,9 @@ int smaz_compress(char *in, int inlen, char *out, int outlen) { } int smaz_decompress(char *in, int inlen, char *out, int outlen) { + return smaz_decompress_custom(Smaz_rcb, in, inlen, out, outlen); +} +int smaz_decompress_custom(char *cb[254], char *in, int inlen, char *out, int outlen) { unsigned char *c = (unsigned char*) in; char *_out = out; int _outlen = outlen; @@ -394,7 +406,7 @@ int smaz_decompress(char *in, int inlen, char *out, int outlen) { inlen -= 2+len; } else { /* Codebook entry */ - char *s = Smaz_rcb[*c]; + char *s = cb[*c]; int len = strlen(s); if (outlen < len) return _outlen+1; diff --git a/smaz.h b/smaz.h index e3229e4..8288570 100644 --- a/smaz.h +++ b/smaz.h @@ -12,10 +12,12 @@ struct SmazBranch { }; struct SmazBranch *smaz_build_trie(); +struct SmazBranch *smaz_build_custom_trie(char *codebook[254]); void smaz_free_trie(struct SmazBranch *t); -int smaz_compress(char *in, int inlen, char *out, int outlen); -int smaz_compress_trie(struct SmazBranch *trie, char *in, int inlen, char *out, int outlen); +int smaz_compress_ref(char *in, int inlen, char *out, int outlen); +int smaz_compress(struct SmazBranch *trie, char *in, int inlen, char *out, int outlen); int smaz_decompress(char *in, int inlen, char *out, int outlen); +int smaz_decompress_custom(char *cb[254], char *in, int inlen, char *out, int outlen); #endif diff --git a/smaz_test.c b/smaz_test.c index 88b3197..f74fcdc 100644 --- a/smaz_test.c +++ b/smaz_test.c @@ -82,7 +82,7 @@ void test_compress_small_out_buff() { trie = smaz_build_trie(); while(strings[j]) { - comprlen = smaz_compress_trie( + comprlen = smaz_compress( trie, strings[j], strlen(strings[j]), @@ -111,7 +111,7 @@ void test_null_term() { struct SmazBranch *trie; trie = smaz_build_trie(); - comprlen = smaz_compress_trie( + comprlen = smaz_compress( trie, no_null_str, 4, @@ -136,7 +136,7 @@ void test_null_term() { ); exit(1); } - comprlen = smaz_compress_trie( + comprlen = smaz_compress( trie, null_term_str, strlen(null_term_str)+1, /* include the null terminator this time. */ @@ -199,7 +199,7 @@ void bench_trie_smaz() { gettimeofday(&t1, NULL); for (x = 0; x < num_loops; x++) { - smaz_compress_trie( + smaz_compress( trie, in, numbytes, @@ -249,7 +249,7 @@ void bench_old_smaz() { gettimeofday(&t1, NULL); for (x = 0; x < num_loops; x++) { - smaz_compress( + smaz_compress_ref( in, numbytes, comp_out, @@ -279,7 +279,7 @@ void test_strings() { while(strings[j]) { int comprlevel, comprlen_good; - comprlen = smaz_compress_trie( + comprlen = smaz_compress( trie, strings[j], strlen(strings[j]), @@ -287,7 +287,7 @@ void test_strings() { sizeof(out) ); - comprlen_good = smaz_compress( + comprlen_good = smaz_compress_ref( strings[j], strlen(strings[j]), out_good, @@ -345,14 +345,14 @@ void test_random() { in[j] = (char)(fastrand() & 0xff); } - comprlen = smaz_compress_trie(trie, in,ranlen,out,sizeof(out)); - /*comprlen = smaz_compress(in,ranlen,out,sizeof(out));*/ + comprlen = smaz_compress(trie, in,ranlen,out,sizeof(out)); + /*comprlen = smaz_compress_ref(in,ranlen,out,sizeof(out));*/ decomprlen = smaz_decompress(out,comprlen,d,sizeof(out)); if (ranlen != decomprlen || memcmp(in,d,ranlen)) { printf("Bug! TEST NOT PASSED: %d\n", 1000000-times); hexDump("in", &in, ranlen); hexDump("out bad", &out, comprlen); - comprlen = smaz_compress(in,ranlen,out,sizeof(out)); + comprlen = smaz_compress_ref(in,ranlen,out,sizeof(out)); hexDump("out good", &out, comprlen); exit(1); } @@ -386,7 +386,7 @@ void bench_random_old_smaz() { else in[j] = (char)(fastrand() & 0xff); } - smaz_compress(in,ranlen,out,sizeof(out)); + smaz_compress_ref(in,ranlen,out,sizeof(out)); } gettimeofday(&t2, NULL); @@ -420,7 +420,7 @@ void bench_random_trie() { else in[j] = (char)(fastrand() & 0xff); } - smaz_compress_trie(trie, in,ranlen,out,sizeof(out)); + smaz_compress(trie, in,ranlen,out,sizeof(out)); } gettimeofday(&t2, NULL); @@ -483,7 +483,7 @@ void bench_random_compressible() { gettimeofday(&t1, NULL); while (times--) { - smaz_compress_trie( + smaz_compress( trie, in[times], strlen(in[times]), @@ -529,7 +529,7 @@ void bench_random_compressible_old() { gettimeofday(&t1, NULL); while (times--) { - smaz_compress( + smaz_compress_ref( in[times], strlen(in[times]), out, From f1a86c7c9d1a40ef00c11b54d2242a37856dd4ca Mon Sep 17 00:00:00 2001 From: Richard Johnson Date: Sun, 30 Jun 2013 16:58:28 +0200 Subject: [PATCH 19/19] Updated readme --- README.md | 72 ++++++++++++++++++++++++++++++++++++++++++++++++++----- 1 file changed, 66 insertions(+), 6 deletions(-) diff --git a/README.md b/README.md index 2b658cc..9269ab6 100644 --- a/README.md +++ b/README.md @@ -50,29 +50,89 @@ It can compress URLS pretty well: Usage ----- -The lib consists of just two functions: +**Compression:** - int smaz_compress(char *in, int inlen, char *out, int outlen); +The compression function is: -Compress the buffer 'in' of length 'inlen' and put the compressed data into +```cpp +int smaz_compress(struct SmazBranch *trie, char *in, int inlen, char *out, int outlen); +``` + +This compresses the buffer 'in' of length 'inlen' and put the compressed data into 'out' of max length 'outlen' bytes. If the output buffer is too short to hold the whole compressed string, outlen+1 is returned. Otherwise the length of the compressed string (less then or equal to outlen) is returned. - int smaz_decompress(char *in, int inlen, char *out, int outlen); +The first parameter is the lookup trie used for compression. The default one can be generated with: + +```cpp +struct SmazBranch *smaz_build_trie(); +``` + +Alternatively, you can provide a custom codebook with: + +```cpp +struct SmazBranch *smaz_build_custom_trie(char *codebook[254]); +``` + +*Note:* If you are using a custom codebook, be sure not to have any entries exceeding +11 characters in length. + +The original reference implementation of Smaz compression is included for testing +and benchmarking comparison purposes: + +```cpp +int smaz_compress_ref(char *in, int inlen, char *out, int outlen); +``` + +**Decompression:** + +To decompress with the default codebook: + +```cpp +int smaz_decompress(char *in, int inlen, char *out, int outlen); +``` + +Or if you are using a custom codebook: -Decompress the buffer 'in' of length 'inlen' and put the decompressed data into +```cpp +int smaz_decompress_custom(char *cb[254], char *in, int inlen, char *out, int outlen); +``` + +These decompress the buffer 'in' of length 'inlen' and put the decompressed data into 'out' of max length 'outlen' bytes. If the output buffer is too short to hold the whole decompressed string, outlen+1 is returned. Otherwise the length of the compressed string (less then or equal to outlen) is returned. This function will not automatically put a null-term at the end of the string if the original compressed string didn't included a nulterm. +smaz_test +--------- + +smaz_test.c contains some simple tests and comparitive benchmarks between the reference +implementation and the trie implementation. + +The provided makefile should take care compilation. Running the tests will take up +about a gig of RAM, as some tests pre-generate large numbers of strings. + + +Trie speed improvement +---------------------- + +These are just some rough numbers generated by my machine. + +For very compressible data, the new implementation appears ~2.2x faster than the +reference implementation. + +Basic english strings should see something around a ~2.6x speed improvement. + +For random textual strings you can get somewhere around a 4.9x speed increase. + Credits ------- -Smaz was written by Salvatore Sanfilippo and is released under the BSD license. +Smaz was written by Salvatore Sanfilippo and is released under the 3 clause BSD license. Check the COPYING file for more information. Trie-based implementation by Richard Johnson, released under the same BSD license.