Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[WIP] Minor compression ratio improvements for high compression modes on small data #2781

Draft
wants to merge 7 commits into
base: dev
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Prev Previous commit
Next Next commit
improve initial literals stats
  • Loading branch information
Cyan4973 committed Sep 13, 2021
commit 0b6fa7a11cbdf9504eb40c458be7be0e30118708
3 changes: 1 addition & 2 deletions lib/compress/zstd_compress_literals.c
Original file line number Diff line number Diff line change
Expand Up @@ -93,7 +93,6 @@ size_t ZSTD_compressLiterals (ZSTD_hufCTables_t const* prevHuf,
return ZSTD_noCompressLiterals(dst, dstCapacity, src, srcSize);

/* small ? don't even attempt compression (speed opt) */
# define COMPRESS_LITERALS_SIZE_MIN 63
{ size_t const minLitSize = (prevHuf->repeatMode == HUF_repeat_valid) ? 6 : COMPRESS_LITERALS_SIZE_MIN;
if (srcSize <= minLitSize) return ZSTD_noCompressLiterals(dst, dstCapacity, src, srcSize);
}
Expand Down Expand Up @@ -136,7 +135,7 @@ size_t ZSTD_compressLiterals (ZSTD_hufCTables_t const* prevHuf,
switch(lhSize)
{
case 3: /* 2 - 2 - 10 - 10 */
{ U32 const lhc = hType + ((!singleStream) << 2) + ((U32)srcSize<<4) + ((U32)cLitSize<<14);
{ U32 const lhc = hType + (((U32)!singleStream) << 2) + ((U32)srcSize<<4) + ((U32)cLitSize<<14);
MEM_writeLE24(ostart, lhc);
break;
}
Expand Down
5 changes: 5 additions & 0 deletions lib/compress/zstd_compress_literals.h
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,11 @@
#include "zstd_compress_internal.h" /* ZSTD_hufCTables_t, ZSTD_minGain() */


/* Below this limit, the heuristic does not even attempt compression of literals,
* as the cost of the headers is expected to outweight the benefits.
* This limit is not applicable when re-using statistics from dictionary or previous block */
#define COMPRESS_LITERALS_SIZE_MIN 63

size_t ZSTD_noCompressLiterals (void* dst, size_t dstCapacity, const void* src, size_t srcSize);

size_t ZSTD_compressRleLiteralsBlock (void* dst, size_t dstCapacity, const void* src, size_t srcSize);
Expand Down
52 changes: 42 additions & 10 deletions lib/compress/zstd_opt.c
Original file line number Diff line number Diff line change
Expand Up @@ -1253,6 +1253,31 @@ size_t ZSTD_compressBlock_btopt(
}


#include "zstd_compress_literals.h" /* COMPRESS_LITERALS_SIZE_MIN */

static void transfer_LiteralStats(optState_t* opt, const seqStore_t* seqStore)
{
size_t const nbLiterals = (size_t)(seqStore->lit - seqStore->litStart);
assert(seqStore->lit >= seqStore->litStart);

// test if literals are compressible
//char block[ZSTD_BLOCKSIZE_MAX];
//size_t const cLitSize = HUF_compress(block, sizeof(block), seqStore->litStart, nbLiterals);
//printf("nb literals = %zu => %zu huf_compress\n", nbLiterals, cLitSize);

if (nbLiterals < COMPRESS_LITERALS_SIZE_MIN) {
/* literals won't be compressed anyway, give them a flat cost */
/* note : it would be better if it was also possible to extend this category
* to non-compressible literals which are more numerous than threshold */
unsigned u;
for (u=0; u<=MaxLit; u++) opt->litFreq[u]=2;
} else {
unsigned maxlit = MaxLit;
HIST_count_simple(opt->litFreq, &maxlit, seqStore->litStart, nbLiterals);
}
opt->litSum = ZSTD_downscaleStats(opt->litFreq, MaxLit, 0); /* flatten stats, by providing at least 1 to every symbol */
}


#include "zstd_lazy.h"
/* ZSTD_initStats_greedy():
Expand All @@ -1275,16 +1300,16 @@ ZSTD_initStats_greedy(ZSTD_matchState_t* ms,
assert(ms->window.dictLimit == ms->window.lowLimit); /* no dictionary */
assert(ms->window.dictLimit - ms->nextToUpdate <= 1); /* no prefix (note: intentional overflow, defined as 2-complement) */

ZSTD_compressBlock_greedy(ms, seqStore, tmpRep, src, srcSize); /* generate stats into seqstore */

/* transfer stats into ms-opt */
/* literals stats */
{ unsigned maxlit = MaxLit;
assert(seqStore->lit >= seqStore->litStart);
HIST_count_simple(ms->opt.litFreq, &maxlit, seqStore->litStart, (size_t)(seqStore->lit - seqStore->litStart));
ms->opt.litSum = ZSTD_downscaleStats(ms->opt.litFreq, MaxLit, 0); /* flatten stats, by providing at least 1 to every symbol */
{ size_t const lastLits = ZSTD_compressBlock_greedy(ms, seqStore, tmpRep, src, srcSize); /* generate stats into seqstore */
/* add last lits into literals buffers for proper accounting */
assert(lastLits <= srcSize);
ZSTD_memcpy(seqStore->lit , (const char*)src + srcSize - lastLits, lastLits);
seqStore->lit += lastLits;
}

/* transfer stats from seqStore into ms-opt */
transfer_LiteralStats(&ms->opt, seqStore);

/* seqStats */
assert(seqStore->sequences >= seqStore->sequencesStart);
{ U32 const nbSeq = (U32)(seqStore->sequences - seqStore->sequencesStart);
Expand Down Expand Up @@ -1367,6 +1392,7 @@ ZSTD_initStats_ultra(ZSTD_matchState_t* ms,
U32 rep[ZSTD_REP_NUM],
const void* src, size_t srcSize)
{
size_t lastLits;
U32 tmpRep[ZSTD_REP_NUM]; /* updated rep codes will sink here */
ZSTD_memcpy(tmpRep, rep, sizeof(tmpRep));

Expand All @@ -1378,13 +1404,19 @@ ZSTD_initStats_ultra(ZSTD_matchState_t* ms,

if (srcSize <= 16 KB) {
/* raw btultra, initialized by default starting stats */
ZSTD_compressBlock_opt_generic(ms, seqStore, tmpRep, src, srcSize, 2 /*optLevel*/, ZSTD_noDict); /* generate stats into ms->opt*/
lastLits = ZSTD_compressBlock_opt_generic(ms, seqStore, tmpRep, src, srcSize, 2 /*optLevel*/, ZSTD_noDict); /* generate stats into ms->opt*/
} else {
/* in this mode, btultra is initialized greedy;
* measured better for larger blocks, but not for small ones */
ZSTD_compressBlock_btultra(ms, seqStore, tmpRep, src, srcSize); /* generate stats into ms->opt*/
lastLits = ZSTD_compressBlock_btultra(ms, seqStore, tmpRep, src, srcSize); /* generate stats into ms->opt*/
}

/* transfer stats from seqStore into ms-opt */
assert(lastLits <= srcSize);
ZSTD_memcpy(seqStore->lit , (const char*)src + srcSize - lastLits, lastLits);
seqStore->lit += lastLits;
transfer_LiteralStats(&ms->opt, seqStore);

/* invalidate first scan from history */
ZSTD_resetSeqStore(seqStore);
ms->window.base -= srcSize;
Expand Down