Skip to content

Commit 35e69fc

Browse files
authored
New feature: "Large Window Brotli" (google#640)
* New feature: "Large Window Brotli"

By setting a special encoder/decoder flag it is now possible to extend the LZ window up to 30 bits; however, the produced stream will not be RFC 7932 compliant.

Added a new dictionary generator, "DSH". It combines the speed of "Sieve" with the quality of "DM". Also added utilities to prepare training corpora (remove unique strings).

Improved compression ratio: two sub-blocks can now be stitched together, i.e. the last copy command can be extended to span the next sub-block.

Fixed compression inefficiency caused by floating-point rounding and a wrong cost heuristic.

Other C changes:
- combined / moved `context.h` to `common`
- moved transforms to `common`
- unified some aspects of code formatting
- added an abstraction for the encoder (static) dictionary
- moved default allocator/deallocator functions to `common`

brotli CLI:
- window size is auto-adjusted if not specified explicitly

Java:
- added "eager" decoding to both the JNI wrapper and the pure decoder
- huge speed-up of `DictionaryData` initialization

* Add dictionaryless compressed dictionary
* Fix `sources.lst`
* Fix `sources.lst` and add a note that `libtool` is also required.
* Update setup.py
* Fix `EagerStreamTest`
* Fix BUILD file
* Add missing `libdivsufsort` dependency
* Fix "unused parameter" warning.
1 parent 3af1899 commit 35e69fc

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

92 files changed

+3613
-1793
lines changed

.gitmodules

+3
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,6 @@
11
[submodule "research/esaxx"]
22
path = research/esaxx
33
url = https://github.com/hillbig/esaxx
4+
[submodule "research/libdivsufsort"]
5+
path = research/libdivsufsort
6+
url = https://github.com/y-256/libdivsufsort.git

WORKSPACE

+6
Original file line numberDiff line numberDiff line change
@@ -82,6 +82,12 @@ filegroup(
8282
)""",
8383
)
8484

85+
new_local_repository(
86+
name = "divsufsort",
87+
build_file = "//research:BUILD.libdivsufsort",
88+
path = "research/libdivsufsort",
89+
)
90+
8591
load("@io_bazel_rules_closure//closure:defs.bzl", "closure_repositories")
8692
closure_repositories()
8793

bootstrap

+2
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,8 @@ sed --version >/dev/null 2>&1 || { echo >&2 "'sed' $REQUIRED"; exit 1; }
77
fi
88
autoreconf --version >/dev/null 2>&1 || { echo >&2 "'autoconf' $REQUIRED"; exit 1; }
99

10+
# NOTE: libtool is also required. If it is not installed, the build fails with:
# "error: Libtool library used but 'LIBTOOL' is undefined"
11+
1012
mkdir m4 2>/dev/null
1113

1214
BROTLI_ABI_HEX=`sed -n 's/#define BROTLI_ABI_VERSION 0x//p' c/common/version.h`

c/common/constants.h

+13-6
Original file line numberDiff line numberDiff line change
@@ -28,18 +28,25 @@
2828
/* "code length of 8 is repeated" */
2929
#define BROTLI_INITIAL_REPEATED_CODE_LENGTH 8
3030

31+
/* "Large Window Brotli" */
32+
#define BROTLI_LARGE_MAX_DISTANCE_BITS 62U
33+
#define BROTLI_LARGE_MIN_WBITS 10
34+
#define BROTLI_LARGE_MAX_WBITS 30
35+
3136
/* Specification: 4. Encoding of distances */
3237
#define BROTLI_NUM_DISTANCE_SHORT_CODES 16
3338
#define BROTLI_MAX_NPOSTFIX 3
3439
#define BROTLI_MAX_NDIRECT 120
3540
#define BROTLI_MAX_DISTANCE_BITS 24U
36-
/* BROTLI_NUM_DISTANCE_SYMBOLS == 520 */
37-
#define BROTLI_NUM_DISTANCE_SYMBOLS (BROTLI_NUM_DISTANCE_SHORT_CODES + \
38-
BROTLI_MAX_NDIRECT + \
39-
(BROTLI_MAX_DISTANCE_BITS << \
40-
(BROTLI_MAX_NPOSTFIX + 1)))
41-
/* Distance that is guaranteed to be representable in any stream. */
41+
/* Number of symbols in a distance alphabet, computed as:
     BROTLI_NUM_DISTANCE_SHORT_CODES + NDIRECT + (MAXNBITS << (NPOSTFIX + 1)).
   Every macro argument is parenthesized, so expressions may be passed. */
#define BROTLI_DISTANCE_ALPHABET_SIZE(NDIRECT, NPOSTFIX, MAXNBITS) ( \
42+
BROTLI_NUM_DISTANCE_SHORT_CODES + (NDIRECT) + \
43+
((MAXNBITS) << ((NPOSTFIX) + 1)))
44+
/* BROTLI_NUM_DISTANCE_SYMBOLS == 1128, i.e. 16 + 120 + (62 << (3 + 1)).
   It is sized with BROTLI_LARGE_MAX_DISTANCE_BITS (62), not with
   BROTLI_MAX_DISTANCE_BITS (24). */
45+
#define BROTLI_NUM_DISTANCE_SYMBOLS \
46+
BROTLI_DISTANCE_ALPHABET_SIZE( \
47+
BROTLI_MAX_NDIRECT, BROTLI_MAX_NPOSTFIX, BROTLI_LARGE_MAX_DISTANCE_BITS)
4248
#define BROTLI_MAX_DISTANCE 0x3FFFFFC
49+
#define BROTLI_MAX_ALLOWED_DISTANCE 0x7FFFFFFC
4350

4451
/* 7.1. Context modes and context ID lookup for literals */
4552
/* "context IDs for literals are in the range of 0..63" */

c/dec/context.h c/common/context.h

+182-172
Large diffs are not rendered by default.

c/common/dictionary.bin.br

50.5 KB
Binary file not shown.

c/common/platform.h

+20-2
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@
1010
#define BROTLI_COMMON_PLATFORM_H_
1111

1212
#include <string.h> /* memcpy */
13+
#include <stdlib.h> /* malloc, free */
1314

1415
#include <brotli/port.h>
1516
#include <brotli/types.h>
@@ -204,7 +205,7 @@ static BROTLI_INLINE uint64_t BrotliUnalignedRead64(const void* p) {
204205
static BROTLI_INLINE void BrotliUnalignedWrite64(void* p, uint64_t v) {
205206
memcpy(p, &v, sizeof v);
206207
}
207-
#else /* BROTLI_ALIGNED_READ */
208+
#else /* BROTLI_ALIGNED_READ */
208209
/* Unaligned memory access is allowed: just cast pointer to requested type. */
209210
static BROTLI_INLINE uint16_t BrotliUnalignedRead16(const void* p) {
210211
return *(const uint16_t*)p;
@@ -218,7 +219,7 @@ static BROTLI_INLINE uint64_t BrotliUnalignedRead64(const void* p) {
218219
static BROTLI_INLINE void BrotliUnalignedWrite64(void* p, uint64_t v) {
219220
*(uint64_t*)p = v;
220221
}
221-
#endif /* BROTLI_ALIGNED_READ */
222+
#endif /* BROTLI_ALIGNED_READ */
222223

223224
#if BROTLI_LITTLE_ENDIAN
224225
/* Straight endianness. Just read / write values. */
@@ -390,6 +391,18 @@ BROTLI_MIN_MAX(size_t) BROTLI_MIN_MAX(uint32_t) BROTLI_MIN_MAX(uint8_t)
390391
(A)[(J)] = __brotli_swap_tmp; \
391392
}
392393

394+
/* Default brotli_alloc_func.
   Ignores "opaque" and returns the result of malloc(size) unchanged, so the
   caller sees exactly what malloc reports (including a failed allocation). */
395+
static void* BrotliDefaultAllocFunc(void* opaque, size_t size) {
396+
BROTLI_UNUSED(opaque);
397+
return malloc(size);
398+
}
399+
400+
/* Default brotli_free_func.
   Ignores "opaque" and passes "address" straight to free(). */
401+
static void BrotliDefaultFreeFunc(void* opaque, void* address) {
402+
BROTLI_UNUSED(opaque);
403+
free(address);
404+
}
405+
393406
BROTLI_UNUSED_FUNCTION void BrotliSuppressUnusedFunctions(void) {
394407
BROTLI_UNUSED(BrotliSuppressUnusedFunctions);
395408
BROTLI_UNUSED(BrotliUnalignedRead16);
@@ -413,6 +426,11 @@ BROTLI_UNUSED_FUNCTION void BrotliSuppressUnusedFunctions(void) {
413426
BROTLI_UNUSED(brotli_max_uint32_t);
414427
BROTLI_UNUSED(brotli_min_uint8_t);
415428
BROTLI_UNUSED(brotli_max_uint8_t);
429+
BROTLI_UNUSED(BrotliDefaultAllocFunc);
430+
BROTLI_UNUSED(BrotliDefaultFreeFunc);
431+
#if defined(BROTLI_DEBUG) || defined(BROTLI_ENABLE_LOG)
432+
BROTLI_UNUSED(BrotliDump);
433+
#endif
416434
}
417435

418436
#endif /* BROTLI_COMMON_PLATFORM_H_ */

c/common/transform.c

+236
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,236 @@
1+
/* Copyright 2013 Google Inc. All Rights Reserved.
2+
3+
Distributed under MIT license.
4+
See file LICENSE for detail or copy at https://opensource.org/licenses/MIT
5+
*/
6+
7+
#include "./platform.h"
8+
#include "./transform.h"
9+
10+
#if defined(__cplusplus) || defined(c_plusplus)
11+
extern "C" {
12+
#endif
13+
14+
/* RFC 7932 transforms string data.
   All prefix / suffix strings are packed back to back. Each entry is one
   length byte followed by that many payload bytes (e.g. "\1 " is the
   one-byte string " "); the final entry "\0" is the empty string.
   The comment under each literal marks the hexadecimal offsets at which the
   entries of that literal start; the row label ("0x", "2x", ...) gives the
   high digit for the first marker and "_N" gives the low digit. These offsets
   are the values stored in kPrefixSuffixMap. */
15+
static const char kPrefixSuffix[217] =
16+
"\1 \2, \10 of the \4 of \2s \1.\5 and \4 "
17+
/* 0x _0 _2 __5 _E _3 _6 _8 _E */
18+
"in \1\"\4 to \2\">\1\n\2. \1]\5 for \3 a \6 "
19+
/* 2x _3_ _5 _A_ _D_ _F _2 _4 _A _E */
20+
"that \1\'\6 with \6 from \4 by \1(\6. T"
21+
/* 4x _5_ _7 _E _5 _A _C */
22+
"he \4 on \4 as \4 is \4ing \2\n\t\1:\3ed "
23+
/* 6x _3 _8 _D _2 _7_ _ _A _C */
24+
"\2=\"\4 at \3ly \1,\2=\'\5.com/\7. This \5"
25+
/* 8x _0 _ _3 _8 _C _E _ _1 _7 _F */
26+
" not \3er \3al \4ful \4ive \5less \4es"
27+
/* Ax _5 _9 _D _2 _7 _D */
28+
"t \4ize \2\xc2\xa0\4ous \5 the \2e \0";
29+
/* Cx _2 _7___ ___ _A _F _5 _8 */
30+
31+
/* Byte offset, within kPrefixSuffix, of each of the 50 length-prefixed
   strings. Entry 0 (offset 0x00) is " " and entry 49 (offset 0xD8) is the
   empty string. */
static const uint16_t kPrefixSuffixMap[50] = {
32+
0x00, 0x02, 0x05, 0x0E, 0x13, 0x16, 0x18, 0x1E, 0x23, 0x25,
33+
0x2A, 0x2D, 0x2F, 0x32, 0x34, 0x3A, 0x3E, 0x45, 0x47, 0x4E,
34+
0x55, 0x5A, 0x5C, 0x63, 0x68, 0x6D, 0x72, 0x77, 0x7A, 0x7C,
35+
0x80, 0x83, 0x88, 0x8C, 0x8E, 0x91, 0x97, 0x9F, 0xA5, 0xA9,
36+
0xAD, 0xB2, 0xB7, 0xBD, 0xC2, 0xC7, 0xCA, 0xCF, 0xD5, 0xD8
37+
};
38+
39+
/* RFC 7932 transforms */
40+
static const uint8_t kTransformsData[] = {
41+
49, BROTLI_TRANSFORM_IDENTITY, 49,
42+
49, BROTLI_TRANSFORM_IDENTITY, 0,
43+
0, BROTLI_TRANSFORM_IDENTITY, 0,
44+
49, BROTLI_TRANSFORM_OMIT_FIRST_1, 49,
45+
49, BROTLI_TRANSFORM_UPPERCASE_FIRST, 0,
46+
49, BROTLI_TRANSFORM_IDENTITY, 47,
47+
0, BROTLI_TRANSFORM_IDENTITY, 49,
48+
4, BROTLI_TRANSFORM_IDENTITY, 0,
49+
49, BROTLI_TRANSFORM_IDENTITY, 3,
50+
49, BROTLI_TRANSFORM_UPPERCASE_FIRST, 49,
51+
49, BROTLI_TRANSFORM_IDENTITY, 6,
52+
49, BROTLI_TRANSFORM_OMIT_FIRST_2, 49,
53+
49, BROTLI_TRANSFORM_OMIT_LAST_1, 49,
54+
1, BROTLI_TRANSFORM_IDENTITY, 0,
55+
49, BROTLI_TRANSFORM_IDENTITY, 1,
56+
0, BROTLI_TRANSFORM_UPPERCASE_FIRST, 0,
57+
49, BROTLI_TRANSFORM_IDENTITY, 7,
58+
49, BROTLI_TRANSFORM_IDENTITY, 9,
59+
48, BROTLI_TRANSFORM_IDENTITY, 0,
60+
49, BROTLI_TRANSFORM_IDENTITY, 8,
61+
49, BROTLI_TRANSFORM_IDENTITY, 5,
62+
49, BROTLI_TRANSFORM_IDENTITY, 10,
63+
49, BROTLI_TRANSFORM_IDENTITY, 11,
64+
49, BROTLI_TRANSFORM_OMIT_LAST_3, 49,
65+
49, BROTLI_TRANSFORM_IDENTITY, 13,
66+
49, BROTLI_TRANSFORM_IDENTITY, 14,
67+
49, BROTLI_TRANSFORM_OMIT_FIRST_3, 49,
68+
49, BROTLI_TRANSFORM_OMIT_LAST_2, 49,
69+
49, BROTLI_TRANSFORM_IDENTITY, 15,
70+
49, BROTLI_TRANSFORM_IDENTITY, 16,
71+
0, BROTLI_TRANSFORM_UPPERCASE_FIRST, 49,
72+
49, BROTLI_TRANSFORM_IDENTITY, 12,
73+
5, BROTLI_TRANSFORM_IDENTITY, 49,
74+
0, BROTLI_TRANSFORM_IDENTITY, 1,
75+
49, BROTLI_TRANSFORM_OMIT_FIRST_4, 49,
76+
49, BROTLI_TRANSFORM_IDENTITY, 18,
77+
49, BROTLI_TRANSFORM_IDENTITY, 17,
78+
49, BROTLI_TRANSFORM_IDENTITY, 19,
79+
49, BROTLI_TRANSFORM_IDENTITY, 20,
80+
49, BROTLI_TRANSFORM_OMIT_FIRST_5, 49,
81+
49, BROTLI_TRANSFORM_OMIT_FIRST_6, 49,
82+
47, BROTLI_TRANSFORM_IDENTITY, 49,
83+
49, BROTLI_TRANSFORM_OMIT_LAST_4, 49,
84+
49, BROTLI_TRANSFORM_IDENTITY, 22,
85+
49, BROTLI_TRANSFORM_UPPERCASE_ALL, 49,
86+
49, BROTLI_TRANSFORM_IDENTITY, 23,
87+
49, BROTLI_TRANSFORM_IDENTITY, 24,
88+
49, BROTLI_TRANSFORM_IDENTITY, 25,
89+
49, BROTLI_TRANSFORM_OMIT_LAST_7, 49,
90+
49, BROTLI_TRANSFORM_OMIT_LAST_1, 26,
91+
49, BROTLI_TRANSFORM_IDENTITY, 27,
92+
49, BROTLI_TRANSFORM_IDENTITY, 28,
93+
0, BROTLI_TRANSFORM_IDENTITY, 12,
94+
49, BROTLI_TRANSFORM_IDENTITY, 29,
95+
49, BROTLI_TRANSFORM_OMIT_FIRST_9, 49,
96+
49, BROTLI_TRANSFORM_OMIT_FIRST_7, 49,
97+
49, BROTLI_TRANSFORM_OMIT_LAST_6, 49,
98+
49, BROTLI_TRANSFORM_IDENTITY, 21,
99+
49, BROTLI_TRANSFORM_UPPERCASE_FIRST, 1,
100+
49, BROTLI_TRANSFORM_OMIT_LAST_8, 49,
101+
49, BROTLI_TRANSFORM_IDENTITY, 31,
102+
49, BROTLI_TRANSFORM_IDENTITY, 32,
103+
47, BROTLI_TRANSFORM_IDENTITY, 3,
104+
49, BROTLI_TRANSFORM_OMIT_LAST_5, 49,
105+
49, BROTLI_TRANSFORM_OMIT_LAST_9, 49,
106+
0, BROTLI_TRANSFORM_UPPERCASE_FIRST, 1,
107+
49, BROTLI_TRANSFORM_UPPERCASE_FIRST, 8,
108+
5, BROTLI_TRANSFORM_IDENTITY, 21,
109+
49, BROTLI_TRANSFORM_UPPERCASE_ALL, 0,
110+
49, BROTLI_TRANSFORM_UPPERCASE_FIRST, 10,
111+
49, BROTLI_TRANSFORM_IDENTITY, 30,
112+
0, BROTLI_TRANSFORM_IDENTITY, 5,
113+
35, BROTLI_TRANSFORM_IDENTITY, 49,
114+
47, BROTLI_TRANSFORM_IDENTITY, 2,
115+
49, BROTLI_TRANSFORM_UPPERCASE_FIRST, 17,
116+
49, BROTLI_TRANSFORM_IDENTITY, 36,
117+
49, BROTLI_TRANSFORM_IDENTITY, 33,
118+
5, BROTLI_TRANSFORM_IDENTITY, 0,
119+
49, BROTLI_TRANSFORM_UPPERCASE_FIRST, 21,
120+
49, BROTLI_TRANSFORM_UPPERCASE_FIRST, 5,
121+
49, BROTLI_TRANSFORM_IDENTITY, 37,
122+
0, BROTLI_TRANSFORM_IDENTITY, 30,
123+
49, BROTLI_TRANSFORM_IDENTITY, 38,
124+
0, BROTLI_TRANSFORM_UPPERCASE_ALL, 0,
125+
49, BROTLI_TRANSFORM_IDENTITY, 39,
126+
0, BROTLI_TRANSFORM_UPPERCASE_ALL, 49,
127+
49, BROTLI_TRANSFORM_IDENTITY, 34,
128+
49, BROTLI_TRANSFORM_UPPERCASE_ALL, 8,
129+
49, BROTLI_TRANSFORM_UPPERCASE_FIRST, 12,
130+
0, BROTLI_TRANSFORM_IDENTITY, 21,
131+
49, BROTLI_TRANSFORM_IDENTITY, 40,
132+
0, BROTLI_TRANSFORM_UPPERCASE_FIRST, 12,
133+
49, BROTLI_TRANSFORM_IDENTITY, 41,
134+
49, BROTLI_TRANSFORM_IDENTITY, 42,
135+
49, BROTLI_TRANSFORM_UPPERCASE_ALL, 17,
136+
49, BROTLI_TRANSFORM_IDENTITY, 43,
137+
0, BROTLI_TRANSFORM_UPPERCASE_FIRST, 5,
138+
49, BROTLI_TRANSFORM_UPPERCASE_ALL, 10,
139+
0, BROTLI_TRANSFORM_IDENTITY, 34,
140+
49, BROTLI_TRANSFORM_UPPERCASE_FIRST, 33,
141+
49, BROTLI_TRANSFORM_IDENTITY, 44,
142+
49, BROTLI_TRANSFORM_UPPERCASE_ALL, 5,
143+
45, BROTLI_TRANSFORM_IDENTITY, 49,
144+
0, BROTLI_TRANSFORM_IDENTITY, 33,
145+
49, BROTLI_TRANSFORM_UPPERCASE_FIRST, 30,
146+
49, BROTLI_TRANSFORM_UPPERCASE_ALL, 30,
147+
49, BROTLI_TRANSFORM_IDENTITY, 46,
148+
49, BROTLI_TRANSFORM_UPPERCASE_ALL, 1,
149+
49, BROTLI_TRANSFORM_UPPERCASE_FIRST, 34,
150+
0, BROTLI_TRANSFORM_UPPERCASE_FIRST, 33,
151+
0, BROTLI_TRANSFORM_UPPERCASE_ALL, 30,
152+
0, BROTLI_TRANSFORM_UPPERCASE_ALL, 1,
153+
49, BROTLI_TRANSFORM_UPPERCASE_ALL, 33,
154+
49, BROTLI_TRANSFORM_UPPERCASE_ALL, 21,
155+
49, BROTLI_TRANSFORM_UPPERCASE_ALL, 12,
156+
0, BROTLI_TRANSFORM_UPPERCASE_ALL, 5,
157+
49, BROTLI_TRANSFORM_UPPERCASE_ALL, 34,
158+
0, BROTLI_TRANSFORM_UPPERCASE_ALL, 12,
159+
0, BROTLI_TRANSFORM_UPPERCASE_FIRST, 30,
160+
0, BROTLI_TRANSFORM_UPPERCASE_ALL, 34,
161+
0, BROTLI_TRANSFORM_UPPERCASE_FIRST, 34,
162+
};
163+
164+
/* The built-in transform set, assembled from the tables above. The
   initializers are, in order: the size of the prefix/suffix blob, the blob
   itself, the offset map into the blob, the number of transforms (the table
   holds three bytes per transform), the transform table, and a ten-entry
   index list.
   In that final list, entry 0 is the index of the plain identity row
   (empty prefix and suffix) and entry k (k = 1..9) is the index of the row
   with empty prefix and suffix whose type is BROTLI_TRANSFORM_OMIT_LAST_k.
   NOTE(review): the field names and their intended use are declared in
   transform.h, which is not shown here. */
static BrotliTransforms kBrotliTransforms = {
165+
sizeof(kPrefixSuffix),
166+
(const uint8_t*)kPrefixSuffix,
167+
kPrefixSuffixMap,
168+
sizeof(kTransformsData) / (3 * sizeof(kTransformsData[0])),
169+
kTransformsData,
170+
{0, 12, 27, 23, 42, 63, 56, 48, 59, 64}
171+
};
172+
173+
/* Returns a pointer to the file-static kBrotliTransforms instance. The
   pointee has static storage duration, so the caller does not free it. */
const BrotliTransforms* BrotliGetTransforms(void) {
174+
return &kBrotliTransforms;
175+
}
176+
177+
/* Uppercases, in place, the single character that starts at p and returns the
   number of bytes it is treated as occupying (1, 2 or 3). The width is chosen
   from the first byte only:
     - p[0] < 0xC0: one byte; 'a'..'z' are mapped to 'A'..'Z' by flipping bit
       0x20, any other value is left unchanged;
     - p[0] < 0xE0: two bytes; p[1] is XORed with 32;
     - otherwise:   three bytes; p[2] is XORed with 5.
   No bounds are checked: the caller must make sure p[1] / p[2] are writable
   when the first byte selects a multi-byte form. */
static int ToUpperCase(uint8_t* p) {
178+
if (p[0] < 0xC0) {
179+
if (p[0] >= 'a' && p[0] <= 'z') {
180+
p[0] ^= 32;
181+
}
182+
return 1;
183+
}
184+
/* An overly simplified uppercasing model for UTF-8. */
185+
if (p[0] < 0xE0) {
186+
p[1] ^= 32;
187+
return 2;
188+
}
189+
/* An arbitrary transform for three byte characters. */
190+
p[2] ^= 5;
191+
return 3;
192+
}
193+
194+
/* Applies transform number transfom_idx of "transforms" to the dictionary
   word word[0..len) and writes the result to dst.
   The output is, in order: the transform's prefix string, the (possibly
   trimmed and/or uppercased) word, and the transform's suffix string.
   Returns the total number of bytes written to dst.
   No bounds are checked here: the caller must supply a dst large enough for
   prefix + word + suffix, and a valid transfom_idx. */
int BrotliTransformDictionaryWord(uint8_t* dst, const uint8_t* word, int len,
195+
const BrotliTransforms* BROTLI_RESTRICT transforms, int transfom_idx) {
196+
int idx = 0;
197+
const uint8_t* prefix = BROTLI_TRANSFORM_PREFIX(transforms, transfom_idx);
198+
uint8_t type = BROTLI_TRANSFORM_TYPE(transforms, transfom_idx);
199+
const uint8_t* suffix = BROTLI_TRANSFORM_SUFFIX(transforms, transfom_idx);
200+
{
201+
/* Prefix: first byte is the length, followed by the prefix bytes. */
int prefix_len = *prefix++;
202+
while (prefix_len--) { dst[idx++] = *prefix++; }
203+
}
204+
{
205+
const int t = type;
206+
int i = 0;
207+
/* The type value itself is used as the number of trailing bytes to drop.
   NOTE(review): this relies on the type codes up to OMIT_LAST_9 being
   numerically equal to the omitted count (0 for identity); the enum is
   defined in transform.h, not shown here - confirm. */
if (t <= BROTLI_TRANSFORM_OMIT_LAST_9) {
208+
len -= t;
209+
} else if (t >= BROTLI_TRANSFORM_OMIT_FIRST_1
210+
&& t <= BROTLI_TRANSFORM_OMIT_FIRST_9) {
211+
/* Drop "skip" leading bytes: 1 for OMIT_FIRST_1, and so on. */
int skip = t - (BROTLI_TRANSFORM_OMIT_FIRST_1 - 1);
212+
word += skip;
213+
len -= skip;
214+
}
215+
/* Copies nothing when trimming left len <= 0. */
while (i < len) { dst[idx++] = word[i++]; }
216+
if (t == BROTLI_TRANSFORM_UPPERCASE_FIRST) {
217+
/* dst[idx - len] is the first byte of the word just copied. */
ToUpperCase(&dst[idx - len]);
218+
} else if (t == BROTLI_TRANSFORM_UPPERCASE_ALL) {
219+
uint8_t* uppercase = &dst[idx - len];
220+
/* Walk the copied word one character (1-3 bytes) at a time.
   NOTE(review): ToUpperCase picks the step from the lead byte alone, so a
   multi-byte step at the end of the word can modify up to two bytes past
   the copied word; the suffix copy below then overwrites those positions
   as far as the suffix reaches. */
while (len > 0) {
221+
int step = ToUpperCase(uppercase);
222+
uppercase += step;
223+
len -= step;
224+
}
225+
}
226+
}
227+
{
228+
/* Suffix: same length-prefixed layout as the prefix. */
int suffix_len = *suffix++;
229+
while (suffix_len--) { dst[idx++] = *suffix++; }
230+
return idx;
231+
}
232+
}
233+
234+
#if defined(__cplusplus) || defined(c_plusplus)
235+
} /* extern "C" */
236+
#endif

0 commit comments

Comments
 (0)