Skip to content

Commit f15ddca

Browse files
ROB: Improve handling of LZW decoder table overflow (#3159)
Closes #3032.
1 parent: 6003a1e; commit: f15ddca

File tree

3 files changed: +22 -1 lines changed

pypdf/_codecs/_codecs.py

+7 -1
Original file line number | Diff line number | Diff line change
@@ -9,6 +9,8 @@
99
from abc import ABC, abstractmethod
1010
from typing import Dict, List
1111

12+
from pypdf._utils import logger_warning
13+
1214

1315
class Codec(ABC):
1416
"""Abstract base class for all codecs."""
@@ -142,9 +144,10 @@ def _pack_codes_into_bytes(self, codes: List[int]) -> bytes:
142144
return bytes(output)
143145

144146
def _initialize_decoding_table(self) -> None:
147+
self.max_code_value = (1 << self.MAX_BITS_PER_CODE) - 1
145148
self.decoding_table = [bytes([i]) for i in range(self.CLEAR_TABLE_MARKER)] + [
146149
b""
147-
] * (4096 - self.CLEAR_TABLE_MARKER)
150+
] * (self.max_code_value - self.CLEAR_TABLE_MARKER + 1)
148151
self._table_index = self.EOD_MARKER + 1
149152
self._bits_to_get = 9
150153

@@ -250,6 +253,9 @@ def decode(self, data: bytes) -> bytes:
250253

251254
def _add_entry_decode(self, old_string: bytes, new_char: int) -> None:
252255
new_string = old_string + bytes([new_char])
256+
if self._table_index > self.max_code_value:
257+
logger_warning("Ignoring too large LZW table index.", __name__)
258+
return
253259
self.decoding_table[self._table_index] = new_string
254260
self._table_index += 1
255261

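Binary file not shown (5.29 KB). Presumably this is resources/lzw_decoder_table_overflow.bin, the fixture read by the new test in tests/test_codecs.py; the file name is missing from this page.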

tests/test_codecs.py

+15
Original file line number | Diff line number | Diff line change
@@ -1,9 +1,14 @@
11
"""Test LZW-related code."""
2+
from pathlib import Path
23

34
import pytest
45

56
from pypdf._codecs._codecs import LzwCodec
67

8+
TESTS_ROOT = Path(__file__).parent.resolve()
9+
PROJECT_ROOT = TESTS_ROOT.parent
10+
RESOURCE_ROOT = PROJECT_ROOT / "resources"
11+
712
test_cases = [
813
pytest.param(b"", id="Empty input"),
914
pytest.param(b"A", id="Single character"),
@@ -56,3 +61,13 @@ def test_decode_lzw(encoded, expected_decoded):
5661
codec = LzwCodec()
5762
actual_decoded = codec.decode(encoded)
5863
assert actual_decoded == expected_decoded
64+
65+
66+
def test_lzw_decoder_table_overflow(caplog):
67+
path = RESOURCE_ROOT / "lzw_decoder_table_overflow.bin"
68+
codec = LzwCodec()
69+
assert codec.decode(path.read_bytes()).startswith(
70+
b'0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ!"#$%&\'()*+,-./:;<=>?@'
71+
)
72+
assert len(codec.decoding_table) == 4096
73+
assert "Ignoring too large LZW table index." in caplog.text

0 commit comments

Comments
 (0)