Skip to content

Commit e825ac0

Browse files
MAINT: New LZW decoding implementation (#2887)
MAINT: New LZW decoding implementation --------- Co-authored-by: Stefan <96178532+stefan6419846@users.noreply.github.com>
1 parent d5233a0 commit e825ac0

File tree

4 files changed

+142
-87
lines changed

4 files changed

+142
-87
lines changed

pypdf/_codecs/_codecs.py

+123-8
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@
55
the module should not do any PDF parsing.
66
"""
77

8+
import io
89
from abc import ABC, abstractmethod
910
from typing import Dict, List
1011

@@ -47,7 +48,7 @@ class LzwCodec(Codec):
4748

4849
def _initialize_encoding_table(self) -> None:
4950
"""Initialize the encoding table and state to initial conditions."""
50-
self.table: Dict[bytes, int] = {bytes([i]): i for i in range(256)}
51+
self.encoding_table: Dict[bytes, int] = {bytes([i]): i for i in range(256)}
5152
self.next_code = self.EOD_MARKER + 1
5253
self.bits_per_code = self.INITIAL_BITS_PER_CODE
5354
self.max_code_value = (1 << self.bits_per_code) - 1
@@ -78,16 +79,16 @@ def encode(self, data: bytes) -> bytes:
7879
for byte in data:
7980
next_sequence = current_sequence + bytes([byte])
8081

81-
if next_sequence in self.table:
82+
if next_sequence in self.encoding_table:
8283
# Extend current sequence if already in the table
8384
current_sequence = next_sequence
8485
else:
8586
# Output code for the current sequence
86-
result_codes.append(self.table[current_sequence])
87+
result_codes.append(self.encoding_table[current_sequence])
8788

8889
# Add the new sequence to the table if there's room
8990
if self.next_code <= (1 << self.MAX_BITS_PER_CODE) - 1:
90-
self.table[next_sequence] = self.next_code
91+
self.encoding_table[next_sequence] = self.next_code
9192
self._increase_next_code()
9293
else:
9394
# If the table is full, emit a clear-table command
@@ -99,7 +100,7 @@ def encode(self, data: bytes) -> bytes:
99100

100101
# Ensure everything actually is encoded
101102
if current_sequence:
102-
result_codes.append(self.table[current_sequence])
103+
result_codes.append(self.encoding_table[current_sequence])
103104
result_codes.append(self.EOD_MARKER)
104105

105106
return self._pack_codes_into_bytes(result_codes)
@@ -138,8 +139,122 @@ def _pack_codes_into_bytes(self, codes: List[int]) -> bytes:
138139

139140
return bytes(output)
140141

142+
def _initialize_decoding_table(self) -> None:
143+
self.decoding_table = [bytes([i]) for i in range(self.CLEAR_TABLE_MARKER)] + [
144+
b""
145+
] * (4096 - self.CLEAR_TABLE_MARKER)
146+
self._table_index = self.EOD_MARKER + 1
147+
self._bits_to_get = 9
148+
149+
def _next_code_decode(self, data: bytes) -> int:
150+
self._next_data: int
151+
try:
152+
while self._next_bits < self._bits_to_get:
153+
self._next_data = (self._next_data << 8) | (
154+
data[self._byte_pointer] & 0xFF
155+
)
156+
self._byte_pointer += 1
157+
self._next_bits += 8
158+
159+
code = (
160+
self._next_data >> (self._next_bits - self._bits_to_get)
161+
) & self._and_table[self._bits_to_get - 9]
162+
self._next_bits -= self._bits_to_get
163+
164+
return code
165+
except IndexError:
166+
return self.EOD_MARKER
167+
168+
# The following method has been converted to Python from PDFsharp:
169+
# https://github.com/empira/PDFsharp/blob/5fbf6ed14740bc4e16786816882d32e43af3ff5d/src/foundation/src/PDFsharp/src/PdfSharp/Pdf.Filters/LzwDecode.cs
170+
#
171+
# Original license:
172+
#
173+
# -------------------------------------------------------------------------
174+
# Copyright (c) 2001-2024 empira Software GmbH, Troisdorf (Cologne Area),
175+
# Germany
176+
#
177+
# http://docs.pdfsharp.net
178+
#
179+
# MIT License
180+
#
181+
# Permission is hereby granted, free of charge, to any person obtaining a
182+
# copy of this software and associated documentation files (the "Software"),
183+
# to deal in the Software without restriction, including without limitation
184+
# the rights to use, copy, modify, merge, publish, distribute, sublicense,
185+
# and/or sell copies of the Software, and to permit persons to whom the
186+
# Software is furnished to do so, subject to the following conditions:
187+
#
188+
# The above copyright notice and this permission notice shall be included
189+
# in all copies or substantial portions of the Software.
190+
#
191+
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
192+
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
193+
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
194+
# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
195+
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
196+
# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
197+
# DEALINGS IN THE SOFTWARE.
198+
# --------------------------------------------------------------------------
141199
def decode(self, data: bytes) -> bytes:
    """
    Decode LZW-compressed data.

    Converted to Python from the PDFsharp implementation:
    https://github.com/empira/PDFsharp/blob/master/src/foundation/src/PDFsharp/src/PdfSharp/Pdf.Filters/LzwDecode.cs

    Decoding stops at the end-of-data code or when the input is exhausted,
    whichever comes first; a missing end-of-data code is not an error.

    Args:
        data: The LZW-encoded bytes.

    Returns:
        The decoded bytes.
    """
    self._and_table = [511, 1023, 2047, 4095]
    self._byte_pointer = 0
    self._next_data = 0
    self._next_bits = 0
    # Also sets ``_table_index`` and ``_bits_to_get``.
    self._initialize_decoding_table()

    output_stream = io.BytesIO()
    # ``CLEAR_TABLE_MARKER`` doubles as "there is no previous code yet".
    old_code = self.CLEAR_TABLE_MARKER

    while True:
        code = self._next_code_decode(data)
        if code == self.EOD_MARKER:
            break

        if code == self.CLEAR_TABLE_MARKER:
            self._initialize_decoding_table()
            code = self._next_code_decode(data)
            if code == self.EOD_MARKER:
                break
            output_stream.write(self.decoding_table[code])
        elif code < self._table_index:
            string = self.decoding_table[code]
            output_stream.write(string)
            if old_code != self.CLEAR_TABLE_MARKER:
                self._add_entry_decode(self.decoding_table[old_code], string[0])
        else:
            # The code is not in the table yet, so it must denote the
            # previous string followed by its own first byte.
            previous = self.decoding_table[old_code]
            if not previous:
                # Corrupt stream: there is no previous string to extend.
                # Skip the code instead of failing on an empty string.
                continue
            string = previous + previous[:1]
            output_stream.write(string)
            self._add_entry_decode(previous, string[0])
        old_code = code

    return output_stream.getvalue()
248+
249+
def _add_entry_decode(self, old_string: bytes, new_char: int) -> None:
250+
new_string = old_string + bytes([new_char])
251+
self.decoding_table[self._table_index] = new_string
252+
self._table_index += 1
253+
254+
# Update the number of bits to get based on the table index
255+
if self._table_index == 511:
256+
self._bits_to_get = 10
257+
elif self._table_index == 1023:
258+
self._bits_to_get = 11
259+
elif self._table_index == 2047:
260+
self._bits_to_get = 12

pypdf/filters.py

+2-76
Original file line numberDiff line numberDiff line change
@@ -41,12 +41,12 @@
4141
from io import BytesIO
4242
from typing import Any, Dict, List, Optional, Tuple, Union, cast
4343

44+
from ._codecs._codecs import LzwCodec as _LzwCodec
4445
from ._utils import (
4546
WHITESPACES_AS_BYTES,
4647
deprecate,
4748
deprecation_no_replacement,
4849
logger_warning,
49-
ord_,
5050
)
5151
from .constants import CcittFaxDecodeParameters as CCITT
5252
from .constants import FilterTypeAbbreviations as FTA
@@ -366,89 +366,15 @@ def decode(
366366

367367

368368
class LZWDecode:
369-
"""
370-
Taken from:
371-
372-
https://github.com/katjas/PDFrenderer/blob/master/src/com/sun/pdfview/decode/LZWDecode.java
373-
"""
374-
375369
class Decoder:
376370
STOP = 257
377371
CLEARDICT = 256
378372

379373
def __init__(self, data: bytes) -> None:
380374
self.data = data
381-
self.bytepos = 0
382-
self.bitpos = 0
383-
self.dict = [struct.pack("B", i) for i in range(256)] + [b""] * (4096 - 256)
384-
self.reset_dict()
385-
386-
def reset_dict(self) -> None:
387-
self.dictlen = 258
388-
self.bitspercode = 9
389-
390-
def next_code(self) -> int:
391-
fillbits = self.bitspercode
392-
value = 0
393-
while fillbits > 0:
394-
if self.bytepos >= len(self.data):
395-
return -1
396-
nextbits = ord_(self.data[self.bytepos])
397-
bitsfromhere = 8 - self.bitpos
398-
bitsfromhere = min(bitsfromhere, fillbits)
399-
value |= (
400-
(nextbits >> (8 - self.bitpos - bitsfromhere))
401-
& (0xFF >> (8 - bitsfromhere))
402-
) << (fillbits - bitsfromhere)
403-
fillbits -= bitsfromhere
404-
self.bitpos += bitsfromhere
405-
if self.bitpos >= 8:
406-
self.bitpos = 0
407-
self.bytepos = self.bytepos + 1
408-
return value
409375

410376
def decode(self) -> bytes:
411-
"""
412-
TIFF 6.0 specification explains in sufficient details the steps to
413-
implement the LZW encode() and decode() algorithms.
414-
415-
algorithm derived from:
416-
http://www.rasip.fer.hr/research/compress/algorithms/fund/lz/lzw.html
417-
and the PDFReference
418-
419-
Raises:
420-
PdfReadError: If the stop code is missing
421-
"""
422-
cW = self.CLEARDICT
423-
baos = b""
424-
while True:
425-
pW = cW
426-
cW = self.next_code()
427-
if cW == -1:
428-
raise PdfReadError("Missed the stop code in LZWDecode!")
429-
if cW == self.STOP:
430-
break
431-
elif cW == self.CLEARDICT:
432-
self.reset_dict()
433-
elif pW == self.CLEARDICT:
434-
baos += self.dict[cW]
435-
else:
436-
if cW < self.dictlen:
437-
baos += self.dict[cW]
438-
p = self.dict[pW] + self.dict[cW][0:1]
439-
self.dict[self.dictlen] = p
440-
self.dictlen += 1
441-
else:
442-
p = self.dict[pW] + self.dict[pW][0:1]
443-
baos += p
444-
self.dict[self.dictlen] = p
445-
self.dictlen += 1
446-
if (
447-
self.dictlen >= (1 << self.bitspercode) - 1
448-
and self.bitspercode < 12
449-
):
450-
self.bitspercode += 1
451-
return baos
377+
return _LzwCodec().decode(self.data)
452378

453379
@staticmethod
454380
def _decodeb(

tests/test_codecs.py

+15
Original file line numberDiff line numberDiff line change
@@ -41,3 +41,18 @@ def test_encode_lzw(plain, expected_encoded):
4141
codec = LzwCodec()
4242
actual_encoded = codec.encode(plain)
4343
assert actual_encoded == expected_encoded
44+
45+
46+
@pytest.mark.parametrize(
    ("encoded", "expected_decoded"),
    [
        # Clear twice.
        # _pack_codes_into_bytes([256, 65, 66, 67, 68, 256, 256, 69, 70, 71, 72, 257])
        (b"\x80\x10HD2$\x02\x00E#\x11\xc9\x10\x10", b"ABCDEFGH"),
        # No explicit initial clear marker.
        # _pack_codes_into_bytes([65, 66, 67, 68, 257])
        (b" \x90\x88dH\x08", b"ABCD"),
    ],
)
def test_decode_lzw(encoded, expected_decoded):
    """Check that ``LzwCodec.decode`` restores the plain bytes of each sample."""
    assert LzwCodec().decode(encoded) == expected_decoded

tests/test_filters.py

+2-3
Original file line numberDiff line numberDiff line change
@@ -235,9 +235,7 @@ def test_decompress_zlib_error(caplog):
235235
def test_lzw_decode_neg1():
236236
reader = PdfReader(BytesIO(get_data_from_url(name="tika-921632.pdf")))
237237
page = reader.pages[47]
238-
with pytest.raises(PdfReadError) as exc:
239-
page.extract_text()
240-
assert exc.value.args[0] == "Missed the stop code in LZWDecode!"
238+
assert page.extract_text().startswith("Chapter 2")
241239

242240

243241
@pytest.mark.enable_socket()
@@ -249,6 +247,7 @@ def test_issue_399():
249247
@pytest.mark.enable_socket()
250248
def test_image_without_pillow(tmp_path):
251249
import os
250+
252251
name = "tika-914102.pdf"
253252
pdf_path = Path(__file__).parent / "pdf_cache" / name
254253
pdf_path_str = str(pdf_path.resolve()).replace("\\", "/")

0 commit comments

Comments
 (0)