MAINT: New LZW decoding implementation (#2887)

MartinThoma · stefan6419846 · web-flow · commit e825ac07ea89 · 2024-10-03T16:10:33.000+02:00
MAINT: New LZW decoding implementation

---------

Co-authored-by: Stefan &lt;96178532+stefan6419846@users.noreply.github.com&gt;
diff --git a/pypdf/_codecs/_codecs.py b/pypdf/_codecs/_codecs.py
@@ -5,6 +5,7 @@
 the module should not do any PDF parsing.
 """
 
+import io
 from abc import ABC, abstractmethod
 from typing import Dict, List
 
@@ -47,7 +48,7 @@ class LzwCodec(Codec):
 
     def _initialize_encoding_table(self) -> None:
         """Initialize the encoding table and state to initial conditions."""
-        self.table: Dict[bytes, int] = {bytes([i]): i for i in range(256)}
+        self.encoding_table: Dict[bytes, int] = {bytes([i]): i for i in range(256)}
         self.next_code = self.EOD_MARKER + 1
         self.bits_per_code = self.INITIAL_BITS_PER_CODE
         self.max_code_value = (1 << self.bits_per_code) - 1
@@ -78,16 +79,16 @@ def encode(self, data: bytes) -> bytes:
         for byte in data:
             next_sequence = current_sequence + bytes([byte])
 
-            if next_sequence in self.table:
+            if next_sequence in self.encoding_table:
                 # Extend current sequence if already in the table
                 current_sequence = next_sequence
             else:
                 # Output code for the current sequence
-                result_codes.append(self.table[current_sequence])
+                result_codes.append(self.encoding_table[current_sequence])
 
                 # Add the new sequence to the table if there's room
                 if self.next_code <= (1 << self.MAX_BITS_PER_CODE) - 1:
-                    self.table[next_sequence] = self.next_code
+                    self.encoding_table[next_sequence] = self.next_code
                     self._increase_next_code()
                 else:
                     # If the table is full, emit a clear-table command
@@ -99,7 +100,7 @@ def encode(self, data: bytes) -> bytes:
 
         # Ensure everything actually is encoded
         if current_sequence:
-            result_codes.append(self.table[current_sequence])
+            result_codes.append(self.encoding_table[current_sequence])
         result_codes.append(self.EOD_MARKER)
 
         return self._pack_codes_into_bytes(result_codes)
@@ -138,8 +139,122 @@ def _pack_codes_into_bytes(self, codes: List[int]) -> bytes:
 
         return bytes(output)
 
+    def _initialize_decoding_table(self) -> None:
+        self.decoding_table = [bytes([i]) for i in range(self.CLEAR_TABLE_MARKER)] + [
+            b""
+        ] * (4096 - self.CLEAR_TABLE_MARKER)
+        self._table_index = self.EOD_MARKER + 1
+        self._bits_to_get = 9
+
+    def _next_code_decode(self, data: bytes) -> int:
+        self._next_data: int
+        try:
+            while self._next_bits < self._bits_to_get:
+                self._next_data = (self._next_data << 8) | (
+                    data[self._byte_pointer] & 0xFF
+                )
+                self._byte_pointer += 1
+                self._next_bits += 8
+
+            code = (
+                self._next_data >> (self._next_bits - self._bits_to_get)
+            ) & self._and_table[self._bits_to_get - 9]
+            self._next_bits -= self._bits_to_get
+
+            return code
+        except IndexError:
+            return self.EOD_MARKER
+
+    # The following method has been converted to Python from PDFsharp:
+    # https://github.com/empira/PDFsharp/blob/5fbf6ed14740bc4e16786816882d32e43af3ff5d/src/foundation/src/PDFsharp/src/PdfSharp/Pdf.Filters/LzwDecode.cs
+    #
+    # Original license:
+    #
+    # -------------------------------------------------------------------------
+    # Copyright (c) 2001-2024 empira Software GmbH, Troisdorf (Cologne Area),
+    # Germany
+    #
+    # http://docs.pdfsharp.net
+    #
+    # MIT License
+    #
+    # Permission is hereby granted, free of charge, to any person obtaining a
+    # copy of this software and associated documentation files (the "Software"),
+    # to deal in the Software without restriction, including without limitation
+    # the rights to use, copy, modify, merge, publish, distribute, sublicense,
+    # and/or sell copies of the Software, and to permit persons to whom the
+    # Software is furnished to do so, subject to the following conditions:
+    #
+    # The above copyright notice and this permission notice shall be included
+    # in all copies or substantial portions of the Software.
+    #
+    # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+    # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+    # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+    # THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+    # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+    # FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+    # DEALINGS IN THE SOFTWARE.
+    # --------------------------------------------------------------------------
     def decode(self, data: bytes) -> bytes:
-        """Decode data using LZW."""
-        from ..filters import LZWDecode
+        """
+        The following code was converted to Python from the following code:
+        https://github.com/empira/PDFsharp/blob/master/src/foundation/src/PDFsharp/src/PdfSharp/Pdf.Filters/LzwDecode.cs
+        """
+        self._and_table = [511, 1023, 2047, 4095]
+        self._table_index = 0
+        self._bits_to_get = 9
+        self._byte_pointer = 0
+        self._next_data = 0
+        self._next_bits = 0
+
+        output_stream = io.BytesIO()
+
+        self._initialize_decoding_table()
+        self._byte_pointer = 0
+        self._next_data = 0
+        self._next_bits = 0
+        old_code = self.CLEAR_TABLE_MARKER
+
+        while True:
+            code = self._next_code_decode(data)
+            if code == self.EOD_MARKER:
+                break
 
-        return LZWDecode.Decoder(data).decode()
+            if code == self.CLEAR_TABLE_MARKER:
+                self._initialize_decoding_table()
+                code = self._next_code_decode(data)
+                if code == self.EOD_MARKER:
+                    break
+                output_stream.write(self.decoding_table[code])
+                old_code = code
+            elif code < self._table_index:
+                string = self.decoding_table[code]
+                output_stream.write(string)
+                if old_code != self.CLEAR_TABLE_MARKER:
+                    self._add_entry_decode(self.decoding_table[old_code], string[0])
+                old_code = code
+            else:
+                # The code is not in the table and not one of the special codes
+                string = (
+                    self.decoding_table[old_code] + self.decoding_table[old_code][:1]
+                )
+                output_stream.write(string)
+                self._add_entry_decode(self.decoding_table[old_code], string[0])
+                old_code = code
+
+        output = output_stream.getvalue()
+        return output
+
+    def _add_entry_decode(self, old_string: bytes, new_char: int) -> None:
+        new_string = old_string + bytes([new_char])
+        self.decoding_table[self._table_index] = new_string
+        self._table_index += 1
+
+        # Update the number of bits to get based on the table index
+        if self._table_index == 511:
+            self._bits_to_get = 10
+        elif self._table_index == 1023:
+            self._bits_to_get = 11
+        elif self._table_index == 2047:
+            self._bits_to_get = 12
diff --git a/pypdf/filters.py b/pypdf/filters.py
@@ -41,12 +41,12 @@
 from io import BytesIO
 from typing import Any, Dict, List, Optional, Tuple, Union, cast
 
+from ._codecs._codecs import LzwCodec as _LzwCodec
 from ._utils import (
     WHITESPACES_AS_BYTES,
     deprecate,
     deprecation_no_replacement,
     logger_warning,
-    ord_,
 )
 from .constants import CcittFaxDecodeParameters as CCITT
 from .constants import FilterTypeAbbreviations as FTA
@@ -366,89 +366,15 @@ def decode(
 
 
 class LZWDecode:
-    """
-    Taken from:
-
-    https://github.com/katjas/PDFrenderer/blob/master/src/com/sun/pdfview/decode/LZWDecode.java
-    """
-
     class Decoder:
         STOP = 257
         CLEARDICT = 256
 
         def __init__(self, data: bytes) -> None:
             self.data = data
-            self.bytepos = 0
-            self.bitpos = 0
-            self.dict = [struct.pack("B", i) for i in range(256)] + [b""] * (4096 - 256)
-            self.reset_dict()
-
-        def reset_dict(self) -> None:
-            self.dictlen = 258
-            self.bitspercode = 9
-
-        def next_code(self) -> int:
-            fillbits = self.bitspercode
-            value = 0
-            while fillbits > 0:
-                if self.bytepos >= len(self.data):
-                    return -1
-                nextbits = ord_(self.data[self.bytepos])
-                bitsfromhere = 8 - self.bitpos
-                bitsfromhere = min(bitsfromhere, fillbits)
-                value |= (
-                    (nextbits >> (8 - self.bitpos - bitsfromhere))
-                    & (0xFF >> (8 - bitsfromhere))
-                ) << (fillbits - bitsfromhere)
-                fillbits -= bitsfromhere
-                self.bitpos += bitsfromhere
-                if self.bitpos >= 8:
-                    self.bitpos = 0
-                    self.bytepos = self.bytepos + 1
-            return value
 
         def decode(self) -> bytes:
-            """
-            TIFF 6.0 specification explains in sufficient details the steps to
-            implement the LZW encode() and decode() algorithms.
-
-            algorithm derived from:
-            http://www.rasip.fer.hr/research/compress/algorithms/fund/lz/lzw.html
-            and the PDFReference
-
-            Raises:
-              PdfReadError: If the stop code is missing
-            """
-            cW = self.CLEARDICT
-            baos = b""
-            while True:
-                pW = cW
-                cW = self.next_code()
-                if cW == -1:
-                    raise PdfReadError("Missed the stop code in LZWDecode!")
-                if cW == self.STOP:
-                    break
-                elif cW == self.CLEARDICT:
-                    self.reset_dict()
-                elif pW == self.CLEARDICT:
-                    baos += self.dict[cW]
-                else:
-                    if cW < self.dictlen:
-                        baos += self.dict[cW]
-                        p = self.dict[pW] + self.dict[cW][0:1]
-                        self.dict[self.dictlen] = p
-                        self.dictlen += 1
-                    else:
-                        p = self.dict[pW] + self.dict[pW][0:1]
-                        baos += p
-                        self.dict[self.dictlen] = p
-                        self.dictlen += 1
-                    if (
-                        self.dictlen >= (1 << self.bitspercode) - 1
-                        and self.bitspercode < 12
-                    ):
-                        self.bitspercode += 1
-            return baos
+            return _LzwCodec().decode(self.data)
 
     @staticmethod
     def _decodeb(
diff --git a/tests/test_codecs.py b/tests/test_codecs.py
@@ -41,3 +41,18 @@ def test_encode_lzw(plain, expected_encoded):
     codec = LzwCodec()
     actual_encoded = codec.encode(plain)
     assert actual_encoded == expected_encoded
+
+
+@pytest.mark.parametrize(
+    ("encoded", "expected_decoded"),
+    [
+        # _pack_codes_into_bytes([256, 65, 66, 67, 68, 256, 256, 69, 70, 71, 72, 257])
+        (b"\x80\x10HD2$\x02\x00E#\x11\xc9\x10\x10", b"ABCDEFGH"),  # Clear twice.
+        # _pack_codes_into_bytes([65, 66, 67, 68, 257])
+        (b" \x90\x88dH\x08", b"ABCD"),  # No explicit initial clear marker.
+    ],
+)
+def test_decode_lzw(encoded, expected_decoded):
+    codec = LzwCodec()
+    actual_decoded = codec.decode(encoded)
+    assert actual_decoded == expected_decoded
diff --git a/tests/test_filters.py b/tests/test_filters.py
@@ -235,9 +235,7 @@ def test_decompress_zlib_error(caplog):
 def test_lzw_decode_neg1():
     reader = PdfReader(BytesIO(get_data_from_url(name="tika-921632.pdf")))
     page = reader.pages[47]
-    with pytest.raises(PdfReadError) as exc:
-        page.extract_text()
-    assert exc.value.args[0] == "Missed the stop code in LZWDecode!"
+    assert page.extract_text().startswith("Chapter 2")
 
 
 @pytest.mark.enable_socket()
@@ -249,6 +247,7 @@ def test_issue_399():
 @pytest.mark.enable_socket()
 def test_image_without_pillow(tmp_path):
     import os
+
     name = "tika-914102.pdf"
     pdf_path = Path(__file__).parent / "pdf_cache" / name
     pdf_path_str = str(pdf_path.resolve()).replace("\\", "/")