5
5
the module should not do any PDF parsing.
6
6
"""
7
7
8
+ import io
8
9
from abc import ABC , abstractmethod
9
10
from typing import Dict , List
10
11
@@ -47,7 +48,7 @@ class LzwCodec(Codec):
47
48
48
49
def _initialize_encoding_table (self ) -> None :
49
50
"""Initialize the encoding table and state to initial conditions."""
50
- self .table : Dict [bytes , int ] = {bytes ([i ]): i for i in range (256 )}
51
+ self .encoding_table : Dict [bytes , int ] = {bytes ([i ]): i for i in range (256 )}
51
52
self .next_code = self .EOD_MARKER + 1
52
53
self .bits_per_code = self .INITIAL_BITS_PER_CODE
53
54
self .max_code_value = (1 << self .bits_per_code ) - 1
@@ -78,16 +79,16 @@ def encode(self, data: bytes) -> bytes:
78
79
for byte in data :
79
80
next_sequence = current_sequence + bytes ([byte ])
80
81
81
- if next_sequence in self .table :
82
+ if next_sequence in self .encoding_table :
82
83
# Extend current sequence if already in the table
83
84
current_sequence = next_sequence
84
85
else :
85
86
# Output code for the current sequence
86
- result_codes .append (self .table [current_sequence ])
87
+ result_codes .append (self .encoding_table [current_sequence ])
87
88
88
89
# Add the new sequence to the table if there's room
89
90
if self .next_code <= (1 << self .MAX_BITS_PER_CODE ) - 1 :
90
- self .table [next_sequence ] = self .next_code
91
+ self .encoding_table [next_sequence ] = self .next_code
91
92
self ._increase_next_code ()
92
93
else :
93
94
# If the table is full, emit a clear-table command
@@ -99,7 +100,7 @@ def encode(self, data: bytes) -> bytes:
99
100
100
101
# Ensure everything actually is encoded
101
102
if current_sequence :
102
- result_codes .append (self .table [current_sequence ])
103
+ result_codes .append (self .encoding_table [current_sequence ])
103
104
result_codes .append (self .EOD_MARKER )
104
105
105
106
return self ._pack_codes_into_bytes (result_codes )
@@ -138,8 +139,122 @@ def _pack_codes_into_bytes(self, codes: List[int]) -> bytes:
138
139
139
140
return bytes (output )
140
141
142
+ def _initialize_decoding_table (self ) -> None :
143
+ self .decoding_table = [bytes ([i ]) for i in range (self .CLEAR_TABLE_MARKER )] + [
144
+ b""
145
+ ] * (4096 - self .CLEAR_TABLE_MARKER )
146
+ self ._table_index = self .EOD_MARKER + 1
147
+ self ._bits_to_get = 9
148
+
149
+ def _next_code_decode (self , data : bytes ) -> int :
150
+ self ._next_data : int
151
+ try :
152
+ while self ._next_bits < self ._bits_to_get :
153
+ self ._next_data = (self ._next_data << 8 ) | (
154
+ data [self ._byte_pointer ] & 0xFF
155
+ )
156
+ self ._byte_pointer += 1
157
+ self ._next_bits += 8
158
+
159
+ code = (
160
+ self ._next_data >> (self ._next_bits - self ._bits_to_get )
161
+ ) & self ._and_table [self ._bits_to_get - 9 ]
162
+ self ._next_bits -= self ._bits_to_get
163
+
164
+ return code
165
+ except IndexError :
166
+ return self .EOD_MARKER
167
+
168
+ # The following method has been converted to Python from PDFsharp:
169
+ # https://github.com/empira/PDFsharp/blob/5fbf6ed14740bc4e16786816882d32e43af3ff5d/src/foundation/src/PDFsharp/src/PdfSharp/Pdf.Filters/LzwDecode.cs
170
+ #
171
+ # Original license:
172
+ #
173
+ # -------------------------------------------------------------------------
174
+ # Copyright (c) 2001-2024 empira Software GmbH, Troisdorf (Cologne Area),
175
+ # Germany
176
+ #
177
+ # http://docs.pdfsharp.net
178
+ #
179
+ # MIT License
180
+ #
181
+ # Permission is hereby granted, free of charge, to any person obtaining a
182
+ # copy of this software and associated documentation files (the "Software"),
183
+ # to deal in the Software without restriction, including without limitation
184
+ # the rights to use, copy, modify, merge, publish, distribute, sublicense,
185
+ # and/or sell copies of the Software, and to permit persons to whom the
186
+ # Software is furnished to do so, subject to the following conditions:
187
+ #
188
+ # The above copyright notice and this permission notice shall be included
189
+ # in all copies or substantial portions of the Software.
190
+ #
191
+ # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
192
+ # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
193
+ # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
194
+ # THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
195
+ # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
196
+ # FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
197
+ # DEALINGS IN THE SOFTWARE.
198
+ # --------------------------------------------------------------------------
141
199
def decode(self, data: bytes) -> bytes:
    """
    Decode LZW-compressed data.

    The algorithm was converted to Python from PDFsharp:
    https://github.com/empira/PDFsharp/blob/master/src/foundation/src/PDFsharp/src/PdfSharp/Pdf.Filters/LzwDecode.cs

    Args:
        data: The LZW-encoded bytes.

    Returns:
        The decoded bytes. Decoding stops as soon as the code reader
        returns ``EOD_MARKER``.
    """
    # Bit masks for extracting 9-, 10-, 11- and 12-bit codes.
    self._and_table = [511, 1023, 2047, 4095]

    # Bit-reader state: read position, bit buffer, number of buffered bits.
    self._byte_pointer = 0
    self._next_data = 0
    self._next_bits = 0

    # Sets up ``decoding_table``, ``_table_index`` and ``_bits_to_get``.
    self._initialize_decoding_table()

    output_stream = io.BytesIO()
    # The clear-table code doubles as "no previous code yet".
    old_code = self.CLEAR_TABLE_MARKER

    while True:
        code = self._next_code_decode(data)
        if code == self.EOD_MARKER:
            break

        if code == self.CLEAR_TABLE_MARKER:
            # Start over with a fresh table; the code that follows is
            # written out without adding a table entry.
            self._initialize_decoding_table()
            code = self._next_code_decode(data)
            if code == self.EOD_MARKER:
                break
            output_stream.write(self.decoding_table[code])
        elif code < self._table_index:
            # Known code: emit its string and record previous + first byte.
            string = self.decoding_table[code]
            output_stream.write(string)
            if old_code != self.CLEAR_TABLE_MARKER:
                self._add_entry_decode(self.decoding_table[old_code], string[0])
        else:
            # The code is not in the table and not one of the special
            # codes: it stands for the previous string plus its own
            # first byte.
            previous = self.decoding_table[old_code]
            string = previous + previous[:1]
            output_stream.write(string)
            self._add_entry_decode(previous, string[0])

        old_code = code

    return output_stream.getvalue()
248
+
249
+ def _add_entry_decode (self , old_string : bytes , new_char : int ) -> None :
250
+ new_string = old_string + bytes ([new_char ])
251
+ self .decoding_table [self ._table_index ] = new_string
252
+ self ._table_index += 1
253
+
254
+ # Update the number of bits to get based on the table index
255
+ if self ._table_index == 511 :
256
+ self ._bits_to_get = 10
257
+ elif self ._table_index == 1023 :
258
+ self ._bits_to_get = 11
259
+ elif self ._table_index == 2047 :
260
+ self ._bits_to_get = 12
0 commit comments