@@ -64,6 +64,7 @@ import Control.Monad.ST (runST)
64
64
import Data.Bits ((.&.) )
65
65
import Data.ByteString as B
66
66
import qualified Data.ByteString.Internal as B
67
+ import Data.Foldable (traverse_ )
67
68
import Data.Text.Encoding.Error (OnDecodeError , UnicodeException , strictDecode )
68
69
import Data.Text.Internal (Text (.. ), safe , text )
69
70
import Data.Text.Internal.Functions
@@ -275,19 +276,22 @@ newtype DecoderState = DecoderState Word32 deriving (Eq, Show, Num, Storable)
275
276
streamDecodeUtf8 :: ByteString -> Decoding
276
277
-- Delegates to 'streamDecodeUtf8With', supplying 'strictDecode' as the
-- 'OnDecodeError' handler.
streamDecodeUtf8 = streamDecodeUtf8With strictDecode
277
278
278
- -- | Decode, in a stream oriented way, a 'ByteString' containing UTF-8
279
+ -- | Decode, in a stream-oriented way, a lazy 'ByteString' containing UTF-8
279
280
-- encoded text.
280
281
--
281
282
-- @since 1.0.0.0
282
283
streamDecodeUtf8With :: OnDecodeError -> ByteString -> Decoding
283
284
streamDecodeUtf8With onErr = decodeChunk B. empty 0 0
284
285
where
285
286
-- We create a slightly larger than necessary buffer to accommodate a
286
- -- potential surrogate pair started in the last buffer
287
+ -- potential surrogate pair started in the last buffer (@undecoded0@), or
288
+ -- replacement characters for each byte in @undecoded0@ if the
289
+ -- sequence turns out to be invalid. @undecoded0@ holds at most three bytes,
290
+ -- hence we allocate @len+3@ 16-bit words.
287
291
decodeChunk :: ByteString -> CodePoint -> DecoderState -> ByteString
288
292
-> Decoding
289
293
decodeChunk undecoded0 codepoint0 state0 bs = withBS bs aux where
290
- aux fp len = runST $ (unsafeIOToST . decodeChunkToBuffer) =<< A. new (len+ 1 )
294
+ aux fp len = runST $ (unsafeIOToST . decodeChunkToBuffer) =<< A. new (len+ 3 )
291
295
where
292
296
decodeChunkToBuffer :: A. MArray s -> IO Decoding
293
297
decodeChunkToBuffer dest = unsafeWithForeignPtr fp $ \ ptr ->
@@ -297,23 +301,32 @@ streamDecodeUtf8With onErr = decodeChunk B.empty 0 0
297
301
with nullPtr $ \ curPtrPtr ->
298
302
let end = ptr `plusPtr` len
299
303
loop curPtr = do
304
+ prevState <- peek statePtr
300
305
poke curPtrPtr curPtr
301
- curPtr' <- c_decode_utf8_with_state (A. maBA dest) destOffPtr
306
+ lastPtr <- c_decode_utf8_with_state (A. maBA dest) destOffPtr
302
307
curPtrPtr end codepointPtr statePtr
303
308
state <- peek statePtr
304
309
case state of
305
310
UTF8_REJECT -> do
306
311
-- We encountered an encoding error
307
- x <- peek curPtr'
308
312
poke statePtr 0
309
- case onErr desc (Just x) of
310
- Nothing -> loop $ curPtr' `plusPtr` 1
311
- Just c -> do
312
- destOff <- peek destOffPtr
313
- w <- unsafeSTToIO $
314
- unsafeWrite dest (fromIntegral destOff) (safe c)
315
- poke destOffPtr (destOff + fromIntegral w)
316
- loop $ curPtr' `plusPtr` 1
313
+ let skipByte x = case onErr desc (Just x) of
314
+ Nothing -> return ()
315
+ Just c -> do
316
+ destOff <- peek destOffPtr
317
+ w <- unsafeSTToIO $
318
+ unsafeWrite dest (fromIntegral destOff) (safe c)
319
+ poke destOffPtr (destOff + fromIntegral w)
320
+ if ptr == lastPtr && prevState /= UTF8_ACCEPT then do
321
+ -- If we can't complete the sequence @undecoded0@ from
322
+ -- the previous chunk, we invalidate the bytes from
323
+ -- @undecoded0@ and retry decoding the current chunk from
324
+ -- the initial state.
325
+ traverse_ skipByte (B. unpack undecoded0 )
326
+ loop lastPtr
327
+ else do
328
+ peek lastPtr >>= skipByte
329
+ loop (lastPtr `plusPtr` 1 )
317
330
318
331
_ -> do
319
332
-- We encountered the end of the buffer while decoding
@@ -322,11 +335,11 @@ streamDecodeUtf8With onErr = decodeChunk B.empty 0 0
322
335
chunkText <- unsafeSTToIO $ do
323
336
arr <- A. unsafeFreeze dest
324
337
return $! text arr 0 (fromIntegral n)
325
- lastPtr <- peek curPtrPtr
326
- let left = lastPtr `minusPtr` curPtr
338
+ let left = lastPtr `minusPtr` ptr
327
339
! undecoded = case state of
328
340
UTF8_ACCEPT -> B. empty
329
- _ -> B. append undecoded0 (B. drop left bs)
341
+ _ | left == 0 && prevState /= UTF8_ACCEPT -> B. append undecoded0 bs
342
+ | otherwise -> B. drop left bs
330
343
return $ Some chunkText undecoded
331
344
(decodeChunk undecoded codepoint state)
332
345
in loop ptr
0 commit comments