Skip to content

Commit c9874d3

Browse files
committed
Fix UTF-8 decoding of lazy bytestrings
At the beginning of a new chunk we may be trying to complete a UTF-8 sequence started in the previous chunk (contained in the `undecode0` buffer). If it turns out to be invalid, we must apply the `onErr` handler to every character in that buffer. When we reach the end of the chunk, we must also be more careful about when to keep the previous buffer: a UTF-8 sequence (up to 4 bytes) can span more than two chunks, when those chunks are very short (of length 0, 1, or 2).
1 parent 8d1b6ff commit c9874d3

File tree

5 files changed

+78
-20
lines changed

5 files changed

+78
-20
lines changed

src/Data/Text/Encoding.hs

+29-16
Original file line numberDiff line numberDiff line change
@@ -64,6 +64,7 @@ import Control.Monad.ST (runST)
6464
import Data.Bits ((.&.))
6565
import Data.ByteString as B
6666
import qualified Data.ByteString.Internal as B
67+
import Data.Foldable (traverse_)
6768
import Data.Text.Encoding.Error (OnDecodeError, UnicodeException, strictDecode)
6869
import Data.Text.Internal (Text(..), safe, text)
6970
import Data.Text.Internal.Functions
@@ -275,19 +276,22 @@ newtype DecoderState = DecoderState Word32 deriving (Eq, Show, Num, Storable)
275276
streamDecodeUtf8 :: ByteString -> Decoding
276277
streamDecodeUtf8 = streamDecodeUtf8With strictDecode
277278

278-
-- | Decode, in a stream oriented way, a 'ByteString' containing UTF-8
279+
-- | Decode, in a stream oriented way, a lazy 'ByteString' containing UTF-8
279280
-- encoded text.
280281
--
281282
-- @since 1.0.0.0
282283
streamDecodeUtf8With :: OnDecodeError -> ByteString -> Decoding
283284
streamDecodeUtf8With onErr = decodeChunk B.empty 0 0
284285
where
285286
-- We create a slightly larger than necessary buffer to accommodate a
286-
-- potential surrogate pair started in the last buffer
287+
-- potential surrogate pair started in the last buffer (@undecoded0@), or
288+
-- replacement characters for each byte in @undecoded0@ if the
289+
-- sequence turns out to be invalid. There can be up to three bytes there,
290+
-- hence we allocate @len+3@ 16-bit words.
287291
decodeChunk :: ByteString -> CodePoint -> DecoderState -> ByteString
288292
-> Decoding
289293
decodeChunk undecoded0 codepoint0 state0 bs = withBS bs aux where
290-
aux fp len = runST $ (unsafeIOToST . decodeChunkToBuffer) =<< A.new (len+1)
294+
aux fp len = runST $ (unsafeIOToST . decodeChunkToBuffer) =<< A.new (len+3)
291295
where
292296
decodeChunkToBuffer :: A.MArray s -> IO Decoding
293297
decodeChunkToBuffer dest = unsafeWithForeignPtr fp $ \ptr ->
@@ -297,23 +301,32 @@ streamDecodeUtf8With onErr = decodeChunk B.empty 0 0
297301
with nullPtr $ \curPtrPtr ->
298302
let end = ptr `plusPtr` len
299303
loop curPtr = do
304+
prevState <- peek statePtr
300305
poke curPtrPtr curPtr
301-
curPtr' <- c_decode_utf8_with_state (A.maBA dest) destOffPtr
306+
lastPtr <- c_decode_utf8_with_state (A.maBA dest) destOffPtr
302307
curPtrPtr end codepointPtr statePtr
303308
state <- peek statePtr
304309
case state of
305310
UTF8_REJECT -> do
306311
-- We encountered an encoding error
307-
x <- peek curPtr'
308312
poke statePtr 0
309-
case onErr desc (Just x) of
310-
Nothing -> loop $ curPtr' `plusPtr` 1
311-
Just c -> do
312-
destOff <- peek destOffPtr
313-
w <- unsafeSTToIO $
314-
unsafeWrite dest (fromIntegral destOff) (safe c)
315-
poke destOffPtr (destOff + fromIntegral w)
316-
loop $ curPtr' `plusPtr` 1
313+
let skipByte x = case onErr desc (Just x) of
314+
Nothing -> return ()
315+
Just c -> do
316+
destOff <- peek destOffPtr
317+
w <- unsafeSTToIO $
318+
unsafeWrite dest (fromIntegral destOff) (safe c)
319+
poke destOffPtr (destOff + fromIntegral w)
320+
if ptr == lastPtr && prevState /= UTF8_ACCEPT then do
321+
-- If we can't complete the sequence @undecoded0@ from
322+
-- the previous chunk, we invalidate the bytes from
323+
-- @undecoded0@ and retry decoding the current chunk from
324+
-- the initial state.
325+
traverse_ skipByte (B.unpack undecoded0 )
326+
loop lastPtr
327+
else do
328+
peek lastPtr >>= skipByte
329+
loop (lastPtr `plusPtr` 1)
317330

318331
_ -> do
319332
-- We encountered the end of the buffer while decoding
@@ -322,11 +335,11 @@ streamDecodeUtf8With onErr = decodeChunk B.empty 0 0
322335
chunkText <- unsafeSTToIO $ do
323336
arr <- A.unsafeFreeze dest
324337
return $! text arr 0 (fromIntegral n)
325-
lastPtr <- peek curPtrPtr
326-
let left = lastPtr `minusPtr` curPtr
338+
let left = lastPtr `minusPtr` ptr
327339
!undecoded = case state of
328340
UTF8_ACCEPT -> B.empty
329-
_ -> B.append undecoded0 (B.drop left bs)
341+
_ | left == 0 && prevState /= UTF8_ACCEPT -> B.append undecoded0 bs
342+
| otherwise -> B.drop left bs
330343
return $ Some chunkText undecoded
331344
(decodeChunk undecoded codepoint state)
332345
in loop ptr

src/Data/Text/Lazy/Encoding.hs

+7-3
Original file line numberDiff line numberDiff line change
@@ -100,9 +100,13 @@ decodeUtf8With onErr (B.Chunk b0 bs0) =
100100
TE.Some t l f -> chunk t (go f l bs)
101101
go _ l _
102102
| S.null l = empty
103-
| otherwise = case onErr desc (Just (B.unsafeHead l)) of
104-
Nothing -> empty
105-
Just c -> Chunk (T.singleton c) Empty
103+
| otherwise =
104+
let !t = T.pack (skipBytes l)
105+
skipBytes = S.foldr (\x s' ->
106+
case onErr desc (Just x) of
107+
Just c -> c : s'
108+
Nothing -> s') [] in
109+
Chunk t Empty
106110
desc = "Data.Text.Lazy.Encoding.decodeUtf8With: Invalid UTF-8 stream"
107111
decodeUtf8With _ _ = empty
108112

tests/Tests/Properties/Transcoding.hs

+14
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,7 @@ import qualified Data.ByteString as B
2424
import qualified Data.ByteString.Lazy as BL
2525
import qualified Data.Text as T
2626
import qualified Data.Text.Encoding as E
27+
import qualified Data.Text.Encoding.Error as E
2728
import qualified Data.Text.Lazy as TL
2829
import qualified Data.Text.Lazy.Encoding as EL
2930

@@ -152,6 +153,18 @@ genInvalidUTF8 = B.pack <$> oneof [
152153
k <- choose (0,n)
153154
vectorOf k gen
154155

156+
decodeLL :: BL.ByteString -> TL.Text
157+
decodeLL = EL.decodeUtf8With E.lenientDecode
158+
159+
decodeL :: B.ByteString -> T.Text
160+
decodeL = E.decodeUtf8With E.lenientDecode
161+
162+
-- The lenient decoding of lazy bytestrings should not depend on how they are chunked,
163+
-- and it should behave the same as decoding of strict bytestrings.
164+
t_decode_utf8_lenient :: Property
165+
t_decode_utf8_lenient = forAllShrinkShow arbitrary shrink (show . BL.toChunks) $ \bs ->
166+
decodeLL bs === (TL.fromStrict . decodeL . B.concat . BL.toChunks) bs
167+
155168
-- See http://unicode.org/faq/utf_bom.html#gen8
156169
-- A sequence such as <110xxxxx2 0xxxxxxx2> is illegal ...
157170
-- When faced with this illegal byte sequence ... a UTF-8 conformant process
@@ -206,6 +219,7 @@ testTranscoding =
206219
testProperty "t_utf8_err'" t_utf8_err'
207220
],
208221
testGroup "error recovery" [
222+
testProperty "t_decode_utf8_lenient" t_decode_utf8_lenient,
209223
testProperty "t_decode_with_error2" t_decode_with_error2,
210224
testProperty "t_decode_with_error3" t_decode_with_error3,
211225
testProperty "t_decode_with_error4" t_decode_with_error4,

tests/Tests/QuickCheckUtils.hs

+19-1
Original file line numberDiff line numberDiff line change
@@ -49,6 +49,7 @@ import Test.QuickCheck.Monadic (assert, monadicIO, run)
4949
import Test.QuickCheck.Unicode (string)
5050
import Tests.Utils
5151
import qualified Data.ByteString as B
52+
import qualified Data.ByteString.Lazy as BL
5253
import qualified Data.Text as T
5354
import qualified Data.Text.Encoding.Error as T
5455
import qualified Data.Text.Internal.Fusion as TF
@@ -61,6 +62,9 @@ import qualified System.IO as IO
6162
genUnicode :: IsString a => Gen a
6263
genUnicode = fromString <$> string
6364

65+
genWord8 :: Gen Word8
66+
genWord8 = chooseAny
67+
6468
instance Random I16 where
6569
randomR = integralRandomR
6670
random = randomR (minBound,maxBound)
@@ -70,9 +74,23 @@ instance Arbitrary I16 where
7074
shrink = shrinkIntegral
7175

7276
instance Arbitrary B.ByteString where
73-
arbitrary = B.pack `fmap` arbitrary
77+
arbitrary = B.pack `fmap` listOf genWord8
7478
shrink = map B.pack . shrink . B.unpack
7579

80+
instance Arbitrary BL.ByteString where
81+
arbitrary = oneof
82+
[ BL.fromChunks <$> arbitrary
83+
-- so that a single utf8 code point could appear split over up to 4 chunks
84+
, BL.fromChunks . map B.singleton <$> listOf genWord8
85+
-- so that a code point with 4 byte long utf8 representation
86+
-- could appear split over 3 non-singleton chunks
87+
, (\a b c -> BL.fromChunks [a, b, c])
88+
<$> arbitrary
89+
<*> ((\a b -> B.pack [a, b]) <$> genWord8 <*> genWord8)
90+
<*> arbitrary
91+
]
92+
shrink xs = BL.fromChunks <$> shrink (BL.toChunks xs)
93+
7694
-- For tests that have O(n^2) running times or input sizes, resize
7795
-- their inputs to the square root of the originals.
7896
unsquare :: (Arbitrary a, Show a, Testable b) => (a -> b) -> Property

tests/Tests/Regressions.hs

+9
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,7 @@ import qualified Data.ByteString.Lazy as LB
2020
import qualified Data.Text as T
2121
import qualified Data.Text.Array as TA
2222
import qualified Data.Text.Encoding as TE
23+
import qualified Data.Text.Encoding.Error as E
2324
import qualified Data.Text.Internal as T
2425
import qualified Data.Text.IO as T
2526
import qualified Data.Text.Lazy as LT
@@ -136,6 +137,13 @@ t301 = do
136137
original@(T.Text originalArr originalOff originalLen) = T.pack "1234567890"
137138
T.Text newArr _off _len = T.take 1 $ T.drop 1 original
138139

140+
t330 :: IO ()
141+
t330 = do
142+
let decodeL = LE.decodeUtf8With E.lenientDecode
143+
assertEqual "The lenient decoding of lazy bytestrings should not depend on how they are chunked"
144+
(decodeL (LB.fromChunks [B.pack [194], B.pack [97, 98, 99]]))
145+
(decodeL (LB.fromChunks [B.pack [194, 97, 98, 99]]))
146+
139147
tests :: F.TestTree
140148
tests = F.testGroup "Regressions"
141149
[ F.testCase "hGetContents_crash" hGetContents_crash
@@ -149,4 +157,5 @@ tests = F.testGroup "Regressions"
149157
, F.testCase "t280/fromString" t280_fromString
150158
, F.testCase "t280/singleton" t280_singleton
151159
, F.testCase "t301" t301
160+
, F.testCase "t330" t330
152161
]

0 commit comments

Comments
 (0)