-
Notifications
You must be signed in to change notification settings - Fork 158
/
Copy pathEncoding.hs
254 lines (226 loc) · 9.13 KB
/
Encoding.hs
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
{-# LANGUAGE BangPatterns,CPP #-}
#if __GLASGOW_HASKELL__ >= 702
{-# LANGUAGE Trustworthy #-}
#endif
-- |
-- Module : Data.Text.Lazy.Encoding
-- Copyright : (c) 2009, 2010 Bryan O'Sullivan
--
-- License : BSD-style
-- Maintainer : bos@serpentine.com
-- Portability : portable
--
-- Functions for converting lazy 'Text' values to and from lazy
-- 'ByteString', using several standard encodings.
--
-- To gain access to a much larger family of encodings, use the
-- <http://hackage.haskell.org/package/text-icu text-icu package>.
module Data.Text.Lazy.Encoding
(
-- * Decoding ByteStrings to Text
-- $strict
decodeASCII
, decodeLatin1
, decodeUtf8
, decodeUtf16LE
, decodeUtf16BE
, decodeUtf32LE
, decodeUtf32BE
-- ** Catchable failure
, decodeUtf8'
-- ** Controllable error handling
, decodeUtf8With
, decodeUtf16LEWith
, decodeUtf16BEWith
, decodeUtf32LEWith
, decodeUtf32BEWith
-- * Encoding Text to ByteStrings
, encodeUtf8
, encodeUtf16LE
, encodeUtf16BE
, encodeUtf32LE
, encodeUtf32BE
-- * Encoding Text using ByteString Builders
, encodeUtf8Builder
, encodeUtf8BuilderEscaped
) where
import Control.Exception (evaluate, try)
import Data.Monoid (Monoid(..))
import Data.Text.Encoding.Error (OnDecodeError, UnicodeException, strictDecode)
import Data.Text.Internal.Lazy (Text(..), chunk, empty, foldrChunks)
import Data.Word (Word8)
import qualified Data.ByteString as S
import qualified Data.ByteString.Builder as B
import qualified Data.ByteString.Builder.Extra as B (safeStrategy, toLazyByteStringWith)
import qualified Data.ByteString.Builder.Prim as BP
import qualified Data.ByteString.Lazy as B
import qualified Data.ByteString.Lazy.Internal as B
import qualified Data.ByteString.Unsafe as B
import qualified Data.Text as T
import qualified Data.Text.Encoding as TE
import qualified Data.Text.Internal.Lazy.Encoding.Fusion as E
import qualified Data.Text.Internal.Lazy.Fusion as F
import Data.Text.Unsafe (unsafeDupablePerformIO)
-- $strict
--
-- All of the single-parameter functions for decoding bytestrings
-- encoded in one of the Unicode Transformation Formats (UTF) operate
-- in a /strict/ mode: each will throw an exception if given invalid
-- input.
--
-- Each function has a variant, whose name is suffixed with -'With',
-- that gives greater control over the handling of decoding errors.
-- For instance, 'decodeUtf8' will throw an exception, but
-- 'decodeUtf8With' allows the programmer to determine what to do on a
-- decoding error.
-- | /Deprecated/. Decode a 'ByteString' containing 7-bit ASCII
-- encoded text.
decodeASCII :: B.ByteString -> Text
decodeASCII = decodeUtf8
{-# DEPRECATED decodeASCII "Use decodeUtf8 instead" #-}
-- | Decode a 'ByteString' containing Latin-1 (aka ISO-8859-1) encoded text.
decodeLatin1 :: B.ByteString -> Text
decodeLatin1 = foldr (chunk . TE.decodeLatin1) empty . B.toChunks
-- | Decode a 'ByteString' containing UTF-8 encoded text.
decodeUtf8With :: OnDecodeError -> B.ByteString -> Text
decodeUtf8With onErr (B.Chunk b0 bs0) =
case TE.streamDecodeUtf8With onErr b0 of
TE.Some t l f -> chunk t (go f l bs0)
where
go f0 _ (B.Chunk b bs) =
case f0 b of
TE.Some t l f -> chunk t (go f l bs)
go _ l _
| S.null l = empty
| otherwise =
let !t = T.pack (skipBytes l)
skipBytes = S.foldr (\x s' ->
case onErr desc (Just x) of
Just c -> c : s'
Nothing -> s') [] in
Chunk t Empty
desc = "Data.Text.Lazy.Encoding.decodeUtf8With: Invalid UTF-8 stream"
decodeUtf8With _ _ = empty
-- | Decode a 'ByteString' containing UTF-8 encoded text that is known
-- to be valid.
--
-- If the input contains any invalid UTF-8 data, an exception will be
-- thrown that cannot be caught in pure code. For more control over
-- the handling of invalid data, use 'decodeUtf8'' or
-- 'decodeUtf8With'.
decodeUtf8 :: B.ByteString -> Text
decodeUtf8 = decodeUtf8With strictDecode
{-# INLINE[0] decodeUtf8 #-}
-- This rule seems to cause performance loss.
{- RULES "LAZY STREAM stream/decodeUtf8' fusion" [1]
forall bs. F.stream (decodeUtf8' bs) = E.streamUtf8 strictDecode bs #-}
-- | Decode a 'ByteString' containing UTF-8 encoded text..
--
-- If the input contains any invalid UTF-8 data, the relevant
-- exception will be returned, otherwise the decoded text.
--
-- /Note/: this function is /not/ lazy, as it must decode its entire
-- input before it can return a result. If you need lazy (streaming)
-- decoding, use 'decodeUtf8With' in lenient mode.
decodeUtf8' :: B.ByteString -> Either UnicodeException Text
decodeUtf8' bs = unsafeDupablePerformIO $ do
let t = decodeUtf8 bs
try (evaluate (rnf t `seq` t))
where
rnf Empty = ()
rnf (Chunk _ ts) = rnf ts
{-# INLINE decodeUtf8' #-}
-- | Encode text using UTF-8 encoding.
encodeUtf8 :: Text -> B.ByteString
encodeUtf8 Empty = B.empty
encodeUtf8 lt@(Chunk t _) =
B.toLazyByteStringWith strategy B.empty $ encodeUtf8Builder lt
where
-- To improve our small string performance, we use a strategy that
-- allocates a buffer that is guaranteed to be large enough for the
-- encoding of the first chunk, but not larger than the default
-- B.smallChunkSize. We clamp the firstChunkSize to ensure that we don't
-- generate too large buffers which hamper streaming.
firstChunkSize = min B.smallChunkSize (4 * (T.length t + 1))
strategy = B.safeStrategy firstChunkSize B.defaultChunkSize
-- | Encode text to a ByteString 'B.Builder' using UTF-8 encoding.
--
-- @since 1.1.0.0
encodeUtf8Builder :: Text -> B.Builder
encodeUtf8Builder =
foldrChunks (\c b -> TE.encodeUtf8Builder c `mappend` b) Data.Monoid.mempty
-- | Encode text using UTF-8 encoding and escape the ASCII characters using
-- a 'BP.BoundedPrim'.
--
-- Use this function is to implement efficient encoders for text-based formats
-- like JSON or HTML.
--
-- @since 1.1.0.0
{-# INLINE encodeUtf8BuilderEscaped #-}
encodeUtf8BuilderEscaped :: BP.BoundedPrim Word8 -> Text -> B.Builder
encodeUtf8BuilderEscaped prim =
foldrChunks (\c b -> TE.encodeUtf8BuilderEscaped prim c `mappend` b) mempty
-- | Decode text from little endian UTF-16 encoding.
decodeUtf16LEWith :: OnDecodeError -> B.ByteString -> Text
decodeUtf16LEWith onErr bs = F.unstream (E.streamUtf16LE onErr bs)
{-# INLINE decodeUtf16LEWith #-}
-- | Decode text from little endian UTF-16 encoding.
--
-- If the input contains any invalid little endian UTF-16 data, an
-- exception will be thrown. For more control over the handling of
-- invalid data, use 'decodeUtf16LEWith'.
decodeUtf16LE :: B.ByteString -> Text
decodeUtf16LE = decodeUtf16LEWith strictDecode
{-# INLINE decodeUtf16LE #-}
-- | Decode text from big endian UTF-16 encoding.
decodeUtf16BEWith :: OnDecodeError -> B.ByteString -> Text
decodeUtf16BEWith onErr bs = F.unstream (E.streamUtf16BE onErr bs)
{-# INLINE decodeUtf16BEWith #-}
-- | Decode text from big endian UTF-16 encoding.
--
-- If the input contains any invalid big endian UTF-16 data, an
-- exception will be thrown. For more control over the handling of
-- invalid data, use 'decodeUtf16BEWith'.
decodeUtf16BE :: B.ByteString -> Text
decodeUtf16BE = decodeUtf16BEWith strictDecode
{-# INLINE decodeUtf16BE #-}
-- | Encode text using little endian UTF-16 encoding.
encodeUtf16LE :: Text -> B.ByteString
encodeUtf16LE txt = B.fromChunks (foldrChunks ((:) . TE.encodeUtf16LE) [] txt)
{-# INLINE encodeUtf16LE #-}
-- | Encode text using big endian UTF-16 encoding.
encodeUtf16BE :: Text -> B.ByteString
encodeUtf16BE txt = B.fromChunks (foldrChunks ((:) . TE.encodeUtf16BE) [] txt)
{-# INLINE encodeUtf16BE #-}
-- | Decode text from little endian UTF-32 encoding.
decodeUtf32LEWith :: OnDecodeError -> B.ByteString -> Text
decodeUtf32LEWith onErr bs = F.unstream (E.streamUtf32LE onErr bs)
{-# INLINE decodeUtf32LEWith #-}
-- | Decode text from little endian UTF-32 encoding.
--
-- If the input contains any invalid little endian UTF-32 data, an
-- exception will be thrown. For more control over the handling of
-- invalid data, use 'decodeUtf32LEWith'.
decodeUtf32LE :: B.ByteString -> Text
decodeUtf32LE = decodeUtf32LEWith strictDecode
{-# INLINE decodeUtf32LE #-}
-- | Decode text from big endian UTF-32 encoding.
decodeUtf32BEWith :: OnDecodeError -> B.ByteString -> Text
decodeUtf32BEWith onErr bs = F.unstream (E.streamUtf32BE onErr bs)
{-# INLINE decodeUtf32BEWith #-}
-- | Decode text from big endian UTF-32 encoding.
--
-- If the input contains any invalid big endian UTF-32 data, an
-- exception will be thrown. For more control over the handling of
-- invalid data, use 'decodeUtf32BEWith'.
decodeUtf32BE :: B.ByteString -> Text
decodeUtf32BE = decodeUtf32BEWith strictDecode
{-# INLINE decodeUtf32BE #-}
-- | Encode text using little endian UTF-32 encoding.
encodeUtf32LE :: Text -> B.ByteString
encodeUtf32LE txt = B.fromChunks (foldrChunks ((:) . TE.encodeUtf32LE) [] txt)
{-# INLINE encodeUtf32LE #-}
-- | Encode text using big endian UTF-32 encoding.
encodeUtf32BE :: Text -> B.ByteString
encodeUtf32BE txt = B.fromChunks (foldrChunks ((:) . TE.encodeUtf32BE) [] txt)
{-# INLINE encodeUtf32BE #-}