Skip to content
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.

Commit 51de0e8

Browse files
committedAug 22, 2023
Add decodeNewlineDelimited
1 parent 74c5ee7 commit 51de0e8

File tree

3 files changed

+68
-12
lines changed

3 files changed

+68
-12
lines changed
 

‎CHANGELOG.md‎

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,9 @@
11
# Revision history for json-syntax
22

3+
## 0.2.7.0 -- 2023-??-??
4+
5+
* Add `decodeNewlineDelimited`.
6+
37
## 0.2.6.1 -- 2023-07-28
48

59
* Correct implementations of `object15` and `object16`.

‎json-syntax.cabal‎

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
cabal-version: 2.2
22
name: json-syntax
3-
version: 0.2.6.1
3+
version: 0.2.7.0
44
synopsis: High-performance JSON parser and encoder
55
description:
66
This library parses JSON into a @Value@ type that is consistent with the
@@ -38,15 +38,16 @@ library
3838
, byteslice >=0.2.9 && <0.3
3939
, bytesmith >=0.3.8 && <0.4
4040
, bytestring >=0.10.8 && <0.12
41-
, natural-arithmetic >=0.1.2 && <0.3
4241
, contiguous >=0.6 && <0.7
42+
, natural-arithmetic >=0.1.2 && <0.3
4343
, primitive >=0.7 && <0.10
4444
, run-st >=0.1.1 && <0.2
4545
, scientific-notation >=0.1.6 && <0.2
46+
, text >=1.2
4647
, text-short >=0.1.3 && <0.2
48+
, transformers >=0.5.6.2
4749
, word-compat >=0.0.3
4850
, zigzag >=0.0.1
49-
, text >=1.2
5051
hs-source-dirs: src
5152
default-language: Haskell2010
5253
ghc-options: -Wall -O2

‎src/Json.hs‎

Lines changed: 60 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,7 @@ module Json
2020
, ToValue(..)
2121
-- * Functions
2222
, decode
23+
, decodeNewlineDelimited
2324
, encode
2425
-- * Infix Synonyms
2526
, pattern (:->)
@@ -78,9 +79,13 @@ import GHC.Exts (Char(C#),Int(I#),gtWord#,ltWord#,word2Int#,chr#)
7879
import GHC.Word (Word8,Word16,Word32,Word64)
7980
import GHC.Int (Int8,Int16,Int32,Int64)
8081
import Data.Text (Text)
82+
import Data.Foldable (foldlM)
83+
import Control.Monad.Trans.Except (runExceptT,except)
84+
import Control.Monad.Trans.Class (lift)
8185

8286
import qualified Prelude
8387
import qualified Data.Builder.ST as B
88+
import qualified Data.Bytes as Bytes
8489
import qualified Data.Bytes.Builder as BLDR
8590
import qualified Data.Bytes.Parser as P
8691
import qualified Data.Chunks as Chunks
@@ -142,6 +147,7 @@ data SyntaxException
142147
| InvalidNumber
143148
| LeadingZero
144149
| UnexpectedLeftovers
150+
| PossibleOverflow
145151
deriving stock (Eq,Show)
146152
deriving anyclass (Exception)
147153

@@ -172,13 +178,57 @@ isSpace w =
172178

173179
-- | Decode a JSON syntax tree from a byte sequence.
174180
decode :: Bytes -> Either SyntaxException Value
175-
decode = P.parseBytesEither do
181+
{-# noinline decode #-}
182+
decode = P.parseBytesEither parser
183+
184+
parser :: Parser SyntaxException s Value
185+
{-# inline parser #-}
186+
parser = do
176187
P.skipWhile isSpace
177-
result <- Latin.any EmptyInput >>= parser
188+
result <- Latin.any EmptyInput >>= parserStep
178189
P.skipWhile isSpace
179190
P.endOfInput UnexpectedLeftovers
180191
pure result
181192

193+
-- | Decode newline-delimited JSON. Both the LF and the CRLF conventions
194+
-- are supported. The newline character (or character sequence) following
195+
-- the final object may be omitted. This also allows blanks lines consisting
196+
-- of only whitespace.
197+
--
198+
-- It's not strictly necessary for this to be a part of this library, but
199+
-- newline-delimited JSON is somewhat common in practice. It's nice to have
200+
-- this here instead of having to reimplement it in a bunch of different
201+
-- applications.
202+
--
203+
-- Note: To protect against malicious input, this reject byte sequences with
204+
-- more than 10 million newlines. If this is causing a problem for you, open
205+
-- an issue.
206+
--
207+
-- Other note: in the future, this function might be changed transparently
208+
-- to parallelize the decoding of large input (at least 1000 lines) with
209+
-- GHC sparks.
210+
decodeNewlineDelimited :: Bytes -> Either SyntaxException (SmallArray Value)
211+
{-# noinline decodeNewlineDelimited #-}
212+
decodeNewlineDelimited !everything =
213+
let maxVals = Bytes.count 0x0A everything + 1
214+
in if maxVals > 10000000
215+
then Left PossibleOverflow
216+
else runST $ runExceptT $ do
217+
!dst <- PM.newSmallArray maxVals Null
218+
!total <- foldlM
219+
(\ !ix b ->
220+
let clean = Bytes.dropWhile isSpace (Bytes.dropWhileEnd isSpace b)
221+
in if Bytes.null clean
222+
then pure ix
223+
else do
224+
v <- except (decode clean)
225+
lift (PM.writeSmallArray dst ix v)
226+
pure (ix + 1)
227+
) 0 (Bytes.split 0x0A everything)
228+
lift $ PM.shrinkSmallMutableArray dst total
229+
dst' <- lift $ PM.unsafeFreezeSmallArray dst
230+
pure dst'
231+
182232
-- | Encode a JSON syntax tree.
183233
encode :: Value -> BLDR.Builder
184234
{-# noinline encode #-}
@@ -229,9 +279,10 @@ foldrTail f z !ary = go 1 where
229279
= f x (go (i+1))
230280

231281
-- Precondition: skip over all space before calling this.
232-
-- It will not skip leading space for you. It does
233-
parser :: Char -> Parser SyntaxException s Value
234-
parser = \case
282+
-- It will not skip leading space for you. It does not skip
283+
-- over trailing space either.
284+
parserStep :: Char -> Parser SyntaxException s Value
285+
parserStep = \case
235286
'{' -> objectTrailedByBrace
236287
'[' -> arrayTrailedByBracket
237288
't' -> do
@@ -266,7 +317,7 @@ objectTrailedByBrace = do
266317
P.skipWhile isSpace
267318
Latin.char ExpectedColon ':'
268319
P.skipWhile isSpace
269-
val <- Latin.any IncompleteObject >>= parser
320+
val <- Latin.any IncompleteObject >>= parserStep
270321
let !mbr = Member theKey val
271322
!b0 <- P.effect B.new
272323
b1 <- P.effect (B.push mbr b0)
@@ -285,7 +336,7 @@ objectStep !b = do
285336
P.skipWhile isSpace
286337
Latin.char ExpectedColon ':'
287338
P.skipWhile isSpace
288-
val <- Latin.any IncompleteObject >>= parser
339+
val <- Latin.any IncompleteObject >>= parserStep
289340
let !mbr = Member theKey val
290341
P.effect (B.push mbr b) >>= objectStep
291342
'}' -> do
@@ -310,7 +361,7 @@ arrayTrailedByBracket = do
310361
']' -> pure emptyArray
311362
c -> do
312363
!b0 <- P.effect B.new
313-
val <- parser c
364+
val <- parserStep c
314365
b1 <- P.effect (B.push val b0)
315366
arrayStep b1
316367

@@ -328,7 +379,7 @@ arrayStep !b = do
328379
Latin.any IncompleteArray >>= \case
329380
',' -> do
330381
P.skipWhile isSpace
331-
val <- Latin.any IncompleteArray >>= parser
382+
val <- Latin.any IncompleteArray >>= parserStep
332383
P.effect (B.push val b) >>= arrayStep
333384
']' -> do
334385
!r <- P.effect (B.freeze b)

0 commit comments

Comments
 (0)
Please sign in to comment.