Skip to content

Commit f6e236a

Browse files
committed
Start quick check implementation
1 parent 278f273 commit f6e236a

File tree

5 files changed

+107
-12
lines changed

5 files changed

+107
-12
lines changed

Data/Text/Normalize.hs

+5-3
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,7 @@ module Data.Text.Normalize
1616
NormalizationMode(..)
1717
-- * Normalization API
1818
, normalize
19+
, normalizeQC
1920
) where
2021

2122
import Data.Text (Text)
@@ -24,6 +25,7 @@ import Data.Unicode.Types (NormalizationMode(..))
2425
-- Internal modules
2526
import Data.Unicode.Internal.NormalizeStream
2627
( DecomposeMode(..)
28+
, normalizeQC
2729
, stream
2830
, unstream
2931
, unstreamC
@@ -34,7 +36,7 @@ import Data.Unicode.Internal.NormalizeStream
3436
normalize :: NormalizationMode -> Text -> Text
3537
normalize mode =
3638
case mode of
37-
NFD -> (unstream Canonical) . stream
38-
NFKD -> (unstream Kompat) . stream
39-
NFC -> (unstreamC Canonical) . stream
39+
NFD -> (unstream Canonical) . stream
40+
NFKD -> (unstream Kompat) . stream
41+
NFC -> (unstreamC Canonical) . stream
4042
NFKC -> (unstreamC Kompat) . stream

Data/Unicode/Internal/NormalizeStream.hs

+56-1
Original file line numberDiff line numberDiff line change
@@ -17,18 +17,27 @@
1717
module Data.Unicode.Internal.NormalizeStream
1818
(
1919
UC.DecomposeMode(..)
20+
, normalizeQC
2021
, stream
2122
, unstream
2223
, unstreamC
2324
)
2425
where
2526

2627
import Data.Char (chr, ord)
28+
import Data.Unicode.Types (NormalizationMode(..))
2729
import GHC.ST (ST(..))
2830
import GHC.Types (SPEC(..))
31+
import Unicode.Char.Normalization
32+
( QuickCheck(..)
33+
, isNFD
34+
, isNFKD
35+
, isNFC
36+
, isNFKC
37+
)
2938

30-
import qualified Data.Text.Array as A
3139
import qualified Unicode.Char as UC
40+
import qualified Data.Text.Array as A
3241

3342
#if MIN_VERSION_text(2,0,0)
3443
import Data.Text.Internal.Fusion (stream)
@@ -169,6 +178,52 @@ stream (Text arr off len) = Stream next off (betweenSize (len `shiftR` 1) len)
169178
{-# INLINE [0] stream #-}
170179
#endif
171180

181+
-- | Perform Unicode normalization on @Text@ according to the specified
182+
-- normalization mode, using the quick check algorithm to speed up case when
183+
-- the text is already normalized.
184+
normalizeQC
185+
:: NormalizationMode
186+
-> Text
187+
-> Text
188+
normalizeQC mode t = case stream t of
189+
str@(Stream next0 s0 _len) ->
190+
let quickCheck s cc r = case next0 s of
191+
Done -> r
192+
Skip s' -> quickCheck s' cc r
193+
Yield c s' ->
194+
-- [TODO] the ASCII check speeds up latin scripts,
195+
-- but it slows down the other scripts.
196+
if c <= '\x7f'
197+
-- For ASCII we know it’s always allowed and a starter
198+
then quickCheck s' 0 r
199+
-- Otherwise, lookup the combining class and QC property
200+
else let cc' = UC.combiningClass c in
201+
if cc > cc' && cc' /= 0
202+
then No
203+
else case check c of
204+
No -> No
205+
Maybe -> quickCheck s' cc' Maybe
206+
Yes -> quickCheck s' cc' r
207+
-- let cc' = UC.combiningClass c
208+
-- in if cc > cc' && cc' /= 0
209+
-- then No
210+
-- else case check c of
211+
-- No -> No
212+
-- Maybe -> quickCheck s' cc' Maybe
213+
-- Yes -> quickCheck s' cc' r
214+
check = case mode of
215+
NFD -> isNFD
216+
NFKD -> isNFKD
217+
NFC -> isNFC
218+
NFKC -> isNFKC
219+
in case quickCheck s0 0 Yes of
220+
Yes -> t
221+
_ -> case mode of
222+
NFD -> unstream UC.Canonical str
223+
NFKD -> unstream UC.Kompat str
224+
NFC -> unstreamC UC.Canonical str
225+
NFKC -> unstreamC UC.Kompat str
226+
172227
-- | /O(n)/ Convert a 'Stream Char' into a decompose-normalized 'Text'.
173228
unstream :: UC.DecomposeMode -> Stream Char -> Text
174229
unstream mode (Stream next0 s0 len) = runText $ \done -> do

benchmark/Benchmark.hs

+35-5
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,9 @@ import Path.IO (listDir)
2020
import System.FilePath (dropExtensions, takeFileName)
2121

2222
import Gauge.Main (Benchmark, bench, bgroup, defaultMain, env, nf)
23+
#ifdef USE_TASTY
24+
import Test.Tasty.Bench (bcompare)
25+
#endif
2326

2427
import qualified Data.Text as T
2528
import qualified Data.Text.Normalize as UTText
@@ -44,6 +47,14 @@ unicodeTransformTextFuncs =
4447
, ("NFKC", UTText.normalize UTText.NFKC)
4548
]
4649

50+
unicodeTransformTextFuncsQuickCheck :: [(String, Text -> Text)]
51+
unicodeTransformTextFuncsQuickCheck =
52+
[ ("NFD", UTText.normalizeQC UTText.NFD)
53+
, ("NFKD", UTText.normalizeQC UTText.NFKD)
54+
, ("NFC", UTText.normalizeQC UTText.NFC)
55+
, ("NFKC", UTText.normalizeQC UTText.NFKC)
56+
]
57+
4758
dataDir :: Path Rel Dir
4859
dataDir = $(mkRelDir "benchmark") </> $(mkRelDir "data")
4960

@@ -53,9 +64,25 @@ dataDir = $(mkRelDir "benchmark") </> $(mkRelDir "data")
5364
dataSetSize :: Int
5465
dataSetSize = 1000000
5566

56-
makeBench :: (NFData a, NFData b) => (String, a -> b) -> (String, IO a) -> Benchmark
57-
makeBench (implName, func) (dataName, setup) =
58-
env setup (\txt -> bench (implName ++ "/" ++ dataName) (nf func txt))
67+
makeBench :: (NFData a, NFData b) => (String, a -> b) -> String -> a -> Benchmark
68+
makeBench (implName, func) dataName =
69+
\txt -> bench (makeTestName implName dataName) (nf func txt)
70+
71+
makeTestName :: String -> String -> String
72+
makeTestName implName dataName = implName ++ "/" ++ dataName
73+
74+
makeBenchRef :: (NFData a, NFData b) => (String, a -> b) -> (String, IO a) -> Benchmark
75+
makeBenchRef impl (dataName, setup) = env setup (makeBench impl dataName)
76+
77+
makeBenchComp :: (NFData a, NFData b) => (String, a -> b) -> (String, IO a) -> Benchmark
78+
#ifdef USE_TASTY
79+
makeBenchComp impl (dataName, setup) = env setup
80+
( bcompare ("$NF == \"" <> (makeTestName (fst impl) dataName)
81+
<> "\" && $(NF-1) == \"unicode-transforms-text\"")
82+
. makeBench impl dataName)
83+
#else
84+
makeBenchComp = makeBenchRef
85+
#endif
5986

6087
strInput :: FilePath -> (String, IO String)
6188
strInput file = (dataName file,
@@ -73,10 +100,13 @@ main = do
73100
[
74101
#ifdef BENCH_ICU
75102
bgroup "text-icu"
76-
$ makeBench <$> textICUFuncs <*> (map txtInput dataFiles)
103+
$ makeBenchComp <$> textICUFuncs <*> (map txtInput dataFiles)
77104
,
78105
#endif
79106
bgroup "unicode-transforms-text"
80-
$ makeBench <$> unicodeTransformTextFuncs
107+
$ makeBenchRef <$> unicodeTransformTextFuncs
108+
<*> (map txtInput dataFiles)
109+
, bgroup "unicode-transforms-text (QC)"
110+
$ makeBenchComp <$> unicodeTransformTextFuncsQuickCheck
81111
<*> (map txtInput dataFiles)
82112
]

cabal.project

+5
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
packages: .
2+
source-repository-package
3+
type: git
4+
location: https://github.com/composewell/unicode-data.git
5+
tag: d81b67cc76d7f312a35de8e2a42c8e856c393885

test/NormalizationTest.hs

+6-3
Original file line numberDiff line numberDiff line change
@@ -22,7 +22,9 @@ import Data.List.Split (splitOn)
2222
import Data.Text (Text)
2323
import qualified Data.Text as T
2424
import qualified Data.Text.Encoding as T
25-
import Data.Text.Normalize (NormalizationMode(NFD, NFKD, NFC, NFKC), normalize)
25+
import Data.Text.Normalize
26+
( NormalizationMode(NFD, NFKD, NFC, NFKC)
27+
, normalize, normalizeQC)
2628
import Text.Printf (printf)
2729

2830
#if !MIN_VERSION_base(4,8,0)
@@ -49,8 +51,9 @@ checkEqual opName op (c1, c2) =
4951

5052
checkOp :: String -> NormalizationMode -> [(Text, Text)] -> IO Bool
5153
checkOp name op pairs = do
52-
res <- mapM (checkEqual name ((normalize op))) pairs
53-
return $ all (== True) res
54+
res1 <- mapM (checkEqual name ((normalize op))) pairs
55+
res2 <- mapM (checkEqual name ((normalizeQC op))) pairs
56+
return $ all (== True) res1 && all (== True) res2
5457

5558
checkNFC :: (Text, Text, Text, Text, Text) -> IO Bool
5659
checkNFC (c1, c2, c3, c4, c5) =

0 commit comments

Comments
 (0)