-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathutf8-tests.scm
135 lines (109 loc) · 4.55 KB
/
utf8-tests.scm
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
;;; utf8proc->s7 tests
(load "libutf8proc.scm")
(when (defined? '*libutf8proc*)
(with-let *libutf8proc*
;; --------------------------------
;; these are from the libutf8proc test directory
(define (print-property c)
(format *stderr* " category = ~S~% charwidth = ~D~%~A~%"
(utf8proc_category_string c)
(utf8proc_charwidth c)
(utf8proc_get_property c)))
(do ((c 1 (+ c 1)))
((= c #x110000))
(let ((l (utf8proc_tolower c))
(u (utf8proc_toupper c)))
(unless (or (= l c)
(utf8proc_codepoint_valid l))
(format *stderr* "~X: invalid tolower~%" c))
(unless (or (= u c)
(utf8proc_codepoint_valid u))
(format *stderr* "~X: invalid toupper~%" c))
))
(do ((c 0 (+ c 1)))
((or (= c #xd800)
(and (not (utf8proc_codepoint_valid c))
(not (format *stderr* "~X: codepoint invalid~%" c))))))
(do ((c #xd800 (+ c 1)))
((or (= c #xe000)
(and (utf8proc_codepoint_valid c)
(not (format *stderr* "~X: codepoint valid?~%" c))))))
(do ((c #xe000 (+ c 1)))
((or (= c #x110000)
(and (not (utf8proc_codepoint_valid c))
(not (format *stderr* "~X: codepoint invalid~%" c))))))
(do ((c #x110000 (+ c 1)))
((or (= c #x110010)
(and (utf8proc_codepoint_valid c)
(not (format *stderr* "~X: codepoint valid?~%" c))))))
;; (print-property #xbb)
(do ((c 1 (+ c 1)))
((= c #x110000))
(let ((cat ((utf8proc_get_property c) 'category))
(w (utf8proc_charwidth c)))
(if (and (or (= cat UTF8PROC_CATEGORY_MN) (= cat UTF8PROC_CATEGORY_ME))
(positive? w))
(format *stderr* "nonzero width ~D for combining char ~X~%" w c))
(if (and (zero? w)
(or (and (>= cat UTF8PROC_CATEGORY_LU) (<= cat UTF8PROC_CATEGORY_LO))
(and (>= cat UTF8PROC_CATEGORY_ND) (<= cat UTF8PROC_CATEGORY_SC))
(and (>= cat UTF8PROC_CATEGORY_SO) (<= cat UTF8PROC_CATEGORY_ZS))))
(format *stderr* "zero width for symbol-like char ~X~%" c))))
;; --------------------------------
(define s '("élan ‘quote’")) ; example from Norman Gray
(display s) ; ("élan â\x80;\x98;quoteâ\x80;\x99;") -- this is due to write's slashify_table choices: now displays ("élan ‘quote’")
(newline)
(display (car s)) ; élan ‘quote’
(newline)
(define b (string->byte-vector (car s)))
(format #t "~{~X ~}" b) ;c3 a9 6c 61 6e 20 e2 80 98 71 75 6f 74 65 e2 80 99
(newline)
(define p (utf8proc_map (car s) UTF8PROC_NULLTERM)) ; is this doing anything useful (besides error checking)?
(display (car p)) ; élan ‘quote’
(newline)
(define p1 (utf8proc_map "(\"élan ‘quote’\")" UTF8PROC_NULLTERM))
(display (car p1)) ; ("élan ‘quote’")
(newline)
(define b1 (string->byte-vector (car p1)))
(format #t "~{~X ~}" b1) ;28 22 c3 a9 6c 61 6e 20 e2 80 98 71 75 6f 74 65 e2 80 99 22 29
(newline)
(define s1 (with-output-to-string (lambda () (display s))))
(display s1) (newline) ; ("élan ‘quote’")
(define p2 (utf8proc_map s1 UTF8PROC_NULLTERM))
(if (integer? (cdr p2))
(display (utf8proc_errmsg (cdr p2))) ; "Invalid UTF-8 string" or "unknown error" -- what is the problem here?
(display (car p2)))
(newline)
(let ((len (cdr p1))
(p1c (copy (car p1))))
(do ((n (utf8proc_iterate p1c len) (utf8proc_iterate p1c len)))
((<= (car n) 0)) ; (cdr n) is the codepoint as an integer
(display (substring p1c 0 (car n))) (display #\space) ; ( " é l a n ‘ q u o t e ’ " )
(set! p1c (substring p1c (car n)))
(set! len (- len (car n))))
(newline))
(let ((e1 (utf8proc_encode_char #x00E9))) ; unicode code-point to utf-8 -> (cons utf-8-string length-thereof)
(format #t "#x~{~X~}" (string->byte-vector (car e1))) ; #xc3a9
(newline)
(display (car e1)) ; é
(newline))
(let ((e1 (utf8proc_encode_char #x018b)))
(format #t "#x~{~X~}" (string->byte-vector (car e1)))
(newline)
(display (car e1)) ; latin cap D with top bar
(newline))
(let ((e1 (utf8proc_encode_char #x0238)))
(format #t "#x~{~X~}" (string->byte-vector (car e1)))
(newline)
(display (car e1)) ; latin small db digraph
(newline))
(let ((e1 (utf8proc_encode_char #x1e00)))
(format #t "#x~{~X~}" (string->byte-vector (car e1)))
(newline)
(display (car e1)) ; latin cap A ring below
(newline))
(display (string->symbol "élan ‘quote’"))
(newline)
(display (symbol->string (symbol "élan ‘quote’")))
(newline)
))