text-encoding.scm 9.3 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358
  1. ; -*- Mode: Scheme; Syntax: Scheme; Package: Scheme; -*-
  2. ; Part of Scheme 48 1.9. See file COPYING for notices and license.
  3. ; Authors: Mike Sperber
  4. (define (encode-scalar-value encoding value buffer count)
  5. (let-syntax ((encode
  6. (syntax-rules ()
  7. ((encode ?encode-proc)
  8. (call-with-values
  9. (lambda ()
  10. (?encode-proc value buffer count))
  11. (lambda (encoding-ok? out-of-space? count)
  12. (values #t encoding-ok? out-of-space? count)))))))
  13. (enum-case
  14. text-encoding-option encoding
  15. ((us-ascii) (encode encode-scalar-value/us-ascii))
  16. ((latin-1) (encode encode-scalar-value/latin-1))
  17. ((utf-8) (encode encode-scalar-value/utf-8))
  18. ((utf-16le) (encode encode-scalar-value/utf-16le))
  19. ((utf-16be) (encode encode-scalar-value/utf-16be))
  20. ((utf-32le) (encode encode-scalar-value/utf-32le))
  21. ((utf-32be) (encode encode-scalar-value/utf-32be))
  22. (else
  23. (values #f #f #f 0)))))
  24. (define (decode-scalar-value encoding buffer count)
  25. (let-syntax ((decode
  26. (syntax-rules ()
  27. ((decode ?decode-proc)
  28. (call-with-values
  29. (lambda () (?decode-proc buffer count))
  30. (lambda (ok? incomplete? value count)
  31. (values #t ok? incomplete? value count)))))))
  32. (enum-case
  33. text-encoding-option encoding
  34. ((us-ascii) (decode decode-scalar-value/us-ascii))
  35. ((latin-1) (decode decode-scalar-value/latin-1))
  36. ((utf-8) (decode decode-scalar-value/utf-8))
  37. ((utf-16le) (decode decode-scalar-value/utf-16le))
  38. ((utf-16be) (decode decode-scalar-value/utf-16be))
  39. ((utf-32le) (decode decode-scalar-value/utf-32le))
  40. ((utf-32be) (decode decode-scalar-value/utf-32be))
  41. (else
  42. (values #f #f #f 0 0)))))
  43. ;; US-ASCII
  44. ;; This is mainly needed because it might be the default locale
  45. ;; encoding reported by the OS.
  46. (define (encode-scalar-value/us-ascii value buffer count)
  47. (cond
  48. ((< count 1)
  49. (values #t #t 1))
  50. ((< value 128)
  51. (buffer-set! buffer 0 value)
  52. (values #t #f 1))
  53. (else
  54. (values #f #f 0))))
  55. (define (decode-scalar-value/us-ascii buffer count)
  56. (values #t ; OK?
  57. #f ; incomplete?
  58. (buffer-ref buffer 0)
  59. 1))
  60. ; Latin-1
  61. (define (encode-scalar-value/latin-1 value buffer count)
  62. (cond
  63. ((< count 1)
  64. (values #t #t 1))
  65. ((< value 256)
  66. (buffer-set! buffer 0 value)
  67. (values #t #f 1))
  68. (else
  69. (values #f #f 0))))
  70. (define (decode-scalar-value/latin-1 buffer count)
  71. (values #t ; OK?
  72. #f ; incomplete?
  73. (buffer-ref buffer 0)
  74. 1))
  75. ; UTF-8
  76. (define (encode-scalar-value/utf-8 value buffer count)
  77. (cond
  78. ((<= value #x7f)
  79. (if (>= count 1)
  80. (begin
  81. (buffer-set! buffer 0 value)
  82. (values #t #f 1))
  83. (values #t #t 1)))
  84. ((<= value #x7ff)
  85. (if (>= count 2)
  86. (begin
  87. (buffer-set!
  88. buffer 0
  89. (+ #xc0
  90. (logical-shift-right (bitwise-and value #b11111000000)
  91. 6)))
  92. (buffer-set!
  93. buffer 1
  94. (+ #x80
  95. (bitwise-and value #b111111)))
  96. (values #t #f 2))
  97. (values #t #t 2)))
  98. ((<= value #xffff)
  99. (if (>= count 3)
  100. (begin
  101. (buffer-set!
  102. buffer 0
  103. (+ #xe0
  104. (logical-shift-right (bitwise-and value #b1111000000000000)
  105. 12)))
  106. (buffer-set!
  107. buffer 1
  108. (+ #x80
  109. (logical-shift-right (bitwise-and value #b111111000000)
  110. 6)))
  111. (buffer-set!
  112. buffer 2
  113. (+ #x80
  114. (bitwise-and value #b111111)))
  115. (values #t #f 3))
  116. (values #t #t 3)))
  117. (else
  118. (if (>= count 4)
  119. (begin
  120. (buffer-set!
  121. buffer 0
  122. (+ #xf0
  123. (logical-shift-right (bitwise-and value #b111000000000000000000)
  124. 18)))
  125. (buffer-set!
  126. buffer 1
  127. (+ #x80
  128. (logical-shift-right (bitwise-and value #b111111000000000000)
  129. 12)))
  130. (buffer-set!
  131. buffer 2
  132. (+ #x80
  133. (logical-shift-right (bitwise-and value #b111111000000)
  134. 6)))
  135. (buffer-set!
  136. buffer 3
  137. (+ #x80
  138. (bitwise-and value #b111111)))
  139. (values #t #f 4))
  140. (values #t #t 4)))))
  141. ; The table, and the associated decoding algorithm, is from
  142. ; Richard Gillam: "Unicode Demystified", chapter 14
  143. (define *utf-8-state-table*
  144. '#(;; state 0
  145. 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 -1 -1 -1 -1 -1 -1 -1 -1 1 1 1 1 2 2 3 -1
  146. ;; state 1
  147. -2 -2 -2 -2 -2 -2 -2 -2 -2 -2 -2 -2 -2 -2 -2 -2 0 0 0 0 0 0 0 0 -2 -2 -2 -2 -2 -2 -2 -2
  148. ;; state 2
  149. -2 -2 -2 -2 -2 -2 -2 -2 -2 -2 -2 -2 -2 -2 -2 -2 1 1 1 1 1 1 1 1 -2 -2 -2 -2 -2 -2 -2 -2
  150. ;; state 3
  151. -2 -2 -2 -2 -2 -2 -2 -2 -2 -2 -2 -2 -2 -2 -2 -2 2 2 2 2 2 2 2 2 -2 -2 -2 -2 -2 -2 -2 -2))
  152. (define *utf-8-masks* '#(#x7f #x1f #x0f #x07))
  153. ; We don't check for non-shortest-form UTF-8. Too bad.
  154. (define (decode-scalar-value/utf-8 buffer count)
  155. (let loop ((q 0) (state 0) (mask 0) (scalar-value 0))
  156. (if (< q count)
  157. (let* ((c (buffer-ref buffer q))
  158. (state (vector-ref *utf-8-state-table*
  159. (+ (shift-left state 5) ; (* state 32)
  160. (arithmetic-shift-right c 3)))))
  161. (case state
  162. ((0)
  163. (let ((scalar-value (+ scalar-value
  164. (bitwise-and c #x7f))))
  165. (if (scalar-value? scalar-value)
  166. (values #t #f scalar-value (+ q 1))
  167. (values #f #f 0 0))))
  168. ((1 2 3)
  169. (loop (+ 1 q) state #x3f
  170. (shift-left (+ scalar-value
  171. (bitwise-and c
  172. (if (= 0 mask)
  173. (vector-ref *utf-8-masks* state)
  174. mask)))
  175. 6)))
  176. ((-2 -1)
  177. (values #f #f 0 0))
  178. (else ; this can't happen
  179. (values #f #f 0 0))))
  180. (values #t #t 0 (+ 1 q)))))
  181. ; UTF-16
  182. (define (buffer-set-word16/le! buffer index word)
  183. (buffer-set! buffer index
  184. (bitwise-and #b11111111 word))
  185. (buffer-set! buffer (+ index 1)
  186. (logical-shift-right word 8)))
  187. (define (buffer-set-word16/be! buffer index word)
  188. (buffer-set! buffer index
  189. (logical-shift-right word 8))
  190. (buffer-set! buffer (+ index 1)
  191. (bitwise-and #b11111111 word)))
  192. (define (make-encode-scalar-value/utf-16 buffer-set-word16!)
  193. (lambda (value buffer count)
  194. (if (<= value #xffff)
  195. (if (< count 2)
  196. (values #t #t 2)
  197. (begin
  198. (buffer-set-word16! buffer 0 value)
  199. (values #t #f 2)))
  200. (if (< count 4)
  201. (values #t #t 4)
  202. (begin
  203. (buffer-set-word16!
  204. buffer 0
  205. (+ (logical-shift-right value 10) #xd7c0))
  206. (buffer-set-word16!
  207. buffer 2
  208. (+ (bitwise-and value #x3ff) #xdc00))
  209. (values #t #f 4))))))
  210. (define encode-scalar-value/utf-16le
  211. (make-encode-scalar-value/utf-16 buffer-set-word16/le!))
  212. (define encode-scalar-value/utf-16be
  213. (make-encode-scalar-value/utf-16 buffer-set-word16/be!))
  214. (define (buffer-ref-word16/le codes index)
  215. (+ (buffer-ref codes index)
  216. (shift-left (buffer-ref codes (+ index 1)) 8)))
  217. (define (buffer-ref-word16/be codes index)
  218. (+ (shift-left (buffer-ref codes index) 8)
  219. (buffer-ref codes (+ index 1))))
  220. (define (make-decode-scalar-value/utf-16 buffer-ref-word16)
  221. (lambda (buffer count)
  222. (if (< count 2)
  223. (values #t #t 0 2)
  224. (let ((word0 (buffer-ref-word16 buffer 0)))
  225. (cond
  226. ((or (< word0 #xd800)
  227. (> word0 #xdfff))
  228. (values #t #f word0 2))
  229. ((< count 4)
  230. (values #t #t 0 4))
  231. ((<= word0 #xdbff)
  232. (let ((word1 (buffer-ref-word16 buffer 2 )))
  233. (if (and (>= word1 #xdc00)
  234. (<= word1 #xdfff))
  235. (values #t #f
  236. (+ (shift-left (- word0 #xd7c0) 10)
  237. (bitwise-and word1 #x3ff))
  238. 4)
  239. (values #f #f 0 0))))
  240. (else
  241. (values #f #f 0 0)))))))
  242. (define decode-scalar-value/utf-16le
  243. (make-decode-scalar-value/utf-16 buffer-ref-word16/le))
  244. (define decode-scalar-value/utf-16be
  245. (make-decode-scalar-value/utf-16 buffer-ref-word16/be))
  246. ; UTF-32
  247. (define (encode-scalar-value/utf-32le value buffer count)
  248. (if (< count 4)
  249. (values #t #t 4)
  250. (begin
  251. (buffer-set! buffer 0
  252. (bitwise-and value #xff))
  253. (buffer-set! buffer 1
  254. (logical-shift-right
  255. (bitwise-and value #xff00)
  256. 8))
  257. (buffer-set! buffer 2
  258. (logical-shift-right
  259. (bitwise-and value #xff0000)
  260. 16))
  261. (buffer-set! buffer 3
  262. (logical-shift-right value 24))
  263. (values #t #f 4))))
  264. (define (encode-scalar-value/utf-32be value buffer count)
  265. (if (< count 4)
  266. (values #t #t 4)
  267. (begin
  268. (buffer-set! buffer 0
  269. (logical-shift-right value 24))
  270. (buffer-set! buffer 1
  271. (logical-shift-right
  272. (bitwise-and value #xff0000)
  273. 16))
  274. (buffer-set! buffer 2
  275. (logical-shift-right
  276. (bitwise-and value #xff00)
  277. 8))
  278. (buffer-set! buffer 3
  279. (bitwise-and value #xff))
  280. (values #t #f 4))))
  281. (define (decode-scalar-value/utf-32le buffer count)
  282. (if (< count 4)
  283. (values #t #t 0 4)
  284. (let ((code-point
  285. (+ (buffer-ref buffer 0)
  286. (shift-left (buffer-ref buffer 1)
  287. 8)
  288. (shift-left (buffer-ref buffer 2)
  289. 16)
  290. (shift-left (buffer-ref buffer 3)
  291. 24))))
  292. (if (scalar-value? code-point)
  293. (values #t #f
  294. code-point
  295. 4)
  296. (values #f #f 0 0)))))
  297. (define (decode-scalar-value/utf-32be buffer count)
  298. (if (< count 4)
  299. (values #t #t 0 4)
  300. (let ((code-point
  301. (+ (shift-left (buffer-ref buffer 0)
  302. 24)
  303. (shift-left (buffer-ref buffer 1)
  304. 16)
  305. (shift-left
  306. (buffer-ref buffer 2)
  307. 8)
  308. (buffer-ref buffer 3))))
  309. (if (scalar-value? code-point)
  310. (values #t #f
  311. code-point
  312. 4)
  313. (values #f #f 0 0)))))
  314. ; Utilities
  315. (define (scalar-value? x)
  316. (and (>= x 0)
  317. (or (<= x #xd7ff)
  318. (and (>= x #xe000) (<= x #x10ffff)))))
  319. (define (buffer-ref b i)
  320. (unsigned-byte-ref (address+ b i)))
  321. (define (buffer-set! b i v)
  322. (unsigned-byte-set! (address+ b i) v))