bytevector-string.scm 3.8 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114
  1. ; Part of Scheme 48 1.9. See file COPYING for notices and license.
  2. ; Authors: Harald Glab-Phlak, Mike Sperber
  3. (define (string->utf8 string)
  4. (enc:string->utf-8 string))
  5. ; If the bytevector begins with the three-byte sequence #xef #xbb
  6. ; #xbf, then those bytes are ignored. (They are conventionally used
  7. ; as a signature to indicate UTF-8 encoding. The string->utf8
  8. ; procedure does not emit those bytes, but UTF-8 encodings produced by
  9. ; other sources may contain them.)
  10. (define (replacement-character)
  11. (integer->char #xfffd))
  12. (define (begins-with-utf8-bom? bv)
  13. (let* ((n (bytevector-length bv)))
  14. (and (<= 3 n)
  15. (= #xef (bytevector-u8-ref bv 0))
  16. (= #xbb (bytevector-u8-ref bv 1))
  17. (= #xbf (bytevector-u8-ref bv 2)))))
  18. (define (utf8->string bv)
  19. (if (begins-with-utf8-bom? bv)
  20. (let ((start 3)
  21. (count (- (bytevector-length bv) 3)))
  22. (enc:utf-8->string-n bv start count (replacement-character)))
  23. (enc:utf-8->string bv (replacement-character))))
  24. (define string->utf16
  25. (opt-lambda (string (endness #f))
  26. (let ((text-codec
  27. (case endness
  28. ((#f big) utf-16be-codec)
  29. ((little) utf-16le-codec)
  30. (else (endianness-violation 'string->utf16 endness)))))
  31. (enc:string->bytes text-codec string))))
  32. (define (maybe-utf16-bom bytevector n)
  33. (and (<= 2 n)
  34. (let ((b0 (bytevector-u8-ref bytevector 0))
  35. (b1 (bytevector-u8-ref bytevector 1)))
  36. (or (and (= b0 #xfe) (= b1 #xff) (endianness big))
  37. (and (= b0 #xff) (= b1 #xfe) (endianness little))))))
  38. (define utf16->string
  39. (opt-lambda (bytevector endness (endianness-mandatory? #f))
  40. (let ((n (bytevector-length bytevector)))
  41. (call-with-values
  42. (lambda ()
  43. (cond
  44. (endianness-mandatory? (values endness 0))
  45. ((maybe-utf16-bom bytevector n)
  46. => (lambda (endness)
  47. (values endness 2)))
  48. (else (values endness 0))))
  49. (lambda (endness start)
  50. (let ((text-codec (case endness
  51. ((big) utf-16be-codec)
  52. ((little) utf-16le-codec)
  53. (else
  54. (endianness-violation 'utf16->string endness))))
  55. (conv-len (- n start)))
  56. (if (not (zero? (remainder n 2)))
  57. (assertion-violation 'utf16->string "Bytevector has bad length." bytevector))
  58. (enc:bytes->string-n text-codec bytevector start conv-len (replacement-character))))))))
  59. ; There is no utf-32-codec, so we can't use textual i/o for this.
  60. (define string->utf32
  61. (opt-lambda (string (endness #f))
  62. (let ((text-codec (case endness
  63. ((#f big) utf-32be-codec)
  64. ((little) utf-32le-codec)
  65. (else (endianness-violation 'string->utf32 endness)))))
  66. (enc:string->bytes text-codec string))))
  67. (define (maybe-utf32-bom bytevector n)
  68. (and (<= 4 n)
  69. (let ((b0 (bytevector-u8-ref bytevector 0))
  70. (b1 (bytevector-u8-ref bytevector 1))
  71. (b2 (bytevector-u8-ref bytevector 2))
  72. (b3 (bytevector-u8-ref bytevector 3)))
  73. (or (and (= b0 0) (= b1 0) (= b2 #xfe) (= b3 #xff)
  74. (endianness big))
  75. (and (= b0 #xff) (= b1 #xfe) (= b2 0) (= b3 0)
  76. (endianness little))))))
  77. (define utf32->string
  78. (opt-lambda (bytevector endness (endianness-mandatory? #f))
  79. (let ((n (bytevector-length bytevector)))
  80. (call-with-values
  81. (lambda ()
  82. (cond
  83. (endianness-mandatory? (values endness 0))
  84. ((maybe-utf32-bom bytevector n)
  85. => (lambda (endness)
  86. (values endness 4)))
  87. (else (values endness 0))))
  88. (lambda (endness start)
  89. (let ((text-codec (case endness
  90. ((big) utf-32be-codec)
  91. ((little) utf-32le-codec)
  92. (else
  93. (endianness-violation 'utf32->string endness))))
  94. (conv-len (- n start)))
  95. (if (not (zero? (remainder n 4)))
  96. (assertion-violation 'utf32->string "Bytevector has bad length." bytevector))
  97. (enc:bytes->string-n text-codec bytevector start conv-len (replacement-character))))))))
  98. (define (endianness-violation who what)
  99. (assertion-violation who "bad endianness" what))