bytevector-string.scm 3.8 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112
  1. ; Copyright (c) 1993-2008 by Richard Kelsey and Jonathan Rees. See file COPYING.
  2. (define (string->utf8 string)
  3. (enc:string->utf-8 string))
  4. ; If the bytevector begins with the three-byte sequence #xef #xbb
  5. ; #xbf, then those bytes are ignored. (They are conventionally used
  6. ; as a signature to indicate UTF-8 encoding. The string->utf8
  7. ; procedure does not emit those bytes, but UTF-8 encodings produced by
  8. ; other sources may contain them.)
  9. (define (replacement-character)
  10. (integer->char #xfffd))
  11. (define (begins-with-utf8-bom? bv)
  12. (let* ((n (bytevector-length bv)))
  13. (and (<= 3 n)
  14. (= #xef (bytevector-u8-ref bv 0))
  15. (= #xbb (bytevector-u8-ref bv 1))
  16. (= #xbf (bytevector-u8-ref bv 2)))))
  17. (define (utf8->string bv)
  18. (if (begins-with-utf8-bom? bv)
  19. (let ((start 3)
  20. (count (- (bytevector-length bv) 3)))
  21. (enc:utf-8->string-n bv start count (replacement-character)))
  22. (enc:utf-8->string bv (replacement-character))))
  23. (define string->utf16
  24. (opt-lambda (string (endness #f))
  25. (let ((text-codec
  26. (case endness
  27. ((#f big) utf-16be-codec)
  28. ((little) utf-16le-codec)
  29. (else (endianness-violation 'string->utf16 endness)))))
  30. (enc:string->bytes text-codec string))))
  31. (define (maybe-utf16-bom bytevector n)
  32. (and (<= 2 n)
  33. (let ((b0 (bytevector-u8-ref bytevector 0))
  34. (b1 (bytevector-u8-ref bytevector 1)))
  35. (or (and (= b0 #xfe) (= b1 #xff) (endianness big))
  36. (and (= b0 #xff) (= b1 #xfe) (endianness little))))))
  37. (define utf16->string
  38. (opt-lambda (bytevector endness (endianness-mandatory? #f))
  39. (let ((n (bytevector-length bytevector)))
  40. (call-with-values
  41. (lambda ()
  42. (cond
  43. (endianness-mandatory? (values endness 0))
  44. ((maybe-utf16-bom bytevector n)
  45. => (lambda (endness)
  46. (values endness 2)))
  47. (else (values endness 0))))
  48. (lambda (endness start)
  49. (let ((text-codec (case endness
  50. ((big) utf-16be-codec)
  51. ((little) utf-16le-codec)
  52. (else
  53. (endianness-violation 'utf16->string endness))))
  54. (conv-len (- n start)))
  55. (if (not (zero? (remainder n 2)))
  56. (assertion-violation 'utf16->string "Bytevector has bad length." bytevector))
  57. (enc:bytes->string-n text-codec bytevector start conv-len (replacement-character))))))))
  58. ; There is no utf-32-codec, so we can't use textual i/o for this.
  59. (define string->utf32
  60. (opt-lambda (string (endness #f))
  61. (let ((text-codec (case endness
  62. ((#f big) utf-32be-codec)
  63. ((little) utf-32le-codec)
  64. (else (endianness-violation 'string->utf32 endness)))))
  65. (enc:string->bytes text-codec string))))
  66. (define (maybe-utf32-bom bytevector n)
  67. (and (<= 4 n)
  68. (let ((b0 (bytevector-u8-ref bytevector 0))
  69. (b1 (bytevector-u8-ref bytevector 1))
  70. (b2 (bytevector-u8-ref bytevector 2))
  71. (b3 (bytevector-u8-ref bytevector 3)))
  72. (or (and (= b0 0) (= b1 0) (= b2 #xfe) (= b3 #xff)
  73. (endianness big))
  74. (and (= b0 #xff) (= b1 #xfe) (= b2 0) (= b3 0)
  75. (endianness little))))))
  76. (define utf32->string
  77. (opt-lambda (bytevector endness (endianness-mandatory? #f))
  78. (let ((n (bytevector-length bytevector)))
  79. (call-with-values
  80. (lambda ()
  81. (cond
  82. (endianness-mandatory? (values endness 0))
  83. ((maybe-utf32-bom bytevector n)
  84. => (lambda (endness)
  85. (values endness 4)))
  86. (else (values endness 0))))
  87. (lambda (endness start)
  88. (let ((text-codec (case endness
  89. ((big) utf-32be-codec)
  90. ((little) utf-32le-codec)
  91. (else
  92. (endianness-violation 'utf32->string endness))))
  93. (conv-len (- n start)))
  94. (if (not (zero? (remainder n 4)))
  95. (assertion-violation 'utf32->string "Bytevector has bad length." bytevector))
  96. (enc:bytes->string-n text-codec bytevector start conv-len (replacement-character))))))))
  97. (define (endianness-violation who what)
  98. (assertion-violation who "bad endianness" what))