strings.c 58 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702703704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921922923924925926927928929930931932933934935936937938939940941942943944945946947948949950951952953954955956957958959960961962963964965966967968969970971972973974975976977978979980981982983984985986987988989990991992993994995996997998999100010011002100310041005100610071008100910101011101210131014101510161017101810191020102110221023102410251026102710281029103010311032103310341035103610371038103910401041104210431044104510461047104810491050105110521053105410551056105710581059106010611062106310641065106610671068106910701071107210731074107510761077107810791080108110821083108410851086108710881089109010911092109310941095109610971098109911001101110211031104110511061107110811091110111111121113111411151116111711181119112011211122112311241125112611271128112911301131113211331134113511361137113811391140114111421143114411451146114711481149115011511152115311541155115611571158115911601161116211631164116511661167116811691170117111721173117411751176117711781179118011811182118311841185118611871188118911901191119211931194119511961197119811991200120112021203120412051206120712081209121012111212121312141215121612171218121912201221122212231224122512261227122812291230123112321233123412351236123712381239124012411242124312441245124612471248124912501251125212531254125512561257125812591260126112621263126412651266126712681269127012711272127312741275127612771278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578157915801581158215831584158515861587158815891590159115921593159415951596159715981599160016011602160316041605160616071608160916101611161216131614161516161617161816191620162116221623162416251626162716281629163016311632163316341635163616371638163916401641164216431644164516461647164816491650165116521653165416551656165716581659166016611662166316641665166616671668166916701671167216731674167516761677167816791680168116821683168416851686168716881689169016911692169316941695169616971698169917001701170217031704170517061707170817091710171117121713171417151716171717181719172017211722172317241725172617271728172917301731173217331734173517361737173817391740174117421743174417451746174717481749175017511752175317541755175617571758175917601761176217631764176517661767176817691770177117721773177417751776177717781779178017811782178317841785178617871788178917901791179217931794179517961797179817991800180118021803180418051806180718081809181018111812181318141815181618171818181918201821182218231824182518261827182818291830183118321833183418351836183718381839184018411842184318441845184618471848184918501851185218531854185518561857185818591860186118621863186418651866186718681869187018711872187318741875187618771878187918801881188218831884188518861887188818891890189118921893189418951896189718981899190019011902190319041905190619071908190919101911191219131914191519161917191819191920192119221923192419251926192719281929193019311932193319341935193619371938193919401941194219431944194519461947194819491950195119521953195419551956195719581959196019611962196319641965196619671968196919701971197219731974197519761977197819791980198119821983198419851986198719881989199019911992199319941995199619971998199920002001200220032004200520062007200820092010201120122013201420152016201720182019202020212022202320242025202620272028202920302031203220332034203520362037203820392040204120422043204420452046204720482049205020512052205320542055205620572058205920602061206220632064206520662067206820692070207120722073207420752076207720782079208020812082208320842085208620872088208920902091209220932094209520962097209820992100210121022103210421052106210721082109211021112112211321142115211621172118211921202121212221232124212521262127212821292130213121322133213421352136213721382139214021412142214321442145214621472148214921502151215221532154215521562157215821592160216121622163216421652166216721682169
  1. /* Copyright (C) 1995,1996,1998,2000,2001, 2004, 2006, 2008, 2009, 2010, 2011 Free Software Foundation, Inc.
  2. *
  3. * This library is free software; you can redistribute it and/or
  4. * modify it under the terms of the GNU Lesser General Public License
  5. * as published by the Free Software Foundation; either version 3 of
  6. * the License, or (at your option) any later version.
  7. *
  8. * This library is distributed in the hope that it will be useful, but
  9. * WITHOUT ANY WARRANTY; without even the implied warranty of
  10. * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
  11. * Lesser General Public License for more details.
  12. *
  13. * You should have received a copy of the GNU Lesser General Public
  14. * License along with this library; if not, write to the Free Software
  15. * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
  16. * 02110-1301 USA
  17. */
  18. #ifdef HAVE_CONFIG_H
  19. # include <config.h>
  20. #endif
  21. #include <alloca.h>
  22. #include <string.h>
  23. #include <stdio.h>
  24. #include <ctype.h>
  25. #include <uninorm.h>
  26. #include <unistr.h>
  27. #include <uniconv.h>
  28. #include "striconveh.h"
  29. #include "libguile/_scm.h"
  30. #include "libguile/chars.h"
  31. #include "libguile/root.h"
  32. #include "libguile/strings.h"
  33. #include "libguile/error.h"
  34. #include "libguile/generalized-vectors.h"
  35. #include "libguile/deprecation.h"
  36. #include "libguile/validate.h"
  37. #include "libguile/private-options.h"
  38. /* {Strings}
  39. */
  40. /* Stringbufs
  41. *
  42. * XXX - keeping an accurate refcount during GC seems to be quite
  43. * tricky, so we just keep score of whether a stringbuf might be
  44. * shared, not whether it definitely is.
  45. *
  46. * The scheme I (mvo) tried to keep an accurate reference count would
  47. * recount all strings that point to a stringbuf during the mark-phase
  48. * of the GC. This was done since one cannot access the stringbuf of
  49. * a string when that string is freed (in order to decrease the
  50. * reference count). The memory of the stringbuf might have been
  51. * reused already for something completely different.
  52. *
  53. * This recounted worked for a small number of threads beating on
  54. * cow-strings, but it failed randomly with more than 10 threads, say.
  55. * I couldn't figure out what went wrong, so I used the conservative
  56. * approach implemented below.
  57. *
  58. * There are 2 storage strategies for stringbufs: 8-bit and wide. 8-bit
  59. * strings are ISO-8859-1-encoded strings; wide strings are 32-bit (UCS-4)
  60. * strings.
  61. */
  62. /* The size in words of the stringbuf header (type tag + size). */
  63. #define STRINGBUF_HEADER_SIZE 2U
  64. #define STRINGBUF_HEADER_BYTES (STRINGBUF_HEADER_SIZE * sizeof (SCM))
  65. #define STRINGBUF_F_SHARED SCM_I_STRINGBUF_F_SHARED
  66. #define STRINGBUF_F_WIDE SCM_I_STRINGBUF_F_WIDE
  67. #define STRINGBUF_TAG scm_tc7_stringbuf
  68. #define STRINGBUF_SHARED(buf) (SCM_CELL_WORD_0(buf) & STRINGBUF_F_SHARED)
  69. #define STRINGBUF_WIDE(buf) (SCM_CELL_WORD_0(buf) & STRINGBUF_F_WIDE)
  70. #define STRINGBUF_CONTENTS(buf) ((void *) \
  71. SCM_CELL_OBJECT_LOC (buf, \
  72. STRINGBUF_HEADER_SIZE))
  73. #define STRINGBUF_CHARS(buf) ((unsigned char *) STRINGBUF_CONTENTS (buf))
  74. #define STRINGBUF_WIDE_CHARS(buf) ((scm_t_wchar *) STRINGBUF_CONTENTS (buf))
  75. #define STRINGBUF_LENGTH(buf) (SCM_CELL_WORD_1 (buf))
  76. #define SET_STRINGBUF_SHARED(buf) \
  77. do \
  78. { \
  79. /* Don't modify BUF if it's already marked as shared since it might be \
  80. a read-only, statically allocated stringbuf. */ \
  81. if (SCM_LIKELY (!STRINGBUF_SHARED (buf))) \
  82. SCM_SET_CELL_WORD_0 ((buf), SCM_CELL_WORD_0 (buf) | STRINGBUF_F_SHARED); \
  83. } \
  84. while (0)
  85. #ifdef SCM_STRING_LENGTH_HISTOGRAM
  86. static size_t lenhist[1001];
  87. #endif
  88. /* Make a stringbuf with space for LEN 8-bit Latin-1-encoded
  89. characters. */
  90. static SCM
  91. make_stringbuf (size_t len)
  92. {
  93. /* XXX - for the benefit of SCM_STRING_CHARS, SCM_SYMBOL_CHARS and
  94. scm_i_symbol_chars, all stringbufs are null-terminated. Once
  95. SCM_STRING_CHARS and SCM_SYMBOL_CHARS are removed and the code
  96. has been changed for scm_i_symbol_chars, this null-termination
  97. can be dropped.
  98. */
  99. SCM buf;
  100. #ifdef SCM_STRING_LENGTH_HISTOGRAM
  101. if (len < 1000)
  102. lenhist[len]++;
  103. else
  104. lenhist[1000]++;
  105. #endif
  106. buf = PTR2SCM (scm_gc_malloc_pointerless (STRINGBUF_HEADER_BYTES + len + 1,
  107. "string"));
  108. SCM_SET_CELL_TYPE (buf, STRINGBUF_TAG);
  109. SCM_SET_CELL_WORD_1 (buf, (scm_t_bits) len);
  110. STRINGBUF_CHARS (buf)[len] = 0;
  111. return buf;
  112. }
  113. /* Make a stringbuf with space for LEN 32-bit UCS-4-encoded
  114. characters. */
  115. static SCM
  116. make_wide_stringbuf (size_t len)
  117. {
  118. SCM buf;
  119. size_t raw_len;
  120. #ifdef SCM_STRING_LENGTH_HISTOGRAM
  121. if (len < 1000)
  122. lenhist[len]++;
  123. else
  124. lenhist[1000]++;
  125. #endif
  126. raw_len = (len + 1) * sizeof (scm_t_wchar);
  127. buf = PTR2SCM (scm_gc_malloc_pointerless (STRINGBUF_HEADER_BYTES + raw_len,
  128. "string"));
  129. SCM_SET_CELL_TYPE (buf, STRINGBUF_TAG | STRINGBUF_F_WIDE);
  130. SCM_SET_CELL_WORD_1 (buf, (scm_t_bits) len);
  131. STRINGBUF_WIDE_CHARS (buf)[len] = 0;
  132. return buf;
  133. }
  134. /* Return a UCS-4-encoded stringbuf containing the (possibly Latin-1-encoded)
  135. characters from BUF. */
  136. static SCM
  137. wide_stringbuf (SCM buf)
  138. {
  139. SCM new_buf;
  140. if (STRINGBUF_WIDE (buf))
  141. new_buf = buf;
  142. else
  143. {
  144. size_t i, len;
  145. scm_t_wchar *mem;
  146. len = STRINGBUF_LENGTH (buf);
  147. new_buf = make_wide_stringbuf (len);
  148. mem = STRINGBUF_WIDE_CHARS (new_buf);
  149. for (i = 0; i < len; i++)
  150. mem[i] = (scm_t_wchar) STRINGBUF_CHARS (buf)[i];
  151. mem[len] = 0;
  152. }
  153. return new_buf;
  154. }
  155. /* Return a Latin-1-encoded stringbuf containing the (possibly UCS-4-encoded)
  156. characters from BUF, if possible. */
  157. static SCM
  158. narrow_stringbuf (SCM buf)
  159. {
  160. SCM new_buf;
  161. if (!STRINGBUF_WIDE (buf))
  162. new_buf = buf;
  163. else
  164. {
  165. size_t i, len;
  166. scm_t_wchar *wmem;
  167. unsigned char *mem;
  168. len = STRINGBUF_LENGTH (buf);
  169. wmem = STRINGBUF_WIDE_CHARS (buf);
  170. for (i = 0; i < len; i++)
  171. if (wmem[i] > 0xFF)
  172. /* BUF cannot be narrowed. */
  173. return buf;
  174. new_buf = make_stringbuf (len);
  175. mem = STRINGBUF_CHARS (new_buf);
  176. for (i = 0; i < len; i++)
  177. mem[i] = (unsigned char) wmem[i];
  178. mem[len] = 0;
  179. }
  180. return new_buf;
  181. }
  182. scm_i_pthread_mutex_t stringbuf_write_mutex = SCM_I_PTHREAD_MUTEX_INITIALIZER;
  183. /* Copy-on-write strings.
  184. */
  185. #define STRING_TAG scm_tc7_string
  186. #define STRING_STRINGBUF(str) (SCM_CELL_OBJECT_1(str))
  187. #define STRING_START(str) ((size_t)SCM_CELL_WORD_2(str))
  188. #define STRING_LENGTH(str) ((size_t)SCM_CELL_WORD_3(str))
  189. #define SET_STRING_STRINGBUF(str,buf) (SCM_SET_CELL_OBJECT_1(str,buf))
  190. #define SET_STRING_START(str,start) (SCM_SET_CELL_WORD_2(str,start))
  191. #define IS_STRING(str) (SCM_NIMP(str) && SCM_TYP7(str) == STRING_TAG)
  192. /* Read-only strings.
  193. */
  194. #define RO_STRING_TAG scm_tc7_ro_string
  195. #define IS_RO_STRING(str) (SCM_CELL_TYPE(str)==RO_STRING_TAG)
  196. /* Mutation-sharing substrings
  197. */
  198. #define SH_STRING_TAG (scm_tc7_string + 0x100)
  199. #define SH_STRING_STRING(sh) (SCM_CELL_OBJECT_1(sh))
  200. /* START and LENGTH as for STRINGs. */
  201. #define IS_SH_STRING(str) (SCM_CELL_TYPE(str)==SH_STRING_TAG)
  202. SCM scm_nullstr;
  203. /* Create a scheme string with space for LEN 8-bit Latin-1-encoded
  204. characters. CHARSP, if not NULL, will be set to location of the
  205. char array. If READ_ONLY_P, the returned string is read-only;
  206. otherwise it is writable. */
  207. SCM
  208. scm_i_make_string (size_t len, char **charsp, int read_only_p)
  209. {
  210. SCM buf = make_stringbuf (len);
  211. SCM res;
  212. if (charsp)
  213. *charsp = (char *) STRINGBUF_CHARS (buf);
  214. res = scm_double_cell (read_only_p ? RO_STRING_TAG : STRING_TAG,
  215. SCM_UNPACK (buf),
  216. (scm_t_bits) 0, (scm_t_bits) len);
  217. return res;
  218. }
  219. /* Create a scheme string with space for LEN 32-bit UCS-4-encoded
  220. characters. CHARSP, if not NULL, will be set to location of the
  221. character array. If READ_ONLY_P, the returned string is read-only;
  222. otherwise it is writable. */
  223. SCM
  224. scm_i_make_wide_string (size_t len, scm_t_wchar **charsp, int read_only_p)
  225. {
  226. SCM buf = make_wide_stringbuf (len);
  227. SCM res;
  228. if (charsp)
  229. *charsp = STRINGBUF_WIDE_CHARS (buf);
  230. res = scm_double_cell (read_only_p ? RO_STRING_TAG : STRING_TAG,
  231. SCM_UNPACK (buf),
  232. (scm_t_bits) 0, (scm_t_bits) len);
  233. return res;
  234. }
  235. static void
  236. validate_substring_args (SCM str, size_t start, size_t end)
  237. {
  238. if (!IS_STRING (str))
  239. scm_wrong_type_arg_msg (NULL, 0, str, "string");
  240. if (start > STRING_LENGTH (str))
  241. scm_out_of_range (NULL, scm_from_size_t (start));
  242. if (end > STRING_LENGTH (str) || end < start)
  243. scm_out_of_range (NULL, scm_from_size_t (end));
  244. }
  245. static inline void
  246. get_str_buf_start (SCM *str, SCM *buf, size_t *start)
  247. {
  248. *start = STRING_START (*str);
  249. if (IS_SH_STRING (*str))
  250. {
  251. *str = SH_STRING_STRING (*str);
  252. *start += STRING_START (*str);
  253. }
  254. *buf = STRING_STRINGBUF (*str);
  255. }
  256. SCM
  257. scm_i_substring (SCM str, size_t start, size_t end)
  258. {
  259. SCM buf;
  260. size_t str_start;
  261. get_str_buf_start (&str, &buf, &str_start);
  262. scm_i_pthread_mutex_lock (&stringbuf_write_mutex);
  263. SET_STRINGBUF_SHARED (buf);
  264. scm_i_pthread_mutex_unlock (&stringbuf_write_mutex);
  265. return scm_double_cell (STRING_TAG, SCM_UNPACK(buf),
  266. (scm_t_bits)str_start + start,
  267. (scm_t_bits) end - start);
  268. }
  269. SCM
  270. scm_i_substring_read_only (SCM str, size_t start, size_t end)
  271. {
  272. SCM buf;
  273. size_t str_start;
  274. get_str_buf_start (&str, &buf, &str_start);
  275. scm_i_pthread_mutex_lock (&stringbuf_write_mutex);
  276. SET_STRINGBUF_SHARED (buf);
  277. scm_i_pthread_mutex_unlock (&stringbuf_write_mutex);
  278. return scm_double_cell (RO_STRING_TAG, SCM_UNPACK(buf),
  279. (scm_t_bits)str_start + start,
  280. (scm_t_bits) end - start);
  281. }
  282. SCM
  283. scm_i_substring_copy (SCM str, size_t start, size_t end)
  284. {
  285. size_t len = end - start;
  286. SCM buf, my_buf;
  287. size_t str_start;
  288. get_str_buf_start (&str, &buf, &str_start);
  289. if (scm_i_is_narrow_string (str))
  290. {
  291. my_buf = make_stringbuf (len);
  292. memcpy (STRINGBUF_CHARS (my_buf),
  293. STRINGBUF_CHARS (buf) + str_start + start, len);
  294. }
  295. else
  296. {
  297. my_buf = make_wide_stringbuf (len);
  298. u32_cpy ((scm_t_uint32 *) STRINGBUF_WIDE_CHARS (my_buf),
  299. (scm_t_uint32 *) (STRINGBUF_WIDE_CHARS (buf) + str_start
  300. + start), len);
  301. /* Even though this string is wide, the substring may be narrow.
  302. Consider adding code to narrow the string. */
  303. }
  304. scm_remember_upto_here_1 (buf);
  305. return scm_double_cell (STRING_TAG, SCM_UNPACK (my_buf),
  306. (scm_t_bits) 0, (scm_t_bits) len);
  307. }
  308. SCM
  309. scm_i_substring_shared (SCM str, size_t start, size_t end)
  310. {
  311. if (start == 0 && end == STRING_LENGTH (str))
  312. return str;
  313. else
  314. {
  315. size_t len = end - start;
  316. if (IS_SH_STRING (str))
  317. {
  318. start += STRING_START (str);
  319. str = SH_STRING_STRING (str);
  320. }
  321. return scm_double_cell (SH_STRING_TAG, SCM_UNPACK(str),
  322. (scm_t_bits)start, (scm_t_bits) len);
  323. }
  324. }
  325. SCM
  326. scm_c_substring (SCM str, size_t start, size_t end)
  327. {
  328. validate_substring_args (str, start, end);
  329. return scm_i_substring (str, start, end);
  330. }
  331. SCM
  332. scm_c_substring_read_only (SCM str, size_t start, size_t end)
  333. {
  334. validate_substring_args (str, start, end);
  335. return scm_i_substring_read_only (str, start, end);
  336. }
  337. SCM
  338. scm_c_substring_copy (SCM str, size_t start, size_t end)
  339. {
  340. validate_substring_args (str, start, end);
  341. return scm_i_substring_copy (str, start, end);
  342. }
  343. SCM
  344. scm_c_substring_shared (SCM str, size_t start, size_t end)
  345. {
  346. validate_substring_args (str, start, end);
  347. return scm_i_substring_shared (str, start, end);
  348. }
  349. /* Internal accessors
  350. */
  351. /* Returns the number of characters in STR. This may be different
  352. than the memory size of the string storage. */
  353. size_t
  354. scm_i_string_length (SCM str)
  355. {
  356. return STRING_LENGTH (str);
  357. }
  358. /* True if the string is 'narrow', meaning it has a 8-bit Latin-1
  359. encoding. False if it is 'wide', having a 32-bit UCS-4
  360. encoding. */
  361. int
  362. scm_i_is_narrow_string (SCM str)
  363. {
  364. return !STRINGBUF_WIDE (STRING_STRINGBUF (str));
  365. }
  366. /* Try to coerce a string to be narrow. It if is narrow already, do
  367. nothing. If it is wide, shrink it to narrow if none of its
  368. characters are above 0xFF. Return true if the string is narrow or
  369. was made to be narrow. */
  370. int
  371. scm_i_try_narrow_string (SCM str)
  372. {
  373. SET_STRING_STRINGBUF (str, narrow_stringbuf (STRING_STRINGBUF (str)));
  374. return scm_i_is_narrow_string (str);
  375. }
  376. /* Return a pointer to the raw data of the string, which can be either Latin-1
  377. or UCS-4 encoded data, depending on `scm_i_is_narrow_string (STR)'. */
  378. const void *
  379. scm_i_string_data (SCM str)
  380. {
  381. SCM buf;
  382. size_t start;
  383. const char *data;
  384. get_str_buf_start (&str, &buf, &start);
  385. data = STRINGBUF_CONTENTS (buf);
  386. data += start * (scm_i_is_narrow_string (str) ? 1 : 4);
  387. return data;
  388. }
  389. /* Returns a pointer to the 8-bit Latin-1 encoded character array of
  390. STR. */
  391. const char *
  392. scm_i_string_chars (SCM str)
  393. {
  394. SCM buf;
  395. size_t start;
  396. get_str_buf_start (&str, &buf, &start);
  397. if (scm_i_is_narrow_string (str))
  398. return (const char *) STRINGBUF_CHARS (buf) + start;
  399. else
  400. scm_misc_error (NULL, "Invalid read access of chars of wide string: ~s",
  401. scm_list_1 (str));
  402. return NULL;
  403. }
  404. /* Returns a pointer to the 32-bit UCS-4 encoded character array of
  405. STR. */
  406. const scm_t_wchar *
  407. scm_i_string_wide_chars (SCM str)
  408. {
  409. SCM buf;
  410. size_t start;
  411. get_str_buf_start (&str, &buf, &start);
  412. if (!scm_i_is_narrow_string (str))
  413. return (const scm_t_wchar *) STRINGBUF_WIDE_CHARS (buf) + start;
  414. else
  415. scm_misc_error (NULL, "Invalid read access of chars of narrow string: ~s",
  416. scm_list_1 (str));
  417. }
  418. /* If the buffer in ORIG_STR is shared, copy ORIG_STR's characters to
  419. a new string buffer, so that it can be modified without modifying
  420. other strings. Also, lock the string mutex. Later, one must call
  421. scm_i_string_stop_writing to unlock the mutex. */
  422. SCM
  423. scm_i_string_start_writing (SCM orig_str)
  424. {
  425. SCM buf, str = orig_str;
  426. size_t start;
  427. get_str_buf_start (&str, &buf, &start);
  428. if (IS_RO_STRING (str))
  429. scm_misc_error (NULL, "string is read-only: ~s", scm_list_1 (orig_str));
  430. scm_i_pthread_mutex_lock (&stringbuf_write_mutex);
  431. if (STRINGBUF_SHARED (buf))
  432. {
  433. /* Clone the stringbuf. */
  434. size_t len = STRING_LENGTH (str);
  435. SCM new_buf;
  436. scm_i_pthread_mutex_unlock (&stringbuf_write_mutex);
  437. if (scm_i_is_narrow_string (str))
  438. {
  439. new_buf = make_stringbuf (len);
  440. memcpy (STRINGBUF_CHARS (new_buf),
  441. STRINGBUF_CHARS (buf) + STRING_START (str), len);
  442. }
  443. else
  444. {
  445. new_buf = make_wide_stringbuf (len);
  446. u32_cpy ((scm_t_uint32 *) STRINGBUF_WIDE_CHARS (new_buf),
  447. (scm_t_uint32 *) (STRINGBUF_WIDE_CHARS (buf)
  448. + STRING_START (str)), len);
  449. }
  450. SET_STRING_STRINGBUF (str, new_buf);
  451. start -= STRING_START (str);
  452. /* FIXME: The following operations are not atomic, so other threads
  453. looking at STR may see an inconsistent state. Nevertheless it can't
  454. hurt much since (i) accessing STR while it is being mutated can't
  455. yield a crash, and (ii) concurrent accesses to STR should be
  456. protected by a mutex at the application level. The latter may not
  457. apply when STR != ORIG_STR, though. */
  458. SET_STRING_START (str, 0);
  459. SET_STRING_STRINGBUF (str, new_buf);
  460. buf = new_buf;
  461. scm_i_pthread_mutex_lock (&stringbuf_write_mutex);
  462. }
  463. return orig_str;
  464. }
  465. /* Return a pointer to the 8-bit Latin-1 chars of a string. */
  466. char *
  467. scm_i_string_writable_chars (SCM str)
  468. {
  469. SCM buf;
  470. size_t start;
  471. get_str_buf_start (&str, &buf, &start);
  472. if (scm_i_is_narrow_string (str))
  473. return (char *) STRINGBUF_CHARS (buf) + start;
  474. else
  475. scm_misc_error (NULL, "Invalid write access of chars of wide string: ~s",
  476. scm_list_1 (str));
  477. return NULL;
  478. }
  479. /* Return a pointer to the UCS-4 codepoints of a string. */
  480. static scm_t_wchar *
  481. scm_i_string_writable_wide_chars (SCM str)
  482. {
  483. SCM buf;
  484. size_t start;
  485. get_str_buf_start (&str, &buf, &start);
  486. if (!scm_i_is_narrow_string (str))
  487. return STRINGBUF_WIDE_CHARS (buf) + start;
  488. else
  489. scm_misc_error (NULL, "Invalid write access of chars of narrow string: ~s",
  490. scm_list_1 (str));
  491. }
  492. /* Unlock the string mutex that was locked when
  493. scm_i_string_start_writing was called. */
  494. void
  495. scm_i_string_stop_writing (void)
  496. {
  497. scm_i_pthread_mutex_unlock (&stringbuf_write_mutex);
  498. }
  499. /* Return the Xth character of STR as a UCS-4 codepoint. */
  500. scm_t_wchar
  501. scm_i_string_ref (SCM str, size_t x)
  502. {
  503. if (scm_i_is_narrow_string (str))
  504. return (scm_t_wchar) (unsigned char) (scm_i_string_chars (str)[x]);
  505. else
  506. return scm_i_string_wide_chars (str)[x];
  507. }
  508. /* Returns index+1 of the first char in STR that matches C, or
  509. 0 if the char is not found. */
  510. int
  511. scm_i_string_contains_char (SCM str, char ch)
  512. {
  513. size_t i;
  514. size_t len = scm_i_string_length (str);
  515. i = 0;
  516. if (scm_i_is_narrow_string (str))
  517. {
  518. while (i < len)
  519. {
  520. if (scm_i_string_chars (str)[i] == ch)
  521. return i+1;
  522. i++;
  523. }
  524. }
  525. else
  526. {
  527. while (i < len)
  528. {
  529. if (scm_i_string_wide_chars (str)[i]
  530. == (unsigned char) ch)
  531. return i+1;
  532. i++;
  533. }
  534. }
  535. return 0;
  536. }
  537. int
  538. scm_i_string_strcmp (SCM sstr, size_t start_x, const char *cstr)
  539. {
  540. if (scm_i_is_narrow_string (sstr))
  541. {
  542. const char *a = scm_i_string_chars (sstr) + start_x;
  543. const char *b = cstr;
  544. return strncmp (a, b, strlen(b));
  545. }
  546. else
  547. {
  548. size_t i;
  549. const scm_t_wchar *a = scm_i_string_wide_chars (sstr) + start_x;
  550. const char *b = cstr;
  551. for (i = 0; i < strlen (b); i++)
  552. {
  553. if (a[i] != (unsigned char) b[i])
  554. return 1;
  555. }
  556. }
  557. return 0;
  558. }
  559. /* Set the Pth character of STR to UCS-4 codepoint CHR. */
  560. void
  561. scm_i_string_set_x (SCM str, size_t p, scm_t_wchar chr)
  562. {
  563. if (chr > 0xFF && scm_i_is_narrow_string (str))
  564. SET_STRING_STRINGBUF (str, wide_stringbuf (STRING_STRINGBUF (str)));
  565. if (scm_i_is_narrow_string (str))
  566. {
  567. char *dst = scm_i_string_writable_chars (str);
  568. dst[p] = chr;
  569. }
  570. else
  571. {
  572. scm_t_wchar *dst = scm_i_string_writable_wide_chars (str);
  573. dst[p] = chr;
  574. }
  575. }
  576. /* Symbols.
  577. Basic symbol creation and accessing is done here, the rest is in
  578. symbols.[hc]. This has been done to keep stringbufs and the
  579. internals of strings and string-like objects confined to this file.
  580. */
  581. #define SYMBOL_STRINGBUF SCM_CELL_OBJECT_1
  582. SCM
  583. scm_i_make_symbol (SCM name, scm_t_bits flags,
  584. unsigned long hash, SCM props)
  585. {
  586. SCM buf;
  587. size_t start = STRING_START (name);
  588. size_t length = STRING_LENGTH (name);
  589. if (IS_SH_STRING (name))
  590. {
  591. name = SH_STRING_STRING (name);
  592. start += STRING_START (name);
  593. }
  594. buf = SYMBOL_STRINGBUF (name);
  595. if (start == 0 && length == STRINGBUF_LENGTH (buf))
  596. {
  597. /* reuse buf. */
  598. scm_i_pthread_mutex_lock (&stringbuf_write_mutex);
  599. SET_STRINGBUF_SHARED (buf);
  600. scm_i_pthread_mutex_unlock (&stringbuf_write_mutex);
  601. }
  602. else
  603. {
  604. /* make new buf. */
  605. if (scm_i_is_narrow_string (name))
  606. {
  607. SCM new_buf = make_stringbuf (length);
  608. memcpy (STRINGBUF_CHARS (new_buf),
  609. STRINGBUF_CHARS (buf) + start, length);
  610. buf = new_buf;
  611. }
  612. else
  613. {
  614. SCM new_buf = make_wide_stringbuf (length);
  615. u32_cpy ((scm_t_uint32 *) STRINGBUF_WIDE_CHARS (new_buf),
  616. (scm_t_uint32 *) STRINGBUF_WIDE_CHARS (buf) + start,
  617. length);
  618. buf = new_buf;
  619. }
  620. }
  621. return scm_double_cell (scm_tc7_symbol | flags, SCM_UNPACK (buf),
  622. (scm_t_bits) hash, SCM_UNPACK (props));
  623. }
  624. SCM
  625. scm_i_c_make_symbol (const char *name, size_t len,
  626. scm_t_bits flags, unsigned long hash, SCM props)
  627. {
  628. SCM buf = make_stringbuf (len);
  629. memcpy (STRINGBUF_CHARS (buf), name, len);
  630. return scm_immutable_double_cell (scm_tc7_symbol | flags, SCM_UNPACK (buf),
  631. (scm_t_bits) hash, SCM_UNPACK (props));
  632. }
  633. /* Returns the number of characters in SYM. This may be different
  634. from the memory size of SYM. */
  635. size_t
  636. scm_i_symbol_length (SCM sym)
  637. {
  638. return STRINGBUF_LENGTH (SYMBOL_STRINGBUF (sym));
  639. }
  640. size_t
  641. scm_c_symbol_length (SCM sym)
  642. #define FUNC_NAME "scm_c_symbol_length"
  643. {
  644. SCM_VALIDATE_SYMBOL (1, sym);
  645. return STRINGBUF_LENGTH (SYMBOL_STRINGBUF (sym));
  646. }
  647. #undef FUNC_NAME
  648. /* True if the name of SYM is stored as a Latin-1 encoded string.
  649. False if it is stored as a 32-bit UCS-4-encoded string. */
  650. int
  651. scm_i_is_narrow_symbol (SCM sym)
  652. {
  653. SCM buf;
  654. buf = SYMBOL_STRINGBUF (sym);
  655. return !STRINGBUF_WIDE (buf);
  656. }
  657. /* Returns a pointer to the 8-bit Latin-1 encoded character array that
  658. contains the name of SYM. */
  659. const char *
  660. scm_i_symbol_chars (SCM sym)
  661. {
  662. SCM buf;
  663. buf = SYMBOL_STRINGBUF (sym);
  664. if (!STRINGBUF_WIDE (buf))
  665. return (const char *) STRINGBUF_CHARS (buf);
  666. else
  667. scm_misc_error (NULL, "Invalid access of chars of a wide symbol ~S",
  668. scm_list_1 (sym));
  669. }
  670. /* Return a pointer to the 32-bit UCS-4-encoded character array of a
  671. symbol's name. */
  672. const scm_t_wchar *
  673. scm_i_symbol_wide_chars (SCM sym)
  674. {
  675. SCM buf;
  676. buf = SYMBOL_STRINGBUF (sym);
  677. if (STRINGBUF_WIDE (buf))
  678. return (const scm_t_wchar *) STRINGBUF_WIDE_CHARS (buf);
  679. else
  680. scm_misc_error (NULL, "Invalid access of chars of a narrow symbol ~S",
  681. scm_list_1 (sym));
  682. }
  683. SCM
  684. scm_i_symbol_substring (SCM sym, size_t start, size_t end)
  685. {
  686. SCM buf = SYMBOL_STRINGBUF (sym);
  687. scm_i_pthread_mutex_lock (&stringbuf_write_mutex);
  688. SET_STRINGBUF_SHARED (buf);
  689. scm_i_pthread_mutex_unlock (&stringbuf_write_mutex);
  690. return scm_double_cell (RO_STRING_TAG, SCM_UNPACK (buf),
  691. (scm_t_bits)start, (scm_t_bits) end - start);
  692. }
  693. /* Returns the Xth character of symbol SYM as a UCS-4 codepoint. */
  694. scm_t_wchar
  695. scm_i_symbol_ref (SCM sym, size_t x)
  696. {
  697. if (scm_i_is_narrow_symbol (sym))
  698. return (scm_t_wchar) (unsigned char) (scm_i_symbol_chars (sym)[x]);
  699. else
  700. return scm_i_symbol_wide_chars (sym)[x];
  701. }
  702. /* Debugging
  703. */
  704. SCM_DEFINE (scm_sys_string_dump, "%string-dump", 1, 0, 0, (SCM str),
  705. "Returns an association list containing debugging information\n"
  706. "for @var{str}. The association list has the following entries."
  707. "@table @code\n"
  708. "@item string\n"
  709. "The string itself.\n"
  710. "@item start\n"
  711. "The start index of the string into its stringbuf\n"
  712. "@item length\n"
  713. "The length of the string\n"
  714. "@item shared\n"
  715. "If this string is a substring, it returns its parent string.\n"
  716. "Otherwise, it returns @code{#f}\n"
  717. "@item read-only\n"
  718. "@code{#t} if the string is read-only\n"
  719. "@item stringbuf-chars\n"
  720. "A new string containing this string's stringbuf's characters\n"
  721. "@item stringbuf-length\n"
  722. "The number of characters in this stringbuf\n"
  723. "@item stringbuf-shared\n"
  724. "@code{#t} if this stringbuf is shared\n"
  725. "@item stringbuf-wide\n"
  726. "@code{#t} if this stringbuf's characters are stored in a\n"
  727. "32-bit buffer, or @code{#f} if they are stored in an 8-bit\n"
  728. "buffer\n"
  729. "@end table")
  730. #define FUNC_NAME s_scm_sys_string_dump
  731. {
  732. SCM e1, e2, e3, e4, e5, e6, e7, e8, e9;
  733. SCM buf;
  734. SCM_VALIDATE_STRING (1, str);
  735. /* String info */
  736. e1 = scm_cons (scm_from_latin1_symbol ("string"),
  737. str);
  738. e2 = scm_cons (scm_from_latin1_symbol ("start"),
  739. scm_from_size_t (STRING_START (str)));
  740. e3 = scm_cons (scm_from_latin1_symbol ("length"),
  741. scm_from_size_t (STRING_LENGTH (str)));
  742. if (IS_SH_STRING (str))
  743. {
  744. e4 = scm_cons (scm_from_latin1_symbol ("shared"),
  745. SH_STRING_STRING (str));
  746. buf = STRING_STRINGBUF (SH_STRING_STRING (str));
  747. }
  748. else
  749. {
  750. e4 = scm_cons (scm_from_latin1_symbol ("shared"),
  751. SCM_BOOL_F);
  752. buf = STRING_STRINGBUF (str);
  753. }
  754. if (IS_RO_STRING (str))
  755. e5 = scm_cons (scm_from_latin1_symbol ("read-only"),
  756. SCM_BOOL_T);
  757. else
  758. e5 = scm_cons (scm_from_latin1_symbol ("read-only"),
  759. SCM_BOOL_F);
  760. /* Stringbuf info */
  761. if (!STRINGBUF_WIDE (buf))
  762. {
  763. size_t len = STRINGBUF_LENGTH (buf);
  764. char *cbuf;
  765. SCM sbc = scm_i_make_string (len, &cbuf, 0);
  766. memcpy (cbuf, STRINGBUF_CHARS (buf), len);
  767. e6 = scm_cons (scm_from_latin1_symbol ("stringbuf-chars"),
  768. sbc);
  769. }
  770. else
  771. {
  772. size_t len = STRINGBUF_LENGTH (buf);
  773. scm_t_wchar *cbuf;
  774. SCM sbc = scm_i_make_wide_string (len, &cbuf, 0);
  775. u32_cpy ((scm_t_uint32 *) cbuf,
  776. (scm_t_uint32 *) STRINGBUF_WIDE_CHARS (buf), len);
  777. e6 = scm_cons (scm_from_latin1_symbol ("stringbuf-chars"),
  778. sbc);
  779. }
  780. e7 = scm_cons (scm_from_latin1_symbol ("stringbuf-length"),
  781. scm_from_size_t (STRINGBUF_LENGTH (buf)));
  782. if (STRINGBUF_SHARED (buf))
  783. e8 = scm_cons (scm_from_latin1_symbol ("stringbuf-shared"),
  784. SCM_BOOL_T);
  785. else
  786. e8 = scm_cons (scm_from_latin1_symbol ("stringbuf-shared"),
  787. SCM_BOOL_F);
  788. if (STRINGBUF_WIDE (buf))
  789. e9 = scm_cons (scm_from_latin1_symbol ("stringbuf-wide"),
  790. SCM_BOOL_T);
  791. else
  792. e9 = scm_cons (scm_from_latin1_symbol ("stringbuf-wide"),
  793. SCM_BOOL_F);
  794. return scm_list_n (e1, e2, e3, e4, e5, e6, e7, e8, e9, SCM_UNDEFINED);
  795. }
  796. #undef FUNC_NAME
  797. SCM_DEFINE (scm_sys_symbol_dump, "%symbol-dump", 1, 0, 0, (SCM sym),
  798. "Returns an association list containing debugging information\n"
  799. "for @var{sym}. The association list has the following entries."
  800. "@table @code\n"
  801. "@item symbol\n"
  802. "The symbol itself\n"
  803. "@item hash\n"
  804. "Its hash value\n"
  805. "@item interned\n"
  806. "@code{#t} if it is an interned symbol\n"
  807. "@item stringbuf-chars\n"
  808. "A new string containing this symbols's stringbuf's characters\n"
  809. "@item stringbuf-length\n"
  810. "The number of characters in this stringbuf\n"
  811. "@item stringbuf-shared\n"
  812. "@code{#t} if this stringbuf is shared\n"
  813. "@item stringbuf-wide\n"
  814. "@code{#t} if this stringbuf's characters are stored in a\n"
  815. "32-bit buffer, or @code{#f} if they are stored in an 8-bit\n"
  816. "buffer\n"
  817. "@end table")
  818. #define FUNC_NAME s_scm_sys_symbol_dump
  819. {
  820. SCM e1, e2, e3, e4, e5, e6, e7;
  821. SCM buf;
  822. SCM_VALIDATE_SYMBOL (1, sym);
  823. e1 = scm_cons (scm_from_latin1_symbol ("symbol"),
  824. sym);
  825. e2 = scm_cons (scm_from_latin1_symbol ("hash"),
  826. scm_from_ulong (scm_i_symbol_hash (sym)));
  827. e3 = scm_cons (scm_from_latin1_symbol ("interned"),
  828. scm_symbol_interned_p (sym));
  829. buf = SYMBOL_STRINGBUF (sym);
  830. /* Stringbuf info */
  831. if (!STRINGBUF_WIDE (buf))
  832. {
  833. size_t len = STRINGBUF_LENGTH (buf);
  834. char *cbuf;
  835. SCM sbc = scm_i_make_string (len, &cbuf, 0);
  836. memcpy (cbuf, STRINGBUF_CHARS (buf), len);
  837. e4 = scm_cons (scm_from_latin1_symbol ("stringbuf-chars"),
  838. sbc);
  839. }
  840. else
  841. {
  842. size_t len = STRINGBUF_LENGTH (buf);
  843. scm_t_wchar *cbuf;
  844. SCM sbc = scm_i_make_wide_string (len, &cbuf, 0);
  845. u32_cpy ((scm_t_uint32 *) cbuf,
  846. (scm_t_uint32 *) STRINGBUF_WIDE_CHARS (buf), len);
  847. e4 = scm_cons (scm_from_latin1_symbol ("stringbuf-chars"),
  848. sbc);
  849. }
  850. e5 = scm_cons (scm_from_latin1_symbol ("stringbuf-length"),
  851. scm_from_size_t (STRINGBUF_LENGTH (buf)));
  852. if (STRINGBUF_SHARED (buf))
  853. e6 = scm_cons (scm_from_latin1_symbol ("stringbuf-shared"),
  854. SCM_BOOL_T);
  855. else
  856. e6 = scm_cons (scm_from_latin1_symbol ("stringbuf-shared"),
  857. SCM_BOOL_F);
  858. if (STRINGBUF_WIDE (buf))
  859. e7 = scm_cons (scm_from_latin1_symbol ("stringbuf-wide"),
  860. SCM_BOOL_T);
  861. else
  862. e7 = scm_cons (scm_from_latin1_symbol ("stringbuf-wide"),
  863. SCM_BOOL_F);
  864. return scm_list_n (e1, e2, e3, e4, e5, e6, e7, SCM_UNDEFINED);
  865. }
  866. #undef FUNC_NAME
  867. #ifdef SCM_STRING_LENGTH_HISTOGRAM
  868. SCM_DEFINE (scm_sys_stringbuf_hist, "%stringbuf-hist", 0, 0, 0, (void), "")
  869. #define FUNC_NAME s_scm_sys_stringbuf_hist
  870. {
  871. int i;
  872. for (i = 0; i < 1000; i++)
  873. if (lenhist[i])
  874. fprintf (stderr, " %3d: %u\n", i, lenhist[i]);
  875. fprintf (stderr, ">999: %u\n", lenhist[1000]);
  876. return SCM_UNSPECIFIED;
  877. }
  878. #undef FUNC_NAME
  879. #endif
  880. SCM_DEFINE (scm_string_p, "string?", 1, 0, 0,
  881. (SCM obj),
  882. "Return @code{#t} if @var{obj} is a string, else @code{#f}.")
  883. #define FUNC_NAME s_scm_string_p
  884. {
  885. return scm_from_bool (IS_STRING (obj));
  886. }
  887. #undef FUNC_NAME
  888. SCM_REGISTER_PROC (s_scm_list_to_string, "list->string", 1, 0, 0, scm_string);
  889. SCM_DEFINE (scm_string, "string", 0, 0, 1,
  890. (SCM chrs),
  891. "@deffnx {Scheme Procedure} list->string chrs\n"
  892. "Return a newly allocated string composed of the arguments,\n"
  893. "@var{chrs}.")
  894. #define FUNC_NAME s_scm_string
  895. {
  896. SCM result = SCM_BOOL_F;
  897. SCM rest;
  898. size_t len;
  899. size_t p = 0;
  900. long i;
  901. int wide = 0;
  902. /* Verify that this is a list of chars. */
  903. i = scm_ilength (chrs);
  904. SCM_ASSERT (i >= 0, chrs, SCM_ARG1, FUNC_NAME);
  905. len = (size_t) i;
  906. rest = chrs;
  907. while (len > 0 && scm_is_pair (rest))
  908. {
  909. SCM elt = SCM_CAR (rest);
  910. SCM_VALIDATE_CHAR (SCM_ARGn, elt);
  911. if (SCM_CHAR (elt) > 0xFF)
  912. wide = 1;
  913. rest = SCM_CDR (rest);
  914. len--;
  915. scm_remember_upto_here_1 (elt);
  916. }
  917. /* Construct a string containing this list of chars. */
  918. len = (size_t) i;
  919. rest = chrs;
  920. if (wide == 0)
  921. {
  922. char *buf;
  923. result = scm_i_make_string (len, NULL, 0);
  924. result = scm_i_string_start_writing (result);
  925. buf = scm_i_string_writable_chars (result);
  926. while (len > 0 && scm_is_pair (rest))
  927. {
  928. SCM elt = SCM_CAR (rest);
  929. buf[p] = (unsigned char) SCM_CHAR (elt);
  930. p++;
  931. rest = SCM_CDR (rest);
  932. len--;
  933. scm_remember_upto_here_1 (elt);
  934. }
  935. }
  936. else
  937. {
  938. scm_t_wchar *buf;
  939. result = scm_i_make_wide_string (len, NULL, 0);
  940. result = scm_i_string_start_writing (result);
  941. buf = scm_i_string_writable_wide_chars (result);
  942. while (len > 0 && scm_is_pair (rest))
  943. {
  944. SCM elt = SCM_CAR (rest);
  945. buf[p] = SCM_CHAR (elt);
  946. p++;
  947. rest = SCM_CDR (rest);
  948. len--;
  949. scm_remember_upto_here_1 (elt);
  950. }
  951. }
  952. scm_i_string_stop_writing ();
  953. if (len > 0)
  954. scm_misc_error (NULL, "list changed while constructing string", SCM_EOL);
  955. if (!scm_is_null (rest))
  956. scm_wrong_type_arg_msg (NULL, 0, chrs, "proper list");
  957. return result;
  958. }
  959. #undef FUNC_NAME
  960. SCM_DEFINE (scm_make_string, "make-string", 1, 1, 0,
  961. (SCM k, SCM chr),
  962. "Return a newly allocated string of\n"
  963. "length @var{k}. If @var{chr} is given, then all elements of\n"
  964. "the string are initialized to @var{chr}, otherwise the contents\n"
  965. "of the @var{string} are all set to @var{#\nul}.")
  966. #define FUNC_NAME s_scm_make_string
  967. {
  968. return scm_c_make_string (scm_to_size_t (k), chr);
  969. }
  970. #undef FUNC_NAME
  971. SCM
  972. scm_c_make_string (size_t len, SCM chr)
  973. #define FUNC_NAME NULL
  974. {
  975. size_t p;
  976. char *contents = NULL;
  977. SCM res = scm_i_make_string (len, &contents, 0);
  978. /* If no char is given, initialize string contents to NULL. */
  979. if (SCM_UNBNDP (chr))
  980. memset (contents, 0, len);
  981. else
  982. {
  983. SCM_VALIDATE_CHAR (0, chr);
  984. res = scm_i_string_start_writing (res);
  985. for (p = 0; p < len; p++)
  986. scm_i_string_set_x (res, p, SCM_CHAR (chr));
  987. scm_i_string_stop_writing ();
  988. }
  989. return res;
  990. }
  991. #undef FUNC_NAME
  992. SCM_DEFINE (scm_string_length, "string-length", 1, 0, 0,
  993. (SCM string),
  994. "Return the number of characters in @var{string}.")
  995. #define FUNC_NAME s_scm_string_length
  996. {
  997. SCM_VALIDATE_STRING (1, string);
  998. return scm_from_size_t (STRING_LENGTH (string));
  999. }
  1000. #undef FUNC_NAME
  1001. SCM_DEFINE (scm_string_bytes_per_char, "string-bytes-per-char", 1, 0, 0,
  1002. (SCM string),
  1003. "Return the bytes used to represent a character in @var{string}."
  1004. "This will return 1 or 4.")
  1005. #define FUNC_NAME s_scm_string_bytes_per_char
  1006. {
  1007. SCM_VALIDATE_STRING (1, string);
  1008. if (!scm_i_is_narrow_string (string))
  1009. return scm_from_int (4);
  1010. return scm_from_int (1);
  1011. }
  1012. #undef FUNC_NAME
  1013. size_t
  1014. scm_c_string_length (SCM string)
  1015. {
  1016. if (!IS_STRING (string))
  1017. scm_wrong_type_arg_msg (NULL, 0, string, "string");
  1018. return STRING_LENGTH (string);
  1019. }
  1020. SCM_DEFINE (scm_string_ref, "string-ref", 2, 0, 0,
  1021. (SCM str, SCM k),
  1022. "Return character @var{k} of @var{str} using zero-origin\n"
  1023. "indexing. @var{k} must be a valid index of @var{str}.")
  1024. #define FUNC_NAME s_scm_string_ref
  1025. {
  1026. size_t len;
  1027. unsigned long idx;
  1028. SCM_VALIDATE_STRING (1, str);
  1029. len = scm_i_string_length (str);
  1030. if (SCM_LIKELY (len > 0))
  1031. idx = scm_to_unsigned_integer (k, 0, len - 1);
  1032. else
  1033. scm_out_of_range (NULL, k);
  1034. if (scm_i_is_narrow_string (str))
  1035. return SCM_MAKE_CHAR (scm_i_string_chars (str)[idx]);
  1036. else
  1037. return SCM_MAKE_CHAR (scm_i_string_wide_chars (str)[idx]);
  1038. }
  1039. #undef FUNC_NAME
  1040. SCM
  1041. scm_c_string_ref (SCM str, size_t p)
  1042. {
  1043. if (p >= scm_i_string_length (str))
  1044. scm_out_of_range (NULL, scm_from_size_t (p));
  1045. if (scm_i_is_narrow_string (str))
  1046. return SCM_MAKE_CHAR (scm_i_string_chars (str)[p]);
  1047. else
  1048. return SCM_MAKE_CHAR (scm_i_string_wide_chars (str)[p]);
  1049. }
  1050. SCM_DEFINE (scm_string_set_x, "string-set!", 3, 0, 0,
  1051. (SCM str, SCM k, SCM chr),
  1052. "Store @var{chr} in element @var{k} of @var{str} and return\n"
  1053. "an unspecified value. @var{k} must be a valid index of\n"
  1054. "@var{str}.")
  1055. #define FUNC_NAME s_scm_string_set_x
  1056. {
  1057. size_t len;
  1058. unsigned long idx;
  1059. SCM_VALIDATE_STRING (1, str);
  1060. len = scm_i_string_length (str);
  1061. if (SCM_LIKELY (len > 0))
  1062. idx = scm_to_unsigned_integer (k, 0, len - 1);
  1063. else
  1064. scm_out_of_range (NULL, k);
  1065. SCM_VALIDATE_CHAR (3, chr);
  1066. str = scm_i_string_start_writing (str);
  1067. scm_i_string_set_x (str, idx, SCM_CHAR (chr));
  1068. scm_i_string_stop_writing ();
  1069. return SCM_UNSPECIFIED;
  1070. }
  1071. #undef FUNC_NAME
  1072. void
  1073. scm_c_string_set_x (SCM str, size_t p, SCM chr)
  1074. {
  1075. if (p >= scm_i_string_length (str))
  1076. scm_out_of_range (NULL, scm_from_size_t (p));
  1077. str = scm_i_string_start_writing (str);
  1078. scm_i_string_set_x (str, p, SCM_CHAR (chr));
  1079. scm_i_string_stop_writing ();
  1080. }
  1081. SCM_DEFINE (scm_substring, "substring", 2, 1, 0,
  1082. (SCM str, SCM start, SCM end),
  1083. "Return a newly allocated string formed from the characters\n"
  1084. "of @var{str} beginning with index @var{start} (inclusive) and\n"
  1085. "ending with index @var{end} (exclusive).\n"
  1086. "@var{str} must be a string, @var{start} and @var{end} must be\n"
  1087. "exact integers satisfying:\n\n"
  1088. "0 <= @var{start} <= @var{end} <= (string-length @var{str}).")
  1089. #define FUNC_NAME s_scm_substring
  1090. {
  1091. size_t len, from, to;
  1092. SCM_VALIDATE_STRING (1, str);
  1093. len = scm_i_string_length (str);
  1094. from = scm_to_unsigned_integer (start, 0, len);
  1095. if (SCM_UNBNDP (end))
  1096. to = len;
  1097. else
  1098. to = scm_to_unsigned_integer (end, from, len);
  1099. return scm_i_substring (str, from, to);
  1100. }
  1101. #undef FUNC_NAME
  1102. SCM_DEFINE (scm_substring_read_only, "substring/read-only", 2, 1, 0,
  1103. (SCM str, SCM start, SCM end),
  1104. "Return a newly allocated string formed from the characters\n"
  1105. "of @var{str} beginning with index @var{start} (inclusive) and\n"
  1106. "ending with index @var{end} (exclusive).\n"
  1107. "@var{str} must be a string, @var{start} and @var{end} must be\n"
  1108. "exact integers satisfying:\n"
  1109. "\n"
  1110. "0 <= @var{start} <= @var{end} <= (string-length @var{str}).\n"
  1111. "\n"
  1112. "The returned string is read-only.\n")
  1113. #define FUNC_NAME s_scm_substring_read_only
  1114. {
  1115. size_t len, from, to;
  1116. SCM_VALIDATE_STRING (1, str);
  1117. len = scm_i_string_length (str);
  1118. from = scm_to_unsigned_integer (start, 0, len);
  1119. if (SCM_UNBNDP (end))
  1120. to = len;
  1121. else
  1122. to = scm_to_unsigned_integer (end, from, len);
  1123. return scm_i_substring_read_only (str, from, to);
  1124. }
  1125. #undef FUNC_NAME
  1126. SCM_DEFINE (scm_substring_copy, "substring/copy", 2, 1, 0,
  1127. (SCM str, SCM start, SCM end),
  1128. "Return a newly allocated string formed from the characters\n"
  1129. "of @var{str} beginning with index @var{start} (inclusive) and\n"
  1130. "ending with index @var{end} (exclusive).\n"
  1131. "@var{str} must be a string, @var{start} and @var{end} must be\n"
  1132. "exact integers satisfying:\n\n"
  1133. "0 <= @var{start} <= @var{end} <= (string-length @var{str}).")
  1134. #define FUNC_NAME s_scm_substring_copy
  1135. {
  1136. /* For the Scheme version, START is mandatory, but for the C
  1137. version, it is optional. See scm_string_copy in srfi-13.c for a
  1138. rationale.
  1139. */
  1140. size_t from, to;
  1141. SCM_VALIDATE_STRING (1, str);
  1142. scm_i_get_substring_spec (scm_i_string_length (str),
  1143. start, &from, end, &to);
  1144. return scm_i_substring_copy (str, from, to);
  1145. }
  1146. #undef FUNC_NAME
  1147. SCM_DEFINE (scm_substring_shared, "substring/shared", 2, 1, 0,
  1148. (SCM str, SCM start, SCM end),
  1149. "Return string that indirectly refers to the characters\n"
  1150. "of @var{str} beginning with index @var{start} (inclusive) and\n"
  1151. "ending with index @var{end} (exclusive).\n"
  1152. "@var{str} must be a string, @var{start} and @var{end} must be\n"
  1153. "exact integers satisfying:\n\n"
  1154. "0 <= @var{start} <= @var{end} <= (string-length @var{str}).")
  1155. #define FUNC_NAME s_scm_substring_shared
  1156. {
  1157. size_t len, from, to;
  1158. SCM_VALIDATE_STRING (1, str);
  1159. len = scm_i_string_length (str);
  1160. from = scm_to_unsigned_integer (start, 0, len);
  1161. if (SCM_UNBNDP (end))
  1162. to = len;
  1163. else
  1164. to = scm_to_unsigned_integer (end, from, len);
  1165. return scm_i_substring_shared (str, from, to);
  1166. }
  1167. #undef FUNC_NAME
  1168. SCM_DEFINE (scm_string_append, "string-append", 0, 0, 1,
  1169. (SCM args),
  1170. "Return a newly allocated string whose characters form the\n"
  1171. "concatenation of the given strings, @var{args}.")
  1172. #define FUNC_NAME s_scm_string_append
  1173. {
  1174. SCM res;
  1175. size_t len = 0;
  1176. int wide = 0;
  1177. SCM l, s;
  1178. size_t i;
  1179. union
  1180. {
  1181. char *narrow;
  1182. scm_t_wchar *wide;
  1183. } data;
  1184. SCM_VALIDATE_REST_ARGUMENT (args);
  1185. for (l = args; !scm_is_null (l); l = SCM_CDR (l))
  1186. {
  1187. s = SCM_CAR (l);
  1188. SCM_VALIDATE_STRING (SCM_ARGn, s);
  1189. len += scm_i_string_length (s);
  1190. if (!scm_i_is_narrow_string (s))
  1191. wide = 1;
  1192. }
  1193. data.narrow = NULL;
  1194. if (!wide)
  1195. res = scm_i_make_string (len, &data.narrow, 0);
  1196. else
  1197. res = scm_i_make_wide_string (len, &data.wide, 0);
  1198. for (l = args; !scm_is_null (l); l = SCM_CDR (l))
  1199. {
  1200. size_t len;
  1201. s = SCM_CAR (l);
  1202. SCM_VALIDATE_STRING (SCM_ARGn, s);
  1203. len = scm_i_string_length (s);
  1204. if (!wide)
  1205. {
  1206. memcpy (data.narrow, scm_i_string_chars (s), len);
  1207. data.narrow += len;
  1208. }
  1209. else
  1210. {
  1211. if (scm_i_is_narrow_string (s))
  1212. {
  1213. for (i = 0; i < scm_i_string_length (s); i++)
  1214. data.wide[i] = (unsigned char) scm_i_string_chars (s)[i];
  1215. }
  1216. else
  1217. u32_cpy ((scm_t_uint32 *) data.wide,
  1218. (scm_t_uint32 *) scm_i_string_wide_chars (s), len);
  1219. data.wide += len;
  1220. }
  1221. scm_remember_upto_here_1 (s);
  1222. }
  1223. return res;
  1224. }
  1225. #undef FUNC_NAME
  1226. /* Charset conversion error handling. */
  1227. SCM_SYMBOL (scm_encoding_error_key, "encoding-error");
  1228. SCM_SYMBOL (scm_decoding_error_key, "decoding-error");
  1229. /* Raise an exception informing that character CHR could not be written
  1230. to PORT in its current encoding. */
  1231. void
  1232. scm_encoding_error (const char *subr, int err, const char *message,
  1233. SCM port, SCM chr)
  1234. {
  1235. scm_throw (scm_encoding_error_key,
  1236. scm_list_n (scm_from_latin1_string (subr),
  1237. scm_from_latin1_string (message),
  1238. scm_from_int (err),
  1239. port, chr,
  1240. SCM_UNDEFINED));
  1241. }
  1242. /* Raise an exception informing of an encoding error on PORT. This
  1243. means that a character could not be written in PORT's encoding. */
  1244. void
  1245. scm_decoding_error (const char *subr, int err, const char *message, SCM port)
  1246. {
  1247. scm_throw (scm_decoding_error_key,
  1248. scm_list_n (scm_from_latin1_string (subr),
  1249. scm_from_latin1_string (message),
  1250. scm_from_int (err),
  1251. port,
  1252. SCM_UNDEFINED));
  1253. }
  1254. /* String conversion to/from C. */
  1255. SCM
  1256. scm_from_stringn (const char *str, size_t len, const char *encoding,
  1257. scm_t_string_failed_conversion_handler handler)
  1258. {
  1259. size_t u32len, i;
  1260. scm_t_wchar *u32;
  1261. int wide = 0;
  1262. SCM res;
  1263. /* The order of these checks is important. */
  1264. if (!str && len != 0)
  1265. scm_misc_error ("scm_from_stringn", "NULL string pointer", SCM_EOL);
  1266. if (len == (size_t) -1)
  1267. len = strlen (str);
  1268. if (len == 0)
  1269. return scm_nullstr;
  1270. if (encoding == NULL)
  1271. {
  1272. /* If encoding is null, use Latin-1. */
  1273. char *buf;
  1274. res = scm_i_make_string (len, &buf, 0);
  1275. memcpy (buf, str, len);
  1276. return res;
  1277. }
  1278. u32len = 0;
  1279. u32 = (scm_t_wchar *) u32_conv_from_encoding (encoding,
  1280. (enum iconv_ilseq_handler)
  1281. handler,
  1282. str, len,
  1283. NULL,
  1284. NULL, &u32len);
  1285. if (SCM_UNLIKELY (u32 == NULL))
  1286. {
  1287. /* Raise an error and pass the raw C string as a bytevector to the `throw'
  1288. handler. */
  1289. SCM bv;
  1290. signed char *buf;
  1291. buf = scm_gc_malloc_pointerless (len, "bytevector");
  1292. memcpy (buf, str, len);
  1293. bv = scm_c_take_bytevector (buf, len, SCM_BOOL_F);
  1294. scm_decoding_error (__func__, errno,
  1295. "input locale conversion error", bv);
  1296. }
  1297. i = 0;
  1298. while (i < u32len)
  1299. if (u32[i++] > 0xFF)
  1300. {
  1301. wide = 1;
  1302. break;
  1303. }
  1304. if (!wide)
  1305. {
  1306. char *dst;
  1307. res = scm_i_make_string (u32len, &dst, 0);
  1308. for (i = 0; i < u32len; i ++)
  1309. dst[i] = (unsigned char) u32[i];
  1310. dst[u32len] = '\0';
  1311. }
  1312. else
  1313. {
  1314. scm_t_wchar *wdst;
  1315. res = scm_i_make_wide_string (u32len, &wdst, 0);
  1316. u32_cpy ((scm_t_uint32 *) wdst, (scm_t_uint32 *) u32, u32len);
  1317. wdst[u32len] = 0;
  1318. }
  1319. free (u32);
  1320. return res;
  1321. }
  1322. SCM
  1323. scm_from_locale_string (const char *str)
  1324. {
  1325. return scm_from_locale_stringn (str, -1);
  1326. }
  1327. SCM
  1328. scm_from_locale_stringn (const char *str, size_t len)
  1329. {
  1330. return scm_from_stringn (str, len, locale_charset (),
  1331. scm_i_get_conversion_strategy (SCM_BOOL_F));
  1332. }
  1333. SCM
  1334. scm_from_latin1_string (const char *str)
  1335. {
  1336. return scm_from_latin1_stringn (str, -1);
  1337. }
  1338. SCM
  1339. scm_from_latin1_stringn (const char *str, size_t len)
  1340. {
  1341. char *buf;
  1342. SCM result;
  1343. if (len == (size_t) -1)
  1344. len = strlen (str);
  1345. /* Make a narrow string and copy STR as is. */
  1346. result = scm_i_make_string (len, &buf, 0);
  1347. memcpy (buf, str, len);
  1348. return result;
  1349. }
  1350. SCM
  1351. scm_from_utf8_string (const char *str)
  1352. {
  1353. return scm_from_utf8_stringn (str, -1);
  1354. }
  1355. SCM
  1356. scm_from_utf8_stringn (const char *str, size_t len)
  1357. {
  1358. return scm_from_stringn (str, len, "UTF-8", SCM_FAILED_CONVERSION_ERROR);
  1359. }
  1360. SCM
  1361. scm_from_utf32_string (const scm_t_wchar *str)
  1362. {
  1363. return scm_from_utf32_stringn (str, -1);
  1364. }
  1365. SCM
  1366. scm_from_utf32_stringn (const scm_t_wchar *str, size_t len)
  1367. {
  1368. SCM result;
  1369. scm_t_wchar *buf;
  1370. if (len == (size_t) -1)
  1371. len = u32_strlen ((uint32_t *) str);
  1372. result = scm_i_make_wide_string (len, &buf, 0);
  1373. memcpy (buf, str, len * sizeof (scm_t_wchar));
  1374. scm_i_try_narrow_string (result);
  1375. return result;
  1376. }
  1377. /* Create a new scheme string from the C string STR. The memory of
  1378. STR may be used directly as storage for the new string. */
  1379. /* FIXME: GC-wise, the only way to use the memory area pointed to by STR
  1380. would be to register a finalizer to eventually free(3) STR, which isn't
  1381. worth it. Should we just deprecate the `scm_take_' functions? */
  1382. SCM
  1383. scm_take_locale_stringn (char *str, size_t len)
  1384. {
  1385. SCM res;
  1386. res = scm_from_locale_stringn (str, len);
  1387. free (str);
  1388. return res;
  1389. }
  1390. SCM
  1391. scm_take_locale_string (char *str)
  1392. {
  1393. return scm_take_locale_stringn (str, -1);
  1394. }
  1395. /* Change libunistring escapes (`\uXXXX' and `\UXXXXXXXX') in BUF, a
  1396. *LENP-byte locale-encoded string, to `\xXX', `\uXXXX', or `\UXXXXXX'.
  1397. Set *LENP to the size of the resulting string.
  1398. FIXME: This is a hack we should get rid of. See
  1399. <http://lists.gnu.org/archive/html/bug-libunistring/2010-09/msg00004.html>
  1400. for details. */
  1401. static void
  1402. unistring_escapes_to_guile_escapes (char *buf, size_t *lenp)
  1403. {
  1404. char *before, *after;
  1405. size_t i, j;
  1406. before = buf;
  1407. after = buf;
  1408. i = 0;
  1409. j = 0;
  1410. while (i < *lenp)
  1411. {
  1412. if ((i <= *lenp - 6)
  1413. && before[i] == '\\'
  1414. && before[i + 1] == 'u'
  1415. && before[i + 2] == '0' && before[i + 3] == '0')
  1416. {
  1417. /* Convert \u00NN to \xNN */
  1418. after[j] = '\\';
  1419. after[j + 1] = 'x';
  1420. after[j + 2] = tolower ((int) before[i + 4]);
  1421. after[j + 3] = tolower ((int) before[i + 5]);
  1422. i += 6;
  1423. j += 4;
  1424. }
  1425. else if ((i <= *lenp - 10)
  1426. && before[i] == '\\'
  1427. && before[i + 1] == 'U'
  1428. && before[i + 2] == '0' && before[i + 3] == '0')
  1429. {
  1430. /* Convert \U00NNNNNN to \UNNNNNN */
  1431. after[j] = '\\';
  1432. after[j + 1] = 'U';
  1433. after[j + 2] = tolower ((int) before[i + 4]);
  1434. after[j + 3] = tolower ((int) before[i + 5]);
  1435. after[j + 4] = tolower ((int) before[i + 6]);
  1436. after[j + 5] = tolower ((int) before[i + 7]);
  1437. after[j + 6] = tolower ((int) before[i + 8]);
  1438. after[j + 7] = tolower ((int) before[i + 9]);
  1439. i += 10;
  1440. j += 8;
  1441. }
  1442. else
  1443. {
  1444. after[j] = before[i];
  1445. i++;
  1446. j++;
  1447. }
  1448. }
  1449. *lenp = j;
  1450. }
  1451. /* Change libunistring escapes (`\uXXXX' and `\UXXXXXXXX') in BUF, a
  1452. *LENP-byte locale-encoded string, to `\xXXXX;'. Set *LEN to the size
  1453. of the resulting string. BUF must be large enough to handle the
  1454. worst case when `\uXXXX' escapes (6 characters) are replaced by
  1455. `\xXXXX;' (7 characters). */
  1456. static void
  1457. unistring_escapes_to_r6rs_escapes (char *buf, size_t *lenp)
  1458. {
  1459. char *before, *after;
  1460. size_t i, j;
  1461. /* The worst case is if the input string contains all 4-digit hex escapes.
  1462. "\uXXXX" (six characters) becomes "\xXXXX;" (seven characters) */
  1463. size_t max_out_len = (*lenp * 7) / 6 + 1;
  1464. size_t nzeros, ndigits;
  1465. before = buf;
  1466. after = alloca (max_out_len);
  1467. i = 0;
  1468. j = 0;
  1469. while (i < *lenp)
  1470. {
  1471. if (((i <= *lenp - 6) && before[i] == '\\' && before[i + 1] == 'u')
  1472. || ((i <= *lenp - 10) && before[i] == '\\' && before[i + 1] == 'U'))
  1473. {
  1474. if (before[i + 1] == 'u')
  1475. ndigits = 4;
  1476. else if (before[i + 1] == 'U')
  1477. ndigits = 8;
  1478. else
  1479. abort ();
  1480. /* Add the R6RS hex escape initial sequence. */
  1481. after[j] = '\\';
  1482. after[j + 1] = 'x';
  1483. /* Move string positions to the start of the hex numbers. */
  1484. i += 2;
  1485. j += 2;
  1486. /* Find the number of initial zeros in this hex number. */
  1487. nzeros = 0;
  1488. while (before[i + nzeros] == '0' && nzeros < ndigits)
  1489. nzeros++;
  1490. /* Copy the number, skipping initial zeros, and then move the string
  1491. positions. */
  1492. if (nzeros == ndigits)
  1493. {
  1494. after[j] = '0';
  1495. i += ndigits;
  1496. j += 1;
  1497. }
  1498. else
  1499. {
  1500. int pos;
  1501. for (pos = 0; pos < ndigits - nzeros; pos++)
  1502. after[j + pos] = tolower ((int) before[i + nzeros + pos]);
  1503. i += ndigits;
  1504. j += (ndigits - nzeros);
  1505. }
  1506. /* Add terminating semicolon. */
  1507. after[j] = ';';
  1508. j++;
  1509. }
  1510. else
  1511. {
  1512. after[j] = before[i];
  1513. i++;
  1514. j++;
  1515. }
  1516. }
  1517. *lenp = j;
  1518. memcpy (before, after, j);
  1519. }
  1520. char *
  1521. scm_to_locale_string (SCM str)
  1522. {
  1523. return scm_to_locale_stringn (str, NULL);
  1524. }
  1525. char *
  1526. scm_to_locale_stringn (SCM str, size_t *lenp)
  1527. {
  1528. return scm_to_stringn (str, lenp,
  1529. locale_charset (),
  1530. scm_i_get_conversion_strategy (SCM_BOOL_F));
  1531. }
  1532. char *
  1533. scm_to_latin1_string (SCM str)
  1534. {
  1535. return scm_to_latin1_stringn (str, NULL);
  1536. }
  1537. char *
  1538. scm_to_latin1_stringn (SCM str, size_t *lenp)
  1539. #define FUNC_NAME "scm_to_latin1_stringn"
  1540. {
  1541. char *result;
  1542. SCM_VALIDATE_STRING (1, str);
  1543. if (scm_i_is_narrow_string (str))
  1544. {
  1545. if (lenp)
  1546. *lenp = scm_i_string_length (str);
  1547. result = scm_strdup (scm_i_string_data (str));
  1548. }
  1549. else
  1550. result = scm_to_stringn (str, lenp, NULL,
  1551. SCM_FAILED_CONVERSION_ERROR);
  1552. return result;
  1553. }
  1554. #undef FUNC_NAME
  1555. char *
  1556. scm_to_utf8_string (SCM str)
  1557. {
  1558. return scm_to_utf8_stringn (str, NULL);
  1559. }
  1560. char *
  1561. scm_to_utf8_stringn (SCM str, size_t *lenp)
  1562. {
  1563. return scm_to_stringn (str, lenp, "UTF-8", SCM_FAILED_CONVERSION_ERROR);
  1564. }
  1565. scm_t_wchar *
  1566. scm_to_utf32_string (SCM str)
  1567. {
  1568. return scm_to_utf32_stringn (str, NULL);
  1569. }
  1570. scm_t_wchar *
  1571. scm_to_utf32_stringn (SCM str, size_t *lenp)
  1572. #define FUNC_NAME "scm_to_utf32_stringn"
  1573. {
  1574. scm_t_wchar *result;
  1575. SCM_VALIDATE_STRING (1, str);
  1576. if (scm_i_is_narrow_string (str))
  1577. result = (scm_t_wchar *)
  1578. scm_to_stringn (str, lenp, "UTF-32",
  1579. SCM_FAILED_CONVERSION_ERROR);
  1580. else
  1581. {
  1582. size_t len;
  1583. len = scm_i_string_length (str);
  1584. if (lenp)
  1585. *lenp = len;
  1586. result = scm_malloc ((len + 1) * sizeof (scm_t_wchar));
  1587. memcpy (result, scm_i_string_wide_chars (str),
  1588. len * sizeof (scm_t_wchar));
  1589. result[len] = 0;
  1590. }
  1591. return result;
  1592. }
  1593. #undef FUNC_NAME
  1594. /* Return a malloc(3)-allocated buffer containing the contents of STR encoded
  1595. according to ENCODING. If LENP is non-NULL, set it to the size in bytes of
  1596. the returned buffer. If the conversion to ENCODING fails, apply the strategy
  1597. defined by HANDLER. */
  1598. char *
  1599. scm_to_stringn (SCM str, size_t *lenp, const char *encoding,
  1600. scm_t_string_failed_conversion_handler handler)
  1601. {
  1602. char *buf;
  1603. size_t ilen, len, i;
  1604. int ret;
  1605. const char *enc;
  1606. if (!scm_is_string (str))
  1607. scm_wrong_type_arg_msg (NULL, 0, str, "string");
  1608. ilen = scm_i_string_length (str);
  1609. if (ilen == 0)
  1610. {
  1611. buf = scm_malloc (1);
  1612. buf[0] = '\0';
  1613. if (lenp)
  1614. *lenp = 0;
  1615. return buf;
  1616. }
  1617. if (lenp == NULL)
  1618. for (i = 0; i < ilen; i++)
  1619. if (scm_i_string_ref (str, i) == '\0')
  1620. scm_misc_error (NULL,
  1621. "string contains #\\nul character: ~S",
  1622. scm_list_1 (str));
  1623. if (scm_i_is_narrow_string (str) && (encoding == NULL))
  1624. {
  1625. /* If using native Latin-1 encoding, just copy the string
  1626. contents. */
  1627. if (lenp)
  1628. {
  1629. buf = scm_malloc (ilen);
  1630. memcpy (buf, scm_i_string_chars (str), ilen);
  1631. *lenp = ilen;
  1632. return buf;
  1633. }
  1634. else
  1635. {
  1636. buf = scm_malloc (ilen + 1);
  1637. memcpy (buf, scm_i_string_chars (str), ilen);
  1638. buf[ilen] = '\0';
  1639. return buf;
  1640. }
  1641. }
  1642. buf = NULL;
  1643. len = 0;
  1644. enc = encoding;
  1645. if (enc == NULL)
  1646. enc = "ISO-8859-1";
  1647. if (scm_i_is_narrow_string (str))
  1648. {
  1649. ret = mem_iconveh (scm_i_string_chars (str), ilen,
  1650. "ISO-8859-1", enc,
  1651. (enum iconv_ilseq_handler) handler, NULL,
  1652. &buf, &len);
  1653. if (ret != 0)
  1654. scm_encoding_error (__func__, errno,
  1655. "cannot convert narrow string to output locale",
  1656. SCM_BOOL_F,
  1657. /* FIXME: Faulty character unknown. */
  1658. SCM_BOOL_F);
  1659. }
  1660. else
  1661. {
  1662. buf = u32_conv_to_encoding (enc,
  1663. (enum iconv_ilseq_handler) handler,
  1664. (scm_t_uint32 *) scm_i_string_wide_chars (str),
  1665. ilen,
  1666. NULL,
  1667. NULL, &len);
  1668. if (buf == NULL)
  1669. scm_encoding_error (__func__, errno,
  1670. "cannot convert wide string to output locale",
  1671. SCM_BOOL_F,
  1672. /* FIXME: Faulty character unknown. */
  1673. SCM_BOOL_F);
  1674. }
  1675. if (handler == SCM_FAILED_CONVERSION_ESCAPE_SEQUENCE)
  1676. {
  1677. if (SCM_R6RS_ESCAPES_P)
  1678. {
  1679. /* The worst case is if the input string contains all 4-digit
  1680. hex escapes. "\uXXXX" (six characters) becomes "\xXXXX;"
  1681. (seven characters). Make BUF large enough to hold
  1682. that. */
  1683. buf = scm_realloc (buf, (len * 7) / 6 + 1);
  1684. unistring_escapes_to_r6rs_escapes (buf, &len);
  1685. }
  1686. else
  1687. unistring_escapes_to_guile_escapes (buf, &len);
  1688. buf = scm_realloc (buf, len);
  1689. }
  1690. if (lenp)
  1691. *lenp = len;
  1692. else
  1693. {
  1694. buf = scm_realloc (buf, len + 1);
  1695. buf[len] = '\0';
  1696. }
  1697. scm_remember_upto_here_1 (str);
  1698. return buf;
  1699. }
  1700. size_t
  1701. scm_to_locale_stringbuf (SCM str, char *buf, size_t max_len)
  1702. {
  1703. size_t len;
  1704. char *result = NULL;
  1705. if (!scm_is_string (str))
  1706. scm_wrong_type_arg_msg (NULL, 0, str, "string");
  1707. result = scm_to_locale_stringn (str, &len);
  1708. memcpy (buf, result, (len > max_len) ? max_len : len);
  1709. free (result);
  1710. scm_remember_upto_here_1 (str);
  1711. return len;
  1712. }
  1713. /* Unicode string normalization. */
  1714. /* This function is a partial clone of SCM_STRING_TO_U32_BUF from
  1715. libguile/i18n.c. It would be useful to have this factored out into a more
  1716. convenient location, but its use of alloca makes that tricky to do. */
  1717. static SCM
  1718. normalize_str (SCM string, uninorm_t form)
  1719. {
  1720. SCM ret;
  1721. scm_t_uint32 *w_str;
  1722. scm_t_wchar *cbuf;
  1723. size_t rlen, len = scm_i_string_length (string);
  1724. if (scm_i_is_narrow_string (string))
  1725. {
  1726. size_t i;
  1727. const char *buf = scm_i_string_chars (string);
  1728. w_str = alloca (sizeof (scm_t_wchar) * (len + 1));
  1729. for (i = 0; i < len; i ++)
  1730. w_str[i] = (unsigned char) buf[i];
  1731. w_str[len] = 0;
  1732. }
  1733. else
  1734. w_str = (scm_t_uint32 *) scm_i_string_wide_chars (string);
  1735. w_str = u32_normalize (form, w_str, len, NULL, &rlen);
  1736. ret = scm_i_make_wide_string (rlen, &cbuf, 0);
  1737. u32_cpy ((scm_t_uint32 *) cbuf, w_str, rlen);
  1738. free (w_str);
  1739. scm_i_try_narrow_string (ret);
  1740. return ret;
  1741. }
  1742. SCM_DEFINE (scm_string_normalize_nfc, "string-normalize-nfc", 1, 0, 0,
  1743. (SCM string),
  1744. "Returns the NFC normalized form of @var{string}.")
  1745. #define FUNC_NAME s_scm_string_normalize_nfc
  1746. {
  1747. SCM_VALIDATE_STRING (1, string);
  1748. return normalize_str (string, UNINORM_NFC);
  1749. }
  1750. #undef FUNC_NAME
  1751. SCM_DEFINE (scm_string_normalize_nfd, "string-normalize-nfd", 1, 0, 0,
  1752. (SCM string),
  1753. "Returns the NFD normalized form of @var{string}.")
  1754. #define FUNC_NAME s_scm_string_normalize_nfd
  1755. {
  1756. SCM_VALIDATE_STRING (1, string);
  1757. return normalize_str (string, UNINORM_NFD);
  1758. }
  1759. #undef FUNC_NAME
  1760. SCM_DEFINE (scm_string_normalize_nfkc, "string-normalize-nfkc", 1, 0, 0,
  1761. (SCM string),
  1762. "Returns the NFKC normalized form of @var{string}.")
  1763. #define FUNC_NAME s_scm_string_normalize_nfkc
  1764. {
  1765. SCM_VALIDATE_STRING (1, string);
  1766. return normalize_str (string, UNINORM_NFKC);
  1767. }
  1768. #undef FUNC_NAME
  1769. SCM_DEFINE (scm_string_normalize_nfkd, "string-normalize-nfkd", 1, 0, 0,
  1770. (SCM string),
  1771. "Returns the NFKD normalized form of @var{string}.")
  1772. #define FUNC_NAME s_scm_string_normalize_nfkd
  1773. {
  1774. SCM_VALIDATE_STRING (1, string);
  1775. return normalize_str (string, UNINORM_NFKD);
  1776. }
  1777. #undef FUNC_NAME
  1778. /* converts C scm_array of strings to SCM scm_list of strings. */
  1779. /* If argc < 0, a null terminated scm_array is assumed. */
  1780. SCM
  1781. scm_makfromstrs (int argc, char **argv)
  1782. {
  1783. int i = argc;
  1784. SCM lst = SCM_EOL;
  1785. if (0 > i)
  1786. for (i = 0; argv[i]; i++);
  1787. while (i--)
  1788. lst = scm_cons (scm_from_locale_string (argv[i]), lst);
  1789. return lst;
  1790. }
  1791. /* Return a newly allocated array of char pointers to each of the strings
  1792. in args, with a terminating NULL pointer. */
  1793. char **
  1794. scm_i_allocate_string_pointers (SCM list)
  1795. #define FUNC_NAME "scm_i_allocate_string_pointers"
  1796. {
  1797. char **result;
  1798. int len = scm_ilength (list);
  1799. int i;
  1800. if (len < 0)
  1801. scm_wrong_type_arg_msg (NULL, 0, list, "proper list");
  1802. result = scm_gc_malloc ((len + 1) * sizeof (char *),
  1803. "string pointers");
  1804. result[len] = NULL;
  1805. /* The list might be have been modified in another thread, so
  1806. we check LIST before each access.
  1807. */
  1808. for (i = 0; i < len && scm_is_pair (list); i++)
  1809. {
  1810. SCM str;
  1811. size_t len;
  1812. str = SCM_CAR (list);
  1813. len = scm_c_string_length (str);
  1814. result[i] = scm_gc_malloc_pointerless (len + 1, "string pointers");
  1815. memcpy (result[i], scm_i_string_chars (str), len);
  1816. result[i][len] = '\0';
  1817. list = SCM_CDR (list);
  1818. }
  1819. return result;
  1820. }
  1821. #undef FUNC_NAME
  1822. void
  1823. scm_i_get_substring_spec (size_t len,
  1824. SCM start, size_t *cstart,
  1825. SCM end, size_t *cend)
  1826. {
  1827. if (SCM_UNBNDP (start))
  1828. *cstart = 0;
  1829. else
  1830. *cstart = scm_to_unsigned_integer (start, 0, len);
  1831. if (SCM_UNBNDP (end))
  1832. *cend = len;
  1833. else
  1834. *cend = scm_to_unsigned_integer (end, *cstart, len);
  1835. }
  1836. static SCM
  1837. string_handle_ref (scm_t_array_handle *h, size_t index)
  1838. {
  1839. return scm_c_string_ref (h->array, index);
  1840. }
  1841. static void
  1842. string_handle_set (scm_t_array_handle *h, size_t index, SCM val)
  1843. {
  1844. scm_c_string_set_x (h->array, index, val);
  1845. }
  1846. static void
  1847. string_get_handle (SCM v, scm_t_array_handle *h)
  1848. {
  1849. h->array = v;
  1850. h->ndims = 1;
  1851. h->dims = &h->dim0;
  1852. h->dim0.lbnd = 0;
  1853. h->dim0.ubnd = scm_c_string_length (v) - 1;
  1854. h->dim0.inc = 1;
  1855. h->element_type = SCM_ARRAY_ELEMENT_TYPE_CHAR;
  1856. h->elements = h->writable_elements = NULL;
  1857. }
  1858. SCM_ARRAY_IMPLEMENTATION (scm_tc7_string, 0x7f,
  1859. string_handle_ref, string_handle_set,
  1860. string_get_handle)
  1861. SCM_VECTOR_IMPLEMENTATION (SCM_ARRAY_ELEMENT_TYPE_CHAR, scm_make_string)
  1862. void
  1863. scm_init_strings ()
  1864. {
  1865. scm_nullstr = scm_i_make_string (0, NULL, 1);
  1866. #include "libguile/strings.x"
  1867. }
  1868. /*
  1869. Local Variables:
  1870. c-file-style: "gnu"
  1871. End:
  1872. */