utf8.c 9.8 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497
  1. /*
  2. *
  3. * Embedded Linux library
  4. *
  5. * Copyright (C) 2011-2014 Intel Corporation. All rights reserved.
  6. *
  7. * This library is free software; you can redistribute it and/or
  8. * modify it under the terms of the GNU Lesser General Public
  9. * License as published by the Free Software Foundation; either
  10. * version 2.1 of the License, or (at your option) any later version.
  11. *
  12. * This library is distributed in the hope that it will be useful,
  13. * but WITHOUT ANY WARRANTY; without even the implied warranty of
  14. * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
  15. * Lesser General Public License for more details.
  16. *
  17. * You should have received a copy of the GNU Lesser General Public
  18. * License along with this library; if not, write to the Free Software
  19. * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
  20. *
  21. */
  22. #ifdef HAVE_CONFIG_H
  23. #include <config.h>
  24. #endif
  25. #include <stdio.h>
  26. #include <wchar.h>
  27. #include "strv.h"
  28. #include "utf8.h"
  29. #include "private.h"
  30. #include "useful.h"
  31. /**
  32. * SECTION:utf8
  33. * @short_description: UTF-8 utility function
  34. *
  35. * UTF-8 string handling support
  36. */
  37. LIB_EXPORT unsigned char l_ascii_table[256] = {
  38. [0x00 ... 0x08] = L_ASCII_CNTRL,
  39. [0x09 ... 0x0D] = L_ASCII_CNTRL | L_ASCII_SPACE,
  40. [0x0E ... 0x1F] = L_ASCII_CNTRL,
  41. [0x20] = L_ASCII_PRINT | L_ASCII_SPACE,
  42. [0x21 ... 0x2F] = L_ASCII_PRINT | L_ASCII_PUNCT,
  43. [0x30 ... 0x39] = L_ASCII_DIGIT | L_ASCII_XDIGIT | L_ASCII_PRINT,
  44. [0x3A ... 0x40] = L_ASCII_PRINT | L_ASCII_PUNCT,
  45. [0x41 ... 0x46] = L_ASCII_PRINT | L_ASCII_XDIGIT | L_ASCII_UPPER,
  46. [0x47 ... 0x5A] = L_ASCII_PRINT | L_ASCII_UPPER,
  47. [0x5B ... 0x60] = L_ASCII_PRINT | L_ASCII_PUNCT,
  48. [0x61 ... 0x66] = L_ASCII_PRINT | L_ASCII_XDIGIT | L_ASCII_LOWER,
  49. [0x67 ... 0x7A] = L_ASCII_PRINT | L_ASCII_LOWER,
  50. [0x7B ... 0x7E] = L_ASCII_PRINT | L_ASCII_PUNCT,
  51. [0x7F] = L_ASCII_CNTRL,
  52. [0x80 ... 0xFF] = 0,
  53. };
  /*
   * Decide whether @c is acceptable as a decoded character.
   *
   * Values below 0xd800 are accepted without further checks. Rejected are
   * the surrogate range 0xd800-0xdfff, anything above 0x10ffff, the range
   * 0xfdd0-0xfdef, and every value whose low 16 bits are 0xfffe or 0xffff.
   */
  static inline bool __attribute__ ((always_inline))
  valid_unicode(wchar_t c)
  {
      /* Fast path: everything in front of the surrogate block */
      if (c < 0xd800)
          return true;

      /* Surrogates, or past the last code point */
      if (c <= 0xdfff || c >= 0x110000)
          return false;

      /* 0xfdd0-0xfdef, plus the final two values of each 64K plane */
      return !((c >= 0xfdd0 && c <= 0xfdef) || (c & 0xfffe) == 0xfffe);
  }
  67. /**
  68. * l_utf8_get_codepoint
  69. * @str: a pointer to codepoint data
  70. * @len: maximum bytes to read
  71. * @cp: destination for codepoint
  72. *
  73. * Returns: number of bytes read, or -1 for invalid coddepoint
  74. **/
  75. LIB_EXPORT int l_utf8_get_codepoint(const char *str, size_t len, wchar_t *cp)
  76. {
  77. static const wchar_t mins[3] = { 1 << 7, 1 << 11, 1 << 16 };
  78. unsigned int expect_bytes;
  79. wchar_t val;
  80. size_t i;
  81. if (len == 0)
  82. return 0;
  83. if ((signed char) str[0] > 0) {
  84. *cp = str[0];
  85. return 1;
  86. }
  87. expect_bytes = __builtin_clz(~((unsigned char)str[0] << 24));
  88. if (expect_bytes < 2 || expect_bytes > 4)
  89. goto error;
  90. if (expect_bytes > len)
  91. goto error;
  92. val = str[0] & (0xff >> (expect_bytes + 1));
  93. for (i = 1; i < expect_bytes; i++) {
  94. if ((str[i] & 0xc0) != 0x80)
  95. goto error;
  96. val <<= 6;
  97. val |= str[i] & 0x3f;
  98. }
  99. if (val < mins[expect_bytes - 2])
  100. goto error;
  101. if (valid_unicode(val) == false)
  102. goto error;
  103. *cp = val;
  104. return expect_bytes;
  105. error:
  106. return -1;
  107. }
  108. /**
  109. * l_utf8_validate:
  110. * @str: a pointer to character data
  111. * @len: max bytes to validate
  112. * @end: return location for end of valid data
  113. *
  114. * Validates UTF-8 encoded text. If @end is non-NULL, then the end of
  115. * the valid range will be stored there (i.e. the start of the first
  116. * invalid character if some bytes were invalid, or the end of the text
  117. * being validated otherwise).
  118. *
  119. * Returns: Whether the text was valid UTF-8
  120. **/
  121. LIB_EXPORT bool l_utf8_validate(const char *str, size_t len, const char **end)
  122. {
  123. size_t pos = 0;
  124. int ret;
  125. wchar_t val;
  126. while (pos < len && str[pos]) {
  127. ret = l_utf8_get_codepoint(str + pos, len - pos, &val);
  128. if (ret < 0)
  129. goto error;
  130. pos += ret;
  131. }
  132. error:
  133. if (end)
  134. *end = str + pos;
  135. if (pos != len)
  136. return false;
  137. return true;
  138. }
  139. /**
  140. * l_utf8_strlen:
  141. * @str: a pointer to character data
  142. *
  143. * Computes the number of UTF-8 characters (not bytes) in the string given
  144. * by @str.
  145. *
  146. * Returns: The number of UTF-8 characters in the string
  147. **/
  148. LIB_EXPORT size_t l_utf8_strlen(const char *str)
  149. {
  150. size_t l = 0;
  151. size_t i;
  152. unsigned char b;
  153. for (i = 0; str[i]; i++) {
  154. b = str[i];
  155. if ((b >> 6) == 2)
  156. l += 1;
  157. }
  158. return i - l;
  159. }
  /*
   * Number of bytes (1 to 4) used to encode @c in UTF-8. The value is not
   * checked for validity; anything above 0xffff is reported as 4 bytes.
   */
  static inline int __attribute__ ((always_inline))
  utf8_length(wchar_t c)
  {
      /* One byte, plus one more for each encoding threshold exceeded */
      return 1 + (c > 0x7f) + (c > 0x7ff) + (c > 0xffff);
  }
  /*
   * Combine a UTF-16 surrogate pair into the code point it encodes.
   * @h: high (leading) surrogate, expected in 0xd800-0xdbff
   * @l: low (trailing) surrogate, expected in 0xdc00-0xdfff
   *
   * For a well-formed pair the result lies in 0x10000-0x10ffff, which does
   * not fit in 16 bits, so it is returned as wchar_t. The inputs are not
   * range-checked here.
   */
  static inline wchar_t __attribute__ ((always_inline))
  surrogate_value(uint16_t h, uint16_t l)
  {
      return 0x10000 + (h - 0xd800) * 0x400 + (l - 0xdc00);
  }
  176. /*
  177. * l_utf8_from_wchar:
  178. * @c: a wide-character to convert
  179. * @out_buf: Buffer to write out to
  180. *
  181. * Assumes c is valid unicode and out_buf contains enough space for a single
  182. * utf8 character (maximum 4 bytes)
  183. * Returns: number of characters written
  184. */
  185. LIB_EXPORT size_t l_utf8_from_wchar(wchar_t c, char *out_buf)
  186. {
  187. int len = utf8_length(c);
  188. int i;
  189. if (len == 1) {
  190. out_buf[0] = c;
  191. return 1;
  192. }
  193. for (i = len - 1; i; i--) {
  194. out_buf[i] = (c & 0x3f) | 0x80;
  195. c >>= 6;
  196. }
  197. out_buf[0] = (0xff << (8 - len)) | c;
  198. return len;
  199. }
  200. /**
  201. * l_utf8_from_utf16:
  202. * @utf16: Array of UTF16 characters
  203. * @utf16_size: The size of the @utf16 array in bytes. Must be a multiple of 2.
  204. *
  205. * Returns: A newly-allocated buffer containing UTF16 encoded string converted
  206. * to UTF8. The UTF8 string will always be null terminated, even if the
  207. * original UTF16 string was not.
  208. **/
  209. LIB_EXPORT char *l_utf8_from_utf16(const void *utf16, ssize_t utf16_size)
  210. {
  211. char *utf8;
  212. size_t utf8_len = 0;
  213. wchar_t high_surrogate = 0;
  214. ssize_t i = 0;
  215. uint16_t in;
  216. wchar_t c;
  217. if (unlikely(utf16_size % 2))
  218. return NULL;
  219. while (utf16_size < 0 || i < utf16_size) {
  220. in = l_get_u16(utf16 + i);
  221. if (!in)
  222. break;
  223. if (in >= 0xdc00 && in < 0xe000) {
  224. if (high_surrogate)
  225. c = surrogate_value(high_surrogate, in);
  226. else
  227. return NULL;
  228. high_surrogate = 0;
  229. } else {
  230. if (high_surrogate)
  231. return NULL;
  232. if (in >= 0xd800 && in < 0xdc00) {
  233. high_surrogate = in;
  234. goto next;
  235. }
  236. c = in;
  237. }
  238. if (!valid_unicode(c))
  239. return NULL;
  240. utf8_len += utf8_length(c);
  241. next:
  242. i += 2;
  243. }
  244. if (high_surrogate)
  245. return NULL;
  246. utf8 = l_malloc(utf8_len + 1);
  247. utf8_len = 0;
  248. i = 0;
  249. while (utf16_size < 0 || i < utf16_size) {
  250. in = l_get_u16(utf16 + i);
  251. if (!in)
  252. break;
  253. if (in >= 0xd800 && in < 0xdc00) {
  254. high_surrogate = in;
  255. i += 2;
  256. in = l_get_u16(utf16 + i);
  257. c = surrogate_value(high_surrogate, in);
  258. } else
  259. c = in;
  260. utf8_len += l_utf8_from_wchar(c, utf8 + utf8_len);
  261. i += 2;
  262. }
  263. utf8[utf8_len] = '\0';
  264. return utf8;
  265. }
  266. /**
  267. * l_utf8_to_utf16:
  268. * @utf8: UTF8 formatted string
  269. * @out_size: The size in bytes of the converted utf16 string
  270. *
  271. * Converts a UTF8 formatted string to UTF16. It is assumed that the string
  272. * is valid UTF8 and no sanity checking is performed.
  273. *
  274. * Returns: A newly-allocated buffer containing UTF8 encoded string converted
  275. * to UTF16. The UTF16 string will always be null terminated.
  276. **/
  277. LIB_EXPORT void *l_utf8_to_utf16(const char *utf8, size_t *out_size)
  278. {
  279. const char *c;
  280. wchar_t wc;
  281. int len;
  282. uint16_t *utf16;
  283. size_t n_utf16;
  284. if (unlikely(!utf8))
  285. return NULL;
  286. c = utf8;
  287. n_utf16 = 0;
  288. while (*c) {
  289. len = l_utf8_get_codepoint(c, 4, &wc);
  290. if (len < 0)
  291. return NULL;
  292. if (wc < 0x10000)
  293. n_utf16 += 1;
  294. else
  295. n_utf16 += 2;
  296. c += len;
  297. }
  298. utf16 = l_malloc((n_utf16 + 1) * 2);
  299. c = utf8;
  300. n_utf16 = 0;
  301. while (*c) {
  302. len = l_utf8_get_codepoint(c, 4, &wc);
  303. if (wc >= 0x10000) {
  304. utf16[n_utf16++] = (wc - 0x1000) / 0x400 + 0xd800;
  305. utf16[n_utf16++] = (wc - 0x1000) % 0x400 + 0xdc00;
  306. } else
  307. utf16[n_utf16++] = wc;
  308. c += len;
  309. }
  310. utf16[n_utf16] = 0;
  311. if (out_size)
  312. *out_size = (n_utf16 + 1) * 2;
  313. return utf16;
  314. }
  315. /**
  316. * l_utf8_from_ucs2be:
  317. * @ucs2be: Array of UCS2 characters in big-endian format
  318. * @ucs2be_size: The size of the @ucs2 array in bytes. Must be a multiple of 2.
  319. *
  320. * Returns: A newly-allocated buffer containing UCS2BE encoded string converted
  321. * to UTF8. The UTF8 string will always be null terminated, even if the
  322. * original UCS2BE string was not.
  323. **/
  324. LIB_EXPORT char *l_utf8_from_ucs2be(const void *ucs2be, ssize_t ucs2be_size)
  325. {
  326. char *utf8;
  327. size_t utf8_len = 0;
  328. ssize_t i = 0;
  329. uint16_t in;
  330. if (unlikely(ucs2be_size % 2))
  331. return NULL;
  332. while (ucs2be_size < 0 || i < ucs2be_size) {
  333. in = l_get_be16(ucs2be + i);
  334. if (!in)
  335. break;
  336. if (in >= 0xd800 && in < 0xe000)
  337. return NULL;
  338. if (!valid_unicode(in))
  339. return NULL;
  340. utf8_len += utf8_length(in);
  341. i += 2;
  342. }
  343. utf8 = l_malloc(utf8_len + 1);
  344. utf8_len = 0;
  345. i = 0;
  346. while (ucs2be_size < 0 || i < ucs2be_size) {
  347. in = l_get_be16(ucs2be + i);
  348. if (!in)
  349. break;
  350. utf8_len += l_utf8_from_wchar(in, utf8 + utf8_len);
  351. i += 2;
  352. }
  353. utf8[utf8_len] = '\0';
  354. return utf8;
  355. }
  356. /**
  357. * l_utf8_to_ucs2be:
  358. * @utf8: UTF8 formatted string
  359. * @out_size: The size in bytes of the converted ucs2be string
  360. *
  361. * Converts a UTF8 formatted string to UCS2BE. It is assumed that the string
  362. * is valid UTF8 and no sanity checking is performed.
  363. *
  364. * Returns: A newly-allocated buffer containing UTF8 encoded string converted
  365. * to UCS2BE. The UCS2BE string will always be null terminated.
  366. **/
  367. LIB_EXPORT void *l_utf8_to_ucs2be(const char *utf8, size_t *out_size)
  368. {
  369. const char *c;
  370. wchar_t wc;
  371. int len;
  372. uint16_t *ucs2be;
  373. size_t n_ucs2be;
  374. if (unlikely(!utf8))
  375. return NULL;
  376. c = utf8;
  377. n_ucs2be = 0;
  378. while (*c) {
  379. len = l_utf8_get_codepoint(c, 4, &wc);
  380. if (len < 0)
  381. return NULL;
  382. if (wc >= 0x10000)
  383. return NULL;
  384. n_ucs2be += 1;
  385. c += len;
  386. }
  387. ucs2be = l_malloc((n_ucs2be + 1) * 2);
  388. c = utf8;
  389. n_ucs2be = 0;
  390. while (*c) {
  391. len = l_utf8_get_codepoint(c, 4, &wc);
  392. ucs2be[n_ucs2be++] = L_CPU_TO_BE16(wc);
  393. c += len;
  394. }
  395. ucs2be[n_ucs2be] = 0;
  396. if (out_size)
  397. *out_size = (n_ucs2be + 1) * 2;
  398. return ucs2be;
  399. }