strfunc.c 10.0 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360
  1. /* ----------------------------------------------------------------------- *
  2. *
  3. * Copyright 1996-2009 The NASM Authors - All Rights Reserved
  4. * See the file AUTHORS included with the NASM distribution for
  5. * the specific copyright holders.
  6. *
  7. * Redistribution and use in source and binary forms, with or without
  8. * modification, are permitted provided that the following
  9. * conditions are met:
  10. *
  11. * * Redistributions of source code must retain the above copyright
  12. * notice, this list of conditions and the following disclaimer.
  13. * * Redistributions in binary form must reproduce the above
  14. * copyright notice, this list of conditions and the following
  15. * disclaimer in the documentation and/or other materials provided
  16. * with the distribution.
  17. *
  18. * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND
  19. * CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES,
  20. * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
  21. * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
  22. * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
  23. * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
  24. * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
  25. * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
  26. * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  27. * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
  28. * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
  29. * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
  30. * EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  31. *
  32. * ----------------------------------------------------------------------- */
  33. /*
  34. * strfunc.c
  35. *
  36. * String transformation functions
  37. */
  38. #include "nasmlib.h"
  39. #include "nasm.h"
  /*
   * utf8_to_16le: convert a UTF-8 byte string to UTF-16LE.
   *
   *   str - input bytes
   *   len - number of input bytes
   *   op  - output cursor; when NULL nothing is stored and the call only
   *         measures the result
   *
   * Each UTF-16 code unit is stored through WRITESHORT().  Code points
   * above 0xffff are written as a surrogate pair, high surrogate first.
   *
   * Returns the output size in bytes (two per code unit), or (size_t)-1
   * if the input is rejected: a continuation byte where a lead byte is
   * required, a lead byte of 0xfe/0xff, a missing continuation byte, an
   * overlong encoding, a surrogate code point, a value above 0x10ffff,
   * or a sequence cut short by the end of the input.
   */
  static size_t utf8_to_16le(uint8_t *str, size_t len, char *op)
  {
  #define EMIT(x) do { if (op) { WRITESHORT(op,x); } outlen++; } while(0)

      size_t outlen = 0;      /* code units produced so far */
      size_t i;
      int pending = 0;        /* continuation bytes still owed */
      uint32_t cp = 0;        /* code point being assembled */
      uint32_t min_cp = 0;    /* smallest value legal for this length */

      for (i = 0; i < len; i++) {
          uint8_t byte = str[i];

          if (pending) {
              /* Inside a multi-byte sequence: only 10xxxxxx is allowed */
              if ((byte & 0xc0) != 0x80)
                  return (size_t)-1;

              cp = (cp << 6) | (byte & 0x3f);
              if (--pending)
                  continue;

              /* Sequence complete: validate, then emit */
              if (cp < min_cp || cp > 0x10ffff ||
                  (cp >= 0xd800 && cp <= 0xdfff))
                  return (size_t)-1;

              if (cp > 0xffff) {
                  cp -= 0x10000;
                  EMIT(0xd800 | (cp >> 10));
                  EMIT(0xdc00 | (cp & 0x3ff));
              } else {
                  EMIT(cp);
              }
              continue;
          }

          if (byte < 0x80) {
              /* Plain ASCII maps to a single code unit */
              EMIT(byte);
              continue;
          }

          /* Stray continuation byte, or 0xfe/0xff: invalid UTF-8 */
          if (byte < 0xc0 || byte >= 0xfe)
              return (size_t)-1;

          /*
           * Lead byte: record the payload bits, the number of
           * continuation bytes to expect, and the smallest code point
           * that genuinely needs this many bytes.  The 5- and 6-byte
           * forms are parsed but can never pass the 0x10ffff check.
           */
          if (byte < 0xe0) {
              pending = 1;
              min_cp = 0x80;
              cp = byte & 0x1f;
          } else if (byte < 0xf0) {
              pending = 2;
              min_cp = 0x800;
              cp = byte & 0x0f;
          } else if (byte < 0xf8) {
              pending = 3;
              min_cp = 0x10000;
              cp = byte & 0x07;
          } else if (byte < 0xfc) {
              pending = 4;
              min_cp = 0x200000;
              cp = byte & 0x03;
          } else {
              pending = 5;
              min_cp = 0x4000000;
              cp = byte & 0x01;
          }
      }

      /* A sequence still open at end of input is an error */
      if (pending)
          return (size_t)-1;

      return outlen << 1;

  #undef EMIT
  }
  /*
   * Convert a string in UTF-8 format to UTF-16BE
   *
   *   str - input bytes
   *   len - number of input bytes
   *   op  - output cursor; when NULL nothing is stored and the call only
   *         measures the result
   *
   * Each 16-bit code unit is written most significant byte first.  Code
   * points above 0xffff become a surrogate pair; the high surrogate
   * (0xd800..0xdbff) always precedes the low surrogate (0xdc00..0xdfff),
   * exactly as in the little-endian converter -- endianness only affects
   * the byte order inside each code unit, not the order of the units.
   *
   * Returns the output size in bytes (two per code unit), or (size_t)-1
   * if the input is rejected: a continuation byte where a lead byte is
   * required, a lead byte of 0xfe/0xff, a missing continuation byte, an
   * overlong encoding, a surrogate code point, a value above 0x10ffff,
   * or a sequence cut short by the end of the input.
   */
  static size_t utf8_to_16be(uint8_t *str, size_t len, char *op)
  {
      /*
       * The macro must end on the "while (0)" line: a trailing line
       * continuation there would pull the next declaration into the
       * macro body.
       */
  #define EMIT(x)                                 \
      do {                                        \
          uint16_t _y = (x);                      \
          if (op) {                               \
              WRITECHAR(op, _y >> 8);             \
              WRITECHAR(op, _y);                  \
          }                                       \
          outlen++;                               \
      } while (0)

      size_t outlen = 0;          /* code units produced so far */
      int expect = 0;             /* continuation bytes still owed */
      uint8_t c;
      uint32_t v = 0, vmin = 0;   /* code point and its minimum legal value */

      while (len--) {
          c = *str++;

          if (expect) {
              /* Inside a multi-byte sequence: only 10xxxxxx is allowed */
              if ((c & 0xc0) != 0x80)
                  return (size_t)-1;

              v = (v << 6) | (c & 0x3f);
              if (!--expect) {
                  if (v < vmin || v > 0x10ffff ||
                      (v >= 0xd800 && v <= 0xdfff)) {
                      /* Overlong, out of range, or a surrogate */
                      return (size_t)-1;
                  } else if (v > 0xffff) {
                      /* Surrogate pair: high half first, then low half */
                      v -= 0x10000;
                      EMIT(0xd800 | (v >> 10));
                      EMIT(0xdc00 | (v & 0x3ff));
                  } else {
                      EMIT(v);
                  }
              }
              continue;
          }

          if (c < 0x80) {
              EMIT(c);
          } else if (c < 0xc0 || c >= 0xfe) {
              /* Invalid UTF-8 */
              return (size_t)-1;
          } else if (c < 0xe0) {
              v = c & 0x1f;
              expect = 1;
              vmin = 0x80;
          } else if (c < 0xf0) {
              v = c & 0x0f;
              expect = 2;
              vmin = 0x800;
          } else if (c < 0xf8) {
              v = c & 0x07;
              expect = 3;
              vmin = 0x10000;
          } else if (c < 0xfc) {
              /* 5-byte form: parsed, but always fails the range check */
              v = c & 0x03;
              expect = 4;
              vmin = 0x200000;
          } else {
              /* 6-byte form: parsed, but always fails the range check */
              v = c & 0x01;
              expect = 5;
              vmin = 0x4000000;
          }
      }

      /* A sequence still open at end of input is an error */
      return expect ? (size_t)-1 : outlen << 1;

  #undef EMIT
  }
  /*
   * utf8_to_32le: convert a UTF-8 byte string to UTF-32LE.
   *
   *   str - input bytes
   *   len - number of input bytes
   *   op  - output cursor; when NULL nothing is stored and the call only
   *         measures the result
   *
   * Each decoded code point is stored through WRITELONG().
   *
   * Returns the output size in bytes (four per code point), or
   * (size_t)-1 if the input is rejected: a continuation byte where a
   * lead byte is required, a lead byte of 0xfe/0xff, a missing
   * continuation byte, an overlong encoding, a surrogate code point, or
   * a sequence cut short by the end of the input.
   *
   * Note that, unlike the UTF-16 converters, no upper bound is applied:
   * values above 0x10ffff coming from 4-, 5- and 6-byte sequences are
   * passed through.
   */
  static size_t utf8_to_32le(uint8_t *str, size_t len, char *op)
  {
  #define EMIT(x) do { if (op) { WRITELONG(op,x); } outlen++; } while(0)

      size_t outlen = 0;      /* code points produced so far */
      uint32_t acc = 0;       /* code point being assembled */
      uint32_t lowest = 0;    /* smallest value legal for this length */
      int remaining = 0;      /* continuation bytes still owed */
      uint8_t ch;

      for (; len != 0; len--) {
          ch = *str++;

          if (remaining) {
              /* Inside a multi-byte sequence: only 10xxxxxx is allowed */
              if ((ch & 0xc0) != 0x80)
                  return (size_t)-1;

              acc = (acc << 6) | (ch & 0x3f);
              if (--remaining == 0) {
                  /* Reject overlong forms and surrogate code points */
                  if (acc < lowest || !(acc < 0xd800 || acc > 0xdfff))
                      return (size_t)-1;
                  EMIT(acc);
              }
          } else if (ch < 0x80) {
              /* Plain ASCII */
              EMIT(ch);
          } else if (ch < 0xc0 || ch >= 0xfe) {
              /* Stray continuation byte, or 0xfe/0xff: invalid UTF-8 */
              return (size_t)-1;
          } else if (ch < 0xe0) {
              remaining = 1;
              lowest = 0x80;
              acc = ch & 0x1f;
          } else if (ch < 0xf0) {
              remaining = 2;
              lowest = 0x800;
              acc = ch & 0x0f;
          } else if (ch < 0xf8) {
              remaining = 3;
              lowest = 0x10000;
              acc = ch & 0x07;
          } else if (ch < 0xfc) {
              remaining = 4;
              lowest = 0x200000;
              acc = ch & 0x03;
          } else {
              remaining = 5;
              lowest = 0x4000000;
              acc = ch & 0x01;
          }
      }

      /* A sequence still open at end of input is an error */
      if (remaining)
          return (size_t)-1;

      return outlen << 2;

  #undef EMIT
  }
  /*
   * utf8_to_32be: convert a UTF-8 byte string to UTF-32BE.
   *
   *   str - input bytes
   *   len - number of input bytes
   *   op  - output cursor; when NULL nothing is stored and the call only
   *         measures the result
   *
   * Each decoded code point is written as four bytes, most significant
   * byte first.
   *
   * Returns the output size in bytes (four per code point), or
   * (size_t)-1 if the input is rejected: a continuation byte where a
   * lead byte is required, a lead byte of 0xfe/0xff, a missing
   * continuation byte, an overlong encoding, a surrogate code point, or
   * a sequence cut short by the end of the input.
   *
   * Note that, unlike the UTF-16 converters, no upper bound is applied:
   * values above 0x10ffff coming from 4-, 5- and 6-byte sequences are
   * passed through.
   */
  static size_t utf8_to_32be(uint8_t *str, size_t len, char *op)
  {
  #define EMIT(x)                                 \
      do {                                        \
          uint32_t _y = (x);                      \
          if (op) {                               \
              WRITECHAR(op,_y >> 24);             \
              WRITECHAR(op,_y >> 16);             \
              WRITECHAR(op,_y >> 8);              \
              WRITECHAR(op,_y);                   \
          }                                       \
          outlen++;                               \
      } while (0)

      size_t outlen = 0;          /* code points produced so far */
      int todo = 0;               /* continuation bytes still owed */
      uint32_t value = 0;         /* code point being assembled */
      uint32_t value_min = 0;     /* smallest value legal for this length */
      uint8_t in;

      while (len-- != 0) {
          in = *str++;

          if (todo != 0) {
              /* Inside a multi-byte sequence: only 10xxxxxx is allowed */
              if ((in & 0xc0) != 0x80)
                  return (size_t)-1;

              value = (value << 6) | (in & 0x3f);
              todo--;
              if (todo != 0)
                  continue;

              /* Sequence complete: reject overlong forms and surrogates */
              if (value < value_min)
                  return (size_t)-1;
              if (value >= 0xd800 && value <= 0xdfff)
                  return (size_t)-1;

              EMIT(value);
              continue;
          }

          if (in < 0x80) {
              /* Plain ASCII */
              EMIT(in);
              continue;
          }

          /* Stray continuation byte, or 0xfe/0xff: invalid UTF-8 */
          if (in < 0xc0 || in >= 0xfe)
              return (size_t)-1;

          /* Lead byte: payload bits, bytes to follow, minimum value */
          if (in < 0xe0) {
              value = in & 0x1f;
              value_min = 0x80;
              todo = 1;
          } else if (in < 0xf0) {
              value = in & 0x0f;
              value_min = 0x800;
              todo = 2;
          } else if (in < 0xf8) {
              value = in & 0x07;
              value_min = 0x10000;
              todo = 3;
          } else if (in < 0xfc) {
              value = in & 0x03;
              value_min = 0x200000;
              todo = 4;
          } else {
              value = in & 0x01;
              value_min = 0x4000000;
              todo = 5;
          }
      }

      /* A sequence still open at end of input is an error */
      return todo != 0 ? (size_t)-1 : outlen << 2;

  #undef EMIT
  }
  298. typedef size_t (*transform_func)(uint8_t *, size_t, char *);
  299. /*
  300. * Apply a specific string transform and return it in a nasm_malloc'd
  301. * buffer, returning the length. On error, returns (size_t)-1 and no
  302. * buffer is allocated.
  303. */
  304. size_t string_transform(char *str, size_t len, char **out, enum strfunc func)
  305. {
  306. /* This should match enum strfunc in nasm.h */
  307. static const transform_func str_transforms[] = {
  308. utf8_to_16le,
  309. utf8_to_16le,
  310. utf8_to_16be,
  311. utf8_to_32le,
  312. utf8_to_32le,
  313. utf8_to_32be,
  314. };
  315. transform_func transform = str_transforms[func];
  316. size_t outlen;
  317. uint8_t *s = (uint8_t *)str;
  318. char *buf;
  319. outlen = transform(s, len, NULL);
  320. if (outlen == (size_t)-1)
  321. return -1;
  322. *out = buf = nasm_malloc(outlen+1);
  323. buf[outlen] = '\0'; /* Forcibly null-terminate the buffer */
  324. return transform(s, len, buf);
  325. }