testchar.c 16 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615
  1. /**
  2. * Test the UTF-8 decoding routines
  3. *
  4. * author: Daniel Veillard
  5. * copy: see Copyright for the status of this software.
  6. */
  7. #include <stdio.h>
  8. #include <string.h>
  9. #include <libxml/parser.h>
  10. #include <libxml/parserInternals.h>
  /*
   * Code of the first error reported to errorHandler() since this variable
   * was last reset to 0. The tests clear it before each decoding or parsing
   * attempt and read it back afterwards.
   */
  11. int lastError;
  12. static void errorHandler(void *unused, xmlErrorPtr err) {
  13. if ((unused == NULL) && (err != NULL) && (lastError == 0)) {
  14. lastError = err->code;
  15. }
  16. }
  /*
   * Document templates used by testDocumentRanges(): the XXXX placeholder
   * is overwritten in place with the bytes under test, inside element
   * content (document1) and inside an attribute value (document2).
   */
  17. char document1[100] = "<doc>XXXX</doc>";
  18. char document2[100] = "<doc foo='XXXX'/>";
  19. static void testDocumentRangeByte1(xmlParserCtxtPtr ctxt, char *document,
  20. int len, char *data, int forbid1, int forbid2) {
  21. int i;
  22. xmlDocPtr res;
  23. for (i = 0;i <= 0xFF;i++) {
  24. lastError = 0;
  25. xmlCtxtReset(ctxt);
  26. data[0] = i;
  27. res = xmlReadMemory(document, len, "test", NULL, 0);
  28. if ((i == forbid1) || (i == forbid2)) {
  29. if ((lastError == 0) || (res != NULL))
  30. fprintf(stderr,
  31. "Failed to detect invalid char for Byte 0x%02X: %c\n",
  32. i, i);
  33. }
  34. else if ((i == '<') || (i == '&')) {
  35. if ((lastError == 0) || (res != NULL))
  36. fprintf(stderr,
  37. "Failed to detect illegal char %c for Byte 0x%02X\n", i, i);
  38. }
  39. else if (((i < 0x20) || (i >= 0x80)) &&
  40. (i != 0x9) && (i != 0xA) && (i != 0xD)) {
  41. if ((lastError != XML_ERR_INVALID_CHAR) && (res != NULL))
  42. fprintf(stderr,
  43. "Failed to detect invalid char for Byte 0x%02X\n", i);
  44. }
  45. else if (res == NULL) {
  46. fprintf(stderr,
  47. "Failed to parse valid char for Byte 0x%02X : %c\n", i, i);
  48. }
  49. if (res != NULL)
  50. xmlFreeDoc(res);
  51. }
  52. }
  53. static void testDocumentRangeByte2(xmlParserCtxtPtr ctxt, char *document,
  54. int len, char *data) {
  55. int i, j;
  56. xmlDocPtr res;
  57. for (i = 0x80;i <= 0xFF;i++) {
  58. for (j = 0;j <= 0xFF;j++) {
  59. lastError = 0;
  60. xmlCtxtReset(ctxt);
  61. data[0] = i;
  62. data[1] = j;
  63. res = xmlReadMemory(document, len, "test", NULL, 0);
  64. /* if first bit of first char is set, then second bit must too */
  65. if ((i & 0x80) && ((i & 0x40) == 0)) {
  66. if ((lastError == 0) || (res != NULL))
  67. fprintf(stderr,
  68. "Failed to detect invalid char for Bytes 0x%02X 0x%02X\n",
  69. i, j);
  70. }
  71. /*
  72. * if first bit of first char is set, then second char first
  73. * bits must be 10
  74. */
  75. else if ((i & 0x80) && ((j & 0xC0) != 0x80)) {
  76. if ((lastError == 0) || (res != NULL))
  77. fprintf(stderr,
  78. "Failed to detect invalid char for Bytes 0x%02X 0x%02X\n",
  79. i, j);
  80. }
  81. /*
  82. * if using a 2 byte encoding then the value must be greater
  83. * than 0x80, i.e. one of bits 5 to 1 of i must be set
  84. */
  85. else if ((i & 0x80) && ((i & 0x1E) == 0)) {
  86. if ((lastError == 0) || (res != NULL))
  87. fprintf(stderr,
  88. "Failed to detect invalid char for Bytes 0x%02X 0x%02X\n",
  89. i, j);
  90. }
  91. /*
  92. * if third bit of first char is set, then the sequence would need
  93. * at least 3 bytes, but we give only 2 !
  94. */
  95. else if ((i & 0xE0) == 0xE0) {
  96. if ((lastError == 0) || (res != NULL))
  97. fprintf(stderr,
  98. "Failed to detect invalid char for Bytes 0x%02X 0x%02X 0x00\n",
  99. i, j);
  100. }
  101. /*
  102. * We should see no error in remaning cases
  103. */
  104. else if ((lastError != 0) || (res == NULL)) {
  105. fprintf(stderr,
  106. "Failed to parse document for Bytes 0x%02X 0x%02X\n", i, j);
  107. }
  108. if (res != NULL)
  109. xmlFreeDoc(res);
  110. }
  111. }
  112. }
  113. /**
  114. * testDocumentRanges:
  115. *
  116. * Test the correct UTF8 character parsing in context of XML documents
  117. * Those are in-context injection tests checking the parser behaviour on
  118. * edge case values at different point in content, beginning and end of
  119. * CDATA in text or in attribute values.
  120. */
  121. static void testDocumentRanges(void) {
  122. xmlParserCtxtPtr ctxt;
  123. char *data;
  124. /*
  125. * Set up a parsing context using the first document as
  126. * the current input source.
  127. */
  128. ctxt = xmlNewParserCtxt();
  129. if (ctxt == NULL) {
  130. fprintf(stderr, "Failed to allocate parser context\n");
  131. return;
  132. }
  133. printf("testing 1 byte char in document: 1");
  134. fflush(stdout);
  135. data = &document1[5];
  136. data[0] = ' ';
  137. data[1] = ' ';
  138. data[2] = ' ';
  139. data[3] = ' ';
  140. /* test 1 byte injection at beginning of area */
  141. testDocumentRangeByte1(ctxt, &document1[0], strlen(document1),
  142. data, -1, -1);
  143. printf(" 2");
  144. fflush(stdout);
  145. data[0] = ' ';
  146. data[1] = ' ';
  147. data[2] = ' ';
  148. data[3] = ' ';
  149. /* test 1 byte injection at end of area */
  150. testDocumentRangeByte1(ctxt, &document1[0], strlen(document1),
  151. data + 3, -1, -1);
  152. printf(" 3");
  153. fflush(stdout);
  154. data = &document2[10];
  155. data[0] = ' ';
  156. data[1] = ' ';
  157. data[2] = ' ';
  158. data[3] = ' ';
  159. /* test 1 byte injection at beginning of area */
  160. testDocumentRangeByte1(ctxt, &document2[0], strlen(document2),
  161. data, '\'', -1);
  162. printf(" 4");
  163. fflush(stdout);
  164. data[0] = ' ';
  165. data[1] = ' ';
  166. data[2] = ' ';
  167. data[3] = ' ';
  168. /* test 1 byte injection at end of area */
  169. testDocumentRangeByte1(ctxt, &document2[0], strlen(document2),
  170. data + 3, '\'', -1);
  171. printf(" done\n");
  172. printf("testing 2 byte char in document: 1");
  173. fflush(stdout);
  174. data = &document1[5];
  175. data[0] = ' ';
  176. data[1] = ' ';
  177. data[2] = ' ';
  178. data[3] = ' ';
  179. /* test 2 byte injection at beginning of area */
  180. testDocumentRangeByte2(ctxt, &document1[0], strlen(document1),
  181. data);
  182. printf(" 2");
  183. fflush(stdout);
  184. data[0] = ' ';
  185. data[1] = ' ';
  186. data[2] = ' ';
  187. data[3] = ' ';
  188. /* test 2 byte injection at end of area */
  189. testDocumentRangeByte2(ctxt, &document1[0], strlen(document1),
  190. data + 2);
  191. printf(" 3");
  192. fflush(stdout);
  193. data = &document2[10];
  194. data[0] = ' ';
  195. data[1] = ' ';
  196. data[2] = ' ';
  197. data[3] = ' ';
  198. /* test 2 byte injection at beginning of area */
  199. testDocumentRangeByte2(ctxt, &document2[0], strlen(document2),
  200. data);
  201. printf(" 4");
  202. fflush(stdout);
  203. data[0] = ' ';
  204. data[1] = ' ';
  205. data[2] = ' ';
  206. data[3] = ' ';
  207. /* test 2 byte injection at end of area */
  208. testDocumentRangeByte2(ctxt, &document2[0], strlen(document2),
  209. data + 2);
  210. printf(" done\n");
  211. xmlFreeParserCtxt(ctxt);
  212. }
  213. static void testCharRangeByte1(xmlParserCtxtPtr ctxt, char *data) {
  214. int i = 0;
  215. int len, c;
  216. data[1] = 0;
  217. data[2] = 0;
  218. data[3] = 0;
  219. for (i = 0;i <= 0xFF;i++) {
  220. data[0] = i;
  221. ctxt->charset = XML_CHAR_ENCODING_UTF8;
  222. lastError = 0;
  223. c = xmlCurrentChar(ctxt, &len);
  224. if ((i == 0) || (i >= 0x80)) {
  225. /* we must see an error there */
  226. if (lastError != XML_ERR_INVALID_CHAR)
  227. fprintf(stderr,
  228. "Failed to detect invalid char for Byte 0x%02X\n", i);
  229. } else if (i == 0xD) {
  230. if ((c != 0xA) || (len != 1))
  231. fprintf(stderr, "Failed to convert char for Byte 0x%02X\n", i);
  232. } else if ((c != i) || (len != 1)) {
  233. fprintf(stderr, "Failed to parse char for Byte 0x%02X\n", i);
  234. }
  235. }
  236. }
  237. static void testCharRangeByte2(xmlParserCtxtPtr ctxt, char *data) {
  238. int i, j;
  239. int len, c;
  240. data[2] = 0;
  241. data[3] = 0;
  242. for (i = 0x80;i <= 0xFF;i++) {
  243. for (j = 0;j <= 0xFF;j++) {
  244. data[0] = i;
  245. data[1] = j;
  246. ctxt->charset = XML_CHAR_ENCODING_UTF8;
  247. lastError = 0;
  248. c = xmlCurrentChar(ctxt, &len);
  249. /* if first bit of first char is set, then second bit must too */
  250. if ((i & 0x80) && ((i & 0x40) == 0)) {
  251. if (lastError != XML_ERR_INVALID_CHAR)
  252. fprintf(stderr,
  253. "Failed to detect invalid char for Bytes 0x%02X 0x%02X\n",
  254. i, j);
  255. }
  256. /*
  257. * if first bit of first char is set, then second char first
  258. * bits must be 10
  259. */
  260. else if ((i & 0x80) && ((j & 0xC0) != 0x80)) {
  261. if (lastError != XML_ERR_INVALID_CHAR)
  262. fprintf(stderr,
  263. "Failed to detect invalid char for Bytes 0x%02X 0x%02X: %d\n",
  264. i, j, c);
  265. }
  266. /*
  267. * if using a 2 byte encoding then the value must be greater
  268. * than 0x80, i.e. one of bits 5 to 1 of i must be set
  269. */
  270. else if ((i & 0x80) && ((i & 0x1E) == 0)) {
  271. if (lastError != XML_ERR_INVALID_CHAR)
  272. fprintf(stderr,
  273. "Failed to detect invalid char for Bytes 0x%02X 0x%02X: %d\n",
  274. i, j, c);
  275. }
  276. /*
  277. * if third bit of first char is set, then the sequence would need
  278. * at least 3 bytes, but we give only 2 !
  279. */
  280. else if ((i & 0xE0) == 0xE0) {
  281. if (lastError != XML_ERR_INVALID_CHAR)
  282. fprintf(stderr,
  283. "Failed to detect invalid char for Bytes 0x%02X 0x%02X 0x00\n",
  284. i, j);
  285. }
  286. /*
  287. * We should see no error in remaning cases
  288. */
  289. else if ((lastError != 0) || (len != 2)) {
  290. fprintf(stderr,
  291. "Failed to parse char for Bytes 0x%02X 0x%02X\n", i, j);
  292. }
  293. /*
  294. * Finally check the value is right
  295. */
  296. else if (c != (j & 0x3F) + ((i & 0x1F) << 6)) {
  297. fprintf(stderr,
  298. "Failed to parse char for Bytes 0x%02X 0x%02X: expect %d got %d\n",
  299. i, j, ((j & 0x3F) + ((i & 0x1F) << 6)), c);
  300. }
  301. }
  302. }
  303. }
  304. static void testCharRangeByte3(xmlParserCtxtPtr ctxt, char *data) {
  305. int i, j, k, K;
  306. int len, c;
  307. unsigned char lows[6] = {0, 0x80, 0x81, 0xC1, 0xFF, 0xBF};
  308. int value;
  309. data[3] = 0;
  310. for (i = 0xE0;i <= 0xFF;i++) {
  311. for (j = 0;j <= 0xFF;j++) {
  312. for (k = 0;k < 6;k++) {
  313. data[0] = i;
  314. data[1] = j;
  315. K = lows[k];
  316. data[2] = (char) K;
  317. value = (K & 0x3F) + ((j & 0x3F) << 6) + ((i & 0xF) << 12);
  318. ctxt->charset = XML_CHAR_ENCODING_UTF8;
  319. lastError = 0;
  320. c = xmlCurrentChar(ctxt, &len);
  321. /*
  322. * if fourth bit of first char is set, then the sequence would need
  323. * at least 4 bytes, but we give only 3 !
  324. */
  325. if ((i & 0xF0) == 0xF0) {
  326. if (lastError != XML_ERR_INVALID_CHAR)
  327. fprintf(stderr,
  328. "Failed to detect invalid char for Bytes 0x%02X 0x%02X 0x%02X 0x%02X\n",
  329. i, j, K, data[3]);
  330. }
  331. /*
  332. * The second and the third bytes must start with 10
  333. */
  334. else if (((j & 0xC0) != 0x80) || ((K & 0xC0) != 0x80)) {
  335. if (lastError != XML_ERR_INVALID_CHAR)
  336. fprintf(stderr,
  337. "Failed to detect invalid char for Bytes 0x%02X 0x%02X 0x%02X\n",
  338. i, j, K);
  339. }
  340. /*
  341. * if using a 3 byte encoding then the value must be greater
  342. * than 0x800, i.e. one of bits 4 to 0 of i must be set or
  343. * the 6th byte of data[1] must be set
  344. */
  345. else if (((i & 0xF) == 0) && ((j & 0x20) == 0)) {
  346. if (lastError != XML_ERR_INVALID_CHAR)
  347. fprintf(stderr,
  348. "Failed to detect invalid char for Bytes 0x%02X 0x%02X 0x%02X\n",
  349. i, j, K);
  350. }
  351. /*
  352. * There are values in that range that are not allowed in XML-1.0
  353. */
  354. else if (((value > 0xD7FF) && (value <0xE000)) ||
  355. ((value > 0xFFFD) && (value <0x10000))) {
  356. if (lastError != XML_ERR_INVALID_CHAR)
  357. fprintf(stderr,
  358. "Failed to detect invalid char 0x%04X for Bytes 0x%02X 0x%02X 0x%02X\n",
  359. value, i, j, K);
  360. }
  361. /*
  362. * We should see no error in remaining cases
  363. */
  364. else if ((lastError != 0) || (len != 3)) {
  365. fprintf(stderr,
  366. "Failed to parse char for Bytes 0x%02X 0x%02X 0x%02X\n",
  367. i, j, K);
  368. }
  369. /*
  370. * Finally check the value is right
  371. */
  372. else if (c != value) {
  373. fprintf(stderr,
  374. "Failed to parse char for Bytes 0x%02X 0x%02X 0x%02X: expect %d got %d\n",
  375. i, j, data[2], value, c);
  376. }
  377. }
  378. }
  379. }
  380. }
  381. static void testCharRangeByte4(xmlParserCtxtPtr ctxt, char *data) {
  382. int i, j, k, K, l, L;
  383. int len, c;
  384. unsigned char lows[6] = {0, 0x80, 0x81, 0xC1, 0xFF, 0xBF};
  385. int value;
  386. data[4] = 0;
  387. for (i = 0xF0;i <= 0xFF;i++) {
  388. for (j = 0;j <= 0xFF;j++) {
  389. for (k = 0;k < 6;k++) {
  390. for (l = 0;l < 6;l++) {
  391. data[0] = i;
  392. data[1] = j;
  393. K = lows[k];
  394. data[2] = (char) K;
  395. L = lows[l];
  396. data[3] = (char) L;
  397. value = (L & 0x3F) + ((K & 0x3F) << 6) + ((j & 0x3F) << 12) +
  398. ((i & 0x7) << 18);
  399. ctxt->charset = XML_CHAR_ENCODING_UTF8;
  400. lastError = 0;
  401. c = xmlCurrentChar(ctxt, &len);
  402. /*
  403. * if fifth bit of first char is set, then the sequence would need
  404. * at least 5 bytes, but we give only 4 !
  405. */
  406. if ((i & 0xF8) == 0xF8) {
  407. if (lastError != XML_ERR_INVALID_CHAR)
  408. fprintf(stderr,
  409. "Failed to detect invalid char for Bytes 0x%02X 0x%02X 0x%02X 0x%02X\n",
  410. i, j, K, data[3]);
  411. }
  412. /*
  413. * The second, third and fourth bytes must start with 10
  414. */
  415. else if (((j & 0xC0) != 0x80) || ((K & 0xC0) != 0x80) ||
  416. ((L & 0xC0) != 0x80)) {
  417. if (lastError != XML_ERR_INVALID_CHAR)
  418. fprintf(stderr,
  419. "Failed to detect invalid char for Bytes 0x%02X 0x%02X 0x%02X 0x%02X\n",
  420. i, j, K, L);
  421. }
  422. /*
  423. * if using a 3 byte encoding then the value must be greater
  424. * than 0x10000, i.e. one of bits 3 to 0 of i must be set or
  425. * the 6 or 5th byte of j must be set
  426. */
  427. else if (((i & 0x7) == 0) && ((j & 0x30) == 0)) {
  428. if (lastError != XML_ERR_INVALID_CHAR)
  429. fprintf(stderr,
  430. "Failed to detect invalid char for Bytes 0x%02X 0x%02X 0x%02X 0x%02X\n",
  431. i, j, K, L);
  432. }
  433. /*
  434. * There are values in that range that are not allowed in XML-1.0
  435. */
  436. else if (((value > 0xD7FF) && (value <0xE000)) ||
  437. ((value > 0xFFFD) && (value <0x10000)) ||
  438. (value > 0x10FFFF)) {
  439. if (lastError != XML_ERR_INVALID_CHAR)
  440. fprintf(stderr,
  441. "Failed to detect invalid char 0x%04X for Bytes 0x%02X 0x%02X 0x%02X 0x%02X\n",
  442. value, i, j, K, L);
  443. }
  444. /*
  445. * We should see no error in remaining cases
  446. */
  447. else if ((lastError != 0) || (len != 4)) {
  448. fprintf(stderr,
  449. "Failed to parse char for Bytes 0x%02X 0x%02X 0x%02X\n",
  450. i, j, K);
  451. }
  452. /*
  453. * Finally check the value is right
  454. */
  455. else if (c != value) {
  456. fprintf(stderr,
  457. "Failed to parse char for Bytes 0x%02X 0x%02X 0x%02X: expect %d got %d\n",
  458. i, j, data[2], value, c);
  459. }
  460. }
  461. }
  462. }
  463. }
  464. }
  465. /**
  466. * testCharRanges:
  467. *
  468. * Test the correct UTF8 character parsing in isolation i.e.
  469. * not when parsing a full document, this is less expensive and we can
  470. * cover the full range of UTF-8 chars accepted by XML-1.0
  471. */
  472. static void testCharRanges(void) {
  473. char data[5];
  474. xmlParserCtxtPtr ctxt;
  475. xmlParserInputBufferPtr buf;
  476. xmlParserInputPtr input;
  477. memset(data, 0, 5);
  478. /*
  479. * Set up a parsing context using the above data buffer as
  480. * the current input source.
  481. */
  482. ctxt = xmlNewParserCtxt();
  483. if (ctxt == NULL) {
  484. fprintf(stderr, "Failed to allocate parser context\n");
  485. return;
  486. }
  487. buf = xmlParserInputBufferCreateStatic(data, sizeof(data),
  488. XML_CHAR_ENCODING_NONE);
  489. if (buf == NULL) {
  490. fprintf(stderr, "Failed to allocate input buffer\n");
  491. goto error;
  492. }
  493. input = xmlNewInputStream(ctxt);
  494. if (input == NULL) {
  495. xmlFreeParserInputBuffer(buf);
  496. goto error;
  497. }
  498. input->filename = NULL;
  499. input->buf = buf;
  500. input->base = input->buf->buffer->content;
  501. input->cur = input->buf->buffer->content;
  502. input->end = &input->buf->buffer->content[4];
  503. inputPush(ctxt, input);
  504. printf("testing char range: 1");
  505. fflush(stdout);
  506. testCharRangeByte1(ctxt, data);
  507. printf(" 2");
  508. fflush(stdout);
  509. testCharRangeByte2(ctxt, data);
  510. printf(" 3");
  511. fflush(stdout);
  512. testCharRangeByte3(ctxt, data);
  513. printf(" 4");
  514. fflush(stdout);
  515. testCharRangeByte4(ctxt, data);
  516. printf(" done\n");
  517. fflush(stdout);
  518. error:
  519. xmlFreeParserCtxt(ctxt);
  520. }
  521. int main(void) {
  522. /*
  523. * this initialize the library and check potential ABI mismatches
  524. * between the version it was compiled for and the actual shared
  525. * library used.
  526. */
  527. LIBXML_TEST_VERSION
  528. /*
  529. * Catch errors separately
  530. */
  531. xmlSetStructuredErrorFunc(NULL, errorHandler);
  532. /*
  533. * Run the tests
  534. */
  535. testCharRanges();
  536. testDocumentRanges();
  537. /*
  538. * Cleanup function for the XML library.
  539. */
  540. xmlCleanupParser();
  541. /*
  542. * this is to debug memory for regression tests
  543. */
  544. xmlMemoryDump();
  545. return(0);
  546. }