123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615 |
- /**
- * Test the UTF-8 decoding routines
- *
- * author: Daniel Veillard
- * copy: see Copyright for the status of this software.
- */
- #include <stdio.h>
- #include <string.h>
- #include <libxml/parser.h>
- #include <libxml/parserInternals.h>
- int lastError;
- static void errorHandler(void *unused, xmlErrorPtr err) {
- if ((unused == NULL) && (err != NULL) && (lastError == 0)) {
- lastError = err->code;
- }
- }
/*
 * Template documents used by the in-context injection tests. The "XXXX"
 * run is a four byte placeholder: it starts at offset 5 in document1
 * (element content) and at offset 10 in document2 (attribute value).
 * The buffers are writable on purpose, bytes are patched in place.
 */
enum { DOCUMENT_BUFFER_SIZE = 100 };

char document1[DOCUMENT_BUFFER_SIZE] = "<doc>XXXX</doc>";
char document2[DOCUMENT_BUFFER_SIZE] = "<doc foo='XXXX'/>";
- static void testDocumentRangeByte1(xmlParserCtxtPtr ctxt, char *document,
- int len, char *data, int forbid1, int forbid2) {
- int i;
- xmlDocPtr res;
- for (i = 0;i <= 0xFF;i++) {
- lastError = 0;
- xmlCtxtReset(ctxt);
- data[0] = i;
- res = xmlReadMemory(document, len, "test", NULL, 0);
- if ((i == forbid1) || (i == forbid2)) {
- if ((lastError == 0) || (res != NULL))
- fprintf(stderr,
- "Failed to detect invalid char for Byte 0x%02X: %c\n",
- i, i);
- }
- else if ((i == '<') || (i == '&')) {
- if ((lastError == 0) || (res != NULL))
- fprintf(stderr,
- "Failed to detect illegal char %c for Byte 0x%02X\n", i, i);
- }
- else if (((i < 0x20) || (i >= 0x80)) &&
- (i != 0x9) && (i != 0xA) && (i != 0xD)) {
- if ((lastError != XML_ERR_INVALID_CHAR) && (res != NULL))
- fprintf(stderr,
- "Failed to detect invalid char for Byte 0x%02X\n", i);
- }
- else if (res == NULL) {
- fprintf(stderr,
- "Failed to parse valid char for Byte 0x%02X : %c\n", i, i);
- }
- if (res != NULL)
- xmlFreeDoc(res);
- }
- }
- static void testDocumentRangeByte2(xmlParserCtxtPtr ctxt, char *document,
- int len, char *data) {
- int i, j;
- xmlDocPtr res;
- for (i = 0x80;i <= 0xFF;i++) {
- for (j = 0;j <= 0xFF;j++) {
- lastError = 0;
- xmlCtxtReset(ctxt);
- data[0] = i;
- data[1] = j;
- res = xmlReadMemory(document, len, "test", NULL, 0);
- /* if first bit of first char is set, then second bit must too */
- if ((i & 0x80) && ((i & 0x40) == 0)) {
- if ((lastError == 0) || (res != NULL))
- fprintf(stderr,
- "Failed to detect invalid char for Bytes 0x%02X 0x%02X\n",
- i, j);
- }
- /*
- * if first bit of first char is set, then second char first
- * bits must be 10
- */
- else if ((i & 0x80) && ((j & 0xC0) != 0x80)) {
- if ((lastError == 0) || (res != NULL))
- fprintf(stderr,
- "Failed to detect invalid char for Bytes 0x%02X 0x%02X\n",
- i, j);
- }
- /*
- * if using a 2 byte encoding then the value must be greater
- * than 0x80, i.e. one of bits 5 to 1 of i must be set
- */
- else if ((i & 0x80) && ((i & 0x1E) == 0)) {
- if ((lastError == 0) || (res != NULL))
- fprintf(stderr,
- "Failed to detect invalid char for Bytes 0x%02X 0x%02X\n",
- i, j);
- }
- /*
- * if third bit of first char is set, then the sequence would need
- * at least 3 bytes, but we give only 2 !
- */
- else if ((i & 0xE0) == 0xE0) {
- if ((lastError == 0) || (res != NULL))
- fprintf(stderr,
- "Failed to detect invalid char for Bytes 0x%02X 0x%02X 0x00\n",
- i, j);
- }
- /*
- * We should see no error in remaning cases
- */
- else if ((lastError != 0) || (res == NULL)) {
- fprintf(stderr,
- "Failed to parse document for Bytes 0x%02X 0x%02X\n", i, j);
- }
- if (res != NULL)
- xmlFreeDoc(res);
- }
- }
- }
- /**
- * testDocumentRanges:
- *
- * Test the correct UTF8 character parsing in context of XML documents
- * Those are in-context injection tests checking the parser behaviour on
- * edge case values at different point in content, beginning and end of
- * CDATA in text or in attribute values.
- */
- static void testDocumentRanges(void) {
- xmlParserCtxtPtr ctxt;
- char *data;
- /*
- * Set up a parsing context using the first document as
- * the current input source.
- */
- ctxt = xmlNewParserCtxt();
- if (ctxt == NULL) {
- fprintf(stderr, "Failed to allocate parser context\n");
- return;
- }
- printf("testing 1 byte char in document: 1");
- fflush(stdout);
- data = &document1[5];
- data[0] = ' ';
- data[1] = ' ';
- data[2] = ' ';
- data[3] = ' ';
- /* test 1 byte injection at beginning of area */
- testDocumentRangeByte1(ctxt, &document1[0], strlen(document1),
- data, -1, -1);
- printf(" 2");
- fflush(stdout);
- data[0] = ' ';
- data[1] = ' ';
- data[2] = ' ';
- data[3] = ' ';
- /* test 1 byte injection at end of area */
- testDocumentRangeByte1(ctxt, &document1[0], strlen(document1),
- data + 3, -1, -1);
- printf(" 3");
- fflush(stdout);
- data = &document2[10];
- data[0] = ' ';
- data[1] = ' ';
- data[2] = ' ';
- data[3] = ' ';
- /* test 1 byte injection at beginning of area */
- testDocumentRangeByte1(ctxt, &document2[0], strlen(document2),
- data, '\'', -1);
- printf(" 4");
- fflush(stdout);
- data[0] = ' ';
- data[1] = ' ';
- data[2] = ' ';
- data[3] = ' ';
- /* test 1 byte injection at end of area */
- testDocumentRangeByte1(ctxt, &document2[0], strlen(document2),
- data + 3, '\'', -1);
- printf(" done\n");
- printf("testing 2 byte char in document: 1");
- fflush(stdout);
- data = &document1[5];
- data[0] = ' ';
- data[1] = ' ';
- data[2] = ' ';
- data[3] = ' ';
- /* test 2 byte injection at beginning of area */
- testDocumentRangeByte2(ctxt, &document1[0], strlen(document1),
- data);
- printf(" 2");
- fflush(stdout);
- data[0] = ' ';
- data[1] = ' ';
- data[2] = ' ';
- data[3] = ' ';
- /* test 2 byte injection at end of area */
- testDocumentRangeByte2(ctxt, &document1[0], strlen(document1),
- data + 2);
- printf(" 3");
- fflush(stdout);
- data = &document2[10];
- data[0] = ' ';
- data[1] = ' ';
- data[2] = ' ';
- data[3] = ' ';
- /* test 2 byte injection at beginning of area */
- testDocumentRangeByte2(ctxt, &document2[0], strlen(document2),
- data);
- printf(" 4");
- fflush(stdout);
- data[0] = ' ';
- data[1] = ' ';
- data[2] = ' ';
- data[3] = ' ';
- /* test 2 byte injection at end of area */
- testDocumentRangeByte2(ctxt, &document2[0], strlen(document2),
- data + 2);
- printf(" done\n");
- xmlFreeParserCtxt(ctxt);
- }
- static void testCharRangeByte1(xmlParserCtxtPtr ctxt, char *data) {
- int i = 0;
- int len, c;
- data[1] = 0;
- data[2] = 0;
- data[3] = 0;
- for (i = 0;i <= 0xFF;i++) {
- data[0] = i;
- ctxt->charset = XML_CHAR_ENCODING_UTF8;
- lastError = 0;
- c = xmlCurrentChar(ctxt, &len);
- if ((i == 0) || (i >= 0x80)) {
- /* we must see an error there */
- if (lastError != XML_ERR_INVALID_CHAR)
- fprintf(stderr,
- "Failed to detect invalid char for Byte 0x%02X\n", i);
- } else if (i == 0xD) {
- if ((c != 0xA) || (len != 1))
- fprintf(stderr, "Failed to convert char for Byte 0x%02X\n", i);
- } else if ((c != i) || (len != 1)) {
- fprintf(stderr, "Failed to parse char for Byte 0x%02X\n", i);
- }
- }
- }
- static void testCharRangeByte2(xmlParserCtxtPtr ctxt, char *data) {
- int i, j;
- int len, c;
- data[2] = 0;
- data[3] = 0;
- for (i = 0x80;i <= 0xFF;i++) {
- for (j = 0;j <= 0xFF;j++) {
- data[0] = i;
- data[1] = j;
- ctxt->charset = XML_CHAR_ENCODING_UTF8;
- lastError = 0;
- c = xmlCurrentChar(ctxt, &len);
- /* if first bit of first char is set, then second bit must too */
- if ((i & 0x80) && ((i & 0x40) == 0)) {
- if (lastError != XML_ERR_INVALID_CHAR)
- fprintf(stderr,
- "Failed to detect invalid char for Bytes 0x%02X 0x%02X\n",
- i, j);
- }
- /*
- * if first bit of first char is set, then second char first
- * bits must be 10
- */
- else if ((i & 0x80) && ((j & 0xC0) != 0x80)) {
- if (lastError != XML_ERR_INVALID_CHAR)
- fprintf(stderr,
- "Failed to detect invalid char for Bytes 0x%02X 0x%02X: %d\n",
- i, j, c);
- }
- /*
- * if using a 2 byte encoding then the value must be greater
- * than 0x80, i.e. one of bits 5 to 1 of i must be set
- */
- else if ((i & 0x80) && ((i & 0x1E) == 0)) {
- if (lastError != XML_ERR_INVALID_CHAR)
- fprintf(stderr,
- "Failed to detect invalid char for Bytes 0x%02X 0x%02X: %d\n",
- i, j, c);
- }
- /*
- * if third bit of first char is set, then the sequence would need
- * at least 3 bytes, but we give only 2 !
- */
- else if ((i & 0xE0) == 0xE0) {
- if (lastError != XML_ERR_INVALID_CHAR)
- fprintf(stderr,
- "Failed to detect invalid char for Bytes 0x%02X 0x%02X 0x00\n",
- i, j);
- }
- /*
- * We should see no error in remaning cases
- */
- else if ((lastError != 0) || (len != 2)) {
- fprintf(stderr,
- "Failed to parse char for Bytes 0x%02X 0x%02X\n", i, j);
- }
- /*
- * Finally check the value is right
- */
- else if (c != (j & 0x3F) + ((i & 0x1F) << 6)) {
- fprintf(stderr,
- "Failed to parse char for Bytes 0x%02X 0x%02X: expect %d got %d\n",
- i, j, ((j & 0x3F) + ((i & 0x1F) << 6)), c);
- }
- }
- }
- }
- static void testCharRangeByte3(xmlParserCtxtPtr ctxt, char *data) {
- int i, j, k, K;
- int len, c;
- unsigned char lows[6] = {0, 0x80, 0x81, 0xC1, 0xFF, 0xBF};
- int value;
- data[3] = 0;
- for (i = 0xE0;i <= 0xFF;i++) {
- for (j = 0;j <= 0xFF;j++) {
- for (k = 0;k < 6;k++) {
- data[0] = i;
- data[1] = j;
- K = lows[k];
- data[2] = (char) K;
- value = (K & 0x3F) + ((j & 0x3F) << 6) + ((i & 0xF) << 12);
- ctxt->charset = XML_CHAR_ENCODING_UTF8;
- lastError = 0;
- c = xmlCurrentChar(ctxt, &len);
- /*
- * if fourth bit of first char is set, then the sequence would need
- * at least 4 bytes, but we give only 3 !
- */
- if ((i & 0xF0) == 0xF0) {
- if (lastError != XML_ERR_INVALID_CHAR)
- fprintf(stderr,
- "Failed to detect invalid char for Bytes 0x%02X 0x%02X 0x%02X 0x%02X\n",
- i, j, K, data[3]);
- }
- /*
- * The second and the third bytes must start with 10
- */
- else if (((j & 0xC0) != 0x80) || ((K & 0xC0) != 0x80)) {
- if (lastError != XML_ERR_INVALID_CHAR)
- fprintf(stderr,
- "Failed to detect invalid char for Bytes 0x%02X 0x%02X 0x%02X\n",
- i, j, K);
- }
- /*
- * if using a 3 byte encoding then the value must be greater
- * than 0x800, i.e. one of bits 4 to 0 of i must be set or
- * the 6th byte of data[1] must be set
- */
- else if (((i & 0xF) == 0) && ((j & 0x20) == 0)) {
- if (lastError != XML_ERR_INVALID_CHAR)
- fprintf(stderr,
- "Failed to detect invalid char for Bytes 0x%02X 0x%02X 0x%02X\n",
- i, j, K);
- }
- /*
- * There are values in that range that are not allowed in XML-1.0
- */
- else if (((value > 0xD7FF) && (value <0xE000)) ||
- ((value > 0xFFFD) && (value <0x10000))) {
- if (lastError != XML_ERR_INVALID_CHAR)
- fprintf(stderr,
- "Failed to detect invalid char 0x%04X for Bytes 0x%02X 0x%02X 0x%02X\n",
- value, i, j, K);
- }
- /*
- * We should see no error in remaining cases
- */
- else if ((lastError != 0) || (len != 3)) {
- fprintf(stderr,
- "Failed to parse char for Bytes 0x%02X 0x%02X 0x%02X\n",
- i, j, K);
- }
- /*
- * Finally check the value is right
- */
- else if (c != value) {
- fprintf(stderr,
- "Failed to parse char for Bytes 0x%02X 0x%02X 0x%02X: expect %d got %d\n",
- i, j, data[2], value, c);
- }
- }
- }
- }
- }
- static void testCharRangeByte4(xmlParserCtxtPtr ctxt, char *data) {
- int i, j, k, K, l, L;
- int len, c;
- unsigned char lows[6] = {0, 0x80, 0x81, 0xC1, 0xFF, 0xBF};
- int value;
- data[4] = 0;
- for (i = 0xF0;i <= 0xFF;i++) {
- for (j = 0;j <= 0xFF;j++) {
- for (k = 0;k < 6;k++) {
- for (l = 0;l < 6;l++) {
- data[0] = i;
- data[1] = j;
- K = lows[k];
- data[2] = (char) K;
- L = lows[l];
- data[3] = (char) L;
- value = (L & 0x3F) + ((K & 0x3F) << 6) + ((j & 0x3F) << 12) +
- ((i & 0x7) << 18);
- ctxt->charset = XML_CHAR_ENCODING_UTF8;
- lastError = 0;
- c = xmlCurrentChar(ctxt, &len);
- /*
- * if fifth bit of first char is set, then the sequence would need
- * at least 5 bytes, but we give only 4 !
- */
- if ((i & 0xF8) == 0xF8) {
- if (lastError != XML_ERR_INVALID_CHAR)
- fprintf(stderr,
- "Failed to detect invalid char for Bytes 0x%02X 0x%02X 0x%02X 0x%02X\n",
- i, j, K, data[3]);
- }
- /*
- * The second, third and fourth bytes must start with 10
- */
- else if (((j & 0xC0) != 0x80) || ((K & 0xC0) != 0x80) ||
- ((L & 0xC0) != 0x80)) {
- if (lastError != XML_ERR_INVALID_CHAR)
- fprintf(stderr,
- "Failed to detect invalid char for Bytes 0x%02X 0x%02X 0x%02X 0x%02X\n",
- i, j, K, L);
- }
- /*
- * if using a 3 byte encoding then the value must be greater
- * than 0x10000, i.e. one of bits 3 to 0 of i must be set or
- * the 6 or 5th byte of j must be set
- */
- else if (((i & 0x7) == 0) && ((j & 0x30) == 0)) {
- if (lastError != XML_ERR_INVALID_CHAR)
- fprintf(stderr,
- "Failed to detect invalid char for Bytes 0x%02X 0x%02X 0x%02X 0x%02X\n",
- i, j, K, L);
- }
- /*
- * There are values in that range that are not allowed in XML-1.0
- */
- else if (((value > 0xD7FF) && (value <0xE000)) ||
- ((value > 0xFFFD) && (value <0x10000)) ||
- (value > 0x10FFFF)) {
- if (lastError != XML_ERR_INVALID_CHAR)
- fprintf(stderr,
- "Failed to detect invalid char 0x%04X for Bytes 0x%02X 0x%02X 0x%02X 0x%02X\n",
- value, i, j, K, L);
- }
- /*
- * We should see no error in remaining cases
- */
- else if ((lastError != 0) || (len != 4)) {
- fprintf(stderr,
- "Failed to parse char for Bytes 0x%02X 0x%02X 0x%02X\n",
- i, j, K);
- }
- /*
- * Finally check the value is right
- */
- else if (c != value) {
- fprintf(stderr,
- "Failed to parse char for Bytes 0x%02X 0x%02X 0x%02X: expect %d got %d\n",
- i, j, data[2], value, c);
- }
- }
- }
- }
- }
- }
- /**
- * testCharRanges:
- *
- * Test the correct UTF8 character parsing in isolation i.e.
- * not when parsing a full document, this is less expensive and we can
- * cover the full range of UTF-8 chars accepted by XML-1.0
- */
- static void testCharRanges(void) {
- char data[5];
- xmlParserCtxtPtr ctxt;
- xmlParserInputBufferPtr buf;
- xmlParserInputPtr input;
- memset(data, 0, 5);
- /*
- * Set up a parsing context using the above data buffer as
- * the current input source.
- */
- ctxt = xmlNewParserCtxt();
- if (ctxt == NULL) {
- fprintf(stderr, "Failed to allocate parser context\n");
- return;
- }
- buf = xmlParserInputBufferCreateStatic(data, sizeof(data),
- XML_CHAR_ENCODING_NONE);
- if (buf == NULL) {
- fprintf(stderr, "Failed to allocate input buffer\n");
- goto error;
- }
- input = xmlNewInputStream(ctxt);
- if (input == NULL) {
- xmlFreeParserInputBuffer(buf);
- goto error;
- }
- input->filename = NULL;
- input->buf = buf;
- input->base = input->buf->buffer->content;
- input->cur = input->buf->buffer->content;
- input->end = &input->buf->buffer->content[4];
- inputPush(ctxt, input);
- printf("testing char range: 1");
- fflush(stdout);
- testCharRangeByte1(ctxt, data);
- printf(" 2");
- fflush(stdout);
- testCharRangeByte2(ctxt, data);
- printf(" 3");
- fflush(stdout);
- testCharRangeByte3(ctxt, data);
- printf(" 4");
- fflush(stdout);
- testCharRangeByte4(ctxt, data);
- printf(" done\n");
- fflush(stdout);
- error:
- xmlFreeParserCtxt(ctxt);
- }
- int main(void) {
- /*
- * this initialize the library and check potential ABI mismatches
- * between the version it was compiled for and the actual shared
- * library used.
- */
- LIBXML_TEST_VERSION
- /*
- * Catch errors separately
- */
- xmlSetStructuredErrorFunc(NULL, errorHandler);
- /*
- * Run the tests
- */
- testCharRanges();
- testDocumentRanges();
- /*
- * Cleanup function for the XML library.
- */
- xmlCleanupParser();
- /*
- * this is to debug memory for regression tests
- */
- xmlMemoryDump();
- return(0);
- }
|