testHTML.c 21 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880
  1. /*
  2. * testHTML.c : a small tester program for HTML input.
  3. *
  4. * See Copyright for the status of this software.
  5. *
  6. * daniel@veillard.com
  7. */
  8. #include "libxml.h"
  9. #ifdef LIBXML_HTML_ENABLED
  10. #include <string.h>
  11. #include <stdarg.h>
  12. #ifdef HAVE_SYS_TYPES_H
  13. #include <sys/types.h>
  14. #endif
  15. #ifdef HAVE_SYS_STAT_H
  16. #include <sys/stat.h>
  17. #endif
  18. #ifdef HAVE_FCNTL_H
  19. #include <fcntl.h>
  20. #endif
  21. #ifdef HAVE_UNISTD_H
  22. #include <unistd.h>
  23. #endif
  24. #ifdef HAVE_STDLIB_H
  25. #include <stdlib.h>
  26. #endif
  27. #include <libxml/xmlmemory.h>
  28. #include <libxml/HTMLparser.h>
  29. #include <libxml/HTMLtree.h>
  30. #include <libxml/debugXML.h>
  31. #include <libxml/xmlerror.h>
  32. #include <libxml/globals.h>
  /*
   * Command line state.  Every int below is a counter that main()
   * increments each time the matching switch appears on the command line.
   */
  #ifdef LIBXML_DEBUG_ENABLED
  static int debug = 0;       /* --debug: dump with xmlDebugDumpDocument() */
  #endif
  static int copy = 0;        /* --copy: round-trip the tree through xmlCopyDoc() */
  static int sax = 0;         /* --sax: run parseSAXFile() instead of building a tree */
  static int repeat = 0;      /* --repeat: each file is processed 100 * repeat times */
  static int noout = 0;       /* --noout: suppress the dump / the SAX trace pass */
  #ifdef LIBXML_PUSH_ENABLED
  static int push = 0;        /* --push: feed the file through the push parser */
  #endif /* LIBXML_PUSH_ENABLED */
  static char *encoding = NULL;   /* --encode argument, handed to htmlSaveFileEnc() */
  static int options = 0;         /* parser options passed to htmlReadFile() */
  45. static xmlSAXHandler emptySAXHandlerStruct = {
  46. NULL, /* internalSubset */
  47. NULL, /* isStandalone */
  48. NULL, /* hasInternalSubset */
  49. NULL, /* hasExternalSubset */
  50. NULL, /* resolveEntity */
  51. NULL, /* getEntity */
  52. NULL, /* entityDecl */
  53. NULL, /* notationDecl */
  54. NULL, /* attributeDecl */
  55. NULL, /* elementDecl */
  56. NULL, /* unparsedEntityDecl */
  57. NULL, /* setDocumentLocator */
  58. NULL, /* startDocument */
  59. NULL, /* endDocument */
  60. NULL, /* startElement */
  61. NULL, /* endElement */
  62. NULL, /* reference */
  63. NULL, /* characters */
  64. NULL, /* ignorableWhitespace */
  65. NULL, /* processingInstruction */
  66. NULL, /* comment */
  67. NULL, /* xmlParserWarning */
  68. NULL, /* xmlParserError */
  69. NULL, /* xmlParserError */
  70. NULL, /* getParameterEntity */
  71. NULL, /* cdataBlock */
  72. NULL, /* externalSubset */
  73. 1, /* initialized */
  74. NULL, /* private */
  75. NULL, /* startElementNsSAX2Func */
  76. NULL, /* endElementNsSAX2Func */
  77. NULL /* xmlStructuredErrorFunc */
  78. };
  79. static xmlSAXHandlerPtr emptySAXHandler = &emptySAXHandlerStruct;
  80. extern xmlSAXHandlerPtr debugSAXHandler;
  81. /************************************************************************
  82. * *
  83. * Debug Handlers *
  84. * *
  85. ************************************************************************/
  86. /**
  87. * isStandaloneDebug:
  88. * @ctxt: An XML parser context
  89. *
  90. * Is this document tagged standalone ?
  91. *
  92. * Returns 1 if true
  93. */
  94. static int
  95. isStandaloneDebug(void *ctx ATTRIBUTE_UNUSED)
  96. {
  97. fprintf(stdout, "SAX.isStandalone()\n");
  98. return(0);
  99. }
  100. /**
  101. * hasInternalSubsetDebug:
  102. * @ctxt: An XML parser context
  103. *
  104. * Does this document has an internal subset
  105. *
  106. * Returns 1 if true
  107. */
  108. static int
  109. hasInternalSubsetDebug(void *ctx ATTRIBUTE_UNUSED)
  110. {
  111. fprintf(stdout, "SAX.hasInternalSubset()\n");
  112. return(0);
  113. }
  114. /**
  115. * hasExternalSubsetDebug:
  116. * @ctxt: An XML parser context
  117. *
  118. * Does this document has an external subset
  119. *
  120. * Returns 1 if true
  121. */
  122. static int
  123. hasExternalSubsetDebug(void *ctx ATTRIBUTE_UNUSED)
  124. {
  125. fprintf(stdout, "SAX.hasExternalSubset()\n");
  126. return(0);
  127. }
  128. /**
  129. * hasInternalSubsetDebug:
  130. * @ctxt: An XML parser context
  131. *
  132. * Does this document has an internal subset
  133. */
  134. static void
  135. internalSubsetDebug(void *ctx ATTRIBUTE_UNUSED, const xmlChar *name,
  136. const xmlChar *ExternalID, const xmlChar *SystemID)
  137. {
  138. fprintf(stdout, "SAX.internalSubset(%s,", name);
  139. if (ExternalID == NULL)
  140. fprintf(stdout, " ,");
  141. else
  142. fprintf(stdout, " %s,", ExternalID);
  143. if (SystemID == NULL)
  144. fprintf(stdout, " )\n");
  145. else
  146. fprintf(stdout, " %s)\n", SystemID);
  147. }
  148. /**
  149. * resolveEntityDebug:
  150. * @ctxt: An XML parser context
  151. * @publicId: The public ID of the entity
  152. * @systemId: The system ID of the entity
  153. *
  154. * Special entity resolver, better left to the parser, it has
  155. * more context than the application layer.
  156. * The default behaviour is to NOT resolve the entities, in that case
  157. * the ENTITY_REF nodes are built in the structure (and the parameter
  158. * values).
  159. *
  160. * Returns the xmlParserInputPtr if inlined or NULL for DOM behaviour.
  161. */
  162. static xmlParserInputPtr
  163. resolveEntityDebug(void *ctx ATTRIBUTE_UNUSED, const xmlChar *publicId, const xmlChar *systemId)
  164. {
  165. /* xmlParserCtxtPtr ctxt = (xmlParserCtxtPtr) ctx; */
  166. fprintf(stdout, "SAX.resolveEntity(");
  167. if (publicId != NULL)
  168. fprintf(stdout, "%s", (char *)publicId);
  169. else
  170. fprintf(stdout, " ");
  171. if (systemId != NULL)
  172. fprintf(stdout, ", %s)\n", (char *)systemId);
  173. else
  174. fprintf(stdout, ", )\n");
  175. /*********
  176. if (systemId != NULL) {
  177. return(xmlNewInputFromFile(ctxt, (char *) systemId));
  178. }
  179. *********/
  180. return(NULL);
  181. }
  182. /**
  183. * getEntityDebug:
  184. * @ctxt: An XML parser context
  185. * @name: The entity name
  186. *
  187. * Get an entity by name
  188. *
  189. * Returns the xmlParserInputPtr if inlined or NULL for DOM behaviour.
  190. */
  191. static xmlEntityPtr
  192. getEntityDebug(void *ctx ATTRIBUTE_UNUSED, const xmlChar *name)
  193. {
  194. fprintf(stdout, "SAX.getEntity(%s)\n", name);
  195. return(NULL);
  196. }
  197. /**
  198. * getParameterEntityDebug:
  199. * @ctxt: An XML parser context
  200. * @name: The entity name
  201. *
  202. * Get a parameter entity by name
  203. *
  204. * Returns the xmlParserInputPtr
  205. */
  206. static xmlEntityPtr
  207. getParameterEntityDebug(void *ctx ATTRIBUTE_UNUSED, const xmlChar *name)
  208. {
  209. fprintf(stdout, "SAX.getParameterEntity(%s)\n", name);
  210. return(NULL);
  211. }
  212. /**
  213. * entityDeclDebug:
  214. * @ctxt: An XML parser context
  215. * @name: the entity name
  216. * @type: the entity type
  217. * @publicId: The public ID of the entity
  218. * @systemId: The system ID of the entity
  219. * @content: the entity value (without processing).
  220. *
  221. * An entity definition has been parsed
  222. */
  223. static void
  224. entityDeclDebug(void *ctx ATTRIBUTE_UNUSED, const xmlChar *name, int type,
  225. const xmlChar *publicId, const xmlChar *systemId, xmlChar *content)
  226. {
  227. fprintf(stdout, "SAX.entityDecl(%s, %d, %s, %s, %s)\n",
  228. name, type, publicId, systemId, content);
  229. }
  230. /**
  231. * attributeDeclDebug:
  232. * @ctxt: An XML parser context
  233. * @name: the attribute name
  234. * @type: the attribute type
  235. *
  236. * An attribute definition has been parsed
  237. */
  238. static void
  239. attributeDeclDebug(void *ctx ATTRIBUTE_UNUSED, const xmlChar *elem, const xmlChar *name,
  240. int type, int def, const xmlChar *defaultValue,
  241. xmlEnumerationPtr tree ATTRIBUTE_UNUSED)
  242. {
  243. fprintf(stdout, "SAX.attributeDecl(%s, %s, %d, %d, %s, ...)\n",
  244. elem, name, type, def, defaultValue);
  245. }
  246. /**
  247. * elementDeclDebug:
  248. * @ctxt: An XML parser context
  249. * @name: the element name
  250. * @type: the element type
  251. * @content: the element value (without processing).
  252. *
  253. * An element definition has been parsed
  254. */
  255. static void
  256. elementDeclDebug(void *ctx ATTRIBUTE_UNUSED, const xmlChar *name, int type,
  257. xmlElementContentPtr content ATTRIBUTE_UNUSED)
  258. {
  259. fprintf(stdout, "SAX.elementDecl(%s, %d, ...)\n",
  260. name, type);
  261. }
  262. /**
  263. * notationDeclDebug:
  264. * @ctxt: An XML parser context
  265. * @name: The name of the notation
  266. * @publicId: The public ID of the entity
  267. * @systemId: The system ID of the entity
  268. *
  269. * What to do when a notation declaration has been parsed.
  270. */
  271. static void
  272. notationDeclDebug(void *ctx ATTRIBUTE_UNUSED, const xmlChar *name,
  273. const xmlChar *publicId, const xmlChar *systemId)
  274. {
  275. fprintf(stdout, "SAX.notationDecl(%s, %s, %s)\n",
  276. (char *) name, (char *) publicId, (char *) systemId);
  277. }
  278. /**
  279. * unparsedEntityDeclDebug:
  280. * @ctxt: An XML parser context
  281. * @name: The name of the entity
  282. * @publicId: The public ID of the entity
  283. * @systemId: The system ID of the entity
  284. * @notationName: the name of the notation
  285. *
  286. * What to do when an unparsed entity declaration is parsed
  287. */
  288. static void
  289. unparsedEntityDeclDebug(void *ctx ATTRIBUTE_UNUSED, const xmlChar *name,
  290. const xmlChar *publicId, const xmlChar *systemId,
  291. const xmlChar *notationName)
  292. {
  293. fprintf(stdout, "SAX.unparsedEntityDecl(%s, %s, %s, %s)\n",
  294. (char *) name, (char *) publicId, (char *) systemId,
  295. (char *) notationName);
  296. }
  297. /**
  298. * setDocumentLocatorDebug:
  299. * @ctxt: An XML parser context
  300. * @loc: A SAX Locator
  301. *
  302. * Receive the document locator at startup, actually xmlDefaultSAXLocator
  303. * Everything is available on the context, so this is useless in our case.
  304. */
  305. static void
  306. setDocumentLocatorDebug(void *ctx ATTRIBUTE_UNUSED, xmlSAXLocatorPtr loc ATTRIBUTE_UNUSED)
  307. {
  308. fprintf(stdout, "SAX.setDocumentLocator()\n");
  309. }
  310. /**
  311. * startDocumentDebug:
  312. * @ctxt: An XML parser context
  313. *
  314. * called when the document start being processed.
  315. */
  316. static void
  317. startDocumentDebug(void *ctx ATTRIBUTE_UNUSED)
  318. {
  319. fprintf(stdout, "SAX.startDocument()\n");
  320. }
  321. /**
  322. * endDocumentDebug:
  323. * @ctxt: An XML parser context
  324. *
  325. * called when the document end has been detected.
  326. */
  327. static void
  328. endDocumentDebug(void *ctx ATTRIBUTE_UNUSED)
  329. {
  330. fprintf(stdout, "SAX.endDocument()\n");
  331. }
  332. /**
  333. * startElementDebug:
  334. * @ctxt: An XML parser context
  335. * @name: The element name
  336. *
  337. * called when an opening tag has been processed.
  338. */
  339. static void
  340. startElementDebug(void *ctx ATTRIBUTE_UNUSED, const xmlChar *name, const xmlChar **atts)
  341. {
  342. int i;
  343. fprintf(stdout, "SAX.startElement(%s", (char *) name);
  344. if (atts != NULL) {
  345. for (i = 0;(atts[i] != NULL);i++) {
  346. fprintf(stdout, ", %s", atts[i++]);
  347. if (atts[i] != NULL) {
  348. unsigned char output[40];
  349. const unsigned char *att = atts[i];
  350. int outlen, attlen;
  351. fprintf(stdout, "='");
  352. while ((attlen = strlen((char*)att)) > 0) {
  353. outlen = sizeof output - 1;
  354. htmlEncodeEntities(output, &outlen, att, &attlen, '\'');
  355. output[outlen] = 0;
  356. fprintf(stdout, "%s", (char *) output);
  357. att += attlen;
  358. }
  359. fprintf(stdout, "'");
  360. }
  361. }
  362. }
  363. fprintf(stdout, ")\n");
  364. }
  365. /**
  366. * endElementDebug:
  367. * @ctxt: An XML parser context
  368. * @name: The element name
  369. *
  370. * called when the end of an element has been detected.
  371. */
  372. static void
  373. endElementDebug(void *ctx ATTRIBUTE_UNUSED, const xmlChar *name)
  374. {
  375. fprintf(stdout, "SAX.endElement(%s)\n", (char *) name);
  376. }
  377. /**
  378. * charactersDebug:
  379. * @ctxt: An XML parser context
  380. * @ch: a xmlChar string
  381. * @len: the number of xmlChar
  382. *
  383. * receiving some chars from the parser.
  384. * Question: how much at a time ???
  385. */
  386. static void
  387. charactersDebug(void *ctx ATTRIBUTE_UNUSED, const xmlChar *ch, int len)
  388. {
  389. unsigned char output[40];
  390. int inlen = len, outlen = 30;
  391. htmlEncodeEntities(output, &outlen, ch, &inlen, 0);
  392. output[outlen] = 0;
  393. fprintf(stdout, "SAX.characters(%s, %d)\n", output, len);
  394. }
  395. /**
  396. * cdataDebug:
  397. * @ctxt: An XML parser context
  398. * @ch: a xmlChar string
  399. * @len: the number of xmlChar
  400. *
  401. * receiving some cdata chars from the parser.
  402. * Question: how much at a time ???
  403. */
  404. static void
  405. cdataDebug(void *ctx ATTRIBUTE_UNUSED, const xmlChar *ch, int len)
  406. {
  407. unsigned char output[40];
  408. int inlen = len, outlen = 30;
  409. htmlEncodeEntities(output, &outlen, ch, &inlen, 0);
  410. output[outlen] = 0;
  411. fprintf(stdout, "SAX.cdata(%s, %d)\n", output, len);
  412. }
  413. /**
  414. * referenceDebug:
  415. * @ctxt: An XML parser context
  416. * @name: The entity name
  417. *
  418. * called when an entity reference is detected.
  419. */
  420. static void
  421. referenceDebug(void *ctx ATTRIBUTE_UNUSED, const xmlChar *name)
  422. {
  423. fprintf(stdout, "SAX.reference(%s)\n", name);
  424. }
  425. /**
  426. * ignorableWhitespaceDebug:
  427. * @ctxt: An XML parser context
  428. * @ch: a xmlChar string
  429. * @start: the first char in the string
  430. * @len: the number of xmlChar
  431. *
  432. * receiving some ignorable whitespaces from the parser.
  433. * Question: how much at a time ???
  434. */
  435. static void
  436. ignorableWhitespaceDebug(void *ctx ATTRIBUTE_UNUSED, const xmlChar *ch, int len)
  437. {
  438. char output[40];
  439. int i;
  440. for (i = 0;(i<len) && (i < 30);i++)
  441. output[i] = ch[i];
  442. output[i] = 0;
  443. fprintf(stdout, "SAX.ignorableWhitespace(%s, %d)\n", output, len);
  444. }
  445. /**
  446. * processingInstructionDebug:
  447. * @ctxt: An XML parser context
  448. * @target: the target name
  449. * @data: the PI data's
  450. * @len: the number of xmlChar
  451. *
  452. * A processing instruction has been parsed.
  453. */
  454. static void
  455. processingInstructionDebug(void *ctx ATTRIBUTE_UNUSED, const xmlChar *target,
  456. const xmlChar *data)
  457. {
  458. fprintf(stdout, "SAX.processingInstruction(%s, %s)\n",
  459. (char *) target, (char *) data);
  460. }
  461. /**
  462. * commentDebug:
  463. * @ctxt: An XML parser context
  464. * @value: the comment content
  465. *
  466. * A comment has been parsed.
  467. */
  468. static void
  469. commentDebug(void *ctx ATTRIBUTE_UNUSED, const xmlChar *value)
  470. {
  471. fprintf(stdout, "SAX.comment(%s)\n", value);
  472. }
  473. /**
  474. * warningDebug:
  475. * @ctxt: An XML parser context
  476. * @msg: the message to display/transmit
  477. * @...: extra parameters for the message display
  478. *
  479. * Display and format a warning messages, gives file, line, position and
  480. * extra parameters.
  481. */
  482. static void XMLCDECL
  483. warningDebug(void *ctx ATTRIBUTE_UNUSED, const char *msg, ...)
  484. {
  485. va_list args;
  486. va_start(args, msg);
  487. fprintf(stdout, "SAX.warning: ");
  488. vfprintf(stdout, msg, args);
  489. va_end(args);
  490. }
  491. /**
  492. * errorDebug:
  493. * @ctxt: An XML parser context
  494. * @msg: the message to display/transmit
  495. * @...: extra parameters for the message display
  496. *
  497. * Display and format a error messages, gives file, line, position and
  498. * extra parameters.
  499. */
  500. static void XMLCDECL
  501. errorDebug(void *ctx ATTRIBUTE_UNUSED, const char *msg, ...)
  502. {
  503. va_list args;
  504. va_start(args, msg);
  505. fprintf(stdout, "SAX.error: ");
  506. vfprintf(stdout, msg, args);
  507. va_end(args);
  508. }
  509. /**
  510. * fatalErrorDebug:
  511. * @ctxt: An XML parser context
  512. * @msg: the message to display/transmit
  513. * @...: extra parameters for the message display
  514. *
  515. * Display and format a fatalError messages, gives file, line, position and
  516. * extra parameters.
  517. */
  518. static void XMLCDECL
  519. fatalErrorDebug(void *ctx ATTRIBUTE_UNUSED, const char *msg, ...)
  520. {
  521. va_list args;
  522. va_start(args, msg);
  523. fprintf(stdout, "SAX.fatalError: ");
  524. vfprintf(stdout, msg, args);
  525. va_end(args);
  526. }
  527. static xmlSAXHandler debugSAXHandlerStruct = {
  528. internalSubsetDebug,
  529. isStandaloneDebug,
  530. hasInternalSubsetDebug,
  531. hasExternalSubsetDebug,
  532. resolveEntityDebug,
  533. getEntityDebug,
  534. entityDeclDebug,
  535. notationDeclDebug,
  536. attributeDeclDebug,
  537. elementDeclDebug,
  538. unparsedEntityDeclDebug,
  539. setDocumentLocatorDebug,
  540. startDocumentDebug,
  541. endDocumentDebug,
  542. startElementDebug,
  543. endElementDebug,
  544. referenceDebug,
  545. charactersDebug,
  546. ignorableWhitespaceDebug,
  547. processingInstructionDebug,
  548. commentDebug,
  549. warningDebug,
  550. errorDebug,
  551. fatalErrorDebug,
  552. getParameterEntityDebug,
  553. cdataDebug,
  554. NULL,
  555. 1,
  556. NULL,
  557. NULL,
  558. NULL,
  559. NULL
  560. };
  561. xmlSAXHandlerPtr debugSAXHandler = &debugSAXHandlerStruct;
  562. /************************************************************************
  563. * *
  564. * Debug *
  565. * *
  566. ************************************************************************/
  567. static void
  568. parseSAXFile(char *filename) {
  569. htmlDocPtr doc = NULL;
  570. /*
  571. * Empty callbacks for checking
  572. */
  573. #ifdef LIBXML_PUSH_ENABLED
  574. if (push) {
  575. FILE *f;
  576. #if defined(_WIN32) || defined (__DJGPP__) && !defined (__CYGWIN__)
  577. f = fopen(filename, "rb");
  578. #else
  579. f = fopen(filename, "r");
  580. #endif
  581. if (f != NULL) {
  582. int res, size = 3;
  583. char chars[4096];
  584. htmlParserCtxtPtr ctxt;
  585. /* if (repeat) */
  586. size = 4096;
  587. res = fread(chars, 1, 4, f);
  588. if (res > 0) {
  589. ctxt = htmlCreatePushParserCtxt(emptySAXHandler, NULL,
  590. chars, res, filename, XML_CHAR_ENCODING_NONE);
  591. while ((res = fread(chars, 1, size, f)) > 0) {
  592. htmlParseChunk(ctxt, chars, res, 0);
  593. }
  594. htmlParseChunk(ctxt, chars, 0, 1);
  595. doc = ctxt->myDoc;
  596. htmlFreeParserCtxt(ctxt);
  597. }
  598. if (doc != NULL) {
  599. fprintf(stdout, "htmlSAXParseFile returned non-NULL\n");
  600. xmlFreeDoc(doc);
  601. }
  602. fclose(f);
  603. }
  604. if (!noout) {
  605. #if defined(_WIN32) || defined (__DJGPP__) && !defined (__CYGWIN__)
  606. f = fopen(filename, "rb");
  607. #else
  608. f = fopen(filename, "r");
  609. #endif
  610. if (f != NULL) {
  611. int res, size = 3;
  612. char chars[4096];
  613. htmlParserCtxtPtr ctxt;
  614. /* if (repeat) */
  615. size = 4096;
  616. res = fread(chars, 1, 4, f);
  617. if (res > 0) {
  618. ctxt = htmlCreatePushParserCtxt(debugSAXHandler, NULL,
  619. chars, res, filename, XML_CHAR_ENCODING_NONE);
  620. while ((res = fread(chars, 1, size, f)) > 0) {
  621. htmlParseChunk(ctxt, chars, res, 0);
  622. }
  623. htmlParseChunk(ctxt, chars, 0, 1);
  624. doc = ctxt->myDoc;
  625. htmlFreeParserCtxt(ctxt);
  626. }
  627. if (doc != NULL) {
  628. fprintf(stdout, "htmlSAXParseFile returned non-NULL\n");
  629. xmlFreeDoc(doc);
  630. }
  631. fclose(f);
  632. }
  633. }
  634. } else {
  635. #endif /* LIBXML_PUSH_ENABLED */
  636. doc = htmlSAXParseFile(filename, NULL, emptySAXHandler, NULL);
  637. if (doc != NULL) {
  638. fprintf(stdout, "htmlSAXParseFile returned non-NULL\n");
  639. xmlFreeDoc(doc);
  640. }
  641. if (!noout) {
  642. /*
  643. * Debug callback
  644. */
  645. doc = htmlSAXParseFile(filename, NULL, debugSAXHandler, NULL);
  646. if (doc != NULL) {
  647. fprintf(stdout, "htmlSAXParseFile returned non-NULL\n");
  648. xmlFreeDoc(doc);
  649. }
  650. }
  651. #ifdef LIBXML_PUSH_ENABLED
  652. }
  653. #endif /* LIBXML_PUSH_ENABLED */
  654. }
  655. static void
  656. parseAndPrintFile(char *filename) {
  657. htmlDocPtr doc = NULL;
  658. /*
  659. * build an HTML tree from a string;
  660. */
  661. #ifdef LIBXML_PUSH_ENABLED
  662. if (push) {
  663. FILE *f;
  664. #if defined(_WIN32) || defined (__DJGPP__) && !defined (__CYGWIN__)
  665. f = fopen(filename, "rb");
  666. #else
  667. f = fopen(filename, "r");
  668. #endif
  669. if (f != NULL) {
  670. int res, size = 3;
  671. char chars[4096];
  672. htmlParserCtxtPtr ctxt;
  673. /* if (repeat) */
  674. size = 4096;
  675. res = fread(chars, 1, 4, f);
  676. if (res > 0) {
  677. ctxt = htmlCreatePushParserCtxt(NULL, NULL,
  678. chars, res, filename, XML_CHAR_ENCODING_NONE);
  679. while ((res = fread(chars, 1, size, f)) > 0) {
  680. htmlParseChunk(ctxt, chars, res, 0);
  681. }
  682. htmlParseChunk(ctxt, chars, 0, 1);
  683. doc = ctxt->myDoc;
  684. htmlFreeParserCtxt(ctxt);
  685. }
  686. fclose(f);
  687. }
  688. } else {
  689. doc = htmlReadFile(filename, NULL, options);
  690. }
  691. #else
  692. doc = htmlReadFile(filename,NULL,options);
  693. #endif
  694. if (doc == NULL) {
  695. xmlGenericError(xmlGenericErrorContext,
  696. "Could not parse %s\n", filename);
  697. }
  698. #ifdef LIBXML_TREE_ENABLED
  699. /*
  700. * test intermediate copy if needed.
  701. */
  702. if (copy) {
  703. htmlDocPtr tmp;
  704. tmp = doc;
  705. doc = xmlCopyDoc(doc, 1);
  706. xmlFreeDoc(tmp);
  707. }
  708. #endif
  709. #ifdef LIBXML_OUTPUT_ENABLED
  710. /*
  711. * print it.
  712. */
  713. if (!noout) {
  714. #ifdef LIBXML_DEBUG_ENABLED
  715. if (!debug) {
  716. if (encoding)
  717. htmlSaveFileEnc("-", doc, encoding);
  718. else
  719. htmlDocDump(stdout, doc);
  720. } else
  721. xmlDebugDumpDocument(stdout, doc);
  722. #else
  723. if (encoding)
  724. htmlSaveFileEnc("-", doc, encoding);
  725. else
  726. htmlDocDump(stdout, doc);
  727. #endif
  728. }
  729. #endif /* LIBXML_OUTPUT_ENABLED */
  730. /*
  731. * free it.
  732. */
  733. xmlFreeDoc(doc);
  734. }
  735. int main(int argc, char **argv) {
  736. int i, count;
  737. int files = 0;
  738. for (i = 1; i < argc ; i++) {
  739. #ifdef LIBXML_DEBUG_ENABLED
  740. if ((!strcmp(argv[i], "-debug")) || (!strcmp(argv[i], "--debug")))
  741. debug++;
  742. else
  743. #endif
  744. if ((!strcmp(argv[i], "-copy")) || (!strcmp(argv[i], "--copy")))
  745. copy++;
  746. #ifdef LIBXML_PUSH_ENABLED
  747. else if ((!strcmp(argv[i], "-push")) || (!strcmp(argv[i], "--push")))
  748. push++;
  749. #endif /* LIBXML_PUSH_ENABLED */
  750. else if ((!strcmp(argv[i], "-sax")) || (!strcmp(argv[i], "--sax")))
  751. sax++;
  752. else if ((!strcmp(argv[i], "-noout")) || (!strcmp(argv[i], "--noout")))
  753. noout++;
  754. else if ((!strcmp(argv[i], "-repeat")) ||
  755. (!strcmp(argv[i], "--repeat")))
  756. repeat++;
  757. else if ((!strcmp(argv[i], "-encode")) ||
  758. (!strcmp(argv[i], "--encode"))) {
  759. i++;
  760. encoding = argv[i];
  761. }
  762. }
  763. for (i = 1; i < argc ; i++) {
  764. if ((!strcmp(argv[i], "-encode")) ||
  765. (!strcmp(argv[i], "--encode"))) {
  766. i++;
  767. continue;
  768. }
  769. if (argv[i][0] != '-') {
  770. if (repeat) {
  771. for (count = 0;count < 100 * repeat;count++) {
  772. if (sax)
  773. parseSAXFile(argv[i]);
  774. else
  775. parseAndPrintFile(argv[i]);
  776. }
  777. } else {
  778. if (sax)
  779. parseSAXFile(argv[i]);
  780. else
  781. parseAndPrintFile(argv[i]);
  782. }
  783. files ++;
  784. }
  785. }
  786. if (files == 0) {
  787. printf("Usage : %s [--debug] [--copy] [--copy] HTMLfiles ...\n",
  788. argv[0]);
  789. printf("\tParse the HTML files and output the result of the parsing\n");
  790. #ifdef LIBXML_DEBUG_ENABLED
  791. printf("\t--debug : dump a debug tree of the in-memory document\n");
  792. #endif
  793. printf("\t--copy : used to test the internal copy implementation\n");
  794. printf("\t--sax : debug the sequence of SAX callbacks\n");
  795. printf("\t--repeat : parse the file 100 times, for timing\n");
  796. printf("\t--noout : do not print the result\n");
  797. #ifdef LIBXML_PUSH_ENABLED
  798. printf("\t--push : use the push mode parser\n");
  799. #endif /* LIBXML_PUSH_ENABLED */
  800. printf("\t--encode encoding : output in the given encoding\n");
  801. }
  802. xmlCleanupParser();
  803. xmlMemoryDump();
  804. return(0);
  805. }
  806. #else /* !LIBXML_HTML_ENABLED */
  807. #include <stdio.h>
  808. int main(int argc ATTRIBUTE_UNUSED, char **argv ATTRIBUTE_UNUSED) {
  809. printf("%s : HTML support not compiled in\n", argv[0]);
  810. return(0);
  811. }
  812. #endif