HTMLparser.c 200 KB


  1. /*
  2. * HTMLparser.c : an HTML 4.0 non-verifying parser
  3. *
  4. * See Copyright for the status of this software.
  5. *
  6. * daniel@veillard.com
  7. */
  8. #define IN_LIBXML
  9. #include "libxml.h"
  10. #ifdef LIBXML_HTML_ENABLED
  11. #include <string.h>
  12. #ifdef HAVE_CTYPE_H
  13. #include <ctype.h>
  14. #endif
  15. #ifdef HAVE_STDLIB_H
  16. #include <stdlib.h>
  17. #endif
  18. #ifdef HAVE_SYS_STAT_H
  19. #include <sys/stat.h>
  20. #endif
  21. #ifdef HAVE_FCNTL_H
  22. #include <fcntl.h>
  23. #endif
  24. #ifdef HAVE_UNISTD_H
  25. #include <unistd.h>
  26. #endif
  27. #ifdef HAVE_ZLIB_H
  28. #include <zlib.h>
  29. #endif
  30. #include <libxml/xmlmemory.h>
  31. #include <libxml/tree.h>
  32. #include <libxml/parser.h>
  33. #include <libxml/parserInternals.h>
  34. #include <libxml/xmlerror.h>
  35. #include <libxml/HTMLparser.h>
  36. #include <libxml/HTMLtree.h>
  37. #include <libxml/entities.h>
  38. #include <libxml/encoding.h>
  39. #include <libxml/valid.h>
  40. #include <libxml/xmlIO.h>
  41. #include <libxml/globals.h>
  42. #include <libxml/uri.h>
  43. #define HTML_MAX_NAMELEN 1000
  44. #define HTML_PARSER_BIG_BUFFER_SIZE 1000
  45. #define HTML_PARSER_BUFFER_SIZE 100
  46. /* #define DEBUG */
  47. /* #define DEBUG_PUSH */
  48. static int htmlOmittedDefaultValue = 1;
  49. xmlChar * htmlDecodeEntities(htmlParserCtxtPtr ctxt, int len,
  50. xmlChar end, xmlChar end2, xmlChar end3);
  51. static void htmlParseComment(htmlParserCtxtPtr ctxt);
  52. /************************************************************************
  53. * *
  54. * Some factorized error routines *
  55. * *
  56. ************************************************************************/
  57. /**
  58. * htmlErrMemory:
  59. * @ctxt: an HTML parser context
  60. * @extra: extra informations
  61. *
  62. * Handle a redefinition of attribute error
  63. */
  64. static void
  65. htmlErrMemory(xmlParserCtxtPtr ctxt, const char *extra)
  66. {
  67. if ((ctxt != NULL) && (ctxt->disableSAX != 0) &&
  68. (ctxt->instate == XML_PARSER_EOF))
  69. return;
  70. if (ctxt != NULL) {
  71. ctxt->errNo = XML_ERR_NO_MEMORY;
  72. ctxt->instate = XML_PARSER_EOF;
  73. ctxt->disableSAX = 1;
  74. }
  75. if (extra)
  76. __xmlRaiseError(NULL, NULL, NULL, ctxt, NULL, XML_FROM_PARSER,
  77. XML_ERR_NO_MEMORY, XML_ERR_FATAL, NULL, 0, extra,
  78. NULL, NULL, 0, 0,
  79. "Memory allocation failed : %s\n", extra);
  80. else
  81. __xmlRaiseError(NULL, NULL, NULL, ctxt, NULL, XML_FROM_PARSER,
  82. XML_ERR_NO_MEMORY, XML_ERR_FATAL, NULL, 0, NULL,
  83. NULL, NULL, 0, 0, "Memory allocation failed\n");
  84. }
  85. /**
  86. * htmlParseErr:
  87. * @ctxt: an HTML parser context
  88. * @error: the error number
  89. * @msg: the error message
  90. * @str1: string infor
  91. * @str2: string infor
  92. *
  93. * Handle a fatal parser error, i.e. violating Well-Formedness constraints
  94. */
  95. static void
  96. htmlParseErr(xmlParserCtxtPtr ctxt, xmlParserErrors error,
  97. const char *msg, const xmlChar *str1, const xmlChar *str2)
  98. {
  99. if ((ctxt != NULL) && (ctxt->disableSAX != 0) &&
  100. (ctxt->instate == XML_PARSER_EOF))
  101. return;
  102. if (ctxt != NULL)
  103. ctxt->errNo = error;
  104. __xmlRaiseError(NULL, NULL, NULL, ctxt, NULL, XML_FROM_HTML, error,
  105. XML_ERR_ERROR, NULL, 0,
  106. (const char *) str1, (const char *) str2,
  107. NULL, 0, 0,
  108. msg, str1, str2);
  109. if (ctxt != NULL)
  110. ctxt->wellFormed = 0;
  111. }
  112. /**
  113. * htmlParseErrInt:
  114. * @ctxt: an HTML parser context
  115. * @error: the error number
  116. * @msg: the error message
  117. * @val: integer info
  118. *
  119. * Handle a fatal parser error, i.e. violating Well-Formedness constraints
  120. */
  121. static void
  122. htmlParseErrInt(xmlParserCtxtPtr ctxt, xmlParserErrors error,
  123. const char *msg, int val)
  124. {
  125. if ((ctxt != NULL) && (ctxt->disableSAX != 0) &&
  126. (ctxt->instate == XML_PARSER_EOF))
  127. return;
  128. if (ctxt != NULL)
  129. ctxt->errNo = error;
  130. __xmlRaiseError(NULL, NULL, NULL, ctxt, NULL, XML_FROM_HTML, error,
  131. XML_ERR_ERROR, NULL, 0, NULL, NULL,
  132. NULL, val, 0, msg, val);
  133. if (ctxt != NULL)
  134. ctxt->wellFormed = 0;
  135. }
  136. /************************************************************************
  137. * *
  138. * Parser stacks related functions and macros *
  139. * *
  140. ************************************************************************/
  141. /**
  142. * htmlnamePush:
  143. * @ctxt: an HTML parser context
  144. * @value: the element name
  145. *
  146. * Pushes a new element name on top of the name stack
  147. *
  148. * Returns 0 in case of error, the index in the stack otherwise
  149. */
  150. static int
  151. htmlnamePush(htmlParserCtxtPtr ctxt, const xmlChar * value)
  152. {
  153. if ((ctxt->html < 3) && (xmlStrEqual(value, BAD_CAST "head")))
  154. ctxt->html = 3;
  155. if ((ctxt->html < 10) && (xmlStrEqual(value, BAD_CAST "body")))
  156. ctxt->html = 10;
  157. if (ctxt->nameNr >= ctxt->nameMax) {
  158. ctxt->nameMax *= 2;
  159. ctxt->nameTab = (const xmlChar * *)
  160. xmlRealloc((xmlChar * *)ctxt->nameTab,
  161. ctxt->nameMax *
  162. sizeof(ctxt->nameTab[0]));
  163. if (ctxt->nameTab == NULL) {
  164. htmlErrMemory(ctxt, NULL);
  165. return (0);
  166. }
  167. }
  168. ctxt->nameTab[ctxt->nameNr] = value;
  169. ctxt->name = value;
  170. return (ctxt->nameNr++);
  171. }
  172. /**
  173. * htmlnamePop:
  174. * @ctxt: an HTML parser context
  175. *
  176. * Pops the top element name from the name stack
  177. *
  178. * Returns the name just removed
  179. */
  180. static const xmlChar *
  181. htmlnamePop(htmlParserCtxtPtr ctxt)
  182. {
  183. const xmlChar *ret;
  184. if (ctxt->nameNr <= 0)
  185. return (NULL);
  186. ctxt->nameNr--;
  187. if (ctxt->nameNr < 0)
  188. return (NULL);
  189. if (ctxt->nameNr > 0)
  190. ctxt->name = ctxt->nameTab[ctxt->nameNr - 1];
  191. else
  192. ctxt->name = NULL;
  193. ret = ctxt->nameTab[ctxt->nameNr];
  194. ctxt->nameTab[ctxt->nameNr] = NULL;
  195. return (ret);
  196. }
  197. /**
  198. * htmlNodeInfoPush:
  199. * @ctxt: an HTML parser context
  200. * @value: the node info
  201. *
  202. * Pushes a new element name on top of the node info stack
  203. *
  204. * Returns 0 in case of error, the index in the stack otherwise
  205. */
  206. static int
  207. htmlNodeInfoPush(htmlParserCtxtPtr ctxt, htmlParserNodeInfo *value)
  208. {
  209. if (ctxt->nodeInfoNr >= ctxt->nodeInfoMax) {
  210. if (ctxt->nodeInfoMax == 0)
  211. ctxt->nodeInfoMax = 5;
  212. ctxt->nodeInfoMax *= 2;
  213. ctxt->nodeInfoTab = (htmlParserNodeInfo *)
  214. xmlRealloc((htmlParserNodeInfo *)ctxt->nodeInfoTab,
  215. ctxt->nodeInfoMax *
  216. sizeof(ctxt->nodeInfoTab[0]));
  217. if (ctxt->nodeInfoTab == NULL) {
  218. htmlErrMemory(ctxt, NULL);
  219. return (0);
  220. }
  221. }
  222. ctxt->nodeInfoTab[ctxt->nodeInfoNr] = *value;
  223. ctxt->nodeInfo = &ctxt->nodeInfoTab[ctxt->nodeInfoNr];
  224. return (ctxt->nodeInfoNr++);
  225. }
  226. /**
  227. * htmlNodeInfoPop:
  228. * @ctxt: an HTML parser context
  229. *
  230. * Pops the top element name from the node info stack
  231. *
  232. * Returns 0 in case of error, the pointer to NodeInfo otherwise
  233. */
  234. static htmlParserNodeInfo *
  235. htmlNodeInfoPop(htmlParserCtxtPtr ctxt)
  236. {
  237. if (ctxt->nodeInfoNr <= 0)
  238. return (NULL);
  239. ctxt->nodeInfoNr--;
  240. if (ctxt->nodeInfoNr < 0)
  241. return (NULL);
  242. if (ctxt->nodeInfoNr > 0)
  243. ctxt->nodeInfo = &ctxt->nodeInfoTab[ctxt->nodeInfoNr - 1];
  244. else
  245. ctxt->nodeInfo = NULL;
  246. return &ctxt->nodeInfoTab[ctxt->nodeInfoNr];
  247. }
  248. /*
  249. * Macros for accessing the content. Those should be used only by the parser,
  250. * and not exported.
  251. *
  252. * Dirty macros, i.e. one need to make assumption on the context to use them
  253. *
  254. * CUR_PTR return the current pointer to the xmlChar to be parsed.
  255. * CUR returns the current xmlChar value, i.e. a 8 bit value if compiled
  256. * in ISO-Latin or UTF-8, and the current 16 bit value if compiled
  257. * in UNICODE mode. This should be used internally by the parser
  258. * only to compare to ASCII values otherwise it would break when
  259. * running with UTF-8 encoding.
  260. * NXT(n) returns the n'th next xmlChar. Same as CUR it should be used only
  261. * to compare on ASCII based substring.
  262. * UPP(n) returns the n'th next xmlChar converted to uppercase. Same as CUR
  263. * it should be used only to compare on ASCII based substring.
  264. * SKIP(n) Skip n xmlChar, and must also be used only to skip ASCII defined
  265. * strings without newlines within the parser.
  266. *
  267. * Clean macros, not dependent of an ASCII context, expect UTF-8 encoding
  268. *
  269. * CURRENT Returns the current char value, with the full decoding of
  270. * UTF-8 if we are using this mode. It returns an int.
  271. * NEXT Skip to the next character, this does the proper decoding
  272. * in UTF-8 mode. It also pop-up unfinished entities on the fly.
  273. * NEXTL(l) Skip the current unicode character of l xmlChars long.
  274. * COPY(to) copy one char to *to, increment CUR_PTR and to accordingly
  275. */
  276. #define UPPER (toupper(*ctxt->input->cur))
  277. #define SKIP(val) ctxt->nbChars += (val),ctxt->input->cur += (val),ctxt->input->col+=(val)
  278. #define NXT(val) ctxt->input->cur[(val)]
  279. #define UPP(val) (toupper(ctxt->input->cur[(val)]))
  280. #define CUR_PTR ctxt->input->cur
  281. #define SHRINK if ((ctxt->input->cur - ctxt->input->base > 2 * INPUT_CHUNK) && \
  282. (ctxt->input->end - ctxt->input->cur < 2 * INPUT_CHUNK)) \
  283. xmlParserInputShrink(ctxt->input)
  284. #define GROW if ((ctxt->progressive == 0) && \
  285. (ctxt->input->end - ctxt->input->cur < INPUT_CHUNK)) \
  286. xmlParserInputGrow(ctxt->input, INPUT_CHUNK)
  287. #define CURRENT ((int) (*ctxt->input->cur))
  288. #define SKIP_BLANKS htmlSkipBlankChars(ctxt)
  289. /* Imported from XML */
  290. /* #define CUR (ctxt->token ? ctxt->token : (int) (*ctxt->input->cur)) */
  291. #define CUR ((int) (*ctxt->input->cur))
  292. #define NEXT xmlNextChar(ctxt)
  293. #define RAW (ctxt->token ? -1 : (*ctxt->input->cur))
  294. #define NEXTL(l) do { \
  295. if (*(ctxt->input->cur) == '\n') { \
  296. ctxt->input->line++; ctxt->input->col = 1; \
  297. } else ctxt->input->col++; \
  298. ctxt->token = 0; ctxt->input->cur += l; ctxt->nbChars++; \
  299. } while (0)
  300. /************
  301. \
  302. if (*ctxt->input->cur == '%') xmlParserHandlePEReference(ctxt); \
  303. if (*ctxt->input->cur == '&') xmlParserHandleReference(ctxt);
  304. ************/
  305. #define CUR_CHAR(l) htmlCurrentChar(ctxt, &l)
  306. #define CUR_SCHAR(s, l) xmlStringCurrentChar(ctxt, s, &l)
  307. #define COPY_BUF(l,b,i,v) \
  308. if (l == 1) b[i++] = (xmlChar) v; \
  309. else i += xmlCopyChar(l,&b[i],v)
  310. /**
  311. * htmlFindEncoding:
  312. * @the HTML parser context
  313. *
  314. * Ty to find and encoding in the current data available in the input
  315. * buffer this is needed to try to switch to the proper encoding when
  316. * one face a character error.
  317. * That's an heuristic, since it's operating outside of parsing it could
  318. * try to use a meta which had been commented out, that's the reason it
  319. * should only be used in case of error, not as a default.
  320. *
  321. * Returns an encoding string or NULL if not found, the string need to
  322. * be freed
  323. */
  324. static xmlChar *
  325. htmlFindEncoding(xmlParserCtxtPtr ctxt) {
  326. const xmlChar *start, *cur, *end;
  327. if ((ctxt == NULL) || (ctxt->input == NULL) ||
  328. (ctxt->input->encoding != NULL) || (ctxt->input->buf == NULL) ||
  329. (ctxt->input->buf->encoder != NULL))
  330. return(NULL);
  331. if ((ctxt->input->cur == NULL) || (ctxt->input->end == NULL))
  332. return(NULL);
  333. start = ctxt->input->cur;
  334. end = ctxt->input->end;
  335. /* we also expect the input buffer to be zero terminated */
  336. if (*end != 0)
  337. return(NULL);
  338. cur = xmlStrcasestr(start, BAD_CAST "HTTP-EQUIV");
  339. if (cur == NULL)
  340. return(NULL);
  341. cur = xmlStrcasestr(cur, BAD_CAST "CONTENT");
  342. if (cur == NULL)
  343. return(NULL);
  344. cur = xmlStrcasestr(cur, BAD_CAST "CHARSET=");
  345. if (cur == NULL)
  346. return(NULL);
  347. cur += 8;
  348. start = cur;
  349. while (((*cur >= 'A') && (*cur <= 'Z')) ||
  350. ((*cur >= 'a') && (*cur <= 'z')) ||
  351. ((*cur >= '0') && (*cur <= '9')) ||
  352. (*cur == '-') || (*cur == '_') || (*cur == ':') || (*cur == '/'))
  353. cur++;
  354. if (cur == start)
  355. return(NULL);
  356. return(xmlStrndup(start, cur - start));
  357. }
  358. /**
  359. * htmlCurrentChar:
  360. * @ctxt: the HTML parser context
  361. * @len: pointer to the length of the char read
  362. *
  363. * The current char value, if using UTF-8 this may actually span multiple
  364. * bytes in the input buffer. Implement the end of line normalization:
  365. * 2.11 End-of-Line Handling
  366. * If the encoding is unspecified, in the case we find an ISO-Latin-1
  367. * char, then the encoding converter is plugged in automatically.
  368. *
  369. * Returns the current char value and its length
  370. */
  371. static int
  372. htmlCurrentChar(xmlParserCtxtPtr ctxt, int *len) {
  373. if (ctxt->instate == XML_PARSER_EOF)
  374. return(0);
  375. if (ctxt->token != 0) {
  376. *len = 0;
  377. return(ctxt->token);
  378. }
  379. if (ctxt->charset == XML_CHAR_ENCODING_UTF8) {
  380. /*
  381. * We are supposed to handle UTF8, check it's valid
  382. * From rfc2044: encoding of the Unicode values on UTF-8:
  383. *
  384. * UCS-4 range (hex.) UTF-8 octet sequence (binary)
  385. * 0000 0000-0000 007F 0xxxxxxx
  386. * 0000 0080-0000 07FF 110xxxxx 10xxxxxx
  387. * 0000 0800-0000 FFFF 1110xxxx 10xxxxxx 10xxxxxx
  388. *
  389. * Check for the 0x110000 limit too
  390. */
  391. const unsigned char *cur = ctxt->input->cur;
  392. unsigned char c;
  393. unsigned int val;
  394. c = *cur;
  395. if (c & 0x80) {
  396. if (cur[1] == 0) {
  397. xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
  398. cur = ctxt->input->cur;
  399. }
  400. if ((cur[1] & 0xc0) != 0x80)
  401. goto encoding_error;
  402. if ((c & 0xe0) == 0xe0) {
  403. if (cur[2] == 0) {
  404. xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
  405. cur = ctxt->input->cur;
  406. }
  407. if ((cur[2] & 0xc0) != 0x80)
  408. goto encoding_error;
  409. if ((c & 0xf0) == 0xf0) {
  410. if (cur[3] == 0) {
  411. xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
  412. cur = ctxt->input->cur;
  413. }
  414. if (((c & 0xf8) != 0xf0) ||
  415. ((cur[3] & 0xc0) != 0x80))
  416. goto encoding_error;
  417. /* 4-byte code */
  418. *len = 4;
  419. val = (cur[0] & 0x7) << 18;
  420. val |= (cur[1] & 0x3f) << 12;
  421. val |= (cur[2] & 0x3f) << 6;
  422. val |= cur[3] & 0x3f;
  423. } else {
  424. /* 3-byte code */
  425. *len = 3;
  426. val = (cur[0] & 0xf) << 12;
  427. val |= (cur[1] & 0x3f) << 6;
  428. val |= cur[2] & 0x3f;
  429. }
  430. } else {
  431. /* 2-byte code */
  432. *len = 2;
  433. val = (cur[0] & 0x1f) << 6;
  434. val |= cur[1] & 0x3f;
  435. }
  436. if (!IS_CHAR(val)) {
  437. htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
  438. "Char 0x%X out of allowed range\n", val);
  439. }
  440. return(val);
  441. } else {
  442. if ((*ctxt->input->cur == 0) &&
  443. (ctxt->input->cur < ctxt->input->end)) {
  444. htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
  445. "Char 0x%X out of allowed range\n", 0);
  446. *len = 1;
  447. return(' ');
  448. }
  449. /* 1-byte code */
  450. *len = 1;
  451. return((int) *ctxt->input->cur);
  452. }
  453. }
  454. /*
  455. * Assume it's a fixed length encoding (1) with
  456. * a compatible encoding for the ASCII set, since
  457. * XML constructs only use < 128 chars
  458. */
  459. *len = 1;
  460. if ((int) *ctxt->input->cur < 0x80)
  461. return((int) *ctxt->input->cur);
  462. /*
  463. * Humm this is bad, do an automatic flow conversion
  464. */
  465. {
  466. xmlChar * guess;
  467. xmlCharEncodingHandlerPtr handler;
  468. guess = htmlFindEncoding(ctxt);
  469. if (guess == NULL) {
  470. xmlSwitchEncoding(ctxt, XML_CHAR_ENCODING_8859_1);
  471. } else {
  472. if (ctxt->input->encoding != NULL)
  473. xmlFree((xmlChar *) ctxt->input->encoding);
  474. ctxt->input->encoding = guess;
  475. handler = xmlFindCharEncodingHandler((const char *) guess);
  476. if (handler != NULL) {
  477. xmlSwitchToEncoding(ctxt, handler);
  478. } else {
  479. htmlParseErr(ctxt, XML_ERR_INVALID_ENCODING,
  480. "Unsupported encoding %s", guess, NULL);
  481. }
  482. }
  483. ctxt->charset = XML_CHAR_ENCODING_UTF8;
  484. }
  485. return(xmlCurrentChar(ctxt, len));
  486. encoding_error:
  487. /*
  488. * If we detect an UTF8 error that probably mean that the
  489. * input encoding didn't get properly advertized in the
  490. * declaration header. Report the error and switch the encoding
  491. * to ISO-Latin-1 (if you don't like this policy, just declare the
  492. * encoding !)
  493. */
  494. {
  495. char buffer[150];
  496. if (ctxt->input->end - ctxt->input->cur >= 4) {
  497. snprintf(buffer, 149, "Bytes: 0x%02X 0x%02X 0x%02X 0x%02X\n",
  498. ctxt->input->cur[0], ctxt->input->cur[1],
  499. ctxt->input->cur[2], ctxt->input->cur[3]);
  500. } else {
  501. snprintf(buffer, 149, "Bytes: 0x%02X\n", ctxt->input->cur[0]);
  502. }
  503. htmlParseErr(ctxt, XML_ERR_INVALID_ENCODING,
  504. "Input is not proper UTF-8, indicate encoding !\n",
  505. BAD_CAST buffer, NULL);
  506. }
  507. ctxt->charset = XML_CHAR_ENCODING_8859_1;
  508. *len = 1;
  509. return((int) *ctxt->input->cur);
  510. }
  511. /**
  512. * htmlSkipBlankChars:
  513. * @ctxt: the HTML parser context
  514. *
  515. * skip all blanks character found at that point in the input streams.
  516. *
  517. * Returns the number of space chars skipped
  518. */
  519. static int
  520. htmlSkipBlankChars(xmlParserCtxtPtr ctxt) {
  521. int res = 0;
  522. while (IS_BLANK_CH(*(ctxt->input->cur))) {
  523. if ((*ctxt->input->cur == 0) &&
  524. (xmlParserInputGrow(ctxt->input, INPUT_CHUNK) <= 0)) {
  525. xmlPopInput(ctxt);
  526. } else {
  527. if (*(ctxt->input->cur) == '\n') {
  528. ctxt->input->line++; ctxt->input->col = 1;
  529. } else ctxt->input->col++;
  530. ctxt->input->cur++;
  531. ctxt->nbChars++;
  532. if (*ctxt->input->cur == 0)
  533. xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
  534. }
  535. res++;
  536. }
  537. return(res);
  538. }
  539. /************************************************************************
  540. * *
  541. * The list of HTML elements and their properties *
  542. * *
  543. ************************************************************************/
  544. /*
  545. * Start Tag: 1 means the start tag can be omitted
  546. * End Tag: 1 means the end tag can be omitted
  547. * 2 means it's forbidden (empty elements)
  548. * 3 means the tag is stylistic and should be closed easily
  549. * Depr: this element is deprecated
  550. * DTD: 1 means that this element is valid only in the Loose DTD
  551. * 2 means that this element is valid only in the Frameset DTD
  552. *
  553. * Name,Start Tag,End Tag,Save End,Empty,Deprecated,DTD,inline,Description
  554. , subElements , impliedsubelt , Attributes, userdata
  555. */
  556. /* Definitions and a couple of vars for HTML Elements */
  557. #define FONTSTYLE "tt", "i", "b", "u", "s", "strike", "big", "small"
  558. #define NB_FONTSTYLE 8
  559. #define PHRASE "em", "strong", "dfn", "code", "samp", "kbd", "var", "cite", "abbr", "acronym"
  560. #define NB_PHRASE 10
  561. #define SPECIAL "a", "img", "applet", "embed", "object", "font", "basefont", "br", "script", "map", "q", "sub", "sup", "span", "bdo", "iframe"
  562. #define NB_SPECIAL 16
  563. #define INLINE FONTSTYLE, PHRASE, SPECIAL, FORMCTRL
  564. #define NB_INLINE NB_PCDATA + NB_FONTSTYLE + NB_PHRASE + NB_SPECIAL + NB_FORMCTRL
  565. #define BLOCK HEADING, LIST, "pre", "p", "dl", "div", "center", "noscript", "noframes", "blockquote", "form", "isindex", "hr", "table", "fieldset", "address"
  566. #define NB_BLOCK NB_HEADING + NB_LIST + 14
  567. #define FORMCTRL "input", "select", "textarea", "label", "button"
  568. #define NB_FORMCTRL 5
  569. #define PCDATA
  570. #define NB_PCDATA 0
  571. #define HEADING "h1", "h2", "h3", "h4", "h5", "h6"
  572. #define NB_HEADING 6
  573. #define LIST "ul", "ol", "dir", "menu"
  574. #define NB_LIST 4
  575. #define MODIFIER
  576. #define NB_MODIFIER 0
  577. #define FLOW BLOCK,INLINE
  578. #define NB_FLOW NB_BLOCK + NB_INLINE
  579. #define EMPTY NULL
  580. static const char* const html_flow[] = { FLOW, NULL } ;
  581. static const char* const html_inline[] = { INLINE, NULL } ;
  582. /* placeholders: elts with content but no subelements */
  583. static const char* const html_pcdata[] = { NULL } ;
  584. #define html_cdata html_pcdata
  585. /* ... and for HTML Attributes */
  586. #define COREATTRS "id", "class", "style", "title"
  587. #define NB_COREATTRS 4
  588. #define I18N "lang", "dir"
  589. #define NB_I18N 2
  590. #define EVENTS "onclick", "ondblclick", "onmousedown", "onmouseup", "onmouseover", "onmouseout", "onkeypress", "onkeydown", "onkeyup"
  591. #define NB_EVENTS 9
  592. #define ATTRS COREATTRS,I18N,EVENTS
  /* Number of entries in ATTRS (COREATTRS + I18N + EVENTS). */
  #define NB_ATTRS (NB_COREATTRS + NB_I18N + NB_EVENTS)
  594. #define CELLHALIGN "align", "char", "charoff"
  595. #define NB_CELLHALIGN 3
  596. #define CELLVALIGN "valign"
  597. #define NB_CELLVALIGN 1
  598. static const char* const html_attrs[] = { ATTRS, NULL } ;
  599. static const char* const core_i18n_attrs[] = { COREATTRS, I18N, NULL } ;
  600. static const char* const core_attrs[] = { COREATTRS, NULL } ;
  601. static const char* const i18n_attrs[] = { I18N, NULL } ;
  602. /* Other declarations that should go inline ... */
  603. static const char* const a_attrs[] = { ATTRS, "charset", "type", "name",
  604. "href", "hreflang", "rel", "rev", "accesskey", "shape", "coords",
  605. "tabindex", "onfocus", "onblur", NULL } ;
  606. static const char* const target_attr[] = { "target", NULL } ;
  607. static const char* const rows_cols_attr[] = { "rows", "cols", NULL } ;
  608. static const char* const alt_attr[] = { "alt", NULL } ;
  609. static const char* const src_alt_attrs[] = { "src", "alt", NULL } ;
  610. static const char* const href_attrs[] = { "href", NULL } ;
  611. static const char* const clear_attrs[] = { "clear", NULL } ;
  612. static const char* const inline_p[] = { INLINE, "p", NULL } ;
  613. static const char* const flow_param[] = { FLOW, "param", NULL } ;
  614. static const char* const applet_attrs[] = { COREATTRS , "codebase",
  615. "archive", "alt", "name", "height", "width", "align",
  616. "hspace", "vspace", NULL } ;
  617. static const char* const area_attrs[] = { "shape", "coords", "href", "nohref",
  618. "tabindex", "accesskey", "onfocus", "onblur", NULL } ;
  619. static const char* const basefont_attrs[] =
  620. { "id", "size", "color", "face", NULL } ;
  621. static const char* const quote_attrs[] = { ATTRS, "cite", NULL } ;
  622. static const char* const body_contents[] = { FLOW, "ins", "del", NULL } ;
  623. static const char* const body_attrs[] = { ATTRS, "onload", "onunload", NULL } ;
  624. static const char* const body_depr[] = { "background", "bgcolor", "text",
  625. "link", "vlink", "alink", NULL } ;
  626. static const char* const button_attrs[] = { ATTRS, "name", "value", "type",
  627. "disabled", "tabindex", "accesskey", "onfocus", "onblur", NULL } ;
  628. static const char* const col_attrs[] = { ATTRS, "span", "width", CELLHALIGN, CELLVALIGN, NULL } ;
  629. static const char* const col_elt[] = { "col", NULL } ;
  630. static const char* const edit_attrs[] = { ATTRS, "datetime", "cite", NULL } ;
  631. static const char* const compact_attrs[] = { ATTRS, "compact", NULL } ;
  632. static const char* const dl_contents[] = { "dt", "dd", NULL } ;
  633. static const char* const compact_attr[] = { "compact", NULL } ;
  634. static const char* const label_attr[] = { "label", NULL } ;
  635. static const char* const fieldset_contents[] = { FLOW, "legend" } ;
  636. static const char* const font_attrs[] = { COREATTRS, I18N, "size", "color", "face" , NULL } ;
  637. static const char* const form_contents[] = { HEADING, LIST, INLINE, "pre", "p", "div", "center", "noscript", "noframes", "blockquote", "isindex", "hr", "table", "fieldset", "address", NULL } ;
  638. static const char* const form_attrs[] = { ATTRS, "method", "enctype", "accept", "name", "onsubmit", "onreset", "accept-charset", NULL } ;
  639. static const char* const frame_attrs[] = { COREATTRS, "longdesc", "name", "src", "frameborder", "marginwidth", "marginheight", "noresize", "scrolling" , NULL } ;
  640. static const char* const frameset_attrs[] = { COREATTRS, "rows", "cols", "onload", "onunload", NULL } ;
  641. static const char* const frameset_contents[] = { "frameset", "frame", "noframes", NULL } ;
  642. static const char* const head_attrs[] = { I18N, "profile", NULL } ;
  643. static const char* const head_contents[] = { "title", "isindex", "base", "script", "style", "meta", "link", "object", NULL } ;
  644. static const char* const hr_depr[] = { "align", "noshade", "size", "width", NULL } ;
  645. static const char* const version_attr[] = { "version", NULL } ;
  646. static const char* const html_content[] = { "head", "body", "frameset", NULL } ;
  647. static const char* const iframe_attrs[] = { COREATTRS, "longdesc", "name", "src", "frameborder", "marginwidth", "marginheight", "scrolling", "align", "height", "width", NULL } ;
  648. static const char* const img_attrs[] = { ATTRS, "longdesc", "name", "height", "width", "usemap", "ismap", NULL } ;
  649. static const char* const embed_attrs[] = { COREATTRS, "align", "alt", "border", "code", "codebase", "frameborder", "height", "hidden", "hspace", "name", "palette", "pluginspace", "pluginurl", "src", "type", "units", "vspace", "width", NULL } ;
  650. static const char* const input_attrs[] = { ATTRS, "type", "name", "value", "checked", "disabled", "readonly", "size", "maxlength", "src", "alt", "usemap", "ismap", "tabindex", "accesskey", "onfocus", "onblur", "onselect", "onchange", "accept", NULL } ;
  651. static const char* const prompt_attrs[] = { COREATTRS, I18N, "prompt", NULL } ;
  652. static const char* const label_attrs[] = { ATTRS, "for", "accesskey", "onfocus", "onblur", NULL } ;
  653. static const char* const legend_attrs[] = { ATTRS, "accesskey", NULL } ;
  654. static const char* const align_attr[] = { "align", NULL } ;
  655. static const char* const link_attrs[] = { ATTRS, "charset", "href", "hreflang", "type", "rel", "rev", "media", NULL } ;
  656. static const char* const map_contents[] = { BLOCK, "area", NULL } ;
  657. static const char* const name_attr[] = { "name", NULL } ;
  658. static const char* const action_attr[] = { "action", NULL } ;
  659. static const char* const blockli_elt[] = { BLOCK, "li", NULL } ;
  660. static const char* const meta_attrs[] = { I18N, "http-equiv", "name", "scheme", NULL } ;
  661. static const char* const content_attr[] = { "content", NULL } ;
  662. static const char* const type_attr[] = { "type", NULL } ;
  663. static const char* const noframes_content[] = { "body", FLOW MODIFIER, NULL } ;
  664. static const char* const object_contents[] = { FLOW, "param", NULL } ;
  665. static const char* const object_attrs[] = { ATTRS, "declare", "classid", "codebase", "data", "type", "codetype", "archive", "standby", "height", "width", "usemap", "name", "tabindex", NULL } ;
  666. static const char* const object_depr[] = { "align", "border", "hspace", "vspace", NULL } ;
  667. static const char* const ol_attrs[] = { "type", "compact", "start", NULL} ;
  668. static const char* const option_elt[] = { "option", NULL } ;
  669. static const char* const optgroup_attrs[] = { ATTRS, "disabled", NULL } ;
  670. static const char* const option_attrs[] = { ATTRS, "disabled", "label", "selected", "value", NULL } ;
  671. static const char* const param_attrs[] = { "id", "value", "valuetype", "type", NULL } ;
  672. static const char* const width_attr[] = { "width", NULL } ;
  673. static const char* const pre_content[] = { PHRASE, "tt", "i", "b", "u", "s", "strike", "a", "br", "script", "map", "q", "span", "bdo", "iframe", NULL } ;
  674. static const char* const script_attrs[] = { "charset", "src", "defer", "event", "for", NULL } ;
  675. static const char* const language_attr[] = { "language", NULL } ;
  676. static const char* const select_content[] = { "optgroup", "option", NULL } ;
  677. static const char* const select_attrs[] = { ATTRS, "name", "size", "multiple", "disabled", "tabindex", "onfocus", "onblur", "onchange", NULL } ;
  678. static const char* const style_attrs[] = { I18N, "media", "title", NULL } ;
  679. static const char* const table_attrs[] = { ATTRS, "summary", "width", "border", "frame", "rules", "cellspacing", "cellpadding", "datapagesize", NULL } ;
  680. static const char* const table_depr[] = { "align", "bgcolor", NULL } ;
  681. static const char* const table_contents[] = { "caption", "col", "colgroup", "thead", "tfoot", "tbody", "tr", NULL} ;
  682. static const char* const tr_elt[] = { "tr", NULL } ;
  683. static const char* const talign_attrs[] = { ATTRS, CELLHALIGN, CELLVALIGN, NULL} ;
  684. static const char* const th_td_depr[] = { "nowrap", "bgcolor", "width", "height", NULL } ;
  685. static const char* const th_td_attr[] = { ATTRS, "abbr", "axis", "headers", "scope", "rowspan", "colspan", CELLHALIGN, CELLVALIGN, NULL } ;
  686. static const char* const textarea_attrs[] = { ATTRS, "name", "disabled", "readonly", "tabindex", "accesskey", "onfocus", "onblur", "onselect", "onchange", NULL } ;
  687. static const char* const tr_contents[] = { "th", "td", NULL } ;
  688. static const char* const bgcolor_attr[] = { "bgcolor", NULL } ;
  689. static const char* const li_elt[] = { "li", NULL } ;
  690. static const char* const ul_depr[] = { "type", "compact", NULL} ;
  691. static const char* const dir_attr[] = { "dir", NULL} ;
  692. #define DECL (const char**)
  /*
   * Descriptor table for the HTML 4.0 elements known to the parser, kept
   * in alphabetical order of element name. htmlTagLookup() scans it
   * linearly with a case-insensitive name comparison.
   *
   * Each record is laid out positionally as: the element name, seven small
   * integer fields, a short human readable description, then five
   * pointers: a content list (or EMPTY), a string naming an element (or
   * NULL), and three attribute lists (each possibly NULL).
   *
   * NOTE(review): the meaning of the seven integers and the exact role of
   * each pointer come from the htmlElemDesc declaration, which is not
   * visible from here -- confirm against the public HTMLparser header
   * before editing an entry. EMPTY and MODIFIER are macros defined
   * elsewhere in this file.
   */
  693. static const htmlElemDesc
  694. html40ElementTable[] = {
  695. { "a", 0, 0, 0, 0, 0, 0, 1, "anchor ",
  696. DECL html_inline , NULL , DECL a_attrs , DECL target_attr, NULL
  697. },
  698. { "abbr", 0, 0, 0, 0, 0, 0, 1, "abbreviated form",
  699. DECL html_inline , NULL , DECL html_attrs, NULL, NULL
  700. },
  701. { "acronym", 0, 0, 0, 0, 0, 0, 1, "",
  702. DECL html_inline , NULL , DECL html_attrs, NULL, NULL
  703. },
  704. { "address", 0, 0, 0, 0, 0, 0, 0, "information on author ",
  705. DECL inline_p , NULL , DECL html_attrs, NULL, NULL
  706. },
  707. { "applet", 0, 0, 0, 0, 1, 1, 2, "java applet ",
  708. DECL flow_param , NULL , NULL , DECL applet_attrs, NULL
  709. },
  710. { "area", 0, 2, 2, 1, 0, 0, 0, "client-side image map area ",
  711. EMPTY , NULL , DECL area_attrs , DECL target_attr, DECL alt_attr
  712. },
  713. { "b", 0, 3, 0, 0, 0, 0, 1, "bold text style",
  714. DECL html_inline , NULL , DECL html_attrs, NULL, NULL
  715. },
  716. { "base", 0, 2, 2, 1, 0, 0, 0, "document base uri ",
  717. EMPTY , NULL , NULL , DECL target_attr, DECL href_attrs
  718. },
  719. { "basefont", 0, 2, 2, 1, 1, 1, 1, "base font size " ,
  720. EMPTY , NULL , NULL, DECL basefont_attrs, NULL
  721. },
  722. { "bdo", 0, 0, 0, 0, 0, 0, 1, "i18n bidi over-ride ",
  723. DECL html_inline , NULL , DECL core_i18n_attrs, NULL, DECL dir_attr
  724. },
  725. { "big", 0, 3, 0, 0, 0, 0, 1, "large text style",
  726. DECL html_inline , NULL , DECL html_attrs, NULL, NULL
  727. },
  728. { "blockquote", 0, 0, 0, 0, 0, 0, 0, "long quotation ",
  729. DECL html_flow , NULL , DECL quote_attrs , NULL, NULL
  730. },
  731. { "body", 1, 1, 0, 0, 0, 0, 0, "document body ",
  732. DECL body_contents , "div" , DECL body_attrs, DECL body_depr, NULL
  733. },
  734. { "br", 0, 2, 2, 1, 0, 0, 1, "forced line break ",
  735. EMPTY , NULL , DECL core_attrs, DECL clear_attrs , NULL
  736. },
  737. { "button", 0, 0, 0, 0, 0, 0, 2, "push button ",
  738. DECL html_flow MODIFIER , NULL , DECL button_attrs, NULL, NULL
  739. },
  740. { "caption", 0, 0, 0, 0, 0, 0, 0, "table caption ",
  741. DECL html_inline , NULL , DECL html_attrs, NULL, NULL
  742. },
  743. { "center", 0, 3, 0, 0, 1, 1, 0, "shorthand for div align=center ",
  744. DECL html_flow , NULL , NULL, DECL html_attrs, NULL
  745. },
  746. { "cite", 0, 0, 0, 0, 0, 0, 1, "citation",
  747. DECL html_inline , NULL , DECL html_attrs, NULL, NULL
  748. },
  749. { "code", 0, 0, 0, 0, 0, 0, 1, "computer code fragment",
  750. DECL html_inline , NULL , DECL html_attrs, NULL, NULL
  751. },
  752. { "col", 0, 2, 2, 1, 0, 0, 0, "table column ",
  753. EMPTY , NULL , DECL col_attrs , NULL, NULL
  754. },
  755. { "colgroup", 0, 1, 0, 0, 0, 0, 0, "table column group ",
  756. DECL col_elt , "col" , DECL col_attrs , NULL, NULL
  757. },
  758. { "dd", 0, 1, 0, 0, 0, 0, 0, "definition description ",
  759. DECL html_flow , NULL , DECL html_attrs, NULL, NULL
  760. },
  761. { "del", 0, 0, 0, 0, 0, 0, 2, "deleted text ",
  762. DECL html_flow , NULL , DECL edit_attrs , NULL, NULL
  763. },
  764. { "dfn", 0, 0, 0, 0, 0, 0, 1, "instance definition",
  765. DECL html_inline , NULL , DECL html_attrs, NULL, NULL
  766. },
  767. { "dir", 0, 0, 0, 0, 1, 1, 0, "directory list",
  768. DECL blockli_elt, "li" , NULL, DECL compact_attrs, NULL
  769. },
  770. { "div", 0, 0, 0, 0, 0, 0, 0, "generic language/style container",
  771. DECL html_flow, NULL, DECL html_attrs, DECL align_attr, NULL
  772. },
  773. { "dl", 0, 0, 0, 0, 0, 0, 0, "definition list ",
  774. DECL dl_contents , "dd" , DECL html_attrs, DECL compact_attr, NULL
  775. },
  776. { "dt", 0, 1, 0, 0, 0, 0, 0, "definition term ",
  777. DECL html_inline, NULL, DECL html_attrs, NULL, NULL
  778. },
  779. { "em", 0, 3, 0, 0, 0, 0, 1, "emphasis",
  780. DECL html_inline, NULL, DECL html_attrs, NULL, NULL
  781. },
  782. { "embed", 0, 1, 0, 0, 1, 1, 1, "generic embedded object ",
  783. EMPTY, NULL, DECL embed_attrs, NULL, NULL
  784. },
  785. { "fieldset", 0, 0, 0, 0, 0, 0, 0, "form control group ",
  786. DECL fieldset_contents , NULL, DECL html_attrs, NULL, NULL
  787. },
  788. { "font", 0, 3, 0, 0, 1, 1, 1, "local change to font ",
  789. DECL html_inline, NULL, NULL, DECL font_attrs, NULL
  790. },
  791. { "form", 0, 0, 0, 0, 0, 0, 0, "interactive form ",
  792. DECL form_contents, "fieldset", DECL form_attrs , DECL target_attr, DECL action_attr
  793. },
  794. { "frame", 0, 2, 2, 1, 0, 2, 0, "subwindow " ,
  795. EMPTY, NULL, NULL, DECL frame_attrs, NULL
  796. },
  797. { "frameset", 0, 0, 0, 0, 0, 2, 0, "window subdivision" ,
  798. DECL frameset_contents, "noframes" , NULL , DECL frameset_attrs, NULL
  799. },
  800. { "h1", 0, 0, 0, 0, 0, 0, 0, "heading ",
  801. DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
  802. },
  803. { "h2", 0, 0, 0, 0, 0, 0, 0, "heading ",
  804. DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
  805. },
  806. { "h3", 0, 0, 0, 0, 0, 0, 0, "heading ",
  807. DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
  808. },
  809. { "h4", 0, 0, 0, 0, 0, 0, 0, "heading ",
  810. DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
  811. },
  812. { "h5", 0, 0, 0, 0, 0, 0, 0, "heading ",
  813. DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
  814. },
  815. { "h6", 0, 0, 0, 0, 0, 0, 0, "heading ",
  816. DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
  817. },
  818. { "head", 1, 1, 0, 0, 0, 0, 0, "document head ",
  819. DECL head_contents, NULL, DECL head_attrs, NULL, NULL
  820. },
  821. { "hr", 0, 2, 2, 1, 0, 0, 0, "horizontal rule " ,
  822. EMPTY, NULL, DECL html_attrs, DECL hr_depr, NULL
  823. },
  824. { "html", 1, 1, 0, 0, 0, 0, 0, "document root element ",
  825. DECL html_content , NULL , DECL i18n_attrs, DECL version_attr, NULL
  826. },
  827. { "i", 0, 3, 0, 0, 0, 0, 1, "italic text style",
  828. DECL html_inline, NULL, DECL html_attrs, NULL, NULL
  829. },
  830. { "iframe", 0, 0, 0, 0, 0, 1, 2, "inline subwindow ",
  831. DECL html_flow, NULL, NULL, DECL iframe_attrs, NULL
  832. },
  833. { "img", 0, 2, 2, 1, 0, 0, 1, "embedded image ",
  834. EMPTY, NULL, DECL img_attrs, DECL align_attr, DECL src_alt_attrs
  835. },
  836. { "input", 0, 2, 2, 1, 0, 0, 1, "form control ",
  837. EMPTY, NULL, DECL input_attrs , DECL align_attr, NULL
  838. },
  839. { "ins", 0, 0, 0, 0, 0, 0, 2, "inserted text",
  840. DECL html_flow, NULL, DECL edit_attrs, NULL, NULL
  841. },
  842. { "isindex", 0, 2, 2, 1, 1, 1, 0, "single line prompt ",
  843. EMPTY, NULL, NULL, DECL prompt_attrs, NULL
  844. },
  845. { "kbd", 0, 0, 0, 0, 0, 0, 1, "text to be entered by the user",
  846. DECL html_inline, NULL, DECL html_attrs, NULL, NULL
  847. },
  848. { "label", 0, 0, 0, 0, 0, 0, 1, "form field label text ",
  849. DECL html_inline MODIFIER, NULL, DECL label_attrs , NULL, NULL
  850. },
  851. { "legend", 0, 0, 0, 0, 0, 0, 0, "fieldset legend ",
  852. DECL html_inline, NULL, DECL legend_attrs , DECL align_attr, NULL
  853. },
  854. { "li", 0, 1, 1, 0, 0, 0, 0, "list item ",
  855. DECL html_flow, NULL, DECL html_attrs, NULL, NULL
  856. },
  857. { "link", 0, 2, 2, 1, 0, 0, 0, "a media-independent link ",
  858. EMPTY, NULL, DECL link_attrs, DECL target_attr, NULL
  859. },
  860. { "map", 0, 0, 0, 0, 0, 0, 2, "client-side image map ",
  861. DECL map_contents , NULL, DECL html_attrs , NULL, DECL name_attr
  862. },
  863. { "menu", 0, 0, 0, 0, 1, 1, 0, "menu list ",
  864. DECL blockli_elt , NULL, NULL, DECL compact_attrs, NULL
  865. },
  866. { "meta", 0, 2, 2, 1, 0, 0, 0, "generic metainformation ",
  867. EMPTY, NULL, DECL meta_attrs , NULL , DECL content_attr
  868. },
  869. { "noframes", 0, 0, 0, 0, 0, 2, 0, "alternate content container for non frame-based rendering ",
  870. DECL noframes_content, "body" , DECL html_attrs, NULL, NULL
  871. },
  872. { "noscript", 0, 0, 0, 0, 0, 0, 0, "alternate content container for non script-based rendering ",
  873. DECL html_flow, "div", DECL html_attrs, NULL, NULL
  874. },
  875. { "object", 0, 0, 0, 0, 0, 0, 2, "generic embedded object ",
  876. DECL object_contents , "div" , DECL object_attrs, DECL object_depr, NULL
  877. },
  878. { "ol", 0, 0, 0, 0, 0, 0, 0, "ordered list ",
  879. DECL li_elt , "li" , DECL html_attrs, DECL ol_attrs, NULL
  880. },
  881. { "optgroup", 0, 0, 0, 0, 0, 0, 0, "option group ",
  882. DECL option_elt , "option", DECL optgroup_attrs, NULL, DECL label_attr
  883. },
  884. { "option", 0, 1, 0, 0, 0, 0, 0, "selectable choice " ,
  885. DECL html_pcdata, NULL, DECL option_attrs, NULL, NULL
  886. },
  887. { "p", 0, 1, 0, 0, 0, 0, 0, "paragraph ",
  888. DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
  889. },
  890. { "param", 0, 2, 2, 1, 0, 0, 0, "named property value ",
  891. EMPTY, NULL, DECL param_attrs, NULL, DECL name_attr
  892. },
  893. { "pre", 0, 0, 0, 0, 0, 0, 0, "preformatted text ",
  894. DECL pre_content, NULL, DECL html_attrs, DECL width_attr, NULL
  895. },
  896. { "q", 0, 0, 0, 0, 0, 0, 1, "short inline quotation ",
  897. DECL html_inline, NULL, DECL quote_attrs, NULL, NULL
  898. },
  899. { "s", 0, 3, 0, 0, 1, 1, 1, "strike-through text style",
  900. DECL html_inline, NULL, NULL, DECL html_attrs, NULL
  901. },
  902. { "samp", 0, 0, 0, 0, 0, 0, 1, "sample program output, scripts, etc.",
  903. DECL html_inline, NULL, DECL html_attrs, NULL, NULL
  904. },
  905. { "script", 0, 0, 0, 0, 0, 0, 2, "script statements ",
  906. DECL html_cdata, NULL, DECL script_attrs, DECL language_attr, DECL type_attr
  907. },
  908. { "select", 0, 0, 0, 0, 0, 0, 1, "option selector ",
  909. DECL select_content, NULL, DECL select_attrs, NULL, NULL
  910. },
  911. { "small", 0, 3, 0, 0, 0, 0, 1, "small text style",
  912. DECL html_inline, NULL, DECL html_attrs, NULL, NULL
  913. },
  914. { "span", 0, 0, 0, 0, 0, 0, 1, "generic language/style container ",
  915. DECL html_inline, NULL, DECL html_attrs, NULL, NULL
  916. },
  917. { "strike", 0, 3, 0, 0, 1, 1, 1, "strike-through text",
  918. DECL html_inline, NULL, NULL, DECL html_attrs, NULL
  919. },
  920. { "strong", 0, 3, 0, 0, 0, 0, 1, "strong emphasis",
  921. DECL html_inline, NULL, DECL html_attrs, NULL, NULL
  922. },
  923. { "style", 0, 0, 0, 0, 0, 0, 0, "style info ",
  924. DECL html_cdata, NULL, DECL style_attrs, NULL, DECL type_attr
  925. },
  926. { "sub", 0, 3, 0, 0, 0, 0, 1, "subscript",
  927. DECL html_inline, NULL, DECL html_attrs, NULL, NULL
  928. },
  929. { "sup", 0, 3, 0, 0, 0, 0, 1, "superscript ",
  930. DECL html_inline, NULL, DECL html_attrs, NULL, NULL
  931. },
  932. { "table", 0, 0, 0, 0, 0, 0, 0, "",
  933. DECL table_contents , "tr" , DECL table_attrs , DECL table_depr, NULL
  934. },
  935. { "tbody", 1, 0, 0, 0, 0, 0, 0, "table body ",
  936. DECL tr_elt , "tr" , DECL talign_attrs, NULL, NULL
  937. },
  938. { "td", 0, 0, 0, 0, 0, 0, 0, "table data cell",
  939. DECL html_flow, NULL, DECL th_td_attr, DECL th_td_depr, NULL
  940. },
  941. { "textarea", 0, 0, 0, 0, 0, 0, 1, "multi-line text field ",
  942. DECL html_pcdata, NULL, DECL textarea_attrs, NULL, DECL rows_cols_attr
  943. },
  944. { "tfoot", 0, 1, 0, 0, 0, 0, 0, "table footer ",
  945. DECL tr_elt , "tr" , DECL talign_attrs, NULL, NULL
  946. },
  947. { "th", 0, 1, 0, 0, 0, 0, 0, "table header cell",
  948. DECL html_flow, NULL, DECL th_td_attr, DECL th_td_depr, NULL
  949. },
  950. { "thead", 0, 1, 0, 0, 0, 0, 0, "table header ",
  951. DECL tr_elt , "tr" , DECL talign_attrs, NULL, NULL
  952. },
  953. { "title", 0, 0, 0, 0, 0, 0, 0, "document title ",
  954. DECL html_pcdata, NULL, DECL i18n_attrs, NULL, NULL
  955. },
  956. { "tr", 0, 0, 0, 0, 0, 0, 0, "table row ",
  957. DECL tr_contents , "td" , DECL talign_attrs, DECL bgcolor_attr, NULL
  958. },
  959. { "tt", 0, 3, 0, 0, 0, 0, 1, "teletype or monospaced text style",
  960. DECL html_inline, NULL, DECL html_attrs, NULL, NULL
  961. },
  962. { "u", 0, 3, 0, 0, 1, 1, 1, "underlined text style",
  963. DECL html_inline, NULL, NULL, DECL html_attrs, NULL
  964. },
  965. { "ul", 0, 0, 0, 0, 0, 0, 0, "unordered list ",
  966. DECL li_elt , "li" , DECL html_attrs, DECL ul_depr, NULL
  967. },
  968. { "var", 0, 0, 0, 0, 0, 0, 1, "instance of a variable or program argument",
  969. DECL html_inline, NULL, DECL html_attrs, NULL, NULL
  970. }
  971. };
  972. /*
  973. * start tags that imply the end of current element
  974. */
  /*
   * Layout: a flat sequence of NULL-terminated groups, closed by one extra
   * NULL. The first string of a group is the name of a start tag; the
   * strings after it, up to the group's NULL, name the open elements that
   * this start tag implicitly closes (see htmlCheckAutoClose()).
   *
   * htmlInitAutoClose() records the address of the first string of each
   * group in htmlStartCloseIndex[]; it indexes at most 99 groups, so any
   * group beyond that would never be consulted.
   *
   * FONTSTYLE is a macro defined elsewhere in this file.
   */
  975. static const char * const htmlStartClose[] = {
  976. "form", "form", "p", "hr", "h1", "h2", "h3", "h4", "h5", "h6",
  977. "dl", "ul", "ol", "menu", "dir", "address", "pre",
  978. "listing", "xmp", "head", NULL,
  979. "head", "p", NULL,
  980. "title", "p", NULL,
  981. "body", "head", "style", "link", "title", "p", NULL,
  982. "frameset", "head", "style", "link", "title", "p", NULL,
  983. "li", "p", "h1", "h2", "h3", "h4", "h5", "h6", "dl", "address",
  984. "pre", "listing", "xmp", "head", "li", NULL,
  985. "hr", "p", "head", NULL,
  986. "h1", "p", "head", NULL,
  987. "h2", "p", "head", NULL,
  988. "h3", "p", "head", NULL,
  989. "h4", "p", "head", NULL,
  990. "h5", "p", "head", NULL,
  991. "h6", "p", "head", NULL,
  992. "dir", "p", "head", NULL,
  993. "address", "p", "head", "ul", NULL,
  994. "pre", "p", "head", "ul", NULL,
  995. "listing", "p", "head", NULL,
  996. "xmp", "p", "head", NULL,
  997. "blockquote", "p", "head", NULL,
  998. "dl", "p", "dt", "menu", "dir", "address", "pre", "listing",
  999. "xmp", "head", NULL,
  1000. "dt", "p", "menu", "dir", "address", "pre", "listing", "xmp",
  1001. "head", "dd", NULL,
  1002. "dd", "p", "menu", "dir", "address", "pre", "listing", "xmp",
  1003. "head", "dt", NULL,
  1004. "ul", "p", "head", "ol", "menu", "dir", "address", "pre",
  1005. "listing", "xmp", NULL,
  1006. "ol", "p", "head", "ul", NULL,
  1007. "menu", "p", "head", "ul", NULL,
  1008. "p", "p", "head", "h1", "h2", "h3", "h4", "h5", "h6", FONTSTYLE, NULL,
  1009. "div", "p", "head", NULL,
  1010. "noscript", "p", "head", NULL,
  1011. "center", "font", "b", "i", "p", "head", NULL,
  1012. "a", "a", NULL,
  1013. "caption", "p", NULL,
  1014. "colgroup", "caption", "colgroup", "col", "p", NULL,
  1015. "col", "caption", "col", "p", NULL,
  1016. "table", "p", "head", "h1", "h2", "h3", "h4", "h5", "h6", "pre",
  1017. "listing", "xmp", "a", NULL,
  1018. "th", "th", "td", "p", "span", "font", "a", "b", "i", "u", NULL,
  1019. "td", "th", "td", "p", "span", "font", "a", "b", "i", "u", NULL,
  1020. "tr", "th", "td", "tr", "caption", "col", "colgroup", "p", NULL,
  1021. "thead", "caption", "col", "colgroup", NULL,
  1022. "tfoot", "th", "td", "tr", "caption", "col", "colgroup", "thead",
  1023. "tbody", "p", NULL,
  1024. "tbody", "th", "td", "tr", "caption", "col", "colgroup", "thead",
  1025. "tfoot", "tbody", "p", NULL,
  1026. "optgroup", "option", NULL,
  1027. "option", "option", NULL,
  1028. "fieldset", "legend", "p", "head", "h1", "h2", "h3", "h4", "h5", "h6",
  1029. "pre", "listing", "xmp", "a", NULL,
  1030. NULL
  1031. };
  /*
   * Elements which are not supposed to hold character data directly:
   * htmlCheckParagraph() implies a p element when one of them is the
   * current element. The list is NULL-terminated.
   *
   * TODO: extend that list by reading the HTML SGML DTD on
   *       implied paragraph
   */
  static const char *const htmlNoContentElements[] = { "html", "head", NULL };
  /*
   * The list of HTML attributes which are of content %Script;
   * (the HTML 4 intrinsic event attributes). The array has no NULL
   * terminator: htmlIsScriptAttribute() walks it using sizeof.
   *
   * NOTE: when adding ones, check htmlIsScriptAttribute() since
   *       it assumes the name starts with 'on'
   */
  static const char *const htmlScriptAttributes[] = {
      "onclick",
      "ondblclick",
      "onmousedown",
      "onmouseup",
      "onmouseover",
      "onmousemove",
      "onmouseout",
      "onkeypress",
      "onkeydown",
      "onkeyup",
      "onload",
      "onunload",
      "onfocus",
      "onblur",
      "onsubmit",
      "onreset",    /* was misspelled "onrest", so onreset never matched */
      "onchange",
      "onselect"
  };
  /*
   * End-tag priorities, used to recover from broken HTML pages.
   * When an unexpected end tag shows up, the parser compares priorities to
   * decide what to do with it: an end tag may only close elements whose
   * priority is lower than or equal to its own.
   *
   * The table ends with an entry whose name is NULL; that entry carries
   * the default priority returned for every element not listed.
   */
  typedef struct {
      const char *name;
      int priority;
  } elementPriority;

  static const elementPriority htmlEndPriority[] = {
      { "div",   150 },
      { "td",    160 },
      { "th",    160 },
      { "tr",    170 },
      { "thead", 180 },
      { "tbody", 180 },
      { "tfoot", 180 },
      { "table", 190 },
      { "head",  200 },
      { "body",  200 },
      { "html",  220 },
      { NULL,    100 }   /* Default priority */
  };
  /*
   * Lookup index over htmlStartClose[]: each used slot points at the first
   * string of one group; unused slots are NULL. Filled in once by
   * htmlInitAutoClose(), which sets the flag below when done.
   */
  static const char **htmlStartCloseIndex[100];
  static int htmlStartCloseIndexinitialized = 0;
  1096. /************************************************************************
  1097. * *
  1098. * functions to handle HTML specific data *
  1099. * *
  1100. ************************************************************************/
  1101. /**
  1102. * htmlInitAutoClose:
  1103. *
  1104. * Initialize the htmlStartCloseIndex for fast lookup of closing tags names.
  1105. * This is not reentrant. Call xmlInitParser() once before processing in
  1106. * case of use in multithreaded programs.
  1107. */
  1108. void
  1109. htmlInitAutoClose(void) {
  1110. int indx, i = 0;
  1111. if (htmlStartCloseIndexinitialized) return;
  1112. for (indx = 0;indx < 100;indx ++) htmlStartCloseIndex[indx] = NULL;
  1113. indx = 0;
  1114. while ((htmlStartClose[i] != NULL) && (indx < 100 - 1)) {
  1115. htmlStartCloseIndex[indx++] = (const char**) &htmlStartClose[i];
  1116. while (htmlStartClose[i] != NULL) i++;
  1117. i++;
  1118. }
  1119. htmlStartCloseIndexinitialized = 1;
  1120. }
  1121. /**
  1122. * htmlTagLookup:
  1123. * @tag: The tag name in lowercase
  1124. *
  1125. * Lookup the HTML tag in the ElementTable
  1126. *
  1127. * Returns the related htmlElemDescPtr or NULL if not found.
  1128. */
  1129. const htmlElemDesc *
  1130. htmlTagLookup(const xmlChar *tag) {
  1131. unsigned int i;
  1132. for (i = 0; i < (sizeof(html40ElementTable) /
  1133. sizeof(html40ElementTable[0]));i++) {
  1134. if (!xmlStrcasecmp(tag, BAD_CAST html40ElementTable[i].name))
  1135. return((htmlElemDescPtr) &html40ElementTable[i]);
  1136. }
  1137. return(NULL);
  1138. }
  1139. /**
  1140. * htmlGetEndPriority:
  1141. * @name: The name of the element to look up the priority for.
  1142. *
  1143. * Return value: The "endtag" priority.
  1144. **/
  1145. static int
  1146. htmlGetEndPriority (const xmlChar *name) {
  1147. int i = 0;
  1148. while ((htmlEndPriority[i].name != NULL) &&
  1149. (!xmlStrEqual((const xmlChar *)htmlEndPriority[i].name, name)))
  1150. i++;
  1151. return(htmlEndPriority[i].priority);
  1152. }
  1153. /**
  1154. * htmlCheckAutoClose:
  1155. * @newtag: The new tag name
  1156. * @oldtag: The old tag name
  1157. *
  1158. * Checks whether the new tag is one of the registered valid tags for
  1159. * closing old.
  1160. * Initialize the htmlStartCloseIndex for fast lookup of closing tags names.
  1161. *
  1162. * Returns 0 if no, 1 if yes.
  1163. */
  1164. static int
  1165. htmlCheckAutoClose(const xmlChar * newtag, const xmlChar * oldtag)
  1166. {
  1167. int i, indx;
  1168. const char **closed = NULL;
  1169. if (htmlStartCloseIndexinitialized == 0)
  1170. htmlInitAutoClose();
  1171. /* inefficient, but not a big deal */
  1172. for (indx = 0; indx < 100; indx++) {
  1173. closed = htmlStartCloseIndex[indx];
  1174. if (closed == NULL)
  1175. return (0);
  1176. if (xmlStrEqual(BAD_CAST * closed, newtag))
  1177. break;
  1178. }
  1179. i = closed - htmlStartClose;
  1180. i++;
  1181. while (htmlStartClose[i] != NULL) {
  1182. if (xmlStrEqual(BAD_CAST htmlStartClose[i], oldtag)) {
  1183. return (1);
  1184. }
  1185. i++;
  1186. }
  1187. return (0);
  1188. }
  1189. /**
  1190. * htmlAutoCloseOnClose:
  1191. * @ctxt: an HTML parser context
  1192. * @newtag: The new tag name
  1193. * @force: force the tag closure
  1194. *
  1195. * The HTML DTD allows an ending tag to implicitly close other tags.
  1196. */
  1197. static void
  1198. htmlAutoCloseOnClose(htmlParserCtxtPtr ctxt, const xmlChar * newtag)
  1199. {
  1200. const htmlElemDesc *info;
  1201. int i, priority;
  1202. priority = htmlGetEndPriority(newtag);
  1203. for (i = (ctxt->nameNr - 1); i >= 0; i--) {
  1204. if (xmlStrEqual(newtag, ctxt->nameTab[i]))
  1205. break;
  1206. /*
  1207. * A missplaced endtag can only close elements with lower
  1208. * or equal priority, so if we find an element with higher
  1209. * priority before we find an element with
  1210. * matching name, we just ignore this endtag
  1211. */
  1212. if (htmlGetEndPriority(ctxt->nameTab[i]) > priority)
  1213. return;
  1214. }
  1215. if (i < 0)
  1216. return;
  1217. while (!xmlStrEqual(newtag, ctxt->name)) {
  1218. info = htmlTagLookup(ctxt->name);
  1219. if ((info != NULL) && (info->endTag == 3)) {
  1220. htmlParseErr(ctxt, XML_ERR_TAG_NAME_MISMATCH,
  1221. "Opening and ending tag mismatch: %s and %s\n",
  1222. newtag, ctxt->name);
  1223. }
  1224. if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
  1225. ctxt->sax->endElement(ctxt->userData, ctxt->name);
  1226. htmlnamePop(ctxt);
  1227. }
  1228. }
  1229. /**
  1230. * htmlAutoCloseOnEnd:
  1231. * @ctxt: an HTML parser context
  1232. *
  1233. * Close all remaining tags at the end of the stream
  1234. */
  1235. static void
  1236. htmlAutoCloseOnEnd(htmlParserCtxtPtr ctxt)
  1237. {
  1238. int i;
  1239. if (ctxt->nameNr == 0)
  1240. return;
  1241. for (i = (ctxt->nameNr - 1); i >= 0; i--) {
  1242. if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
  1243. ctxt->sax->endElement(ctxt->userData, ctxt->name);
  1244. htmlnamePop(ctxt);
  1245. }
  1246. }
  1247. /**
  1248. * htmlAutoClose:
  1249. * @ctxt: an HTML parser context
  1250. * @newtag: The new tag name or NULL
  1251. *
  1252. * The HTML DTD allows a tag to implicitly close other tags.
  1253. * The list is kept in htmlStartClose array. This function is
  1254. * called when a new tag has been detected and generates the
  1255. * appropriates closes if possible/needed.
  1256. * If newtag is NULL this mean we are at the end of the resource
  1257. * and we should check
  1258. */
  1259. static void
  1260. htmlAutoClose(htmlParserCtxtPtr ctxt, const xmlChar * newtag)
  1261. {
  1262. while ((newtag != NULL) && (ctxt->name != NULL) &&
  1263. (htmlCheckAutoClose(newtag, ctxt->name))) {
  1264. if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
  1265. ctxt->sax->endElement(ctxt->userData, ctxt->name);
  1266. htmlnamePop(ctxt);
  1267. }
  1268. if (newtag == NULL) {
  1269. htmlAutoCloseOnEnd(ctxt);
  1270. return;
  1271. }
  1272. while ((newtag == NULL) && (ctxt->name != NULL) &&
  1273. ((xmlStrEqual(ctxt->name, BAD_CAST "head")) ||
  1274. (xmlStrEqual(ctxt->name, BAD_CAST "body")) ||
  1275. (xmlStrEqual(ctxt->name, BAD_CAST "html")))) {
  1276. if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
  1277. ctxt->sax->endElement(ctxt->userData, ctxt->name);
  1278. htmlnamePop(ctxt);
  1279. }
  1280. }
  1281. /**
  1282. * htmlAutoCloseTag:
  1283. * @doc: the HTML document
  1284. * @name: The tag name
  1285. * @elem: the HTML element
  1286. *
  1287. * The HTML DTD allows a tag to implicitly close other tags.
  1288. * The list is kept in htmlStartClose array. This function checks
  1289. * if the element or one of it's children would autoclose the
  1290. * given tag.
  1291. *
  1292. * Returns 1 if autoclose, 0 otherwise
  1293. */
  1294. int
  1295. htmlAutoCloseTag(htmlDocPtr doc, const xmlChar *name, htmlNodePtr elem) {
  1296. htmlNodePtr child;
  1297. if (elem == NULL) return(1);
  1298. if (xmlStrEqual(name, elem->name)) return(0);
  1299. if (htmlCheckAutoClose(elem->name, name)) return(1);
  1300. child = elem->children;
  1301. while (child != NULL) {
  1302. if (htmlAutoCloseTag(doc, name, child)) return(1);
  1303. child = child->next;
  1304. }
  1305. return(0);
  1306. }
  1307. /**
  1308. * htmlIsAutoClosed:
  1309. * @doc: the HTML document
  1310. * @elem: the HTML element
  1311. *
  1312. * The HTML DTD allows a tag to implicitly close other tags.
  1313. * The list is kept in htmlStartClose array. This function checks
  1314. * if a tag is autoclosed by one of it's child
  1315. *
  1316. * Returns 1 if autoclosed, 0 otherwise
  1317. */
  1318. int
  1319. htmlIsAutoClosed(htmlDocPtr doc, htmlNodePtr elem) {
  1320. htmlNodePtr child;
  1321. if (elem == NULL) return(1);
  1322. child = elem->children;
  1323. while (child != NULL) {
  1324. if (htmlAutoCloseTag(doc, elem->name, child)) return(1);
  1325. child = child->next;
  1326. }
  1327. return(0);
  1328. }
  1329. /**
  1330. * htmlCheckImplied:
  1331. * @ctxt: an HTML parser context
  1332. * @newtag: The new tag name
  1333. *
  1334. * The HTML DTD allows a tag to exists only implicitly
  1335. * called when a new tag has been detected and generates the
  1336. * appropriates implicit tags if missing
  1337. */
  1338. static void
  1339. htmlCheckImplied(htmlParserCtxtPtr ctxt, const xmlChar *newtag) {
  1340. int i;
  1341. if (ctxt->options & HTML_PARSE_NOIMPLIED)
  1342. return;
  1343. if (!htmlOmittedDefaultValue)
  1344. return;
  1345. if (xmlStrEqual(newtag, BAD_CAST"html"))
  1346. return;
  1347. if (ctxt->nameNr <= 0) {
  1348. htmlnamePush(ctxt, BAD_CAST"html");
  1349. if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
  1350. ctxt->sax->startElement(ctxt->userData, BAD_CAST"html", NULL);
  1351. }
  1352. if ((xmlStrEqual(newtag, BAD_CAST"body")) || (xmlStrEqual(newtag, BAD_CAST"head")))
  1353. return;
  1354. if ((ctxt->nameNr <= 1) &&
  1355. ((xmlStrEqual(newtag, BAD_CAST"script")) ||
  1356. (xmlStrEqual(newtag, BAD_CAST"style")) ||
  1357. (xmlStrEqual(newtag, BAD_CAST"meta")) ||
  1358. (xmlStrEqual(newtag, BAD_CAST"link")) ||
  1359. (xmlStrEqual(newtag, BAD_CAST"title")) ||
  1360. (xmlStrEqual(newtag, BAD_CAST"base")))) {
  1361. if (ctxt->html >= 3) {
  1362. /* we already saw or generated an <head> before */
  1363. return;
  1364. }
  1365. /*
  1366. * dropped OBJECT ... i you put it first BODY will be
  1367. * assumed !
  1368. */
  1369. htmlnamePush(ctxt, BAD_CAST"head");
  1370. if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
  1371. ctxt->sax->startElement(ctxt->userData, BAD_CAST"head", NULL);
  1372. } else if ((!xmlStrEqual(newtag, BAD_CAST"noframes")) &&
  1373. (!xmlStrEqual(newtag, BAD_CAST"frame")) &&
  1374. (!xmlStrEqual(newtag, BAD_CAST"frameset"))) {
  1375. if (ctxt->html >= 10) {
  1376. /* we already saw or generated a <body> before */
  1377. return;
  1378. }
  1379. for (i = 0;i < ctxt->nameNr;i++) {
  1380. if (xmlStrEqual(ctxt->nameTab[i], BAD_CAST"body")) {
  1381. return;
  1382. }
  1383. if (xmlStrEqual(ctxt->nameTab[i], BAD_CAST"head")) {
  1384. return;
  1385. }
  1386. }
  1387. htmlnamePush(ctxt, BAD_CAST"body");
  1388. if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
  1389. ctxt->sax->startElement(ctxt->userData, BAD_CAST"body", NULL);
  1390. }
  1391. }
  1392. /**
  1393. * htmlCheckParagraph
  1394. * @ctxt: an HTML parser context
  1395. *
  1396. * Check whether a p element need to be implied before inserting
  1397. * characters in the current element.
  1398. *
  1399. * Returns 1 if a paragraph has been inserted, 0 if not and -1
  1400. * in case of error.
  1401. */
  1402. static int
  1403. htmlCheckParagraph(htmlParserCtxtPtr ctxt) {
  1404. const xmlChar *tag;
  1405. int i;
  1406. if (ctxt == NULL)
  1407. return(-1);
  1408. tag = ctxt->name;
  1409. if (tag == NULL) {
  1410. htmlAutoClose(ctxt, BAD_CAST"p");
  1411. htmlCheckImplied(ctxt, BAD_CAST"p");
  1412. htmlnamePush(ctxt, BAD_CAST"p");
  1413. if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
  1414. ctxt->sax->startElement(ctxt->userData, BAD_CAST"p", NULL);
  1415. return(1);
  1416. }
  1417. if (!htmlOmittedDefaultValue)
  1418. return(0);
  1419. for (i = 0; htmlNoContentElements[i] != NULL; i++) {
  1420. if (xmlStrEqual(tag, BAD_CAST htmlNoContentElements[i])) {
  1421. htmlAutoClose(ctxt, BAD_CAST"p");
  1422. htmlCheckImplied(ctxt, BAD_CAST"p");
  1423. htmlnamePush(ctxt, BAD_CAST"p");
  1424. if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
  1425. ctxt->sax->startElement(ctxt->userData, BAD_CAST"p", NULL);
  1426. return(1);
  1427. }
  1428. }
  1429. return(0);
  1430. }
  1431. /**
  1432. * htmlIsScriptAttribute:
  1433. * @name: an attribute name
  1434. *
  1435. * Check if an attribute is of content type Script
  1436. *
  1437. * Returns 1 is the attribute is a script 0 otherwise
  1438. */
  1439. int
  1440. htmlIsScriptAttribute(const xmlChar *name) {
  1441. unsigned int i;
  1442. if (name == NULL)
  1443. return(0);
  1444. /*
  1445. * all script attributes start with 'on'
  1446. */
  1447. if ((name[0] != 'o') || (name[1] != 'n'))
  1448. return(0);
  1449. for (i = 0;
  1450. i < sizeof(htmlScriptAttributes)/sizeof(htmlScriptAttributes[0]);
  1451. i++) {
  1452. if (xmlStrEqual(name, (const xmlChar *) htmlScriptAttributes[i]))
  1453. return(1);
  1454. }
  1455. return(0);
  1456. }
  1457. /************************************************************************
  1458. * *
  1459. * The list of HTML predefined entities *
  1460. * *
  1461. ************************************************************************/
  1462. static const htmlEntityDesc html40EntitiesTable[] = {
  1463. /*
  1464. * the 4 absolute ones, plus apostrophe.
  1465. */
  1466. { 34, "quot", "quotation mark = APL quote, U+0022 ISOnum" },
  1467. { 38, "amp", "ampersand, U+0026 ISOnum" },
  1468. { 39, "apos", "single quote" },
  1469. { 60, "lt", "less-than sign, U+003C ISOnum" },
  1470. { 62, "gt", "greater-than sign, U+003E ISOnum" },
  1471. /*
  1472. * A bunch still in the 128-255 range
  1473. * Replacing them depend really on the charset used.
  1474. */
  1475. { 160, "nbsp", "no-break space = non-breaking space, U+00A0 ISOnum" },
  1476. { 161, "iexcl","inverted exclamation mark, U+00A1 ISOnum" },
  1477. { 162, "cent", "cent sign, U+00A2 ISOnum" },
  1478. { 163, "pound","pound sign, U+00A3 ISOnum" },
  1479. { 164, "curren","currency sign, U+00A4 ISOnum" },
  1480. { 165, "yen", "yen sign = yuan sign, U+00A5 ISOnum" },
  1481. { 166, "brvbar","broken bar = broken vertical bar, U+00A6 ISOnum" },
  1482. { 167, "sect", "section sign, U+00A7 ISOnum" },
  1483. { 168, "uml", "diaeresis = spacing diaeresis, U+00A8 ISOdia" },
  1484. { 169, "copy", "copyright sign, U+00A9 ISOnum" },
  1485. { 170, "ordf", "feminine ordinal indicator, U+00AA ISOnum" },
  1486. { 171, "laquo","left-pointing double angle quotation mark = left pointing guillemet, U+00AB ISOnum" },
  1487. { 172, "not", "not sign, U+00AC ISOnum" },
  1488. { 173, "shy", "soft hyphen = discretionary hyphen, U+00AD ISOnum" },
  1489. { 174, "reg", "registered sign = registered trade mark sign, U+00AE ISOnum" },
  1490. { 175, "macr", "macron = spacing macron = overline = APL overbar, U+00AF ISOdia" },
  1491. { 176, "deg", "degree sign, U+00B0 ISOnum" },
  1492. { 177, "plusmn","plus-minus sign = plus-or-minus sign, U+00B1 ISOnum" },
  1493. { 178, "sup2", "superscript two = superscript digit two = squared, U+00B2 ISOnum" },
  1494. { 179, "sup3", "superscript three = superscript digit three = cubed, U+00B3 ISOnum" },
  1495. { 180, "acute","acute accent = spacing acute, U+00B4 ISOdia" },
  1496. { 181, "micro","micro sign, U+00B5 ISOnum" },
  1497. { 182, "para", "pilcrow sign = paragraph sign, U+00B6 ISOnum" },
  1498. { 183, "middot","middle dot = Georgian comma Greek middle dot, U+00B7 ISOnum" },
  1499. { 184, "cedil","cedilla = spacing cedilla, U+00B8 ISOdia" },
  1500. { 185, "sup1", "superscript one = superscript digit one, U+00B9 ISOnum" },
  1501. { 186, "ordm", "masculine ordinal indicator, U+00BA ISOnum" },
  1502. { 187, "raquo","right-pointing double angle quotation mark right pointing guillemet, U+00BB ISOnum" },
  1503. { 188, "frac14","vulgar fraction one quarter = fraction one quarter, U+00BC ISOnum" },
  1504. { 189, "frac12","vulgar fraction one half = fraction one half, U+00BD ISOnum" },
  1505. { 190, "frac34","vulgar fraction three quarters = fraction three quarters, U+00BE ISOnum" },
  1506. { 191, "iquest","inverted question mark = turned question mark, U+00BF ISOnum" },
  1507. { 192, "Agrave","latin capital letter A with grave = latin capital letter A grave, U+00C0 ISOlat1" },
  1508. { 193, "Aacute","latin capital letter A with acute, U+00C1 ISOlat1" },
  1509. { 194, "Acirc","latin capital letter A with circumflex, U+00C2 ISOlat1" },
  1510. { 195, "Atilde","latin capital letter A with tilde, U+00C3 ISOlat1" },
  1511. { 196, "Auml", "latin capital letter A with diaeresis, U+00C4 ISOlat1" },
  1512. { 197, "Aring","latin capital letter A with ring above = latin capital letter A ring, U+00C5 ISOlat1" },
  1513. { 198, "AElig","latin capital letter AE = latin capital ligature AE, U+00C6 ISOlat1" },
  1514. { 199, "Ccedil","latin capital letter C with cedilla, U+00C7 ISOlat1" },
  1515. { 200, "Egrave","latin capital letter E with grave, U+00C8 ISOlat1" },
  1516. { 201, "Eacute","latin capital letter E with acute, U+00C9 ISOlat1" },
  1517. { 202, "Ecirc","latin capital letter E with circumflex, U+00CA ISOlat1" },
  1518. { 203, "Euml", "latin capital letter E with diaeresis, U+00CB ISOlat1" },
  1519. { 204, "Igrave","latin capital letter I with grave, U+00CC ISOlat1" },
  1520. { 205, "Iacute","latin capital letter I with acute, U+00CD ISOlat1" },
  1521. { 206, "Icirc","latin capital letter I with circumflex, U+00CE ISOlat1" },
  1522. { 207, "Iuml", "latin capital letter I with diaeresis, U+00CF ISOlat1" },
  1523. { 208, "ETH", "latin capital letter ETH, U+00D0 ISOlat1" },
  1524. { 209, "Ntilde","latin capital letter N with tilde, U+00D1 ISOlat1" },
  1525. { 210, "Ograve","latin capital letter O with grave, U+00D2 ISOlat1" },
  1526. { 211, "Oacute","latin capital letter O with acute, U+00D3 ISOlat1" },
  1527. { 212, "Ocirc","latin capital letter O with circumflex, U+00D4 ISOlat1" },
  1528. { 213, "Otilde","latin capital letter O with tilde, U+00D5 ISOlat1" },
  1529. { 214, "Ouml", "latin capital letter O with diaeresis, U+00D6 ISOlat1" },
  1530. { 215, "times","multiplication sign, U+00D7 ISOnum" },
  1531. { 216, "Oslash","latin capital letter O with stroke latin capital letter O slash, U+00D8 ISOlat1" },
  1532. { 217, "Ugrave","latin capital letter U with grave, U+00D9 ISOlat1" },
  1533. { 218, "Uacute","latin capital letter U with acute, U+00DA ISOlat1" },
  1534. { 219, "Ucirc","latin capital letter U with circumflex, U+00DB ISOlat1" },
  1535. { 220, "Uuml", "latin capital letter U with diaeresis, U+00DC ISOlat1" },
  1536. { 221, "Yacute","latin capital letter Y with acute, U+00DD ISOlat1" },
  1537. { 222, "THORN","latin capital letter THORN, U+00DE ISOlat1" },
  1538. { 223, "szlig","latin small letter sharp s = ess-zed, U+00DF ISOlat1" },
  1539. { 224, "agrave","latin small letter a with grave = latin small letter a grave, U+00E0 ISOlat1" },
  1540. { 225, "aacute","latin small letter a with acute, U+00E1 ISOlat1" },
  1541. { 226, "acirc","latin small letter a with circumflex, U+00E2 ISOlat1" },
  1542. { 227, "atilde","latin small letter a with tilde, U+00E3 ISOlat1" },
  1543. { 228, "auml", "latin small letter a with diaeresis, U+00E4 ISOlat1" },
  1544. { 229, "aring","latin small letter a with ring above = latin small letter a ring, U+00E5 ISOlat1" },
  1545. { 230, "aelig","latin small letter ae = latin small ligature ae, U+00E6 ISOlat1" },
  1546. { 231, "ccedil","latin small letter c with cedilla, U+00E7 ISOlat1" },
  1547. { 232, "egrave","latin small letter e with grave, U+00E8 ISOlat1" },
  1548. { 233, "eacute","latin small letter e with acute, U+00E9 ISOlat1" },
  1549. { 234, "ecirc","latin small letter e with circumflex, U+00EA ISOlat1" },
  1550. { 235, "euml", "latin small letter e with diaeresis, U+00EB ISOlat1" },
  1551. { 236, "igrave","latin small letter i with grave, U+00EC ISOlat1" },
  1552. { 237, "iacute","latin small letter i with acute, U+00ED ISOlat1" },
  1553. { 238, "icirc","latin small letter i with circumflex, U+00EE ISOlat1" },
  1554. { 239, "iuml", "latin small letter i with diaeresis, U+00EF ISOlat1" },
  1555. { 240, "eth", "latin small letter eth, U+00F0 ISOlat1" },
  1556. { 241, "ntilde","latin small letter n with tilde, U+00F1 ISOlat1" },
  1557. { 242, "ograve","latin small letter o with grave, U+00F2 ISOlat1" },
  1558. { 243, "oacute","latin small letter o with acute, U+00F3 ISOlat1" },
  1559. { 244, "ocirc","latin small letter o with circumflex, U+00F4 ISOlat1" },
  1560. { 245, "otilde","latin small letter o with tilde, U+00F5 ISOlat1" },
  1561. { 246, "ouml", "latin small letter o with diaeresis, U+00F6 ISOlat1" },
  1562. { 247, "divide","division sign, U+00F7 ISOnum" },
  1563. { 248, "oslash","latin small letter o with stroke, = latin small letter o slash, U+00F8 ISOlat1" },
  1564. { 249, "ugrave","latin small letter u with grave, U+00F9 ISOlat1" },
  1565. { 250, "uacute","latin small letter u with acute, U+00FA ISOlat1" },
  1566. { 251, "ucirc","latin small letter u with circumflex, U+00FB ISOlat1" },
  1567. { 252, "uuml", "latin small letter u with diaeresis, U+00FC ISOlat1" },
  1568. { 253, "yacute","latin small letter y with acute, U+00FD ISOlat1" },
  1569. { 254, "thorn","latin small letter thorn with, U+00FE ISOlat1" },
  1570. { 255, "yuml", "latin small letter y with diaeresis, U+00FF ISOlat1" },
  1571. { 338, "OElig","latin capital ligature OE, U+0152 ISOlat2" },
  1572. { 339, "oelig","latin small ligature oe, U+0153 ISOlat2" },
  1573. { 352, "Scaron","latin capital letter S with caron, U+0160 ISOlat2" },
  1574. { 353, "scaron","latin small letter s with caron, U+0161 ISOlat2" },
  1575. { 376, "Yuml", "latin capital letter Y with diaeresis, U+0178 ISOlat2" },
  1576. /*
  1577. * Anything below should really be kept as entities references
  1578. */
  1579. { 402, "fnof", "latin small f with hook = function = florin, U+0192 ISOtech" },
  1580. { 710, "circ", "modifier letter circumflex accent, U+02C6 ISOpub" },
  1581. { 732, "tilde","small tilde, U+02DC ISOdia" },
  1582. { 913, "Alpha","greek capital letter alpha, U+0391" },
  1583. { 914, "Beta", "greek capital letter beta, U+0392" },
  1584. { 915, "Gamma","greek capital letter gamma, U+0393 ISOgrk3" },
  1585. { 916, "Delta","greek capital letter delta, U+0394 ISOgrk3" },
  1586. { 917, "Epsilon","greek capital letter epsilon, U+0395" },
  1587. { 918, "Zeta", "greek capital letter zeta, U+0396" },
  1588. { 919, "Eta", "greek capital letter eta, U+0397" },
  1589. { 920, "Theta","greek capital letter theta, U+0398 ISOgrk3" },
  1590. { 921, "Iota", "greek capital letter iota, U+0399" },
  1591. { 922, "Kappa","greek capital letter kappa, U+039A" },
  1592. { 923, "Lambda", "greek capital letter lambda, U+039B ISOgrk3" },
  1593. { 924, "Mu", "greek capital letter mu, U+039C" },
  1594. { 925, "Nu", "greek capital letter nu, U+039D" },
  1595. { 926, "Xi", "greek capital letter xi, U+039E ISOgrk3" },
  1596. { 927, "Omicron","greek capital letter omicron, U+039F" },
  1597. { 928, "Pi", "greek capital letter pi, U+03A0 ISOgrk3" },
  1598. { 929, "Rho", "greek capital letter rho, U+03A1" },
  1599. { 931, "Sigma","greek capital letter sigma, U+03A3 ISOgrk3" },
  1600. { 932, "Tau", "greek capital letter tau, U+03A4" },
  1601. { 933, "Upsilon","greek capital letter upsilon, U+03A5 ISOgrk3" },
  1602. { 934, "Phi", "greek capital letter phi, U+03A6 ISOgrk3" },
  1603. { 935, "Chi", "greek capital letter chi, U+03A7" },
  1604. { 936, "Psi", "greek capital letter psi, U+03A8 ISOgrk3" },
  1605. { 937, "Omega","greek capital letter omega, U+03A9 ISOgrk3" },
  1606. { 945, "alpha","greek small letter alpha, U+03B1 ISOgrk3" },
  1607. { 946, "beta", "greek small letter beta, U+03B2 ISOgrk3" },
  1608. { 947, "gamma","greek small letter gamma, U+03B3 ISOgrk3" },
  1609. { 948, "delta","greek small letter delta, U+03B4 ISOgrk3" },
  1610. { 949, "epsilon","greek small letter epsilon, U+03B5 ISOgrk3" },
  1611. { 950, "zeta", "greek small letter zeta, U+03B6 ISOgrk3" },
  1612. { 951, "eta", "greek small letter eta, U+03B7 ISOgrk3" },
  1613. { 952, "theta","greek small letter theta, U+03B8 ISOgrk3" },
  1614. { 953, "iota", "greek small letter iota, U+03B9 ISOgrk3" },
  1615. { 954, "kappa","greek small letter kappa, U+03BA ISOgrk3" },
  1616. { 955, "lambda","greek small letter lambda, U+03BB ISOgrk3" },
  1617. { 956, "mu", "greek small letter mu, U+03BC ISOgrk3" },
  1618. { 957, "nu", "greek small letter nu, U+03BD ISOgrk3" },
  1619. { 958, "xi", "greek small letter xi, U+03BE ISOgrk3" },
  1620. { 959, "omicron","greek small letter omicron, U+03BF NEW" },
  1621. { 960, "pi", "greek small letter pi, U+03C0 ISOgrk3" },
  1622. { 961, "rho", "greek small letter rho, U+03C1 ISOgrk3" },
  1623. { 962, "sigmaf","greek small letter final sigma, U+03C2 ISOgrk3" },
  1624. { 963, "sigma","greek small letter sigma, U+03C3 ISOgrk3" },
  1625. { 964, "tau", "greek small letter tau, U+03C4 ISOgrk3" },
  1626. { 965, "upsilon","greek small letter upsilon, U+03C5 ISOgrk3" },
  1627. { 966, "phi", "greek small letter phi, U+03C6 ISOgrk3" },
  1628. { 967, "chi", "greek small letter chi, U+03C7 ISOgrk3" },
  1629. { 968, "psi", "greek small letter psi, U+03C8 ISOgrk3" },
  1630. { 969, "omega","greek small letter omega, U+03C9 ISOgrk3" },
  1631. { 977, "thetasym","greek small letter theta symbol, U+03D1 NEW" },
  1632. { 978, "upsih","greek upsilon with hook symbol, U+03D2 NEW" },
  1633. { 982, "piv", "greek pi symbol, U+03D6 ISOgrk3" },
  1634. { 8194, "ensp", "en space, U+2002 ISOpub" },
  1635. { 8195, "emsp", "em space, U+2003 ISOpub" },
  1636. { 8201, "thinsp","thin space, U+2009 ISOpub" },
  1637. { 8204, "zwnj", "zero width non-joiner, U+200C NEW RFC 2070" },
  1638. { 8205, "zwj", "zero width joiner, U+200D NEW RFC 2070" },
  1639. { 8206, "lrm", "left-to-right mark, U+200E NEW RFC 2070" },
  1640. { 8207, "rlm", "right-to-left mark, U+200F NEW RFC 2070" },
  1641. { 8211, "ndash","en dash, U+2013 ISOpub" },
  1642. { 8212, "mdash","em dash, U+2014 ISOpub" },
  1643. { 8216, "lsquo","left single quotation mark, U+2018 ISOnum" },
  1644. { 8217, "rsquo","right single quotation mark, U+2019 ISOnum" },
  1645. { 8218, "sbquo","single low-9 quotation mark, U+201A NEW" },
  1646. { 8220, "ldquo","left double quotation mark, U+201C ISOnum" },
  1647. { 8221, "rdquo","right double quotation mark, U+201D ISOnum" },
  1648. { 8222, "bdquo","double low-9 quotation mark, U+201E NEW" },
  1649. { 8224, "dagger","dagger, U+2020 ISOpub" },
  1650. { 8225, "Dagger","double dagger, U+2021 ISOpub" },
  1651. { 8226, "bull", "bullet = black small circle, U+2022 ISOpub" },
  1652. { 8230, "hellip","horizontal ellipsis = three dot leader, U+2026 ISOpub" },
  1653. { 8240, "permil","per mille sign, U+2030 ISOtech" },
  1654. { 8242, "prime","prime = minutes = feet, U+2032 ISOtech" },
  1655. { 8243, "Prime","double prime = seconds = inches, U+2033 ISOtech" },
  1656. { 8249, "lsaquo","single left-pointing angle quotation mark, U+2039 ISO proposed" },
  1657. { 8250, "rsaquo","single right-pointing angle quotation mark, U+203A ISO proposed" },
  1658. { 8254, "oline","overline = spacing overscore, U+203E NEW" },
  1659. { 8260, "frasl","fraction slash, U+2044 NEW" },
  1660. { 8364, "euro", "euro sign, U+20AC NEW" },
  1661. { 8465, "image","blackletter capital I = imaginary part, U+2111 ISOamso" },
  1662. { 8472, "weierp","script capital P = power set = Weierstrass p, U+2118 ISOamso" },
  1663. { 8476, "real", "blackletter capital R = real part symbol, U+211C ISOamso" },
  1664. { 8482, "trade","trade mark sign, U+2122 ISOnum" },
  1665. { 8501, "alefsym","alef symbol = first transfinite cardinal, U+2135 NEW" },
  1666. { 8592, "larr", "leftwards arrow, U+2190 ISOnum" },
  1667. { 8593, "uarr", "upwards arrow, U+2191 ISOnum" },
  1668. { 8594, "rarr", "rightwards arrow, U+2192 ISOnum" },
  1669. { 8595, "darr", "downwards arrow, U+2193 ISOnum" },
  1670. { 8596, "harr", "left right arrow, U+2194 ISOamsa" },
  1671. { 8629, "crarr","downwards arrow with corner leftwards = carriage return, U+21B5 NEW" },
  1672. { 8656, "lArr", "leftwards double arrow, U+21D0 ISOtech" },
  1673. { 8657, "uArr", "upwards double arrow, U+21D1 ISOamsa" },
  1674. { 8658, "rArr", "rightwards double arrow, U+21D2 ISOtech" },
  1675. { 8659, "dArr", "downwards double arrow, U+21D3 ISOamsa" },
  1676. { 8660, "hArr", "left right double arrow, U+21D4 ISOamsa" },
  1677. { 8704, "forall","for all, U+2200 ISOtech" },
  1678. { 8706, "part", "partial differential, U+2202 ISOtech" },
  1679. { 8707, "exist","there exists, U+2203 ISOtech" },
  1680. { 8709, "empty","empty set = null set = diameter, U+2205 ISOamso" },
  1681. { 8711, "nabla","nabla = backward difference, U+2207 ISOtech" },
  1682. { 8712, "isin", "element of, U+2208 ISOtech" },
  1683. { 8713, "notin","not an element of, U+2209 ISOtech" },
  1684. { 8715, "ni", "contains as member, U+220B ISOtech" },
  1685. { 8719, "prod", "n-ary product = product sign, U+220F ISOamsb" },
  1686. { 8721, "sum", "n-ary summation, U+2211 ISOamsb" },
  1687. { 8722, "minus","minus sign, U+2212 ISOtech" },
  1688. { 8727, "lowast","asterisk operator, U+2217 ISOtech" },
  1689. { 8730, "radic","square root = radical sign, U+221A ISOtech" },
  1690. { 8733, "prop", "proportional to, U+221D ISOtech" },
  1691. { 8734, "infin","infinity, U+221E ISOtech" },
  1692. { 8736, "ang", "angle, U+2220 ISOamso" },
  1693. { 8743, "and", "logical and = wedge, U+2227 ISOtech" },
  1694. { 8744, "or", "logical or = vee, U+2228 ISOtech" },
  1695. { 8745, "cap", "intersection = cap, U+2229 ISOtech" },
  1696. { 8746, "cup", "union = cup, U+222A ISOtech" },
  1697. { 8747, "int", "integral, U+222B ISOtech" },
  1698. { 8756, "there4","therefore, U+2234 ISOtech" },
  1699. { 8764, "sim", "tilde operator = varies with = similar to, U+223C ISOtech" },
  1700. { 8773, "cong", "approximately equal to, U+2245 ISOtech" },
  1701. { 8776, "asymp","almost equal to = asymptotic to, U+2248 ISOamsr" },
  1702. { 8800, "ne", "not equal to, U+2260 ISOtech" },
  1703. { 8801, "equiv","identical to, U+2261 ISOtech" },
  1704. { 8804, "le", "less-than or equal to, U+2264 ISOtech" },
  1705. { 8805, "ge", "greater-than or equal to, U+2265 ISOtech" },
  1706. { 8834, "sub", "subset of, U+2282 ISOtech" },
  1707. { 8835, "sup", "superset of, U+2283 ISOtech" },
  1708. { 8836, "nsub", "not a subset of, U+2284 ISOamsn" },
  1709. { 8838, "sube", "subset of or equal to, U+2286 ISOtech" },
  1710. { 8839, "supe", "superset of or equal to, U+2287 ISOtech" },
  1711. { 8853, "oplus","circled plus = direct sum, U+2295 ISOamsb" },
  1712. { 8855, "otimes","circled times = vector product, U+2297 ISOamsb" },
  1713. { 8869, "perp", "up tack = orthogonal to = perpendicular, U+22A5 ISOtech" },
  1714. { 8901, "sdot", "dot operator, U+22C5 ISOamsb" },
  1715. { 8968, "lceil","left ceiling = apl upstile, U+2308 ISOamsc" },
  1716. { 8969, "rceil","right ceiling, U+2309 ISOamsc" },
  1717. { 8970, "lfloor","left floor = apl downstile, U+230A ISOamsc" },
  1718. { 8971, "rfloor","right floor, U+230B ISOamsc" },
  1719. { 9001, "lang", "left-pointing angle bracket = bra, U+2329 ISOtech" },
  1720. { 9002, "rang", "right-pointing angle bracket = ket, U+232A ISOtech" },
  1721. { 9674, "loz", "lozenge, U+25CA ISOpub" },
  1722. { 9824, "spades","black spade suit, U+2660 ISOpub" },
  1723. { 9827, "clubs","black club suit = shamrock, U+2663 ISOpub" },
  1724. { 9829, "hearts","black heart suit = valentine, U+2665 ISOpub" },
  1725. { 9830, "diams","black diamond suit, U+2666 ISOpub" },
  1726. };
  1727. /************************************************************************
  1728. * *
  1729. * Commodity functions to handle entities *
  1730. * *
  1731. ************************************************************************/
  /*
   * growBuffer:
   *
   * Macro doubling the capacity of a heap allocated xmlChar buffer.
   *
   * @buffer is the name of the pointer variable; the companion variable
   * <buffer>_size holding its capacity is doubled as well. A variable named
   * ctxt must be in scope where the macro is expanded. When the reallocation
   * fails the error is reported through htmlErrMemory(), the old buffer is
   * released and the enclosing function returns NULL.
   */
  #define growBuffer(buffer) {                                              \
      xmlChar *grown;                                                       \
      buffer##_size += buffer##_size;                                       \
      grown = (xmlChar *) xmlRealloc(buffer,                                \
                                     sizeof(xmlChar) * buffer##_size);      \
      if (grown != NULL) {                                                  \
          buffer = grown;                                                   \
      } else {                                                              \
          htmlErrMemory(ctxt, "growing buffer\n");                          \
          xmlFree(buffer);                                                  \
          return(NULL);                                                     \
      }                                                                     \
  }
  1746. /**
  1747. * htmlEntityLookup:
  1748. * @name: the entity name
  1749. *
  1750. * Lookup the given entity in EntitiesTable
  1751. *
  1752. * TODO: the linear scan is really ugly, an hash table is really needed.
  1753. *
  1754. * Returns the associated htmlEntityDescPtr if found, NULL otherwise.
  1755. */
  1756. const htmlEntityDesc *
  1757. htmlEntityLookup(const xmlChar *name) {
  1758. unsigned int i;
  1759. for (i = 0;i < (sizeof(html40EntitiesTable)/
  1760. sizeof(html40EntitiesTable[0]));i++) {
  1761. if (xmlStrEqual(name, BAD_CAST html40EntitiesTable[i].name)) {
  1762. return((htmlEntityDescPtr) &html40EntitiesTable[i]);
  1763. }
  1764. }
  1765. return(NULL);
  1766. }
  1767. /**
  1768. * htmlEntityValueLookup:
  1769. * @value: the entity's unicode value
  1770. *
  1771. * Lookup the given entity in EntitiesTable
  1772. *
  1773. * TODO: the linear scan is really ugly, an hash table is really needed.
  1774. *
  1775. * Returns the associated htmlEntityDescPtr if found, NULL otherwise.
  1776. */
  1777. const htmlEntityDesc *
  1778. htmlEntityValueLookup(unsigned int value) {
  1779. unsigned int i;
  1780. for (i = 0;i < (sizeof(html40EntitiesTable)/
  1781. sizeof(html40EntitiesTable[0]));i++) {
  1782. if (html40EntitiesTable[i].value >= value) {
  1783. if (html40EntitiesTable[i].value > value)
  1784. break;
  1785. return((htmlEntityDescPtr) &html40EntitiesTable[i]);
  1786. }
  1787. }
  1788. return(NULL);
  1789. }
  1790. /**
  1791. * UTF8ToHtml:
  1792. * @out: a pointer to an array of bytes to store the result
  1793. * @outlen: the length of @out
  1794. * @in: a pointer to an array of UTF-8 chars
  1795. * @inlen: the length of @in
  1796. *
  1797. * Take a block of UTF-8 chars in and try to convert it to an ASCII
  1798. * plus HTML entities block of chars out.
  1799. *
  1800. * Returns 0 if success, -2 if the transcoding fails, or -1 otherwise
  1801. * The value of @inlen after return is the number of octets consumed
  1802. * as the return value is positive, else unpredictable.
  1803. * The value of @outlen after return is the number of octets consumed.
  1804. */
  1805. int
  1806. UTF8ToHtml(unsigned char* out, int *outlen,
  1807. const unsigned char* in, int *inlen) {
  1808. const unsigned char* processed = in;
  1809. const unsigned char* outend;
  1810. const unsigned char* outstart = out;
  1811. const unsigned char* instart = in;
  1812. const unsigned char* inend;
  1813. unsigned int c, d;
  1814. int trailing;
  1815. if ((out == NULL) || (outlen == NULL) || (inlen == NULL)) return(-1);
  1816. if (in == NULL) {
  1817. /*
  1818. * initialization nothing to do
  1819. */
  1820. *outlen = 0;
  1821. *inlen = 0;
  1822. return(0);
  1823. }
  1824. inend = in + (*inlen);
  1825. outend = out + (*outlen);
  1826. while (in < inend) {
  1827. d = *in++;
  1828. if (d < 0x80) { c= d; trailing= 0; }
  1829. else if (d < 0xC0) {
  1830. /* trailing byte in leading position */
  1831. *outlen = out - outstart;
  1832. *inlen = processed - instart;
  1833. return(-2);
  1834. } else if (d < 0xE0) { c= d & 0x1F; trailing= 1; }
  1835. else if (d < 0xF0) { c= d & 0x0F; trailing= 2; }
  1836. else if (d < 0xF8) { c= d & 0x07; trailing= 3; }
  1837. else {
  1838. /* no chance for this in Ascii */
  1839. *outlen = out - outstart;
  1840. *inlen = processed - instart;
  1841. return(-2);
  1842. }
  1843. if (inend - in < trailing) {
  1844. break;
  1845. }
  1846. for ( ; trailing; trailing--) {
  1847. if ((in >= inend) || (((d= *in++) & 0xC0) != 0x80))
  1848. break;
  1849. c <<= 6;
  1850. c |= d & 0x3F;
  1851. }
  1852. /* assertion: c is a single UTF-4 value */
  1853. if (c < 0x80) {
  1854. if (out + 1 >= outend)
  1855. break;
  1856. *out++ = c;
  1857. } else {
  1858. int len;
  1859. const htmlEntityDesc * ent;
  1860. const char *cp;
  1861. char nbuf[16];
  1862. /*
  1863. * Try to lookup a predefined HTML entity for it
  1864. */
  1865. ent = htmlEntityValueLookup(c);
  1866. if (ent == NULL) {
  1867. snprintf(nbuf, sizeof(nbuf), "#%u", c);
  1868. cp = nbuf;
  1869. }
  1870. else
  1871. cp = ent->name;
  1872. len = strlen(cp);
  1873. if (out + 2 + len >= outend)
  1874. break;
  1875. *out++ = '&';
  1876. memcpy(out, cp, len);
  1877. out += len;
  1878. *out++ = ';';
  1879. }
  1880. processed = in;
  1881. }
  1882. *outlen = out - outstart;
  1883. *inlen = processed - instart;
  1884. return(0);
  1885. }
  1886. /**
  1887. * htmlEncodeEntities:
  1888. * @out: a pointer to an array of bytes to store the result
  1889. * @outlen: the length of @out
  1890. * @in: a pointer to an array of UTF-8 chars
  1891. * @inlen: the length of @in
  1892. * @quoteChar: the quote character to escape (' or ") or zero.
  1893. *
  1894. * Take a block of UTF-8 chars in and try to convert it to an ASCII
  1895. * plus HTML entities block of chars out.
  1896. *
  1897. * Returns 0 if success, -2 if the transcoding fails, or -1 otherwise
  1898. * The value of @inlen after return is the number of octets consumed
  1899. * as the return value is positive, else unpredictable.
  1900. * The value of @outlen after return is the number of octets consumed.
  1901. */
  1902. int
  1903. htmlEncodeEntities(unsigned char* out, int *outlen,
  1904. const unsigned char* in, int *inlen, int quoteChar) {
  1905. const unsigned char* processed = in;
  1906. const unsigned char* outend;
  1907. const unsigned char* outstart = out;
  1908. const unsigned char* instart = in;
  1909. const unsigned char* inend;
  1910. unsigned int c, d;
  1911. int trailing;
  1912. if ((out == NULL) || (outlen == NULL) || (inlen == NULL) || (in == NULL))
  1913. return(-1);
  1914. outend = out + (*outlen);
  1915. inend = in + (*inlen);
  1916. while (in < inend) {
  1917. d = *in++;
  1918. if (d < 0x80) { c= d; trailing= 0; }
  1919. else if (d < 0xC0) {
  1920. /* trailing byte in leading position */
  1921. *outlen = out - outstart;
  1922. *inlen = processed - instart;
  1923. return(-2);
  1924. } else if (d < 0xE0) { c= d & 0x1F; trailing= 1; }
  1925. else if (d < 0xF0) { c= d & 0x0F; trailing= 2; }
  1926. else if (d < 0xF8) { c= d & 0x07; trailing= 3; }
  1927. else {
  1928. /* no chance for this in Ascii */
  1929. *outlen = out - outstart;
  1930. *inlen = processed - instart;
  1931. return(-2);
  1932. }
  1933. if (inend - in < trailing)
  1934. break;
  1935. while (trailing--) {
  1936. if (((d= *in++) & 0xC0) != 0x80) {
  1937. *outlen = out - outstart;
  1938. *inlen = processed - instart;
  1939. return(-2);
  1940. }
  1941. c <<= 6;
  1942. c |= d & 0x3F;
  1943. }
  1944. /* assertion: c is a single UTF-4 value */
  1945. if ((c < 0x80) && (c != (unsigned int) quoteChar) &&
  1946. (c != '&') && (c != '<') && (c != '>')) {
  1947. if (out >= outend)
  1948. break;
  1949. *out++ = c;
  1950. } else {
  1951. const htmlEntityDesc * ent;
  1952. const char *cp;
  1953. char nbuf[16];
  1954. int len;
  1955. /*
  1956. * Try to lookup a predefined HTML entity for it
  1957. */
  1958. ent = htmlEntityValueLookup(c);
  1959. if (ent == NULL) {
  1960. snprintf(nbuf, sizeof(nbuf), "#%u", c);
  1961. cp = nbuf;
  1962. }
  1963. else
  1964. cp = ent->name;
  1965. len = strlen(cp);
  1966. if (out + 2 + len > outend)
  1967. break;
  1968. *out++ = '&';
  1969. memcpy(out, cp, len);
  1970. out += len;
  1971. *out++ = ';';
  1972. }
  1973. processed = in;
  1974. }
  1975. *outlen = out - outstart;
  1976. *inlen = processed - instart;
  1977. return(0);
  1978. }
  1979. /************************************************************************
  1980. * *
  1981. * Commodity functions to handle streams *
  1982. * *
  1983. ************************************************************************/
  1984. /**
  1985. * htmlNewInputStream:
  1986. * @ctxt: an HTML parser context
  1987. *
  1988. * Create a new input stream structure
  1989. * Returns the new input stream or NULL
  1990. */
  1991. static htmlParserInputPtr
  1992. htmlNewInputStream(htmlParserCtxtPtr ctxt) {
  1993. htmlParserInputPtr input;
  1994. input = (xmlParserInputPtr) xmlMalloc(sizeof(htmlParserInput));
  1995. if (input == NULL) {
  1996. htmlErrMemory(ctxt, "couldn't allocate a new input stream\n");
  1997. return(NULL);
  1998. }
  1999. memset(input, 0, sizeof(htmlParserInput));
  2000. input->filename = NULL;
  2001. input->directory = NULL;
  2002. input->base = NULL;
  2003. input->cur = NULL;
  2004. input->buf = NULL;
  2005. input->line = 1;
  2006. input->col = 1;
  2007. input->buf = NULL;
  2008. input->free = NULL;
  2009. input->version = NULL;
  2010. input->consumed = 0;
  2011. input->length = 0;
  2012. return(input);
  2013. }
  2014. /************************************************************************
  2015. * *
  2016. * Commodity functions, cleanup needed ? *
  2017. * *
  2018. ************************************************************************/
  /*
   * All tags allowing PCDATA, from the HTML 4.01 loose DTD.
   * NOTE: it might be more appropriate to integrate this information
   *       into the html40ElementTable array, but that would risk a
   *       binary incompatibility.
   */
  static const char *allowPCData[] = {
      "a", "abbr", "acronym", "address", "applet",
      "b", "bdo", "big", "blockquote", "body", "button",
      "caption", "center", "cite", "code",
      "dd", "del", "dfn", "div", "dt",
      "em",
      "font", "form",
      "h1", "h2", "h3", "h4", "h5", "h6",
      "i", "iframe", "ins",
      "kbd",
      "label", "legend", "li",
      "noframes", "noscript",
      "object",
      "p", "pre",
      "q",
      "s", "samp", "small", "span", "strike", "strong",
      "td", "th", "tt",
      "u",
      "var"
  };
  2033. /**
  2034. * areBlanks:
  2035. * @ctxt: an HTML parser context
  2036. * @str: a xmlChar *
  2037. * @len: the size of @str
  2038. *
  2039. * Is this a sequence of blank chars that one can ignore ?
  2040. *
  2041. * Returns 1 if ignorable 0 otherwise.
  2042. */
  2043. static int areBlanks(htmlParserCtxtPtr ctxt, const xmlChar *str, int len) {
  2044. unsigned int i;
  2045. int j;
  2046. xmlNodePtr lastChild;
  2047. xmlDtdPtr dtd;
  2048. for (j = 0;j < len;j++)
  2049. if (!(IS_BLANK_CH(str[j]))) return(0);
  2050. if (CUR == 0) return(1);
  2051. if (CUR != '<') return(0);
  2052. if (ctxt->name == NULL)
  2053. return(1);
  2054. if (xmlStrEqual(ctxt->name, BAD_CAST"html"))
  2055. return(1);
  2056. if (xmlStrEqual(ctxt->name, BAD_CAST"head"))
  2057. return(1);
  2058. /* Only strip CDATA children of the body tag for strict HTML DTDs */
  2059. if (xmlStrEqual(ctxt->name, BAD_CAST "body") && ctxt->myDoc != NULL) {
  2060. dtd = xmlGetIntSubset(ctxt->myDoc);
  2061. if (dtd != NULL && dtd->ExternalID != NULL) {
  2062. if (!xmlStrcasecmp(dtd->ExternalID, BAD_CAST "-//W3C//DTD HTML 4.01//EN") ||
  2063. !xmlStrcasecmp(dtd->ExternalID, BAD_CAST "-//W3C//DTD HTML 4//EN"))
  2064. return(1);
  2065. }
  2066. }
  2067. if (ctxt->node == NULL) return(0);
  2068. lastChild = xmlGetLastChild(ctxt->node);
  2069. while ((lastChild) && (lastChild->type == XML_COMMENT_NODE))
  2070. lastChild = lastChild->prev;
  2071. if (lastChild == NULL) {
  2072. if ((ctxt->node->type != XML_ELEMENT_NODE) &&
  2073. (ctxt->node->content != NULL)) return(0);
  2074. /* keep ws in constructs like ...<b> </b>...
  2075. for all tags "b" allowing PCDATA */
  2076. for ( i = 0; i < sizeof(allowPCData)/sizeof(allowPCData[0]); i++ ) {
  2077. if ( xmlStrEqual(ctxt->name, BAD_CAST allowPCData[i]) ) {
  2078. return(0);
  2079. }
  2080. }
  2081. } else if (xmlNodeIsText(lastChild)) {
  2082. return(0);
  2083. } else {
  2084. /* keep ws in constructs like <p><b>xy</b> <i>z</i><p>
  2085. for all tags "p" allowing PCDATA */
  2086. for ( i = 0; i < sizeof(allowPCData)/sizeof(allowPCData[0]); i++ ) {
  2087. if ( xmlStrEqual(lastChild->name, BAD_CAST allowPCData[i]) ) {
  2088. return(0);
  2089. }
  2090. }
  2091. }
  2092. return(1);
  2093. }
  2094. /**
  2095. * htmlNewDocNoDtD:
  2096. * @URI: URI for the dtd, or NULL
  2097. * @ExternalID: the external ID of the DTD, or NULL
  2098. *
  2099. * Creates a new HTML document without a DTD node if @URI and @ExternalID
  2100. * are NULL
  2101. *
  2102. * Returns a new document, do not initialize the DTD if not provided
  2103. */
  2104. htmlDocPtr
  2105. htmlNewDocNoDtD(const xmlChar *URI, const xmlChar *ExternalID) {
  2106. xmlDocPtr cur;
  2107. /*
  2108. * Allocate a new document and fill the fields.
  2109. */
  2110. cur = (xmlDocPtr) xmlMalloc(sizeof(xmlDoc));
  2111. if (cur == NULL) {
  2112. htmlErrMemory(NULL, "HTML document creation failed\n");
  2113. return(NULL);
  2114. }
  2115. memset(cur, 0, sizeof(xmlDoc));
  2116. cur->type = XML_HTML_DOCUMENT_NODE;
  2117. cur->version = NULL;
  2118. cur->intSubset = NULL;
  2119. cur->doc = cur;
  2120. cur->name = NULL;
  2121. cur->children = NULL;
  2122. cur->extSubset = NULL;
  2123. cur->oldNs = NULL;
  2124. cur->encoding = NULL;
  2125. cur->standalone = 1;
  2126. cur->compression = 0;
  2127. cur->ids = NULL;
  2128. cur->refs = NULL;
  2129. cur->_private = NULL;
  2130. cur->charset = XML_CHAR_ENCODING_UTF8;
  2131. cur->properties = XML_DOC_HTML | XML_DOC_USERBUILT;
  2132. if ((ExternalID != NULL) ||
  2133. (URI != NULL))
  2134. xmlCreateIntSubset(cur, BAD_CAST "html", ExternalID, URI);
  2135. return(cur);
  2136. }
  2137. /**
  2138. * htmlNewDoc:
  2139. * @URI: URI for the dtd, or NULL
  2140. * @ExternalID: the external ID of the DTD, or NULL
  2141. *
  2142. * Creates a new HTML document
  2143. *
  2144. * Returns a new document
  2145. */
  2146. htmlDocPtr
  2147. htmlNewDoc(const xmlChar *URI, const xmlChar *ExternalID) {
  2148. if ((URI == NULL) && (ExternalID == NULL))
  2149. return(htmlNewDocNoDtD(
  2150. BAD_CAST "http://www.w3.org/TR/REC-html40/loose.dtd",
  2151. BAD_CAST "-//W3C//DTD HTML 4.0 Transitional//EN"));
  2152. return(htmlNewDocNoDtD(URI, ExternalID));
  2153. }
  2154. /************************************************************************
  2155. * *
  2156. * The parser itself *
  2157. * Relates to http://www.w3.org/TR/html40 *
  2158. * *
  2159. ************************************************************************/
  2160. /************************************************************************
  2161. * *
  2162. * The parser itself *
  2163. * *
  2164. ************************************************************************/
  2165. static const xmlChar * htmlParseNameComplex(xmlParserCtxtPtr ctxt);
  2166. /**
  2167. * htmlParseHTMLName:
  2168. * @ctxt: an HTML parser context
  2169. *
  2170. * parse an HTML tag or attribute name, note that we convert it to lowercase
  2171. * since HTML names are not case-sensitive.
  2172. *
  2173. * Returns the Tag Name parsed or NULL
  2174. */
  2175. static const xmlChar *
  2176. htmlParseHTMLName(htmlParserCtxtPtr ctxt) {
  2177. int i = 0;
  2178. xmlChar loc[HTML_PARSER_BUFFER_SIZE];
  2179. if (!IS_ASCII_LETTER(CUR) && (CUR != '_') &&
  2180. (CUR != ':') && (CUR != '.')) return(NULL);
  2181. while ((i < HTML_PARSER_BUFFER_SIZE) &&
  2182. ((IS_ASCII_LETTER(CUR)) || (IS_ASCII_DIGIT(CUR)) ||
  2183. (CUR == ':') || (CUR == '-') || (CUR == '_') ||
  2184. (CUR == '.'))) {
  2185. if ((CUR >= 'A') && (CUR <= 'Z')) loc[i] = CUR + 0x20;
  2186. else loc[i] = CUR;
  2187. i++;
  2188. NEXT;
  2189. }
  2190. return(xmlDictLookup(ctxt->dict, loc, i));
  2191. }
  2192. /**
  2193. * htmlParseHTMLName_nonInvasive:
  2194. * @ctxt: an HTML parser context
  2195. *
  2196. * parse an HTML tag or attribute name, note that we convert it to lowercase
  2197. * since HTML names are not case-sensitive, this doesn't consume the data
  2198. * from the stream, it's a look-ahead
  2199. *
  2200. * Returns the Tag Name parsed or NULL
  2201. */
  2202. static const xmlChar *
  2203. htmlParseHTMLName_nonInvasive(htmlParserCtxtPtr ctxt) {
  2204. int i = 0;
  2205. xmlChar loc[HTML_PARSER_BUFFER_SIZE];
  2206. if (!IS_ASCII_LETTER(NXT(1)) && (NXT(1) != '_') &&
  2207. (NXT(1) != ':')) return(NULL);
  2208. while ((i < HTML_PARSER_BUFFER_SIZE) &&
  2209. ((IS_ASCII_LETTER(NXT(1+i))) || (IS_ASCII_DIGIT(NXT(1+i))) ||
  2210. (NXT(1+i) == ':') || (NXT(1+i) == '-') || (NXT(1+i) == '_'))) {
  2211. if ((NXT(1+i) >= 'A') && (NXT(1+i) <= 'Z')) loc[i] = NXT(1+i) + 0x20;
  2212. else loc[i] = NXT(1+i);
  2213. i++;
  2214. }
  2215. return(xmlDictLookup(ctxt->dict, loc, i));
  2216. }
  2217. /**
  2218. * htmlParseName:
  2219. * @ctxt: an HTML parser context
  2220. *
  2221. * parse an HTML name, this routine is case sensitive.
  2222. *
  2223. * Returns the Name parsed or NULL
  2224. */
  2225. static const xmlChar *
  2226. htmlParseName(htmlParserCtxtPtr ctxt) {
  2227. const xmlChar *in;
  2228. const xmlChar *ret;
  2229. int count = 0;
  2230. GROW;
  2231. /*
  2232. * Accelerator for simple ASCII names
  2233. */
  2234. in = ctxt->input->cur;
  2235. if (((*in >= 0x61) && (*in <= 0x7A)) ||
  2236. ((*in >= 0x41) && (*in <= 0x5A)) ||
  2237. (*in == '_') || (*in == ':')) {
  2238. in++;
  2239. while (((*in >= 0x61) && (*in <= 0x7A)) ||
  2240. ((*in >= 0x41) && (*in <= 0x5A)) ||
  2241. ((*in >= 0x30) && (*in <= 0x39)) ||
  2242. (*in == '_') || (*in == '-') ||
  2243. (*in == ':') || (*in == '.'))
  2244. in++;
  2245. if ((*in > 0) && (*in < 0x80)) {
  2246. count = in - ctxt->input->cur;
  2247. ret = xmlDictLookup(ctxt->dict, ctxt->input->cur, count);
  2248. ctxt->input->cur = in;
  2249. ctxt->nbChars += count;
  2250. ctxt->input->col += count;
  2251. return(ret);
  2252. }
  2253. }
  2254. return(htmlParseNameComplex(ctxt));
  2255. }
  2256. static const xmlChar *
  2257. htmlParseNameComplex(xmlParserCtxtPtr ctxt) {
  2258. int len = 0, l;
  2259. int c;
  2260. int count = 0;
  2261. /*
  2262. * Handler for more complex cases
  2263. */
  2264. GROW;
  2265. c = CUR_CHAR(l);
  2266. if ((c == ' ') || (c == '>') || (c == '/') || /* accelerators */
  2267. (!IS_LETTER(c) && (c != '_') &&
  2268. (c != ':'))) {
  2269. return(NULL);
  2270. }
  2271. while ((c != ' ') && (c != '>') && (c != '/') && /* test bigname.xml */
  2272. ((IS_LETTER(c)) || (IS_DIGIT(c)) ||
  2273. (c == '.') || (c == '-') ||
  2274. (c == '_') || (c == ':') ||
  2275. (IS_COMBINING(c)) ||
  2276. (IS_EXTENDER(c)))) {
  2277. if (count++ > 100) {
  2278. count = 0;
  2279. GROW;
  2280. }
  2281. len += l;
  2282. NEXTL(l);
  2283. c = CUR_CHAR(l);
  2284. }
  2285. return(xmlDictLookup(ctxt->dict, ctxt->input->cur - len, len));
  2286. }
  2287. /**
  2288. * htmlParseHTMLAttribute:
  2289. * @ctxt: an HTML parser context
  2290. * @stop: a char stop value
  2291. *
  2292. * parse an HTML attribute value till the stop (quote), if
  2293. * stop is 0 then it stops at the first space
  2294. *
  2295. * Returns the attribute parsed or NULL
  2296. */
  2297. static xmlChar *
  2298. htmlParseHTMLAttribute(htmlParserCtxtPtr ctxt, const xmlChar stop) {
  2299. xmlChar *buffer = NULL;
  2300. int buffer_size = 0;
  2301. xmlChar *out = NULL;
  2302. const xmlChar *name = NULL;
  2303. const xmlChar *cur = NULL;
  2304. const htmlEntityDesc * ent;
  2305. /*
  2306. * allocate a translation buffer.
  2307. */
  2308. buffer_size = HTML_PARSER_BUFFER_SIZE;
  2309. buffer = (xmlChar *) xmlMallocAtomic(buffer_size * sizeof(xmlChar));
  2310. if (buffer == NULL) {
  2311. htmlErrMemory(ctxt, "buffer allocation failed\n");
  2312. return(NULL);
  2313. }
  2314. out = buffer;
  2315. /*
  2316. * Ok loop until we reach one of the ending chars
  2317. */
  2318. while ((CUR != 0) && (CUR != stop)) {
  2319. if ((stop == 0) && (CUR == '>')) break;
  2320. if ((stop == 0) && (IS_BLANK_CH(CUR))) break;
  2321. if (CUR == '&') {
  2322. if (NXT(1) == '#') {
  2323. unsigned int c;
  2324. int bits;
  2325. c = htmlParseCharRef(ctxt);
  2326. if (c < 0x80)
  2327. { *out++ = c; bits= -6; }
  2328. else if (c < 0x800)
  2329. { *out++ =((c >> 6) & 0x1F) | 0xC0; bits= 0; }
  2330. else if (c < 0x10000)
  2331. { *out++ =((c >> 12) & 0x0F) | 0xE0; bits= 6; }
  2332. else
  2333. { *out++ =((c >> 18) & 0x07) | 0xF0; bits= 12; }
  2334. for ( ; bits >= 0; bits-= 6) {
  2335. *out++ = ((c >> bits) & 0x3F) | 0x80;
  2336. }
  2337. if (out - buffer > buffer_size - 100) {
  2338. int indx = out - buffer;
  2339. growBuffer(buffer);
  2340. out = &buffer[indx];
  2341. }
  2342. } else {
  2343. ent = htmlParseEntityRef(ctxt, &name);
  2344. if (name == NULL) {
  2345. *out++ = '&';
  2346. if (out - buffer > buffer_size - 100) {
  2347. int indx = out - buffer;
  2348. growBuffer(buffer);
  2349. out = &buffer[indx];
  2350. }
  2351. } else if (ent == NULL) {
  2352. *out++ = '&';
  2353. cur = name;
  2354. while (*cur != 0) {
  2355. if (out - buffer > buffer_size - 100) {
  2356. int indx = out - buffer;
  2357. growBuffer(buffer);
  2358. out = &buffer[indx];
  2359. }
  2360. *out++ = *cur++;
  2361. }
  2362. } else {
  2363. unsigned int c;
  2364. int bits;
  2365. if (out - buffer > buffer_size - 100) {
  2366. int indx = out - buffer;
  2367. growBuffer(buffer);
  2368. out = &buffer[indx];
  2369. }
  2370. c = ent->value;
  2371. if (c < 0x80)
  2372. { *out++ = c; bits= -6; }
  2373. else if (c < 0x800)
  2374. { *out++ =((c >> 6) & 0x1F) | 0xC0; bits= 0; }
  2375. else if (c < 0x10000)
  2376. { *out++ =((c >> 12) & 0x0F) | 0xE0; bits= 6; }
  2377. else
  2378. { *out++ =((c >> 18) & 0x07) | 0xF0; bits= 12; }
  2379. for ( ; bits >= 0; bits-= 6) {
  2380. *out++ = ((c >> bits) & 0x3F) | 0x80;
  2381. }
  2382. }
  2383. }
  2384. } else {
  2385. unsigned int c;
  2386. int bits, l;
  2387. if (out - buffer > buffer_size - 100) {
  2388. int indx = out - buffer;
  2389. growBuffer(buffer);
  2390. out = &buffer[indx];
  2391. }
  2392. c = CUR_CHAR(l);
  2393. if (c < 0x80)
  2394. { *out++ = c; bits= -6; }
  2395. else if (c < 0x800)
  2396. { *out++ =((c >> 6) & 0x1F) | 0xC0; bits= 0; }
  2397. else if (c < 0x10000)
  2398. { *out++ =((c >> 12) & 0x0F) | 0xE0; bits= 6; }
  2399. else
  2400. { *out++ =((c >> 18) & 0x07) | 0xF0; bits= 12; }
  2401. for ( ; bits >= 0; bits-= 6) {
  2402. *out++ = ((c >> bits) & 0x3F) | 0x80;
  2403. }
  2404. NEXT;
  2405. }
  2406. }
  2407. *out = 0;
  2408. return(buffer);
  2409. }
  2410. /**
  2411. * htmlParseEntityRef:
  2412. * @ctxt: an HTML parser context
  2413. * @str: location to store the entity name
  2414. *
  2415. * parse an HTML ENTITY references
  2416. *
  2417. * [68] EntityRef ::= '&' Name ';'
  2418. *
  2419. * Returns the associated htmlEntityDescPtr if found, or NULL otherwise,
  2420. * if non-NULL *str will have to be freed by the caller.
  2421. */
  2422. const htmlEntityDesc *
  2423. htmlParseEntityRef(htmlParserCtxtPtr ctxt, const xmlChar **str) {
  2424. const xmlChar *name;
  2425. const htmlEntityDesc * ent = NULL;
  2426. if (str != NULL) *str = NULL;
  2427. if ((ctxt == NULL) || (ctxt->input == NULL)) return(NULL);
  2428. if (CUR == '&') {
  2429. NEXT;
  2430. name = htmlParseName(ctxt);
  2431. if (name == NULL) {
  2432. htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED,
  2433. "htmlParseEntityRef: no name\n", NULL, NULL);
  2434. } else {
  2435. GROW;
  2436. if (CUR == ';') {
  2437. if (str != NULL)
  2438. *str = name;
  2439. /*
  2440. * Lookup the entity in the table.
  2441. */
  2442. ent = htmlEntityLookup(name);
  2443. if (ent != NULL) /* OK that's ugly !!! */
  2444. NEXT;
  2445. } else {
  2446. htmlParseErr(ctxt, XML_ERR_ENTITYREF_SEMICOL_MISSING,
  2447. "htmlParseEntityRef: expecting ';'\n",
  2448. NULL, NULL);
  2449. if (str != NULL)
  2450. *str = name;
  2451. }
  2452. }
  2453. }
  2454. return(ent);
  2455. }
  2456. /**
  2457. * htmlParseAttValue:
  2458. * @ctxt: an HTML parser context
  2459. *
  2460. * parse a value for an attribute
  2461. * Note: the parser won't do substitution of entities here, this
  2462. * will be handled later in xmlStringGetNodeList, unless it was
  2463. * asked for ctxt->replaceEntities != 0
  2464. *
  2465. * Returns the AttValue parsed or NULL.
  2466. */
  2467. static xmlChar *
  2468. htmlParseAttValue(htmlParserCtxtPtr ctxt) {
  2469. xmlChar *ret = NULL;
  2470. if (CUR == '"') {
  2471. NEXT;
  2472. ret = htmlParseHTMLAttribute(ctxt, '"');
  2473. if (CUR != '"') {
  2474. htmlParseErr(ctxt, XML_ERR_ATTRIBUTE_NOT_FINISHED,
  2475. "AttValue: \" expected\n", NULL, NULL);
  2476. } else
  2477. NEXT;
  2478. } else if (CUR == '\'') {
  2479. NEXT;
  2480. ret = htmlParseHTMLAttribute(ctxt, '\'');
  2481. if (CUR != '\'') {
  2482. htmlParseErr(ctxt, XML_ERR_ATTRIBUTE_NOT_FINISHED,
  2483. "AttValue: ' expected\n", NULL, NULL);
  2484. } else
  2485. NEXT;
  2486. } else {
  2487. /*
  2488. * That's an HTMLism, the attribute value may not be quoted
  2489. */
  2490. ret = htmlParseHTMLAttribute(ctxt, 0);
  2491. if (ret == NULL) {
  2492. htmlParseErr(ctxt, XML_ERR_ATTRIBUTE_WITHOUT_VALUE,
  2493. "AttValue: no value found\n", NULL, NULL);
  2494. }
  2495. }
  2496. return(ret);
  2497. }
  2498. /**
  2499. * htmlParseSystemLiteral:
  2500. * @ctxt: an HTML parser context
  2501. *
  2502. * parse an HTML Literal
  2503. *
  2504. * [11] SystemLiteral ::= ('"' [^"]* '"') | ("'" [^']* "'")
  2505. *
  2506. * Returns the SystemLiteral parsed or NULL
  2507. */
  2508. static xmlChar *
  2509. htmlParseSystemLiteral(htmlParserCtxtPtr ctxt) {
  2510. const xmlChar *q;
  2511. xmlChar *ret = NULL;
  2512. if (CUR == '"') {
  2513. NEXT;
  2514. q = CUR_PTR;
  2515. while ((IS_CHAR_CH(CUR)) && (CUR != '"'))
  2516. NEXT;
  2517. if (!IS_CHAR_CH(CUR)) {
  2518. htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_FINISHED,
  2519. "Unfinished SystemLiteral\n", NULL, NULL);
  2520. } else {
  2521. ret = xmlStrndup(q, CUR_PTR - q);
  2522. NEXT;
  2523. }
  2524. } else if (CUR == '\'') {
  2525. NEXT;
  2526. q = CUR_PTR;
  2527. while ((IS_CHAR_CH(CUR)) && (CUR != '\''))
  2528. NEXT;
  2529. if (!IS_CHAR_CH(CUR)) {
  2530. htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_FINISHED,
  2531. "Unfinished SystemLiteral\n", NULL, NULL);
  2532. } else {
  2533. ret = xmlStrndup(q, CUR_PTR - q);
  2534. NEXT;
  2535. }
  2536. } else {
  2537. htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_STARTED,
  2538. " or ' expected\n", NULL, NULL);
  2539. }
  2540. return(ret);
  2541. }
  2542. /**
  2543. * htmlParsePubidLiteral:
  2544. * @ctxt: an HTML parser context
  2545. *
  2546. * parse an HTML public literal
  2547. *
  2548. * [12] PubidLiteral ::= '"' PubidChar* '"' | "'" (PubidChar - "'")* "'"
  2549. *
  2550. * Returns the PubidLiteral parsed or NULL.
  2551. */
  2552. static xmlChar *
  2553. htmlParsePubidLiteral(htmlParserCtxtPtr ctxt) {
  2554. const xmlChar *q;
  2555. xmlChar *ret = NULL;
  2556. /*
  2557. * Name ::= (Letter | '_') (NameChar)*
  2558. */
  2559. if (CUR == '"') {
  2560. NEXT;
  2561. q = CUR_PTR;
  2562. while (IS_PUBIDCHAR_CH(CUR)) NEXT;
  2563. if (CUR != '"') {
  2564. htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_FINISHED,
  2565. "Unfinished PubidLiteral\n", NULL, NULL);
  2566. } else {
  2567. ret = xmlStrndup(q, CUR_PTR - q);
  2568. NEXT;
  2569. }
  2570. } else if (CUR == '\'') {
  2571. NEXT;
  2572. q = CUR_PTR;
  2573. while ((IS_PUBIDCHAR_CH(CUR)) && (CUR != '\''))
  2574. NEXT;
  2575. if (CUR != '\'') {
  2576. htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_FINISHED,
  2577. "Unfinished PubidLiteral\n", NULL, NULL);
  2578. } else {
  2579. ret = xmlStrndup(q, CUR_PTR - q);
  2580. NEXT;
  2581. }
  2582. } else {
  2583. htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_STARTED,
  2584. "PubidLiteral \" or ' expected\n", NULL, NULL);
  2585. }
  2586. return(ret);
  2587. }
  2588. /**
  2589. * htmlParseScript:
  2590. * @ctxt: an HTML parser context
  2591. *
  2592. * parse the content of an HTML SCRIPT or STYLE element
  2593. * http://www.w3.org/TR/html4/sgml/dtd.html#Script
  2594. * http://www.w3.org/TR/html4/sgml/dtd.html#StyleSheet
  2595. * http://www.w3.org/TR/html4/types.html#type-script
  2596. * http://www.w3.org/TR/html4/types.html#h-6.15
  2597. * http://www.w3.org/TR/html4/appendix/notes.html#h-B.3.2.1
  2598. *
  2599. * Script data ( %Script; in the DTD) can be the content of the SCRIPT
  2600. * element and the value of intrinsic event attributes. User agents must
  2601. * not evaluate script data as HTML markup but instead must pass it on as
  2602. * data to a script engine.
  2603. * NOTES:
  2604. * - The content is passed like CDATA
  2605. * - the attributes for style and scripting "onXXX" are also described
  2606. * as CDATA but SGML allows entities references in attributes so their
  2607. * processing is identical as other attributes
  2608. */
  2609. static void
  2610. htmlParseScript(htmlParserCtxtPtr ctxt) {
  2611. xmlChar buf[HTML_PARSER_BIG_BUFFER_SIZE + 5];
  2612. int nbchar = 0;
  2613. int cur,l;
  2614. SHRINK;
  2615. cur = CUR_CHAR(l);
  2616. while (IS_CHAR_CH(cur)) {
  2617. if ((cur == '<') && (NXT(1) == '/')) {
  2618. /*
  2619. * One should break here, the specification is clear:
  2620. * Authors should therefore escape "</" within the content.
  2621. * Escape mechanisms are specific to each scripting or
  2622. * style sheet language.
  2623. *
  2624. * In recovery mode, only break if end tag match the
  2625. * current tag, effectively ignoring all tags inside the
  2626. * script/style block and treating the entire block as
  2627. * CDATA.
  2628. */
  2629. if (ctxt->recovery) {
  2630. if (xmlStrncasecmp(ctxt->name, ctxt->input->cur+2,
  2631. xmlStrlen(ctxt->name)) == 0)
  2632. {
  2633. break; /* while */
  2634. } else {
  2635. htmlParseErr(ctxt, XML_ERR_TAG_NAME_MISMATCH,
  2636. "Element %s embeds close tag\n",
  2637. ctxt->name, NULL);
  2638. }
  2639. } else {
  2640. if (((NXT(2) >= 'A') && (NXT(2) <= 'Z')) ||
  2641. ((NXT(2) >= 'a') && (NXT(2) <= 'z')))
  2642. {
  2643. break; /* while */
  2644. }
  2645. }
  2646. }
  2647. COPY_BUF(l,buf,nbchar,cur);
  2648. if (nbchar >= HTML_PARSER_BIG_BUFFER_SIZE) {
  2649. if (ctxt->sax->cdataBlock!= NULL) {
  2650. /*
  2651. * Insert as CDATA, which is the same as HTML_PRESERVE_NODE
  2652. */
  2653. ctxt->sax->cdataBlock(ctxt->userData, buf, nbchar);
  2654. } else if (ctxt->sax->characters != NULL) {
  2655. ctxt->sax->characters(ctxt->userData, buf, nbchar);
  2656. }
  2657. nbchar = 0;
  2658. }
  2659. GROW;
  2660. NEXTL(l);
  2661. cur = CUR_CHAR(l);
  2662. }
  2663. if ((!(IS_CHAR_CH(cur))) && (!((cur == 0) && (ctxt->progressive)))) {
  2664. htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
  2665. "Invalid char in CDATA 0x%X\n", cur);
  2666. NEXT;
  2667. }
  2668. if ((nbchar != 0) && (ctxt->sax != NULL) && (!ctxt->disableSAX)) {
  2669. if (ctxt->sax->cdataBlock!= NULL) {
  2670. /*
  2671. * Insert as CDATA, which is the same as HTML_PRESERVE_NODE
  2672. */
  2673. ctxt->sax->cdataBlock(ctxt->userData, buf, nbchar);
  2674. } else if (ctxt->sax->characters != NULL) {
  2675. ctxt->sax->characters(ctxt->userData, buf, nbchar);
  2676. }
  2677. }
  2678. }
  2679. /**
  2680. * htmlParseCharData:
  2681. * @ctxt: an HTML parser context
  2682. *
  2683. * parse a CharData section.
  2684. * if we are within a CDATA section ']]>' marks an end of section.
  2685. *
  2686. * [14] CharData ::= [^<&]* - ([^<&]* ']]>' [^<&]*)
  2687. */
  2688. static void
  2689. htmlParseCharData(htmlParserCtxtPtr ctxt) {
  2690. xmlChar buf[HTML_PARSER_BIG_BUFFER_SIZE + 5];
  2691. int nbchar = 0;
  2692. int cur, l;
  2693. int chunk = 0;
  2694. SHRINK;
  2695. cur = CUR_CHAR(l);
  2696. while (((cur != '<') || (ctxt->token == '<')) &&
  2697. ((cur != '&') || (ctxt->token == '&')) &&
  2698. (cur != 0)) {
  2699. if (!(IS_CHAR(cur))) {
  2700. htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
  2701. "Invalid char in CDATA 0x%X\n", cur);
  2702. } else {
  2703. COPY_BUF(l,buf,nbchar,cur);
  2704. }
  2705. if (nbchar >= HTML_PARSER_BIG_BUFFER_SIZE) {
  2706. /*
  2707. * Ok the segment is to be consumed as chars.
  2708. */
  2709. if ((ctxt->sax != NULL) && (!ctxt->disableSAX)) {
  2710. if (areBlanks(ctxt, buf, nbchar)) {
  2711. if (ctxt->sax->ignorableWhitespace != NULL)
  2712. ctxt->sax->ignorableWhitespace(ctxt->userData,
  2713. buf, nbchar);
  2714. } else {
  2715. htmlCheckParagraph(ctxt);
  2716. if (ctxt->sax->characters != NULL)
  2717. ctxt->sax->characters(ctxt->userData, buf, nbchar);
  2718. }
  2719. }
  2720. nbchar = 0;
  2721. }
  2722. NEXTL(l);
  2723. chunk++;
  2724. if (chunk > HTML_PARSER_BUFFER_SIZE) {
  2725. chunk = 0;
  2726. SHRINK;
  2727. GROW;
  2728. }
  2729. cur = CUR_CHAR(l);
  2730. if (cur == 0) {
  2731. SHRINK;
  2732. GROW;
  2733. cur = CUR_CHAR(l);
  2734. }
  2735. }
  2736. if (nbchar != 0) {
  2737. buf[nbchar] = 0;
  2738. /*
  2739. * Ok the segment is to be consumed as chars.
  2740. */
  2741. if ((ctxt->sax != NULL) && (!ctxt->disableSAX)) {
  2742. if (areBlanks(ctxt, buf, nbchar)) {
  2743. if (ctxt->sax->ignorableWhitespace != NULL)
  2744. ctxt->sax->ignorableWhitespace(ctxt->userData, buf, nbchar);
  2745. } else {
  2746. htmlCheckParagraph(ctxt);
  2747. if (ctxt->sax->characters != NULL)
  2748. ctxt->sax->characters(ctxt->userData, buf, nbchar);
  2749. }
  2750. }
  2751. } else {
  2752. /*
  2753. * Loop detection
  2754. */
  2755. if (cur == 0)
  2756. ctxt->instate = XML_PARSER_EOF;
  2757. }
  2758. }
  2759. /**
  2760. * htmlParseExternalID:
  2761. * @ctxt: an HTML parser context
  2762. * @publicID: a xmlChar** receiving PubidLiteral
  2763. *
  2764. * Parse an External ID or a Public ID
  2765. *
  2766. * [75] ExternalID ::= 'SYSTEM' S SystemLiteral
  2767. * | 'PUBLIC' S PubidLiteral S SystemLiteral
  2768. *
  2769. * [83] PublicID ::= 'PUBLIC' S PubidLiteral
  2770. *
  2771. * Returns the function returns SystemLiteral and in the second
  2772. * case publicID receives PubidLiteral, is strict is off
  2773. * it is possible to return NULL and have publicID set.
  2774. */
  2775. static xmlChar *
  2776. htmlParseExternalID(htmlParserCtxtPtr ctxt, xmlChar **publicID) {
  2777. xmlChar *URI = NULL;
  2778. if ((UPPER == 'S') && (UPP(1) == 'Y') &&
  2779. (UPP(2) == 'S') && (UPP(3) == 'T') &&
  2780. (UPP(4) == 'E') && (UPP(5) == 'M')) {
  2781. SKIP(6);
  2782. if (!IS_BLANK_CH(CUR)) {
  2783. htmlParseErr(ctxt, XML_ERR_SPACE_REQUIRED,
  2784. "Space required after 'SYSTEM'\n", NULL, NULL);
  2785. }
  2786. SKIP_BLANKS;
  2787. URI = htmlParseSystemLiteral(ctxt);
  2788. if (URI == NULL) {
  2789. htmlParseErr(ctxt, XML_ERR_URI_REQUIRED,
  2790. "htmlParseExternalID: SYSTEM, no URI\n", NULL, NULL);
  2791. }
  2792. } else if ((UPPER == 'P') && (UPP(1) == 'U') &&
  2793. (UPP(2) == 'B') && (UPP(3) == 'L') &&
  2794. (UPP(4) == 'I') && (UPP(5) == 'C')) {
  2795. SKIP(6);
  2796. if (!IS_BLANK_CH(CUR)) {
  2797. htmlParseErr(ctxt, XML_ERR_SPACE_REQUIRED,
  2798. "Space required after 'PUBLIC'\n", NULL, NULL);
  2799. }
  2800. SKIP_BLANKS;
  2801. *publicID = htmlParsePubidLiteral(ctxt);
  2802. if (*publicID == NULL) {
  2803. htmlParseErr(ctxt, XML_ERR_PUBID_REQUIRED,
  2804. "htmlParseExternalID: PUBLIC, no Public Identifier\n",
  2805. NULL, NULL);
  2806. }
  2807. SKIP_BLANKS;
  2808. if ((CUR == '"') || (CUR == '\'')) {
  2809. URI = htmlParseSystemLiteral(ctxt);
  2810. }
  2811. }
  2812. return(URI);
  2813. }
  2814. /**
  2815. * xmlParsePI:
  2816. * @ctxt: an XML parser context
  2817. *
  2818. * parse an XML Processing Instruction.
  2819. *
  2820. * [16] PI ::= '<?' PITarget (S (Char* - (Char* '?>' Char*)))? '?>'
  2821. */
  2822. static void
  2823. htmlParsePI(htmlParserCtxtPtr ctxt) {
  2824. xmlChar *buf = NULL;
  2825. int len = 0;
  2826. int size = HTML_PARSER_BUFFER_SIZE;
  2827. int cur, l;
  2828. const xmlChar *target;
  2829. xmlParserInputState state;
  2830. int count = 0;
  2831. if ((RAW == '<') && (NXT(1) == '?')) {
  2832. state = ctxt->instate;
  2833. ctxt->instate = XML_PARSER_PI;
  2834. /*
  2835. * this is a Processing Instruction.
  2836. */
  2837. SKIP(2);
  2838. SHRINK;
  2839. /*
  2840. * Parse the target name and check for special support like
  2841. * namespace.
  2842. */
  2843. target = htmlParseName(ctxt);
  2844. if (target != NULL) {
  2845. if (RAW == '>') {
  2846. SKIP(1);
  2847. /*
  2848. * SAX: PI detected.
  2849. */
  2850. if ((ctxt->sax) && (!ctxt->disableSAX) &&
  2851. (ctxt->sax->processingInstruction != NULL))
  2852. ctxt->sax->processingInstruction(ctxt->userData,
  2853. target, NULL);
  2854. ctxt->instate = state;
  2855. return;
  2856. }
  2857. buf = (xmlChar *) xmlMallocAtomic(size * sizeof(xmlChar));
  2858. if (buf == NULL) {
  2859. htmlErrMemory(ctxt, NULL);
  2860. ctxt->instate = state;
  2861. return;
  2862. }
  2863. cur = CUR;
  2864. if (!IS_BLANK(cur)) {
  2865. htmlParseErr(ctxt, XML_ERR_SPACE_REQUIRED,
  2866. "ParsePI: PI %s space expected\n", target, NULL);
  2867. }
  2868. SKIP_BLANKS;
  2869. cur = CUR_CHAR(l);
  2870. while (IS_CHAR(cur) && (cur != '>')) {
  2871. if (len + 5 >= size) {
  2872. xmlChar *tmp;
  2873. size *= 2;
  2874. tmp = (xmlChar *) xmlRealloc(buf, size * sizeof(xmlChar));
  2875. if (tmp == NULL) {
  2876. htmlErrMemory(ctxt, NULL);
  2877. xmlFree(buf);
  2878. ctxt->instate = state;
  2879. return;
  2880. }
  2881. buf = tmp;
  2882. }
  2883. count++;
  2884. if (count > 50) {
  2885. GROW;
  2886. count = 0;
  2887. }
  2888. COPY_BUF(l,buf,len,cur);
  2889. NEXTL(l);
  2890. cur = CUR_CHAR(l);
  2891. if (cur == 0) {
  2892. SHRINK;
  2893. GROW;
  2894. cur = CUR_CHAR(l);
  2895. }
  2896. }
  2897. buf[len] = 0;
  2898. if (cur != '>') {
  2899. htmlParseErr(ctxt, XML_ERR_PI_NOT_FINISHED,
  2900. "ParsePI: PI %s never end ...\n", target, NULL);
  2901. } else {
  2902. SKIP(1);
  2903. /*
  2904. * SAX: PI detected.
  2905. */
  2906. if ((ctxt->sax) && (!ctxt->disableSAX) &&
  2907. (ctxt->sax->processingInstruction != NULL))
  2908. ctxt->sax->processingInstruction(ctxt->userData,
  2909. target, buf);
  2910. }
  2911. xmlFree(buf);
  2912. } else {
  2913. htmlParseErr(ctxt, XML_ERR_PI_NOT_STARTED,
  2914. "PI is not started correctly", NULL, NULL);
  2915. }
  2916. ctxt->instate = state;
  2917. }
  2918. }
  2919. /**
  2920. * htmlParseComment:
  2921. * @ctxt: an HTML parser context
  2922. *
  2923. * Parse an XML (SGML) comment <!-- .... -->
  2924. *
  2925. * [15] Comment ::= '<!--' ((Char - '-') | ('-' (Char - '-')))* '-->'
  2926. */
  2927. static void
  2928. htmlParseComment(htmlParserCtxtPtr ctxt) {
  2929. xmlChar *buf = NULL;
  2930. int len;
  2931. int size = HTML_PARSER_BUFFER_SIZE;
  2932. int q, ql;
  2933. int r, rl;
  2934. int cur, l;
  2935. xmlParserInputState state;
  2936. /*
  2937. * Check that there is a comment right here.
  2938. */
  2939. if ((RAW != '<') || (NXT(1) != '!') ||
  2940. (NXT(2) != '-') || (NXT(3) != '-')) return;
  2941. state = ctxt->instate;
  2942. ctxt->instate = XML_PARSER_COMMENT;
  2943. SHRINK;
  2944. SKIP(4);
  2945. buf = (xmlChar *) xmlMallocAtomic(size * sizeof(xmlChar));
  2946. if (buf == NULL) {
  2947. htmlErrMemory(ctxt, "buffer allocation failed\n");
  2948. ctxt->instate = state;
  2949. return;
  2950. }
  2951. q = CUR_CHAR(ql);
  2952. NEXTL(ql);
  2953. r = CUR_CHAR(rl);
  2954. NEXTL(rl);
  2955. cur = CUR_CHAR(l);
  2956. len = 0;
  2957. while (IS_CHAR(cur) &&
  2958. ((cur != '>') ||
  2959. (r != '-') || (q != '-'))) {
  2960. if (len + 5 >= size) {
  2961. xmlChar *tmp;
  2962. size *= 2;
  2963. tmp = (xmlChar *) xmlRealloc(buf, size * sizeof(xmlChar));
  2964. if (tmp == NULL) {
  2965. xmlFree(buf);
  2966. htmlErrMemory(ctxt, "growing buffer failed\n");
  2967. ctxt->instate = state;
  2968. return;
  2969. }
  2970. buf = tmp;
  2971. }
  2972. COPY_BUF(ql,buf,len,q);
  2973. q = r;
  2974. ql = rl;
  2975. r = cur;
  2976. rl = l;
  2977. NEXTL(l);
  2978. cur = CUR_CHAR(l);
  2979. if (cur == 0) {
  2980. SHRINK;
  2981. GROW;
  2982. cur = CUR_CHAR(l);
  2983. }
  2984. }
  2985. buf[len] = 0;
  2986. if (!IS_CHAR(cur)) {
  2987. htmlParseErr(ctxt, XML_ERR_COMMENT_NOT_FINISHED,
  2988. "Comment not terminated \n<!--%.50s\n", buf, NULL);
  2989. xmlFree(buf);
  2990. } else {
  2991. NEXT;
  2992. if ((ctxt->sax != NULL) && (ctxt->sax->comment != NULL) &&
  2993. (!ctxt->disableSAX))
  2994. ctxt->sax->comment(ctxt->userData, buf);
  2995. xmlFree(buf);
  2996. }
  2997. ctxt->instate = state;
  2998. }
  2999. /**
  3000. * htmlParseCharRef:
  3001. * @ctxt: an HTML parser context
  3002. *
  3003. * parse Reference declarations
  3004. *
  3005. * [66] CharRef ::= '&#' [0-9]+ ';' |
  3006. * '&#x' [0-9a-fA-F]+ ';'
  3007. *
  3008. * Returns the value parsed (as an int)
  3009. */
  3010. int
  3011. htmlParseCharRef(htmlParserCtxtPtr ctxt) {
  3012. int val = 0;
  3013. if ((ctxt == NULL) || (ctxt->input == NULL)) {
  3014. htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
  3015. "htmlParseCharRef: context error\n",
  3016. NULL, NULL);
  3017. return(0);
  3018. }
  3019. if ((CUR == '&') && (NXT(1) == '#') &&
  3020. ((NXT(2) == 'x') || NXT(2) == 'X')) {
  3021. SKIP(3);
  3022. while (CUR != ';') {
  3023. if ((CUR >= '0') && (CUR <= '9'))
  3024. val = val * 16 + (CUR - '0');
  3025. else if ((CUR >= 'a') && (CUR <= 'f'))
  3026. val = val * 16 + (CUR - 'a') + 10;
  3027. else if ((CUR >= 'A') && (CUR <= 'F'))
  3028. val = val * 16 + (CUR - 'A') + 10;
  3029. else {
  3030. htmlParseErr(ctxt, XML_ERR_INVALID_HEX_CHARREF,
  3031. "htmlParseCharRef: missing semicolumn\n",
  3032. NULL, NULL);
  3033. break;
  3034. }
  3035. NEXT;
  3036. }
  3037. if (CUR == ';')
  3038. NEXT;
  3039. } else if ((CUR == '&') && (NXT(1) == '#')) {
  3040. SKIP(2);
  3041. while (CUR != ';') {
  3042. if ((CUR >= '0') && (CUR <= '9'))
  3043. val = val * 10 + (CUR - '0');
  3044. else {
  3045. htmlParseErr(ctxt, XML_ERR_INVALID_DEC_CHARREF,
  3046. "htmlParseCharRef: missing semicolumn\n",
  3047. NULL, NULL);
  3048. break;
  3049. }
  3050. NEXT;
  3051. }
  3052. if (CUR == ';')
  3053. NEXT;
  3054. } else {
  3055. htmlParseErr(ctxt, XML_ERR_INVALID_CHARREF,
  3056. "htmlParseCharRef: invalid value\n", NULL, NULL);
  3057. }
  3058. /*
  3059. * Check the value IS_CHAR ...
  3060. */
  3061. if (IS_CHAR(val)) {
  3062. return(val);
  3063. } else {
  3064. htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
  3065. "htmlParseCharRef: invalid xmlChar value %d\n",
  3066. val);
  3067. }
  3068. return(0);
  3069. }
  3070. /**
  3071. * htmlParseDocTypeDecl:
  3072. * @ctxt: an HTML parser context
  3073. *
  3074. * parse a DOCTYPE declaration
  3075. *
  3076. * [28] doctypedecl ::= '<!DOCTYPE' S Name (S ExternalID)? S?
  3077. * ('[' (markupdecl | PEReference | S)* ']' S?)? '>'
  3078. */
  3079. static void
  3080. htmlParseDocTypeDecl(htmlParserCtxtPtr ctxt) {
  3081. const xmlChar *name;
  3082. xmlChar *ExternalID = NULL;
  3083. xmlChar *URI = NULL;
  3084. /*
  3085. * We know that '<!DOCTYPE' has been detected.
  3086. */
  3087. SKIP(9);
  3088. SKIP_BLANKS;
  3089. /*
  3090. * Parse the DOCTYPE name.
  3091. */
  3092. name = htmlParseName(ctxt);
  3093. if (name == NULL) {
  3094. htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED,
  3095. "htmlParseDocTypeDecl : no DOCTYPE name !\n",
  3096. NULL, NULL);
  3097. }
  3098. /*
  3099. * Check that upper(name) == "HTML" !!!!!!!!!!!!!
  3100. */
  3101. SKIP_BLANKS;
  3102. /*
  3103. * Check for SystemID and ExternalID
  3104. */
  3105. URI = htmlParseExternalID(ctxt, &ExternalID);
  3106. SKIP_BLANKS;
  3107. /*
  3108. * We should be at the end of the DOCTYPE declaration.
  3109. */
  3110. if (CUR != '>') {
  3111. htmlParseErr(ctxt, XML_ERR_DOCTYPE_NOT_FINISHED,
  3112. "DOCTYPE improperly terminated\n", NULL, NULL);
  3113. /* We shouldn't try to resynchronize ... */
  3114. }
  3115. NEXT;
  3116. /*
  3117. * Create or update the document accordingly to the DOCTYPE
  3118. */
  3119. if ((ctxt->sax != NULL) && (ctxt->sax->internalSubset != NULL) &&
  3120. (!ctxt->disableSAX))
  3121. ctxt->sax->internalSubset(ctxt->userData, name, ExternalID, URI);
  3122. /*
  3123. * Cleanup, since we don't use all those identifiers
  3124. */
  3125. if (URI != NULL) xmlFree(URI);
  3126. if (ExternalID != NULL) xmlFree(ExternalID);
  3127. }
  3128. /**
  3129. * htmlParseAttribute:
  3130. * @ctxt: an HTML parser context
  3131. * @value: a xmlChar ** used to store the value of the attribute
  3132. *
  3133. * parse an attribute
  3134. *
  3135. * [41] Attribute ::= Name Eq AttValue
  3136. *
  3137. * [25] Eq ::= S? '=' S?
  3138. *
  3139. * With namespace:
  3140. *
  3141. * [NS 11] Attribute ::= QName Eq AttValue
  3142. *
  3143. * Also the case QName == xmlns:??? is handled independently as a namespace
  3144. * definition.
  3145. *
  3146. * Returns the attribute name, and the value in *value.
  3147. */
  3148. static const xmlChar *
  3149. htmlParseAttribute(htmlParserCtxtPtr ctxt, xmlChar **value) {
  3150. const xmlChar *name;
  3151. xmlChar *val = NULL;
  3152. *value = NULL;
  3153. name = htmlParseHTMLName(ctxt);
  3154. if (name == NULL) {
  3155. htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED,
  3156. "error parsing attribute name\n", NULL, NULL);
  3157. return(NULL);
  3158. }
  3159. /*
  3160. * read the value
  3161. */
  3162. SKIP_BLANKS;
  3163. if (CUR == '=') {
  3164. NEXT;
  3165. SKIP_BLANKS;
  3166. val = htmlParseAttValue(ctxt);
  3167. }
  3168. *value = val;
  3169. return(name);
  3170. }
  3171. /**
  3172. * htmlCheckEncoding:
  3173. * @ctxt: an HTML parser context
  3174. * @attvalue: the attribute value
  3175. *
  3176. * Checks an http-equiv attribute from a Meta tag to detect
  3177. * the encoding
  3178. * If a new encoding is detected the parser is switched to decode
  3179. * it and pass UTF8
  3180. */
  3181. static void
  3182. htmlCheckEncoding(htmlParserCtxtPtr ctxt, const xmlChar *attvalue) {
  3183. const xmlChar *encoding;
  3184. if ((ctxt == NULL) || (attvalue == NULL))
  3185. return;
  3186. /* do not change encoding */
  3187. if (ctxt->input->encoding != NULL)
  3188. return;
  3189. encoding = xmlStrcasestr(attvalue, BAD_CAST"charset=");
  3190. if (encoding != NULL) {
  3191. encoding += 8;
  3192. } else {
  3193. encoding = xmlStrcasestr(attvalue, BAD_CAST"charset =");
  3194. if (encoding != NULL)
  3195. encoding += 9;
  3196. }
  3197. if (encoding != NULL) {
  3198. xmlCharEncoding enc;
  3199. xmlCharEncodingHandlerPtr handler;
  3200. while ((*encoding == ' ') || (*encoding == '\t')) encoding++;
  3201. if (ctxt->input->encoding != NULL)
  3202. xmlFree((xmlChar *) ctxt->input->encoding);
  3203. ctxt->input->encoding = xmlStrdup(encoding);
  3204. enc = xmlParseCharEncoding((const char *) encoding);
  3205. /*
  3206. * registered set of known encodings
  3207. */
  3208. if (enc != XML_CHAR_ENCODING_ERROR) {
  3209. if (((enc == XML_CHAR_ENCODING_UTF16LE) ||
  3210. (enc == XML_CHAR_ENCODING_UTF16BE) ||
  3211. (enc == XML_CHAR_ENCODING_UCS4LE) ||
  3212. (enc == XML_CHAR_ENCODING_UCS4BE)) &&
  3213. (ctxt->input->buf != NULL) &&
  3214. (ctxt->input->buf->encoder == NULL)) {
  3215. htmlParseErr(ctxt, XML_ERR_INVALID_ENCODING,
  3216. "htmlCheckEncoding: wrong encoding meta\n",
  3217. NULL, NULL);
  3218. } else {
  3219. xmlSwitchEncoding(ctxt, enc);
  3220. }
  3221. ctxt->charset = XML_CHAR_ENCODING_UTF8;
  3222. } else {
  3223. /*
  3224. * fallback for unknown encodings
  3225. */
  3226. handler = xmlFindCharEncodingHandler((const char *) encoding);
  3227. if (handler != NULL) {
  3228. xmlSwitchToEncoding(ctxt, handler);
  3229. ctxt->charset = XML_CHAR_ENCODING_UTF8;
  3230. } else {
  3231. ctxt->errNo = XML_ERR_UNSUPPORTED_ENCODING;
  3232. }
  3233. }
  3234. if ((ctxt->input->buf != NULL) &&
  3235. (ctxt->input->buf->encoder != NULL) &&
  3236. (ctxt->input->buf->raw != NULL) &&
  3237. (ctxt->input->buf->buffer != NULL)) {
  3238. int nbchars;
  3239. int processed;
  3240. /*
  3241. * convert as much as possible to the parser reading buffer.
  3242. */
  3243. processed = ctxt->input->cur - ctxt->input->base;
  3244. xmlBufferShrink(ctxt->input->buf->buffer, processed);
  3245. nbchars = xmlCharEncInFunc(ctxt->input->buf->encoder,
  3246. ctxt->input->buf->buffer,
  3247. ctxt->input->buf->raw);
  3248. if (nbchars < 0) {
  3249. htmlParseErr(ctxt, XML_ERR_INVALID_ENCODING,
  3250. "htmlCheckEncoding: encoder error\n",
  3251. NULL, NULL);
  3252. }
  3253. ctxt->input->base =
  3254. ctxt->input->cur = ctxt->input->buf->buffer->content;
  3255. ctxt->input->end =
  3256. &ctxt->input->base[ctxt->input->buf->buffer->use];
  3257. }
  3258. }
  3259. }
  3260. /**
  3261. * htmlCheckMeta:
  3262. * @ctxt: an HTML parser context
  3263. * @atts: the attributes values
  3264. *
  3265. * Checks an attributes from a Meta tag
  3266. */
  3267. static void
  3268. htmlCheckMeta(htmlParserCtxtPtr ctxt, const xmlChar **atts) {
  3269. int i;
  3270. const xmlChar *att, *value;
  3271. int http = 0;
  3272. const xmlChar *content = NULL;
  3273. if ((ctxt == NULL) || (atts == NULL))
  3274. return;
  3275. i = 0;
  3276. att = atts[i++];
  3277. while (att != NULL) {
  3278. value = atts[i++];
  3279. if ((value != NULL) && (!xmlStrcasecmp(att, BAD_CAST"http-equiv"))
  3280. && (!xmlStrcasecmp(value, BAD_CAST"Content-Type")))
  3281. http = 1;
  3282. else if ((value != NULL) && (!xmlStrcasecmp(att, BAD_CAST"content")))
  3283. content = value;
  3284. att = atts[i++];
  3285. }
  3286. if ((http) && (content != NULL))
  3287. htmlCheckEncoding(ctxt, content);
  3288. }
  3289. /**
  3290. * htmlParseStartTag:
  3291. * @ctxt: an HTML parser context
  3292. *
  3293. * parse a start of tag either for rule element or
  3294. * EmptyElement. In both case we don't parse the tag closing chars.
  3295. *
  3296. * [40] STag ::= '<' Name (S Attribute)* S? '>'
  3297. *
  3298. * [44] EmptyElemTag ::= '<' Name (S Attribute)* S? '/>'
  3299. *
  3300. * With namespace:
  3301. *
  3302. * [NS 8] STag ::= '<' QName (S Attribute)* S? '>'
  3303. *
  3304. * [NS 10] EmptyElement ::= '<' QName (S Attribute)* S? '/>'
  3305. *
  3306. * Returns 0 in case of success, -1 in case of error and 1 if discarded
  3307. */
  3308. static int
  3309. htmlParseStartTag(htmlParserCtxtPtr ctxt) {
  3310. const xmlChar *name;
  3311. const xmlChar *attname;
  3312. xmlChar *attvalue;
  3313. const xmlChar **atts;
  3314. int nbatts = 0;
  3315. int maxatts;
  3316. int meta = 0;
  3317. int i;
  3318. int discardtag = 0;
  3319. if (ctxt->instate == XML_PARSER_EOF)
  3320. return(-1);
  3321. if ((ctxt == NULL) || (ctxt->input == NULL)) {
  3322. htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
  3323. "htmlParseStartTag: context error\n", NULL, NULL);
  3324. return -1;
  3325. }
  3326. if (CUR != '<') return -1;
  3327. NEXT;
  3328. atts = ctxt->atts;
  3329. maxatts = ctxt->maxatts;
  3330. GROW;
  3331. name = htmlParseHTMLName(ctxt);
  3332. if (name == NULL) {
  3333. htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED,
  3334. "htmlParseStartTag: invalid element name\n",
  3335. NULL, NULL);
  3336. /* Dump the bogus tag like browsers do */
  3337. while ((IS_CHAR_CH(CUR)) && (CUR != '>') &&
  3338. (ctxt->instate != XML_PARSER_EOF))
  3339. NEXT;
  3340. return -1;
  3341. }
  3342. if (xmlStrEqual(name, BAD_CAST"meta"))
  3343. meta = 1;
  3344. /*
  3345. * Check for auto-closure of HTML elements.
  3346. */
  3347. htmlAutoClose(ctxt, name);
  3348. /*
  3349. * Check for implied HTML elements.
  3350. */
  3351. htmlCheckImplied(ctxt, name);
  3352. /*
  3353. * Avoid html at any level > 0, head at any level != 1
  3354. * or any attempt to recurse body
  3355. */
  3356. if ((ctxt->nameNr > 0) && (xmlStrEqual(name, BAD_CAST"html"))) {
  3357. htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
  3358. "htmlParseStartTag: misplaced <html> tag\n",
  3359. name, NULL);
  3360. discardtag = 1;
  3361. ctxt->depth++;
  3362. }
  3363. if ((ctxt->nameNr != 1) &&
  3364. (xmlStrEqual(name, BAD_CAST"head"))) {
  3365. htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
  3366. "htmlParseStartTag: misplaced <head> tag\n",
  3367. name, NULL);
  3368. discardtag = 1;
  3369. ctxt->depth++;
  3370. }
  3371. if (xmlStrEqual(name, BAD_CAST"body")) {
  3372. int indx;
  3373. for (indx = 0;indx < ctxt->nameNr;indx++) {
  3374. if (xmlStrEqual(ctxt->nameTab[indx], BAD_CAST"body")) {
  3375. htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
  3376. "htmlParseStartTag: misplaced <body> tag\n",
  3377. name, NULL);
  3378. discardtag = 1;
  3379. ctxt->depth++;
  3380. }
  3381. }
  3382. }
  3383. /*
  3384. * Now parse the attributes, it ends up with the ending
  3385. *
  3386. * (S Attribute)* S?
  3387. */
  3388. SKIP_BLANKS;
  3389. while ((IS_CHAR_CH(CUR)) &&
  3390. (CUR != '>') &&
  3391. ((CUR != '/') || (NXT(1) != '>'))) {
  3392. long cons = ctxt->nbChars;
  3393. GROW;
  3394. attname = htmlParseAttribute(ctxt, &attvalue);
  3395. if (attname != NULL) {
  3396. /*
  3397. * Well formedness requires at most one declaration of an attribute
  3398. */
  3399. for (i = 0; i < nbatts;i += 2) {
  3400. if (xmlStrEqual(atts[i], attname)) {
  3401. htmlParseErr(ctxt, XML_ERR_ATTRIBUTE_REDEFINED,
  3402. "Attribute %s redefined\n", attname, NULL);
  3403. if (attvalue != NULL)
  3404. xmlFree(attvalue);
  3405. goto failed;
  3406. }
  3407. }
  3408. /*
  3409. * Add the pair to atts
  3410. */
  3411. if (atts == NULL) {
  3412. maxatts = 22; /* allow for 10 attrs by default */
  3413. atts = (const xmlChar **)
  3414. xmlMalloc(maxatts * sizeof(xmlChar *));
  3415. if (atts == NULL) {
  3416. htmlErrMemory(ctxt, NULL);
  3417. if (attvalue != NULL)
  3418. xmlFree(attvalue);
  3419. goto failed;
  3420. }
  3421. ctxt->atts = atts;
  3422. ctxt->maxatts = maxatts;
  3423. } else if (nbatts + 4 > maxatts) {
  3424. const xmlChar **n;
  3425. maxatts *= 2;
  3426. n = (const xmlChar **) xmlRealloc((void *) atts,
  3427. maxatts * sizeof(const xmlChar *));
  3428. if (n == NULL) {
  3429. htmlErrMemory(ctxt, NULL);
  3430. if (attvalue != NULL)
  3431. xmlFree(attvalue);
  3432. goto failed;
  3433. }
  3434. atts = n;
  3435. ctxt->atts = atts;
  3436. ctxt->maxatts = maxatts;
  3437. }
  3438. atts[nbatts++] = attname;
  3439. atts[nbatts++] = attvalue;
  3440. atts[nbatts] = NULL;
  3441. atts[nbatts + 1] = NULL;
  3442. }
  3443. else {
  3444. if (attvalue != NULL)
  3445. xmlFree(attvalue);
  3446. /* Dump the bogus attribute string up to the next blank or
  3447. * the end of the tag. */
  3448. while ((IS_CHAR_CH(CUR)) &&
  3449. !(IS_BLANK_CH(CUR)) && (CUR != '>') &&
  3450. ((CUR != '/') || (NXT(1) != '>')))
  3451. NEXT;
  3452. }
  3453. failed:
  3454. SKIP_BLANKS;
  3455. if (cons == ctxt->nbChars) {
  3456. htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
  3457. "htmlParseStartTag: problem parsing attributes\n",
  3458. NULL, NULL);
  3459. break;
  3460. }
  3461. }
  3462. /*
  3463. * Handle specific association to the META tag
  3464. */
  3465. if (meta && (nbatts != 0))
  3466. htmlCheckMeta(ctxt, atts);
  3467. /*
  3468. * SAX: Start of Element !
  3469. */
  3470. if (!discardtag) {
  3471. htmlnamePush(ctxt, name);
  3472. if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL)) {
  3473. if (nbatts != 0)
  3474. ctxt->sax->startElement(ctxt->userData, name, atts);
  3475. else
  3476. ctxt->sax->startElement(ctxt->userData, name, NULL);
  3477. }
  3478. }
  3479. if (atts != NULL) {
  3480. for (i = 1;i < nbatts;i += 2) {
  3481. if (atts[i] != NULL)
  3482. xmlFree((xmlChar *) atts[i]);
  3483. }
  3484. }
  3485. return(discardtag);
  3486. }
  3487. /**
  3488. * htmlParseEndTag:
  3489. * @ctxt: an HTML parser context
  3490. *
  3491. * parse an end of tag
  3492. *
  3493. * [42] ETag ::= '</' Name S? '>'
  3494. *
  3495. * With namespace
  3496. *
  3497. * [NS 9] ETag ::= '</' QName S? '>'
  3498. *
  3499. * Returns 1 if the current level should be closed.
  3500. */
  3501. static int
  3502. htmlParseEndTag(htmlParserCtxtPtr ctxt)
  3503. {
  3504. const xmlChar *name;
  3505. const xmlChar *oldname;
  3506. int i, ret;
  3507. if ((CUR != '<') || (NXT(1) != '/')) {
  3508. htmlParseErr(ctxt, XML_ERR_LTSLASH_REQUIRED,
  3509. "htmlParseEndTag: '</' not found\n", NULL, NULL);
  3510. return (0);
  3511. }
  3512. SKIP(2);
  3513. name = htmlParseHTMLName(ctxt);
  3514. if (name == NULL)
  3515. return (0);
  3516. /*
  3517. * We should definitely be at the ending "S? '>'" part
  3518. */
  3519. SKIP_BLANKS;
  3520. if ((!IS_CHAR_CH(CUR)) || (CUR != '>')) {
  3521. htmlParseErr(ctxt, XML_ERR_GT_REQUIRED,
  3522. "End tag : expected '>'\n", NULL, NULL);
  3523. if (ctxt->recovery) {
  3524. /*
  3525. * We're not at the ending > !!
  3526. * Error, unless in recover mode where we search forwards
  3527. * until we find a >
  3528. */
  3529. while (CUR != '\0' && CUR != '>') NEXT;
  3530. NEXT;
  3531. }
  3532. } else
  3533. NEXT;
  3534. /*
  3535. * if we ignored misplaced tags in htmlParseStartTag don't pop them
  3536. * out now.
  3537. */
  3538. if ((ctxt->depth > 0) &&
  3539. (xmlStrEqual(name, BAD_CAST "html") ||
  3540. xmlStrEqual(name, BAD_CAST "body") ||
  3541. xmlStrEqual(name, BAD_CAST "head"))) {
  3542. ctxt->depth--;
  3543. return (0);
  3544. }
  3545. /*
  3546. * If the name read is not one of the element in the parsing stack
  3547. * then return, it's just an error.
  3548. */
  3549. for (i = (ctxt->nameNr - 1); i >= 0; i--) {
  3550. if (xmlStrEqual(name, ctxt->nameTab[i]))
  3551. break;
  3552. }
  3553. if (i < 0) {
  3554. htmlParseErr(ctxt, XML_ERR_TAG_NAME_MISMATCH,
  3555. "Unexpected end tag : %s\n", name, NULL);
  3556. return (0);
  3557. }
  3558. /*
  3559. * Check for auto-closure of HTML elements.
  3560. */
  3561. htmlAutoCloseOnClose(ctxt, name);
  3562. /*
  3563. * Well formedness constraints, opening and closing must match.
  3564. * With the exception that the autoclose may have popped stuff out
  3565. * of the stack.
  3566. */
  3567. if (!xmlStrEqual(name, ctxt->name)) {
  3568. if ((ctxt->name != NULL) && (!xmlStrEqual(ctxt->name, name))) {
  3569. htmlParseErr(ctxt, XML_ERR_TAG_NAME_MISMATCH,
  3570. "Opening and ending tag mismatch: %s and %s\n",
  3571. name, ctxt->name);
  3572. }
  3573. }
  3574. /*
  3575. * SAX: End of Tag
  3576. */
  3577. oldname = ctxt->name;
  3578. if ((oldname != NULL) && (xmlStrEqual(oldname, name))) {
  3579. if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
  3580. ctxt->sax->endElement(ctxt->userData, name);
  3581. htmlnamePop(ctxt);
  3582. ret = 1;
  3583. } else {
  3584. ret = 0;
  3585. }
  3586. return (ret);
  3587. }
  3588. /**
  3589. * htmlParseReference:
  3590. * @ctxt: an HTML parser context
  3591. *
  3592. * parse and handle entity references in content,
  3593. * this will end-up in a call to character() since this is either a
  3594. * CharRef, or a predefined entity.
  3595. */
  3596. static void
  3597. htmlParseReference(htmlParserCtxtPtr ctxt) {
  3598. const htmlEntityDesc * ent;
  3599. xmlChar out[6];
  3600. const xmlChar *name;
  3601. if (CUR != '&') return;
  3602. if (NXT(1) == '#') {
  3603. unsigned int c;
  3604. int bits, i = 0;
  3605. c = htmlParseCharRef(ctxt);
  3606. if (c == 0)
  3607. return;
  3608. if (c < 0x80) { out[i++]= c; bits= -6; }
  3609. else if (c < 0x800) { out[i++]=((c >> 6) & 0x1F) | 0xC0; bits= 0; }
  3610. else if (c < 0x10000) { out[i++]=((c >> 12) & 0x0F) | 0xE0; bits= 6; }
  3611. else { out[i++]=((c >> 18) & 0x07) | 0xF0; bits= 12; }
  3612. for ( ; bits >= 0; bits-= 6) {
  3613. out[i++]= ((c >> bits) & 0x3F) | 0x80;
  3614. }
  3615. out[i] = 0;
  3616. htmlCheckParagraph(ctxt);
  3617. if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
  3618. ctxt->sax->characters(ctxt->userData, out, i);
  3619. } else {
  3620. ent = htmlParseEntityRef(ctxt, &name);
  3621. if (name == NULL) {
  3622. htmlCheckParagraph(ctxt);
  3623. if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
  3624. ctxt->sax->characters(ctxt->userData, BAD_CAST "&", 1);
  3625. return;
  3626. }
  3627. if ((ent == NULL) || !(ent->value > 0)) {
  3628. htmlCheckParagraph(ctxt);
  3629. if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL)) {
  3630. ctxt->sax->characters(ctxt->userData, BAD_CAST "&", 1);
  3631. ctxt->sax->characters(ctxt->userData, name, xmlStrlen(name));
  3632. /* ctxt->sax->characters(ctxt->userData, BAD_CAST ";", 1); */
  3633. }
  3634. } else {
  3635. unsigned int c;
  3636. int bits, i = 0;
  3637. c = ent->value;
  3638. if (c < 0x80)
  3639. { out[i++]= c; bits= -6; }
  3640. else if (c < 0x800)
  3641. { out[i++]=((c >> 6) & 0x1F) | 0xC0; bits= 0; }
  3642. else if (c < 0x10000)
  3643. { out[i++]=((c >> 12) & 0x0F) | 0xE0; bits= 6; }
  3644. else
  3645. { out[i++]=((c >> 18) & 0x07) | 0xF0; bits= 12; }
  3646. for ( ; bits >= 0; bits-= 6) {
  3647. out[i++]= ((c >> bits) & 0x3F) | 0x80;
  3648. }
  3649. out[i] = 0;
  3650. htmlCheckParagraph(ctxt);
  3651. if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
  3652. ctxt->sax->characters(ctxt->userData, out, i);
  3653. }
  3654. }
  3655. }
  3656. /**
  3657. * htmlParseContent:
  3658. * @ctxt: an HTML parser context
  3659. *
  3660. * Parse a content: comment, sub-element, reference or text.
  3661. * Kept for compatibility with old code
  3662. */
  3663. static void
  3664. htmlParseContent(htmlParserCtxtPtr ctxt) {
  3665. xmlChar *currentNode;
  3666. int depth;
  3667. const xmlChar *name;
  3668. currentNode = xmlStrdup(ctxt->name);
  3669. depth = ctxt->nameNr;
  3670. while (1) {
  3671. long cons = ctxt->nbChars;
  3672. GROW;
  3673. if (ctxt->instate == XML_PARSER_EOF)
  3674. break;
  3675. /*
  3676. * Our tag or one of it's parent or children is ending.
  3677. */
  3678. if ((CUR == '<') && (NXT(1) == '/')) {
  3679. if (htmlParseEndTag(ctxt) &&
  3680. ((currentNode != NULL) || (ctxt->nameNr == 0))) {
  3681. if (currentNode != NULL)
  3682. xmlFree(currentNode);
  3683. return;
  3684. }
  3685. continue; /* while */
  3686. }
  3687. else if ((CUR == '<') &&
  3688. ((IS_ASCII_LETTER(NXT(1))) ||
  3689. (NXT(1) == '_') || (NXT(1) == ':'))) {
  3690. name = htmlParseHTMLName_nonInvasive(ctxt);
  3691. if (name == NULL) {
  3692. htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED,
  3693. "htmlParseStartTag: invalid element name\n",
  3694. NULL, NULL);
  3695. /* Dump the bogus tag like browsers do */
  3696. while ((IS_CHAR_CH(CUR)) && (CUR != '>'))
  3697. NEXT;
  3698. if (currentNode != NULL)
  3699. xmlFree(currentNode);
  3700. return;
  3701. }
  3702. if (ctxt->name != NULL) {
  3703. if (htmlCheckAutoClose(name, ctxt->name) == 1) {
  3704. htmlAutoClose(ctxt, name);
  3705. continue;
  3706. }
  3707. }
  3708. }
  3709. /*
  3710. * Has this node been popped out during parsing of
  3711. * the next element
  3712. */
  3713. if ((ctxt->nameNr > 0) && (depth >= ctxt->nameNr) &&
  3714. (!xmlStrEqual(currentNode, ctxt->name)))
  3715. {
  3716. if (currentNode != NULL) xmlFree(currentNode);
  3717. return;
  3718. }
  3719. if ((CUR != 0) && ((xmlStrEqual(currentNode, BAD_CAST"script")) ||
  3720. (xmlStrEqual(currentNode, BAD_CAST"style")))) {
  3721. /*
  3722. * Handle SCRIPT/STYLE separately
  3723. */
  3724. htmlParseScript(ctxt);
  3725. } else {
  3726. /*
  3727. * Sometimes DOCTYPE arrives in the middle of the document
  3728. */
  3729. if ((CUR == '<') && (NXT(1) == '!') &&
  3730. (UPP(2) == 'D') && (UPP(3) == 'O') &&
  3731. (UPP(4) == 'C') && (UPP(5) == 'T') &&
  3732. (UPP(6) == 'Y') && (UPP(7) == 'P') &&
  3733. (UPP(8) == 'E')) {
  3734. htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
  3735. "Misplaced DOCTYPE declaration\n",
  3736. BAD_CAST "DOCTYPE" , NULL);
  3737. htmlParseDocTypeDecl(ctxt);
  3738. }
  3739. /*
  3740. * First case : a comment
  3741. */
  3742. if ((CUR == '<') && (NXT(1) == '!') &&
  3743. (NXT(2) == '-') && (NXT(3) == '-')) {
  3744. htmlParseComment(ctxt);
  3745. }
  3746. /*
  3747. * Second case : a Processing Instruction.
  3748. */
  3749. else if ((CUR == '<') && (NXT(1) == '?')) {
  3750. htmlParsePI(ctxt);
  3751. }
  3752. /*
  3753. * Third case : a sub-element.
  3754. */
  3755. else if (CUR == '<') {
  3756. htmlParseElement(ctxt);
  3757. }
  3758. /*
  3759. * Fourth case : a reference. If if has not been resolved,
  3760. * parsing returns it's Name, create the node
  3761. */
  3762. else if (CUR == '&') {
  3763. htmlParseReference(ctxt);
  3764. }
  3765. /*
  3766. * Fifth case : end of the resource
  3767. */
  3768. else if (CUR == 0) {
  3769. htmlAutoCloseOnEnd(ctxt);
  3770. break;
  3771. }
  3772. /*
  3773. * Last case, text. Note that References are handled directly.
  3774. */
  3775. else {
  3776. htmlParseCharData(ctxt);
  3777. }
  3778. if (cons == ctxt->nbChars) {
  3779. if (ctxt->node != NULL) {
  3780. htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
  3781. "detected an error in element content\n",
  3782. NULL, NULL);
  3783. }
  3784. break;
  3785. }
  3786. }
  3787. GROW;
  3788. }
  3789. if (currentNode != NULL) xmlFree(currentNode);
  3790. }
  3791. /**
  3792. * htmlParseElement:
  3793. * @ctxt: an HTML parser context
  3794. *
  3795. * parse an HTML element, this is highly recursive
  3796. * this is kept for compatibility with previous code versions
  3797. *
  3798. * [39] element ::= EmptyElemTag | STag content ETag
  3799. *
  3800. * [41] Attribute ::= Name Eq AttValue
  3801. */
  3802. void
  3803. htmlParseElement(htmlParserCtxtPtr ctxt) {
  3804. const xmlChar *name;
  3805. xmlChar *currentNode = NULL;
  3806. const htmlElemDesc * info;
  3807. htmlParserNodeInfo node_info;
  3808. int failed;
  3809. int depth;
  3810. const xmlChar *oldptr;
  3811. if ((ctxt == NULL) || (ctxt->input == NULL)) {
  3812. htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
  3813. "htmlParseElement: context error\n", NULL, NULL);
  3814. return;
  3815. }
  3816. if (ctxt->instate == XML_PARSER_EOF)
  3817. return;
  3818. /* Capture start position */
  3819. if (ctxt->record_info) {
  3820. node_info.begin_pos = ctxt->input->consumed +
  3821. (CUR_PTR - ctxt->input->base);
  3822. node_info.begin_line = ctxt->input->line;
  3823. }
  3824. failed = htmlParseStartTag(ctxt);
  3825. name = ctxt->name;
  3826. if ((failed == -1) || (name == NULL)) {
  3827. if (CUR == '>')
  3828. NEXT;
  3829. return;
  3830. }
  3831. /*
  3832. * Lookup the info for that element.
  3833. */
  3834. info = htmlTagLookup(name);
  3835. if (info == NULL) {
  3836. htmlParseErr(ctxt, XML_HTML_UNKNOWN_TAG,
  3837. "Tag %s invalid\n", name, NULL);
  3838. }
  3839. /*
  3840. * Check for an Empty Element labeled the XML/SGML way
  3841. */
  3842. if ((CUR == '/') && (NXT(1) == '>')) {
  3843. SKIP(2);
  3844. if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
  3845. ctxt->sax->endElement(ctxt->userData, name);
  3846. htmlnamePop(ctxt);
  3847. return;
  3848. }
  3849. if (CUR == '>') {
  3850. NEXT;
  3851. } else {
  3852. htmlParseErr(ctxt, XML_ERR_GT_REQUIRED,
  3853. "Couldn't find end of Start Tag %s\n", name, NULL);
  3854. /*
  3855. * end of parsing of this node.
  3856. */
  3857. if (xmlStrEqual(name, ctxt->name)) {
  3858. nodePop(ctxt);
  3859. htmlnamePop(ctxt);
  3860. }
  3861. /*
  3862. * Capture end position and add node
  3863. */
  3864. if (ctxt->record_info) {
  3865. node_info.end_pos = ctxt->input->consumed +
  3866. (CUR_PTR - ctxt->input->base);
  3867. node_info.end_line = ctxt->input->line;
  3868. node_info.node = ctxt->node;
  3869. xmlParserAddNodeInfo(ctxt, &node_info);
  3870. }
  3871. return;
  3872. }
  3873. /*
  3874. * Check for an Empty Element from DTD definition
  3875. */
  3876. if ((info != NULL) && (info->empty)) {
  3877. if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
  3878. ctxt->sax->endElement(ctxt->userData, name);
  3879. htmlnamePop(ctxt);
  3880. return;
  3881. }
  3882. /*
  3883. * Parse the content of the element:
  3884. */
  3885. currentNode = xmlStrdup(ctxt->name);
  3886. depth = ctxt->nameNr;
  3887. while (IS_CHAR_CH(CUR)) {
  3888. oldptr = ctxt->input->cur;
  3889. htmlParseContent(ctxt);
  3890. if (oldptr==ctxt->input->cur) break;
  3891. if (ctxt->nameNr < depth) break;
  3892. }
  3893. /*
  3894. * Capture end position and add node
  3895. */
  3896. if ( currentNode != NULL && ctxt->record_info ) {
  3897. node_info.end_pos = ctxt->input->consumed +
  3898. (CUR_PTR - ctxt->input->base);
  3899. node_info.end_line = ctxt->input->line;
  3900. node_info.node = ctxt->node;
  3901. xmlParserAddNodeInfo(ctxt, &node_info);
  3902. }
  3903. if (!IS_CHAR_CH(CUR)) {
  3904. htmlAutoCloseOnEnd(ctxt);
  3905. }
  3906. if (currentNode != NULL)
  3907. xmlFree(currentNode);
  3908. }
  3909. static void
  3910. htmlParserFinishElementParsing(htmlParserCtxtPtr ctxt) {
  3911. /*
  3912. * Capture end position and add node
  3913. */
  3914. if ( ctxt->node != NULL && ctxt->record_info ) {
  3915. ctxt->nodeInfo->end_pos = ctxt->input->consumed +
  3916. (CUR_PTR - ctxt->input->base);
  3917. ctxt->nodeInfo->end_line = ctxt->input->line;
  3918. ctxt->nodeInfo->node = ctxt->node;
  3919. xmlParserAddNodeInfo(ctxt, ctxt->nodeInfo);
  3920. htmlNodeInfoPop(ctxt);
  3921. }
  3922. if (!IS_CHAR_CH(CUR)) {
  3923. htmlAutoCloseOnEnd(ctxt);
  3924. }
  3925. }
  3926. /**
  3927. * htmlParseElementInternal:
  3928. * @ctxt: an HTML parser context
  3929. *
  3930. * parse an HTML element, new version, non recursive
  3931. *
  3932. * [39] element ::= EmptyElemTag | STag content ETag
  3933. *
  3934. * [41] Attribute ::= Name Eq AttValue
  3935. */
  3936. static void
  3937. htmlParseElementInternal(htmlParserCtxtPtr ctxt) {
  3938. const xmlChar *name;
  3939. const htmlElemDesc * info;
  3940. htmlParserNodeInfo node_info;
  3941. int failed;
  3942. if ((ctxt == NULL) || (ctxt->input == NULL)) {
  3943. htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
  3944. "htmlParseElementInternal: context error\n", NULL, NULL);
  3945. return;
  3946. }
  3947. if (ctxt->instate == XML_PARSER_EOF)
  3948. return;
  3949. /* Capture start position */
  3950. if (ctxt->record_info) {
  3951. node_info.begin_pos = ctxt->input->consumed +
  3952. (CUR_PTR - ctxt->input->base);
  3953. node_info.begin_line = ctxt->input->line;
  3954. }
  3955. failed = htmlParseStartTag(ctxt);
  3956. name = ctxt->name;
  3957. if ((failed == -1) || (name == NULL)) {
  3958. if (CUR == '>')
  3959. NEXT;
  3960. return;
  3961. }
  3962. /*
  3963. * Lookup the info for that element.
  3964. */
  3965. info = htmlTagLookup(name);
  3966. if (info == NULL) {
  3967. htmlParseErr(ctxt, XML_HTML_UNKNOWN_TAG,
  3968. "Tag %s invalid\n", name, NULL);
  3969. }
  3970. /*
  3971. * Check for an Empty Element labeled the XML/SGML way
  3972. */
  3973. if ((CUR == '/') && (NXT(1) == '>')) {
  3974. SKIP(2);
  3975. if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
  3976. ctxt->sax->endElement(ctxt->userData, name);
  3977. htmlnamePop(ctxt);
  3978. return;
  3979. }
  3980. if (CUR == '>') {
  3981. NEXT;
  3982. } else {
  3983. htmlParseErr(ctxt, XML_ERR_GT_REQUIRED,
  3984. "Couldn't find end of Start Tag %s\n", name, NULL);
  3985. /*
  3986. * end of parsing of this node.
  3987. */
  3988. if (xmlStrEqual(name, ctxt->name)) {
  3989. nodePop(ctxt);
  3990. htmlnamePop(ctxt);
  3991. }
  3992. if (ctxt->record_info)
  3993. htmlNodeInfoPush(ctxt, &node_info);
  3994. htmlParserFinishElementParsing(ctxt);
  3995. return;
  3996. }
  3997. /*
  3998. * Check for an Empty Element from DTD definition
  3999. */
  4000. if ((info != NULL) && (info->empty)) {
  4001. if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
  4002. ctxt->sax->endElement(ctxt->userData, name);
  4003. htmlnamePop(ctxt);
  4004. return;
  4005. }
  4006. if (ctxt->record_info)
  4007. htmlNodeInfoPush(ctxt, &node_info);
  4008. }
  4009. /**
  4010. * htmlParseContentInternal:
  4011. * @ctxt: an HTML parser context
  4012. *
  4013. * Parse a content: comment, sub-element, reference or text.
  4014. * New version for non recursive htmlParseElementInternal
  4015. */
  4016. static void
  4017. htmlParseContentInternal(htmlParserCtxtPtr ctxt) {
  4018. xmlChar *currentNode;
  4019. int depth;
  4020. const xmlChar *name;
  4021. currentNode = xmlStrdup(ctxt->name);
  4022. depth = ctxt->nameNr;
  4023. while (1) {
  4024. long cons = ctxt->nbChars;
  4025. GROW;
  4026. if (ctxt->instate == XML_PARSER_EOF)
  4027. break;
  4028. /*
  4029. * Our tag or one of it's parent or children is ending.
  4030. */
  4031. if ((CUR == '<') && (NXT(1) == '/')) {
  4032. if (htmlParseEndTag(ctxt) &&
  4033. ((currentNode != NULL) || (ctxt->nameNr == 0))) {
  4034. if (currentNode != NULL)
  4035. xmlFree(currentNode);
  4036. currentNode = xmlStrdup(ctxt->name);
  4037. depth = ctxt->nameNr;
  4038. }
  4039. continue; /* while */
  4040. }
  4041. else if ((CUR == '<') &&
  4042. ((IS_ASCII_LETTER(NXT(1))) ||
  4043. (NXT(1) == '_') || (NXT(1) == ':'))) {
  4044. name = htmlParseHTMLName_nonInvasive(ctxt);
  4045. if (name == NULL) {
  4046. htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED,
  4047. "htmlParseStartTag: invalid element name\n",
  4048. NULL, NULL);
  4049. /* Dump the bogus tag like browsers do */
  4050. while ((IS_CHAR_CH(CUR)) && (CUR != '>'))
  4051. NEXT;
  4052. htmlParserFinishElementParsing(ctxt);
  4053. if (currentNode != NULL)
  4054. xmlFree(currentNode);
  4055. currentNode = xmlStrdup(ctxt->name);
  4056. depth = ctxt->nameNr;
  4057. continue;
  4058. }
  4059. if (ctxt->name != NULL) {
  4060. if (htmlCheckAutoClose(name, ctxt->name) == 1) {
  4061. htmlAutoClose(ctxt, name);
  4062. continue;
  4063. }
  4064. }
  4065. }
  4066. /*
  4067. * Has this node been popped out during parsing of
  4068. * the next element
  4069. */
  4070. if ((ctxt->nameNr > 0) && (depth >= ctxt->nameNr) &&
  4071. (!xmlStrEqual(currentNode, ctxt->name)))
  4072. {
  4073. htmlParserFinishElementParsing(ctxt);
  4074. if (currentNode != NULL) xmlFree(currentNode);
  4075. currentNode = xmlStrdup(ctxt->name);
  4076. depth = ctxt->nameNr;
  4077. continue;
  4078. }
  4079. if ((CUR != 0) && ((xmlStrEqual(currentNode, BAD_CAST"script")) ||
  4080. (xmlStrEqual(currentNode, BAD_CAST"style")))) {
  4081. /*
  4082. * Handle SCRIPT/STYLE separately
  4083. */
  4084. htmlParseScript(ctxt);
  4085. } else {
  4086. /*
  4087. * Sometimes DOCTYPE arrives in the middle of the document
  4088. */
  4089. if ((CUR == '<') && (NXT(1) == '!') &&
  4090. (UPP(2) == 'D') && (UPP(3) == 'O') &&
  4091. (UPP(4) == 'C') && (UPP(5) == 'T') &&
  4092. (UPP(6) == 'Y') && (UPP(7) == 'P') &&
  4093. (UPP(8) == 'E')) {
  4094. htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
  4095. "Misplaced DOCTYPE declaration\n",
  4096. BAD_CAST "DOCTYPE" , NULL);
  4097. htmlParseDocTypeDecl(ctxt);
  4098. }
  4099. /*
  4100. * First case : a comment
  4101. */
  4102. if ((CUR == '<') && (NXT(1) == '!') &&
  4103. (NXT(2) == '-') && (NXT(3) == '-')) {
  4104. htmlParseComment(ctxt);
  4105. }
  4106. /*
  4107. * Second case : a Processing Instruction.
  4108. */
  4109. else if ((CUR == '<') && (NXT(1) == '?')) {
  4110. htmlParsePI(ctxt);
  4111. }
  4112. /*
  4113. * Third case : a sub-element.
  4114. */
  4115. else if (CUR == '<') {
  4116. htmlParseElementInternal(ctxt);
  4117. if (currentNode != NULL) xmlFree(currentNode);
  4118. currentNode = xmlStrdup(ctxt->name);
  4119. depth = ctxt->nameNr;
  4120. }
  4121. /*
  4122. * Fourth case : a reference. If if has not been resolved,
  4123. * parsing returns it's Name, create the node
  4124. */
  4125. else if (CUR == '&') {
  4126. htmlParseReference(ctxt);
  4127. }
  4128. /*
  4129. * Fifth case : end of the resource
  4130. */
  4131. else if (CUR == 0) {
  4132. htmlAutoCloseOnEnd(ctxt);
  4133. break;
  4134. }
  4135. /*
  4136. * Last case, text. Note that References are handled directly.
  4137. */
  4138. else {
  4139. htmlParseCharData(ctxt);
  4140. }
  4141. if (cons == ctxt->nbChars) {
  4142. if (ctxt->node != NULL) {
  4143. htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
  4144. "detected an error in element content\n",
  4145. NULL, NULL);
  4146. }
  4147. break;
  4148. }
  4149. }
  4150. GROW;
  4151. }
  4152. if (currentNode != NULL) xmlFree(currentNode);
  4153. }
  4154. /**
  4155. * htmlParseContent:
  4156. * @ctxt: an HTML parser context
  4157. *
  4158. * Parse a content: comment, sub-element, reference or text.
  4159. * This is the entry point when called from parser.c
  4160. */
  4161. void
  4162. __htmlParseContent(void *ctxt) {
  4163. if (ctxt != NULL)
  4164. htmlParseContentInternal((htmlParserCtxtPtr) ctxt);
  4165. }
  4166. /**
  4167. * htmlParseDocument:
  4168. * @ctxt: an HTML parser context
  4169. *
  4170. * parse an HTML document (and build a tree if using the standard SAX
  4171. * interface).
  4172. *
  4173. * Returns 0, -1 in case of error. the parser context is augmented
  4174. * as a result of the parsing.
  4175. */
  4176. int
  4177. htmlParseDocument(htmlParserCtxtPtr ctxt) {
  4178. xmlChar start[4];
  4179. xmlCharEncoding enc;
  4180. xmlDtdPtr dtd;
  4181. xmlInitParser();
  4182. htmlDefaultSAXHandlerInit();
  4183. if ((ctxt == NULL) || (ctxt->input == NULL)) {
  4184. htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
  4185. "htmlParseDocument: context error\n", NULL, NULL);
  4186. return(XML_ERR_INTERNAL_ERROR);
  4187. }
  4188. ctxt->html = 1;
  4189. ctxt->linenumbers = 1;
  4190. GROW;
  4191. /*
  4192. * SAX: beginning of the document processing.
  4193. */
  4194. if ((ctxt->sax) && (ctxt->sax->setDocumentLocator))
  4195. ctxt->sax->setDocumentLocator(ctxt->userData, &xmlDefaultSAXLocator);
  4196. if ((ctxt->encoding == (const xmlChar *)XML_CHAR_ENCODING_NONE) &&
  4197. ((ctxt->input->end - ctxt->input->cur) >= 4)) {
  4198. /*
  4199. * Get the 4 first bytes and decode the charset
  4200. * if enc != XML_CHAR_ENCODING_NONE
  4201. * plug some encoding conversion routines.
  4202. */
  4203. start[0] = RAW;
  4204. start[1] = NXT(1);
  4205. start[2] = NXT(2);
  4206. start[3] = NXT(3);
  4207. enc = xmlDetectCharEncoding(&start[0], 4);
  4208. if (enc != XML_CHAR_ENCODING_NONE) {
  4209. xmlSwitchEncoding(ctxt, enc);
  4210. }
  4211. }
  4212. /*
  4213. * Wipe out everything which is before the first '<'
  4214. */
  4215. SKIP_BLANKS;
  4216. if (CUR == 0) {
  4217. htmlParseErr(ctxt, XML_ERR_DOCUMENT_EMPTY,
  4218. "Document is empty\n", NULL, NULL);
  4219. }
  4220. if ((ctxt->sax) && (ctxt->sax->startDocument) && (!ctxt->disableSAX))
  4221. ctxt->sax->startDocument(ctxt->userData);
  4222. /*
  4223. * Parse possible comments and PIs before any content
  4224. */
  4225. while (((CUR == '<') && (NXT(1) == '!') &&
  4226. (NXT(2) == '-') && (NXT(3) == '-')) ||
  4227. ((CUR == '<') && (NXT(1) == '?'))) {
  4228. htmlParseComment(ctxt);
  4229. htmlParsePI(ctxt);
  4230. SKIP_BLANKS;
  4231. }
  4232. /*
  4233. * Then possibly doc type declaration(s) and more Misc
  4234. * (doctypedecl Misc*)?
  4235. */
  4236. if ((CUR == '<') && (NXT(1) == '!') &&
  4237. (UPP(2) == 'D') && (UPP(3) == 'O') &&
  4238. (UPP(4) == 'C') && (UPP(5) == 'T') &&
  4239. (UPP(6) == 'Y') && (UPP(7) == 'P') &&
  4240. (UPP(8) == 'E')) {
  4241. htmlParseDocTypeDecl(ctxt);
  4242. }
  4243. SKIP_BLANKS;
  4244. /*
  4245. * Parse possible comments and PIs before any content
  4246. */
  4247. while (((CUR == '<') && (NXT(1) == '!') &&
  4248. (NXT(2) == '-') && (NXT(3) == '-')) ||
  4249. ((CUR == '<') && (NXT(1) == '?'))) {
  4250. htmlParseComment(ctxt);
  4251. htmlParsePI(ctxt);
  4252. SKIP_BLANKS;
  4253. }
  4254. /*
  4255. * Time to start parsing the tree itself
  4256. */
  4257. htmlParseContentInternal(ctxt);
  4258. /*
  4259. * autoclose
  4260. */
  4261. if (CUR == 0)
  4262. htmlAutoCloseOnEnd(ctxt);
  4263. /*
  4264. * SAX: end of the document processing.
  4265. */
  4266. if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
  4267. ctxt->sax->endDocument(ctxt->userData);
  4268. if (ctxt->myDoc != NULL) {
  4269. dtd = xmlGetIntSubset(ctxt->myDoc);
  4270. if (dtd == NULL)
  4271. ctxt->myDoc->intSubset =
  4272. xmlCreateIntSubset(ctxt->myDoc, BAD_CAST "html",
  4273. BAD_CAST "-//W3C//DTD HTML 4.0 Transitional//EN",
  4274. BAD_CAST "http://www.w3.org/TR/REC-html40/loose.dtd");
  4275. }
  4276. if (! ctxt->wellFormed) return(-1);
  4277. return(0);
  4278. }
  4279. /************************************************************************
  4280. * *
  4281. * Parser contexts handling *
  4282. * *
  4283. ************************************************************************/
  4284. /**
  4285. * htmlInitParserCtxt:
  4286. * @ctxt: an HTML parser context
  4287. *
  4288. * Initialize a parser context
  4289. *
  4290. * Returns 0 in case of success and -1 in case of error
  4291. */
  4292. static int
  4293. htmlInitParserCtxt(htmlParserCtxtPtr ctxt)
  4294. {
  4295. htmlSAXHandler *sax;
  4296. if (ctxt == NULL) return(-1);
  4297. memset(ctxt, 0, sizeof(htmlParserCtxt));
  4298. ctxt->dict = xmlDictCreate();
  4299. if (ctxt->dict == NULL) {
  4300. htmlErrMemory(NULL, "htmlInitParserCtxt: out of memory\n");
  4301. return(-1);
  4302. }
  4303. sax = (htmlSAXHandler *) xmlMalloc(sizeof(htmlSAXHandler));
  4304. if (sax == NULL) {
  4305. htmlErrMemory(NULL, "htmlInitParserCtxt: out of memory\n");
  4306. return(-1);
  4307. }
  4308. else
  4309. memset(sax, 0, sizeof(htmlSAXHandler));
  4310. /* Allocate the Input stack */
  4311. ctxt->inputTab = (htmlParserInputPtr *)
  4312. xmlMalloc(5 * sizeof(htmlParserInputPtr));
  4313. if (ctxt->inputTab == NULL) {
  4314. htmlErrMemory(NULL, "htmlInitParserCtxt: out of memory\n");
  4315. ctxt->inputNr = 0;
  4316. ctxt->inputMax = 0;
  4317. ctxt->input = NULL;
  4318. return(-1);
  4319. }
  4320. ctxt->inputNr = 0;
  4321. ctxt->inputMax = 5;
  4322. ctxt->input = NULL;
  4323. ctxt->version = NULL;
  4324. ctxt->encoding = NULL;
  4325. ctxt->standalone = -1;
  4326. ctxt->instate = XML_PARSER_START;
  4327. /* Allocate the Node stack */
  4328. ctxt->nodeTab = (htmlNodePtr *) xmlMalloc(10 * sizeof(htmlNodePtr));
  4329. if (ctxt->nodeTab == NULL) {
  4330. htmlErrMemory(NULL, "htmlInitParserCtxt: out of memory\n");
  4331. ctxt->nodeNr = 0;
  4332. ctxt->nodeMax = 0;
  4333. ctxt->node = NULL;
  4334. ctxt->inputNr = 0;
  4335. ctxt->inputMax = 0;
  4336. ctxt->input = NULL;
  4337. return(-1);
  4338. }
  4339. ctxt->nodeNr = 0;
  4340. ctxt->nodeMax = 10;
  4341. ctxt->node = NULL;
  4342. /* Allocate the Name stack */
  4343. ctxt->nameTab = (const xmlChar **) xmlMalloc(10 * sizeof(xmlChar *));
  4344. if (ctxt->nameTab == NULL) {
  4345. htmlErrMemory(NULL, "htmlInitParserCtxt: out of memory\n");
  4346. ctxt->nameNr = 0;
  4347. ctxt->nameMax = 0;
  4348. ctxt->name = NULL;
  4349. ctxt->nodeNr = 0;
  4350. ctxt->nodeMax = 0;
  4351. ctxt->node = NULL;
  4352. ctxt->inputNr = 0;
  4353. ctxt->inputMax = 0;
  4354. ctxt->input = NULL;
  4355. return(-1);
  4356. }
  4357. ctxt->nameNr = 0;
  4358. ctxt->nameMax = 10;
  4359. ctxt->name = NULL;
  4360. ctxt->nodeInfoTab = NULL;
  4361. ctxt->nodeInfoNr = 0;
  4362. ctxt->nodeInfoMax = 0;
  4363. if (sax == NULL) ctxt->sax = (xmlSAXHandlerPtr) &htmlDefaultSAXHandler;
  4364. else {
  4365. ctxt->sax = sax;
  4366. memcpy(sax, &htmlDefaultSAXHandler, sizeof(xmlSAXHandlerV1));
  4367. }
  4368. ctxt->userData = ctxt;
  4369. ctxt->myDoc = NULL;
  4370. ctxt->wellFormed = 1;
  4371. ctxt->replaceEntities = 0;
  4372. ctxt->linenumbers = xmlLineNumbersDefaultValue;
  4373. ctxt->html = 1;
  4374. ctxt->vctxt.finishDtd = XML_CTXT_FINISH_DTD_0;
  4375. ctxt->vctxt.userData = ctxt;
  4376. ctxt->vctxt.error = xmlParserValidityError;
  4377. ctxt->vctxt.warning = xmlParserValidityWarning;
  4378. ctxt->record_info = 0;
  4379. ctxt->validate = 0;
  4380. ctxt->nbChars = 0;
  4381. ctxt->checkIndex = 0;
  4382. ctxt->catalogs = NULL;
  4383. xmlInitNodeInfoSeq(&ctxt->node_seq);
  4384. return(0);
  4385. }
  4386. /**
  4387. * htmlFreeParserCtxt:
  4388. * @ctxt: an HTML parser context
  4389. *
  4390. * Free all the memory used by a parser context. However the parsed
  4391. * document in ctxt->myDoc is not freed.
  4392. */
  4393. void
  4394. htmlFreeParserCtxt(htmlParserCtxtPtr ctxt)
  4395. {
  4396. xmlFreeParserCtxt(ctxt);
  4397. }
  4398. /**
  4399. * htmlNewParserCtxt:
  4400. *
  4401. * Allocate and initialize a new parser context.
  4402. *
  4403. * Returns the htmlParserCtxtPtr or NULL in case of allocation error
  4404. */
  4405. htmlParserCtxtPtr
  4406. htmlNewParserCtxt(void)
  4407. {
  4408. xmlParserCtxtPtr ctxt;
  4409. ctxt = (xmlParserCtxtPtr) xmlMalloc(sizeof(xmlParserCtxt));
  4410. if (ctxt == NULL) {
  4411. htmlErrMemory(NULL, "NewParserCtxt: out of memory\n");
  4412. return(NULL);
  4413. }
  4414. memset(ctxt, 0, sizeof(xmlParserCtxt));
  4415. if (htmlInitParserCtxt(ctxt) < 0) {
  4416. htmlFreeParserCtxt(ctxt);
  4417. return(NULL);
  4418. }
  4419. return(ctxt);
  4420. }
  4421. /**
  4422. * htmlCreateMemoryParserCtxt:
  4423. * @buffer: a pointer to a char array
  4424. * @size: the size of the array
  4425. *
  4426. * Create a parser context for an HTML in-memory document.
  4427. *
  4428. * Returns the new parser context or NULL
  4429. */
  4430. htmlParserCtxtPtr
  4431. htmlCreateMemoryParserCtxt(const char *buffer, int size) {
  4432. xmlParserCtxtPtr ctxt;
  4433. xmlParserInputPtr input;
  4434. xmlParserInputBufferPtr buf;
  4435. if (buffer == NULL)
  4436. return(NULL);
  4437. if (size <= 0)
  4438. return(NULL);
  4439. ctxt = htmlNewParserCtxt();
  4440. if (ctxt == NULL)
  4441. return(NULL);
  4442. buf = xmlParserInputBufferCreateMem(buffer, size, XML_CHAR_ENCODING_NONE);
  4443. if (buf == NULL) return(NULL);
  4444. input = xmlNewInputStream(ctxt);
  4445. if (input == NULL) {
  4446. xmlFreeParserCtxt(ctxt);
  4447. return(NULL);
  4448. }
  4449. input->filename = NULL;
  4450. input->buf = buf;
  4451. input->base = input->buf->buffer->content;
  4452. input->cur = input->buf->buffer->content;
  4453. input->end = &input->buf->buffer->content[input->buf->buffer->use];
  4454. inputPush(ctxt, input);
  4455. return(ctxt);
  4456. }
  4457. /**
  4458. * htmlCreateDocParserCtxt:
  4459. * @cur: a pointer to an array of xmlChar
  4460. * @encoding: a free form C string describing the HTML document encoding, or NULL
  4461. *
  4462. * Create a parser context for an HTML document.
  4463. *
  4464. * TODO: check the need to add encoding handling there
  4465. *
  4466. * Returns the new parser context or NULL
  4467. */
  4468. static htmlParserCtxtPtr
  4469. htmlCreateDocParserCtxt(const xmlChar *cur, const char *encoding) {
  4470. int len;
  4471. htmlParserCtxtPtr ctxt;
  4472. if (cur == NULL)
  4473. return(NULL);
  4474. len = xmlStrlen(cur);
  4475. ctxt = htmlCreateMemoryParserCtxt((char *)cur, len);
  4476. if (ctxt == NULL)
  4477. return(NULL);
  4478. if (encoding != NULL) {
  4479. xmlCharEncoding enc;
  4480. xmlCharEncodingHandlerPtr handler;
  4481. if (ctxt->input->encoding != NULL)
  4482. xmlFree((xmlChar *) ctxt->input->encoding);
  4483. ctxt->input->encoding = xmlStrdup((const xmlChar *) encoding);
  4484. enc = xmlParseCharEncoding(encoding);
  4485. /*
  4486. * registered set of known encodings
  4487. */
  4488. if (enc != XML_CHAR_ENCODING_ERROR) {
  4489. xmlSwitchEncoding(ctxt, enc);
  4490. if (ctxt->errNo == XML_ERR_UNSUPPORTED_ENCODING) {
  4491. htmlParseErr(ctxt, XML_ERR_UNSUPPORTED_ENCODING,
  4492. "Unsupported encoding %s\n",
  4493. (const xmlChar *) encoding, NULL);
  4494. }
  4495. } else {
  4496. /*
  4497. * fallback for unknown encodings
  4498. */
  4499. handler = xmlFindCharEncodingHandler((const char *) encoding);
  4500. if (handler != NULL) {
  4501. xmlSwitchToEncoding(ctxt, handler);
  4502. } else {
  4503. htmlParseErr(ctxt, XML_ERR_UNSUPPORTED_ENCODING,
  4504. "Unsupported encoding %s\n",
  4505. (const xmlChar *) encoding, NULL);
  4506. }
  4507. }
  4508. }
  4509. return(ctxt);
  4510. }
  4511. #ifdef LIBXML_PUSH_ENABLED
  4512. /************************************************************************
  4513. * *
  4514. * Progressive parsing interfaces *
  4515. * *
  4516. ************************************************************************/
  4517. /**
  4518. * htmlParseLookupSequence:
  4519. * @ctxt: an HTML parser context
  4520. * @first: the first char to lookup
  4521. * @next: the next char to lookup or zero
  4522. * @third: the next char to lookup or zero
  4523. * @comment: flag to force checking inside comments
  4524. *
  4525. * Try to find if a sequence (first, next, third) or just (first next) or
  4526. * (first) is available in the input stream.
  4527. * This function has a side effect of (possibly) incrementing ctxt->checkIndex
  4528. * to avoid rescanning sequences of bytes, it DOES change the state of the
  4529. * parser, do not use liberally.
  4530. * This is basically similar to xmlParseLookupSequence()
  4531. *
  4532. * Returns the index to the current parsing point if the full sequence
  4533. * is available, -1 otherwise.
  4534. */
  4535. static int
  4536. htmlParseLookupSequence(htmlParserCtxtPtr ctxt, xmlChar first,
  4537. xmlChar next, xmlChar third, int iscomment,
  4538. int ignoreattrval)
  4539. {
  4540. int base, len;
  4541. htmlParserInputPtr in;
  4542. const xmlChar *buf;
  4543. int incomment = 0;
  4544. int invalue = 0;
  4545. char valdellim = 0x0;
  4546. in = ctxt->input;
  4547. if (in == NULL)
  4548. return (-1);
  4549. base = in->cur - in->base;
  4550. if (base < 0)
  4551. return (-1);
  4552. if (ctxt->checkIndex > base)
  4553. base = ctxt->checkIndex;
  4554. if (in->buf == NULL) {
  4555. buf = in->base;
  4556. len = in->length;
  4557. } else {
  4558. buf = in->buf->buffer->content;
  4559. len = in->buf->buffer->use;
  4560. }
  4561. /* take into account the sequence length */
  4562. if (third)
  4563. len -= 2;
  4564. else if (next)
  4565. len--;
  4566. for (; base < len; base++) {
  4567. if ((!incomment) && (base + 4 < len) && (!iscomment)) {
  4568. if ((buf[base] == '<') && (buf[base + 1] == '!') &&
  4569. (buf[base + 2] == '-') && (buf[base + 3] == '-')) {
  4570. incomment = 1;
  4571. /* do not increment past <! - some people use <!--> */
  4572. base += 2;
  4573. }
  4574. }
  4575. if (ignoreattrval) {
  4576. if (buf[base] == '"' || buf[base] == '\'') {
  4577. if (invalue) {
  4578. if (buf[base] == valdellim) {
  4579. invalue = 0;
  4580. continue;
  4581. }
  4582. } else {
  4583. valdellim = buf[base];
  4584. invalue = 1;
  4585. continue;
  4586. }
  4587. } else if (invalue) {
  4588. continue;
  4589. }
  4590. }
  4591. if (incomment) {
  4592. if (base + 3 > len)
  4593. return (-1);
  4594. if ((buf[base] == '-') && (buf[base + 1] == '-') &&
  4595. (buf[base + 2] == '>')) {
  4596. incomment = 0;
  4597. base += 2;
  4598. }
  4599. continue;
  4600. }
  4601. if (buf[base] == first) {
  4602. if (third != 0) {
  4603. if ((buf[base + 1] != next) || (buf[base + 2] != third))
  4604. continue;
  4605. } else if (next != 0) {
  4606. if (buf[base + 1] != next)
  4607. continue;
  4608. }
  4609. ctxt->checkIndex = 0;
  4610. #ifdef DEBUG_PUSH
  4611. if (next == 0)
  4612. xmlGenericError(xmlGenericErrorContext,
  4613. "HPP: lookup '%c' found at %d\n",
  4614. first, base);
  4615. else if (third == 0)
  4616. xmlGenericError(xmlGenericErrorContext,
  4617. "HPP: lookup '%c%c' found at %d\n",
  4618. first, next, base);
  4619. else
  4620. xmlGenericError(xmlGenericErrorContext,
  4621. "HPP: lookup '%c%c%c' found at %d\n",
  4622. first, next, third, base);
  4623. #endif
  4624. return (base - (in->cur - in->base));
  4625. }
  4626. }
  4627. if ((!incomment) && (!invalue))
  4628. ctxt->checkIndex = base;
  4629. #ifdef DEBUG_PUSH
  4630. if (next == 0)
  4631. xmlGenericError(xmlGenericErrorContext,
  4632. "HPP: lookup '%c' failed\n", first);
  4633. else if (third == 0)
  4634. xmlGenericError(xmlGenericErrorContext,
  4635. "HPP: lookup '%c%c' failed\n", first, next);
  4636. else
  4637. xmlGenericError(xmlGenericErrorContext,
  4638. "HPP: lookup '%c%c%c' failed\n", first, next,
  4639. third);
  4640. #endif
  4641. return (-1);
  4642. }
  4643. /**
  4644. * htmlParseLookupChars:
  4645. * @ctxt: an HTML parser context
  4646. * @stop: Array of chars, which stop the lookup.
  4647. * @stopLen: Length of stop-Array
  4648. *
  4649. * Try to find if any char of the stop-Array is available in the input
  4650. * stream.
  4651. * This function has a side effect of (possibly) incrementing ctxt->checkIndex
  4652. * to avoid rescanning sequences of bytes, it DOES change the state of the
  4653. * parser, do not use liberally.
  4654. *
  4655. * Returns the index to the current parsing point if a stopChar
  4656. * is available, -1 otherwise.
  4657. */
  4658. static int
  4659. htmlParseLookupChars(htmlParserCtxtPtr ctxt, const xmlChar * stop,
  4660. int stopLen)
  4661. {
  4662. int base, len;
  4663. htmlParserInputPtr in;
  4664. const xmlChar *buf;
  4665. int incomment = 0;
  4666. int i;
  4667. in = ctxt->input;
  4668. if (in == NULL)
  4669. return (-1);
  4670. base = in->cur - in->base;
  4671. if (base < 0)
  4672. return (-1);
  4673. if (ctxt->checkIndex > base)
  4674. base = ctxt->checkIndex;
  4675. if (in->buf == NULL) {
  4676. buf = in->base;
  4677. len = in->length;
  4678. } else {
  4679. buf = in->buf->buffer->content;
  4680. len = in->buf->buffer->use;
  4681. }
  4682. for (; base < len; base++) {
  4683. if (!incomment && (base + 4 < len)) {
  4684. if ((buf[base] == '<') && (buf[base + 1] == '!') &&
  4685. (buf[base + 2] == '-') && (buf[base + 3] == '-')) {
  4686. incomment = 1;
  4687. /* do not increment past <! - some people use <!--> */
  4688. base += 2;
  4689. }
  4690. }
  4691. if (incomment) {
  4692. if (base + 3 > len)
  4693. return (-1);
  4694. if ((buf[base] == '-') && (buf[base + 1] == '-') &&
  4695. (buf[base + 2] == '>')) {
  4696. incomment = 0;
  4697. base += 2;
  4698. }
  4699. continue;
  4700. }
  4701. for (i = 0; i < stopLen; ++i) {
  4702. if (buf[base] == stop[i]) {
  4703. ctxt->checkIndex = 0;
  4704. return (base - (in->cur - in->base));
  4705. }
  4706. }
  4707. }
  4708. ctxt->checkIndex = base;
  4709. return (-1);
  4710. }
  4711. /**
  4712. * htmlParseTryOrFinish:
  4713. * @ctxt: an HTML parser context
  4714. * @terminate: last chunk indicator
  4715. *
  4716. * Try to progress on parsing
  4717. *
  4718. * Returns zero if no parsing was possible
  4719. */
  4720. static int
  4721. htmlParseTryOrFinish(htmlParserCtxtPtr ctxt, int terminate) {
  4722. int ret = 0;
  4723. htmlParserInputPtr in;
  4724. int avail = 0;
  4725. xmlChar cur, next;
  4726. #ifdef DEBUG_PUSH
  4727. switch (ctxt->instate) {
  4728. case XML_PARSER_EOF:
  4729. xmlGenericError(xmlGenericErrorContext,
  4730. "HPP: try EOF\n"); break;
  4731. case XML_PARSER_START:
  4732. xmlGenericError(xmlGenericErrorContext,
  4733. "HPP: try START\n"); break;
  4734. case XML_PARSER_MISC:
  4735. xmlGenericError(xmlGenericErrorContext,
  4736. "HPP: try MISC\n");break;
  4737. case XML_PARSER_COMMENT:
  4738. xmlGenericError(xmlGenericErrorContext,
  4739. "HPP: try COMMENT\n");break;
  4740. case XML_PARSER_PROLOG:
  4741. xmlGenericError(xmlGenericErrorContext,
  4742. "HPP: try PROLOG\n");break;
  4743. case XML_PARSER_START_TAG:
  4744. xmlGenericError(xmlGenericErrorContext,
  4745. "HPP: try START_TAG\n");break;
  4746. case XML_PARSER_CONTENT:
  4747. xmlGenericError(xmlGenericErrorContext,
  4748. "HPP: try CONTENT\n");break;
  4749. case XML_PARSER_CDATA_SECTION:
  4750. xmlGenericError(xmlGenericErrorContext,
  4751. "HPP: try CDATA_SECTION\n");break;
  4752. case XML_PARSER_END_TAG:
  4753. xmlGenericError(xmlGenericErrorContext,
  4754. "HPP: try END_TAG\n");break;
  4755. case XML_PARSER_ENTITY_DECL:
  4756. xmlGenericError(xmlGenericErrorContext,
  4757. "HPP: try ENTITY_DECL\n");break;
  4758. case XML_PARSER_ENTITY_VALUE:
  4759. xmlGenericError(xmlGenericErrorContext,
  4760. "HPP: try ENTITY_VALUE\n");break;
  4761. case XML_PARSER_ATTRIBUTE_VALUE:
  4762. xmlGenericError(xmlGenericErrorContext,
  4763. "HPP: try ATTRIBUTE_VALUE\n");break;
  4764. case XML_PARSER_DTD:
  4765. xmlGenericError(xmlGenericErrorContext,
  4766. "HPP: try DTD\n");break;
  4767. case XML_PARSER_EPILOG:
  4768. xmlGenericError(xmlGenericErrorContext,
  4769. "HPP: try EPILOG\n");break;
  4770. case XML_PARSER_PI:
  4771. xmlGenericError(xmlGenericErrorContext,
  4772. "HPP: try PI\n");break;
  4773. case XML_PARSER_SYSTEM_LITERAL:
  4774. xmlGenericError(xmlGenericErrorContext,
  4775. "HPP: try SYSTEM_LITERAL\n");break;
  4776. }
  4777. #endif
  4778. while (1) {
  4779. in = ctxt->input;
  4780. if (in == NULL) break;
  4781. if (in->buf == NULL)
  4782. avail = in->length - (in->cur - in->base);
  4783. else
  4784. avail = in->buf->buffer->use - (in->cur - in->base);
  4785. if ((avail == 0) && (terminate)) {
  4786. htmlAutoCloseOnEnd(ctxt);
  4787. if ((ctxt->nameNr == 0) && (ctxt->instate != XML_PARSER_EOF)) {
  4788. /*
  4789. * SAX: end of the document processing.
  4790. */
  4791. ctxt->instate = XML_PARSER_EOF;
  4792. if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
  4793. ctxt->sax->endDocument(ctxt->userData);
  4794. }
  4795. }
  4796. if (avail < 1)
  4797. goto done;
  4798. cur = in->cur[0];
  4799. if (cur == 0) {
  4800. SKIP(1);
  4801. continue;
  4802. }
  4803. switch (ctxt->instate) {
  4804. case XML_PARSER_EOF:
  4805. /*
  4806. * Document parsing is done !
  4807. */
  4808. goto done;
  4809. case XML_PARSER_START:
  4810. /*
  4811. * Very first chars read from the document flow.
  4812. */
  4813. cur = in->cur[0];
  4814. if (IS_BLANK_CH(cur)) {
  4815. SKIP_BLANKS;
  4816. if (in->buf == NULL)
  4817. avail = in->length - (in->cur - in->base);
  4818. else
  4819. avail = in->buf->buffer->use - (in->cur - in->base);
  4820. }
  4821. if ((ctxt->sax) && (ctxt->sax->setDocumentLocator))
  4822. ctxt->sax->setDocumentLocator(ctxt->userData,
  4823. &xmlDefaultSAXLocator);
  4824. if ((ctxt->sax) && (ctxt->sax->startDocument) &&
  4825. (!ctxt->disableSAX))
  4826. ctxt->sax->startDocument(ctxt->userData);
  4827. cur = in->cur[0];
  4828. next = in->cur[1];
  4829. if ((cur == '<') && (next == '!') &&
  4830. (UPP(2) == 'D') && (UPP(3) == 'O') &&
  4831. (UPP(4) == 'C') && (UPP(5) == 'T') &&
  4832. (UPP(6) == 'Y') && (UPP(7) == 'P') &&
  4833. (UPP(8) == 'E')) {
  4834. if ((!terminate) &&
  4835. (htmlParseLookupSequence(ctxt, '>', 0, 0, 0, 1) < 0))
  4836. goto done;
  4837. #ifdef DEBUG_PUSH
  4838. xmlGenericError(xmlGenericErrorContext,
  4839. "HPP: Parsing internal subset\n");
  4840. #endif
  4841. htmlParseDocTypeDecl(ctxt);
  4842. ctxt->instate = XML_PARSER_PROLOG;
  4843. #ifdef DEBUG_PUSH
  4844. xmlGenericError(xmlGenericErrorContext,
  4845. "HPP: entering PROLOG\n");
  4846. #endif
  4847. } else {
  4848. ctxt->instate = XML_PARSER_MISC;
  4849. #ifdef DEBUG_PUSH
  4850. xmlGenericError(xmlGenericErrorContext,
  4851. "HPP: entering MISC\n");
  4852. #endif
  4853. }
  4854. break;
  4855. case XML_PARSER_MISC:
  4856. SKIP_BLANKS;
  4857. if (in->buf == NULL)
  4858. avail = in->length - (in->cur - in->base);
  4859. else
  4860. avail = in->buf->buffer->use - (in->cur - in->base);
  4861. if (avail < 2)
  4862. goto done;
  4863. cur = in->cur[0];
  4864. next = in->cur[1];
  4865. if ((cur == '<') && (next == '!') &&
  4866. (in->cur[2] == '-') && (in->cur[3] == '-')) {
  4867. if ((!terminate) &&
  4868. (htmlParseLookupSequence(ctxt, '-', '-', '>', 1, 1) < 0))
  4869. goto done;
  4870. #ifdef DEBUG_PUSH
  4871. xmlGenericError(xmlGenericErrorContext,
  4872. "HPP: Parsing Comment\n");
  4873. #endif
  4874. htmlParseComment(ctxt);
  4875. ctxt->instate = XML_PARSER_MISC;
  4876. } else if ((cur == '<') && (next == '?')) {
  4877. if ((!terminate) &&
  4878. (htmlParseLookupSequence(ctxt, '>', 0, 0, 0, 1) < 0))
  4879. goto done;
  4880. #ifdef DEBUG_PUSH
  4881. xmlGenericError(xmlGenericErrorContext,
  4882. "HPP: Parsing PI\n");
  4883. #endif
  4884. htmlParsePI(ctxt);
  4885. ctxt->instate = XML_PARSER_MISC;
  4886. } else if ((cur == '<') && (next == '!') &&
  4887. (UPP(2) == 'D') && (UPP(3) == 'O') &&
  4888. (UPP(4) == 'C') && (UPP(5) == 'T') &&
  4889. (UPP(6) == 'Y') && (UPP(7) == 'P') &&
  4890. (UPP(8) == 'E')) {
  4891. if ((!terminate) &&
  4892. (htmlParseLookupSequence(ctxt, '>', 0, 0, 0, 1) < 0))
  4893. goto done;
  4894. #ifdef DEBUG_PUSH
  4895. xmlGenericError(xmlGenericErrorContext,
  4896. "HPP: Parsing internal subset\n");
  4897. #endif
  4898. htmlParseDocTypeDecl(ctxt);
  4899. ctxt->instate = XML_PARSER_PROLOG;
  4900. #ifdef DEBUG_PUSH
  4901. xmlGenericError(xmlGenericErrorContext,
  4902. "HPP: entering PROLOG\n");
  4903. #endif
  4904. } else if ((cur == '<') && (next == '!') &&
  4905. (avail < 9)) {
  4906. goto done;
  4907. } else {
  4908. ctxt->instate = XML_PARSER_START_TAG;
  4909. #ifdef DEBUG_PUSH
  4910. xmlGenericError(xmlGenericErrorContext,
  4911. "HPP: entering START_TAG\n");
  4912. #endif
  4913. }
  4914. break;
  4915. case XML_PARSER_PROLOG:
  4916. SKIP_BLANKS;
  4917. if (in->buf == NULL)
  4918. avail = in->length - (in->cur - in->base);
  4919. else
  4920. avail = in->buf->buffer->use - (in->cur - in->base);
  4921. if (avail < 2)
  4922. goto done;
  4923. cur = in->cur[0];
  4924. next = in->cur[1];
  4925. if ((cur == '<') && (next == '!') &&
  4926. (in->cur[2] == '-') && (in->cur[3] == '-')) {
  4927. if ((!terminate) &&
  4928. (htmlParseLookupSequence(ctxt, '-', '-', '>', 1, 1) < 0))
  4929. goto done;
  4930. #ifdef DEBUG_PUSH
  4931. xmlGenericError(xmlGenericErrorContext,
  4932. "HPP: Parsing Comment\n");
  4933. #endif
  4934. htmlParseComment(ctxt);
  4935. ctxt->instate = XML_PARSER_PROLOG;
  4936. } else if ((cur == '<') && (next == '?')) {
  4937. if ((!terminate) &&
  4938. (htmlParseLookupSequence(ctxt, '>', 0, 0, 0, 1) < 0))
  4939. goto done;
  4940. #ifdef DEBUG_PUSH
  4941. xmlGenericError(xmlGenericErrorContext,
  4942. "HPP: Parsing PI\n");
  4943. #endif
  4944. htmlParsePI(ctxt);
  4945. ctxt->instate = XML_PARSER_PROLOG;
  4946. } else if ((cur == '<') && (next == '!') &&
  4947. (avail < 4)) {
  4948. goto done;
  4949. } else {
  4950. ctxt->instate = XML_PARSER_START_TAG;
  4951. #ifdef DEBUG_PUSH
  4952. xmlGenericError(xmlGenericErrorContext,
  4953. "HPP: entering START_TAG\n");
  4954. #endif
  4955. }
  4956. break;
  4957. case XML_PARSER_EPILOG:
  4958. if (in->buf == NULL)
  4959. avail = in->length - (in->cur - in->base);
  4960. else
  4961. avail = in->buf->buffer->use - (in->cur - in->base);
  4962. if (avail < 1)
  4963. goto done;
  4964. cur = in->cur[0];
  4965. if (IS_BLANK_CH(cur)) {
  4966. htmlParseCharData(ctxt);
  4967. goto done;
  4968. }
  4969. if (avail < 2)
  4970. goto done;
  4971. next = in->cur[1];
  4972. if ((cur == '<') && (next == '!') &&
  4973. (in->cur[2] == '-') && (in->cur[3] == '-')) {
  4974. if ((!terminate) &&
  4975. (htmlParseLookupSequence(ctxt, '-', '-', '>', 1, 1) < 0))
  4976. goto done;
  4977. #ifdef DEBUG_PUSH
  4978. xmlGenericError(xmlGenericErrorContext,
  4979. "HPP: Parsing Comment\n");
  4980. #endif
  4981. htmlParseComment(ctxt);
  4982. ctxt->instate = XML_PARSER_EPILOG;
  4983. } else if ((cur == '<') && (next == '?')) {
  4984. if ((!terminate) &&
  4985. (htmlParseLookupSequence(ctxt, '>', 0, 0, 0, 1) < 0))
  4986. goto done;
  4987. #ifdef DEBUG_PUSH
  4988. xmlGenericError(xmlGenericErrorContext,
  4989. "HPP: Parsing PI\n");
  4990. #endif
  4991. htmlParsePI(ctxt);
  4992. ctxt->instate = XML_PARSER_EPILOG;
  4993. } else if ((cur == '<') && (next == '!') &&
  4994. (avail < 4)) {
  4995. goto done;
  4996. } else {
  4997. ctxt->errNo = XML_ERR_DOCUMENT_END;
  4998. ctxt->wellFormed = 0;
  4999. ctxt->instate = XML_PARSER_EOF;
  5000. #ifdef DEBUG_PUSH
  5001. xmlGenericError(xmlGenericErrorContext,
  5002. "HPP: entering EOF\n");
  5003. #endif
  5004. if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
  5005. ctxt->sax->endDocument(ctxt->userData);
  5006. goto done;
  5007. }
  5008. break;
  5009. case XML_PARSER_START_TAG: {
  5010. const xmlChar *name;
  5011. int failed;
  5012. const htmlElemDesc * info;
  5013. if (avail < 2)
  5014. goto done;
  5015. cur = in->cur[0];
  5016. if (cur != '<') {
  5017. ctxt->instate = XML_PARSER_CONTENT;
  5018. #ifdef DEBUG_PUSH
  5019. xmlGenericError(xmlGenericErrorContext,
  5020. "HPP: entering CONTENT\n");
  5021. #endif
  5022. break;
  5023. }
  5024. if (in->cur[1] == '/') {
  5025. ctxt->instate = XML_PARSER_END_TAG;
  5026. ctxt->checkIndex = 0;
  5027. #ifdef DEBUG_PUSH
  5028. xmlGenericError(xmlGenericErrorContext,
  5029. "HPP: entering END_TAG\n");
  5030. #endif
  5031. break;
  5032. }
  5033. if ((!terminate) &&
  5034. (htmlParseLookupSequence(ctxt, '>', 0, 0, 0, 1) < 0))
  5035. goto done;
  5036. failed = htmlParseStartTag(ctxt);
  5037. name = ctxt->name;
  5038. if ((failed == -1) ||
  5039. (name == NULL)) {
  5040. if (CUR == '>')
  5041. NEXT;
  5042. break;
  5043. }
  5044. /*
  5045. * Lookup the info for that element.
  5046. */
  5047. info = htmlTagLookup(name);
  5048. if (info == NULL) {
  5049. htmlParseErr(ctxt, XML_HTML_UNKNOWN_TAG,
  5050. "Tag %s invalid\n", name, NULL);
  5051. }
  5052. /*
  5053. * Check for an Empty Element labeled the XML/SGML way
  5054. */
  5055. if ((CUR == '/') && (NXT(1) == '>')) {
  5056. SKIP(2);
  5057. if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
  5058. ctxt->sax->endElement(ctxt->userData, name);
  5059. htmlnamePop(ctxt);
  5060. ctxt->instate = XML_PARSER_CONTENT;
  5061. #ifdef DEBUG_PUSH
  5062. xmlGenericError(xmlGenericErrorContext,
  5063. "HPP: entering CONTENT\n");
  5064. #endif
  5065. break;
  5066. }
  5067. if (CUR == '>') {
  5068. NEXT;
  5069. } else {
  5070. htmlParseErr(ctxt, XML_ERR_GT_REQUIRED,
  5071. "Couldn't find end of Start Tag %s\n",
  5072. name, NULL);
  5073. /*
  5074. * end of parsing of this node.
  5075. */
  5076. if (xmlStrEqual(name, ctxt->name)) {
  5077. nodePop(ctxt);
  5078. htmlnamePop(ctxt);
  5079. }
  5080. ctxt->instate = XML_PARSER_CONTENT;
  5081. #ifdef DEBUG_PUSH
  5082. xmlGenericError(xmlGenericErrorContext,
  5083. "HPP: entering CONTENT\n");
  5084. #endif
  5085. break;
  5086. }
  5087. /*
  5088. * Check for an Empty Element from DTD definition
  5089. */
  5090. if ((info != NULL) && (info->empty)) {
  5091. if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
  5092. ctxt->sax->endElement(ctxt->userData, name);
  5093. htmlnamePop(ctxt);
  5094. }
  5095. ctxt->instate = XML_PARSER_CONTENT;
  5096. #ifdef DEBUG_PUSH
  5097. xmlGenericError(xmlGenericErrorContext,
  5098. "HPP: entering CONTENT\n");
  5099. #endif
  5100. break;
  5101. }
  5102. case XML_PARSER_CONTENT: {
  5103. long cons;
  5104. /*
  5105. * Handle preparsed entities and charRef
  5106. */
  5107. if (ctxt->token != 0) {
  5108. xmlChar chr[2] = { 0 , 0 } ;
  5109. chr[0] = (xmlChar) ctxt->token;
  5110. htmlCheckParagraph(ctxt);
  5111. if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
  5112. ctxt->sax->characters(ctxt->userData, chr, 1);
  5113. ctxt->token = 0;
  5114. ctxt->checkIndex = 0;
  5115. }
  5116. if ((avail == 1) && (terminate)) {
  5117. cur = in->cur[0];
  5118. if ((cur != '<') && (cur != '&')) {
  5119. if (ctxt->sax != NULL) {
  5120. if (IS_BLANK_CH(cur)) {
  5121. if (ctxt->sax->ignorableWhitespace != NULL)
  5122. ctxt->sax->ignorableWhitespace(
  5123. ctxt->userData, &cur, 1);
  5124. } else {
  5125. htmlCheckParagraph(ctxt);
  5126. if (ctxt->sax->characters != NULL)
  5127. ctxt->sax->characters(
  5128. ctxt->userData, &cur, 1);
  5129. }
  5130. }
  5131. ctxt->token = 0;
  5132. ctxt->checkIndex = 0;
  5133. in->cur++;
  5134. break;
  5135. }
  5136. }
  5137. if (avail < 2)
  5138. goto done;
  5139. cur = in->cur[0];
  5140. next = in->cur[1];
  5141. cons = ctxt->nbChars;
  5142. if ((xmlStrEqual(ctxt->name, BAD_CAST"script")) ||
  5143. (xmlStrEqual(ctxt->name, BAD_CAST"style"))) {
  5144. /*
  5145. * Handle SCRIPT/STYLE separately
  5146. */
  5147. if (!terminate) {
  5148. int idx;
  5149. xmlChar val;
  5150. idx = htmlParseLookupSequence(ctxt, '<', '/', 0, 0, 1);
  5151. if (idx < 0)
  5152. goto done;
  5153. val = in->cur[idx + 2];
  5154. if (val == 0) /* bad cut of input */
  5155. goto done;
  5156. }
  5157. htmlParseScript(ctxt);
  5158. if ((cur == '<') && (next == '/')) {
  5159. ctxt->instate = XML_PARSER_END_TAG;
  5160. ctxt->checkIndex = 0;
  5161. #ifdef DEBUG_PUSH
  5162. xmlGenericError(xmlGenericErrorContext,
  5163. "HPP: entering END_TAG\n");
  5164. #endif
  5165. break;
  5166. }
  5167. } else {
  5168. /*
  5169. * Sometimes DOCTYPE arrives in the middle of the document
  5170. */
  5171. if ((cur == '<') && (next == '!') &&
  5172. (UPP(2) == 'D') && (UPP(3) == 'O') &&
  5173. (UPP(4) == 'C') && (UPP(5) == 'T') &&
  5174. (UPP(6) == 'Y') && (UPP(7) == 'P') &&
  5175. (UPP(8) == 'E')) {
  5176. if ((!terminate) &&
  5177. (htmlParseLookupSequence(ctxt, '>', 0, 0, 0, 1) < 0))
  5178. goto done;
  5179. htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
  5180. "Misplaced DOCTYPE declaration\n",
  5181. BAD_CAST "DOCTYPE" , NULL);
  5182. htmlParseDocTypeDecl(ctxt);
  5183. } else if ((cur == '<') && (next == '!') &&
  5184. (in->cur[2] == '-') && (in->cur[3] == '-')) {
  5185. if ((!terminate) &&
  5186. (htmlParseLookupSequence(
  5187. ctxt, '-', '-', '>', 1, 1) < 0))
  5188. goto done;
  5189. #ifdef DEBUG_PUSH
  5190. xmlGenericError(xmlGenericErrorContext,
  5191. "HPP: Parsing Comment\n");
  5192. #endif
  5193. htmlParseComment(ctxt);
  5194. ctxt->instate = XML_PARSER_CONTENT;
  5195. } else if ((cur == '<') && (next == '?')) {
  5196. if ((!terminate) &&
  5197. (htmlParseLookupSequence(ctxt, '>', 0, 0, 0, 1) < 0))
  5198. goto done;
  5199. #ifdef DEBUG_PUSH
  5200. xmlGenericError(xmlGenericErrorContext,
  5201. "HPP: Parsing PI\n");
  5202. #endif
  5203. htmlParsePI(ctxt);
  5204. ctxt->instate = XML_PARSER_CONTENT;
  5205. } else if ((cur == '<') && (next == '!') && (avail < 4)) {
  5206. goto done;
  5207. } else if ((cur == '<') && (next == '/')) {
  5208. ctxt->instate = XML_PARSER_END_TAG;
  5209. ctxt->checkIndex = 0;
  5210. #ifdef DEBUG_PUSH
  5211. xmlGenericError(xmlGenericErrorContext,
  5212. "HPP: entering END_TAG\n");
  5213. #endif
  5214. break;
  5215. } else if (cur == '<') {
  5216. ctxt->instate = XML_PARSER_START_TAG;
  5217. ctxt->checkIndex = 0;
  5218. #ifdef DEBUG_PUSH
  5219. xmlGenericError(xmlGenericErrorContext,
  5220. "HPP: entering START_TAG\n");
  5221. #endif
  5222. break;
  5223. } else if (cur == '&') {
  5224. if ((!terminate) &&
  5225. (htmlParseLookupChars(ctxt,
  5226. BAD_CAST "; >/", 4) < 0))
  5227. goto done;
  5228. #ifdef DEBUG_PUSH
  5229. xmlGenericError(xmlGenericErrorContext,
  5230. "HPP: Parsing Reference\n");
  5231. #endif
  5232. /* TODO: check generation of subtrees if noent !!! */
  5233. htmlParseReference(ctxt);
  5234. } else {
  5235. /*
  5236. * check that the text sequence is complete
  5237. * before handing out the data to the parser
  5238. * to avoid problems with erroneous end of
  5239. * data detection.
  5240. */
  5241. if ((!terminate) &&
  5242. (htmlParseLookupChars(ctxt, BAD_CAST "<&", 2) < 0))
  5243. goto done;
  5244. ctxt->checkIndex = 0;
  5245. #ifdef DEBUG_PUSH
  5246. xmlGenericError(xmlGenericErrorContext,
  5247. "HPP: Parsing char data\n");
  5248. #endif
  5249. htmlParseCharData(ctxt);
  5250. }
  5251. }
  5252. if (cons == ctxt->nbChars) {
  5253. if (ctxt->node != NULL) {
  5254. htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
  5255. "detected an error in element content\n",
  5256. NULL, NULL);
  5257. }
  5258. NEXT;
  5259. break;
  5260. }
  5261. break;
  5262. }
  5263. case XML_PARSER_END_TAG:
  5264. if (avail < 2)
  5265. goto done;
  5266. if ((!terminate) &&
  5267. (htmlParseLookupSequence(ctxt, '>', 0, 0, 0, 1) < 0))
  5268. goto done;
  5269. htmlParseEndTag(ctxt);
  5270. if (ctxt->nameNr == 0) {
  5271. ctxt->instate = XML_PARSER_EPILOG;
  5272. } else {
  5273. ctxt->instate = XML_PARSER_CONTENT;
  5274. }
  5275. ctxt->checkIndex = 0;
  5276. #ifdef DEBUG_PUSH
  5277. xmlGenericError(xmlGenericErrorContext,
  5278. "HPP: entering CONTENT\n");
  5279. #endif
  5280. break;
  5281. case XML_PARSER_CDATA_SECTION:
  5282. htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
  5283. "HPP: internal error, state == CDATA\n",
  5284. NULL, NULL);
  5285. ctxt->instate = XML_PARSER_CONTENT;
  5286. ctxt->checkIndex = 0;
  5287. #ifdef DEBUG_PUSH
  5288. xmlGenericError(xmlGenericErrorContext,
  5289. "HPP: entering CONTENT\n");
  5290. #endif
  5291. break;
  5292. case XML_PARSER_DTD:
  5293. htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
  5294. "HPP: internal error, state == DTD\n",
  5295. NULL, NULL);
  5296. ctxt->instate = XML_PARSER_CONTENT;
  5297. ctxt->checkIndex = 0;
  5298. #ifdef DEBUG_PUSH
  5299. xmlGenericError(xmlGenericErrorContext,
  5300. "HPP: entering CONTENT\n");
  5301. #endif
  5302. break;
  5303. case XML_PARSER_COMMENT:
  5304. htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
  5305. "HPP: internal error, state == COMMENT\n",
  5306. NULL, NULL);
  5307. ctxt->instate = XML_PARSER_CONTENT;
  5308. ctxt->checkIndex = 0;
  5309. #ifdef DEBUG_PUSH
  5310. xmlGenericError(xmlGenericErrorContext,
  5311. "HPP: entering CONTENT\n");
  5312. #endif
  5313. break;
  5314. case XML_PARSER_PI:
  5315. htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
  5316. "HPP: internal error, state == PI\n",
  5317. NULL, NULL);
  5318. ctxt->instate = XML_PARSER_CONTENT;
  5319. ctxt->checkIndex = 0;
  5320. #ifdef DEBUG_PUSH
  5321. xmlGenericError(xmlGenericErrorContext,
  5322. "HPP: entering CONTENT\n");
  5323. #endif
  5324. break;
  5325. case XML_PARSER_ENTITY_DECL:
  5326. htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
  5327. "HPP: internal error, state == ENTITY_DECL\n",
  5328. NULL, NULL);
  5329. ctxt->instate = XML_PARSER_CONTENT;
  5330. ctxt->checkIndex = 0;
  5331. #ifdef DEBUG_PUSH
  5332. xmlGenericError(xmlGenericErrorContext,
  5333. "HPP: entering CONTENT\n");
  5334. #endif
  5335. break;
  5336. case XML_PARSER_ENTITY_VALUE:
  5337. htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
  5338. "HPP: internal error, state == ENTITY_VALUE\n",
  5339. NULL, NULL);
  5340. ctxt->instate = XML_PARSER_CONTENT;
  5341. ctxt->checkIndex = 0;
  5342. #ifdef DEBUG_PUSH
  5343. xmlGenericError(xmlGenericErrorContext,
  5344. "HPP: entering DTD\n");
  5345. #endif
  5346. break;
  5347. case XML_PARSER_ATTRIBUTE_VALUE:
  5348. htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
  5349. "HPP: internal error, state == ATTRIBUTE_VALUE\n",
  5350. NULL, NULL);
  5351. ctxt->instate = XML_PARSER_START_TAG;
  5352. ctxt->checkIndex = 0;
  5353. #ifdef DEBUG_PUSH
  5354. xmlGenericError(xmlGenericErrorContext,
  5355. "HPP: entering START_TAG\n");
  5356. #endif
  5357. break;
  5358. case XML_PARSER_SYSTEM_LITERAL:
  5359. htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
  5360. "HPP: internal error, state == XML_PARSER_SYSTEM_LITERAL\n",
  5361. NULL, NULL);
  5362. ctxt->instate = XML_PARSER_CONTENT;
  5363. ctxt->checkIndex = 0;
  5364. #ifdef DEBUG_PUSH
  5365. xmlGenericError(xmlGenericErrorContext,
  5366. "HPP: entering CONTENT\n");
  5367. #endif
  5368. break;
  5369. case XML_PARSER_IGNORE:
  5370. htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
  5371. "HPP: internal error, state == XML_PARSER_IGNORE\n",
  5372. NULL, NULL);
  5373. ctxt->instate = XML_PARSER_CONTENT;
  5374. ctxt->checkIndex = 0;
  5375. #ifdef DEBUG_PUSH
  5376. xmlGenericError(xmlGenericErrorContext,
  5377. "HPP: entering CONTENT\n");
  5378. #endif
  5379. break;
  5380. case XML_PARSER_PUBLIC_LITERAL:
  5381. htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
  5382. "HPP: internal error, state == XML_PARSER_LITERAL\n",
  5383. NULL, NULL);
  5384. ctxt->instate = XML_PARSER_CONTENT;
  5385. ctxt->checkIndex = 0;
  5386. #ifdef DEBUG_PUSH
  5387. xmlGenericError(xmlGenericErrorContext,
  5388. "HPP: entering CONTENT\n");
  5389. #endif
  5390. break;
  5391. }
  5392. }
  5393. done:
  5394. if ((avail == 0) && (terminate)) {
  5395. htmlAutoCloseOnEnd(ctxt);
  5396. if ((ctxt->nameNr == 0) && (ctxt->instate != XML_PARSER_EOF)) {
  5397. /*
  5398. * SAX: end of the document processing.
  5399. */
  5400. ctxt->instate = XML_PARSER_EOF;
  5401. if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
  5402. ctxt->sax->endDocument(ctxt->userData);
  5403. }
  5404. }
  5405. if ((ctxt->myDoc != NULL) &&
  5406. ((terminate) || (ctxt->instate == XML_PARSER_EOF) ||
  5407. (ctxt->instate == XML_PARSER_EPILOG))) {
  5408. xmlDtdPtr dtd;
  5409. dtd = xmlGetIntSubset(ctxt->myDoc);
  5410. if (dtd == NULL)
  5411. ctxt->myDoc->intSubset =
  5412. xmlCreateIntSubset(ctxt->myDoc, BAD_CAST "html",
  5413. BAD_CAST "-//W3C//DTD HTML 4.0 Transitional//EN",
  5414. BAD_CAST "http://www.w3.org/TR/REC-html40/loose.dtd");
  5415. }
  5416. #ifdef DEBUG_PUSH
  5417. xmlGenericError(xmlGenericErrorContext, "HPP: done %d\n", ret);
  5418. #endif
  5419. return(ret);
  5420. }
  5421. /**
  5422. * htmlParseChunk:
  5423. * @ctxt: an HTML parser context
  5424. * @chunk: an char array
  5425. * @size: the size in byte of the chunk
  5426. * @terminate: last chunk indicator
  5427. *
  5428. * Parse a Chunk of memory
  5429. *
  5430. * Returns zero if no error, the xmlParserErrors otherwise.
  5431. */
  5432. int
  5433. htmlParseChunk(htmlParserCtxtPtr ctxt, const char *chunk, int size,
  5434. int terminate) {
  5435. if ((ctxt == NULL) || (ctxt->input == NULL)) {
  5436. htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
  5437. "htmlParseChunk: context error\n", NULL, NULL);
  5438. return(XML_ERR_INTERNAL_ERROR);
  5439. }
  5440. if ((size > 0) && (chunk != NULL) && (ctxt->input != NULL) &&
  5441. (ctxt->input->buf != NULL) && (ctxt->instate != XML_PARSER_EOF)) {
  5442. int base = ctxt->input->base - ctxt->input->buf->buffer->content;
  5443. int cur = ctxt->input->cur - ctxt->input->base;
  5444. int res;
  5445. res = xmlParserInputBufferPush(ctxt->input->buf, size, chunk);
  5446. if (res < 0) {
  5447. ctxt->errNo = XML_PARSER_EOF;
  5448. ctxt->disableSAX = 1;
  5449. return (XML_PARSER_EOF);
  5450. }
  5451. ctxt->input->base = ctxt->input->buf->buffer->content + base;
  5452. ctxt->input->cur = ctxt->input->base + cur;
  5453. ctxt->input->end =
  5454. &ctxt->input->buf->buffer->content[ctxt->input->buf->buffer->use];
  5455. #ifdef DEBUG_PUSH
  5456. xmlGenericError(xmlGenericErrorContext, "HPP: pushed %d\n", size);
  5457. #endif
  5458. #if 0
  5459. if ((terminate) || (ctxt->input->buf->buffer->use > 80))
  5460. htmlParseTryOrFinish(ctxt, terminate);
  5461. #endif
  5462. } else if (ctxt->instate != XML_PARSER_EOF) {
  5463. if ((ctxt->input != NULL) && ctxt->input->buf != NULL) {
  5464. xmlParserInputBufferPtr in = ctxt->input->buf;
  5465. if ((in->encoder != NULL) && (in->buffer != NULL) &&
  5466. (in->raw != NULL)) {
  5467. int nbchars;
  5468. nbchars = xmlCharEncInFunc(in->encoder, in->buffer, in->raw);
  5469. if (nbchars < 0) {
  5470. htmlParseErr(ctxt, XML_ERR_INVALID_ENCODING,
  5471. "encoder error\n", NULL, NULL);
  5472. return(XML_ERR_INVALID_ENCODING);
  5473. }
  5474. }
  5475. }
  5476. }
  5477. htmlParseTryOrFinish(ctxt, terminate);
  5478. if (terminate) {
  5479. if ((ctxt->instate != XML_PARSER_EOF) &&
  5480. (ctxt->instate != XML_PARSER_EPILOG) &&
  5481. (ctxt->instate != XML_PARSER_MISC)) {
  5482. ctxt->errNo = XML_ERR_DOCUMENT_END;
  5483. ctxt->wellFormed = 0;
  5484. }
  5485. if (ctxt->instate != XML_PARSER_EOF) {
  5486. if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
  5487. ctxt->sax->endDocument(ctxt->userData);
  5488. }
  5489. ctxt->instate = XML_PARSER_EOF;
  5490. }
  5491. return((xmlParserErrors) ctxt->errNo);
  5492. }
  5493. /************************************************************************
  5494. * *
  5495. * User entry points *
  5496. * *
  5497. ************************************************************************/
  5498. /**
  5499. * htmlCreatePushParserCtxt:
  5500. * @sax: a SAX handler
  5501. * @user_data: The user data returned on SAX callbacks
  5502. * @chunk: a pointer to an array of chars
  5503. * @size: number of chars in the array
  5504. * @filename: an optional file name or URI
  5505. * @enc: an optional encoding
  5506. *
  5507. * Create a parser context for using the HTML parser in push mode
  5508. * The value of @filename is used for fetching external entities
  5509. * and error/warning reports.
  5510. *
  5511. * Returns the new parser context or NULL
  5512. */
  5513. htmlParserCtxtPtr
  5514. htmlCreatePushParserCtxt(htmlSAXHandlerPtr sax, void *user_data,
  5515. const char *chunk, int size, const char *filename,
  5516. xmlCharEncoding enc) {
  5517. htmlParserCtxtPtr ctxt;
  5518. htmlParserInputPtr inputStream;
  5519. xmlParserInputBufferPtr buf;
  5520. xmlInitParser();
  5521. buf = xmlAllocParserInputBuffer(enc);
  5522. if (buf == NULL) return(NULL);
  5523. ctxt = htmlNewParserCtxt();
  5524. if (ctxt == NULL) {
  5525. xmlFreeParserInputBuffer(buf);
  5526. return(NULL);
  5527. }
  5528. if(enc==XML_CHAR_ENCODING_UTF8 || buf->encoder)
  5529. ctxt->charset=XML_CHAR_ENCODING_UTF8;
  5530. if (sax != NULL) {
  5531. if (ctxt->sax != (xmlSAXHandlerPtr) &htmlDefaultSAXHandler)
  5532. xmlFree(ctxt->sax);
  5533. ctxt->sax = (htmlSAXHandlerPtr) xmlMalloc(sizeof(htmlSAXHandler));
  5534. if (ctxt->sax == NULL) {
  5535. xmlFree(buf);
  5536. xmlFree(ctxt);
  5537. return(NULL);
  5538. }
  5539. memcpy(ctxt->sax, sax, sizeof(htmlSAXHandler));
  5540. if (user_data != NULL)
  5541. ctxt->userData = user_data;
  5542. }
  5543. if (filename == NULL) {
  5544. ctxt->directory = NULL;
  5545. } else {
  5546. ctxt->directory = xmlParserGetDirectory(filename);
  5547. }
  5548. inputStream = htmlNewInputStream(ctxt);
  5549. if (inputStream == NULL) {
  5550. xmlFreeParserCtxt(ctxt);
  5551. xmlFree(buf);
  5552. return(NULL);
  5553. }
  5554. if (filename == NULL)
  5555. inputStream->filename = NULL;
  5556. else
  5557. inputStream->filename = (char *)
  5558. xmlCanonicPath((const xmlChar *) filename);
  5559. inputStream->buf = buf;
  5560. inputStream->base = inputStream->buf->buffer->content;
  5561. inputStream->cur = inputStream->buf->buffer->content;
  5562. inputStream->end =
  5563. &inputStream->buf->buffer->content[inputStream->buf->buffer->use];
  5564. inputPush(ctxt, inputStream);
  5565. if ((size > 0) && (chunk != NULL) && (ctxt->input != NULL) &&
  5566. (ctxt->input->buf != NULL)) {
  5567. int base = ctxt->input->base - ctxt->input->buf->buffer->content;
  5568. int cur = ctxt->input->cur - ctxt->input->base;
  5569. xmlParserInputBufferPush(ctxt->input->buf, size, chunk);
  5570. ctxt->input->base = ctxt->input->buf->buffer->content + base;
  5571. ctxt->input->cur = ctxt->input->base + cur;
  5572. ctxt->input->end =
  5573. &ctxt->input->buf->buffer->content[ctxt->input->buf->buffer->use];
  5574. #ifdef DEBUG_PUSH
  5575. xmlGenericError(xmlGenericErrorContext, "HPP: pushed %d\n", size);
  5576. #endif
  5577. }
  5578. ctxt->progressive = 1;
  5579. return(ctxt);
  5580. }
  5581. #endif /* LIBXML_PUSH_ENABLED */
  5582. /**
  5583. * htmlSAXParseDoc:
  5584. * @cur: a pointer to an array of xmlChar
  5585. * @encoding: a free form C string describing the HTML document encoding, or NULL
  5586. * @sax: the SAX handler block
  5587. * @userData: if using SAX, this pointer will be provided on callbacks.
  5588. *
  5589. * Parse an HTML in-memory document. If sax is not NULL, use the SAX callbacks
  5590. * to handle parse events. If sax is NULL, fallback to the default DOM
  5591. * behavior and return a tree.
  5592. *
  5593. * Returns the resulting document tree unless SAX is NULL or the document is
  5594. * not well formed.
  5595. */
  5596. htmlDocPtr
  5597. htmlSAXParseDoc(xmlChar *cur, const char *encoding, htmlSAXHandlerPtr sax, void *userData) {
  5598. htmlDocPtr ret;
  5599. htmlParserCtxtPtr ctxt;
  5600. xmlInitParser();
  5601. if (cur == NULL) return(NULL);
  5602. ctxt = htmlCreateDocParserCtxt(cur, encoding);
  5603. if (ctxt == NULL) return(NULL);
  5604. if (sax != NULL) {
  5605. if (ctxt->sax != NULL) xmlFree (ctxt->sax);
  5606. ctxt->sax = sax;
  5607. ctxt->userData = userData;
  5608. }
  5609. htmlParseDocument(ctxt);
  5610. ret = ctxt->myDoc;
  5611. if (sax != NULL) {
  5612. ctxt->sax = NULL;
  5613. ctxt->userData = NULL;
  5614. }
  5615. htmlFreeParserCtxt(ctxt);
  5616. return(ret);
  5617. }
  5618. /**
  5619. * htmlParseDoc:
  5620. * @cur: a pointer to an array of xmlChar
  5621. * @encoding: a free form C string describing the HTML document encoding, or NULL
  5622. *
  5623. * parse an HTML in-memory document and build a tree.
  5624. *
  5625. * Returns the resulting document tree
  5626. */
  5627. htmlDocPtr
  5628. htmlParseDoc(xmlChar *cur, const char *encoding) {
  5629. return(htmlSAXParseDoc(cur, encoding, NULL, NULL));
  5630. }
  5631. /**
  5632. * htmlCreateFileParserCtxt:
  5633. * @filename: the filename
  5634. * @encoding: a free form C string describing the HTML document encoding, or NULL
  5635. *
  5636. * Create a parser context for a file content.
  5637. * Automatic support for ZLIB/Compress compressed document is provided
  5638. * by default if found at compile-time.
  5639. *
  5640. * Returns the new parser context or NULL
  5641. */
  5642. htmlParserCtxtPtr
  5643. htmlCreateFileParserCtxt(const char *filename, const char *encoding)
  5644. {
  5645. htmlParserCtxtPtr ctxt;
  5646. htmlParserInputPtr inputStream;
  5647. char *canonicFilename;
  5648. /* htmlCharEncoding enc; */
  5649. xmlChar *content, *content_line = (xmlChar *) "charset=";
  5650. if (filename == NULL)
  5651. return(NULL);
  5652. ctxt = htmlNewParserCtxt();
  5653. if (ctxt == NULL) {
  5654. return(NULL);
  5655. }
  5656. canonicFilename = (char *) xmlCanonicPath((const xmlChar *) filename);
  5657. if (canonicFilename == NULL) {
  5658. #ifdef LIBXML_SAX1_ENABLED
  5659. if (xmlDefaultSAXHandler.error != NULL) {
  5660. xmlDefaultSAXHandler.error(NULL, "out of memory\n");
  5661. }
  5662. #endif
  5663. xmlFreeParserCtxt(ctxt);
  5664. return(NULL);
  5665. }
  5666. inputStream = xmlLoadExternalEntity(canonicFilename, NULL, ctxt);
  5667. xmlFree(canonicFilename);
  5668. if (inputStream == NULL) {
  5669. xmlFreeParserCtxt(ctxt);
  5670. return(NULL);
  5671. }
  5672. inputPush(ctxt, inputStream);
  5673. /* set encoding */
  5674. if (encoding) {
  5675. content = xmlMallocAtomic (xmlStrlen(content_line) + strlen(encoding) + 1);
  5676. if (content) {
  5677. strcpy ((char *)content, (char *)content_line);
  5678. strcat ((char *)content, (char *)encoding);
  5679. htmlCheckEncoding (ctxt, content);
  5680. xmlFree (content);
  5681. }
  5682. }
  5683. return(ctxt);
  5684. }
  5685. /**
  5686. * htmlSAXParseFile:
  5687. * @filename: the filename
  5688. * @encoding: a free form C string describing the HTML document encoding, or NULL
  5689. * @sax: the SAX handler block
  5690. * @userData: if using SAX, this pointer will be provided on callbacks.
  5691. *
  5692. * parse an HTML file and build a tree. Automatic support for ZLIB/Compress
  5693. * compressed document is provided by default if found at compile-time.
  5694. * It use the given SAX function block to handle the parsing callback.
  5695. * If sax is NULL, fallback to the default DOM tree building routines.
  5696. *
  5697. * Returns the resulting document tree unless SAX is NULL or the document is
  5698. * not well formed.
  5699. */
  5700. htmlDocPtr
  5701. htmlSAXParseFile(const char *filename, const char *encoding, htmlSAXHandlerPtr sax,
  5702. void *userData) {
  5703. htmlDocPtr ret;
  5704. htmlParserCtxtPtr ctxt;
  5705. htmlSAXHandlerPtr oldsax = NULL;
  5706. xmlInitParser();
  5707. ctxt = htmlCreateFileParserCtxt(filename, encoding);
  5708. if (ctxt == NULL) return(NULL);
  5709. if (sax != NULL) {
  5710. oldsax = ctxt->sax;
  5711. ctxt->sax = sax;
  5712. ctxt->userData = userData;
  5713. }
  5714. htmlParseDocument(ctxt);
  5715. ret = ctxt->myDoc;
  5716. if (sax != NULL) {
  5717. ctxt->sax = oldsax;
  5718. ctxt->userData = NULL;
  5719. }
  5720. htmlFreeParserCtxt(ctxt);
  5721. return(ret);
  5722. }
  5723. /**
  5724. * htmlParseFile:
  5725. * @filename: the filename
  5726. * @encoding: a free form C string describing the HTML document encoding, or NULL
  5727. *
  5728. * parse an HTML file and build a tree. Automatic support for ZLIB/Compress
  5729. * compressed document is provided by default if found at compile-time.
  5730. *
  5731. * Returns the resulting document tree
  5732. */
  5733. htmlDocPtr
  5734. htmlParseFile(const char *filename, const char *encoding) {
  5735. return(htmlSAXParseFile(filename, encoding, NULL, NULL));
  5736. }
  5737. /**
  5738. * htmlHandleOmittedElem:
  5739. * @val: int 0 or 1
  5740. *
  5741. * Set and return the previous value for handling HTML omitted tags.
  5742. *
  5743. * Returns the last value for 0 for no handling, 1 for auto insertion.
  5744. */
  5745. int
  5746. htmlHandleOmittedElem(int val) {
  5747. int old = htmlOmittedDefaultValue;
  5748. htmlOmittedDefaultValue = val;
  5749. return(old);
  5750. }
  5751. /**
  5752. * htmlElementAllowedHere:
  5753. * @parent: HTML parent element
  5754. * @elt: HTML element
  5755. *
  5756. * Checks whether an HTML element may be a direct child of a parent element.
  5757. * Note - doesn't check for deprecated elements
  5758. *
  5759. * Returns 1 if allowed; 0 otherwise.
  5760. */
  5761. int
  5762. htmlElementAllowedHere(const htmlElemDesc* parent, const xmlChar* elt) {
  5763. const char** p ;
  5764. if ( ! elt || ! parent || ! parent->subelts )
  5765. return 0 ;
  5766. for ( p = parent->subelts; *p; ++p )
  5767. if ( !xmlStrcmp((const xmlChar *)*p, elt) )
  5768. return 1 ;
  5769. return 0 ;
  5770. }
  5771. /**
  5772. * htmlElementStatusHere:
  5773. * @parent: HTML parent element
  5774. * @elt: HTML element
  5775. *
  5776. * Checks whether an HTML element may be a direct child of a parent element.
  5777. * and if so whether it is valid or deprecated.
  5778. *
  5779. * Returns one of HTML_VALID, HTML_DEPRECATED, HTML_INVALID
  5780. */
  5781. htmlStatus
  5782. htmlElementStatusHere(const htmlElemDesc* parent, const htmlElemDesc* elt) {
  5783. if ( ! parent || ! elt )
  5784. return HTML_INVALID ;
  5785. if ( ! htmlElementAllowedHere(parent, (const xmlChar*) elt->name ) )
  5786. return HTML_INVALID ;
  5787. return ( elt->dtd == 0 ) ? HTML_VALID : HTML_DEPRECATED ;
  5788. }
  5789. /**
  5790. * htmlAttrAllowed:
  5791. * @elt: HTML element
  5792. * @attr: HTML attribute
  5793. * @legacy: whether to allow deprecated attributes
  5794. *
  5795. * Checks whether an attribute is valid for an element
  5796. * Has full knowledge of Required and Deprecated attributes
  5797. *
  5798. * Returns one of HTML_REQUIRED, HTML_VALID, HTML_DEPRECATED, HTML_INVALID
  5799. */
  5800. htmlStatus
  5801. htmlAttrAllowed(const htmlElemDesc* elt, const xmlChar* attr, int legacy) {
  5802. const char** p ;
  5803. if ( !elt || ! attr )
  5804. return HTML_INVALID ;
  5805. if ( elt->attrs_req )
  5806. for ( p = elt->attrs_req; *p; ++p)
  5807. if ( !xmlStrcmp((const xmlChar*)*p, attr) )
  5808. return HTML_REQUIRED ;
  5809. if ( elt->attrs_opt )
  5810. for ( p = elt->attrs_opt; *p; ++p)
  5811. if ( !xmlStrcmp((const xmlChar*)*p, attr) )
  5812. return HTML_VALID ;
  5813. if ( legacy && elt->attrs_depr )
  5814. for ( p = elt->attrs_depr; *p; ++p)
  5815. if ( !xmlStrcmp((const xmlChar*)*p, attr) )
  5816. return HTML_DEPRECATED ;
  5817. return HTML_INVALID ;
  5818. }
  5819. /**
  5820. * htmlNodeStatus:
  5821. * @node: an htmlNodePtr in a tree
  5822. * @legacy: whether to allow deprecated elements (YES is faster here
  5823. * for Element nodes)
  5824. *
  5825. * Checks whether the tree node is valid. Experimental (the author
  5826. * only uses the HTML enhancements in a SAX parser)
  5827. *
  5828. * Return: for Element nodes, a return from htmlElementAllowedHere (if
  5829. * legacy allowed) or htmlElementStatusHere (otherwise).
  5830. * for Attribute nodes, a return from htmlAttrAllowed
  5831. * for other nodes, HTML_NA (no checks performed)
  5832. */
  5833. htmlStatus
  5834. htmlNodeStatus(const htmlNodePtr node, int legacy) {
  5835. if ( ! node )
  5836. return HTML_INVALID ;
  5837. switch ( node->type ) {
  5838. case XML_ELEMENT_NODE:
  5839. return legacy
  5840. ? ( htmlElementAllowedHere (
  5841. htmlTagLookup(node->parent->name) , node->name
  5842. ) ? HTML_VALID : HTML_INVALID )
  5843. : htmlElementStatusHere(
  5844. htmlTagLookup(node->parent->name) ,
  5845. htmlTagLookup(node->name) )
  5846. ;
  5847. case XML_ATTRIBUTE_NODE:
  5848. return htmlAttrAllowed(
  5849. htmlTagLookup(node->parent->name) , node->name, legacy) ;
  5850. default: return HTML_NA ;
  5851. }
  5852. }
  5853. /************************************************************************
  5854. * *
  5855. * New set (2.6.0) of simpler and more flexible APIs *
  5856. * *
  5857. ************************************************************************/
  5858. /**
  5859. * DICT_FREE:
  5860. * @str: a string
  5861. *
  5862. * Free a string if it is not owned by the "dict" dictionnary in the
  5863. * current scope
  5864. */
  5865. #define DICT_FREE(str) \
  5866. if ((str) && ((!dict) || \
  5867. (xmlDictOwns(dict, (const xmlChar *)(str)) == 0))) \
  5868. xmlFree((char *)(str));
  5869. /**
  5870. * htmlCtxtReset:
  5871. * @ctxt: an HTML parser context
  5872. *
  5873. * Reset a parser context
  5874. */
  5875. void
  5876. htmlCtxtReset(htmlParserCtxtPtr ctxt)
  5877. {
  5878. xmlParserInputPtr input;
  5879. xmlDictPtr dict;
  5880. if (ctxt == NULL)
  5881. return;
  5882. xmlInitParser();
  5883. dict = ctxt->dict;
  5884. while ((input = inputPop(ctxt)) != NULL) { /* Non consuming */
  5885. xmlFreeInputStream(input);
  5886. }
  5887. ctxt->inputNr = 0;
  5888. ctxt->input = NULL;
  5889. ctxt->spaceNr = 0;
  5890. if (ctxt->spaceTab != NULL) {
  5891. ctxt->spaceTab[0] = -1;
  5892. ctxt->space = &ctxt->spaceTab[0];
  5893. } else {
  5894. ctxt->space = NULL;
  5895. }
  5896. ctxt->nodeNr = 0;
  5897. ctxt->node = NULL;
  5898. ctxt->nameNr = 0;
  5899. ctxt->name = NULL;
  5900. DICT_FREE(ctxt->version);
  5901. ctxt->version = NULL;
  5902. DICT_FREE(ctxt->encoding);
  5903. ctxt->encoding = NULL;
  5904. DICT_FREE(ctxt->directory);
  5905. ctxt->directory = NULL;
  5906. DICT_FREE(ctxt->extSubURI);
  5907. ctxt->extSubURI = NULL;
  5908. DICT_FREE(ctxt->extSubSystem);
  5909. ctxt->extSubSystem = NULL;
  5910. if (ctxt->myDoc != NULL)
  5911. xmlFreeDoc(ctxt->myDoc);
  5912. ctxt->myDoc = NULL;
  5913. ctxt->standalone = -1;
  5914. ctxt->hasExternalSubset = 0;
  5915. ctxt->hasPErefs = 0;
  5916. ctxt->html = 1;
  5917. ctxt->external = 0;
  5918. ctxt->instate = XML_PARSER_START;
  5919. ctxt->token = 0;
  5920. ctxt->wellFormed = 1;
  5921. ctxt->nsWellFormed = 1;
  5922. ctxt->valid = 1;
  5923. ctxt->vctxt.userData = ctxt;
  5924. ctxt->vctxt.error = xmlParserValidityError;
  5925. ctxt->vctxt.warning = xmlParserValidityWarning;
  5926. ctxt->record_info = 0;
  5927. ctxt->nbChars = 0;
  5928. ctxt->checkIndex = 0;
  5929. ctxt->inSubset = 0;
  5930. ctxt->errNo = XML_ERR_OK;
  5931. ctxt->depth = 0;
  5932. ctxt->charset = XML_CHAR_ENCODING_NONE;
  5933. ctxt->catalogs = NULL;
  5934. xmlInitNodeInfoSeq(&ctxt->node_seq);
  5935. if (ctxt->attsDefault != NULL) {
  5936. xmlHashFree(ctxt->attsDefault, (xmlHashDeallocator) xmlFree);
  5937. ctxt->attsDefault = NULL;
  5938. }
  5939. if (ctxt->attsSpecial != NULL) {
  5940. xmlHashFree(ctxt->attsSpecial, NULL);
  5941. ctxt->attsSpecial = NULL;
  5942. }
  5943. }
  5944. /**
  5945. * htmlCtxtUseOptions:
  5946. * @ctxt: an HTML parser context
  5947. * @options: a combination of htmlParserOption(s)
  5948. *
  5949. * Applies the options to the parser context
  5950. *
  5951. * Returns 0 in case of success, the set of unknown or unimplemented options
  5952. * in case of error.
  5953. */
  5954. int
  5955. htmlCtxtUseOptions(htmlParserCtxtPtr ctxt, int options)
  5956. {
  5957. if (ctxt == NULL)
  5958. return(-1);
  5959. if (options & HTML_PARSE_NOWARNING) {
  5960. ctxt->sax->warning = NULL;
  5961. ctxt->vctxt.warning = NULL;
  5962. options -= XML_PARSE_NOWARNING;
  5963. ctxt->options |= XML_PARSE_NOWARNING;
  5964. }
  5965. if (options & HTML_PARSE_NOERROR) {
  5966. ctxt->sax->error = NULL;
  5967. ctxt->vctxt.error = NULL;
  5968. ctxt->sax->fatalError = NULL;
  5969. options -= XML_PARSE_NOERROR;
  5970. ctxt->options |= XML_PARSE_NOERROR;
  5971. }
  5972. if (options & HTML_PARSE_PEDANTIC) {
  5973. ctxt->pedantic = 1;
  5974. options -= XML_PARSE_PEDANTIC;
  5975. ctxt->options |= XML_PARSE_PEDANTIC;
  5976. } else
  5977. ctxt->pedantic = 0;
  5978. if (options & XML_PARSE_NOBLANKS) {
  5979. ctxt->keepBlanks = 0;
  5980. ctxt->sax->ignorableWhitespace = xmlSAX2IgnorableWhitespace;
  5981. options -= XML_PARSE_NOBLANKS;
  5982. ctxt->options |= XML_PARSE_NOBLANKS;
  5983. } else
  5984. ctxt->keepBlanks = 1;
  5985. if (options & HTML_PARSE_RECOVER) {
  5986. ctxt->recovery = 1;
  5987. options -= HTML_PARSE_RECOVER;
  5988. } else
  5989. ctxt->recovery = 0;
  5990. if (options & HTML_PARSE_COMPACT) {
  5991. ctxt->options |= HTML_PARSE_COMPACT;
  5992. options -= HTML_PARSE_COMPACT;
  5993. }
  5994. if (options & XML_PARSE_HUGE) {
  5995. ctxt->options |= XML_PARSE_HUGE;
  5996. options -= XML_PARSE_HUGE;
  5997. }
  5998. ctxt->dictNames = 0;
  5999. return (options);
  6000. }
  6001. /**
  6002. * htmlDoRead:
  6003. * @ctxt: an HTML parser context
  6004. * @URL: the base URL to use for the document
  6005. * @encoding: the document encoding, or NULL
  6006. * @options: a combination of htmlParserOption(s)
  6007. * @reuse: keep the context for reuse
  6008. *
  6009. * Common front-end for the htmlRead functions
  6010. *
  6011. * Returns the resulting document tree or NULL
  6012. */
  6013. static htmlDocPtr
  6014. htmlDoRead(htmlParserCtxtPtr ctxt, const char *URL, const char *encoding,
  6015. int options, int reuse)
  6016. {
  6017. htmlDocPtr ret;
  6018. htmlCtxtUseOptions(ctxt, options);
  6019. ctxt->html = 1;
  6020. if (encoding != NULL) {
  6021. xmlCharEncodingHandlerPtr hdlr;
  6022. hdlr = xmlFindCharEncodingHandler(encoding);
  6023. if (hdlr != NULL) {
  6024. xmlSwitchToEncoding(ctxt, hdlr);
  6025. if (ctxt->input->encoding != NULL)
  6026. xmlFree((xmlChar *) ctxt->input->encoding);
  6027. ctxt->input->encoding = xmlStrdup((xmlChar *)encoding);
  6028. }
  6029. }
  6030. if ((URL != NULL) && (ctxt->input != NULL) &&
  6031. (ctxt->input->filename == NULL))
  6032. ctxt->input->filename = (char *) xmlStrdup((const xmlChar *) URL);
  6033. htmlParseDocument(ctxt);
  6034. ret = ctxt->myDoc;
  6035. ctxt->myDoc = NULL;
  6036. if (!reuse) {
  6037. if ((ctxt->dictNames) &&
  6038. (ret != NULL) &&
  6039. (ret->dict == ctxt->dict))
  6040. ctxt->dict = NULL;
  6041. xmlFreeParserCtxt(ctxt);
  6042. }
  6043. return (ret);
  6044. }
  6045. /**
  6046. * htmlReadDoc:
  6047. * @cur: a pointer to a zero terminated string
  6048. * @URL: the base URL to use for the document
  6049. * @encoding: the document encoding, or NULL
  6050. * @options: a combination of htmlParserOption(s)
  6051. *
  6052. * parse an XML in-memory document and build a tree.
  6053. *
  6054. * Returns the resulting document tree
  6055. */
  6056. htmlDocPtr
  6057. htmlReadDoc(const xmlChar * cur, const char *URL, const char *encoding, int options)
  6058. {
  6059. htmlParserCtxtPtr ctxt;
  6060. if (cur == NULL)
  6061. return (NULL);
  6062. xmlInitParser();
  6063. ctxt = htmlCreateDocParserCtxt(cur, NULL);
  6064. if (ctxt == NULL)
  6065. return (NULL);
  6066. return (htmlDoRead(ctxt, URL, encoding, options, 0));
  6067. }
  6068. /**
  6069. * htmlReadFile:
  6070. * @filename: a file or URL
  6071. * @encoding: the document encoding, or NULL
  6072. * @options: a combination of htmlParserOption(s)
  6073. *
  6074. * parse an XML file from the filesystem or the network.
  6075. *
  6076. * Returns the resulting document tree
  6077. */
  6078. htmlDocPtr
  6079. htmlReadFile(const char *filename, const char *encoding, int options)
  6080. {
  6081. htmlParserCtxtPtr ctxt;
  6082. xmlInitParser();
  6083. ctxt = htmlCreateFileParserCtxt(filename, encoding);
  6084. if (ctxt == NULL)
  6085. return (NULL);
  6086. return (htmlDoRead(ctxt, NULL, NULL, options, 0));
  6087. }
  6088. /**
  6089. * htmlReadMemory:
  6090. * @buffer: a pointer to a char array
  6091. * @size: the size of the array
  6092. * @URL: the base URL to use for the document
  6093. * @encoding: the document encoding, or NULL
  6094. * @options: a combination of htmlParserOption(s)
  6095. *
  6096. * parse an XML in-memory document and build a tree.
  6097. *
  6098. * Returns the resulting document tree
  6099. */
  6100. htmlDocPtr
  6101. htmlReadMemory(const char *buffer, int size, const char *URL, const char *encoding, int options)
  6102. {
  6103. htmlParserCtxtPtr ctxt;
  6104. xmlInitParser();
  6105. ctxt = xmlCreateMemoryParserCtxt(buffer, size);
  6106. if (ctxt == NULL)
  6107. return (NULL);
  6108. htmlDefaultSAXHandlerInit();
  6109. if (ctxt->sax != NULL)
  6110. memcpy(ctxt->sax, &htmlDefaultSAXHandler, sizeof(xmlSAXHandlerV1));
  6111. return (htmlDoRead(ctxt, URL, encoding, options, 0));
  6112. }
  6113. /**
  6114. * htmlReadFd:
  6115. * @fd: an open file descriptor
  6116. * @URL: the base URL to use for the document
  6117. * @encoding: the document encoding, or NULL
  6118. * @options: a combination of htmlParserOption(s)
  6119. *
  6120. * parse an XML from a file descriptor and build a tree.
  6121. *
  6122. * Returns the resulting document tree
  6123. */
  6124. htmlDocPtr
  6125. htmlReadFd(int fd, const char *URL, const char *encoding, int options)
  6126. {
  6127. htmlParserCtxtPtr ctxt;
  6128. xmlParserInputBufferPtr input;
  6129. xmlParserInputPtr stream;
  6130. if (fd < 0)
  6131. return (NULL);
  6132. xmlInitParser();
  6133. input = xmlParserInputBufferCreateFd(fd, XML_CHAR_ENCODING_NONE);
  6134. if (input == NULL)
  6135. return (NULL);
  6136. ctxt = xmlNewParserCtxt();
  6137. if (ctxt == NULL) {
  6138. xmlFreeParserInputBuffer(input);
  6139. return (NULL);
  6140. }
  6141. stream = xmlNewIOInputStream(ctxt, input, XML_CHAR_ENCODING_NONE);
  6142. if (stream == NULL) {
  6143. xmlFreeParserInputBuffer(input);
  6144. xmlFreeParserCtxt(ctxt);
  6145. return (NULL);
  6146. }
  6147. inputPush(ctxt, stream);
  6148. return (htmlDoRead(ctxt, URL, encoding, options, 0));
  6149. }
  6150. /**
  6151. * htmlReadIO:
  6152. * @ioread: an I/O read function
  6153. * @ioclose: an I/O close function
  6154. * @ioctx: an I/O handler
  6155. * @URL: the base URL to use for the document
  6156. * @encoding: the document encoding, or NULL
  6157. * @options: a combination of htmlParserOption(s)
  6158. *
  6159. * parse an HTML document from I/O functions and source and build a tree.
  6160. *
  6161. * Returns the resulting document tree
  6162. */
  6163. htmlDocPtr
  6164. htmlReadIO(xmlInputReadCallback ioread, xmlInputCloseCallback ioclose,
  6165. void *ioctx, const char *URL, const char *encoding, int options)
  6166. {
  6167. htmlParserCtxtPtr ctxt;
  6168. xmlParserInputBufferPtr input;
  6169. xmlParserInputPtr stream;
  6170. if (ioread == NULL)
  6171. return (NULL);
  6172. xmlInitParser();
  6173. input = xmlParserInputBufferCreateIO(ioread, ioclose, ioctx,
  6174. XML_CHAR_ENCODING_NONE);
  6175. if (input == NULL)
  6176. return (NULL);
  6177. ctxt = htmlNewParserCtxt();
  6178. if (ctxt == NULL) {
  6179. xmlFreeParserInputBuffer(input);
  6180. return (NULL);
  6181. }
  6182. stream = xmlNewIOInputStream(ctxt, input, XML_CHAR_ENCODING_NONE);
  6183. if (stream == NULL) {
  6184. xmlFreeParserInputBuffer(input);
  6185. xmlFreeParserCtxt(ctxt);
  6186. return (NULL);
  6187. }
  6188. inputPush(ctxt, stream);
  6189. return (htmlDoRead(ctxt, URL, encoding, options, 0));
  6190. }
  6191. /**
  6192. * htmlCtxtReadDoc:
  6193. * @ctxt: an HTML parser context
  6194. * @cur: a pointer to a zero terminated string
  6195. * @URL: the base URL to use for the document
  6196. * @encoding: the document encoding, or NULL
  6197. * @options: a combination of htmlParserOption(s)
  6198. *
  6199. * parse an XML in-memory document and build a tree.
  6200. * This reuses the existing @ctxt parser context
  6201. *
  6202. * Returns the resulting document tree
  6203. */
  6204. htmlDocPtr
  6205. htmlCtxtReadDoc(htmlParserCtxtPtr ctxt, const xmlChar * cur,
  6206. const char *URL, const char *encoding, int options)
  6207. {
  6208. xmlParserInputPtr stream;
  6209. if (cur == NULL)
  6210. return (NULL);
  6211. if (ctxt == NULL)
  6212. return (NULL);
  6213. htmlCtxtReset(ctxt);
  6214. stream = xmlNewStringInputStream(ctxt, cur);
  6215. if (stream == NULL) {
  6216. return (NULL);
  6217. }
  6218. inputPush(ctxt, stream);
  6219. return (htmlDoRead(ctxt, URL, encoding, options, 1));
  6220. }
  6221. /**
  6222. * htmlCtxtReadFile:
  6223. * @ctxt: an HTML parser context
  6224. * @filename: a file or URL
  6225. * @encoding: the document encoding, or NULL
  6226. * @options: a combination of htmlParserOption(s)
  6227. *
  6228. * parse an XML file from the filesystem or the network.
  6229. * This reuses the existing @ctxt parser context
  6230. *
  6231. * Returns the resulting document tree
  6232. */
  6233. htmlDocPtr
  6234. htmlCtxtReadFile(htmlParserCtxtPtr ctxt, const char *filename,
  6235. const char *encoding, int options)
  6236. {
  6237. xmlParserInputPtr stream;
  6238. if (filename == NULL)
  6239. return (NULL);
  6240. if (ctxt == NULL)
  6241. return (NULL);
  6242. htmlCtxtReset(ctxt);
  6243. stream = xmlLoadExternalEntity(filename, NULL, ctxt);
  6244. if (stream == NULL) {
  6245. return (NULL);
  6246. }
  6247. inputPush(ctxt, stream);
  6248. return (htmlDoRead(ctxt, NULL, encoding, options, 1));
  6249. }
  6250. /**
  6251. * htmlCtxtReadMemory:
  6252. * @ctxt: an HTML parser context
  6253. * @buffer: a pointer to a char array
  6254. * @size: the size of the array
  6255. * @URL: the base URL to use for the document
  6256. * @encoding: the document encoding, or NULL
  6257. * @options: a combination of htmlParserOption(s)
  6258. *
  6259. * parse an XML in-memory document and build a tree.
  6260. * This reuses the existing @ctxt parser context
  6261. *
  6262. * Returns the resulting document tree
  6263. */
  6264. htmlDocPtr
  6265. htmlCtxtReadMemory(htmlParserCtxtPtr ctxt, const char *buffer, int size,
  6266. const char *URL, const char *encoding, int options)
  6267. {
  6268. xmlParserInputBufferPtr input;
  6269. xmlParserInputPtr stream;
  6270. if (ctxt == NULL)
  6271. return (NULL);
  6272. if (buffer == NULL)
  6273. return (NULL);
  6274. htmlCtxtReset(ctxt);
  6275. input = xmlParserInputBufferCreateMem(buffer, size, XML_CHAR_ENCODING_NONE);
  6276. if (input == NULL) {
  6277. return(NULL);
  6278. }
  6279. stream = xmlNewIOInputStream(ctxt, input, XML_CHAR_ENCODING_NONE);
  6280. if (stream == NULL) {
  6281. xmlFreeParserInputBuffer(input);
  6282. return(NULL);
  6283. }
  6284. inputPush(ctxt, stream);
  6285. return (htmlDoRead(ctxt, URL, encoding, options, 1));
  6286. }
  6287. /**
  6288. * htmlCtxtReadFd:
  6289. * @ctxt: an HTML parser context
  6290. * @fd: an open file descriptor
  6291. * @URL: the base URL to use for the document
  6292. * @encoding: the document encoding, or NULL
  6293. * @options: a combination of htmlParserOption(s)
  6294. *
  6295. * parse an XML from a file descriptor and build a tree.
  6296. * This reuses the existing @ctxt parser context
  6297. *
  6298. * Returns the resulting document tree
  6299. */
  6300. htmlDocPtr
  6301. htmlCtxtReadFd(htmlParserCtxtPtr ctxt, int fd,
  6302. const char *URL, const char *encoding, int options)
  6303. {
  6304. xmlParserInputBufferPtr input;
  6305. xmlParserInputPtr stream;
  6306. if (fd < 0)
  6307. return (NULL);
  6308. if (ctxt == NULL)
  6309. return (NULL);
  6310. htmlCtxtReset(ctxt);
  6311. input = xmlParserInputBufferCreateFd(fd, XML_CHAR_ENCODING_NONE);
  6312. if (input == NULL)
  6313. return (NULL);
  6314. stream = xmlNewIOInputStream(ctxt, input, XML_CHAR_ENCODING_NONE);
  6315. if (stream == NULL) {
  6316. xmlFreeParserInputBuffer(input);
  6317. return (NULL);
  6318. }
  6319. inputPush(ctxt, stream);
  6320. return (htmlDoRead(ctxt, URL, encoding, options, 1));
  6321. }
  6322. /**
  6323. * htmlCtxtReadIO:
  6324. * @ctxt: an HTML parser context
  6325. * @ioread: an I/O read function
  6326. * @ioclose: an I/O close function
  6327. * @ioctx: an I/O handler
  6328. * @URL: the base URL to use for the document
  6329. * @encoding: the document encoding, or NULL
  6330. * @options: a combination of htmlParserOption(s)
  6331. *
  6332. * parse an HTML document from I/O functions and source and build a tree.
  6333. * This reuses the existing @ctxt parser context
  6334. *
  6335. * Returns the resulting document tree
  6336. */
  6337. htmlDocPtr
  6338. htmlCtxtReadIO(htmlParserCtxtPtr ctxt, xmlInputReadCallback ioread,
  6339. xmlInputCloseCallback ioclose, void *ioctx,
  6340. const char *URL,
  6341. const char *encoding, int options)
  6342. {
  6343. xmlParserInputBufferPtr input;
  6344. xmlParserInputPtr stream;
  6345. if (ioread == NULL)
  6346. return (NULL);
  6347. if (ctxt == NULL)
  6348. return (NULL);
  6349. htmlCtxtReset(ctxt);
  6350. input = xmlParserInputBufferCreateIO(ioread, ioclose, ioctx,
  6351. XML_CHAR_ENCODING_NONE);
  6352. if (input == NULL)
  6353. return (NULL);
  6354. stream = xmlNewIOInputStream(ctxt, input, XML_CHAR_ENCODING_NONE);
  6355. if (stream == NULL) {
  6356. xmlFreeParserInputBuffer(input);
  6357. return (NULL);
  6358. }
  6359. inputPush(ctxt, stream);
  6360. return (htmlDoRead(ctxt, URL, encoding, options, 1));
  6361. }
  6362. #define bottom_HTMLparser
  6363. #include "elfgcchack.h"
  6364. #endif /* LIBXML_HTML_ENABLED */