#!/usr/bin/python -u
#
# genUnicode.py
#
# Original script modified in November 2003 to take advantage of
# the character-validation range routines, and updated to the
# current Unicode information (Version 4.0.1)
#
# NOTE: there is an 'alias' facility for blocks which are not present in
#       the current release, but are needed for ABI compatibility. This
#       must be accomplished MANUALLY! Please see the comments below under
#       'blockAliases'
#
import string
import sys
import time
  15. webpage = "http://www.unicode.org/Public/4.0-Update1/UCD-4.0.1.html"
  16. sources = "Blocks-4.0.1.txt UnicodeData-4.0.1.txt"
  17. #
  18. # blockAliases is a small hack - it is used for mapping block names which
  19. # were were used in the 3.1 release, but are missing or changed in the current
  20. # release. The format is "OldBlockName:NewBlockName1[,NewBlockName2[,...]]"
  21. blockAliases = []
  22. blockAliases.append("CombiningMarksforSymbols:CombiningDiacriticalMarksforSymbols")
  23. blockAliases.append("Greek:GreekandCoptic")
  24. blockAliases.append("PrivateUse:PrivateUseArea,SupplementaryPrivateUseArea-A," +
  25. "SupplementaryPrivateUseArea-B")
  26. # minTableSize gives the minimum number of ranges which must be present
  27. # before a range table is produced. If there are less than this
  28. # number, inline comparisons are generated
  29. minTableSize = 8
  30. (blockfile, catfile) = string.split(sources)
  31. #
  32. # Now process the "blocks" file, reducing it to a dictionary
  33. # indexed by blockname, containing a tuple with the applicable
  34. # block range
  35. #
  36. BlockNames = {}
  37. try:
  38. blocks = open(blockfile, "r")
  39. except:
  40. print "Missing %s, aborting ..." % blockfile
  41. sys.exit(1)
  42. for line in blocks.readlines():
  43. if line[0] == '#':
  44. continue
  45. line = string.strip(line)
  46. if line == '':
  47. continue
  48. try:
  49. fields = string.split(line, ';')
  50. range = string.strip(fields[0])
  51. (start, end) = string.split(range, "..")
  52. name = string.strip(fields[1])
  53. name = string.replace(name, ' ', '')
  54. except:
  55. print "Failed to process line: %s" % (line)
  56. continue
  57. start = "0x" + start
  58. end = "0x" + end
  59. try:
  60. BlockNames[name].append((start, end))
  61. except:
  62. BlockNames[name] = [(start, end)]
  63. blocks.close()
  64. print "Parsed %d blocks descriptions" % (len(BlockNames.keys()))
  65. for block in blockAliases:
  66. alias = string.split(block,':')
  67. alist = string.split(alias[1],',')
  68. for comp in alist:
  69. if BlockNames.has_key(comp):
  70. if alias[0] not in BlockNames:
  71. BlockNames[alias[0]] = []
  72. for r in BlockNames[comp]:
  73. BlockNames[alias[0]].append(r)
  74. else:
  75. print "Alias %s: %s not in Blocks" % (alias[0], comp)
  76. continue
  77. #
  78. # Next process the Categories file. This is more complex, since
  79. # the file is in code sequence, and we need to invert it. We use
  80. # a dictionary with index category-name, with each entry containing
  81. # all the ranges (codepoints) of that category. Note that category
  82. # names comprise two parts - the general category, and the "subclass"
  83. # within that category. Therefore, both "general category" (which is
  84. # the first character of the 2-character category-name) and the full
  85. # (2-character) name are entered into this dictionary.
  86. #
  87. try:
  88. data = open(catfile, "r")
  89. except:
  90. print "Missing %s, aborting ..." % catfile
  91. sys.exit(1)
  92. nbchar = 0;
  93. Categories = {}
  94. for line in data.readlines():
  95. if line[0] == '#':
  96. continue
  97. line = string.strip(line)
  98. if line == '':
  99. continue
  100. try:
  101. fields = string.split(line, ';')
  102. point = string.strip(fields[0])
  103. value = 0
  104. while point != '':
  105. value = value * 16
  106. if point[0] >= '0' and point[0] <= '9':
  107. value = value + ord(point[0]) - ord('0')
  108. elif point[0] >= 'A' and point[0] <= 'F':
  109. value = value + 10 + ord(point[0]) - ord('A')
  110. elif point[0] >= 'a' and point[0] <= 'f':
  111. value = value + 10 + ord(point[0]) - ord('a')
  112. point = point[1:]
  113. name = fields[2]
  114. except:
  115. print "Failed to process line: %s" % (line)
  116. continue
  117. nbchar = nbchar + 1
  118. # update entry for "full name"
  119. try:
  120. Categories[name].append(value)
  121. except:
  122. try:
  123. Categories[name] = [value]
  124. except:
  125. print "Failed to process line: %s" % (line)
  126. # update "general category" name
  127. try:
  128. Categories[name[0]].append(value)
  129. except:
  130. try:
  131. Categories[name[0]] = [value]
  132. except:
  133. print "Failed to process line: %s" % (line)
  134. blocks.close()
  135. print "Parsed %d char generating %d categories" % (nbchar, len(Categories.keys()))
  136. #
  137. # The data is now all read. Time to process it into a more useful form.
  138. #
  139. # reduce the number list into ranges
  140. for cat in Categories.keys():
  141. list = Categories[cat]
  142. start = -1
  143. prev = -1
  144. end = -1
  145. ranges = []
  146. for val in list:
  147. if start == -1:
  148. start = val
  149. prev = val
  150. continue
  151. elif val == prev + 1:
  152. prev = val
  153. continue
  154. elif prev == start:
  155. ranges.append((prev, prev))
  156. start = val
  157. prev = val
  158. continue
  159. else:
  160. ranges.append((start, prev))
  161. start = val
  162. prev = val
  163. continue
  164. if prev == start:
  165. ranges.append((prev, prev))
  166. else:
  167. ranges.append((start, prev))
  168. Categories[cat] = ranges
  169. #
  170. # Assure all data is in alphabetic order, since we will be doing binary
  171. # searches on the tables.
  172. #
  173. bkeys = BlockNames.keys()
  174. bkeys.sort()
  175. ckeys = Categories.keys()
  176. ckeys.sort()
  177. #
  178. # Generate the resulting files
  179. #
  180. try:
  181. header = open("include/libxml/xmlunicode.h", "w")
  182. except:
  183. print "Failed to open include/libxml/xmlunicode.h"
  184. sys.exit(1)
  185. try:
  186. output = open("xmlunicode.c", "w")
  187. except:
  188. print "Failed to open xmlunicode.c"
  189. sys.exit(1)
  190. date = time.asctime(time.localtime(time.time()))
  191. header.write(
  192. """/*
  193. * Summary: Unicode character APIs
  194. * Description: API for the Unicode character APIs
  195. *
  196. * This file is automatically generated from the
  197. * UCS description files of the Unicode Character Database
  198. * %s
  199. * using the genUnicode.py Python script.
  200. *
  201. * Generation date: %s
  202. * Sources: %s
  203. * Author: Daniel Veillard
  204. */
  205. #ifndef __XML_UNICODE_H__
  206. #define __XML_UNICODE_H__
  207. #include <libxml/xmlversion.h>
  208. #ifdef LIBXML_UNICODE_ENABLED
  209. #ifdef __cplusplus
  210. extern "C" {
  211. #endif
  212. """ % (webpage, date, sources));
  213. output.write(
  214. """/*
  215. * xmlunicode.c: this module implements the Unicode character APIs
  216. *
  217. * This file is automatically generated from the
  218. * UCS description files of the Unicode Character Database
  219. * %s
  220. * using the genUnicode.py Python script.
  221. *
  222. * Generation date: %s
  223. * Sources: %s
  224. * Daniel Veillard <veillard@redhat.com>
  225. */
  226. #define IN_LIBXML
  227. #include "libxml.h"
  228. #ifdef LIBXML_UNICODE_ENABLED
  229. #include <string.h>
  230. #include <libxml/xmlversion.h>
  231. #include <libxml/xmlunicode.h>
  232. #include <libxml/chvalid.h>
  233. typedef int (xmlIntFunc)(int); /* just to keep one's mind untwisted */
  234. typedef struct {
  235. const char *rangename;
  236. xmlIntFunc *func;
  237. } xmlUnicodeRange;
  238. typedef struct {
  239. xmlUnicodeRange *table;
  240. int numentries;
  241. } xmlUnicodeNameTable;
  242. static xmlIntFunc *xmlUnicodeLookup(xmlUnicodeNameTable *tptr, const char *tname);
  243. static xmlUnicodeRange xmlUnicodeBlocks[] = {
  244. """ % (webpage, date, sources));
  245. flag = 0
  246. for block in bkeys:
  247. name = string.replace(block, '-', '')
  248. if flag:
  249. output.write(',\n')
  250. else:
  251. flag = 1
  252. output.write(' {"%s", xmlUCSIs%s}' % (block, name))
  253. output.write('};\n\n')
  254. output.write('static xmlUnicodeRange xmlUnicodeCats[] = {\n')
  255. flag = 0;
  256. for name in ckeys:
  257. if flag:
  258. output.write(',\n')
  259. else:
  260. flag = 1
  261. output.write(' {"%s", xmlUCSIsCat%s}' % (name, name))
  262. output.write('};\n\n')
  263. #
  264. # For any categories with more than minTableSize ranges we generate
  265. # a range table suitable for xmlCharInRange
  266. #
  267. for name in ckeys:
  268. if len(Categories[name]) > minTableSize:
  269. numshort = 0
  270. numlong = 0
  271. ranges = Categories[name]
  272. sptr = "NULL"
  273. lptr = "NULL"
  274. for range in ranges:
  275. (low, high) = range
  276. if high < 0x10000:
  277. if numshort == 0:
  278. pline = "static const xmlChSRange xml%sS[] = {" % name
  279. sptr = "xml%sS" % name
  280. else:
  281. pline += ", "
  282. numshort += 1
  283. else:
  284. if numlong == 0:
  285. if numshort > 0:
  286. output.write(pline + " };\n")
  287. pline = "static const xmlChLRange xml%sL[] = {" % name
  288. lptr = "xml%sL" % name
  289. else:
  290. pline += ", "
  291. numlong += 1
  292. if len(pline) > 60:
  293. output.write(pline + "\n")
  294. pline = " "
  295. pline += "{%s, %s}" % (hex(low), hex(high))
  296. output.write(pline + " };\nstatic xmlChRangeGroup xml%sG = {%s,%s,%s,%s};\n\n"
  297. % (name, numshort, numlong, sptr, lptr))
  298. output.write(
  299. """static xmlUnicodeNameTable xmlUnicodeBlockTbl = {xmlUnicodeBlocks, %s};
  300. static xmlUnicodeNameTable xmlUnicodeCatTbl = {xmlUnicodeCats, %s};
  301. /**
  302. * xmlUnicodeLookup:
  303. * @tptr: pointer to the name table
  304. * @name: name to be found
  305. *
  306. * binary table lookup for user-supplied name
  307. *
  308. * Returns pointer to range function if found, otherwise NULL
  309. */
  310. static xmlIntFunc
  311. *xmlUnicodeLookup(xmlUnicodeNameTable *tptr, const char *tname) {
  312. int low, high, mid, cmp;
  313. xmlUnicodeRange *sptr;
  314. if ((tptr == NULL) || (tname == NULL)) return(NULL);
  315. low = 0;
  316. high = tptr->numentries - 1;
  317. sptr = tptr->table;
  318. while (low <= high) {
  319. mid = (low + high) / 2;
  320. if ((cmp=strcmp(tname, sptr[mid].rangename)) == 0)
  321. return (sptr[mid].func);
  322. if (cmp < 0)
  323. high = mid - 1;
  324. else
  325. low = mid + 1;
  326. }
  327. return (NULL);
  328. }
  329. """ % (len(BlockNames), len(Categories)) )
  330. for block in bkeys:
  331. name = string.replace(block, '-', '')
  332. header.write("XMLPUBFUN int XMLCALL xmlUCSIs%s\t(int code);\n" % name)
  333. output.write("/**\n * xmlUCSIs%s:\n * @code: UCS code point\n" % (name))
  334. output.write(" *\n * Check whether the character is part of %s UCS Block\n"%
  335. (block))
  336. output.write(" *\n * Returns 1 if true 0 otherwise\n */\n");
  337. output.write("int\nxmlUCSIs%s(int code) {\n return(" % name)
  338. flag = 0
  339. for (start, end) in BlockNames[block]:
  340. if flag:
  341. output.write(" ||\n ")
  342. else:
  343. flag = 1
  344. output.write("((code >= %s) && (code <= %s))" % (start, end))
  345. output.write(");\n}\n\n")
  346. header.write("\nXMLPUBFUN int XMLCALL xmlUCSIsBlock\t(int code, const char *block);\n\n")
  347. output.write(
  348. """/**
  349. * xmlUCSIsBlock:
  350. * @code: UCS code point
  351. * @block: UCS block name
  352. *
  353. * Check whether the character is part of the UCS Block
  354. *
  355. * Returns 1 if true, 0 if false and -1 on unknown block
  356. */
  357. int
  358. xmlUCSIsBlock(int code, const char *block) {
  359. xmlIntFunc *func;
  360. func = xmlUnicodeLookup(&xmlUnicodeBlockTbl, block);
  361. if (func == NULL)
  362. return (-1);
  363. return (func(code));
  364. }
  365. """)
  366. for name in ckeys:
  367. ranges = Categories[name]
  368. header.write("XMLPUBFUN int XMLCALL xmlUCSIsCat%s\t(int code);\n" % name)
  369. output.write("/**\n * xmlUCSIsCat%s:\n * @code: UCS code point\n" % (name))
  370. output.write(" *\n * Check whether the character is part of %s UCS Category\n"%
  371. (name))
  372. output.write(" *\n * Returns 1 if true 0 otherwise\n */\n");
  373. output.write("int\nxmlUCSIsCat%s(int code) {\n" % name)
  374. if len(Categories[name]) > minTableSize:
  375. output.write(" return(xmlCharInRange((unsigned int)code, &xml%sG)"
  376. % name)
  377. else:
  378. start = 1
  379. for range in ranges:
  380. (begin, end) = range;
  381. if start:
  382. output.write(" return(");
  383. start = 0
  384. else:
  385. output.write(" ||\n ");
  386. if (begin == end):
  387. output.write("(code == %s)" % (hex(begin)))
  388. else:
  389. output.write("((code >= %s) && (code <= %s))" % (
  390. hex(begin), hex(end)))
  391. output.write(");\n}\n\n")
  392. header.write("\nXMLPUBFUN int XMLCALL xmlUCSIsCat\t(int code, const char *cat);\n")
  393. output.write(
  394. """/**
  395. * xmlUCSIsCat:
  396. * @code: UCS code point
  397. * @cat: UCS Category name
  398. *
  399. * Check whether the character is part of the UCS Category
  400. *
  401. * Returns 1 if true, 0 if false and -1 on unknown category
  402. */
  403. int
  404. xmlUCSIsCat(int code, const char *cat) {
  405. xmlIntFunc *func;
  406. func = xmlUnicodeLookup(&xmlUnicodeCatTbl, cat);
  407. if (func == NULL)
  408. return (-1);
  409. return (func(code));
  410. }
  411. #define bottom_xmlunicode
  412. #include "elfgcchack.h"
  413. #endif /* LIBXML_UNICODE_ENABLED */
  414. """)
  415. header.write("""
  416. #ifdef __cplusplus
  417. }
  418. #endif
  419. #endif /* LIBXML_UNICODE_ENABLED */
  420. #endif /* __XML_UNICODE_H__ */
  421. """);
  422. header.close()
  423. output.close()