123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478 |
- #!/usr/bin/python -u
- #
- # Original script modified in November 2003 to take advantage of
- # the character-validation range routines, and updated to the
- # current Unicode information (Version 4.0.1)
- #
- # NOTE: there is an 'alias' facility for blocks which are not present in
- # the current release, but are needed for ABI compatibility. This
- # must be accomplished MANUALLY! Please see the comments below under
- # 'blockAliases'
- #
- import sys
- import string
- import time
# Release page of the Unicode Character Database and the two data files
# (blocks file first, categories file second) that this script reads.
webpage = "http://www.unicode.org/Public/4.0-Update1/UCD-4.0.1.html"
sources = "Blocks-4.0.1.txt UnicodeData-4.0.1.txt"

#
# blockAliases is a small hack - it maps block names which were used in
# the 3.1 release, but are missing or changed in the current release.
# The format is "OldBlockName:NewBlockName1[,NewBlockName2[,...]]"
blockAliases = [
    "CombiningMarksforSymbols:CombiningDiacriticalMarksforSymbols",
    "Greek:GreekandCoptic",
    "PrivateUse:PrivateUseArea,SupplementaryPrivateUseArea-A," +
    "SupplementaryPrivateUseArea-B",
]

# minTableSize is the threshold for emitting a range table: a category
# with more than this many ranges gets a table, any other category is
# tested with inline comparisons.
minTableSize = 8

# The first file name is the blocks file, the second the categories file.
(blockfile, catfile) = sources.split()
#
# Now process the "blocks" file, reducing it to a dictionary
# indexed by blockname. Each entry is a list of (start, end) tuples
# holding the block range as "0x"-prefixed hexadecimal strings.
#
BlockNames = {}
try:
    blocks = open(blockfile, "r")
except IOError:
    print("Missing %s, aborting ..." % blockfile)
    sys.exit(1)
# 'with' closes the file even if parsing raises; the module-level name
# 'blocks' stays bound because later code in this script refers to it.
with blocks:
    for line in blocks:
        # Comment lines are recognised before stripping, so only a '#'
        # in the very first column counts.
        if line[0] == '#':
            continue
        line = line.strip()
        if line == '':
            continue
        try:
            fields = line.split(';')
            # "XXXX..YYYY" -> start, end (ValueError unless exactly two parts)
            (start, end) = fields[0].strip().split("..")
            # Block names are stored with all spaces removed.
            name = fields[1].strip().replace(' ', '')
        except (IndexError, ValueError):
            print("Failed to process line: %s" % (line))
            continue
        BlockNames.setdefault(name, []).append(("0x" + start, "0x" + end))
print("Parsed %d blocks descriptions" % (len(BlockNames)))
# Register every legacy alias as a block of its own whose range list is the
# concatenation of the ranges of the current blocks it maps to.
for entry in blockAliases:
    parts = entry.split(':')
    old_name = parts[0]
    for comp in parts[1].split(','):
        if comp not in BlockNames:
            print("Alias %s: %s not in Blocks" % (old_name, comp))
            continue
        # Create the alias entry lazily, only once a component is found.
        target = BlockNames.setdefault(old_name, [])
        for r in BlockNames[comp]:
            target.append(r)
#
# Next process the Categories file. This is more complex, since
# the file is in code sequence, and we need to invert it. We use
# a dictionary with index category-name, with each entry containing
# all the code points of that category. Note that category
# names comprise two parts - the general category, and the "subclass"
# within that category. Therefore, both "general category" (which is
# the first character of the 2-character category-name) and the full
# (2-character) name are entered into this dictionary.
#
# NOTE(review): every line is recorded as a single code point. If the
# data file encodes large ranges as "<..., First>" / "<..., Last>" line
# pairs, only the two end points are recorded here - confirm whether
# that is intended.
#
try:
    data = open(catfile, "r")
except IOError:
    print("Missing %s, aborting ..." % catfile)
    sys.exit(1)
nbchar = 0
Categories = {}
# 'with' guarantees that the categories file itself gets closed.
with data:
    for line in data:
        if line[0] == '#':
            continue
        line = line.strip()
        if line == '':
            continue
        try:
            fields = line.split(';')
            # Field 0 is the code point in hexadecimal; malformed values
            # are reported instead of being silently mis-parsed.
            value = int(fields[0].strip(), 16)
            # Field 2 is the category name.
            name = fields[2]
            if value < 0 or not name:
                raise ValueError("invalid code point or empty category")
        except (IndexError, ValueError):
            print("Failed to process line: %s" % (line))
            continue

        nbchar = nbchar + 1
        # update entry for "full name"
        Categories.setdefault(name, []).append(value)
        # update "general category" name
        Categories.setdefault(name[0], []).append(value)
print("Parsed %d char generating %d categories" % (nbchar, len(Categories)))
#
# The data is now all read. Time to process it into a more useful form.
#
# Collapse each category's list of code points into (first, last) tuples,
# merging runs of consecutive values; an isolated value yields (v, v).
for cat in Categories.keys():
    merged = []
    first = -1
    last = -1
    for cp in Categories[cat]:
        if first == -1:
            # very first code point opens the first run
            first = cp
        elif cp != last + 1:
            # gap found: close the current run and open a new one
            merged.append((first, last))
            first = cp
        last = cp
    # close the run that is still open after the last code point
    merged.append((first, last))
    Categories[cat] = merged
#
# Assure all data is in alphabetic order, since we will be doing binary
# searches on the tables.
#
bkeys = sorted(BlockNames.keys())
ckeys = sorted(Categories.keys())

#
# Generate the resulting files
#
try:
    header = open("include/libxml/xmlunicode.h", "w")
except IOError:
    print("Failed to open include/libxml/xmlunicode.h")
    sys.exit(1)
try:
    output = open("xmlunicode.c", "w")
except IOError:
    # Do not leave the already opened header handle dangling.
    header.close()
    print("Failed to open xmlunicode.c")
    sys.exit(1)

# Timestamp that is embedded in the banners of both generated files.
date = time.asctime(time.localtime(time.time()))
# Write the opening of the generated header file: the banner comment
# (filled with webpage, date and sources), the include guard, the
# LIBXML_UNICODE_ENABLED guard and the opening of the extern "C" section.
- header.write(
- """/*
- * Summary: Unicode character APIs
- * Description: API for the Unicode character APIs
- *
- * This file is automatically generated from the
- * UCS description files of the Unicode Character Database
- * %s
- * using the genUnicode.py Python script.
- *
- * Generation date: %s
- * Sources: %s
- * Author: Daniel Veillard
- */
- #ifndef __XML_UNICODE_H__
- #define __XML_UNICODE_H__
- #include <libxml/xmlversion.h>
- #ifdef LIBXML_UNICODE_ENABLED
- #ifdef __cplusplus
- extern "C" {
- #endif
- """ % (webpage, date, sources));
# Write the opening of the generated C file: the banner comment (same three
# substitutions), the includes, the xmlUnicodeRange / xmlUnicodeNameTable
# type definitions, the forward declaration of xmlUnicodeLookup, and the
# first line of the xmlUnicodeBlocks array, whose entries are written by
# the code that follows.
- output.write(
- """/*
- * xmlunicode.c: this module implements the Unicode character APIs
- *
- * This file is automatically generated from the
- * UCS description files of the Unicode Character Database
- * %s
- * using the genUnicode.py Python script.
- *
- * Generation date: %s
- * Sources: %s
- * Daniel Veillard <veillard@redhat.com>
- */
- #define IN_LIBXML
- #include "libxml.h"
- #ifdef LIBXML_UNICODE_ENABLED
- #include <string.h>
- #include <libxml/xmlversion.h>
- #include <libxml/xmlunicode.h>
- #include <libxml/chvalid.h>
- typedef int (xmlIntFunc)(int); /* just to keep one's mind untwisted */
- typedef struct {
- const char *rangename;
- xmlIntFunc *func;
- } xmlUnicodeRange;
- typedef struct {
- xmlUnicodeRange *table;
- int numentries;
- } xmlUnicodeNameTable;
- static xmlIntFunc *xmlUnicodeLookup(xmlUnicodeNameTable *tptr, const char *tname);
- static xmlUnicodeRange xmlUnicodeBlocks[] = {
- """ % (webpage, date, sources));
# Entries of the xmlUnicodeBlocks array: one {"block name", function} pair
# per block, in sorted order. The C function name is the block name with
# every '-' removed.
block_rows = []
for block in bkeys:
    name = block.replace('-', '')
    block_rows.append(' {"%s", xmlUCSIs%s}' % (block, name))
output.write(',\n'.join(block_rows))
output.write('};\n\n')

# The xmlUnicodeCats array: one {"category name", function} pair per
# category, in sorted order.
output.write('static xmlUnicodeRange xmlUnicodeCats[] = {\n')
cat_rows = []
for name in ckeys:
    cat_rows.append(' {"%s", xmlUCSIsCat%s}' % (name, name))
output.write(',\n'.join(cat_rows))
output.write('};\n\n')
#
# For any categories with more than minTableSize ranges we generate
# a range table suitable for xmlCharInRange
#
for name in ckeys:
    ranges = Categories[name]
    if len(ranges) <= minTableSize:
        continue
    numshort = 0
    numlong = 0
    sptr = "NULL"
    lptr = "NULL"
    for (low, high) in ranges:
        if high < 0x10000:
            # range goes into the "short" table (xmlChSRange)
            if numshort:
                pline += ", "
            else:
                pline = "static const xmlChSRange xml%sS[] = {" % name
                sptr = "xml%sS" % name
            numshort += 1
        else:
            # range goes into the "long" table (xmlChLRange)
            if numlong:
                pline += ", "
            else:
                # first long range: terminate the short table, if any
                if numshort > 0:
                    output.write(pline + " };\n")
                pline = "static const xmlChLRange xml%sL[] = {" % name
                lptr = "xml%sL" % name
            numlong += 1
        # flush the pending text once it grows beyond 60 characters
        if len(pline) > 60:
            output.write(pline + "\n")
            pline = " "
        pline += "{%s, %s}" % (hex(low), hex(high))
    # terminate the last table and emit the group that ties both together
    output.write(pline + " };\nstatic xmlChRangeGroup xml%sG = {%s,%s,%s,%s};\n\n"
                 % (name, numshort, numlong, sptr, lptr))
# Emit the two xmlUnicodeNameTable descriptors (each pairs a name array with
# its entry count, taken from len(BlockNames) and len(Categories)) followed
# by the C helper xmlUnicodeLookup. The helper binary-searches a table with
# strcmp and returns the matching function pointer, or NULL when the name is
# not found or an argument is NULL.
- output.write(
- """static xmlUnicodeNameTable xmlUnicodeBlockTbl = {xmlUnicodeBlocks, %s};
- static xmlUnicodeNameTable xmlUnicodeCatTbl = {xmlUnicodeCats, %s};
- /**
- * xmlUnicodeLookup:
- * @tptr: pointer to the name table
- * @name: name to be found
- *
- * binary table lookup for user-supplied name
- *
- * Returns pointer to range function if found, otherwise NULL
- */
- static xmlIntFunc
- *xmlUnicodeLookup(xmlUnicodeNameTable *tptr, const char *tname) {
- int low, high, mid, cmp;
- xmlUnicodeRange *sptr;
- if ((tptr == NULL) || (tname == NULL)) return(NULL);
- low = 0;
- high = tptr->numentries - 1;
- sptr = tptr->table;
- while (low <= high) {
- mid = (low + high) / 2;
- if ((cmp=strcmp(tname, sptr[mid].rangename)) == 0)
- return (sptr[mid].func);
- if (cmp < 0)
- high = mid - 1;
- else
- low = mid + 1;
- }
- return (NULL);
- }
- """ % (len(BlockNames), len(Categories)) )
# One predicate per block: a prototype goes to the header, the documented
# definition to the C file. The body ORs together one bounds test for each
# (start, end) range registered for the block.
for block in bkeys:
    name = block.replace('-', '')
    header.write("XMLPUBFUN int XMLCALL xmlUCSIs%s\t(int code);\n" % name)
    output.write("/**\n * xmlUCSIs%s:\n * @code: UCS code point\n" % (name))
    output.write(" *\n * Check whether the character is part of %s UCS Block\n"%
                 (block))
    output.write(" *\n * Returns 1 if true 0 otherwise\n */\n")
    output.write("int\nxmlUCSIs%s(int code) {\n return(" % name)
    tests = []
    for (start, end) in BlockNames[block]:
        tests.append("((code >= %s) && (code <= %s))" % (start, end))
    output.write(" ||\n ".join(tests))
    output.write(");\n}\n\n")
# Declare xmlUCSIsBlock in the header and emit its C definition. The
# generated function looks the block name up in xmlUnicodeBlockTbl and
# returns -1 for an unknown name, otherwise the result of the block's
# predicate function.
- header.write("\nXMLPUBFUN int XMLCALL xmlUCSIsBlock\t(int code, const char *block);\n\n")
- output.write(
- """/**
- * xmlUCSIsBlock:
- * @code: UCS code point
- * @block: UCS block name
- *
- * Check whether the character is part of the UCS Block
- *
- * Returns 1 if true, 0 if false and -1 on unknown block
- */
- int
- xmlUCSIsBlock(int code, const char *block) {
- xmlIntFunc *func;
- func = xmlUnicodeLookup(&xmlUnicodeBlockTbl, block);
- if (func == NULL)
- return (-1);
- return (func(code));
- }
- """)
# One predicate per category: a prototype goes to the header, the documented
# definition to the C file. Categories with more than minTableSize ranges
# call xmlCharInRange on their range group; the others get inline
# comparisons, with an equality test for single-code-point ranges.
for name in ckeys:
    ranges = Categories[name]
    header.write("XMLPUBFUN int XMLCALL xmlUCSIsCat%s\t(int code);\n" % name)
    output.write("/**\n * xmlUCSIsCat%s:\n * @code: UCS code point\n" % (name))
    output.write(" *\n * Check whether the character is part of %s UCS Category\n"%
                 (name))
    output.write(" *\n * Returns 1 if true 0 otherwise\n */\n")
    output.write("int\nxmlUCSIsCat%s(int code) {\n" % name)
    if len(ranges) > minTableSize:
        output.write(" return(xmlCharInRange((unsigned int)code, &xml%sG)"
                     % name)
    else:
        tests = []
        for (begin, end) in ranges:
            if begin == end:
                tests.append("(code == %s)" % (hex(begin)))
            else:
                tests.append("((code >= %s) && (code <= %s))" % (
                    hex(begin), hex(end)))
        if tests:
            output.write(" return(")
            output.write(" ||\n ".join(tests))
    # closes the "return(" opened by either branch, then the function body
    output.write(");\n}\n\n")
# Declare xmlUCSIsCat in the header and emit its C definition. The generated
# function looks the category name up in xmlUnicodeCatTbl and returns -1 for
# an unknown name, otherwise the result of the category's predicate
# function. The same template also ends the C file: the bottom_xmlunicode
# define, the elfgcchack.h include and the closing LIBXML_UNICODE_ENABLED
# guard.
- header.write("\nXMLPUBFUN int XMLCALL xmlUCSIsCat\t(int code, const char *cat);\n")
- output.write(
- """/**
- * xmlUCSIsCat:
- * @code: UCS code point
- * @cat: UCS Category name
- *
- * Check whether the character is part of the UCS Category
- *
- * Returns 1 if true, 0 if false and -1 on unknown category
- */
- int
- xmlUCSIsCat(int code, const char *cat) {
- xmlIntFunc *func;
- func = xmlUnicodeLookup(&xmlUnicodeCatTbl, cat);
- if (func == NULL)
- return (-1);
- return (func(code));
- }
- #define bottom_xmlunicode
- #include "elfgcchack.h"
- #endif /* LIBXML_UNICODE_ENABLED */
- """)
# End the header: close the extern "C" section, the feature guard and the
# include guard.
- header.write("""
- #ifdef __cplusplus
- }
- #endif
- #endif /* LIBXML_UNICODE_ENABLED */
- #endif /* __XML_UNICODE_H__ */
- """);
# Both generated files are complete; close them.
- header.close()
- output.close()
|