12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258 |
- #!/usr/bin/python -u
- #
- # imports the API description and fills up a database with
- # name relevance to modules, functions or web pages
- #
- # Operation needed:
- # =================
- #
- # install mysqld, the python wrappers for mysql and libxml2, start mysqld
- # Change the root passwd of mysql:
- # mysqladmin -u root password new_password
- # Create the new database xmlsoft
- # mysqladmin -p create xmlsoft
- # Create a database user 'veillard' and give him password access
- # change veillard and abcde with the right user name and passwd
- # mysql -p
- # password:
- # mysql> GRANT ALL PRIVILEGES ON xmlsoft TO veillard@localhost
- # IDENTIFIED BY 'abcde' WITH GRANT OPTION;
- #
- # As the user check the access:
- # mysql -p xmlsoft
- # Enter password:
- # Welcome to the MySQL monitor....
- # mysql> use xmlsoft
- # Database changed
- # mysql> quit
- # Bye
- #
- # Then run the script in the doc subdir, it will create the symbols and
- # word tables and populate them with information extracted from
- # the libxml2-api.xml API description, and make them accessible read-only
- # by nobody@localhost, the user Apache is expected to run as
- #
- # On the Apache configuration, make sure you have php support enabled
- #
- import MySQLdb
- import libxml2
- import sys
- import string
- import os
- #
- # We are not interested in parsing errors here
- #
def callback(ctx, str):
    """Discard an error reported by libxml2.

    Both arguments (the registration context and the error text) are
    ignored, so parser diagnostics never reach the output.
    """
    return None


# Install the no-op handler; None is the context value handed back to it.
libxml2.registerErrorHandler(callback, None)
- #
- # The dictionary of tables required and the SQL command needed
- # to create them
- #
# Schema shared by the two query-log tables, which only differ by name.
_QUERY_LOG_SCHEMA = """CREATE TABLE %s (
    ID int(11) NOT NULL auto_increment,
    Value varchar(50) NOT NULL,
    Count int(11) NOT NULL,
    UNIQUE KEY id (ID,Value(35)),
    INDEX (ID))"""

# Table name -> SQL statement creating that table.
TABLES = {
    # one row per API symbol
    "symbols": """CREATE TABLE symbols (
    name varchar(255) BINARY NOT NULL,
    module varchar(255) BINARY NOT NULL,
    type varchar(25) NOT NULL,
    descr varchar(255),
    UNIQUE KEY name (name),
    KEY module (module))""",
    # word -> API symbol associations
    "words": """CREATE TABLE words (
    name varchar(50) BINARY NOT NULL,
    symbol varchar(255) BINARY NOT NULL,
    relevance int,
    KEY name (name),
    KEY symbol (symbol),
    UNIQUE KEY ID (name, symbol))""",
    # word -> HTML page associations
    "wordsHTML": """CREATE TABLE wordsHTML (
    name varchar(50) BINARY NOT NULL,
    resource varchar(255) BINARY NOT NULL,
    section varchar(255),
    id varchar(50),
    relevance int,
    KEY name (name),
    KEY resource (resource),
    UNIQUE KEY ref (name, resource))""",
    # word -> archived message associations
    "wordsArchive": """CREATE TABLE wordsArchive (
    name varchar(50) BINARY NOT NULL,
    ID int(11) NOT NULL,
    relevance int,
    KEY name (name),
    UNIQUE KEY ref (name, ID))""",
    # indexed HTML pages
    "pages": """CREATE TABLE pages (
    resource varchar(255) BINARY NOT NULL,
    title varchar(255) BINARY NOT NULL,
    UNIQUE KEY name (resource))""",
    # indexed mail archive messages
    "archives": """CREATE TABLE archives (
    ID int(11) NOT NULL auto_increment,
    resource varchar(255) BINARY NOT NULL,
    title varchar(255) BINARY NOT NULL,
    UNIQUE KEY id (ID,resource(255)),
    INDEX (ID),
    INDEX (resource))""",
    "Queries": _QUERY_LOG_SCHEMA % ("Queries"),
    "AllQueries": _QUERY_LOG_SCHEMA % ("AllQueries"),
}
- #
- # The XML API description file to parse
- #
# File name of the XML API description loaded by analyzeAPITop().
API = "libxml2-api.xml"

# Shared MySQL connection; remains None until openMySQL() has connected.
DB = None
- #########################################################################
- # #
- # MySQL database interfaces #
- # #
- #########################################################################
def createTable(db, name):
    """Drop and re-create one of the tables described in TABLES.

    db is an open database connection and name a key of TABLES.
    Returns what execute() returned for the CREATE statement, or -1 when
    an argument is None, the name is not a known table, or the creation
    fails.
    """
    if db is None or name is None:
        return -1
    # Table names cannot be bound as query parameters, so refuse anything
    # that is not part of the known schema *before* dropping a table.
    schema = TABLES.get(name)
    if schema is None:
        print("Failed to create table %s" % (name))
        return -1
    cursor = db.cursor()
    if cursor.execute("DROP TABLE IF EXISTS %s" % (name)) == 1:
        print("Removed table %s" % (name))
    print("Creating table %s" % (name))
    try:
        return cursor.execute(schema)
    except Exception:
        print("Failed to create table %s" % (name))
        return -1
def checkTables(db, verbose = 1):
    """Make sure every table of TABLES exists and is readable.

    Missing tables are created through createTable(); a table whose row
    count cannot be read is passed to "repair table" and counted again.
    Finally read access (and write access to Queries) is granted to
    nobody@localhost on a best-effort basis.

    Returns -1 when db is None, 0 otherwise.
    """
    if db is None:
        return -1
    cursor = db.cursor()
    nbtables = cursor.execute("show tables")
    if verbose:
        print("Found %d tables" % (nbtables))
    existing = [row[0] for row in cursor.fetchall()]
    for table in TABLES.keys():
        if table not in existing:
            print("table %s missing" % (table))
            createTable(db, table)
        try:
            cursor.execute("SELECT count(*) from %s" % table)
            row = cursor.fetchone()
            if verbose:
                print("Table %s contains %d records" % (table, row[0]))
        except Exception:
            print("Troubles with table %s : repairing" % (table))
            ret = cursor.execute("repair table %s" % table)
            print("repairing returned %d" % (ret))
            cursor.execute("SELECT count(*) from %s" % table)
            row = cursor.fetchone()
            print("Table %s contains %d records" % (table, row[0]))
    if verbose:
        print("checkTables finished")
    # make sure apache can access the tables read-only
    try:
        cursor.execute("GRANT SELECT ON xmlsoft.* TO nobody@localhost")
        cursor.execute("GRANT INSERT,SELECT,UPDATE ON xmlsoft.Queries TO nobody@localhost")
    except Exception:
        # Still best effort, but no longer silent: without the grants the
        # tables cannot be read by that account.
        print("Failed to grant access to nobody@localhost: %s" %
              (sys.exc_info()[1]))
    return 0
-
def openMySQL(db="xmlsoft", passwd=None, verbose = 1):
    """Connect to the MySQL database and store the connection in DB.

    When passwd is None the password is read from the MySQL_PASS
    environment variable; the process exits with status 1 if it is not
    set.  Returns the result of checkTables() on the new connection, or
    -1 if no connection object was obtained.
    """
    global DB
    if passwd is None:
        passwd = os.environ.get("MySQL_PASS")
        if passwd is None:
            print("No password available, set environment MySQL_PASS")
            sys.exit(1)
    DB = MySQLdb.connect(passwd=passwd, db=db)
    if DB is None:
        return -1
    return checkTables(DB, verbose)
def updateWord(name, symbol, relevance):
    """Store the relevance of word `name` for API symbol `symbol`.

    An INSERT is tried first; if it fails (typically because the
    (name, symbol) row already exists) the row is updated instead.
    The connection is opened on demand.

    Returns what execute() returned, or -1 on a missing argument, a
    missing connection or a failed update.
    """
    if DB is None:
        openMySQL()
        if DB is None:
            return -1
    if name is None or symbol is None:
        return -1
    cursor = DB.cursor()
    try:
        # Values are bound by the driver rather than pasted into the SQL
        # text, so quotes in a word cannot break or alter the statement.
        ret = cursor.execute(
            "INSERT INTO words (name, symbol, relevance) VALUES (%s, %s, %s)",
            (name, symbol, relevance))
    except Exception:
        query = "UPDATE words SET relevance = %s where name = %s and symbol = %s"
        params = (relevance, name, symbol)
        try:
            ret = cursor.execute(query, params)
        except Exception:
            print("Update word (%s, %s, %s) failed command" % (name, symbol, relevance))
            print("%s with parameters %r" % (query, params))
            print("%s %s" % sys.exc_info()[:2])
            return -1

    return ret
def updateSymbol(name, module, type, desc):
    """Record API symbol `name` of kind `type` defined in `module`.

    The symbol name is first indexed as a word for itself with relevance
    50.  Only the first sentence of desc (text up to the first "."), with
    quotes blanked and cut to 99 characters, is stored.  An INSERT is
    tried first and replaced by an UPDATE if it fails.

    Returns what execute() returned, or -1 on a missing argument, a
    missing connection or a failed update.
    """
    updateWord(name, name, 50)
    if DB is None:
        openMySQL()
        if DB is None:
            return -1
    if name is None or module is None or type is None:
        return -1
    try:
        desc = desc.replace("'", " ").split(".")[0][0:99]
    except Exception:
        # desc is None or not a string: store an empty description
        desc = ""
    cursor = DB.cursor()
    try:
        # Values are bound by the driver instead of being formatted into
        # the statement text.
        ret = cursor.execute(
            "INSERT INTO symbols (name, module, type, descr) VALUES (%s, %s, %s, %s)",
            (name, module, type, desc))
    except Exception:
        query = "UPDATE symbols SET module=%s, type=%s, descr=%s where name=%s"
        params = (module, type, desc, name)
        try:
            ret = cursor.execute(query, params)
        except Exception:
            print("Update symbol (%s, %s, %s) failed command" % (name, module, type))
            print("%s with parameters %r" % (query, params))
            print("%s %s" % sys.exc_info()[:2])
            return -1

    return ret
-
def addFunction(name, module, desc = ""):
    """Record `name` from `module` as a symbol of type 'function'."""
    kind = 'function'
    return updateSymbol(name, module, kind, desc)
def addMacro(name, module, desc = ""):
    """Record `name` from `module` as a symbol of type 'macro'."""
    kind = 'macro'
    return updateSymbol(name, module, kind, desc)
def addEnum(name, module, desc = ""):
    """Record `name` from `module` as a symbol of type 'enum'."""
    kind = 'enum'
    return updateSymbol(name, module, kind, desc)
def addStruct(name, module, desc = ""):
    """Record `name` from `module` as a symbol of type 'struct'."""
    kind = 'struct'
    return updateSymbol(name, module, kind, desc)
def addConst(name, module, desc = ""):
    """Record `name` from `module` as a symbol of type 'const'."""
    kind = 'const'
    return updateSymbol(name, module, kind, desc)
def addType(name, module, desc = ""):
    """Record `name` from `module` as a symbol of type 'type'."""
    kind = 'type'
    return updateSymbol(name, module, kind, desc)
def addFunctype(name, module, desc = ""):
    """Record `name` from `module` as a symbol of type 'functype'."""
    kind = 'functype'
    return updateSymbol(name, module, kind, desc)
def addPage(resource, title):
    """Register HTML page `resource` with its `title` in the pages table.

    An INSERT is tried first; if it fails (typically because the page is
    already registered) its title is updated instead.  The connection is
    opened on demand.

    Returns what execute() returned, or -1 on a missing resource, a
    missing connection or a failed update.
    """
    if DB is None:
        openMySQL()
        if DB is None:
            return -1
    if resource is None:
        return -1
    cursor = DB.cursor()
    try:
        # Titles come straight from the pages and may contain quotes, so
        # let the driver bind the values.
        ret = cursor.execute(
            "INSERT INTO pages (resource, title) VALUES (%s, %s)",
            (resource, title))
    except Exception:
        query = "UPDATE pages SET title=%s WHERE resource=%s"
        params = (title, resource)
        try:
            ret = cursor.execute(query, params)
        except Exception:
            # The previous message formatted names that do not exist in
            # this function, turning any failure into a NameError.
            print("Update page (%s, %s) failed command" % (resource, title))
            print("%s with parameters %r" % (query, params))
            print("%s %s" % sys.exc_info()[:2])
            return -1

    return ret
def updateWordHTML(name, resource, desc, id, relevance):
    """Store the relevance of word `name` for HTML page `resource`.

    desc is the section title (quotes blanked, cut to 99 characters) and
    id the anchor inside the page; None is stored as "" for both.  An
    INSERT is tried first and replaced by an UPDATE if it fails.

    Returns what execute() returned, or -1 on a missing argument, a
    missing connection or a failed update.
    """
    if DB is None:
        openMySQL()
        if DB is None:
            return -1
    if name is None or resource is None:
        return -1
    if id is None:
        id = ""
    if desc is None:
        desc = ""
    else:
        try:
            desc = desc.replace("'", " ")[0:99]
        except Exception:
            desc = ""
    cursor = DB.cursor()
    try:
        # Values are bound by the driver instead of being formatted into
        # the statement text.
        ret = cursor.execute(
            "INSERT INTO wordsHTML (name, resource, section, id, relevance) "
            "VALUES (%s, %s, %s, %s, %s)",
            (name, resource, desc, id, relevance))
    except Exception:
        query = ("UPDATE wordsHTML SET section=%s, id=%s, relevance=%s "
                 "where name=%s and resource=%s")
        params = (desc, id, relevance, name, resource)
        try:
            ret = cursor.execute(query, params)
        except Exception:
            print("Update symbol (%s, %s, %s) failed command" % (name, resource, relevance))
            print("%s with parameters %r" % (query, params))
            print("%s %s" % sys.exc_info()[:2])
            return -1

    return ret
def checkXMLMsgArchive(url):
    """Return the ID under which `url` is stored in the archives table.

    Returns -1 when the URL is None or not registered, when no connection
    is available or when the query fails.
    """
    if DB is None:
        openMySQL()
        if DB is None:
            return -1
    if url is None:
        return -1
    cursor = DB.cursor()
    try:
        # The URL is bound by the driver, not pasted into the SQL text.
        cursor.execute("SELECT ID FROM archives WHERE resource=%s", (url,))
        row = cursor.fetchone()
    except Exception:
        return -1
    if row is None:
        return -1
    return row[0]
-
def addXMLMsgArchive(url, title):
    """Register archived message `url` and return its new integer ID.

    The title has its quotes blanked and is cut to 99 characters; None is
    stored as "".  The ID is read back with a SELECT on the URL.

    Returns -1 when the URL is None, no connection is available, the
    statements fail or the ID cannot be read back.
    """
    if DB is None:
        openMySQL()
        if DB is None:
            return -1
    if url is None:
        return -1
    if title is None:
        title = ""
    else:
        title = title.replace("'", " ")[0:99]
    cursor = DB.cursor()
    cmd = "INSERT INTO archives (resource, title) VALUES (%s, %s)"
    try:
        # Values are bound by the driver instead of being formatted into
        # the statement text.
        cursor.execute(cmd, (url, title))
        cmd = "SELECT ID FROM archives WHERE resource=%s"
        cursor.execute(cmd, (url,))
        row = cursor.fetchone()
    except Exception:
        print("addXMLMsgArchive failed command: %s" % (cmd))
        return -1
    if row is None:
        print("addXMLMsgArchive failed to get the ID: %s" % (url))
        return -1

    return int(row[0])
def updateWordArchive(name, id, relevance):
    """Store the relevance of word `name` for archived message `id`.

    An INSERT is tried first; if it fails (typically because the
    (name, ID) row already exists) the row is updated instead.

    Returns what execute() returned, or -1 on a missing argument, a
    missing connection or a failed update.
    """
    if DB is None:
        openMySQL()
        if DB is None:
            return -1
    if name is None or id is None:
        return -1
    cursor = DB.cursor()
    try:
        # Values are bound by the driver instead of being formatted into
        # the statement text.
        ret = cursor.execute(
            "INSERT INTO wordsArchive (name, id, relevance) VALUES (%s, %s, %s)",
            (name, id, relevance))
    except Exception:
        query = "UPDATE wordsArchive SET relevance=%s where name=%s and ID=%s"
        params = (relevance, name, id)
        try:
            ret = cursor.execute(query, params)
        except Exception:
            print("Update word archive (%s, %s, %s) failed command" % (name, id, relevance))
            print("%s with parameters %r" % (query, params))
            print("%s %s" % sys.exc_info()[:2])
            return -1

    return ret
- #########################################################################
- # #
- # Word dictionary and analysis routines #
- # #
- #########################################################################
- #
- # Top 100 English words, excluding those shorter than 3 letters, plus our own set
- #
# Words never indexed; only the keys matter, every value is 0.
# Lookups are case-sensitive, hence the capitalised 'Okay'.
dropWords = {}
for _dropped in """
        the this can man had him only
        and not been other even are was
        new most but when some made from
        who could after that will time also
        have more these did two many
        they may before for which out then
        must one through with you said
        first back were what any years his
        her where all its now much she
        about such your there into like
        would than our well their them over
        down
        net www bad Okay bin cur
        """.split():
    dropWords[_dropped] = 0
del _dropped
# word -> {(module, symbol): relevance}; filled by addWord(), which stores
# None for a word it has given up on.
wordsDict = dict()
# word -> {resource: (relevance, id, section)}; filled by addWordHTML().
wordsDictHTML = dict()
# word -> {archive ID: relevance}; filled by addWordArchive().
wordsDictArchive = dict()
def cleanupWordsString(str):
    """Return `str` with punctuation and line breaks turned into spaces.

    Each separator becomes exactly one space, so the length of the text
    is unchanged; "\\xc2" and "\\xa0" are blanked as well.
    """
    separators = (".", "!", "?", ",", "'", '"', ";", "(", ")", "{", "}",
                  "<", ">", "=", "/", "*", ":", "#", "\\", "\n", "\r",
                  "\xc2", "\xa0")
    cleaned = str
    for mark in separators:
        cleaned = cleaned.replace(mark, " ")
    return cleaned
-
def cleanupDescrString(str):
    """Return `str` as a single line with normalised spacing.

    Quotes, line breaks, "\\xc2" and "\\xa0" are turned into spaces, then
    runs of whitespace are collapsed to one space and the ends trimmed.
    """
    text = str
    for mark in ("'", "\n", "\r", "\xc2", "\xa0"):
        text = text.replace(mark, " ")
    # Join the split *words*; joining the string itself, as was done
    # before, put a space between every character ("abc" -> "a b c").
    return " ".join(text.split())
def splitIdentifier(str):
    """Split an identifier into its lower-cased component words.

    A word starts at an ASCII letter and takes the following run of
    upper-case letters, then the following run of lower-case letters.
    Digits right after a word are dropped and any other character acts
    as a separator, e.g. "xmlParseFile" -> ["xml", "parse", "file"].
    """
    words = []
    pos = 0
    end = len(str)
    while pos < end:
        first = str[pos].lower()
        pos = pos + 1
        if not ('a' <= first <= 'z'):
            continue
        # run of capitals glued to the first letter ("XMLParser" -> "xmlp...")
        start = pos
        while pos < end and 'A' <= str[pos] <= 'Z':
            pos = pos + 1
        capitals = str[start:pos].lower()
        # lower-case tail of the word
        start = pos
        while pos < end and 'a' <= str[pos] <= 'z':
            pos = pos + 1
        tail = str[start:pos]
        # trailing digits are skipped, not kept
        while pos < end and '0' <= str[pos] <= '9':
            pos = pos + 1
        words.append(first + capitals + tail)
    return words
def addWord(word, module, symbol, relevance):
    """Add `relevance` to the score of `word` for (module, symbol).

    Scores are accumulated in wordsDict.  Returns -1 for an unusable
    argument (word None or shorter than 3 characters, module or symbol
    None), 0 when the word is ignored (listed in dropWords, first
    character above 0x80, or already given up on) and the accumulated
    relevance otherwise.
    """
    if word is None or len(word) < 3:
        return -1
    if module is None or symbol is None:
        return -1
    if word in dropWords:
        return 0
    if ord(word[0]) > 0x80:
        return 0
    key = (module, symbol)
    if word in wordsDict:
        refs = wordsDict[word]
        if refs is None:
            return 0
        if len(refs) > 500:
            # Attached to more than 500 symbols: give up on this word for
            # good by replacing its table with None.
            wordsDict[word] = None
            return 0
        relevance = relevance + refs.get(key, 0)
    else:
        refs = wordsDict[word] = {}
    refs[key] = relevance
    return relevance
-
def addString(str, module, symbol, relevance):
    """Index every word of `str` for (module, symbol).

    The text is cleaned with cleanupWordsString() and each word longer
    than 2 characters is passed to addWord() with `relevance`.
    Returns the sum of the addWord() results, or -1 when str is None or
    shorter than 3 characters.
    """
    if str is None or len(str) < 3:
        return -1
    total = 0
    for word in cleanupWordsString(str).split():
        if len(word) > 2:
            # Use the caller's relevance; it used to be ignored in favour
            # of a hard-coded 5.
            total = total + addWord(word, module, symbol, relevance)
    return total
def addWordHTML(word, resource, id, section, relevance):
    """Add `relevance` to the score of `word` for HTML page `resource`.

    Scores are accumulated in wordsDictHTML as (relevance, id, section).
    When the page already has an entry for the word, its recorded id and
    section are kept unless they are None.  Returns -1 for an unusable
    argument, 0 when the word is ignored and the accumulated relevance
    otherwise.
    """
    if word is None or len(word) < 3:
        return -1
    if resource is None or section is None:
        return -1
    if word in dropWords:
        return 0
    if ord(word[0]) > 0x80:
        return 0
    section = cleanupDescrString(section)
    if word in wordsDictHTML:
        refs = wordsDictHTML[word]
        if refs is None:
            print("skipped %s" % (word))
            return 0
        if resource in refs:
            (known, known_id, known_section) = refs[resource]
            if known_id is not None:
                id = known_id
            if known_section is not None:
                section = known_section
            relevance = relevance + known
    else:
        refs = wordsDictHTML[word] = {}
    refs[resource] = (relevance, id, section)
    return relevance
-
def addStringHTML(str, resource, id, section, relevance):
    """Index every word of `str` for HTML page `resource`.

    The text is cleaned with cleanupWordsString() and each word longer
    than 2 characters is passed to addWordHTML().  Failures are reported
    and skipped.  Returns the sum of the successful addWordHTML()
    results, or -1 when str is None or shorter than 3 characters.
    """
    if str is None or len(str) < 3:
        return -1
    total = 0
    for word in cleanupWordsString(str).split():
        if len(word) <= 2:
            continue
        try:
            r = addWordHTML(word, resource, id, section, relevance)
        except Exception:
            print("addWordHTML failed: %s %s %s" % (word, resource, relevance))
            print("%s %s" % sys.exc_info()[:2])
            continue
        if r < 0:
            # Report the error code but keep it out of the total, as
            # addStringArchive() does.
            print("addWordHTML failed: %s %s" % (word, resource))
        else:
            total = total + r
    return total
def addWordArchive(word, id, relevance):
    """Add `relevance` to the score of `word` for archived message `id`.

    Scores are accumulated in wordsDictArchive.  Returns -1 for an
    unusable argument (word None or shorter than 3 characters, id None or
    -1), 0 when the word is ignored and the accumulated relevance
    otherwise.
    """
    if word is None or len(word) < 3:
        return -1
    if id is None or id == -1:
        return -1
    if word in dropWords:
        return 0
    if ord(word[0]) > 0x80:
        return 0
    if word in wordsDictArchive:
        refs = wordsDictArchive[word]
        if refs is None:
            print("skipped %s" % (word))
            return 0
        relevance = relevance + refs.get(id, 0)
    else:
        refs = wordsDictArchive[word] = {}
    refs[id] = relevance
    return relevance
-
def addStringArchive(str, id, relevance):
    """Index every word of `str` for archived message `id`.

    The text is cleaned with cleanupWordsString() and each word longer
    than 2 characters is passed to addWordArchive().  Failures are
    reported and skipped.  Returns the sum of the successful results, or
    -1 when str is None or shorter than 3 characters.
    """
    if str is None or len(str) < 3:
        return -1
    total = 0
    for word in cleanupWordsString(str).split():
        if len(word) <= 2:
            continue
        try:
            r = addWordArchive(word, id, relevance)
        except Exception:
            print("addWordArchive failed: %s %s %s" % (word, id, relevance))
            print("%s %s" % sys.exc_info()[:2])
            continue
        if r < 0:
            print("addWordArchive failed: %s %s" % (word, id))
        else:
            total = total + r
    return total
- #########################################################################
- # #
- # XML API description analysis #
- # #
- #########################################################################
def loadAPI(filename):
    """Parse the XML file `filename` with libxml2 and return the document."""
    document = libxml2.parseFile(filename)
    print("loaded %s" % (filename))
    return document
def foundExport(file, symbol):
    """Record `symbol` as a function exported by module `file`.

    The words making up the symbol name are indexed with relevance 10.
    Returns 1 when the symbol was recorded, 0 if an argument is None.
    """
    if file is None or symbol is None:
        return 0
    addFunction(symbol, file)
    for part in splitIdentifier(symbol):
        addWord(part, file, symbol, 10)
    return 1
-
def analyzeAPIFile(top):
    """Record the symbols exported by one <file> element of the API doc.

    Each <exports> child is passed to foundExport() with the file's
    "name" property; text nodes are skipped and any other element is
    reported.  Returns the number of exports recorded.
    """
    count = 0
    name = top.prop("name")
    cur = top.children
    while cur is not None:
        if cur.type != 'text':
            if cur.name == "exports":
                count = count + foundExport(name, cur.prop("symbol"))
            else:
                # The format needs both the element and the file name; it
                # used to receive a single value and raise TypeError.
                print("unexpected element %s in API doc <file name='%s'>" %
                      (cur.name, name))
        cur = cur.next
    return count
def analyzeAPIFiles(top):
    """Process every <file> child of the <files> element `top`.

    Text nodes are skipped and any other element is reported.
    Returns the sum of the analyzeAPIFile() results.
    """
    total = 0
    node = top.children
    while node is not None:
        if node.type != 'text':
            if node.name == "file":
                total = total + analyzeAPIFile(node)
            else:
                print("unexpected element %s in API doc <files>" % (node.name))
        node = node.next
    return total
def analyzeAPIEnum(top):
    """Record the enum described by element `top`.

    The "file" and "name" properties give the module and the symbol; the
    words of the symbol name are indexed with relevance 10.
    Returns 1 when recorded, 0 if either property is missing.
    """
    module = top.prop("file")
    if module is None:
        return 0
    symbol = top.prop("name")
    if symbol is None:
        return 0
    addEnum(symbol, module)
    for part in splitIdentifier(symbol):
        addWord(part, module, symbol, 10)
    return 1
def analyzeAPIConst(top):
    """Record the constant described by element `top`.

    The "file" and "name" properties give the module and the symbol; the
    words of the symbol name are indexed with relevance 10.
    Returns 1 when recorded, 0 if either property is missing.
    """
    module = top.prop("file")
    if module is None:
        return 0
    symbol = top.prop("name")
    if symbol is None:
        return 0
    addConst(symbol, module)
    for part in splitIdentifier(symbol):
        addWord(part, module, symbol, 10)
    return 1
def analyzeAPIType(top):
    """Record the typedef described by element `top`.

    The "file" and "name" properties give the module and the symbol; the
    words of the symbol name are indexed with relevance 10.
    Returns 1 when recorded, 0 if either property is missing.
    """
    module = top.prop("file")
    if module is None:
        return 0
    symbol = top.prop("name")
    if symbol is None:
        return 0
    addType(symbol, module)
    for part in splitIdentifier(symbol):
        addWord(part, module, symbol, 10)
    return 1
def analyzeAPIFunctype(top):
    """Record the function type described by element `top`.

    The "file" and "name" properties give the module and the symbol; the
    words of the symbol name are indexed with relevance 10.
    Returns 1 when recorded, 0 if either property is missing.
    """
    module = top.prop("file")
    if module is None:
        return 0
    symbol = top.prop("name")
    if symbol is None:
        return 0
    addFunctype(symbol, module)
    for part in splitIdentifier(symbol):
        addWord(part, module, symbol, 10)
    return 1
def analyzeAPIStruct(top):
    """Record the struct described by element `top`.

    The words of the symbol name are indexed with relevance 10 and the
    words longer than 2 characters of the optional "info" property with
    relevance 5.  Returns 1 when recorded, 0 if the "file" or "name"
    property is missing.
    """
    module = top.prop("file")
    if module is None:
        return 0
    symbol = top.prop("name")
    if symbol is None:
        return 0
    addStruct(symbol, module)
    for part in splitIdentifier(symbol):
        addWord(part, module, symbol, 10)
    info = top.prop("info")
    if info is not None:
        for word in info.replace("'", " ").strip().split():
            if len(word) > 2:
                addWord(word, module, symbol, 5)
    return 1
def analyzeAPIMacro(top):
    """Record the macro described by element `top`.

    The description is the content of the first <info> child.  The words
    of the macro name are indexed with relevance 10 and the words longer
    than 2 characters of the description with relevance 5.

    Returns 1 when recorded with a description, and 0 when the "file" or
    "name" property is missing or when the macro has no <info> (it is
    still recorded in that case).
    """
    module = top.prop("file")
    if module is None:
        return 0
    symbol = top.prop("name")
    if symbol is None:
        return 0
    symbol = symbol.replace("'", " ").strip()
    info = None
    child = top.children
    while child is not None:
        if child.type != 'text' and child.name == "info":
            info = child.content
            break
        child = child.next
    for part in splitIdentifier(symbol):
        addWord(part, module, symbol, 10)
    if info is None:
        addMacro(symbol, module)
        print("Macro %s description has no <info>" % (symbol))
        return 0
    info = info.replace("'", " ").strip()
    addMacro(symbol, module, info)
    for word in info.split():
        if len(word) > 2:
            addWord(word, module, symbol, 5)
    return 1
def analyzeAPIFunction(top):
    """Record the function described by element `top`.

    Indexed text and its relevance:
      - "info" property of the <return> child: 7
      - "info" property of each <arg> child: 5, and its "name": 7
      - content of the <info> child: 5 (the last one wins)
      - words of the function name: 10

    Returns 1 when recorded, 0 if the "file" or "name" property is
    missing.
    """
    module = top.prop("file")
    if module is None:
        return 0
    symbol = top.prop("name")
    if symbol is None:
        return 0
    symbol = symbol.replace("'", " ").strip()
    info = None
    child = top.children
    while child is not None:
        if child.type != 'text':
            if child.name == "info":
                info = child.content
            elif child.name == "return":
                rinfo = child.prop("info")
                if rinfo is not None:
                    rinfo = rinfo.replace("'", " ").strip()
                    addString(rinfo, module, symbol, 7)
            elif child.name == "arg":
                ainfo = child.prop("info")
                if ainfo is not None:
                    ainfo = ainfo.replace("'", " ").strip()
                    addString(ainfo, module, symbol, 5)
                name = child.prop("name")
                if name is not None:
                    name = name.replace("'", " ").strip()
                    addWord(name, module, symbol, 7)
        child = child.next
    if info is None:
        print("Function %s description has no <info>" % (symbol))
        addFunction(symbol, module, "")
    else:
        info = info.replace("'", " ").strip()
        addFunction(symbol, module, info)
        addString(info, module, symbol, 5)
    for part in splitIdentifier(symbol):
        addWord(part, module, symbol, 10)
    return 1
def analyzeAPISymbols(top):
    """Process every child of the <symbols> element `top`.

    macro, function, const, typedef, struct, enum and functype elements
    go to their analyzeAPI* handler; text nodes are skipped and any other
    element is reported.  Returns the sum of the handler results.
    """
    count = 0
    cur = top.children
    while cur is not None:
        if cur.type != 'text':
            kind = cur.name
            if kind == "macro":
                count = count + analyzeAPIMacro(cur)
            elif kind == "function":
                count = count + analyzeAPIFunction(cur)
            elif kind == "const":
                count = count + analyzeAPIConst(cur)
            elif kind == "typedef":
                count = count + analyzeAPIType(cur)
            elif kind == "struct":
                count = count + analyzeAPIStruct(cur)
            elif kind == "enum":
                count = count + analyzeAPIEnum(cur)
            elif kind == "functype":
                count = count + analyzeAPIFunctype(cur)
            else:
                # Name the container actually being scanned; the message
                # used to say <files>.
                print("unexpected element %s in API doc <symbols>" % (kind))
        cur = cur.next
    return count
def analyzeAPI(doc):
    """Index the API description held in the parsed document `doc`.

    Only the <symbols> section is analysed; <files> is accepted but
    skipped (its analysis through analyzeAPIFiles() is disabled).
    Returns the number of blocks recorded, or -1 when doc is None or its
    root element is not <api>.
    """
    if doc is None:
        return -1
    root = doc.getRootElement()
    if root.name != "api":
        print("Unexpected root name")
        return -1
    total = 0
    node = root.children
    while node is not None:
        if node.type != 'text':
            if node.name == "symbols":
                total = total + analyzeAPISymbols(node)
            elif node.name != "files":
                print("unexpected element %s in API doc" % (node.name))
        node = node.next
    return total
- #########################################################################
- # #
- # Web pages parsing and analysis #
- # #
- #########################################################################
- import glob
def analyzeHTMLText(doc, resource, p, section, id):
    """Index the content of text node `p` of page `resource`.

    The words are attached to `section` / `id` with relevance 5; doc is
    unused.  Returns the addStringHTML() result, or -1 if the content
    cannot be read or indexed.
    """
    try:
        return addStringHTML(p.content, resource, id, section, 5)
    except Exception:
        # Exception (not a bare except) so that Ctrl-C still interrupts.
        return -1
def analyzeHTMLPara(doc, resource, p, section, id):
    """Index the content of paragraph node `p` of page `resource`.

    The words are attached to `section` / `id` with relevance 5; doc is
    unused.  Returns the addStringHTML() result, or -1 if the content
    cannot be read or indexed.
    """
    try:
        return addStringHTML(p.content, resource, id, section, 5)
    except Exception:
        # Exception (not a bare except) so that Ctrl-C still interrupts.
        return -1
def analyzeHTMLPre(doc, resource, p, section, id):
    """Index the content of preformatted node `p` of page `resource`.

    The words are attached to `section` / `id` with relevance 5; doc is
    unused.  Returns the addStringHTML() result, or -1 if the content
    cannot be read or indexed.
    """
    try:
        return addStringHTML(p.content, resource, id, section, 5)
    except Exception:
        # Exception (not a bare except) so that Ctrl-C still interrupts.
        return -1
def analyzeHTML(doc, resource, p, section, id):
    """Index the content of node `p` of page `resource`.

    NOTE(review): this five-argument version is replaced by the
    two-argument analyzeHTML(doc, resource) defined right after it, so it
    cannot be reached by name once the module is loaded.

    Returns the addStringHTML() result, or -1 if the content cannot be
    read or indexed.
    """
    try:
        return addStringHTML(p.content, resource, id, section, 5)
    except Exception:
        # Exception (not a bare except) so that Ctrl-C still interrupts.
        return -1
def analyzeHTML(doc, resource):
    """Index the parsed HTML page `doc` known as `resource`.

    The page is registered with the content of //head/title (or
    "Page <resource>" when there is none).  Text is attributed to the
    latest h1/h2/h3 heading seen, together with the latest heading "id"
    or "name" attribute.  Returns the number of nodes indexed.
    """
    para = 0
    ctxt = doc.xpathNewContext()
    try:
        title = ctxt.xpathEval("//head/title")[0].content
    except Exception:
        title = "Page %s" % (resource)
    addPage(resource, title)
    try:
        items = ctxt.xpathEval("//h1 | //h2 | //h3 | //text()")
        section = title
        id = ""
        for item in items:
            if item.name in ('h1', 'h2', 'h3'):
                section = item.content
                # A heading without anchor keeps the previous id.
                anchor = item.prop("id") or item.prop("name")
                if anchor:
                    id = anchor
            elif item.type == 'text':
                analyzeHTMLText(doc, resource, item, section, id)
                para = para + 1
            # The expression above selects only headings and text nodes,
            # so the next two branches are not expected to match.
            elif item.name == 'p':
                analyzeHTMLPara(doc, resource, item, section, id)
                para = para + 1
            elif item.name == 'pre':
                analyzeHTMLPre(doc, resource, item, section, id)
                para = para + 1
            else:
                print("Page %s, unexpected %s element" % (resource, item.name))
    except Exception:
        print("Page %s: problem analyzing" % (resource))
        print("%s %s" % sys.exc_info()[:2])
    return para
def analyzeHTMLPages():
    """Index the HTML pages of the current and tutorial/ directories.

    Files whose name starts with "API" and xml.html are skipped.  Each
    page is parsed as XML first and with the HTML parser if that fails.
    Returns the number of pages analysed.
    """
    ret = 0
    HTMLfiles = glob.glob("*.html") + glob.glob("tutorial/*.html")
    for html in HTMLfiles:
        if html[0:3] == "API" or html == "xml.html":
            continue
        try:
            doc = libxml2.parseFile(html)
        except Exception:
            try:
                doc = libxml2.htmlParseFile(html, None)
            except Exception:
                # A page neither parser accepts must not abort the run.
                doc = None
        if doc is None:
            print("could not parse %s" % (html))
            continue
        try:
            res = analyzeHTML(doc, html)
            print("Parsed %s : %d paragraphs" % (html, res))
            ret = ret + 1
        except Exception:
            print("could not parse %s" % (html))
    return ret
- #########################################################################
- # #
- # Mail archives parsing and analysis #
- # #
- #########################################################################
- import time
def getXMLDateArchive(t = None):
    """Return the URL of the by-date index of the xml list for one month.

    t is a time in seconds since the epoch, read as UTC; the current time
    is used when it is None.  The month name comes from strftime("%B").
    """
    if t is None:
        t = time.time()
    when = time.gmtime(t)
    return "http://mail.gnome.org/archives/xml/%d-%s/date.html" % (
        when[0], time.strftime("%B", when))
def scanXMLMsgArchive(url, title, force = 0):
    """Index one archived message.

    A message already registered is skipped unless force is non-zero; an
    unknown one is registered first.  The title words get relevance 20
    and the words of every text node below a <pre> element relevance 5.

    Returns 1 when the message was indexed, 0 when it was skipped or
    could not be registered or parsed.
    """
    if url is None or title is None:
        return 0
    ID = checkXMLMsgArchive(url)
    if ID != -1:
        if force == 0:
            return 0
    else:
        ID = addXMLMsgArchive(url, title)
        if ID == -1:
            return 0
    print("Loading %s" % (url))
    try:
        doc = libxml2.htmlParseFile(url, None)
    except Exception:
        # Exception (not a bare except) so that Ctrl-C still interrupts.
        doc = None
    if doc is None:
        print("Failed to parse %s" % (url))
        return 0
    addStringArchive(title, ID, 20)
    ctxt = doc.xpathNewContext()
    for text in ctxt.xpathEval("//pre//text()"):
        addStringArchive(text.content, ID, 5)
    return 1
def scanXMLDateArchive(t = None, force = 0):
    """Index the messages linked from the month index containing time t.

    wordsDictArchive is emptied first.  Every link whose href starts with
    "msg" is passed to scanXMLMsgArchive(), with a leading 'Re: ' and
    then a leading '[xml] ' removed from its title.

    Returns the number of messages indexed, or -1 when the month index
    cannot be loaded.
    """
    global wordsDictArchive
    wordsDictArchive = {}
    url = getXMLDateArchive(t)
    print("loading %s" % (url))
    try:
        doc = libxml2.htmlParseFile(url, None)
    except Exception:
        doc = None
    if doc is None:
        print("Failed to parse %s" % (url))
        return -1
    ctxt = doc.xpathNewContext()
    newmsg = 0
    for anchor in ctxt.xpathEval("//a[@href]"):
        href = anchor.prop("href")
        if href is None or href[0:3] != "msg":
            continue
        try:
            msg = libxml2.buildURI(href, url)
            title = anchor.content
            if title is not None and title[0:4] == 'Re: ':
                title = title[4:]
            if title is not None and title[0:6] == '[xml] ':
                title = title[6:]
            newmsg = newmsg + scanXMLMsgArchive(msg, title, force)
        except Exception:
            # Keep going with the next message, but say what went wrong;
            # Exception (not a bare except) lets Ctrl-C stop the crawl.
            print("Failed to index message %s" % (href))
            print("%s %s" % sys.exc_info()[:2])
    return newmsg
-
- #########################################################################
- # #
- # Main code: open the DB, the API XML and analyze it #
- # #
- #########################################################################
def analyzeArchives(t = None, force = 0):
    """Index one month of mail archives and store the result.

    t and force are passed to scanXMLDateArchive(); every
    (word, archive ID, relevance) gathered in wordsDictArchive is then
    written with updateWordArchive().  Words whose entry is None are
    skipped.
    """
    ret = scanXMLDateArchive(t, force)
    print("Indexed %d words in %d archive pages" % (len(wordsDictArchive), ret))
    stored = 0
    for word in wordsDictArchive.keys():
        refs = wordsDictArchive[word]
        if refs is None:
            continue
        for id in refs.keys():
            updateWordArchive(word, id, refs[id])
            stored = stored + 1
    # These are archive associations; the summary used to say "HTML pages".
    print("Found %d associations in archive pages" % (stored))
def analyzeHTMLTop():
    """Index the HTML pages and store the result.

    Every (word, page, section, id, relevance) gathered in wordsDictHTML
    by analyzeHTMLPages() is written with updateWordHTML().  Words whose
    entry is None are skipped.
    """
    pages = analyzeHTMLPages()
    print("Indexed %d words in %d HTML pages" % (len(wordsDictHTML), pages))
    stored = 0
    for word in wordsDictHTML.keys():
        refs = wordsDictHTML[word]
        if refs is None:
            continue
        for resource in refs.keys():
            relevance, id, section = refs[resource]
            updateWordHTML(word, resource, section, id, relevance)
            stored = stored + 1
    print("Found %d associations in HTML pages" % (stored))
def analyzeAPITop():
    """Index the API description file and store the result.

    The file named by API is loaded and analysed; the process exits with
    status 1 if that fails.  Every (word, symbol, relevance) gathered in
    wordsDict is then written with updateWord().  Words whose entry is
    None are counted as skipped.
    """
    try:
        doc = loadAPI(API)
        ret = analyzeAPI(doc)
        print("Analyzed %d blocs" % (ret))
        doc.freeDoc()
    except Exception:
        print("Failed to parse and analyze %s" % (API))
        print("%s %s" % sys.exc_info()[:2])
        sys.exit(1)
    print("Indexed %d words" % (len(wordsDict)))
    stored = 0
    skipped = 0
    for word in wordsDict.keys():
        refs = wordsDict[word]
        if refs is None:
            skipped = skipped + 1
            continue
        for (module, symbol) in refs.keys():
            updateWord(word, symbol, refs[(module, symbol)])
            stored = stored + 1
    print("Found %d associations, skipped %d words" % (stored, skipped))
def usage():
    """Print the command line synopsis and exit with status 1."""
    synopsis = "Usage index.py [--force] [--archive] [--archive-year year] [--archive-month month] [--API] [--docs]"
    print(synopsis)
    sys.exit(1)
def _indexArchiveMonth(label, force):
    """Index the mail archive of the month written as "YYYY-MonthName"."""
    try:
        T = time.strptime(label, "%Y-%B")
        # Ten days are added to the start of the month; presumably to stay
        # inside it whatever the local/UTC offset is -- TODO confirm.
        t = time.mktime(T) + 3600 * 24 * 10
        analyzeArchives(t, force)
    except Exception:
        print("Failed to index month archive:")
        print("%s %s" % sys.exc_info()[:2])


def main():
    """Open the database and run the actions given on the command line.

    Options are handled in order, so --force only applies to the archive
    options that follow it.  Exits through usage() when there is no
    argument, an unknown one, or an option without its value.
    """
    try:
        openMySQL()
    except Exception:
        print("Failed to open the database")
        print("%s %s" % sys.exc_info()[:2])
        sys.exit(1)
    args = sys.argv[1:]
    if not args:
        usage()
    months = ["January" , "February", "March", "April", "May",
              "June", "July", "August", "September", "October",
              "November", "December"]
    force = 0
    i = 0
    while i < len(args):
        arg = args[i]
        if arg == '--force':
            force = 1
        elif arg == '--archive':
            analyzeArchives(None, force)
        elif arg == '--archive-year' or arg == '--archive-month':
            i = i + 1
            if i >= len(args):
                # The value is missing: explain the syntax instead of
                # failing with an IndexError.
                usage()
            if arg == '--archive-year':
                for month in months:
                    _indexArchiveMonth("%s-%s" % (args[i], month), force)
            else:
                _indexArchiveMonth(args[i], force)
        elif arg == '--API':
            analyzeAPITop()
        elif arg == '--docs':
            analyzeHTMLTop()
        else:
            usage()
        i = i + 1


if __name__ == "__main__":
    main()
|