| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702
7037047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258 | 
							- #!/usr/bin/python -u
 
- #
 
- # imports the API description and fills up a database with
 
- # name relevance to modules, functions or web pages
 
- #
 
- # Operation needed:
 
- # =================
 
- #
 
- # install mysqld, the python wrappers for mysql and libxml2, start mysqld
 
- # Change the root passwd of mysql:
 
- #    mysqladmin -u root password new_password
 
- # Create the new database xmlsoft
 
- #    mysqladmin -p create xmlsoft
 
- # Create a database user 'veillard' and give him password access
 
- # change veillard and abcde with the right user name and passwd
 
- #    mysql -p
 
- #    password:
 
- #    mysql> GRANT ALL PRIVILEGES ON xmlsoft TO veillard@localhost
 
- #           IDENTIFIED BY 'abcde' WITH GRANT OPTION;
 
- #
 
- # As the user check the access:
 
- #    mysql -p xmlsoft
 
- #    Enter password:
 
- #    Welcome to the MySQL monitor....
 
- #    mysql> use xmlsoft
 
- #    Database changed
 
- #    mysql> quit
 
- #    Bye
 
- #
 
- # Then run the script in the doc subdir, it will create the symbols and
 
- # word tables and populate them with information extracted from
 
- # the libxml2-api.xml API description, and make them accessible read-only
 
- # by nobody@localhost, the user Apache is expected to run as
 
- #
 
- # On the Apache configuration, make sure you have php support enabled
 
- #
 
- import MySQLdb
 
- import libxml2
 
- import sys
 
- import string
 
- import os
 
- #
 
- # We are not interested in parsing errors here
 
- #
 
def callback(ctx, str):
    """Error handler that deliberately ignores every report.

    Both arguments are accepted and discarded; the function always
    returns None.
    """
    return None
 
- libxml2.registerErrorHandler(callback, None)
 
- #
 
- # The dictionary of tables required and the SQL command needed
 
- # to create them
 
- #
 
# Maps the name of each required table to the SQL statement creating it.
TABLES = {}

# One row per API symbol: owning module, kind and short description.
TABLES["symbols"] = """CREATE TABLE symbols (
           name varchar(255) BINARY NOT NULL,
	   module varchar(255) BINARY NOT NULL,
           type varchar(25) NOT NULL,
	   descr varchar(255),
	   UNIQUE KEY name (name),
	   KEY module (module))"""

# Word -> API symbol relevance.
TABLES["words"] = """CREATE TABLE words (
           name varchar(50) BINARY NOT NULL,
	   symbol varchar(255) BINARY NOT NULL,
           relevance int,
	   KEY name (name),
	   KEY symbol (symbol),
	   UNIQUE KEY ID (name, symbol))"""

# Word -> HTML resource relevance, with section title and anchor id.
TABLES["wordsHTML"] = """CREATE TABLE wordsHTML (
           name varchar(50) BINARY NOT NULL,
	   resource varchar(255) BINARY NOT NULL,
	   section varchar(255),
	   id varchar(50),
           relevance int,
	   KEY name (name),
	   KEY resource (resource),
	   UNIQUE KEY ref (name, resource))"""

# Word -> archive entry (by numeric ID) relevance.
TABLES["wordsArchive"] = """CREATE TABLE wordsArchive (
           name varchar(50) BINARY NOT NULL,
	   ID int(11) NOT NULL,
           relevance int,
	   KEY name (name),
	   UNIQUE KEY ref (name, ID))"""

# Resource -> page title.
TABLES["pages"] = """CREATE TABLE pages (
           resource varchar(255) BINARY NOT NULL,
	   title varchar(255) BINARY NOT NULL,
	   UNIQUE KEY name (resource))"""

# Archive entries, numbered automatically.
TABLES["archives"] = """CREATE TABLE archives (
           ID int(11) NOT NULL auto_increment,
           resource varchar(255) BINARY NOT NULL,
	   title varchar(255) BINARY NOT NULL,
	   UNIQUE KEY id (ID,resource(255)),
	   INDEX (ID),
	   INDEX (resource))"""

# Query value counters.
TABLES["Queries"] = """CREATE TABLE Queries (
           ID int(11) NOT NULL auto_increment,
	   Value varchar(50) NOT NULL,
	   Count int(11) NOT NULL,
	   UNIQUE KEY id (ID,Value(35)),
	   INDEX (ID))"""

# Same layout as Queries.
TABLES["AllQueries"] = """CREATE TABLE AllQueries (
           ID int(11) NOT NULL auto_increment,
	   Value varchar(50) NOT NULL,
	   Count int(11) NOT NULL,
	   UNIQUE KEY id (ID,Value(35)),
	   INDEX (ID))"""
 
- #
 
- # The XML API description file to parse
 
- #
 
# File name of the XML API description.
API="libxml2-api.xml"
# Module-wide database connection; None until openMySQL() assigns it.
DB=None
 
- #########################################################################
 
- #									#
 
- #                  MySQL database interfaces				#
 
- #									#
 
- #########################################################################
 
def createTable(db, name):
    """Drop table ``name`` if it exists and recreate it from TABLES.

    Parameters:
        db: database connection object providing ``cursor()``.
        name: key of the table in the module-level TABLES dictionary.

    Returns -1 when ``db`` or ``name`` is None or when creating the table
    fails; otherwise the value returned by the cursor's ``execute`` call
    for the CREATE statement.
    """
    if db is None or name is None:
        return -1
    cursor = db.cursor()
    # Table names cannot be bound as query parameters, hence the formatting.
    if cursor.execute("DROP TABLE IF EXISTS %s" % (name,)) == 1:
        print("Removed table %s" % (name,))
    print("Creating table %s" % (name,))
    try:
        return cursor.execute(TABLES[name])
    except Exception:
        # Covers an unknown table name as well as a failing statement.
        print("Failed to create table %s" % (name,))
        return -1
 
def checkTables(db, verbose = 1):
    """Make sure every table listed in TABLES exists and can be read.

    Missing tables are created through createTable().  Each table is then
    counted; when counting fails a ``repair table`` is issued and the count
    is retried.  Finally SELECT access on xmlsoft.* and INSERT/SELECT/UPDATE
    access on xmlsoft.Queries are granted to nobody@localhost on a
    best-effort basis.

    Parameters:
        db: database connection object providing ``cursor()``.
        verbose: when true, progress messages are printed.

    Returns -1 when ``db`` is None, 0 otherwise.
    """
    if db is None:
        return -1
    cursor = db.cursor()
    nbtables = cursor.execute("show tables")
    if verbose:
        print("Found %d tables" % (nbtables,))
    existing = [cursor.fetchone()[0] for _ in range(nbtables)]

    for table in TABLES:
        if table not in existing:
            print("table %s missing" % (table,))
            createTable(db, table)
        try:
            cursor.execute("SELECT count(*) from %s" % table)
            row = cursor.fetchone()
            if verbose:
                print("Table %s contains %d records" % (table, row[0]))
        except Exception:
            print("Troubles with table %s : repairing" % (table,))
            ret = cursor.execute("repair table %s" % table)
            print("repairing returned %d" % (ret,))
            cursor.execute("SELECT count(*) from %s" % table)
            row = cursor.fetchone()
            print("Table %s contains %d records" % (table, row[0]))
    if verbose:
        print("checkTables finished")

    # make sure apache can access the tables read-only
    try:
        cursor.execute("GRANT SELECT ON xmlsoft.* TO nobody@localhost")
        cursor.execute("GRANT INSERT,SELECT,UPDATE  ON xmlsoft.Queries TO nobody@localhost")
    except Exception:
        # Deliberately best effort: a failing GRANT must not abort the run.
        pass
    return 0
 
-     
 
def openMySQL(db="xmlsoft", passwd=None, verbose = 1):
    """Connect to the MySQL database and store the connection in DB.

    Parameters:
        db: name of the database to open.
        passwd: password; when None it is read from the MySQL_PASS
            environment variable, and the process exits with status 1 if
            that variable is not set.
        verbose: forwarded to checkTables().

    Returns -1 when no connection object was obtained, otherwise the
    result of checkTables().
    """
    global DB

    if passwd is None:
        try:
            passwd = os.environ["MySQL_PASS"]
        except KeyError:
            print("No password available, set environment MySQL_PASS")
            sys.exit(1)
    DB = MySQLdb.connect(passwd=passwd, db=db)
    if DB is None:
        return -1
    return checkTables(DB, verbose)
 
def updateWord(name, symbol, relevance):
    """Store the relevance of word ``name`` for API symbol ``symbol``.

    An INSERT into the ``words`` table is tried first; when it fails
    (presumably because the (name, symbol) row already exists) an UPDATE of
    the relevance is issued instead.  Values are passed as bound parameters
    so that quotes in the data cannot break the statement.

    Opens the database through openMySQL() when DB is still None.

    Returns the cursor ``execute`` result, or -1 when no database is
    available, ``name`` or ``symbol`` is None, or both statements fail.
    """
    global DB

    if DB is None:
        openMySQL()
    if DB is None:
        return -1
    if name is None or symbol is None:
        return -1

    cursor = DB.cursor()
    try:
        return cursor.execute(
            "INSERT INTO words (name, symbol, relevance) VALUES (%s, %s, %s)",
            (name, symbol, relevance))
    except Exception:
        update = "UPDATE words SET relevance = %s where name = %s and symbol = %s"
        try:
            return cursor.execute(update, (relevance, name, symbol))
        except Exception:
            print("Update word (%s, %s, %s) failed command" % (name, symbol, relevance))
            print("%s with %r" % (update, (relevance, name, symbol)))
            print("%s %s" % sys.exc_info()[:2])
            return -1
 
def updateSymbol(name, module, type, desc):
    """Record an API symbol in the ``symbols`` table (insert, else update).

    The symbol name is first registered as a word for itself with relevance
    50 through updateWord().  The description is reduced to the text before
    its first '.', single quotes are turned into spaces and the result is
    cut to 99 characters; a description that cannot be processed that way
    is stored as an empty string.  Values are passed as bound parameters.

    Parameters:
        name: symbol name.
        module: module the symbol belongs to.
        type: kind of symbol stored in the ``type`` column.
        desc: free-text description.

    Returns the cursor ``execute`` result, or -1 when no database is
    available, a mandatory argument is None, or both statements fail.
    """
    global DB

    updateWord(name, name, 50)
    if DB is None:
        openMySQL()
    if DB is None:
        return -1
    if name is None or module is None or type is None:
        return -1

    try:
        desc = desc.replace("'", " ").split(".")[0][0:99]
    except Exception:
        desc = ""

    cursor = DB.cursor()
    try:
        return cursor.execute(
            "INSERT INTO symbols (name, module, type, descr)"
            " VALUES (%s, %s, %s, %s)",
            (name, module, type, desc))
    except Exception:
        update = "UPDATE symbols SET module=%s, type=%s, descr=%s where name=%s"
        try:
            return cursor.execute(update, (module, type, desc, name))
        except Exception:
            print("Update symbol (%s, %s, %s) failed command" % (name, module, type))
            print("%s with %r" % (update, (module, type, desc, name)))
            print("%s %s" % sys.exc_info()[:2])
            return -1
 
-         
 
def addFunction(name, module, desc = ""):
    """Register ``name`` of ``module`` as a symbol of type 'function'.

    Delegates to updateSymbol() and returns its result.
    """
    kind = 'function'
    return updateSymbol(name, module, kind, desc)
 
def addMacro(name, module, desc = ""):
    """Register ``name`` of ``module`` as a symbol of type 'macro'.

    Delegates to updateSymbol() and returns its result.
    """
    kind = 'macro'
    return updateSymbol(name, module, kind, desc)
 
def addEnum(name, module, desc = ""):
    """Register ``name`` of ``module`` as a symbol of type 'enum'.

    Delegates to updateSymbol() and returns its result.
    """
    kind = 'enum'
    return updateSymbol(name, module, kind, desc)
 
def addStruct(name, module, desc = ""):
    """Register ``name`` of ``module`` as a symbol of type 'struct'.

    Delegates to updateSymbol() and returns its result.
    """
    kind = 'struct'
    return updateSymbol(name, module, kind, desc)
 
def addConst(name, module, desc = ""):
    """Register ``name`` of ``module`` as a symbol of type 'const'.

    Delegates to updateSymbol() and returns its result.
    """
    kind = 'const'
    return updateSymbol(name, module, kind, desc)
 
def addType(name, module, desc = ""):
    """Register ``name`` of ``module`` as a symbol of type 'type'.

    Delegates to updateSymbol() and returns its result.
    """
    kind = 'type'
    return updateSymbol(name, module, kind, desc)
 
def addFunctype(name, module, desc = ""):
    """Register ``name`` of ``module`` as a symbol of type 'functype'.

    Delegates to updateSymbol() and returns its result.
    """
    kind = 'functype'
    return updateSymbol(name, module, kind, desc)
 
def addPage(resource, title):
    """Record the title of page ``resource`` in the ``pages`` table.

    An INSERT is tried first and, when it fails, an UPDATE of the title of
    the existing row.  Values are passed as bound parameters.  Opens the
    database through openMySQL() when DB is still None.

    Returns the cursor ``execute`` result, or -1 when no database is
    available, ``resource`` is None, or both statements fail.
    """
    global DB

    if DB is None:
        openMySQL()
    if DB is None:
        return -1
    if resource is None:
        return -1

    cursor = DB.cursor()
    try:
        return cursor.execute(
            "INSERT INTO pages (resource, title) VALUES (%s, %s)",
            (resource, title))
    except Exception:
        update = "UPDATE pages SET title=%s WHERE resource=%s"
        try:
            return cursor.execute(update, (title, resource))
        except Exception:
            # The original message formatted names that do not exist in
            # this function, which raised NameError on this path.
            print("Update page (%s, %s) failed command" % (resource, title))
            print("%s with %r" % (update, (title, resource)))
            print("%s %s" % sys.exc_info()[:2])
            return -1
 
def updateWordHTML(name, resource, desc, id, relevance):
    """Store the relevance of word ``name`` for HTML page ``resource``.

    ``desc`` goes to the ``section`` column after single quotes are turned
    into spaces and the text is cut to 99 characters (empty string when it
    is None or cannot be processed); a None ``id`` is stored as an empty
    string.  An INSERT into ``wordsHTML`` is tried first, then an UPDATE of
    the existing (name, resource) row.  Values are passed as bound
    parameters.

    Returns the cursor ``execute`` result, or -1 when no database is
    available, ``name`` or ``resource`` is None, or both statements fail.
    """
    global DB

    if DB is None:
        openMySQL()
    if DB is None:
        return -1
    if name is None or resource is None:
        return -1
    if id is None:
        id = ""
    if desc is None:
        desc = ""
    else:
        try:
            desc = desc.replace("'", " ")[0:99]
        except Exception:
            desc = ""

    cursor = DB.cursor()
    try:
        return cursor.execute(
            "INSERT INTO wordsHTML (name, resource, section, id, relevance)"
            " VALUES (%s, %s, %s, %s, %s)",
            (name, resource, desc, id, relevance))
    except Exception:
        update = ("UPDATE wordsHTML SET section=%s, id=%s, relevance=%s"
                  " where name=%s and resource=%s")
        try:
            return cursor.execute(update, (desc, id, relevance, name, resource))
        except Exception:
            print("Update symbol (%s, %s, %s) failed command" % (name, resource, relevance))
            print("%s with %r" % (update, (desc, id, relevance, name, resource)))
            print("%s %s" % sys.exc_info()[:2])
            return -1
 
def checkXMLMsgArchive(url):
    """Look up the ID stored in the ``archives`` table for ``url``.

    The URL is passed as a bound parameter.  Opens the database through
    openMySQL() when DB is still None.

    Returns the ID column of the first matching row, or -1 when no database
    is available, ``url`` is None, the query fails or no row matches.
    """
    global DB

    if DB is None:
        openMySQL()
    if DB is None:
        return -1
    if url is None:
        return -1

    cursor = DB.cursor()
    try:
        cursor.execute("SELECT ID FROM archives WHERE resource=%s", (url,))
        row = cursor.fetchone()
    except Exception:
        return -1
    if row is None:
        return -1
    return row[0]
 
-     
 
def addXMLMsgArchive(url, title):
    """Insert ``url`` into the ``archives`` table and return its ID.

    A None title is stored as an empty string; otherwise single quotes are
    turned into spaces and the title is cut to 99 characters.  After the
    INSERT the ID is read back with a SELECT on the resource.  Values are
    passed as bound parameters.

    Returns the ID as an int, or -1 when no database is available, ``url``
    is None, a statement fails or the ID cannot be read back.
    """
    global DB

    if DB is None:
        openMySQL()
    if DB is None:
        return -1
    if url is None:
        return -1
    if title is None:
        title = ""
    else:
        title = title.replace("'", " ")[0:99]

    cursor = DB.cursor()
    cmd = "INSERT INTO archives (resource, title) VALUES (%s, %s)"
    try:
        cursor.execute(cmd, (url, title))
        cmd = "SELECT ID FROM archives WHERE resource=%s"
        cursor.execute(cmd, (url,))
        row = cursor.fetchone()
    except Exception:
        print("addXMLMsgArchive failed command: %s (resource %s)" % (cmd, url))
        return -1
    if row is None:
        print("addXMLMsgArchive failed to get the ID: %s" % (url,))
        return -1
    return int(row[0])
 
def updateWordArchive(name, id, relevance):
    """Store the relevance of word ``name`` for archive entry ``id``.

    An INSERT into ``wordsArchive`` is tried first, then an UPDATE of the
    existing (name, ID) row.  Values are passed as bound parameters.  Opens
    the database through openMySQL() when DB is still None.

    Returns the cursor ``execute`` result, or -1 when no database is
    available, ``name`` or ``id`` is None, or both statements fail.
    """
    global DB

    if DB is None:
        openMySQL()
    if DB is None:
        return -1
    if name is None or id is None:
        return -1

    cursor = DB.cursor()
    try:
        return cursor.execute(
            "INSERT INTO wordsArchive (name, id, relevance) VALUES (%s, %s, %s)",
            (name, id, relevance))
    except Exception:
        update = "UPDATE wordsArchive SET relevance=%s where name=%s and ID=%s"
        try:
            return cursor.execute(update, (relevance, name, id))
        except Exception:
            # %s rather than %d: the report itself must not fail on
            # non-integer values.
            print("Update word archive (%s, %s, %s) failed command" % (name, id, relevance))
            print("%s with %r" % (update, (relevance, name, id)))
            print("%s %s" % sys.exc_info()[:2])
            return -1
 
- #########################################################################
 
- #									#
 
- #                  Word dictionnary and analysis routines		#
 
- #									#
 
- #########################################################################
 
- #
 
- # top 100 english word without the one len < 3 + own set
 
- #
 
# Words that are never indexed; only the keys matter, every value is 0.
dropWords = dict.fromkeys([
    'the', 'this', 'can', 'man', 'had', 'him', 'only',
    'and', 'not', 'been', 'other', 'even', 'are', 'was',
    'new', 'most', 'but', 'when', 'some', 'made', 'from',
    'who', 'could', 'after', 'that', 'will', 'time', 'also',
    'have', 'more', 'these', 'did', 'two', 'many',
    'they', 'may', 'before', 'for', 'which', 'out', 'then',
    'must', 'one', 'through', 'with', 'you', 'said',
    'first', 'back', 'were', 'what', 'any', 'years', 'his',
    'her', 'where', 'all', 'its', 'now', 'much', 'she',
    'about', 'such', 'your', 'there', 'into', 'like',
    'would', 'than', 'our', 'well', 'their', 'them', 'over',
    'down',
    'net', 'www', 'bad', 'Okay', 'bin', 'cur',
], 0)

# In-memory word indexes filled by the addWord* functions.
wordsDict = {}
wordsDictHTML = {}
wordsDictArchive = {}
 
- def cleanupWordsString(str):
 
-     str = string.replace(str, ".", " ")
 
-     str = string.replace(str, "!", " ")
 
-     str = string.replace(str, "?", " ")
 
-     str = string.replace(str, ",", " ")
 
-     str = string.replace(str, "'", " ")
 
-     str = string.replace(str, '"', " ")
 
-     str = string.replace(str, ";", " ")
 
-     str = string.replace(str, "(", " ")
 
-     str = string.replace(str, ")", " ")
 
-     str = string.replace(str, "{", " ")
 
-     str = string.replace(str, "}", " ")
 
-     str = string.replace(str, "<", " ")
 
-     str = string.replace(str, ">", " ")
 
-     str = string.replace(str, "=", " ")
 
-     str = string.replace(str, "/", " ")
 
-     str = string.replace(str, "*", " ")
 
-     str = string.replace(str, ":", " ")
 
-     str = string.replace(str, "#", " ")
 
-     str = string.replace(str, "\\", " ")
 
-     str = string.replace(str, "\n", " ")
 
-     str = string.replace(str, "\r", " ")
 
-     str = string.replace(str, "\xc2", " ")
 
-     str = string.replace(str, "\xa0", " ")
 
-     return str
 
-     
 
def cleanupDescrString(str):
    """Return ``str`` as a single line with normalised whitespace.

    Single quotes, line breaks and the characters "\\xc2" and "\\xa0" are
    turned into spaces, then runs of whitespace are collapsed to one space
    and leading/trailing whitespace is dropped.

    The original code split the text into words but then joined the
    characters of the unsplit string, which put a space between every
    character; the words are joined here instead.
    """
    for unwanted in ("'", "\n", "\r", "\xc2", "\xa0"):
        str = str.replace(unwanted, " ")
    return " ".join(str.split())
 
def splitIdentifier(str):
    """Split an identifier into a list of lower-case words.

    A word starts at any letter, continues over a run of upper-case
    letters, then a run of lower-case letters; digits directly following it
    are consumed but not kept.  Characters that are not letters are skipped.
    """
    words = []
    pos = 0
    end = len(str)
    while pos < end:
        first = str[pos].lower()
        pos = pos + 1
        if first < 'a' or first > 'z':
            continue
        # run of capitals following the first letter
        start = pos
        while pos < end and 'A' <= str[pos] <= 'Z':
            pos = pos + 1
        capitals = str[start:pos].lower()
        # run of lower-case letters
        start = pos
        while pos < end and 'a' <= str[pos] <= 'z':
            pos = pos + 1
        smalls = str[start:pos]
        # trailing digits are dropped
        while pos < end and '0' <= str[pos] <= '9':
            pos = pos + 1
        words.append(first + capitals + smalls)
    return words
 
def addWord(word, module, symbol, relevance):
    """Accumulate the relevance of ``word`` for ``(module, symbol)``.

    The value is kept in the module-level wordsDict, keyed by word and then
    by the (module, symbol) pair; repeated calls add up.  A word whose
    entry already holds more than 500 pairs is blacklisted by setting its
    entry to None.

    Returns -1 when ``word`` is None or shorter than 3 characters or when
    ``module`` or ``symbol`` is None; 0 when the word is in dropWords,
    starts with a character above 0x80 or is blacklisted; otherwise the
    accumulated relevance.
    """
    if word is None or len(word) < 3:
        return -1
    if module is None or symbol is None:
        return -1
    if word in dropWords:
        return 0
    if ord(word[0]) > 0x80:
        return 0

    key = (module, symbol)
    if word in wordsDict:
        entries = wordsDict[word]
        if entries is None:
            return 0
        if len(entries) > 500:
            wordsDict[word] = None
            return 0
        # Explicit lookup instead of a bare "except: pass".
        relevance = relevance + entries.get(key, 0)
    else:
        entries = wordsDict[word] = {}
    entries[key] = relevance
    return relevance
 
-     
 
def addString(str, module, symbol, relevance):
    """Index every word of ``str`` for ``(module, symbol)``.

    The text is cleaned with cleanupWordsString(), split on whitespace and
    each word longer than 2 characters is passed to addWord() with the
    given ``relevance``.  (The original code ignored ``relevance`` and
    always used 5, although callers pass other values.)

    Returns -1 when ``str`` is None or shorter than 3 characters, otherwise
    the sum of the addWord() results.
    """
    if str is None or len(str) < 3:
        return -1
    total = 0
    for word in cleanupWordsString(str).split():
        if len(word) > 2:
            total = total + addWord(word, module, symbol, relevance)
    return total
 
def addWordHTML(word, resource, id, section, relevance):
    """Accumulate the relevance of ``word`` for HTML page ``resource``.

    The module-level wordsDictHTML maps each word to a dictionary of
    ``resource -> (relevance, id, section)``.  When the page is already
    recorded for the word, its non-None id and section are kept and the
    relevance values are added.  ``section`` is normalised with
    cleanupDescrString() first.

    Returns -1 when ``word`` is None or shorter than 3 characters or when
    ``resource`` or ``section`` is None; 0 when the word is in dropWords,
    starts with a character above 0x80 or its entry is None; otherwise the
    accumulated relevance.
    """
    if word is None or len(word) < 3:
        return -1
    if resource is None or section is None:
        return -1
    if word in dropWords:
        return 0
    if ord(word[0]) > 0x80:
        return 0

    section = cleanupDescrString(section)
    if word in wordsDictHTML:
        entries = wordsDictHTML[word]
        if entries is None:
            print("skipped %s" % (word,))
            return 0
        # Explicit membership test instead of a bare "except: pass".
        if resource in entries:
            previous, old_id, old_section = entries[resource]
            if old_id is not None:
                id = old_id
            if old_section is not None:
                section = old_section
            relevance = relevance + previous
    else:
        entries = wordsDictHTML[word] = {}
    entries[resource] = (relevance, id, section)
    return relevance
 
-     
 
def addStringHTML(str, resource, id, section, relevance):
    """Index every word of ``str`` for HTML page ``resource``.

    The text is cleaned with cleanupWordsString(), split on whitespace and
    each word longer than 2 characters is passed to addWordHTML().  A
    negative result or an exception for one word is reported on standard
    output and does not stop the remaining words.

    Returns -1 when ``str`` is None or shorter than 3 characters, otherwise
    the sum of the addWordHTML() results.
    """
    if str is None or len(str) < 3:
        return -1
    total = 0
    for word in cleanupWordsString(str).split():
        if len(word) <= 2:
            continue
        try:
            r = addWordHTML(word, resource, id, section, relevance)
            if r < 0:
                print("addWordHTML failed: %s %s" % (word, resource))
            total = total + r
        except Exception:
            print("addWordHTML failed: %s %s %d" % (word, resource, relevance))
            print("%s %s" % sys.exc_info()[:2])
    return total
 
def addWordArchive(word, id, relevance):
    """Accumulate the relevance of ``word`` for archive entry ``id``.

    The module-level wordsDictArchive maps each word to a dictionary of
    ``id -> relevance``; repeated calls for the same pair add up.

    Returns -1 when ``word`` is None or shorter than 3 characters or when
    ``id`` is None or -1; 0 when the word is in dropWords, starts with a
    character above 0x80 or its entry is None; otherwise the accumulated
    relevance.
    """
    if word is None or len(word) < 3:
        return -1
    if id is None or id == -1:
        return -1
    if word in dropWords:
        return 0
    if ord(word[0]) > 0x80:
        return 0

    if word in wordsDictArchive:
        entries = wordsDictArchive[word]
        if entries is None:
            print("skipped %s" % (word,))
            return 0
        # Explicit lookup instead of a bare "except: pass".
        relevance = relevance + entries.get(id, 0)
    else:
        entries = wordsDictArchive[word] = {}
    entries[id] = relevance
    return relevance
 
-     
 
def addStringArchive(str, id, relevance):
    """Index every word of ``str`` for archive entry ``id``.

    The text is cleaned with cleanupWordsString(), split on whitespace and
    each word longer than 2 characters is passed to addWordArchive().  A
    negative result or an exception for one word is reported on standard
    output; negative results are not added to the total.

    Returns -1 when ``str`` is None or shorter than 3 characters, otherwise
    the sum of the non-negative addWordArchive() results.
    """
    if str is None or len(str) < 3:
        return -1
    total = 0
    for word in cleanupWordsString(str).split():
        if len(word) <= 2:
            continue
        try:
            r = addWordArchive(word, id, relevance)
            if r < 0:
                print("addWordArchive failed: %s %s" % (word, id))
            else:
                total = total + r
        except Exception:
            print("addWordArchive failed: %s %s %d" % (word, id, relevance))
            print("%s %s" % sys.exc_info()[:2])
    return total
 
- #########################################################################
 
- #									#
 
- #                  XML API description analysis				#
 
- #									#
 
- #########################################################################
 
def loadAPI(filename):
    """Parse ``filename`` with libxml2.parseFile and return the document.

    A "loaded ..." line is printed once parsing has returned.
    """
    document = libxml2.parseFile(filename)
    print("loaded %s" % (filename,))
    return document
 
def foundExport(file, symbol):
    """Register ``symbol`` exported by ``file`` as a function.

    The symbol is stored through addFunction() and every word obtained
    from splitIdentifier() is indexed with relevance 10.

    Returns 0 when ``file`` or ``symbol`` is None, 1 otherwise.
    """
    if file is None or symbol is None:
        return 0
    addFunction(symbol, file)
    for part in splitIdentifier(symbol):
        addWord(part, file, symbol, 10)
    return 1
 
-      
 
def analyzeAPIFile(top):
    """Process the children of one <file> element of the API description.

    Each <exports> child is handed to foundExport() together with the
    ``name`` property of ``top``; text nodes are skipped and any other
    element is reported on standard output.

    Returns the sum of the foundExport() results.
    """
    count = 0
    name = top.prop("name")
    node = top.children
    while node is not None:
        if node.type != 'text':
            if node.name == "exports":
                count = count + foundExport(name, node.prop("symbol"))
            else:
                # The original supplied one value for two %s placeholders,
                # which raised TypeError instead of printing the report.
                print("unexpected element %s in API doc <file name='%s'>" %
                      (node.name, name))
        node = node.next
    return count
 
def analyzeAPIFiles(top):
    """Process the <file> children of the <files> element ``top``.

    Text nodes are skipped, <file> elements go to analyzeAPIFile() and any
    other element is reported on standard output.

    Returns the sum of the analyzeAPIFile() results.
    """
    total = 0
    node = top.children
    while node is not None:
        if node.type != 'text':
            if node.name == "file":
                total = total + analyzeAPIFile(node)
            else:
                print("unexpected element %s in API doc <files>" % (node.name,))
        node = node.next
    return total
 
def analyzeAPIEnum(top):
    """Register the enum described by element ``top``.

    Reads the ``file`` and ``name`` properties, stores the symbol through
    addEnum() and indexes the words of its name with relevance 10.

    Returns 0 when either property is missing, 1 otherwise.
    """
    module = top.prop("file")
    if module is None:
        return 0
    symbol = top.prop("name")
    if symbol is None:
        return 0
    addEnum(symbol, module)
    for part in splitIdentifier(symbol):
        addWord(part, module, symbol, 10)
    return 1
 
def analyzeAPIConst(top):
    """Register the constant described by element ``top``.

    Reads the ``file`` and ``name`` properties, stores the symbol through
    addConst() and indexes the words of its name with relevance 10.

    Returns 0 when either property is missing, 1 otherwise.
    """
    module = top.prop("file")
    if module is None:
        return 0
    symbol = top.prop("name")
    if symbol is None:
        return 0
    addConst(symbol, module)
    for part in splitIdentifier(symbol):
        addWord(part, module, symbol, 10)
    return 1
 
def analyzeAPIType(top):
    """Register the type described by element ``top``.

    Reads the ``file`` and ``name`` properties, stores the symbol through
    addType() and indexes the words of its name with relevance 10.

    Returns 0 when either property is missing, 1 otherwise.
    """
    module = top.prop("file")
    if module is None:
        return 0
    symbol = top.prop("name")
    if symbol is None:
        return 0
    addType(symbol, module)
    for part in splitIdentifier(symbol):
        addWord(part, module, symbol, 10)
    return 1
 
def analyzeAPIFunctype(top):
    """Register the function type described by element ``top``.

    Reads the ``file`` and ``name`` properties, stores the symbol through
    addFunctype() and indexes the words of its name with relevance 10.

    Returns 0 when either property is missing, 1 otherwise.
    """
    module = top.prop("file")
    if module is None:
        return 0
    symbol = top.prop("name")
    if symbol is None:
        return 0
    addFunctype(symbol, module)
    for part in splitIdentifier(symbol):
        addWord(part, module, symbol, 10)
    return 1
 
def analyzeAPIStruct(top):
    """Register the struct described by element ``top``.

    Reads the ``file`` and ``name`` properties, stores the symbol through
    addStruct() and indexes the words of its name with relevance 10.  When
    an ``info`` property is present, single quotes in it are turned into
    spaces and each of its words longer than 2 characters is indexed with
    relevance 5.

    Returns 0 when the ``file`` or ``name`` property is missing, 1 otherwise.
    """
    module = top.prop("file")
    if module is None:
        return 0
    symbol = top.prop("name")
    if symbol is None:
        return 0
    addStruct(symbol, module)
    for part in splitIdentifier(symbol):
        addWord(part, module, symbol, 10)

    info = top.prop("info")
    if info is not None:
        for word in info.replace("'", " ").strip().split():
            if len(word) > 2:
                addWord(word, module, symbol, 5)
    return 1
 
def analyzeAPIMacro(top):
    """Register the macro described by element ``top``.

    Reads the ``file`` and ``name`` properties, takes the content of the
    first <info> child as description and indexes the words of the macro
    name with relevance 10.  Without <info> the macro is stored with the
    default description and a notice is printed; otherwise it is stored
    with the description, whose words longer than 2 characters are indexed
    with relevance 5.

    Returns 0 when the ``file`` or ``name`` property is missing or when
    there is no <info> child, 1 otherwise.
    """
    module = top.prop("file")
    if module is None:
        return 0
    symbol = top.prop("name")
    if symbol is None:
        return 0
    symbol = symbol.replace("'", " ").strip()

    # Content of the first <info> child, if any.
    info = None
    node = top.children
    while node is not None:
        if node.type != 'text' and node.name == "info":
            info = node.content
            break
        node = node.next

    for part in splitIdentifier(symbol):
        addWord(part, module, symbol, 10)

    if info is None:
        addMacro(symbol, module)
        print("Macro %s description has no <info>" % (symbol,))
        return 0

    info = info.replace("'", " ").strip()
    addMacro(symbol, module, info)
    for word in info.split():
        if len(word) > 2:
            addWord(word, module, symbol, 5)
    return 1
 
def analyzeAPIFunction(top):
    """Register the function described by element ``top``.

    Reads the ``file`` and ``name`` properties and then walks the children:
    the content of an <info> child becomes the description (the last one
    wins), the ``info`` property of <return> is indexed with relevance 7,
    and for each <arg> the ``info`` property is indexed with relevance 5
    and the ``name`` property is added as a word with relevance 7.  The
    function is stored through addFunction(), its description (when there
    is one) is indexed with relevance 5, and the words of its name are
    indexed with relevance 10.

    Returns 0 when the ``file`` or ``name`` property is missing, 1 otherwise.
    """
    module = top.prop("file")
    if module is None:
        return 0
    symbol = top.prop("name")
    if symbol is None:
        return 0
    symbol = symbol.replace("'", " ").strip()

    info = None
    node = top.children
    while node is not None:
        if node.type != 'text':
            if node.name == "info":
                info = node.content
            elif node.name == "return":
                rinfo = node.prop("info")
                if rinfo is not None:
                    addString(rinfo.replace("'", " ").strip(), module, symbol, 7)
            elif node.name == "arg":
                ainfo = node.prop("info")
                if ainfo is not None:
                    addString(ainfo.replace("'", " ").strip(), module, symbol, 5)
                argname = node.prop("name")
                if argname is not None:
                    addWord(argname.replace("'", " ").strip(), module, symbol, 7)
        node = node.next

    if info is None:
        print("Function %s description has no <info>" % (symbol,))
        addFunction(symbol, module, "")
    else:
        info = info.replace("'", " ").strip()
        addFunction(symbol, module, info)
        addString(info, module, symbol, 5)

    for part in splitIdentifier(symbol):
        addWord(part, module, symbol, 10)
    return 1
 
def analyzeAPISymbols(top):
    """Process the children of the <symbols> element ``top``.

    Text nodes are skipped; <macro>, <function>, <const>, <typedef>,
    <struct>, <enum> and <functype> elements are handed to the matching
    analyzeAPI* function; any other element is reported on standard output.

    Returns the sum of the results of the analyzeAPI* calls.
    """
    count = 0
    node = top.children
    while node is not None:
        if node.type != 'text':
            kind = node.name
            if kind == "macro":
                count = count + analyzeAPIMacro(node)
            elif kind == "function":
                count = count + analyzeAPIFunction(node)
            elif kind == "const":
                count = count + analyzeAPIConst(node)
            elif kind == "typedef":
                count = count + analyzeAPIType(node)
            elif kind == "struct":
                count = count + analyzeAPIStruct(node)
            elif kind == "enum":
                count = count + analyzeAPIEnum(node)
            elif kind == "functype":
                count = count + analyzeAPIFunctype(node)
            else:
                # The original message named <files>, copied from
                # analyzeAPIFiles().
                print("unexpected element %s in API doc <symbols>" % (kind,))
        node = node.next
    return count
 
def analyzeAPI(doc):
    """Walk the parsed API description ``doc`` and index its symbols.

    The root element must be named "api".  Its <symbols> children are
    handed to analyzeAPISymbols(); <files> children and text nodes are
    ignored, and any other element is reported on standard output.

    Returns -1 when ``doc`` is None or the root element is not "api",
    otherwise the sum of the analyzeAPISymbols() results.
    """
    if doc is None:
        return -1
    root = doc.getRootElement()
    if root.name != "api":
        print("Unexpected root name")
        return -1

    total = 0
    node = root.children
    while node is not None:
        if node.type != 'text':
            if node.name == "symbols":
                total = total + analyzeAPISymbols(node)
            elif node.name != "files":
                # <files> is deliberately left unprocessed
                print("unexpected element %s in API doc" % (node.name,))
        node = node.next
    return total
 
- #########################################################################
 
- #									#
 
- #                  Web pages parsing and analysis			#
 
- #									#
 
- #########################################################################
 
- import glob
 
def analyzeHTMLText(doc, resource, p, section, id):
    """Index the content of a text node of an HTML page.

    p.content is passed to addStringHTML() together with resource, id
    and section, using the fixed weight 5.  doc is accepted for
    signature compatibility but not used.

    Returns the value reported by addStringHTML(), or -1 when reading
    the content or indexing it raises an error.
    """
    try:
        # The addition is kept on purpose: a non-numeric result from
        # addStringHTML() raises TypeError and is reported as -1.
        return 0 + addStringHTML(p.content, resource, id, section, 5)
    except Exception:
        # Only real errors are turned into -1; KeyboardInterrupt and
        # SystemExit must still be able to stop the indexer.
        return -1
 
def analyzeHTMLPara(doc, resource, p, section, id):
    """Index the content of a paragraph element of an HTML page.

    p.content is passed to addStringHTML() together with resource, id
    and section, using the fixed weight 5.  doc is accepted for
    signature compatibility but not used.

    Returns the value reported by addStringHTML(), or -1 when reading
    the content or indexing it raises an error.
    """
    try:
        # The addition is kept on purpose: a non-numeric result from
        # addStringHTML() raises TypeError and is reported as -1.
        return 0 + addStringHTML(p.content, resource, id, section, 5)
    except Exception:
        # Only real errors are turned into -1; KeyboardInterrupt and
        # SystemExit must still be able to stop the indexer.
        return -1
 
def analyzeHTMLPre(doc, resource, p, section, id):
    """Index the content of a preformatted element of an HTML page.

    p.content is passed to addStringHTML() together with resource, id
    and section, using the fixed weight 5.  doc is accepted for
    signature compatibility but not used.

    Returns the value reported by addStringHTML(), or -1 when reading
    the content or indexing it raises an error.
    """
    try:
        # The addition is kept on purpose: a non-numeric result from
        # addStringHTML() raises TypeError and is reported as -1.
        return 0 + addStringHTML(p.content, resource, id, section, 5)
    except Exception:
        # Only real errors are turned into -1; KeyboardInterrupt and
        # SystemExit must still be able to stop the indexer.
        return -1
 
def analyzeHTML(doc, resource, p, section, id):
    """Index the content of a generic node of an HTML page.

    NOTE: the two-argument analyzeHTML(doc, resource) defined right
    after this function rebinds the same name, so this five-argument
    variant cannot be reached through the module namespace once the
    file has been loaded.

    p.content is passed to addStringHTML() together with resource, id
    and section, using the fixed weight 5.  doc is not used.

    Returns the value reported by addStringHTML(), or -1 when reading
    the content or indexing it raises an error.
    """
    try:
        # The addition is kept on purpose: a non-numeric result from
        # addStringHTML() raises TypeError and is reported as -1.
        return 0 + addStringHTML(p.content, resource, id, section, 5)
    except Exception:
        # Only real errors are turned into -1; KeyboardInterrupt and
        # SystemExit must still be able to stop the indexer.
        return -1
 
def analyzeHTML(doc, resource):
    """Index one parsed HTML page.

    The page is registered through addPage() under its <title> (or
    "Page <resource>" when no title can be read).  The h1/h2/h3 headings
    and the text nodes are then visited in the order the XPath query
    returns them: a heading becomes the current section and, when it
    carries an "id" or "name" attribute, the current anchor; every text
    node is indexed against the current section and anchor.  A heading
    without such an attribute leaves the previous anchor in place.

    Problems met during the walk are reported on stdout and stop the
    walk, they are not propagated.

    Returns the number of nodes handed to the analyzeHTML* helpers.
    """
    para = 0
    ctxt = doc.xpathNewContext()
    try:
        try:
            title = ctxt.xpathEval("//head/title")[0].content
        except Exception:
            # No usable <title> (missing element, failed query, ...).
            title = "Page %s" % (resource)
        addPage(resource, title)

        try:
            items = ctxt.xpathEval("//h1 | //h2 | //h3 | //text()")
            section = title
            id = ""
            for item in items:
                if item.name in ('h1', 'h2', 'h3'):
                    section = item.content
                    anchor = item.prop("id") or item.prop("name")
                    if anchor:
                        id = anchor
                elif item.type == 'text':
                    analyzeHTMLText(doc, resource, item, section, id)
                    para = para + 1
                # The query above only selects headings and text nodes,
                # so the two branches below are kept for safety only.
                elif item.name == 'p':
                    analyzeHTMLPara(doc, resource, item, section, id)
                    para = para + 1
                elif item.name == 'pre':
                    analyzeHTMLPre(doc, resource, item, section, id)
                    para = para + 1
                else:
                    print("Page %s, unexpected %s element" % (resource, item.name))
        except Exception:
            print("Page %s: problem analyzing" % (resource))
            # sys.exc_info() replaces the deprecated sys.exc_type and
            # sys.exc_value globals.
            print("%s %s" % sys.exc_info()[:2])
    finally:
        # XPath contexts of the libxml2 bindings are not released
        # automatically.
        ctxt.xpathFreeContext()
    return para
 
def analyzeHTMLPages():
    """Index the HTML pages of the current and tutorial/ directories.

    Every file matching *.html or tutorial/*.html is processed, except
    those whose path starts with "API" and the file "xml.html".  Each
    page is first parsed as XML; when that fails the HTML parser is used
    instead.  Pages are then handed to analyzeHTML() and a one-line
    report is printed for each.

    Returns the number of pages analyzed successfully.
    """
    ret = 0
    HTMLfiles = glob.glob("*.html") + glob.glob("tutorial/*.html")
    for html in HTMLfiles:
        if html.startswith("API") or html == "xml.html":
            continue

        try:
            doc = libxml2.parseFile(html)
        except Exception:
            # Not well-formed XML: retry with the forgiving HTML parser.
            try:
                doc = libxml2.htmlParseFile(html, None)
            except Exception:
                doc = None
        if doc is None:
            # A page that neither parser accepts is skipped instead of
            # aborting the whole run.
            print("could not parse %s" % (html))
            continue

        try:
            try:
                res = analyzeHTML(doc, html)
                print("Parsed %s : %d paragraphs" % (html, res))
                ret = ret + 1
            except Exception:
                print("could not parse %s" % (html))
        finally:
            # libxml2 documents must be released explicitly.
            doc.freeDoc()
    return ret
 
- #########################################################################
 
- #									#
 
- #                  Mail archives parsing and analysis			#
 
- #									#
 
- #########################################################################
 
- import time
 
def getXMLDateArchive(t = None):
    """Return the URL of the by-date index of a monthly list archive.

    t is a timestamp in seconds since the epoch and defaults to the
    current time; the month is taken from its UTC representation.

    The result has the form
    http://mail.gnome.org/archives/xml/<year>-<MonthName>/date.html
    """
    if t is None:
        t = time.time()
    T = time.gmtime(t)
    # The archive URLs use English month names.  time.strftime("%B")
    # follows the process locale, so a fixed table is used instead.
    months = ("January", "February", "March", "April", "May", "June",
              "July", "August", "September", "October", "November",
              "December")
    year = T[0]
    month = months[T[1] - 1]
    return "http://mail.gnome.org/archives/xml/%d-%s/date.html" % (year, month)
 
def scanXMLMsgArchive(url, title, force = 0):
    """Index one archived mail message.

    url is the address of the message page and title its subject.  The
    message is looked up with checkXMLMsgArchive(); when it is already
    known (an ID other than -1 comes back) it is only re-indexed if
    force is non-zero.  Unknown messages are registered through
    addXMLMsgArchive().

    The title is indexed with weight 20 and every text node found under
    a <pre> element with weight 5, both through addStringArchive().

    Returns 1 when the message was indexed and 0 otherwise (missing
    argument, message skipped, registration or parsing failure).
    """
    if url is None or title is None:
        return 0

    ID = checkXMLMsgArchive(url)
    if force == 0 and ID != -1:
        return 0
    if ID == -1:
        ID = addXMLMsgArchive(url, title)
        if ID == -1:
            return 0

    print("Loading %s" % (url))
    try:
        doc = libxml2.htmlParseFile(url, None)
    except Exception:
        doc = None
    if doc is None:
        print("Failed to parse %s" % (url))
        return 0

    # Documents and XPath contexts of the libxml2 bindings have to be
    # released explicitly; otherwise every scanned message stays in
    # memory for the rest of the run.
    try:
        addStringArchive(title, ID, 20)
        ctxt = doc.xpathNewContext()
        try:
            for text in ctxt.xpathEval("//pre//text()"):
                addStringArchive(text.content, ID, 5)
        finally:
            ctxt.xpathFreeContext()
    finally:
        doc.freeDoc()
    return 1
 
def scanXMLDateArchive(t = None, force = 0):
    """Index the messages listed on one monthly archive index page.

    t selects the month (see getXMLDateArchive(); None means now) and
    force is passed on to scanXMLMsgArchive().  The global
    wordsDictArchive is reset to an empty dictionary first.

    Every link whose href starts with "msg" is resolved against the
    index URL and handed to scanXMLMsgArchive(), using the link text as
    title after removing a leading 'Re: ' and then a leading '[xml] '.
    A failure on one message is reported on stdout and does not stop
    the scan.

    Returns the number of messages indexed, or -1 when the index page
    cannot be loaded.
    """
    global wordsDictArchive

    wordsDictArchive = {}
    url = getXMLDateArchive(t)
    print("loading %s" % (url))
    try:
        doc = libxml2.htmlParseFile(url, None)
    except Exception:
        doc = None
    if doc is None:
        print("Failed to parse %s" % (url))
        return -1

    newmsg = 0
    # Documents and XPath contexts of the libxml2 bindings have to be
    # released explicitly.
    try:
        ctxt = doc.xpathNewContext()
        try:
            for anchor in ctxt.xpathEval("//a[@href]"):
                href = anchor.prop("href")
                if href is None or not href.startswith("msg"):
                    continue
                try:
                    msg = libxml2.buildURI(href, url)
                    title = anchor.content
                    if title is not None and title[0:4] == 'Re: ':
                        title = title[4:]
                    if title is not None and title[0:6] == '[xml] ':
                        title = title[6:]
                    newmsg = newmsg + scanXMLMsgArchive(msg, title, force)
                except Exception:
                    # Best effort: keep scanning, but say what failed
                    # instead of dropping the message silently.
                    print("Failed to index message %s:" % (href))
                    print("%s %s" % sys.exc_info()[:2])
        finally:
            ctxt.xpathFreeContext()
    finally:
        doc.freeDoc()
    return newmsg
 
-     
 
- #########################################################################
 
- #									#
 
- #          Main code: open the DB, the API XML and analyze it		#
 
- #									#
 
- #########################################################################
 
def analyzeArchives(t = None, force = 0):
    """Index one month of mail archives and store the word associations.

    t and force are passed unchanged to scanXMLDateArchive(), which
    fills the global wordsDictArchive.  Each (word, id, relevance)
    association found there is then written with updateWordArchive().
    Words whose entry is None are ignored.

    Progress is reported on stdout; nothing is returned.
    """
    global wordsDictArchive

    ret = scanXMLDateArchive(t, force)
    print("Indexed %d words in %d archive pages" % (len(wordsDictArchive), ret))

    i = 0
    for word, refs in wordsDictArchive.items():
        if refs is None:
            continue
        for id, relevance in refs.items():
            updateWordArchive(word, id, relevance)
            i = i + 1
    # These associations come from the mail archives; the summary used
    # to claim they were found in HTML pages.
    print("Found %d associations in archive pages" % (i))
 
def analyzeHTMLTop():
    """Index the HTML pages and store the resulting word associations.

    analyzeHTMLPages() is run first; the global wordsDictHTML is then
    walked and every (word, resource, section, id, relevance)
    association is written with updateWordHTML().  Words whose entry is
    None are ignored.

    Progress is reported on stdout; nothing is returned.
    """
    global wordsDictHTML

    pages = analyzeHTMLPages()
    print("Indexed %d words in %d HTML pages" % (len(wordsDictHTML), pages))

    associations = 0
    for word, refs in wordsDictHTML.items():
        if refs is None:
            continue
        for resource, (relevance, anchor, section) in refs.items():
            updateWordHTML(word, resource, section, anchor, relevance)
            associations = associations + 1
    print("Found %d associations in HTML pages" % (associations))
 
def analyzeAPITop():
    """Index the API description and store the word associations.

    The document named by the global API is loaded with loadAPI(),
    analyzed with analyzeAPI() and freed.  Any error in that phase is
    reported on stdout and terminates the program with status 1.

    The global wordsDict is then walked and every association is
    written with updateWord(word, symbol, relevance).  Words whose
    entry is None are counted as skipped.

    Progress is reported on stdout; nothing is returned.
    """
    global wordsDict
    global API

    try:
        doc = loadAPI(API)
        ret = analyzeAPI(doc)
        print("Analyzed %d blocs" % (ret))
        doc.freeDoc()
    except Exception:
        print("Failed to parse and analyze %s" % (API))
        # sys.exc_info() replaces the deprecated sys.exc_type and
        # sys.exc_value globals.
        print("%s %s" % sys.exc_info()[:2])
        sys.exit(1)

    print("Indexed %d words" % (len(wordsDict)))
    i = 0
    skipped = 0
    for word, refs in wordsDict.items():
        if refs is None:
            skipped = skipped + 1
            continue
        for (module, symbol), relevance in refs.items():
            updateWord(word, symbol, relevance)
            i = i + 1
    print("Found %d associations, skipped %d words" % (i, skipped))
 
def usage():
    """Print the command-line synopsis on stdout and exit with status 1."""
    synopsis = "Usage index.py [--force] [--archive]  [--archive-year year] [--archive-month month] [--API] [--docs]"
    print(synopsis)
    raise SystemExit(1)
 
def _indexArchiveMonth(spec, force):
    """Index the archive of the month named by spec ("<year>-<MonthName>").

    Failures are reported on stdout and not propagated.
    """
    try:
        T = time.strptime(spec, "%Y-%B")
        # Shift ten days into the month; presumably this keeps the
        # timestamp inside the intended month whatever the local
        # timezone offset is.
        t = time.mktime(T) + 3600 * 24 * 10
        analyzeArchives(t, force)
    except Exception:
        print("Failed to index month archive:")
        print("%s %s" % sys.exc_info()[:2])


def main():
    """Command-line entry point of the indexer.

    Opens the database, then processes the arguments from left to
    right:

      --force                 re-index already known archive messages
                              (only affects the options that follow it)
      --archive               index the current month of the archives
      --archive-year YEAR     index the twelve months of YEAR
      --archive-month Y-NAME  index one month, e.g. 2002-October
      --API                   index the API description
      --docs                  index the HTML pages

    An unknown option, an option missing its value, or an empty command
    line prints the usage message, which exits with status 1.  A
    database that cannot be opened also exits with status 1.
    """
    try:
        openMySQL()
    except Exception:
        print("Failed to open the database")
        print("%s %s" % sys.exc_info()[:2])
        sys.exit(1)

    args = sys.argv[1:]
    if not args:
        usage()

    months = ["January", "February", "March", "April", "May",
              "June", "July", "August", "September", "October",
              "November", "December"]
    force = 0
    i = 0
    while i < len(args):
        arg = args[i]
        if arg == '--force':
            force = 1
        elif arg == '--archive':
            analyzeArchives(None, force)
        elif arg == '--archive-year':
            i = i + 1
            if i >= len(args):
                # The option came last, without its value.
                usage()
            year = args[i]
            for month in months:
                _indexArchiveMonth("%s-%s" % (year, month), force)
        elif arg == '--archive-month':
            i = i + 1
            if i >= len(args):
                # The option came last, without its value.
                usage()
            _indexArchiveMonth(args[i], force)
        elif arg == '--API':
            analyzeAPITop()
        elif arg == '--docs':
            analyzeHTMLTop()
        else:
            usage()
        i = i + 1
 
# Run the indexer only when this file is executed as a script, not when
# it is imported as a module.
if __name__ == "__main__":
    main()
 
 
  |