index.py 32 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258
  1. #!/usr/bin/python -u
  2. #
  3. # imports the API description and fills up a database with
  4. # name relevance to modules, functions or web pages
  5. #
  6. # Operation needed:
  7. # =================
  8. #
  9. # install mysqld, the python wrappers for mysql and libxml2, start mysqld
  10. # Change the root passwd of mysql:
  11. # mysqladmin -u root password new_password
  12. # Create the new database xmlsoft
  13. # mysqladmin -p create xmlsoft
  14. # Create a database user 'veillard' and give him password access
  15. # change veillard and abcde with the right user name and passwd
  16. # mysql -p
  17. # password:
  18. # mysql> GRANT ALL PRIVILEGES ON xmlsoft TO veillard@localhost
  19. # IDENTIFIED BY 'abcde' WITH GRANT OPTION;
  20. #
  21. # As the user check the access:
  22. # mysql -p xmlsoft
  23. # Enter password:
  24. # Welcome to the MySQL monitor....
  25. # mysql> use xmlsoft
  26. # Database changed
  27. # mysql> quit
  28. # Bye
  29. #
  30. # Then run the script in the doc subdir, it will create the symbols and
  31. # word tables and populate them with information extracted from
  32. # the libxml2-api.xml API description, and make them accessible read-only
  33. # by nobody@localhost, the user expected to be Apache's one
  34. #
  35. # On the Apache configuration, make sure you have php support enabled
  36. #
  37. import MySQLdb
  38. import libxml2
  39. import sys
  40. import string
  41. import os
  42. #
  43. # We are not interested in parsing errors here
  44. #
  45. def callback(ctx, str):
  46. return
  47. libxml2.registerErrorHandler(callback, None)
  48. #
  49. # The dictionary of tables required and the SQL command needed
  50. # to create them
  51. #
  52. TABLES={
  53. "symbols" : """CREATE TABLE symbols (
  54. name varchar(255) BINARY NOT NULL,
  55. module varchar(255) BINARY NOT NULL,
  56. type varchar(25) NOT NULL,
  57. descr varchar(255),
  58. UNIQUE KEY name (name),
  59. KEY module (module))""",
  60. "words" : """CREATE TABLE words (
  61. name varchar(50) BINARY NOT NULL,
  62. symbol varchar(255) BINARY NOT NULL,
  63. relevance int,
  64. KEY name (name),
  65. KEY symbol (symbol),
  66. UNIQUE KEY ID (name, symbol))""",
  67. "wordsHTML" : """CREATE TABLE wordsHTML (
  68. name varchar(50) BINARY NOT NULL,
  69. resource varchar(255) BINARY NOT NULL,
  70. section varchar(255),
  71. id varchar(50),
  72. relevance int,
  73. KEY name (name),
  74. KEY resource (resource),
  75. UNIQUE KEY ref (name, resource))""",
  76. "wordsArchive" : """CREATE TABLE wordsArchive (
  77. name varchar(50) BINARY NOT NULL,
  78. ID int(11) NOT NULL,
  79. relevance int,
  80. KEY name (name),
  81. UNIQUE KEY ref (name, ID))""",
  82. "pages" : """CREATE TABLE pages (
  83. resource varchar(255) BINARY NOT NULL,
  84. title varchar(255) BINARY NOT NULL,
  85. UNIQUE KEY name (resource))""",
  86. "archives" : """CREATE TABLE archives (
  87. ID int(11) NOT NULL auto_increment,
  88. resource varchar(255) BINARY NOT NULL,
  89. title varchar(255) BINARY NOT NULL,
  90. UNIQUE KEY id (ID,resource(255)),
  91. INDEX (ID),
  92. INDEX (resource))""",
  93. "Queries" : """CREATE TABLE Queries (
  94. ID int(11) NOT NULL auto_increment,
  95. Value varchar(50) NOT NULL,
  96. Count int(11) NOT NULL,
  97. UNIQUE KEY id (ID,Value(35)),
  98. INDEX (ID))""",
  99. "AllQueries" : """CREATE TABLE AllQueries (
  100. ID int(11) NOT NULL auto_increment,
  101. Value varchar(50) NOT NULL,
  102. Count int(11) NOT NULL,
  103. UNIQUE KEY id (ID,Value(35)),
  104. INDEX (ID))""",
  105. }
  106. #
  107. # The XML API description file to parse
  108. #
  109. API="libxml2-api.xml"
  110. DB=None
  111. #########################################################################
  112. # #
  113. # MySQL database interfaces #
  114. # #
  115. #########################################################################
  116. def createTable(db, name):
  117. global TABLES
  118. if db == None:
  119. return -1
  120. if name == None:
  121. return -1
  122. c = db.cursor()
  123. ret = c.execute("DROP TABLE IF EXISTS %s" % (name))
  124. if ret == 1:
  125. print "Removed table %s" % (name)
  126. print "Creating table %s" % (name)
  127. try:
  128. ret = c.execute(TABLES[name])
  129. except:
  130. print "Failed to create table %s" % (name)
  131. return -1
  132. return ret
  133. def checkTables(db, verbose = 1):
  134. global TABLES
  135. if db == None:
  136. return -1
  137. c = db.cursor()
  138. nbtables = c.execute("show tables")
  139. if verbose:
  140. print "Found %d tables" % (nbtables)
  141. tables = {}
  142. i = 0
  143. while i < nbtables:
  144. l = c.fetchone()
  145. name = l[0]
  146. tables[name] = {}
  147. i = i + 1
  148. for table in TABLES.keys():
  149. if not tables.has_key(table):
  150. print "table %s missing" % (table)
  151. createTable(db, table)
  152. try:
  153. ret = c.execute("SELECT count(*) from %s" % table);
  154. row = c.fetchone()
  155. if verbose:
  156. print "Table %s contains %d records" % (table, row[0])
  157. except:
  158. print "Troubles with table %s : repairing" % (table)
  159. ret = c.execute("repair table %s" % table);
  160. print "repairing returned %d" % (ret)
  161. ret = c.execute("SELECT count(*) from %s" % table);
  162. row = c.fetchone()
  163. print "Table %s contains %d records" % (table, row[0])
  164. if verbose:
  165. print "checkTables finished"
  166. # make sure apache can access the tables read-only
  167. try:
  168. ret = c.execute("GRANT SELECT ON xmlsoft.* TO nobody@localhost")
  169. ret = c.execute("GRANT INSERT,SELECT,UPDATE ON xmlsoft.Queries TO nobody@localhost")
  170. except:
  171. pass
  172. return 0
  173. def openMySQL(db="xmlsoft", passwd=None, verbose = 1):
  174. global DB
  175. if passwd == None:
  176. try:
  177. passwd = os.environ["MySQL_PASS"]
  178. except:
  179. print "No password available, set environment MySQL_PASS"
  180. sys.exit(1)
  181. DB = MySQLdb.connect(passwd=passwd, db=db)
  182. if DB == None:
  183. return -1
  184. ret = checkTables(DB, verbose)
  185. return ret
  186. def updateWord(name, symbol, relevance):
  187. global DB
  188. if DB == None:
  189. openMySQL()
  190. if DB == None:
  191. return -1
  192. if name == None:
  193. return -1
  194. if symbol == None:
  195. return -1
  196. c = DB.cursor()
  197. try:
  198. ret = c.execute(
  199. """INSERT INTO words (name, symbol, relevance) VALUES ('%s','%s', %d)""" %
  200. (name, symbol, relevance))
  201. except:
  202. try:
  203. ret = c.execute(
  204. """UPDATE words SET relevance = %d where name = '%s' and symbol = '%s'""" %
  205. (relevance, name, symbol))
  206. except:
  207. print "Update word (%s, %s, %s) failed command" % (name, symbol, relevance)
  208. print "UPDATE words SET relevance = %d where name = '%s' and symbol = '%s'" % (relevance, name, symbol)
  209. print sys.exc_type, sys.exc_value
  210. return -1
  211. return ret
  212. def updateSymbol(name, module, type, desc):
  213. global DB
  214. updateWord(name, name, 50)
  215. if DB == None:
  216. openMySQL()
  217. if DB == None:
  218. return -1
  219. if name == None:
  220. return -1
  221. if module == None:
  222. return -1
  223. if type == None:
  224. return -1
  225. try:
  226. desc = string.replace(desc, "'", " ")
  227. l = string.split(desc, ".")
  228. desc = l[0]
  229. desc = desc[0:99]
  230. except:
  231. desc = ""
  232. c = DB.cursor()
  233. try:
  234. ret = c.execute(
  235. """INSERT INTO symbols (name, module, type, descr) VALUES ('%s','%s', '%s', '%s')""" %
  236. (name, module, type, desc))
  237. except:
  238. try:
  239. ret = c.execute(
  240. """UPDATE symbols SET module='%s', type='%s', descr='%s' where name='%s'""" %
  241. (module, type, desc, name))
  242. except:
  243. print "Update symbol (%s, %s, %s) failed command" % (name, module, type)
  244. print """UPDATE symbols SET module='%s', type='%s', descr='%s' where name='%s'""" % (module, type, desc, name)
  245. print sys.exc_type, sys.exc_value
  246. return -1
  247. return ret
  248. def addFunction(name, module, desc = ""):
  249. return updateSymbol(name, module, 'function', desc)
  250. def addMacro(name, module, desc = ""):
  251. return updateSymbol(name, module, 'macro', desc)
  252. def addEnum(name, module, desc = ""):
  253. return updateSymbol(name, module, 'enum', desc)
  254. def addStruct(name, module, desc = ""):
  255. return updateSymbol(name, module, 'struct', desc)
  256. def addConst(name, module, desc = ""):
  257. return updateSymbol(name, module, 'const', desc)
  258. def addType(name, module, desc = ""):
  259. return updateSymbol(name, module, 'type', desc)
  260. def addFunctype(name, module, desc = ""):
  261. return updateSymbol(name, module, 'functype', desc)
  262. def addPage(resource, title):
  263. global DB
  264. if DB == None:
  265. openMySQL()
  266. if DB == None:
  267. return -1
  268. if resource == None:
  269. return -1
  270. c = DB.cursor()
  271. try:
  272. ret = c.execute(
  273. """INSERT INTO pages (resource, title) VALUES ('%s','%s')""" %
  274. (resource, title))
  275. except:
  276. try:
  277. ret = c.execute(
  278. """UPDATE pages SET title='%s' WHERE resource='%s'""" %
  279. (title, resource))
  280. except:
  281. print "Update symbol (%s, %s, %s) failed command" % (name, module, type)
  282. print """UPDATE pages SET title='%s' WHERE resource='%s'""" % (title, resource)
  283. print sys.exc_type, sys.exc_value
  284. return -1
  285. return ret
  286. def updateWordHTML(name, resource, desc, id, relevance):
  287. global DB
  288. if DB == None:
  289. openMySQL()
  290. if DB == None:
  291. return -1
  292. if name == None:
  293. return -1
  294. if resource == None:
  295. return -1
  296. if id == None:
  297. id = ""
  298. if desc == None:
  299. desc = ""
  300. else:
  301. try:
  302. desc = string.replace(desc, "'", " ")
  303. desc = desc[0:99]
  304. except:
  305. desc = ""
  306. c = DB.cursor()
  307. try:
  308. ret = c.execute(
  309. """INSERT INTO wordsHTML (name, resource, section, id, relevance) VALUES ('%s','%s', '%s', '%s', '%d')""" %
  310. (name, resource, desc, id, relevance))
  311. except:
  312. try:
  313. ret = c.execute(
  314. """UPDATE wordsHTML SET section='%s', id='%s', relevance='%d' where name='%s' and resource='%s'""" %
  315. (desc, id, relevance, name, resource))
  316. except:
  317. print "Update symbol (%s, %s, %d) failed command" % (name, resource, relevance)
  318. print """UPDATE wordsHTML SET section='%s', id='%s', relevance='%d' where name='%s' and resource='%s'""" % (desc, id, relevance, name, resource)
  319. print sys.exc_type, sys.exc_value
  320. return -1
  321. return ret
  322. def checkXMLMsgArchive(url):
  323. global DB
  324. if DB == None:
  325. openMySQL()
  326. if DB == None:
  327. return -1
  328. if url == None:
  329. return -1
  330. c = DB.cursor()
  331. try:
  332. ret = c.execute(
  333. """SELECT ID FROM archives WHERE resource='%s'""" % (url))
  334. row = c.fetchone()
  335. if row == None:
  336. return -1
  337. except:
  338. return -1
  339. return row[0]
  340. def addXMLMsgArchive(url, title):
  341. global DB
  342. if DB == None:
  343. openMySQL()
  344. if DB == None:
  345. return -1
  346. if url == None:
  347. return -1
  348. if title == None:
  349. title = ""
  350. else:
  351. title = string.replace(title, "'", " ")
  352. title = title[0:99]
  353. c = DB.cursor()
  354. try:
  355. cmd = """INSERT INTO archives (resource, title) VALUES ('%s','%s')""" % (url, title)
  356. ret = c.execute(cmd)
  357. cmd = """SELECT ID FROM archives WHERE resource='%s'""" % (url)
  358. ret = c.execute(cmd)
  359. row = c.fetchone()
  360. if row == None:
  361. print "addXMLMsgArchive failed to get the ID: %s" % (url)
  362. return -1
  363. except:
  364. print "addXMLMsgArchive failed command: %s" % (cmd)
  365. return -1
  366. return((int)(row[0]))
  367. def updateWordArchive(name, id, relevance):
  368. global DB
  369. if DB == None:
  370. openMySQL()
  371. if DB == None:
  372. return -1
  373. if name == None:
  374. return -1
  375. if id == None:
  376. return -1
  377. c = DB.cursor()
  378. try:
  379. ret = c.execute(
  380. """INSERT INTO wordsArchive (name, id, relevance) VALUES ('%s', '%d', '%d')""" %
  381. (name, id, relevance))
  382. except:
  383. try:
  384. ret = c.execute(
  385. """UPDATE wordsArchive SET relevance='%d' where name='%s' and ID='%d'""" %
  386. (relevance, name, id))
  387. except:
  388. print "Update word archive (%s, %d, %d) failed command" % (name, id, relevance)
  389. print """UPDATE wordsArchive SET relevance='%d' where name='%s' and ID='%d'""" % (relevance, name, id)
  390. print sys.exc_type, sys.exc_value
  391. return -1
  392. return ret
  393. #########################################################################
  394. # #
  395. # Word dictionnary and analysis routines #
  396. # #
  397. #########################################################################
  398. #
  399. # top 100 English words, minus those shorter than 3 letters, plus our own set
  400. #
  401. dropWords = {
  402. 'the':0, 'this':0, 'can':0, 'man':0, 'had':0, 'him':0, 'only':0,
  403. 'and':0, 'not':0, 'been':0, 'other':0, 'even':0, 'are':0, 'was':0,
  404. 'new':0, 'most':0, 'but':0, 'when':0, 'some':0, 'made':0, 'from':0,
  405. 'who':0, 'could':0, 'after':0, 'that':0, 'will':0, 'time':0, 'also':0,
  406. 'have':0, 'more':0, 'these':0, 'did':0, 'was':0, 'two':0, 'many':0,
  407. 'they':0, 'may':0, 'before':0, 'for':0, 'which':0, 'out':0, 'then':0,
  408. 'must':0, 'one':0, 'through':0, 'with':0, 'you':0, 'said':0,
  409. 'first':0, 'back':0, 'were':0, 'what':0, 'any':0, 'years':0, 'his':0,
  410. 'her':0, 'where':0, 'all':0, 'its':0, 'now':0, 'much':0, 'she':0,
  411. 'about':0, 'such':0, 'your':0, 'there':0, 'into':0, 'like':0, 'may':0,
  412. 'would':0, 'than':0, 'our':0, 'well':0, 'their':0, 'them':0, 'over':0,
  413. 'down':0,
  414. 'net':0, 'www':0, 'bad':0, 'Okay':0, 'bin':0, 'cur':0,
  415. }
  416. wordsDict = {}
  417. wordsDictHTML = {}
  418. wordsDictArchive = {}
  419. def cleanupWordsString(str):
  420. str = string.replace(str, ".", " ")
  421. str = string.replace(str, "!", " ")
  422. str = string.replace(str, "?", " ")
  423. str = string.replace(str, ",", " ")
  424. str = string.replace(str, "'", " ")
  425. str = string.replace(str, '"', " ")
  426. str = string.replace(str, ";", " ")
  427. str = string.replace(str, "(", " ")
  428. str = string.replace(str, ")", " ")
  429. str = string.replace(str, "{", " ")
  430. str = string.replace(str, "}", " ")
  431. str = string.replace(str, "<", " ")
  432. str = string.replace(str, ">", " ")
  433. str = string.replace(str, "=", " ")
  434. str = string.replace(str, "/", " ")
  435. str = string.replace(str, "*", " ")
  436. str = string.replace(str, ":", " ")
  437. str = string.replace(str, "#", " ")
  438. str = string.replace(str, "\\", " ")
  439. str = string.replace(str, "\n", " ")
  440. str = string.replace(str, "\r", " ")
  441. str = string.replace(str, "\xc2", " ")
  442. str = string.replace(str, "\xa0", " ")
  443. return str
  444. def cleanupDescrString(str):
  445. str = string.replace(str, "'", " ")
  446. str = string.replace(str, "\n", " ")
  447. str = string.replace(str, "\r", " ")
  448. str = string.replace(str, "\xc2", " ")
  449. str = string.replace(str, "\xa0", " ")
  450. l = string.split(str)
  451. str = string.join(str)
  452. return str
  453. def splitIdentifier(str):
  454. ret = []
  455. while str != "":
  456. cur = string.lower(str[0])
  457. str = str[1:]
  458. if ((cur < 'a') or (cur > 'z')):
  459. continue
  460. while (str != "") and (str[0] >= 'A') and (str[0] <= 'Z'):
  461. cur = cur + string.lower(str[0])
  462. str = str[1:]
  463. while (str != "") and (str[0] >= 'a') and (str[0] <= 'z'):
  464. cur = cur + str[0]
  465. str = str[1:]
  466. while (str != "") and (str[0] >= '0') and (str[0] <= '9'):
  467. str = str[1:]
  468. ret.append(cur)
  469. return ret
  470. def addWord(word, module, symbol, relevance):
  471. global wordsDict
  472. if word == None or len(word) < 3:
  473. return -1
  474. if module == None or symbol == None:
  475. return -1
  476. if dropWords.has_key(word):
  477. return 0
  478. if ord(word[0]) > 0x80:
  479. return 0
  480. if wordsDict.has_key(word):
  481. d = wordsDict[word]
  482. if d == None:
  483. return 0
  484. if len(d) > 500:
  485. wordsDict[word] = None
  486. return 0
  487. try:
  488. relevance = relevance + d[(module, symbol)]
  489. except:
  490. pass
  491. else:
  492. wordsDict[word] = {}
  493. wordsDict[word][(module, symbol)] = relevance
  494. return relevance
  495. def addString(str, module, symbol, relevance):
  496. if str == None or len(str) < 3:
  497. return -1
  498. ret = 0
  499. str = cleanupWordsString(str)
  500. l = string.split(str)
  501. for word in l:
  502. if len(word) > 2:
  503. ret = ret + addWord(word, module, symbol, 5)
  504. return ret
  505. def addWordHTML(word, resource, id, section, relevance):
  506. global wordsDictHTML
  507. if word == None or len(word) < 3:
  508. return -1
  509. if resource == None or section == None:
  510. return -1
  511. if dropWords.has_key(word):
  512. return 0
  513. if ord(word[0]) > 0x80:
  514. return 0
  515. section = cleanupDescrString(section)
  516. if wordsDictHTML.has_key(word):
  517. d = wordsDictHTML[word]
  518. if d == None:
  519. print "skipped %s" % (word)
  520. return 0
  521. try:
  522. (r,i,s) = d[resource]
  523. if i != None:
  524. id = i
  525. if s != None:
  526. section = s
  527. relevance = relevance + r
  528. except:
  529. pass
  530. else:
  531. wordsDictHTML[word] = {}
  532. d = wordsDictHTML[word];
  533. d[resource] = (relevance, id, section)
  534. return relevance
  535. def addStringHTML(str, resource, id, section, relevance):
  536. if str == None or len(str) < 3:
  537. return -1
  538. ret = 0
  539. str = cleanupWordsString(str)
  540. l = string.split(str)
  541. for word in l:
  542. if len(word) > 2:
  543. try:
  544. r = addWordHTML(word, resource, id, section, relevance)
  545. if r < 0:
  546. print "addWordHTML failed: %s %s" % (word, resource)
  547. ret = ret + r
  548. except:
  549. print "addWordHTML failed: %s %s %d" % (word, resource, relevance)
  550. print sys.exc_type, sys.exc_value
  551. return ret
  552. def addWordArchive(word, id, relevance):
  553. global wordsDictArchive
  554. if word == None or len(word) < 3:
  555. return -1
  556. if id == None or id == -1:
  557. return -1
  558. if dropWords.has_key(word):
  559. return 0
  560. if ord(word[0]) > 0x80:
  561. return 0
  562. if wordsDictArchive.has_key(word):
  563. d = wordsDictArchive[word]
  564. if d == None:
  565. print "skipped %s" % (word)
  566. return 0
  567. try:
  568. r = d[id]
  569. relevance = relevance + r
  570. except:
  571. pass
  572. else:
  573. wordsDictArchive[word] = {}
  574. d = wordsDictArchive[word];
  575. d[id] = relevance
  576. return relevance
  577. def addStringArchive(str, id, relevance):
  578. if str == None or len(str) < 3:
  579. return -1
  580. ret = 0
  581. str = cleanupWordsString(str)
  582. l = string.split(str)
  583. for word in l:
  584. i = len(word)
  585. if i > 2:
  586. try:
  587. r = addWordArchive(word, id, relevance)
  588. if r < 0:
  589. print "addWordArchive failed: %s %s" % (word, id)
  590. else:
  591. ret = ret + r
  592. except:
  593. print "addWordArchive failed: %s %s %d" % (word, id, relevance)
  594. print sys.exc_type, sys.exc_value
  595. return ret
  596. #########################################################################
  597. # #
  598. # XML API description analysis #
  599. # #
  600. #########################################################################
  601. def loadAPI(filename):
  602. doc = libxml2.parseFile(filename)
  603. print "loaded %s" % (filename)
  604. return doc
  605. def foundExport(file, symbol):
  606. if file == None:
  607. return 0
  608. if symbol == None:
  609. return 0
  610. addFunction(symbol, file)
  611. l = splitIdentifier(symbol)
  612. for word in l:
  613. addWord(word, file, symbol, 10)
  614. return 1
  615. def analyzeAPIFile(top):
  616. count = 0
  617. name = top.prop("name")
  618. cur = top.children
  619. while cur != None:
  620. if cur.type == 'text':
  621. cur = cur.next
  622. continue
  623. if cur.name == "exports":
  624. count = count + foundExport(name, cur.prop("symbol"))
  625. else:
  626. print "unexpected element %s in API doc <file name='%s'>" % (name)
  627. cur = cur.next
  628. return count
  629. def analyzeAPIFiles(top):
  630. count = 0
  631. cur = top.children
  632. while cur != None:
  633. if cur.type == 'text':
  634. cur = cur.next
  635. continue
  636. if cur.name == "file":
  637. count = count + analyzeAPIFile(cur)
  638. else:
  639. print "unexpected element %s in API doc <files>" % (cur.name)
  640. cur = cur.next
  641. return count
  642. def analyzeAPIEnum(top):
  643. file = top.prop("file")
  644. if file == None:
  645. return 0
  646. symbol = top.prop("name")
  647. if symbol == None:
  648. return 0
  649. addEnum(symbol, file)
  650. l = splitIdentifier(symbol)
  651. for word in l:
  652. addWord(word, file, symbol, 10)
  653. return 1
  654. def analyzeAPIConst(top):
  655. file = top.prop("file")
  656. if file == None:
  657. return 0
  658. symbol = top.prop("name")
  659. if symbol == None:
  660. return 0
  661. addConst(symbol, file)
  662. l = splitIdentifier(symbol)
  663. for word in l:
  664. addWord(word, file, symbol, 10)
  665. return 1
  666. def analyzeAPIType(top):
  667. file = top.prop("file")
  668. if file == None:
  669. return 0
  670. symbol = top.prop("name")
  671. if symbol == None:
  672. return 0
  673. addType(symbol, file)
  674. l = splitIdentifier(symbol)
  675. for word in l:
  676. addWord(word, file, symbol, 10)
  677. return 1
  678. def analyzeAPIFunctype(top):
  679. file = top.prop("file")
  680. if file == None:
  681. return 0
  682. symbol = top.prop("name")
  683. if symbol == None:
  684. return 0
  685. addFunctype(symbol, file)
  686. l = splitIdentifier(symbol)
  687. for word in l:
  688. addWord(word, file, symbol, 10)
  689. return 1
  690. def analyzeAPIStruct(top):
  691. file = top.prop("file")
  692. if file == None:
  693. return 0
  694. symbol = top.prop("name")
  695. if symbol == None:
  696. return 0
  697. addStruct(symbol, file)
  698. l = splitIdentifier(symbol)
  699. for word in l:
  700. addWord(word, file, symbol, 10)
  701. info = top.prop("info")
  702. if info != None:
  703. info = string.replace(info, "'", " ")
  704. info = string.strip(info)
  705. l = string.split(info)
  706. for word in l:
  707. if len(word) > 2:
  708. addWord(word, file, symbol, 5)
  709. return 1
  710. def analyzeAPIMacro(top):
  711. file = top.prop("file")
  712. if file == None:
  713. return 0
  714. symbol = top.prop("name")
  715. if symbol == None:
  716. return 0
  717. symbol = string.replace(symbol, "'", " ")
  718. symbol = string.strip(symbol)
  719. info = None
  720. cur = top.children
  721. while cur != None:
  722. if cur.type == 'text':
  723. cur = cur.next
  724. continue
  725. if cur.name == "info":
  726. info = cur.content
  727. break
  728. cur = cur.next
  729. l = splitIdentifier(symbol)
  730. for word in l:
  731. addWord(word, file, symbol, 10)
  732. if info == None:
  733. addMacro(symbol, file)
  734. print "Macro %s description has no <info>" % (symbol)
  735. return 0
  736. info = string.replace(info, "'", " ")
  737. info = string.strip(info)
  738. addMacro(symbol, file, info)
  739. l = string.split(info)
  740. for word in l:
  741. if len(word) > 2:
  742. addWord(word, file, symbol, 5)
  743. return 1
  744. def analyzeAPIFunction(top):
  745. file = top.prop("file")
  746. if file == None:
  747. return 0
  748. symbol = top.prop("name")
  749. if symbol == None:
  750. return 0
  751. symbol = string.replace(symbol, "'", " ")
  752. symbol = string.strip(symbol)
  753. info = None
  754. cur = top.children
  755. while cur != None:
  756. if cur.type == 'text':
  757. cur = cur.next
  758. continue
  759. if cur.name == "info":
  760. info = cur.content
  761. elif cur.name == "return":
  762. rinfo = cur.prop("info")
  763. if rinfo != None:
  764. rinfo = string.replace(rinfo, "'", " ")
  765. rinfo = string.strip(rinfo)
  766. addString(rinfo, file, symbol, 7)
  767. elif cur.name == "arg":
  768. ainfo = cur.prop("info")
  769. if ainfo != None:
  770. ainfo = string.replace(ainfo, "'", " ")
  771. ainfo = string.strip(ainfo)
  772. addString(ainfo, file, symbol, 5)
  773. name = cur.prop("name")
  774. if name != None:
  775. name = string.replace(name, "'", " ")
  776. name = string.strip(name)
  777. addWord(name, file, symbol, 7)
  778. cur = cur.next
  779. if info == None:
  780. print "Function %s description has no <info>" % (symbol)
  781. addFunction(symbol, file, "")
  782. else:
  783. info = string.replace(info, "'", " ")
  784. info = string.strip(info)
  785. addFunction(symbol, file, info)
  786. addString(info, file, symbol, 5)
  787. l = splitIdentifier(symbol)
  788. for word in l:
  789. addWord(word, file, symbol, 10)
  790. return 1
  791. def analyzeAPISymbols(top):
  792. count = 0
  793. cur = top.children
  794. while cur != None:
  795. if cur.type == 'text':
  796. cur = cur.next
  797. continue
  798. if cur.name == "macro":
  799. count = count + analyzeAPIMacro(cur)
  800. elif cur.name == "function":
  801. count = count + analyzeAPIFunction(cur)
  802. elif cur.name == "const":
  803. count = count + analyzeAPIConst(cur)
  804. elif cur.name == "typedef":
  805. count = count + analyzeAPIType(cur)
  806. elif cur.name == "struct":
  807. count = count + analyzeAPIStruct(cur)
  808. elif cur.name == "enum":
  809. count = count + analyzeAPIEnum(cur)
  810. elif cur.name == "functype":
  811. count = count + analyzeAPIFunctype(cur)
  812. else:
  813. print "unexpected element %s in API doc <files>" % (cur.name)
  814. cur = cur.next
  815. return count
  816. def analyzeAPI(doc):
  817. count = 0
  818. if doc == None:
  819. return -1
  820. root = doc.getRootElement()
  821. if root.name != "api":
  822. print "Unexpected root name"
  823. return -1
  824. cur = root.children
  825. while cur != None:
  826. if cur.type == 'text':
  827. cur = cur.next
  828. continue
  829. if cur.name == "files":
  830. pass
  831. # count = count + analyzeAPIFiles(cur)
  832. elif cur.name == "symbols":
  833. count = count + analyzeAPISymbols(cur)
  834. else:
  835. print "unexpected element %s in API doc" % (cur.name)
  836. cur = cur.next
  837. return count
  838. #########################################################################
  839. # #
  840. # Web pages parsing and analysis #
  841. # #
  842. #########################################################################
  843. import glob
  844. def analyzeHTMLText(doc, resource, p, section, id):
  845. words = 0
  846. try:
  847. content = p.content
  848. words = words + addStringHTML(content, resource, id, section, 5)
  849. except:
  850. return -1
  851. return words
  852. def analyzeHTMLPara(doc, resource, p, section, id):
  853. words = 0
  854. try:
  855. content = p.content
  856. words = words + addStringHTML(content, resource, id, section, 5)
  857. except:
  858. return -1
  859. return words
  860. def analyzeHTMLPre(doc, resource, p, section, id):
  861. words = 0
  862. try:
  863. content = p.content
  864. words = words + addStringHTML(content, resource, id, section, 5)
  865. except:
  866. return -1
  867. return words
  868. def analyzeHTML(doc, resource, p, section, id):
  869. words = 0
  870. try:
  871. content = p.content
  872. words = words + addStringHTML(content, resource, id, section, 5)
  873. except:
  874. return -1
  875. return words
  876. def analyzeHTML(doc, resource):
  877. para = 0;
  878. ctxt = doc.xpathNewContext()
  879. try:
  880. res = ctxt.xpathEval("//head/title")
  881. title = res[0].content
  882. except:
  883. title = "Page %s" % (resource)
  884. addPage(resource, title)
  885. try:
  886. items = ctxt.xpathEval("//h1 | //h2 | //h3 | //text()")
  887. section = title
  888. id = ""
  889. for item in items:
  890. if item.name == 'h1' or item.name == 'h2' or item.name == 'h3':
  891. section = item.content
  892. if item.prop("id"):
  893. id = item.prop("id")
  894. elif item.prop("name"):
  895. id = item.prop("name")
  896. elif item.type == 'text':
  897. analyzeHTMLText(doc, resource, item, section, id)
  898. para = para + 1
  899. elif item.name == 'p':
  900. analyzeHTMLPara(doc, resource, item, section, id)
  901. para = para + 1
  902. elif item.name == 'pre':
  903. analyzeHTMLPre(doc, resource, item, section, id)
  904. para = para + 1
  905. else:
  906. print "Page %s, unexpected %s element" % (resource, item.name)
  907. except:
  908. print "Page %s: problem analyzing" % (resource)
  909. print sys.exc_type, sys.exc_value
  910. return para
  911. def analyzeHTMLPages():
  912. ret = 0
  913. HTMLfiles = glob.glob("*.html") + glob.glob("tutorial/*.html")
  914. for html in HTMLfiles:
  915. if html[0:3] == "API":
  916. continue
  917. if html == "xml.html":
  918. continue
  919. try:
  920. doc = libxml2.parseFile(html)
  921. except:
  922. doc = libxml2.htmlParseFile(html, None)
  923. try:
  924. res = analyzeHTML(doc, html)
  925. print "Parsed %s : %d paragraphs" % (html, res)
  926. ret = ret + 1
  927. except:
  928. print "could not parse %s" % (html)
  929. return ret
  930. #########################################################################
  931. # #
  932. # Mail archives parsing and analysis #
  933. # #
  934. #########################################################################
  935. import time
  936. def getXMLDateArchive(t = None):
  937. if t == None:
  938. t = time.time()
  939. T = time.gmtime(t)
  940. month = time.strftime("%B", T)
  941. year = T[0]
  942. url = "http://mail.gnome.org/archives/xml/%d-%s/date.html" % (year, month)
  943. return url
  944. def scanXMLMsgArchive(url, title, force = 0):
  945. if url == None or title == None:
  946. return 0
  947. ID = checkXMLMsgArchive(url)
  948. if force == 0 and ID != -1:
  949. return 0
  950. if ID == -1:
  951. ID = addXMLMsgArchive(url, title)
  952. if ID == -1:
  953. return 0
  954. try:
  955. print "Loading %s" % (url)
  956. doc = libxml2.htmlParseFile(url, None);
  957. except:
  958. doc = None
  959. if doc == None:
  960. print "Failed to parse %s" % (url)
  961. return 0
  962. addStringArchive(title, ID, 20)
  963. ctxt = doc.xpathNewContext()
  964. texts = ctxt.xpathEval("//pre//text()")
  965. for text in texts:
  966. addStringArchive(text.content, ID, 5)
  967. return 1
  968. def scanXMLDateArchive(t = None, force = 0):
  969. global wordsDictArchive
  970. wordsDictArchive = {}
  971. url = getXMLDateArchive(t)
  972. print "loading %s" % (url)
  973. try:
  974. doc = libxml2.htmlParseFile(url, None);
  975. except:
  976. doc = None
  977. if doc == None:
  978. print "Failed to parse %s" % (url)
  979. return -1
  980. ctxt = doc.xpathNewContext()
  981. anchors = ctxt.xpathEval("//a[@href]")
  982. links = 0
  983. newmsg = 0
  984. for anchor in anchors:
  985. href = anchor.prop("href")
  986. if href == None or href[0:3] != "msg":
  987. continue
  988. try:
  989. links = links + 1
  990. msg = libxml2.buildURI(href, url)
  991. title = anchor.content
  992. if title != None and title[0:4] == 'Re: ':
  993. title = title[4:]
  994. if title != None and title[0:6] == '[xml] ':
  995. title = title[6:]
  996. newmsg = newmsg + scanXMLMsgArchive(msg, title, force)
  997. except:
  998. pass
  999. return newmsg
  1000. #########################################################################
  1001. # #
  1002. # Main code: open the DB, the API XML and analyze it #
  1003. # #
  1004. #########################################################################
  1005. def analyzeArchives(t = None, force = 0):
  1006. global wordsDictArchive
  1007. ret = scanXMLDateArchive(t, force)
  1008. print "Indexed %d words in %d archive pages" % (len(wordsDictArchive), ret)
  1009. i = 0
  1010. skipped = 0
  1011. for word in wordsDictArchive.keys():
  1012. refs = wordsDictArchive[word]
  1013. if refs == None:
  1014. skipped = skipped + 1
  1015. continue;
  1016. for id in refs.keys():
  1017. relevance = refs[id]
  1018. updateWordArchive(word, id, relevance)
  1019. i = i + 1
  1020. print "Found %d associations in HTML pages" % (i)
  1021. def analyzeHTMLTop():
  1022. global wordsDictHTML
  1023. ret = analyzeHTMLPages()
  1024. print "Indexed %d words in %d HTML pages" % (len(wordsDictHTML), ret)
  1025. i = 0
  1026. skipped = 0
  1027. for word in wordsDictHTML.keys():
  1028. refs = wordsDictHTML[word]
  1029. if refs == None:
  1030. skipped = skipped + 1
  1031. continue;
  1032. for resource in refs.keys():
  1033. (relevance, id, section) = refs[resource]
  1034. updateWordHTML(word, resource, section, id, relevance)
  1035. i = i + 1
  1036. print "Found %d associations in HTML pages" % (i)
  1037. def analyzeAPITop():
  1038. global wordsDict
  1039. global API
  1040. try:
  1041. doc = loadAPI(API)
  1042. ret = analyzeAPI(doc)
  1043. print "Analyzed %d blocs" % (ret)
  1044. doc.freeDoc()
  1045. except:
  1046. print "Failed to parse and analyze %s" % (API)
  1047. print sys.exc_type, sys.exc_value
  1048. sys.exit(1)
  1049. print "Indexed %d words" % (len(wordsDict))
  1050. i = 0
  1051. skipped = 0
  1052. for word in wordsDict.keys():
  1053. refs = wordsDict[word]
  1054. if refs == None:
  1055. skipped = skipped + 1
  1056. continue;
  1057. for (module, symbol) in refs.keys():
  1058. updateWord(word, symbol, refs[(module, symbol)])
  1059. i = i + 1
  1060. print "Found %d associations, skipped %d words" % (i, skipped)
  1061. def usage():
  1062. print "Usage index.py [--force] [--archive] [--archive-year year] [--archive-month month] [--API] [--docs]"
  1063. sys.exit(1)
  1064. def main():
  1065. try:
  1066. openMySQL()
  1067. except:
  1068. print "Failed to open the database"
  1069. print sys.exc_type, sys.exc_value
  1070. sys.exit(1)
  1071. args = sys.argv[1:]
  1072. force = 0
  1073. if args:
  1074. i = 0
  1075. while i < len(args):
  1076. if args[i] == '--force':
  1077. force = 1
  1078. elif args[i] == '--archive':
  1079. analyzeArchives(None, force)
  1080. elif args[i] == '--archive-year':
  1081. i = i + 1;
  1082. year = args[i]
  1083. months = ["January" , "February", "March", "April", "May",
  1084. "June", "July", "August", "September", "October",
  1085. "November", "December"];
  1086. for month in months:
  1087. try:
  1088. str = "%s-%s" % (year, month)
  1089. T = time.strptime(str, "%Y-%B")
  1090. t = time.mktime(T) + 3600 * 24 * 10;
  1091. analyzeArchives(t, force)
  1092. except:
  1093. print "Failed to index month archive:"
  1094. print sys.exc_type, sys.exc_value
  1095. elif args[i] == '--archive-month':
  1096. i = i + 1;
  1097. month = args[i]
  1098. try:
  1099. T = time.strptime(month, "%Y-%B")
  1100. t = time.mktime(T) + 3600 * 24 * 10;
  1101. analyzeArchives(t, force)
  1102. except:
  1103. print "Failed to index month archive:"
  1104. print sys.exc_type, sys.exc_value
  1105. elif args[i] == '--API':
  1106. analyzeAPITop()
  1107. elif args[i] == '--docs':
  1108. analyzeHTMLTop()
  1109. else:
  1110. usage()
  1111. i = i + 1
  1112. else:
  1113. usage()
# Run the indexer only when this file is executed as a script.
if __name__ == "__main__":
    main()