#!/usr/bin/python
"""CGI endpoint: run an XPath query over the entries in the Venus cache.

Request parameters (read through cgi.FieldStorage):
  query      XPath expression; the prefixes listed in NAMESPACES may be used.
  startTime  resume point; cache files modified after it are skipped
             (defaults to one hour in the future, i.e. skip nothing).
  lastFound  modification time of the latest match so far (defaults to
             now); only used by the 90-day bookmark policy below.
  startFile  name of the cache file the previous request stopped at.

The response is an XML document holding a copy of the root element of each
matching cache file, an optional <bookmark> element carrying the values
needed to resume the scan, and a <stats> element with the number of files
scanned and matched.
"""

import cgi
import os
import sys
import time

import libxml2

CACHEDIR = '/home/rubys/planet/cache/'

# Prefix -> namespace URI bindings made available to the query.
NAMESPACES = (
    ('atom', 'http://www.w3.org/2005/Atom'),
    ('xhtml', 'http://www.w3.org/1999/xhtml'),
    ('mathml', 'http://www.w3.org/1998/Math/MathML'),
    ('svg', 'http://www.w3.org/2000/svg'),
    # The XLink namespace has no trailing slash; with one, xlink:* never
    # matches anything.
    ('xlink', 'http://www.w3.org/1999/xlink'),
    ('planet', 'http://planet.intertwingly.net/'),
)

# NOTE(review): the original markup for the empty result document was lost
# (an empty string was being parsed, which cannot succeed).  <results/> is a
# placeholder root element -- confirm the name the client expects.
RESULTS_TEMPLATE = '<results/>'

MAX_RESULTS = 10                 # stop after this many matches per request
TIME_BUDGET = 0.3                # seconds of scanning allowed per request
BOOKMARK_WINDOW = 86400 * 90     # seconds; see the bookmark policy below


def int_param(form, name, default):
    """Return CGI field *name* as an int, or int(default) if absent/invalid.

    Values with a fractional part (e.g. '1199999999.0') are accepted and
    truncated, so bookmarks written with float timestamps still parse.
    """
    try:
        return int(float(form.getvalue(name)))
    except (TypeError, ValueError):
        return int(default)


def xpath_context(doc):
    """Return a new XPath context on *doc* with all NAMESPACES registered.

    The caller owns the context and must call xpathFreeContext() on it.
    """
    context = doc.xpathNewContext()
    for prefix, uri in NAMESPACES:
        context.xpathRegisterNs(prefix, uri)
    return context


def emit_results(doc):
    """Write the serialized result document followed by a newline."""
    sys.stdout.write(doc.serialize('utf-8') + '\n')


# Media type parameters are separated from the type by ';', not ','.
sys.stdout.write('Content-Type: application/xml; charset=utf-8\r\n\r\n')

# read POST parameters
form = cgi.FieldStorage()
query = form.getvalue('query')
now = time.time()
startTime = int_param(form, 'startTime', now + 3600)
lastFound = int_param(form, 'lastFound', now)
startFile = form.getvalue('startFile')

# prime results; libxml2 diagnostics are collected instead of being printed
parse_errors = []
libxml2.registerErrorHandler(
    lambda ctx, message: ctx.append(message), parse_errors)
results = libxml2.parseDoc(RESULTS_TEMPLATE)
root = results.getRootElement()

# sniff test xpath, if invalid, proceed no further
query_ok = True
ctxt = None
try:
    ctxt = xpath_context(results)
    ctxt.xpathEval(query)
except Exception:
    query_ok = False
if ctxt is not None:
    ctxt.xpathFreeContext()
if not query_ok:
    # TODO: add error message to output
    emit_results(results)
    sys.exit(1)

# sort Venus cache by date, newest first.  Modification times are truncated
# to whole seconds so that the value written into a bookmark compares equal
# to the value computed for the same file on the next request.
cache = [(int(os.stat(os.path.join(CACHEDIR, name)).st_mtime), name)
         for name in os.listdir(CACHEDIR)]
cache.sort(reverse=True)

# scan Atom entries
parsed = found = 0
start = time.time()
for mtime, name in cache:
    path = os.path.join(CACHEDIR, name)
    if os.path.isdir(path):
        continue

    # make sure that we resume where we left off: skip everything newer than
    # the bookmark, then everything sharing its timestamp up to and
    # including the bookmarked file itself
    if mtime > startTime:
        continue
    if startFile and mtime == startTime:
        if name == startFile:
            startFile = None
        continue

    entry = ctxt = None
    try:
        entry = libxml2.parseFile(path)
        ctxt = xpath_context(entry)
        if ctxt.xpathEval(query):
            found += 1
            lastFound = mtime
            root.addChild(results.copyNode(entry.getRootElement(), 1))
    except Exception:
        # Best effort: a cache file that cannot be parsed or evaluated is
        # counted as scanned and otherwise ignored.
        pass
    finally:
        # libxml2 objects are not reclaimed automatically; release the
        # context before the document it was created from.
        if ctxt is not None:
            ctxt.xpathFreeContext()
        if entry is not None:
            entry.freeDoc()
    parsed += 1

    # if we found enough or have searched for too long, quit
    if found >= MAX_RESULTS or time.time() - start > TIME_BUDGET:
        # Policy: no bookmark if nothing found in a 90 day stretch.  Limits
        # run-away scans of the entire database for queries that match
        # nothing.
        # TODO: let users manually continue via pressing 'OK'.
        if lastFound - mtime <= BOOKMARK_WINDOW:
            # add bookmark
            bookmark = root.newChild(None, 'bookmark', None)
            bookmark.setProp('query', str(query))
            bookmark.setProp('startTime', str(mtime))
            bookmark.setProp('startFile', str(name))
        break

# add stats
stats = root.newChild(None, 'stats', None)
stats.setProp('scanned', str(parsed))
stats.setProp('found', str(found))

# return results
emit_results(results)