=== added directory 'examples'
=== added file 'examples/pykhtmlsite.py'
--- examples/pykhtmlsite.py 1970-01-01 00:00:00 +0000
+++ examples/pykhtmlsite.py 2007-02-05 22:39:48 +0000
@@ -0,0 +1,42 @@
+#!/usr/bin/python
+""" Example: print the title and navigation entries of a page """
+
+import sys
+sys.path.append("..")
+
+import pykhtml
+pykhtml.debugWithGUI = True
+
+
+PyKHTMLUrl = "http://localhost/PyKHTML/"
+
+browser = pykhtml.Browser()
+
+def extractBitsFromPage(event=None):
+    """ Load callback: print the page title and navigation entries, then stop the event loop. Accepts the (unused) load event. """
+    # getElementsByTagName returns a generator, so we convert
+    # to a list and access the first element
+    titles = list(browser.document.getElementsByTagName("title"))
+    if titles:
+        print "Title:", titles[0].text
+    navigation = []
+    for item in browser.document.getElementById("navigation").children:
+        # if this child item is an element and its tag name
+        # is 'li' (i.e if it's a list item)
+        if isinstance(item, pykhtml.dom.Element) and item.tagName == "li":
+            # the label is the text of the item's first child (the link);
+            # skip empty items and children that carry no text
+            label = item.children and getattr(item.children[0], "text", None)
+            if label:
+                navigation.append(label)
+    print "Navigation:", " | ".join(navigation)
+    pykhtml.stopEventLoop()
+
+def main():
+    """ Load the page and run the event loop until the callback stops it """
+    browser.load(PyKHTMLUrl, extractBitsFromPage)
+    pykhtml.startEventLoop()
+
+
+if __name__ == "__main__":
+    main()
=== added file 'makedocs.py'
--- makedocs.py 1970-01-01 00:00:00 +0000
+++ makedocs.py 2007-02-05 22:39:48 +0000
@@ -0,0 +1,113 @@
+
+""" Create documentation """
+
+import inspect, os
+
+moduleNames = [
+    "pykhtml",
+    "pykhtml.dom"
+]
+
+allowableSpecialMethods = [
+    "__init__",
+    "__del__",
+    "__str__",
+    "__repr__"
+]
+
+## kinds of documented item, stored as the first field of each entry
+KIND_CLASS = 1
+KIND_METHOD = 2
+KIND_FUNCTION = 3
+KIND_PROPERTY = 4
+
+# maps a class member (method or property) to the entry list of its class
+itemToClassList = {}
+
+
+def getDoc(obj):
+    """ Return the docstring of obj with every line stripped of surrounding whitespace ("" if there is none) """
+    return "\n".join(x.strip() for x in (obj.__doc__ or "").split("\n"))
+
+def documentClass(cls, name, itemsToExamine, documentedItems):
+    """ Start a new entry list for cls in documentedItems and queue its public members (plus the allowed special methods) on itemsToExamine """
+    ## add sub methods
+    classItemNames = [itemName for itemName in dir(cls)
+        if not itemName.startswith("_") or itemName in allowableSpecialMethods]
+    # Add these to the start of the list if they exist
+    for x in reversed(("__init__", "__del__")):
+        if x in classItemNames:
+            classItemNames.remove(x)
+            classItemNames.insert(0, x)
+    ## add our documentation here
+    l = [(KIND_CLASS, name, getDoc(cls))]
+    documentedItems.append(l)
+    for itemName in classItemNames:
+        item = getattr(cls, itemName)
+        try:
+            itemToClassList[item] = l
+        except TypeError:
+            # unhashable class attribute (e.g. a list or dict): it is
+            # neither a method nor a property, so there is nothing to document
+            continue
+        itemsToExamine.append((itemName, item))
+
+
+class GenerateHtml:
+    """ Collects the documentation structure of each module and writes one file per module """
+    def __init__(self):
+        # [(name, data), ...]
+        self.pages = []
+
+    def generateModule(self, moduleName, structure):
+        """ Register the documented items of a module; entries are (kind, name, doc) tuples or lists of them, with kind one of the KIND_* constants """
+        self.pages.append((moduleName, structure))
+
+    def write(self, directory):
+        """ Write every registered page to <directory>/<module name>.htm, creating the directory if needed """
+        if not os.path.isdir(directory):
+            os.makedirs(directory)
+        for name, data in self.pages:
+            f = open(os.path.join(directory, name) + ".htm", "w")
+            try:
+                f.write(str(data))
+            finally:
+                f.close()
+
+
+def main():
+    """ Import every module in moduleNames, collect the documentation of its classes, methods, properties and functions, and write the result to the "doc" directory """
+    generator = GenerateHtml()
+    for moduleName in moduleNames:
+        # __import__ returns the top-level package, so walk down to the
+        # (sub)module that was actually asked for
+        module = __import__(moduleName)
+        for subModuleName in moduleName.split(".")[1:]:
+            module = getattr(module, subModuleName)
+        itemsToExamine = []
+        documentedItems = []
+        for attributeName in dir(module):
+            if not attributeName.startswith("_"):
+                itemsToExamine.append((attributeName, getattr(module, attributeName)))
+
+        while itemsToExamine:
+            name, item = itemsToExamine.pop(0)
+            if inspect.isclass(item) and item.__module__ in moduleNames:
+                documentClass(item, name, itemsToExamine, documentedItems)
+            elif inspect.ismethod(item) or isinstance(item, property):
+                if isinstance(item, property):
+                    kind = KIND_PROPERTY
+                else:
+                    kind = KIND_METHOD
+                # only members queued by documentClass belong to a class entry
+                classList = itemToClassList.get(item)
+                if classList is not None:
+                    classList.append((kind, name, getDoc(item)))
+            elif inspect.isfunction(item) and item.__module__ in moduleNames:
+                documentedItems.append((KIND_FUNCTION, name, getDoc(item)))
+        generator.generateModule(moduleName, documentedItems)
+    generator.write("doc")
+
+if __name__ == "__main__":
+    main()
+
=== added directory 'pykhtml'
=== added file 'pykhtml/__init__.py'
--- pykhtml/__init__.py 1970-01-01 00:00:00 +0000
+++ pykhtml/__init__.py 2007-02-05 22:39:48 +0000
@@ -0,0 +1,174 @@
+
+import sys, os, subprocess, time, tempfile
+import khtml, kdecore
+import sip # cast
+#from khtml.DOM import DOMString
+from qt import *
+DOM = khtml.DOM
+DOMString = DOM.DOMString
+
+import dom
+
+### set to true to see what's happening visually
+debugWithGUI = False
+
+class _Dummy(object):
+    """ Stand-in for the Xvfb process object when an already running server is re-used; it only carries a display attribute """
+    pass
+
+# whether or not init() was successful
+initSuccessful = False
+# KApplication
+application = None
+# the dialog that hosts the KHTMLParts
+dialog = None
+# directory temp files are stored in
+temp = tempfile.gettempdir()
+# file in which the display of the Xvfb server we started is remembered
+_xvfbDisplayFile = os.path.join(temp, "PyKHTMLXvfb")
+## If this is being launched from multiple instances it's
+## stupid to start Xvfb each time
+if os.name == "posix":
+    def running(name):
+        """ Return True if a process with the given command name is listed by ps """
+        procs = os.popen("ps -eo comm").read().strip().split("\n")
+        return name in procs
+else:
+    raise OSError("Need to implement running(name) for %s" % repr(os.name))
+# Xvfb process. All you need to know is that it's
+# an object with a display attribute (that contains the
+# X display value (i.e :1))
+xvfb = None
+if running("Xvfb") and os.path.exists(_xvfbDisplayFile):
+    f = open(_xvfbDisplayFile)
+    try:
+        data = f.read().strip()
+    finally:
+        f.close()
+    del f
+    # only trust the file if it holds a display such as ":1"; a stale or
+    # corrupt file simply means a new server gets started by init()
+    if data.startswith(":") and data[1:].isdigit():
+        xvfb = _Dummy()
+        xvfb.display = data
+
+
+
+path = os.environ.get("PATH", "").split(os.pathsep)
+def pathSearch(name):
+    """ Utility function to search for and get the full path of a file in $PATH """
+    candidates = [os.path.join(x, name) for x in path]
+    return [x for x in candidates if os.path.exists(x)]
+
+
+def _startKApplication():
+    """ Create the KApplication and the dialog that hosts the KHTMLParts """
+    global application, dialog
+    kdecore.KCmdLineArgs.init(sys.argv, "PyKHTML", "PyKHTML Library", "0.1")
+    application = kdecore.KApplication()
+    # the widget that will host the KHTMLParts
+    dialog = QDialog(None)
+    application.setMainWidget(dialog)
+    if debugWithGUI:
+        dialog.show()
+    dialog.layout = QVBoxLayout(dialog)
+
+def init(display=1, _sleep=1):
+    """ Initiate the system if necessary (start Xvfb if it's not running, connect to it, start our program instance). This is called automatically when you create a Browser instance, so you shouldn't have to worry about it. You can specify use of a certain display by setting the `display` parameter; it is ignored when an already running server is re-used. `_sleep` is the number of seconds to wait for Xvfb to start. Raises OSError if Xvfb is not installed or exits during startup. """
+    global initSuccessful, xvfb
+    if initSuccessful:
+        return
+    # check that Xvfb is in the path
+    if not pathSearch("Xvfb"):
+        raise OSError("Xvfb not installed")
+    if not xvfb:
+        # start xvfb
+        process = subprocess.Popen(["Xvfb", ":%s" % display, "-ac", "-screen", "0", "640x480x8"], stderr=subprocess.PIPE)
+        process.display = ":%s" % display
+        # pause for a little to check it hasn't terminated straight away and to give it some time to startup
+        time.sleep(_sleep)
+        # poll() returns None while the process is still running; any exit
+        # code (including 0) means the server is gone
+        if process.poll() is not None:
+            raise OSError("Xvfb failed to run correctly\nXvfb output:\n%s" % process.stderr.read())
+        # started successfully - lets not start a new Xvfb each time
+        f = open(_xvfbDisplayFile, "w")
+        try:
+            f.write("%s\n" % process.display)
+        finally:
+            f.close()
+        xvfb = process
+    else:
+        print "Re-using server"
+    if not debugWithGUI:
+        os.environ["DISPLAY"] = xvfb.display
+    _startKApplication()
+    initSuccessful = True
+
+def startEventLoop():
+    """ Starts the PyKHTML event loop. PyKHTML works with an asynchronous callback mechanism -- a little like Twisted does. Calls to open a new webpage aren't synchronous, as with urllib, for example. """
+    application.exec_loop()
+
+def stopEventLoop():
+    """ Stop the event loop and hence exit the scraper """
+    dialog.deleteLater()
+    application.quit()
+
+def timer(time, func):
+    """ Call the given function after the allotted time (in seconds). Requires that the PyKHTML event loop is running """
+    QTimer.singleShot(int(time * 1000), func)
+
+
+class Browser(object):
+    """ A browser built on a KHTMLPart hosted in the shared dialog. Load pages with Browser.load and inspect them through Browser.document. """
+    def __init__(self):
+        init()
+        self.part = khtml.KHTMLPart(dialog)
+        if debugWithGUI:
+            dialog.layout.addWidget(self.part.view())
+            self.part.show()
+        # disable java, plugins and images so things load quicker
+        self.part.setJavaEnabled(False)
+        self.part.setPluginsEnabled(False)
+        self.part.setAutoloadImages(False)
+        self.connect = self.part.connect # sore finger remedy
+        self.disconnect = self.part.disconnect
+        self.loadFunction = None
+
+    def _setLocation(self, uri):
+        self.part.openURL(kdecore.KURL(uri))
+    def _getLocation(self):
+        return str(self.part.url().url())
+    location = property(_getLocation, _setLocation, None, "Browse to a new location. You probably don't want to set this directly as you'll receive no notification when the page has loaded. Have a look at Browser.load instead")
+
+    def load(self, uri, callback):
+        """ Load a webpage in the browser. It takes as parameters the URI of the page to load, and a callable object to call when the page has loaded. If callback is None the page is loaded without any notification. """
+        if self.loadFunction:
+            self.disconnect(self.part, SIGNAL("docCreated()"), self._slotDocCreated)
+        self.loadFunction = callback
+        if callback is not None:
+            self.connect(self.part, SIGNAL("docCreated()"), self._slotDocCreated)
+        self.location = uri
+
+    def _slotDocCreated(self):
+        """ Slot for the part's docCreated() signal: silences window.alert, then registers the pending load callback as a listener for the document's load event """
+        self.part.executeScript(DOM.Node(), "window.alert = function() {}")
+        self.disconnect(self.part, SIGNAL("docCreated()"), self._slotDocCreated)
+        if not self.loadFunction:
+            raise AttributeError("No load function callback present")
+        func = self.loadFunction
+        self.loadFunction = None
+        # do this so the DOM loads fully. Huzzah
+        doc = self.document
+        dom.Element(doc._d).addEvent("load", func)
+
+    @property
+    def source(self):
+        """ The source of the currently loaded document, as a string """
+        return str(self.part.documentSource())
+
+    @property
+    def document(self):
+        """ Get a reference to the document (dom.Document) for the currently loaded page. It contains all the tasty methods for walking the DOM tree like getElementById/getElementsByTagName and methods for browsing to other linked pages. """
+        return dom.Document(self.part.htmlDocument(), self)
+
=== added file 'pykhtml/dom.py'
--- pykhtml/dom.py 1970-01-01 00:00:00 +0000
+++ pykhtml/dom.py 2007-02-05 22:39:48 +0000
@@ -0,0 +1,257 @@
+
+from khtml import DOM as _DOM
+import sip
+DOMString = _DOM.DOMString
+
+
+_nodeTypeToClass = {}
+def nodeToClass(n):
+    """ Return the callable that wraps the given KHTML node, chosen by its node type (Node if the type is not registered) """
+    return _nodeTypeToClass.get(n.nodeType(), Node)
+_elementNameToClassMap = {}
+def elementToObject(e):
+    """ Wrap an element node in the class registered for its tag name (Element if none is registered) """
+    klass = _elementNameToClassMap.get(str(e.nodeName().string()).upper(), Element)
+    return klass(e)
+
+def registerNode(nodeType, klass):
+    """ Register the callable used to wrap nodes of the given DOM node type """
+    _nodeTypeToClass[nodeType] = klass
+def registerElement(elementName, klass):
+    """ Register the class used to wrap elements with the given tag name (case-insensitive) """
+    _elementNameToClassMap[elementName.upper()] = klass
+
+
+
+class Node(object):
+    """ Node and all of its subclasses provide you with read-only access to the page's DOM. Instantiating the classes themselves won't do you much good. """
+    def __init__(self, cNode):
+        self._ = cNode
+        object.__init__(self)
+        self.__children = None
+
+    def isA(self, klass):
+        """ Syntactic sugar for isinstance """
+        return isinstance(self, klass)
+
+    @property
+    def children(self):
+        """ Get the children nodes of this """
+        # cache the value
+        if self.__children is None:
+            elements = self._.childNodes()
+            wrapped = []
+            for i in xrange(elements.length()):
+                n = elements.item(i)
+                wrapped.append(nodeToClass(n)(n))
+            # only store the list once it is complete
+            self.__children = wrapped
+        return self.__children
+
+    def childNodes(self):
+        """ For those that can't live without JavaScript DOM-compatible method names """
+        # children is a property, so it must not be called
+        return self.children
+
+
+class Text(Node):
+    """ A text node lets you access the text in it using the Text.value attribute or by converting to a string with str() """
+    def __init__(self, cTextNode):
+        Node.__init__(self, cTextNode)
+
+    def __str__(self):
+        return self.value
+
+    def __repr__(self):
+        return self.value
+
+    @property
+    def value(self):
+        """ The text held by this node """
+        return str(self._.nodeValue().string())
+registerNode(3, Text)
+
+
+class Element(Node):
+    """ An HTML element. Instances of it provide methods for doing things with the element -- traversing it, adding events, etc. """
+    def __init__(self, cElement):
+        Node.__init__(self, cElement)
+
+    @property
+    def text(self):
+        """ If the first child of this element is a text node, this will return the text value of that node; otherwise None """
+        children = self.children
+        if children and isinstance(children[0], Text):
+            return children[0].value
+        return None
+
+    def getAttribute(self, name):
+        """ Get the value of the named attribute of this element as a string """
+        element = self._
+        if not hasattr(element, "getAttribute"):
+            # a plain DOM.Node has to be cast before its element methods can be used
+            element = sip.cast(element, _DOM.Element)
+        return str(element.getAttribute(DOMString(name)).string())
+
+    ## DOM things
+    def getElementById(self, id):
+        """ Get a reference to an element in the page by its id attribute """
+        n = self._.getElementById(DOMString(id))
+        return nodeToClass(n)(n)
+
+    def getElementsByTagName(self, name):
+        """ Get elements by tag name. Returns a generator that you can loop over or flatten into a list with list() """
+        elements = self._.getElementsByTagName(DOMString(name))
+        for i in xrange(elements.length()):
+            node = elements.item(i)
+            yield nodeToClass(node)(node)
+
+    def getElementsByTagNameNS(self, ns, name):
+        """ Get elements by tag name given a certain namespace """
+        elements = self._.getElementsByTagNameNS(DOMString(ns), DOMString(name))
+        for i in xrange(elements.length()):
+            node = elements.item(i)
+            yield nodeToClass(node)(node)
+
+    # convenience
+    def getElementsByClass(self, className, tagName="*"):
+        """ Get elements below this one that have a certain class (optionally restricted to a tag name) """
+        for element in self.getElementsByTagName(tagName):
+            if className in element.getAttribute("class").split():
+                yield element
+
+    @property
+    def tagName(self):
+        """ Get the lowercase name of this tag """
+        return str(self._.nodeName().string()).lower()
+
+    @property
+    def originalTagName(self):
+        """ Like tagName but this won't convert things to lower. Only really useful if you're dealing with XML """
+        return str(self._.nodeName().string())
+
+    def addEvent(self, eventName, func, capture=False):
+        """ This lets you listen for certain events as they occur on the current element. Only particularly useful when listening for load events really. """
+        listener = _CallbackEventListener(eventName, func)
+        self._.addEventListener(DOMString(eventName), listener, capture)
+
+    def removeEvent(self, eventName, func, capture=False):
+        """ Removes events that you've added with Element.addEvent. Raises KeyError if func was not added for eventName. """
+        listener = _CallbackEventListener.getCallbackInstance(eventName, func)
+        self._.removeEventListener(DOMString(eventName), listener, capture)
+        _CallbackEventListener.remove(eventName, func)
+# -- important, we hook to the method not Element base class
+registerNode(1, elementToObject)
+
+
+class Anchor(Element):
+    """ Anchor elements with an Anchor.href property """
+    def __init__(self, cAnchor):
+        Element.__init__(self, cAnchor)
+        self._ = sip.cast(self._, _DOM.HTMLAnchorElement)
+
+    @property
+    def href(self):
+        """ The anchors 'href' value. Returns the full URL pointed to by the href attribute """
+        return str(self._.href().string())
+registerElement("A", Anchor)
+
+
+def _makeMatcher(pattern, stripSpace):
+    """ Build a predicate for pattern: a string is compared for equality (after stripping the candidate if stripSpace is True), anything else is treated as a regular expression object and its match method is used """
+    if isinstance(pattern, basestring):
+        if stripSpace:
+            return lambda x: pattern == x.strip()
+        return lambda x: pattern == x
+    return pattern.match
+
+def _linkText(link):
+    """ Return the value of the first child node of the given (wrapped) link as a unicode string """
+    return unicode(link._.childNodes().item(0).nodeValue().string())
+
+
+class Document(object):
+    """ Document object for accessing the DOM tree. Don't keep this object around, when the page changes it's invalidated. Just use browser.document. """
+    def __init__(self, htmlDocument, browser):
+        self._d = htmlDocument
+        self.browser = browser
+
+    ## DOM things
+    def getElementById(self, id):
+        """ Get a reference to an element in the page by its id attribute """
+        n = self._d.getElementById(DOMString(id))
+        return nodeToClass(n)(n)
+
+    def getElementsByTagName(self, name):
+        """ Get elements by tag name. Returns a generator that you can loop over or flatten into a list with list() """
+        elements = self._d.getElementsByTagName(DOMString(name))
+        for i in xrange(elements.length()):
+            node = elements.item(i)
+            yield nodeToClass(node)(node)
+
+    def getElementsByTagNameNS(self, ns, name):
+        """ Get elements by tag name given a certain namespace """
+        elements = self._d.getElementsByTagNameNS(DOMString(ns), DOMString(name))
+        for i in xrange(elements.length()):
+            node = elements.item(i)
+            yield nodeToClass(node)(node)
+
+    # convenience
+    def getElementsByClass(self, className, tagName="*"):
+        """ Get elements in the document that have a certain class (optionally restricted to a tag name) """
+        for element in self.getElementsByTagName(tagName):
+            if className in element.getAttribute("class").split():
+                yield element
+
+    def visit(self, text=None, callback=None, attributes=None, stripSpace=True):
+        """ Visit a page pointed to by a certain link. This function searches the links in the document for one that:
+        - Matches the given text, if any (as a string or regular expression object)
+        - Matches all the attributes given, if any (a dictionary mapping attribute name to attribute value, where the value is again either a string or regular expression object)
+        If the `stripSpace` attribute is True, when searching for a string match of the text, surrounding whitespace is stripped from the item we are matching against. When several links match, the last one in the document is followed. Returns True if a link was found (callback is passed on to Browser.load), False otherwise or when neither text nor attributes are given. """
+        if text is None and not attributes:
+            # nothing to search for
+            return False
+        _dlinks = self._d.links()
+        candidates = [Element(_dlinks.item(i)) for i in xrange(_dlinks.length())]
+        if text is not None:
+            matches = _makeMatcher(text, stripSpace)
+            candidates = [link for link in candidates if matches(_linkText(link))]
+        for key, value in (attributes or {}).items():
+            matches = _makeMatcher(value, False)
+            candidates = [link for link in candidates if matches(link.getAttribute(key))]
+        if not candidates:
+            return False
+        # cast so we get the nice href (full URL)
+        link = Anchor(candidates[-1]._)
+        self.browser.load(link.href, callback)
+        return True
+
+
+class _CallbackEventListener(_DOM.EventListener):
+    """ Adapts a Python callable to a KHTML event listener. Every instance is stored in a class-level registry keyed by (event name, callback) so it can be found again for removal. """
+    _funcToListener = {}
+    def __init__(self, event, callback):
+        _DOM.EventListener.__init__(self)
+        self.event = event
+        self.callback = callback
+        self._funcToListener[(self.event, callback)] = self
+
+    def handleEvent(self, e):
+        """ Call the callback with the event, or without arguments if that raises TypeError """
+        # NOTE: a TypeError raised inside a callback that does accept the
+        # event cannot be told apart here and also triggers the second call
+        try:
+            self.callback(e)
+        except TypeError:
+            self.callback()
+
+    @staticmethod
+    def remove(event, callback):
+        """ Forget the listener registered for (event, callback); does nothing if there is none """
+        _CallbackEventListener._funcToListener.pop((event, callback), None)
+
+    @staticmethod
+    def getCallbackInstance(event, callback):
+        """ Return the listener registered for (event, callback); raises KeyError if there is none """
+        return _CallbackEventListener._funcToListener[(event, callback)]
+