=== modified file '.bzrignore' --- .bzrignore 2007-03-05 15:17:59 +0000 +++ .bzrignore 2007-03-06 22:25:50 +0000 @@ -1,3 +1,4 @@ +examples/test.py build *.pyc makedocs.py === modified file 'doc/pykhtml.dom.htm' --- doc/pykhtml.dom.htm 2007-03-05 16:00:30 +0000 +++ doc/pykhtml.dom.htm 2007-03-06 22:25:50 +0000 @@ -14,7 +14,9 @@

pykhtml.dom

-

class Document (inherits object)

+

class ElementNotFoundError (inherits Exception)

+
+

class Document (inherits object)

Document object for accessing the DOM tree. Don't keep this object around, when the page changes it is invalidated. Just access it through pykhtml.Browser.document whenever you want it.

__init__(self, htmlDocument, browser)

getElementById(self, id)

Get a reference to an element in the page by its id attribute.
@@ -22,86 +24,130 @@

getElementsByTagName(self, name)

Get elements by tag name. Returns a generator that you can loop over or flatten into a list with list().

getElementsByTagNameNS(self, ns, name)

Get elements by tag name given a certain namespace.

serialized (read-only property)

Return a string that represents the DOM structure of this document, much like what is returned via innerHTML in JavaScript.
-

visit(self, text=True, callback=None, attributes=None, stripSpace=None)

Visit a page pointed to by a certain link. This function searches for all links in the document that either:
  • Match the given text (as a string or regular expression object)
  • Match the attributes given (a dictionary mapping attribute name to attribute value, where the value is again either a string or regular expression object)
If the `stripSpace` attribute is True, when searching for a string match all whitespace is stripped from the item we are matching against.
+

visit(self, text=None, callback=None, attributes=None, stripSpace=True)

Visit a page pointed to by a certain link. This function searches for all links in the document that either:
  • Match the given text (as a string or regular expression object)
  • Match the attributes given (a dictionary mapping attribute name to attribute value, where the value is again either a string or regular expression object)
If the `stripSpace` parameter is True, all whitespace is stripped from the item being matched against when searching for a string match.

class Node (inherits object)

Node and all of its subclasses provide you with access to the page's DOM. Instantiating the classes themselves won't do you much good.

__init__(self, cNode, browser)

-

childNodes(self)

For those that can't live without JavaScript DOM-compatible method names.
-

children (read-only property)

Get the children nodes of this node.
+

__iter__(self)

Iterate over child nodes.
+

addEvent(self, eventName, func, capture=False)

This lets you listen for certain events as they occur on the current element. It is only really useful when listening for load events.
+

childNodes (read-only property)

For those that can't live without JavaScript DOM-compatible method names.
+

children (read-only property)

Get a list containing the children nodes of this node. If you want to loop over children just loop over this node as with any regular list – it is more memory efficient that way.

isA(self, klass)

Syntactic sugar for isinstance.
+

removeEvent(self, eventName, func, capture=False)

Removes events that you've added with Element.addEvent.

class Element (inherits Node)

-
An HTML element. Instances of it provide methods for doing things with the element – traversing it, adding events, etc.
-

__init__(self, cElement, browser)

-

addEvent(self, eventName, func, capture=False)

This lets you listen for certain events as they occur on the current element. Only particularly useful when listening for load events reaaally.
-

childNodes(self)

Inherited from Node
For those that can't live without JavaScript DOM-compatible method names.
-

children (read-only property)

Inherited from Node
Get the children nodes of this node.
-

getElementById(self, id)

Get a reference to an element in the page by its id attribute.
+
An HTML element. It provides methods for:
And more.
+

__init__(self, cElement, browser, cast=<class 'khtml.HTMLElement'>)

+

__delitem__(self, name)

Delete an attribute.
+

__getitem__(self, name)

Get the value of the attribute with the given name.
+

__iter__(self)

Inherited from Node
Iterate over child nodes.
+

__setitem__(self, name, value)

Set the value of the attribute with the given name.
+

addEvent(self, eventName, func, capture=False)

Inherited from Node
This lets you listen for certain events as they occur on the current element. It is only really useful when listening for load events.
+

attributes (read-only property)

+

childNodes (read-only property)

Inherited from Node
For those that can't live without JavaScript DOM-compatible method names.
+

children (read-only property)

Inherited from Node
Get a list containing the children nodes of this node. If you want to loop over children just loop over this node as with any regular list – it is more memory efficient that way.
+

className (property)

The class name of this element.

getElementsByClass(self, className, tagName='*')

Get elements in the document (optionally with a given tag name) that have a certain class.

getElementsByTagName(self, name)

Get elements by tag name. Returns a generator that you can loop over or flatten into a list with list().

getElementsByTagNameNS(self, ns, name)

Get elements by tag name given a certain namespace.
+

hasAttribute(self, name)

Check whether an attribute exists in this element.
+

id (property)

The ID of this element.
+

innerHTML (property)

The HTML markup inside of this element. Notice that this property is writable, so you can change the markup if need be.

isA(self, klass)

Inherited from Node
Syntactic sugar for isinstance.
-

originalTagName (read-only property)

Like tagName but this won't convert things to lower. Only really useful if you're dealing with XML.
-

removeEvent(self, eventName, func, capture=False)

Removes events that you've added with Element.addEvent.
+

originalTagName (read-only property)

Like tagName, but without converting the name to lowercase. Only really useful if you're dealing with XML, which probably isn't the case anyway!
+

removeEvent(self, eventName, func, capture=False)

Inherited from Node
Removes events that you've added with Element.addEvent.

tagName (read-only property)

Get the lowercase name of this tag.

text (read-only property)

If the next child of this element is a text node, this will return the text value of that node.

class Anchor (inherits Element)

Anchor elements with an Anchor.href property.
-

__init__(self, cAnchor, browser)

-

addEvent(self, eventName, func, capture=False)

Inherited from Element
This lets you listen for certain events as they occur on the current element. Only particularly useful when listening for load events reaaally.
-

childNodes(self)

Inherited from Node
For those that can't live without JavaScript DOM-compatible method names.
-

children (read-only property)

Inherited from Node
Get the children nodes of this node.
-

getElementById(self, id)

Inherited from Element
Get a reference to an element in the page by its id attribute.
+

__init__(self, cAnchor, browser, cast=<class 'khtml.HTMLAnchorElement'>)

+

__delitem__(self, name)

Inherited from Element
Delete an attribute.
+

__getitem__(self, name)

Inherited from Element
Get the value of the attribute with the given name.
+

__iter__(self)

Inherited from Node
Iterate over child nodes.
+

__setitem__(self, name, value)

Inherited from Element
Set the value of the attribute with the given name.
+

addEvent(self, eventName, func, capture=False)

Inherited from Node
This lets you listen for certain events as they occur on the current element. It is only really useful when listening for load events.
+

attributes (read-only property)

Inherited from Element
+

childNodes (read-only property)

Inherited from Node
For those that can't live without JavaScript DOM-compatible method names.
+

children (read-only property)

Inherited from Node
Get a list containing the children nodes of this node. If you want to loop over children just loop over this node as with any regular list – it is more memory efficient that way.
+

className (property)

Inherited from Element
The class name of this element.

getElementsByClass(self, className, tagName='*')

Inherited from Element
Get elements in the document (optionally with a given tag name) that have a certain class.

getElementsByTagName(self, name)

Inherited from Element
Get elements by tag name. Returns a generator that you can loop over or flatten into a list with list().

getElementsByTagNameNS(self, ns, name)

Inherited from Element
Get elements by tag name given a certain namespace.
+

hasAttribute(self, name)

Inherited from Element
Check whether an attribute exists in this element.

href (read-only property)

The anchor's 'href' value. Returns the full URL pointed to by the href attribute of this element.
+

id (property)

Inherited from Element
The ID of this element.
+

innerHTML (property)

Inherited from Element
The HTML markup inside of this element. Notice that this property is writable, so you can change the markup if need be.

isA(self, klass)

Inherited from Node
Syntactic sugar for isinstance.
-

originalTagName (read-only property)

Inherited from Element
Like tagName but this won't convert things to lower. Only really useful if you're dealing with XML.
-

removeEvent(self, eventName, func, capture=False)

Inherited from Element
Removes events that you've added with Element.addEvent.
+

originalTagName (read-only property)

Inherited from Element
Like tagName, but without converting the name to lowercase. Only really useful if you're dealing with XML, which probably isn't the case anyway!
+

removeEvent(self, eventName, func, capture=False)

Inherited from Node
Removes events that you've added with Element.addEvent.

tagName (read-only property)

Inherited from Element
Get the lowercase name of this tag.

text (read-only property)

Inherited from Element
If the next child of this element is a text node, this will return the text value of that node.

class Form (inherits Element)

Form elements contain input elements. You can submit a form with Form.submit.
-

__init__(self, cForm, browser)

+

__init__(self, cForm, browser, cast=<class 'khtml.HTMLFormElement'>)

+

__delitem__(self, name)

Inherited from Element
Delete an attribute.
+

__getitem__(self, name)

Inherited from Element
Get the value of the attribute with the given name.
+

__iter__(self)

Inherited from Node
Iterate over child nodes.
+

__setitem__(self, name, value)

Inherited from Element
Set the value of the attribute with the given name.

action (property)

The method with which this form is to be submitted (GET, POST, etc).
-

addEvent(self, eventName, func, capture=False)

Inherited from Element
This lets you listen for certain events as they occur on the current element. Only particularly useful when listening for load events reaaally.
-

childNodes(self)

Inherited from Node
For those that can't live without JavaScript DOM-compatible method names.
-

children (read-only property)

Inherited from Node
Get the children nodes of this node.
-

getElementById(self, id)

Inherited from Element
Get a reference to an element in the page by its id attribute.
+

addEvent(self, eventName, func, capture=False)

Inherited from Node
This lets you listen for certain events as they occur on the current element. It is only really useful when listening for load events.
+

attributes (read-only property)

Inherited from Element
+

childNodes (read-only property)

Inherited from Node
For those that can't live without JavaScript DOM-compatible method names.
+

children (read-only property)

Inherited from Node
Get a list containing the children nodes of this node. If you want to loop over children just loop over this node as with any regular list – it is more memory efficient that way.
+

className (property)

Inherited from Element
The class name of this element.

getElementsByClass(self, className, tagName='*')

Inherited from Element
Get elements in the document (optionally with a given tag name) that have a certain class.

getElementsByTagName(self, name)

Inherited from Element
Get elements by tag name. Returns a generator that you can loop over or flatten into a list with list().

getElementsByTagNameNS(self, ns, name)

Inherited from Element
Get elements by tag name given a certain namespace.
+

hasAttribute(self, name)

Inherited from Element
Check whether an attribute exists in this element.
+

id (property)

Inherited from Element
The ID of this element.
+

innerHTML (property)

Inherited from Element
The HTML markup inside of this element. Notice that this property is writable, so you can change the markup if need be.

isA(self, klass)

Inherited from Node
Syntactic sugar for isinstance.
-

originalTagName (read-only property)

Inherited from Element
Like tagName but this won't convert things to lower. Only really useful if you're dealing with XML.
-

removeEvent(self, eventName, func, capture=False)

Inherited from Element
Removes events that you've added with Element.addEvent.
-

submit(self)

Submit the form to the page specified in the action.
+

originalTagName (read-only property)

Inherited from Element
Like tagName, but without converting the name to lowercase. Only really useful if you're dealing with XML, which probably isn't the case anyway!
+

removeEvent(self, eventName, func, capture=False)

Inherited from Node
Removes events that you've added with Element.addEvent.
+

reset(self)

Reset the data in the form.
+

submit(self, callback)

Submit the form to the page specified in the action. The callback given is like one you would pass to pykhtml.Browser.load.

tagName (read-only property)

Inherited from Element
Get the lowercase name of this tag.

text (read-only property)

Inherited from Element
If the next child of this element is a text node, this will return the text value of that node.

class Input (inherits Element)

Input elements that you can set the value/name of.
-

__init__(self, cInput, browser)

-

addEvent(self, eventName, func, capture=False)

Inherited from Element
This lets you listen for certain events as they occur on the current element. Only particularly useful when listening for load events reaaally.
-

childNodes(self)

Inherited from Node
For those that can't live without JavaScript DOM-compatible method names.
-

children (read-only property)

Inherited from Node
Get the children nodes of this node.
+

__init__(self, cInput, browser, cast=<class 'khtml.HTMLInputElement'>)

+

__delitem__(self, name)

Inherited from Element
Delete an attribute.
+

__getitem__(self, name)

Inherited from Element
Get the value of the attribute with the given name.
+

__iter__(self)

Inherited from Node
Iterate over child nodes.
+

__setitem__(self, name, value)

Inherited from Element
Set the value of the attribute with the given name.
+

addEvent(self, eventName, func, capture=False)

Inherited from Node
This lets you listen for certain events as they occur on the current element. It is only really useful when listening for load events.
+

attributes (read-only property)

Inherited from Element
+

childNodes (read-only property)

Inherited from Node
For those that can't live without JavaScript DOM-compatible method names.
+

children (read-only property)

Inherited from Node
Get a list containing the children nodes of this node. If you want to loop over children just loop over this node as with any regular list – it is more memory efficient that way.
+

className (property)

Inherited from Element
The class name of this element.
+

click(self)

Simulate a click on the input element. If this is a submit button, for example, this clicks it.
+

focus(self)

Focus the input.

form (read-only property)

Get the form element this input resides in.
-

getElementById(self, id)

Inherited from Element
Get a reference to an element in the page by its id attribute.

getElementsByClass(self, className, tagName='*')

Inherited from Element
Get elements in the document (optionally with a given tag name) that have a certain class.

getElementsByTagName(self, name)

Inherited from Element
Get elements by tag name. Returns a generator that you can loop over or flatten into a list with list().

getElementsByTagNameNS(self, ns, name)

Inherited from Element
Get elements by tag name given a certain namespace.
+

hasAttribute(self, name)

Inherited from Element
Check whether an attribute exists in this element.
+

id (property)

Inherited from Element
The ID of this element.
+

innerHTML (property)

Inherited from Element
The HTML markup inside of this element. Notice that this property is writable, so you can change the markup if need be.

isA(self, klass)

Inherited from Node
Syntactic sugar for isinstance.
-

originalTagName (read-only property)

Inherited from Element
Like tagName but this won't convert things to lower. Only really useful if you're dealing with XML.
-

removeEvent(self, eventName, func, capture=False)

Inherited from Element
Removes events that you've added with Element.addEvent.
+

name (property)

The HTML name of this input.
+

originalTagName (read-only property)

Inherited from Element
Like tagName, but without converting the name to lowercase. Only really useful if you're dealing with XML, which probably isn't the case anyway!
+

removeEvent(self, eventName, func, capture=False)

Inherited from Node
Removes events that you've added with Element.addEvent.

tagName (read-only property)

Inherited from Element
Get the lowercase name of this tag.

text (read-only property)

Inherited from Element
If the next child of this element is a text node, this will return the text value of that node.
+

value (property)

The value of this input – for example, for textareas and inputs of type 'text', this is its content. For buttons, this is the text inside the button, etc.

class Text (inherits Node)

A text node lets you access the text in it using the Text.value attribute or by converting to a string with str().

__init__(self, cTextNode, browser)

+

__iter__(self)

Inherited from Node
Iterate over child nodes.

__repr__(self)

__str__(self)

-

childNodes(self)

Inherited from Node
For those that can't live without JavaScript DOM-compatible method names.
-

children (read-only property)

Inherited from Node
Get the children nodes of this node.
+

__unicode__(self)

Text.value converts the string to a regular string via str(). This gives the unicode version.
+

addEvent(self, eventName, func, capture=False)

Inherited from Node
This lets you listen for certain events as they occur on the current element. It is only really useful when listening for load events.
+

childNodes (read-only property)

Inherited from Node
For those that can't live without JavaScript DOM-compatible method names.
+

children (read-only property)

Inherited from Node
Get a list containing the children nodes of this node. If you want to loop over children just loop over this node as with any regular list – it is more memory efficient that way.

isA(self, klass)

Inherited from Node
Syntactic sugar for isinstance.
-

value (read-only property)

Equivalent to str(textNode). Get the string this node represents.
+

removeEvent(self, eventName, func, capture=False)

Inherited from Node
Removes events that you've added with Element.addEvent.
+

value (read-only property)

Equivalent to unicode(textNode). Get the string this node represents.

elementCast(e, browser)

nodeCast(n, browser)

registerElement(elementName, klass)

=== modified file 'doc/pykhtml.htm' --- doc/pykhtml.htm 2007-03-05 16:00:30 +0000 +++ doc/pykhtml.htm 2007-03-06 22:25:50 +0000 @@ -20,16 +20,19 @@

document (read-only property)

Get a reference to the document (see dom.Document) for the currently loaded page. It contains all the tasty methods for walking the DOM tree like getElementById / getElementsByTagName, and methods for browsing to other linked pages.

load(self, uri, callback)

Load a webpage in the browser. It takes as parameters the URI of the page to load, and a callable object to call when the page has loaded. This callback will be given a reference to the browser object unless you set Browser.passReferenceToCallbacks to False.

location (property)

Browse to a new location. You probably don't want to set this directly as you'll receive no notification when the page has loaded. Have a look at Browser.load instead.
+

onNextLoad (property)

If you're going to do something that will inadvertently cause PyKHTML to browse to a new page and you want a function to be called when the page is loaded, set onNextLoad to the function.

passReferenceToCallbacks (property)

Set whether callbacks passed to functions such as Browser.load or dom.Document.visit will have a reference to this browser object passed as a parameter. Default is True.

setHtml(self, source, url=None)

Set the HTML of the browser. Parses the HTML and generates the DOM tree so you can navigate it as usual. As well as the `source` parameter, a `url` parameter allows you to specify a URL with which this source code is linked so that, e.g., any scripts/images referenced in the HTML will be found.

source (read-only property)

+

class partial

+
Partial application of parameters. This is used internally but is also very useful with Browser.load as it allows you to pass data to other functions.
Usage is as follows:

>>> def func(a, b):
... print "func:", a, b
...
>>> func2 = pykhtml.partial(func, "foo")
>>> func2("bar!")
func: foo bar!
+

__init__(self, func, *args, **kwargs)

Create a new functor that – when called – will call the given function, passing any extra arguments / keyword-arguments that you specify.
+

__call__(self, *args, **kwargs)

startEventLoop()

Starts the PyKHTML event loop. PyKHTML works with an asynchronous callback mechanism – a little like Twisted does. Calls to open a new webpage aren't synchronous, as with urllib, for example.

stopEventLoop()

Stop the event loop and hence exit the scraper.

timer(time, func)

Call the given function after the allotted time. The PyKHTML event loop needs to be running.
-

init(display=1, _sleep=1)

Initiate the system if necessary (start Xvfb if it's not running, connect to it, start our program instance). This is called automatically when you create a Browser instance, so you shouldn't have to worry about it. You can specify use of a certain display by setting the `display` parameter.
-

class partial

-
Partial application of parameters. This is used internally but is also very useful with Browser.load as it allows you to pass data to other functions.
(Provide Example).
-

__init__(self, fun)

+

init(display=1, registerExceptionHandler=True, _sleep=1, _supressQtDebug=True)

Initiate the system if necessary (start Xvfb if it's not running, connect to it, start our program instance). This is called automatically when you create a Browser instance, so you shouldn't have to worry about it unless you want to set some of the values of the arguments. You can specify use of a certain X display by setting the `display` parameter, and can stop pykhtml registering its exception handler (the excepthook function) by setting `registerExceptionHandler` to False.
+

excepthook(type, value, trace)

Our exception hook that prints out the traceback, powers down the pykhtml engine, and then exits.

pathSearch(name)

Utility function to search for and get the full path of a file in $PATH.

running(name)

Check whether a process of the given name is running.
=== modified file 'doc/styles.css' --- doc/styles.css 2007-02-10 20:22:22 +0000 +++ doc/styles.css 2007-03-06 22:25:50 +0000 @@ -132,4 +132,15 @@ a.reference:hover { color: darkgreen; text-shadow: #ddd 1px 1px 2px; +} + +h3, h4 { +margin-bottom: 0.4em; +} + +.doc { +margin-left: 0.05em; +} +.cls .doc { +margin-left: 1em; } \ No newline at end of file === modified file 'examples/dynamicdom.py' --- examples/dynamicdom.py 2007-03-05 16:00:30 +0000 +++ examples/dynamicdom.py 2007-03-06 22:25:50 +0000 @@ -45,7 +45,7 @@ # load our markup browser.setHtml(page) # to see if the DOM has changed we poll and access - # browser.document.serialized and compare its to + # browser.document.serialized and compare it to # the previous calling. Note the use of pykhtml.partial # to bind the serialised data to the function pykhtml.timer(1, pykhtml.partial(checkForDomChanges, browser, browser.document.serialized)) === modified file 'examples/pykhtmlsite.py' --- examples/pykhtmlsite.py 2007-02-12 23:13:40 +0000 +++ examples/pykhtmlsite.py 2007-03-06 22:25:50 +0000 @@ -5,24 +5,23 @@ import pykhtml -PyKHTMLUrl = "http://paul.giannaros.org/pykhtml/" +PyKHTMLUrl = "http://paul.giannaros.org/pykhtml" def extractBitsFromPage(browser): # getElementsByTagName returns a generator, so we convert # to a list and access the first element title = list(browser.document.getElementsByTagName("title"))[0] print "Title:", title.text - # Get the navigation + # Get the text of the navigation items navigation = [] - for item in browser.document.getElementById("navigation").children: - # if this child item is an element (as opposed to - # a text node or whatever) and its tag name is 'li' - # (i.e if it's a list item) - if isinstance(item, pykhtml.dom.Element) and item.tagName == "li": - # Add the text contents of the list item's first - # child to our list - anchor = item.children[0] - navigation.append(anchor.text) + # First get the container of the list items... 
+ navigationElement = browser.document.getElementById("navigation") + # ... and then loop over the li elements we find + for listItem in navigationElement.getElementsByTagName("li"): + # Inside the list item is an anchor + anchor = listItem.children[0] + # And the text inside the anchor is what we want + navigation.append(anchor.text) print "Navigation:", " | ".join(navigation) # Stop here, we're done pykhtml.stopEventLoop() === modified file 'makedocs.py' --- makedocs.py 2007-03-05 15:17:59 +0000 +++ makedocs.py 2007-03-06 22:25:50 +0000 @@ -8,8 +8,6 @@ """ import inspect, os, sys -# pil stuff for inheritance diagrams -import ImageFont, ImageDraw, ImageChops, Image docDirectory = "doc" @@ -21,15 +19,17 @@ moduleForceFirstItems = { "pykhtml" : [ "Browser", + "partial", "startEventLoop", "stopEventLoop", "timer", "init" ], "pykhtml.dom" : [ + "ElementNotFoundError", "Document", "Node", - "Element" + "Element", ] } @@ -37,7 +37,13 @@ "__init__", "__del__", "__str__", - "__repr__" + "__repr__", + "__unicode__", + "__iter__", + "__getitem__", + "__setitem__", + "__delitem__", + "__call__", ] #fontPath = "arial.ttf" @@ -46,24 +52,20 @@ classToClassList = {} # dirty hack so that qt/kdecore/khtml aren't loaded when pykhtml calls 'import ...' -sys.modules["kdecore"] = {} -# fake the KHTML interface -class Dummy(object): - pass -root = Dummy() -dom = Dummy() -dom.DOMString = None -dom.EventListener = int -root.DOM = dom -root.QTimer = int -sys.modules["khtml"] = sys.modules["qt"] = root +#sys.modules["kdecore"] = {} +## fake the KHTML interface +#class Dummy(object): + #def __getattr__(self, name): + #return None +#root = Dummy() +#dom = Dummy() +#dom.DOMString = None +#dom.EventListener = int +#root.DOM = dom +#root.QTimer = int +#sys.modules["khtml"] = sys.modules["qt"] = root # end dirty hack -def autocrop(image, backgroundColor=(255, 255, 255)): - """ crop an image based on alpha. 
""" - background = Image.new("RGB", image.size, backgroundColor) - return image.crop(ImageChops.difference(image, background).getbbox()) - #def getDiagram(cls): #fontSize = 12 ## .... @@ -116,11 +118,8 @@ def functionSignature(func): name = func.__name__ - args = list(func.func_code.co_varnames[:func.func_code.co_argcount]) - for i in xrange(len(func.func_defaults or [])): - val = func.func_defaults[i] - args[-(i + 1)] += "=%s" % repr(val) - return "%s(%s)" % (name, ", ".join(args)) + s = "%s%s" % (name, inspect.formatargspec(*inspect.getargspec(func))) + return s.replace("<", "<").replace(">", ">") def propertySignature(prop, name): signature = "%s (read-only property)" % name @@ -233,8 +232,10 @@ s = s.replace("\n", "
") # and two dashes to an en dash s = s.replace(" -- ", " – ") - # and finally (and most importantly), add a full stop at the end - s = "%s." % s.rstrip(".") + # and finally (and most importantly), add a full stop at the end if there is no other punctuation present + s = s.strip() + if s and s[-1] not in ("!", ".", "?"): + s = "%s." % s return s def write(self, directory): @@ -353,10 +354,7 @@ doneItems = {} generator = GenerateHtml() for moduleName in moduleNames: - module = __import__(moduleName) - if moduleName.count("."): - subModuleName = moduleName.split(".")[1] - module = _getattr(module, subModuleName) + module = myImport(moduleName) moduleDoc = getDoc(module) itemsToExamine = [] documentedItems = [] === modified file 'pykhtml/__init__.py' --- pykhtml/__init__.py 2007-03-05 16:00:30 +0000 +++ pykhtml/__init__.py 2007-03-06 22:25:50 +0000 @@ -1,5 +1,5 @@ -import sys, os, subprocess, time, tempfile +import sys, os, tempfile import khtml, kdecore import sip # cast #from khtml.DOM import DOMString @@ -22,9 +22,17 @@ class partial: """ Partial application of parameters. This is used internally but is also very useful with [[Browser.load]] as it allows you to pass data to other functions. - (Provide Example). """ - def __init__(self, fun, *args, **kwargs): - self.fun = fun + Use is as follows: + + >>> def func(a, b): + ... print "func:", a, b + ... + >>> func2 = pykhtml.partial(func, "foo") + >>> func2("bar!") + func: foo bar! """ + def __init__(self, func, *args, **kwargs): + """ Create a new functor that -- when called -- will call the given function, passing any extra arguments / keyword-arguments that you specify """ + self.fun = func self.pending = args[:] self.kwargs = kwargs.copy() @@ -77,25 +85,47 @@ os_path_join = os.path.join return [os_path_join(x, name) for x in path if os_path_exists(os_path_join(x, name))] +class _MyDialog(qt.QDialog): + def closeEvent(self, e): + # stops a crash... 
+ e.accept() + stopEventLoop() + def _startKApplication(): global application, dialog - kdecore.KCmdLineArgs.init(sys.argv, "PyKHTML", "PyKHTML Library", "9.9") + kdecore.KCmdLineArgs.init(sys.argv[:1], "PyKHTML", "PyKHTML Library", "9.9") application = kdecore.KApplication() # the widget that will host the KHTMLParts - dialog = qt.QDialog(None) + dialog = _MyDialog(None) application.setMainWidget(dialog) if debugWithGUI: dialog.show() dialog.layout = qt.QVBoxLayout(dialog) -def init(display=1, _sleep=1): - """ Initiate the system if necessary (start Xvfb if it's not running, connect to it, start our program instance). This is called automatically when you create a Browser instance, so you shouldn't have to worry about it. You can specify use of a certain display by setting the `display` parameter. """ +def excepthook(type, value, trace): + """ Our exception hook that prints out the traceback, powers down the pykhtml engine, and then exits """ + import traceback + traceback.print_exception(type, value, trace) + if initSuccessful: + stopEventLoop() + sys.exit() + +def _displayQtDebug(messageType, message): + """ Supress QObject::connect stuff """ + if messageType == qt.QtWarningMsg and message.startswith("QObject::connect") or message.startswith("QObject::disconnect"): + pass + else: + sys.stderr.write("Qt debug: %s\n" % message) + +def init(display=1, registerExceptionHandler=True, _sleep=1, _supressQtDebug=True): + """ Initiate the system if necessary (start Xvfb if it's not running, connect to it, start our program instance). This is called automatically when you create a Browser instance, so you shouldn't have to worry about it unless you want to set some of the values of the arguments. You can specify use of a certain X display by setting the `display` parameter, and can stop pykhtml registering its exception handler (the [[excepthook]] function) by setting `registerExceptionHandler` to False. 
""" global initSuccessful if not initSuccessful: # check that Xvfb is in the path if useXvfb and pathSearch("Xvfb"): global xvfb if not xvfb: + import subprocess, time # start xvfb xvfb = subprocess.Popen(["Xvfb", ":%s" % display, "-ac", "-screen", "0", "640x480x8"], stderr=subprocess.PIPE) xvfb.display = ":%s" % display @@ -115,8 +145,12 @@ usingXvfb = True else: usingXvfb = False + if _supressQtDebug: + qt.qInstallMsgHandler(_displayQtDebug) _startKApplication() initSuccessful = True + if registerExceptionHandler: + sys.excepthook = excepthook def startEventLoop(): """ Starts the PyKHTML event loop. PyKHTML works with an asynchronous callback mechanism -- a little like Twisted does. Calls to open a new webpage aren't synchronous, as with urllib, for example. """ @@ -154,6 +188,7 @@ """ Create a new Browser """ init() self.part = khtml.KHTMLPart(dialog) + qt.QObject.connect(self.part.browserExtension(), qt.SIGNAL("openURLRequestDelayed(const KURL &, const KParts::URLArgs &)"), self._openURLRequest) if debugWithGUI: dialog.layout.addWidget(self.part.view()) self.part.show() @@ -167,6 +202,9 @@ self.loadFunction = None self._passReferenceToCallbacks = True + def _openURLRequest(self, url, urlArgs): + self.location = str(url.url()) + def _setPassReferenceToCallbacks(self, b): self._referencelessCallbacks = b def _getPassReferenceToCallbacks(self): @@ -181,18 +219,17 @@ def load(self, uri, callback): """ Load a webpage in the browser. It takes as parameters the URI of the page to load, and a callable object to call when the page has loaded. This callback will be given the browser object as a reference unless you set [[Browser.referencelessCallbacks]] to True """ - if self.loadFunction: - self.disconnect(self.part, qt.SIGNAL("docCreated()"). 
self._slotDocCreated) - self.loadFunction = callback - self.connect(self.part, qt.SIGNAL("docCreated()"), self._slotDocCreated) + self.onNextLoad = callback self.location = uri - def _setOnLoadHandler(self, callback): - """ If a function is going to be called that changes the URL and we want a callback to be called when the page is loaded, this is the method for you. Does kinda what load does, without loading the page """ + def _setOnNextLoad(self, callback): if self.loadFunction: self.disconnect(self.part, qt.SIGNAL("docCreated()"). self._slotDocCreated) self.loadFunction = callback self.connect(self.part, qt.SIGNAL("docCreated()"), self._slotDocCreated) + def _getOnNextLoad(self): + return self.loadFunction + onNextLoad = property(_getOnNextLoad, _setOnNextLoad, None, "If you're going to do something that will inadvertently cause PyKHTML to browse to a new page and you want a function to be called when the page is loaded, set onNextLoad to the function") def _slotDocCreated(self): self.part.executeScript(DOM.Node(), "window.alert = function() {}") @@ -206,7 +243,7 @@ func = partial(func, self) # do this so the DOM loads fully. Cast to an Element -- not strictly correct, but we just want to get to addEvent. # XX why not just put addEvent in Node and make Document inherit from Node? Document IS meant to be a Node, after all. - dom.Element(self.document._d, self).addEvent("load", func) + dom.Node(self.document._d, self).addEvent("load", func) def setHtml(self, source, url=None): """ Set the HTML of the browser. Parses the HTML and generates the DOM tree so you can navigate it as usual. As well as the `source` parameter, a `url` parameter allows you to specify a URL with which this source code is linked so that e.g any scripts/images referenced in the HTML will be found. 
""" === modified file 'pykhtml/dom.py' --- pykhtml/dom.py 2007-03-05 16:00:30 +0000 +++ pykhtml/dom.py 2007-03-06 22:25:50 +0000 @@ -19,33 +19,54 @@ +class ElementNotFoundError(Exception): + pass + + class Node(object): """ Node and all of its subclasses provide you with access to the page's DOM. Instantiating the classes themselves won't do you much good. """ def __init__(self, cNode, browser): self._ = cNode self.browser = browser object.__init__(self) - self.__children = None + #n = self._.childNodes() + #for j in xrange(n.length()): + #print " | %s" % n.item(j) + #print self._.childNodes().length() + + def __iter__(self): + """ Iterate over child nodes """ + elements = self._.childNodes() + for i in xrange(elements.length()): + yield nodeCast(elements.item(i), self.browser) + + @property + def children(self): + """ Get a list containing the children nodes of this node. If you want to loop over children just loop over this node as with any regular list -- it is more memory efficient that way. """ + l = [] + elements = self._.childNodes() + for i in xrange(elements.length()): + l.append(nodeCast(elements.item(i), self.browser)) + return l + + @property + def childNodes(self): + """ For those that can't live without JavaScript DOM-compatible method names """ + return self.children def isA(self, klass): """ Syntactic sugar for isinstance """ return isinstance(self, klass) - @property - def children(self): - """ Get the children nodes of this node """ - # cache the value - if self.__children is None: - l = self.__children = [] - elements = self._.childNodes() - for i in xrange(elements.length()): - n = elements.item(i) - l.append(nodeCast(n, self.browser)) - return self.__children - - def childNodes(self): - """ For those that can't live without JavaScript DOM-compatible method names """ - return self.children() + def addEvent(self, eventName, func, capture=False): + """ This lets you listen for certain events as they occur on the current element. 
Only particularly useful when listening for load events reaaally. """ + listener = _CallbackEventListener(eventName, func) + self._.addEventListener(DOMString(eventName), listener, capture) + + def removeEvent(self, eventName, func, capture=False): + """ Removes events that you've added with [[Element.addEvent]] """ + self._.removeEventListener(DOMString(eventName), _CallbackEventListener.getCallbackInstance(eventName, func), capture) + _CallbackEventListener.remove(eventName, func) class Text(Node): @@ -56,20 +77,63 @@ def __str__(self): return self.value + def __unicode__(self): + """ [[Text.value]] converts the string to a regular string via str(). This gives the unicode version """ + return unicode(self._.nodeValue().string()) + def __repr__(self): return self.value @property def value(self): - """ Equivalent to str(textNode). Get the string this node represents """ - return str(self._.nodeValue().string()) + """ Equivalent to unicode(textNode). Get the string this node represents """ + return unicode(self._.nodeValue().string()) registerNode(3, Text) class Element(Node): - """ An HTML element. Instances of it provide methods for doing things with the element -- traversing it, adding events, etc. """ - def __init__(self, cElement, browser): - Node.__init__(self, cElement, browser) + """ An HTML element. It provides methods for: + + Managing attributes using the [] operator (to get/set/delete an attribute) and [[Element.attributes]] to get a dictionary containing attribute names/values + + Traversing into children nodes with [[Element.getElementsByTagName]] and [[Element.getElementsByClass]] (as well as [[Node.children]], of course) + + Accessing its tag name with [[Element.tagName]] + + Accessing and modifying the HTML markup inside the element via [[Element.innerHTML]] + And more. 
""" + def __init__(self, cElement, browser, cast=_DOM.HTMLElement): + Node.__init__(self, cast(cElement), browser) + + def __getitem__(self, name): + """ Get the value of the attribute with the given name """ + #print "Getitem on:", self._ + #print self._.tagName() + #return "foo" + return str(self._.getAttribute(DOMString(name)).string()) + + def __setitem__(self, name, value): + """ Set the value of the attribute with the given name """ + self._.setAttribute(DOMString(name), DOMString(value)) + + def __delitem__(self, name): + """ Delete an attribute """ + dName = DOMString(name) + if self._.hasAttribute(dName): + self._.removeAttribute(dName) + else: + raise AttributeError("No attribute %s exists" % repr(name)) + + def hasAttribute(self, name): + """ Check whether an attribute exists in this element """ + return self._.hasAttribute(DOMString(name)) + + @property + def attributes(self): + nodeMap = self._.attributes() + d = {} + for i in xrange(nodeMap.length()): + item = nodeMap.item(i) + key = str(item.nodeName().string()) + value = str(item.nodeValue().string()) + d[key] = value + return d @property def text(self): @@ -78,14 +142,9 @@ if not len(children): return None if isinstance(children[0], Text): - return children[0].value - - ## DOM things - def getElementById(self, id): - """ Get a reference to an element in the page by its id attribute """ - n = self._.getElementById(DOMString(id)) - return nodeCast(n, self.browser) - + return unicode(children[0]) + + ## DOM things def getElementsByTagName(self, name): """ Get elements by tag name. 
Returns a generator that you can loop over or flatten into a list with list() """ elements = self._.getElementsByTagName(DOMString(name)) @@ -108,6 +167,24 @@ e = elements.item(i) yield nodeCast(e, self.browser) + def _getId(self): + return str(self._.id().string()) + def _setId(self, val): + self._.setId(DOMString(val)) + id = property(_getId, _setId, None, "The ID of this element") + + def _getClassName(self): + return str(self._.className().string()) + def _setClassName(self, val): + self._.setClassName(DOMString(val)) + className = property(_getClassName, _setClassName, None, "The class name of this element") + + def _getInnerHTML(self): + return str(self._.innerHTML().string()) + def _setInnerHTML(self, val): + self._.setInnerHTML(DOMString(val)) + innerHTML = property(_getInnerHTML, _setInnerHTML, None, "The HTML markup inside of this element. Notice that this property is writable, so you can change the markup if need be") + @property def tagName(self): """ Get the lowercase name of this tag """ @@ -115,27 +192,16 @@ @property def originalTagName(self): - """ Like tagName but this won't convert things to lower. Only really useful if you're dealing with XML """ + """ Like tagName but this won't convert things to lower. Only really useful if you're dealing with XML, which probably isn't the case anyway! """ return str(e.nodeName().string()) - - def addEvent(self, eventName, func, capture=False): - """ This lets you listen for certain events as they occur on the current element. Only particularly useful when listening for load events reaaally. 
""" - listener = _CallbackEventListener(eventName, func) - self._.addEventListener(DOMString(eventName), listener, capture) - - def removeEvent(self, eventName, func, capture=False): - """ Removes events that you've added with [[Element.addEvent]] """ - self._.removeEventListener(DOMString(eventName), _CallbackEventListener.getCallbackInstance(eventName, func), capture) - _CallbackEventListener.remove(eventName, func) # -- important, we hook to the method not Element base class registerNode(1, elementCast) class Anchor(Element): """ Anchor elements with an [[Anchor.href]] property """ - def __init__(self, cAnchor, browser): - Element.__init__(self, cAnchor, browser) - self._ = sip.cast(self._, _DOM.HTMLAnchorElement) + def __init__(self, cAnchor, browser, cast=_DOM.HTMLAnchorElement): + Element.__init__(self, cAnchor, browser, cast) @property def href(self): @@ -146,9 +212,8 @@ class Form(Element): """ Form elements contain input elements. You can submit a form with [[Form.submit]] """ - def __init__(self, cForm, browser): - Element.__init__(self, cForm, browser) - self._ = sip.cast(self._, _DOM.HTMLFormElement) + def __init__(self, cForm, browser, cast=_DOM.HTMLFormElement): + Element.__init__(self, cForm, browser, cast) def _getAction(self): return str(self._.action().string()) @@ -169,22 +234,41 @@ # submit self._.submit() - def submit(self): - """ Submit the form to the page specified in the action """ - self._.submit() + def reset(self): + """ Reset the data in the form """ + self._.reset() registerElement("FORM", Form) class Input(Element): """ Input elements that you can set the value/name of """ - def __init__(self, cInput, browser): - Element.__init__(self, cInput, browser) - self._ = sip.cast(self._, _DOM.HTMLInputElement) + def __init__(self, cInput, browser, cast=_DOM.HTMLInputElement): + Element.__init__(self, cInput, browser, cast) @property def form(self): """ Get the form element this input resides in """ return Form(self._.form()) + + def 
_getName(self): + return str(self._.name().string()) + def _setName(self, name): + self._.setName(DOMString(name)) + name = property(_getName, _setName, None, "The HTML name of this input") + + def click(self): + """ Simulate a click on the input element. If this is a submit button, for example, this clicks it. """ + self._.click() + + def focus(self): + """ Focus the input """ + self._.focus() + + def _getValue(self): + return str(self._.value().string()) + def _setValue(self, value): + self._.setValue(DOMString(value)) + value = property(_getValue, _setValue, None, "The value of this input -- for example, for textareas and inputs of type 'text', this is its content. For buttons, this is the text inside the button, etc.") registerElement("INPUT", Input) #class Title(Element): @@ -203,13 +287,17 @@ @property def serialized(self): """ Return a string that represents the DOM structure of this document, much like what is returned via innerHTML in JavaScript """ - return str(self._d.toString().string()) + return unicode(self._d.toString().string()) ## DOM things def getElementById(self, id): """ Get a reference to an element in the page by its id attribute """ n = self._d.getElementById(DOMString(id)) - return nodeCast(n, self.browser) + # no element with the given ID + if n.isNull(): + raise ElementNotFoundError("No element with an ID of '%s' found" % id) + cast = nodeCast(n, self.browser) + return cast def getElementsByTagName(self, name): """ Get elements by tag name. 
Returns a generator that you can loop over or flatten into a list with list() """ @@ -240,7 +328,7 @@ If the `stripSpace` attribute is True, when searching for a string match all whitespace is stripped from the item we are matching against """ _dlinks = self._d.links() possibleLinks = [_dlinks.item(i) for i in xrange(_dlinks.length())] - print "possible links:", len(possibleLinks) + #print "possible links:", len(possibleLinks) correctLink = None # check for text first if text is not None: === modified file 'todo.txt' --- todo.txt 2007-02-15 20:31:06 +0000 +++ todo.txt 2007-03-06 22:25:50 +0000 @@ -2,7 +2,7 @@ * PyKHTML Todo List * ********************** - + distutils-based installer + + Event dispatching (document.createEvent, initMouseEvent with type "click", then dispatch on an element) + Make the examples available online + Big friendly 'Download (latest)' button + Add Contact section to the website