=== modified file '.bzrignore'
--- .bzrignore 2007-03-05 15:17:59 +0000
+++ .bzrignore 2007-03-06 22:25:50 +0000
@@ -1,3 +1,4 @@
+examples/test.py
build
*.pyc
makedocs.py
=== modified file 'doc/pykhtml.dom.htm'
--- doc/pykhtml.dom.htm 2007-03-05 16:00:30 +0000
+++ doc/pykhtml.dom.htm 2007-03-06 22:25:50 +0000
@@ -14,7 +14,9 @@
pykhtml.dom
-
+
+
Document object for accessing the DOM tree. Don't keep this object around, when the page changes it is invalidated. Just access it through
pykhtml.Browser.document whenever you want it.
Get a reference to an element in the page by its id attribute.
@@ -22,86 +24,130 @@
Get elements by tag name. Returns a generator that you can loop over or flatten into a list with list().
Get elements by tag name given a certain namespace.
Return a string that represents the DOM structure of this document, much like what is returned via innerHTML in JavaScript.
-
Visit a page pointed to by a certain link. This function searches for all links in the document that either:
- Match the given text (as a string or regular expression object)
- Match the attributes given (a dictionary mapping attribute name to attribute value, where the value is again either a string or regular expression object)
If the `stripSpace` attribute is True, when searching for a string match all whitespace is stripped from the item we are matching against.
+
Visit a page pointed to by a certain link. This function searches for all links in the document that either:
- Match the given text (as a string or regular expression object)
- Match the attributes given (a dictionary mapping attribute name to attribute value, where the value is again either a string or regular expression object)
If the `stripSpace` attribute is True, when searching for a string match all whitespace is stripped from the item we are matching against.
Node and all of its subclasses provide you with access to the page's DOM. Instantiating the classes themselves won't do you much good.
-
For those that can't live without JavaScript DOM-compatible method names.
-
Get the children nodes of this node.
+
Iterate over child nodes.
+
This lets you listen for certain events as they occur on the current element. Only particularly useful when listening for load events reaaally.
+
For those that can't live without JavaScript DOM-compatible method names.
+
Get a list containing the children nodes of this node. If you want to loop over children just loop over this node as with any regular list – it is more memory efficient that way.
Syntactic sugar for isinstance.
+
-
An HTML element. Instances of it provide methods for doing things with the element – traversing it, adding events, etc.
-
-
This lets you listen for certain events as they occur on the current element. Only particularly useful when listening for load events reaaally.
-
For those that can't live without JavaScript DOM-compatible method names.
-
Get the children nodes of this node.
-
Get a reference to an element in the page by its id attribute.
+
An HTML element. It provides methods for:
And more.
+
+
+Get the value of the attribute with the given name.
+Iterate over child nodes.
+Set the value of the attribute with the given name.
+This lets you listen for certain events as they occur on the current element. Only particularly useful when listening for load events reaaally.
+
+For those that can't live without JavaScript DOM-compatible method names.
+Get a list containing the children nodes of this node. If you want to loop over children just loop over this node as with any regular list – it is more memory efficient that way.
+The class name of this element.
Get elements in the document (optionally with a given tag name) that have a certain class.
Get elements by tag name. Returns a generator that you can loop over or flatten into a list with list().
Get elements by tag name given a certain namespace.
+Check whether an attribute exists in this element.
+
+The HTML markup inside of this element. Notice that this property is writable, so you can change the markup if need be.
Syntactic sugar for isinstance.
-Like tagName but this won't convert things to lower. Only really useful if you're dealing with XML.
-
+Like tagName but this won't convert things to lower. Only really useful if you're dealing with XML, which probably isn't the case anyway!
+
Get the lowercase name of this tag.
If the next child of this element is a text node, this will return the text value of that node.
-
-This lets you listen for certain events as they occur on the current element. Only particularly useful when listening for load events reaaally.
-For those that can't live without JavaScript DOM-compatible method names.
-Get the children nodes of this node.
-Get a reference to an element in the page by its id attribute.
+
+
+Get the value of the attribute with the given name.
+Iterate over child nodes.
+Set the value of the attribute with the given name.
+This lets you listen for certain events as they occur on the current element. Only particularly useful when listening for load events reaaally.
+
+For those that can't live without JavaScript DOM-compatible method names.
+Get a list containing the children nodes of this node. If you want to loop over children just loop over this node as with any regular list – it is more memory efficient that way.
+The class name of this element.
Get elements in the document (optionally with a given tag name) that have a certain class.
Get elements by tag name. Returns a generator that you can loop over or flatten into a list with list().
Get elements by tag name given a certain namespace.
+Check whether an attribute exists in this element.
The anchor's 'href' value. Returns the full URL pointed to by the href attribute of this element.
+
+The HTML markup inside of this element. Notice that this property is writable, so you can change the markup if need be.
Syntactic sugar for isinstance.
-Like tagName but this won't convert things to lower. Only really useful if you're dealing with XML.
-
+Like tagName but this won't convert things to lower. Only really useful if you're dealing with XML, which probably isn't the case anyway!
+
Get the lowercase name of this tag.
If the next child of this element is a text node, this will return the text value of that node.
Form elements contain input elements. You can submit a form with
Form.submit.
-
+
+
+Get the value of the attribute with the given name.
+Iterate over child nodes.
+Set the value of the attribute with the given name.
The method with which this form is to be submitted (GET, POST, etc).
-This lets you listen for certain events as they occur on the current element. Only particularly useful when listening for load events reaaally.
-For those that can't live without JavaScript DOM-compatible method names.
-Get the children nodes of this node.
-Get a reference to an element in the page by its id attribute.
+This lets you listen for certain events as they occur on the current element. Only particularly useful when listening for load events reaaally.
+
+For those that can't live without JavaScript DOM-compatible method names.
+Get a list containing the children nodes of this node. If you want to loop over children just loop over this node as with any regular list – it is more memory efficient that way.
+The class name of this element.
Get elements in the document (optionally with a given tag name) that have a certain class.
Get elements by tag name. Returns a generator that you can loop over or flatten into a list with list().
Get elements by tag name given a certain namespace.
+Check whether an attribute exists in this element.
+
+The HTML markup inside of this element. Notice that this property is writable, so you can change the markup if need be.
Syntactic sugar for isinstance.
-Like tagName but this won't convert things to lower. Only really useful if you're dealing with XML.
-
-Submit the form to the page specified in the action.
+Like tagName but this won't convert things to lower. Only really useful if you're dealing with XML, which probably isn't the case anyway!
+
+Reset the data in the form.
+Submit the form to the page specified in the action. The callback given is like one you would pass to pykhtml.Browser.load.
Get the lowercase name of this tag.
If the next child of this element is a text node, this will return the text value of that node.
Input elements that you can set the value/name of.
-
-This lets you listen for certain events as they occur on the current element. Only particularly useful when listening for load events reaaally.
-For those that can't live without JavaScript DOM-compatible method names.
-Get the children nodes of this node.
+
+
+Get the value of the attribute with the given name.
+Iterate over child nodes.
+Set the value of the attribute with the given name.
+This lets you listen for certain events as they occur on the current element. Only particularly useful when listening for load events reaaally.
+
+For those that can't live without JavaScript DOM-compatible method names.
+Get a list containing the children nodes of this node. If you want to loop over children just loop over this node as with any regular list – it is more memory efficient that way.
+The class name of this element.
+Simulate a click on the input element. If this is a submit button, for example, this clicks it.
+
Get the form element this input resides in.
-Get a reference to an element in the page by its id attribute.
Get elements in the document (optionally with a given tag name) that have a certain class.
Get elements by tag name. Returns a generator that you can loop over or flatten into a list with list().
Get elements by tag name given a certain namespace.
+Check whether an attribute exists in this element.
+
+The HTML markup inside of this element. Notice that this property is writable, so you can change the markup if need be.
Syntactic sugar for isinstance.
-Like tagName but this won't convert things to lower. Only really useful if you're dealing with XML.
-
+The HTML name of this input.
+Like tagName but this won't convert things to lower. Only really useful if you're dealing with XML, which probably isn't the case anyway!
+
Get the lowercase name of this tag.
If the next child of this element is a text node, this will return the text value of that node.
+The value of this input – for example, for textareas and inputs of type 'text', this is its content. For buttons, this is the text inside the button, etc.
A text node lets you access the text in it using the
Text.value attribute or by converting to a string with str().
+Iterate over child nodes.
-For those that can't live without JavaScript DOM-compatible method names.
-Get the children nodes of this node.
+Text.value converts the string to a regular string via str(). This gives the unicode version.
+This lets you listen for certain events as they occur on the current element. Only particularly useful when listening for load events reaaally.
+For those that can't live without JavaScript DOM-compatible method names.
+Get a list containing the children nodes of this node. If you want to loop over children just loop over this node as with any regular list – it is more memory efficient that way.
Syntactic sugar for isinstance.
-Equivalent to str(textNode). Get the string this node represents.
+
+Equivalent to unicode(textNode). Get the string this node represents.
=== modified file 'doc/pykhtml.htm'
--- doc/pykhtml.htm 2007-03-05 16:00:30 +0000
+++ doc/pykhtml.htm 2007-03-06 22:25:50 +0000
@@ -20,16 +20,19 @@
Get a reference to the document (see
dom.Document) for the currently loaded page. It contains all the tasty methods for walking the DOM tree like getElementById / getElementsByTagName, and methods for browsing to other linked pages.
Load a webpage in the browser. It takes as parameters the URI of the page to load, and a callable object to call when the page has loaded. This callback will be given the browser object as a reference unless you set Browser.referencelessCallbacks to True.
Browse to a new location. You probably don't want to set this directly as you'll receive no notification when the page has loaded. Have a look at
Browser.load instead.
+If you're going to do something that will inadvertently cause PyKHTML to browse to a new page and you want a function to be called when the page is loaded, set onNextLoad to the function.
Set whether callbacks passed to functions such as
Browser.load or
dom.Document.visit will have a reference to this browser object passed as a parameter. Default is True.
Set the HTML of the browser. Parses the HTML and generates the DOM tree so you can navigate it as usual. As well as the `source` parameter, a `url` parameter allows you to specify a URL with which this source code is linked so that e.g any scripts/images referenced in the HTML will be found.
+
+
Partial application of parameters. This is used internally but is also very useful with
Browser.load as it allows you to pass data to other functions.
Use is as follows:
>>> def func(a, b):
... print "func:", a, b
...
>>> func2 = pykhtml.partial(func, "foo")
>>> func2("bar!")
func: foo bar!
+Create a new functor that – when called – will call the given function, passing any extra arguments / keyword-arguments that you specify.
+
Starts the PyKHTML event loop. PyKHTML works with an asynchronous callback mechanism – a little like Twisted does. Calls to open a new webpage aren't synchronous, as with urllib, for example.
Stop the event loop and hence exit the scraper.
Call the given function after the alloted time. The PyKHTML event loop needs to be running.
-Initiate the system if necessary (start Xvfb if it's not running, connect to it, start our program instance). This is called automatically when you create a Browser instance, so you shouldn't have to worry about it. You can specify use of a certain display by setting the `display` parameter.
-
-
Partial application of parameters. This is used internally but is also very useful with
Browser.load as it allows you to pass data to other functions.
(Provide Example).
-
+Initiate the system if necessary (start Xvfb if it's not running, connect to it, start our program instance). This is called automatically when you create a Browser instance, so you shouldn't have to worry about it unless you want to set some of the values of the arguments. You can specify use of a certain X display by setting the `display` parameter, and can stop pykhtml registering its exception handler (the
excepthook function) by setting `registerExceptionHandler` to False.
+Our exception hook that prints out the traceback, powers down the pykhtml engine, and then exits.
Utility function to search for and get the full path of a file in $PATH.
Check whether a process of the given name is running.
=== modified file 'doc/styles.css'
--- doc/styles.css 2007-02-10 20:22:22 +0000
+++ doc/styles.css 2007-03-06 22:25:50 +0000
@@ -132,4 +132,15 @@
a.reference:hover {
color: darkgreen;
text-shadow: #ddd 1px 1px 2px;
+}
+
+h3, h4 {
+margin-bottom: 0.4em;
+}
+
+.doc {
+margin-left: 0.05em;
+}
+.cls .doc {
+margin-left: 1em;
}
\ No newline at end of file
=== modified file 'examples/dynamicdom.py'
--- examples/dynamicdom.py 2007-03-05 16:00:30 +0000
+++ examples/dynamicdom.py 2007-03-06 22:25:50 +0000
@@ -45,7 +45,7 @@
# load our markup
browser.setHtml(page)
# to see if the DOM has changed we poll and access
- # browser.document.serialized and compare its to
+ # browser.document.serialized and compare it to
# the previous calling. Note the use of pykhtml.partial
# to bind the serialised data to the function
pykhtml.timer(1, pykhtml.partial(checkForDomChanges, browser, browser.document.serialized))
=== modified file 'examples/pykhtmlsite.py'
--- examples/pykhtmlsite.py 2007-02-12 23:13:40 +0000
+++ examples/pykhtmlsite.py 2007-03-06 22:25:50 +0000
@@ -5,24 +5,23 @@
import pykhtml
-PyKHTMLUrl = "http://paul.giannaros.org/pykhtml/"
+PyKHTMLUrl = "http://paul.giannaros.org/pykhtml"
def extractBitsFromPage(browser):
# getElementsByTagName returns a generator, so we convert
# to a list and access the first element
title = list(browser.document.getElementsByTagName("title"))[0]
print "Title:", title.text
- # Get the navigation
+ # Get the text of the navigation items
navigation = []
- for item in browser.document.getElementById("navigation").children:
- # if this child item is an element (as opposed to
- # a text node or whatever) and its tag name is 'li'
- # (i.e if it's a list item)
- if isinstance(item, pykhtml.dom.Element) and item.tagName == "li":
- # Add the text contents of the list item's first
- # child to our list
- anchor = item.children[0]
- navigation.append(anchor.text)
+ # First get the container of the list items...
+ navigationElement = browser.document.getElementById("navigation")
+ # ... and then loop over the li elements we find
+ for listItem in navigationElement.getElementsByTagName("li"):
+ # Inside the list item is an anchor
+ anchor = listItem.children[0]
+ # And the text inside the anchor is what we want
+ navigation.append(anchor.text)
print "Navigation:", " | ".join(navigation)
# Stop here, we're done
pykhtml.stopEventLoop()
=== modified file 'makedocs.py'
--- makedocs.py 2007-03-05 15:17:59 +0000
+++ makedocs.py 2007-03-06 22:25:50 +0000
@@ -8,8 +8,6 @@
"""
import inspect, os, sys
-# pil stuff for inheritance diagrams
-import ImageFont, ImageDraw, ImageChops, Image
docDirectory = "doc"
@@ -21,15 +19,17 @@
moduleForceFirstItems = {
"pykhtml" : [
"Browser",
+ "partial",
"startEventLoop",
"stopEventLoop",
"timer",
"init"
],
"pykhtml.dom" : [
+ "ElementNotFoundError",
"Document",
"Node",
- "Element"
+ "Element",
]
}
@@ -37,7 +37,13 @@
"__init__",
"__del__",
"__str__",
- "__repr__"
+ "__repr__",
+ "__unicode__",
+ "__iter__",
+ "__getitem__",
+ "__setitem__",
+ "__delitem__",
+ "__call__",
]
#fontPath = "arial.ttf"
@@ -46,24 +52,20 @@
classToClassList = {}
# dirty hack so that qt/kdecore/khtml aren't loaded when pykhtml calls 'import ...'
-sys.modules["kdecore"] = {}
-# fake the KHTML interface
-class Dummy(object):
- pass
-root = Dummy()
-dom = Dummy()
-dom.DOMString = None
-dom.EventListener = int
-root.DOM = dom
-root.QTimer = int
-sys.modules["khtml"] = sys.modules["qt"] = root
+#sys.modules["kdecore"] = {}
+## fake the KHTML interface
+#class Dummy(object):
+ #def __getattr__(self, name):
+ #return None
+#root = Dummy()
+#dom = Dummy()
+#dom.DOMString = None
+#dom.EventListener = int
+#root.DOM = dom
+#root.QTimer = int
+#sys.modules["khtml"] = sys.modules["qt"] = root
# end dirty hack
-def autocrop(image, backgroundColor=(255, 255, 255)):
- """ crop an image based on alpha. """
- background = Image.new("RGB", image.size, backgroundColor)
- return image.crop(ImageChops.difference(image, background).getbbox())
-
#def getDiagram(cls):
#fontSize = 12
## ....
@@ -116,11 +118,8 @@
def functionSignature(func):
name = func.__name__
- args = list(func.func_code.co_varnames[:func.func_code.co_argcount])
- for i in xrange(len(func.func_defaults or [])):
- val = func.func_defaults[i]
- args[-(i + 1)] += "=%s" % repr(val)
- return "%s(%s)" % (name, ", ".join(args))
+ s = "%s%s" % (name, inspect.formatargspec(*inspect.getargspec(func)))
+ return s.replace("<", "&lt;").replace(">", "&gt;")
def propertySignature(prop, name):
signature = "%s (read-only property)" % name
@@ -233,8 +232,10 @@
s = s.replace("\n", "
")
# and two dashes to an en dash
s = s.replace(" -- ", " – ")
- # and finally (and most importantly), add a full stop at the end
- s = "%s." % s.rstrip(".")
+ # and finally (and most importantly), add a full stop at the end if there is no other punctuation present
+ s = s.strip()
+ if s and s[-1] not in ("!", ".", "?"):
+ s = "%s." % s
return s
def write(self, directory):
@@ -353,10 +354,7 @@
doneItems = {}
generator = GenerateHtml()
for moduleName in moduleNames:
- module = __import__(moduleName)
- if moduleName.count("."):
- subModuleName = moduleName.split(".")[1]
- module = _getattr(module, subModuleName)
+ module = myImport(moduleName)
moduleDoc = getDoc(module)
itemsToExamine = []
documentedItems = []
=== modified file 'pykhtml/__init__.py'
--- pykhtml/__init__.py 2007-03-05 16:00:30 +0000
+++ pykhtml/__init__.py 2007-03-06 22:25:50 +0000
@@ -1,5 +1,5 @@
-import sys, os, subprocess, time, tempfile
+import sys, os, tempfile
import khtml, kdecore
import sip # cast
#from khtml.DOM import DOMString
@@ -22,9 +22,17 @@
class partial:
""" Partial application of parameters. This is used internally but is also very useful with [[Browser.load]] as it allows you to pass data to other functions.
- (Provide Example). """
- def __init__(self, fun, *args, **kwargs):
- self.fun = fun
+ Use is as follows:
+
+ >>> def func(a, b):
+ ... print "func:", a, b
+ ...
+ >>> func2 = pykhtml.partial(func, "foo")
+ >>> func2("bar!")
+ func: foo bar! """
+ def __init__(self, func, *args, **kwargs):
+ """ Create a new functor that -- when called -- will call the given function, passing any extra arguments / keyword-arguments that you specify """
+ self.fun = func
self.pending = args[:]
self.kwargs = kwargs.copy()
@@ -77,25 +85,47 @@
os_path_join = os.path.join
return [os_path_join(x, name) for x in path if os_path_exists(os_path_join(x, name))]
+class _MyDialog(qt.QDialog):
+ def closeEvent(self, e):
+ # stops a crash...
+ e.accept()
+ stopEventLoop()
+
def _startKApplication():
global application, dialog
- kdecore.KCmdLineArgs.init(sys.argv, "PyKHTML", "PyKHTML Library", "9.9")
+ kdecore.KCmdLineArgs.init(sys.argv[:1], "PyKHTML", "PyKHTML Library", "9.9")
application = kdecore.KApplication()
# the widget that will host the KHTMLParts
- dialog = qt.QDialog(None)
+ dialog = _MyDialog(None)
application.setMainWidget(dialog)
if debugWithGUI:
dialog.show()
dialog.layout = qt.QVBoxLayout(dialog)
-def init(display=1, _sleep=1):
- """ Initiate the system if necessary (start Xvfb if it's not running, connect to it, start our program instance). This is called automatically when you create a Browser instance, so you shouldn't have to worry about it. You can specify use of a certain display by setting the `display` parameter. """
+def excepthook(type, value, trace):
+ """ Our exception hook that prints out the traceback, powers down the pykhtml engine, and then exits """
+ import traceback
+ traceback.print_exception(type, value, trace)
+ if initSuccessful:
+ stopEventLoop()
+ sys.exit()
+
+def _displayQtDebug(messageType, message):
+ """ Suppress QObject::connect stuff """
+ if messageType == qt.QtWarningMsg and message.startswith("QObject::connect") or message.startswith("QObject::disconnect"):
+ pass
+ else:
+ sys.stderr.write("Qt debug: %s\n" % message)
+
+def init(display=1, registerExceptionHandler=True, _sleep=1, _supressQtDebug=True):
+ """ Initiate the system if necessary (start Xvfb if it's not running, connect to it, start our program instance). This is called automatically when you create a Browser instance, so you shouldn't have to worry about it unless you want to set some of the values of the arguments. You can specify use of a certain X display by setting the `display` parameter, and can stop pykhtml registering its exception handler (the [[excepthook]] function) by setting `registerExceptionHandler` to False. """
global initSuccessful
if not initSuccessful:
# check that Xvfb is in the path
if useXvfb and pathSearch("Xvfb"):
global xvfb
if not xvfb:
+ import subprocess, time
# start xvfb
xvfb = subprocess.Popen(["Xvfb", ":%s" % display, "-ac", "-screen", "0", "640x480x8"], stderr=subprocess.PIPE)
xvfb.display = ":%s" % display
@@ -115,8 +145,12 @@
usingXvfb = True
else:
usingXvfb = False
+ if _supressQtDebug:
+ qt.qInstallMsgHandler(_displayQtDebug)
_startKApplication()
initSuccessful = True
+ if registerExceptionHandler:
+ sys.excepthook = excepthook
def startEventLoop():
""" Starts the PyKHTML event loop. PyKHTML works with an asynchronous callback mechanism -- a little like Twisted does. Calls to open a new webpage aren't synchronous, as with urllib, for example. """
@@ -154,6 +188,7 @@
""" Create a new Browser """
init()
self.part = khtml.KHTMLPart(dialog)
+ qt.QObject.connect(self.part.browserExtension(), qt.SIGNAL("openURLRequestDelayed(const KURL &, const KParts::URLArgs &)"), self._openURLRequest)
if debugWithGUI:
dialog.layout.addWidget(self.part.view())
self.part.show()
@@ -167,6 +202,9 @@
self.loadFunction = None
self._passReferenceToCallbacks = True
+ def _openURLRequest(self, url, urlArgs):
+ self.location = str(url.url())
+
def _setPassReferenceToCallbacks(self, b):
self._referencelessCallbacks = b
def _getPassReferenceToCallbacks(self):
@@ -181,18 +219,17 @@
def load(self, uri, callback):
""" Load a webpage in the browser. It takes as parameters the URI of the page to load, and a callable object to call when the page has loaded. This callback will be given the browser object as a reference unless you set [[Browser.referencelessCallbacks]] to True """
- if self.loadFunction:
- self.disconnect(self.part, qt.SIGNAL("docCreated()"). self._slotDocCreated)
- self.loadFunction = callback
- self.connect(self.part, qt.SIGNAL("docCreated()"), self._slotDocCreated)
+ self.onNextLoad = callback
self.location = uri
- def _setOnLoadHandler(self, callback):
- """ If a function is going to be called that changes the URL and we want a callback to be called when the page is loaded, this is the method for you. Does kinda what load does, without loading the page """
+ def _setOnNextLoad(self, callback):
if self.loadFunction:
self.disconnect(self.part, qt.SIGNAL("docCreated()"). self._slotDocCreated)
self.loadFunction = callback
self.connect(self.part, qt.SIGNAL("docCreated()"), self._slotDocCreated)
+ def _getOnNextLoad(self):
+ return self.loadFunction
+ onNextLoad = property(_getOnNextLoad, _setOnNextLoad, None, "If you're going to do something that will inadvertently cause PyKHTML to browse to a new page and you want a function to be called when the page is loaded, set onNextLoad to the function")
def _slotDocCreated(self):
self.part.executeScript(DOM.Node(), "window.alert = function() {}")
@@ -206,7 +243,7 @@
func = partial(func, self)
# do this so the DOM loads fully. Cast to an Element -- not strictly correct, but we just want to get to addEvent.
# XX why not just put addEvent in Node and make Document inherit from Node? Document IS meant to be a Node, after all.
- dom.Element(self.document._d, self).addEvent("load", func)
+ dom.Node(self.document._d, self).addEvent("load", func)
def setHtml(self, source, url=None):
""" Set the HTML of the browser. Parses the HTML and generates the DOM tree so you can navigate it as usual. As well as the `source` parameter, a `url` parameter allows you to specify a URL with which this source code is linked so that e.g any scripts/images referenced in the HTML will be found. """
=== modified file 'pykhtml/dom.py'
--- pykhtml/dom.py 2007-03-05 16:00:30 +0000
+++ pykhtml/dom.py 2007-03-06 22:25:50 +0000
@@ -19,33 +19,54 @@
+class ElementNotFoundError(Exception):
+ pass
+
+
class Node(object):
""" Node and all of its subclasses provide you with access to the page's DOM. Instantiating the classes themselves won't do you much good. """
def __init__(self, cNode, browser):
self._ = cNode
self.browser = browser
object.__init__(self)
- self.__children = None
+ #n = self._.childNodes()
+ #for j in xrange(n.length()):
+ #print " | %s" % n.item(j)
+ #print self._.childNodes().length()
+
+ def __iter__(self):
+ """ Iterate over child nodes """
+ elements = self._.childNodes()
+ for i in xrange(elements.length()):
+ yield nodeCast(elements.item(i), self.browser)
+
+ @property
+ def children(self):
+ """ Get a list containing the children nodes of this node. If you want to loop over children just loop over this node as with any regular list -- it is more memory efficient that way. """
+ l = []
+ elements = self._.childNodes()
+ for i in xrange(elements.length()):
+ l.append(nodeCast(elements.item(i), self.browser))
+ return l
+
+ @property
+ def childNodes(self):
+ """ For those that can't live without JavaScript DOM-compatible method names """
+ return self.children
def isA(self, klass):
""" Syntactic sugar for isinstance """
return isinstance(self, klass)
- @property
- def children(self):
- """ Get the children nodes of this node """
- # cache the value
- if self.__children is None:
- l = self.__children = []
- elements = self._.childNodes()
- for i in xrange(elements.length()):
- n = elements.item(i)
- l.append(nodeCast(n, self.browser))
- return self.__children
-
- def childNodes(self):
- """ For those that can't live without JavaScript DOM-compatible method names """
- return self.children()
+ def addEvent(self, eventName, func, capture=False):
+ """ This lets you listen for certain events as they occur on the current element. Only particularly useful when listening for load events reaaally. """
+ listener = _CallbackEventListener(eventName, func)
+ self._.addEventListener(DOMString(eventName), listener, capture)
+
+ def removeEvent(self, eventName, func, capture=False):
+ """ Removes events that you've added with [[Element.addEvent]] """
+ self._.removeEventListener(DOMString(eventName), _CallbackEventListener.getCallbackInstance(eventName, func), capture)
+ _CallbackEventListener.remove(eventName, func)
class Text(Node):
@@ -56,20 +77,63 @@
def __str__(self):
return self.value
+ def __unicode__(self):
+ """ [[Text.value]] converts the string to a regular string via str(). This gives the unicode version """
+ return unicode(self._.nodeValue().string())
+
def __repr__(self):
return self.value
@property
def value(self):
- """ Equivalent to str(textNode). Get the string this node represents """
- return str(self._.nodeValue().string())
+ """ Equivalent to unicode(textNode). Get the string this node represents """
+ return unicode(self._.nodeValue().string())
registerNode(3, Text)
class Element(Node):
- """ An HTML element. Instances of it provide methods for doing things with the element -- traversing it, adding events, etc. """
- def __init__(self, cElement, browser):
- Node.__init__(self, cElement, browser)
+ """ An HTML element. It provides methods for:
+ + Managing attributes using the [] operator (to get/set/delete an attribute) and [[Element.attributes]] to get a dictionary containing attribute names/values
+ + Traversing into children nodes with [[Element.getElementsByTagName]] and [[Element.getElementsByClass]] (as well as [[Node.children]], of course)
+ + Accessing its tag name with [[Element.tagName]]
+ + Accessing and modifying the HTML markup inside the element via [[Element.innerHTML]]
+ And more. """
+ def __init__(self, cElement, browser, cast=_DOM.HTMLElement):
+ Node.__init__(self, cast(cElement), browser)
+
+ def __getitem__(self, name):
+ """ Get the value of the attribute with the given name """
+ #print "Getitem on:", self._
+ #print self._.tagName()
+ #return "foo"
+ return str(self._.getAttribute(DOMString(name)).string())
+
+ def __setitem__(self, name, value):
+ """ Set the value of the attribute with the given name """
+ self._.setAttribute(DOMString(name), DOMString(value))
+
+ def __delitem__(self, name):
+ """ Delete an attribute """
+ dName = DOMString(name)
+ if self._.hasAttribute(dName):
+ self._.removeAttribute(dName)
+ else:
+ raise AttributeError("No attribute %s exists" % repr(name))
+
+ def hasAttribute(self, name):
+ """ Check whether an attribute exists in this element """
+ return self._.hasAttribute(DOMString(name))
+
+ @property
+ def attributes(self):
+ nodeMap = self._.attributes()
+ d = {}
+ for i in xrange(nodeMap.length()):
+ item = nodeMap.item(i)
+ key = str(item.nodeName().string())
+ value = str(item.nodeValue().string())
+ d[key] = value
+ return d
@property
def text(self):
@@ -78,14 +142,9 @@
if not len(children):
return None
if isinstance(children[0], Text):
- return children[0].value
-
- ## DOM things
- def getElementById(self, id):
- """ Get a reference to an element in the page by its id attribute """
- n = self._.getElementById(DOMString(id))
- return nodeCast(n, self.browser)
-
+ return unicode(children[0])
+
+ ## DOM things
def getElementsByTagName(self, name):
""" Get elements by tag name. Returns a generator that you can loop over or flatten into a list with list() """
elements = self._.getElementsByTagName(DOMString(name))
@@ -108,6 +167,24 @@
e = elements.item(i)
yield nodeCast(e, self.browser)
+ def _getId(self):
+ return str(self._.id().string())
+ def _setId(self, val):
+ self._.setId(DOMString(val))
+ id = property(_getId, _setId, None, "The ID of this element")
+
+ def _getClassName(self):
+ return str(self._.className().string())
+ def _setClassName(self, val):
+ self._.setClassName(DOMString(val))
+ className = property(_getClassName, _setClassName, None, "The class name of this element")
+
+ def _getInnerHTML(self):
+ return str(self._.innerHTML().string())
+ def _setInnerHTML(self, val):
+ self._.setInnerHTML(DOMString(val))
+ innerHTML = property(_getInnerHTML, _setInnerHTML, None, "The HTML markup inside of this element. Notice that this property is writable, so you can change the markup if need be")
+
@property
def tagName(self):
""" Get the lowercase name of this tag """
@@ -115,27 +192,16 @@
@property
def originalTagName(self):
- """ Like tagName but this won't convert things to lower. Only really useful if you're dealing with XML """
+ """ Like tagName, but without converting the name to lowercase. Only really useful if you're dealing with XML, which probably isn't the case anyway! """
return str(e.nodeName().string())
-
- def addEvent(self, eventName, func, capture=False):
- """ This lets you listen for certain events as they occur on the current element. Only particularly useful when listening for load events reaaally. """
- listener = _CallbackEventListener(eventName, func)
- self._.addEventListener(DOMString(eventName), listener, capture)
-
- def removeEvent(self, eventName, func, capture=False):
- """ Removes events that you've added with [[Element.addEvent]] """
- self._.removeEventListener(DOMString(eventName), _CallbackEventListener.getCallbackInstance(eventName, func), capture)
- _CallbackEventListener.remove(eventName, func)
# -- important, we hook to the method not Element base class
registerNode(1, elementCast)
class Anchor(Element):
""" Anchor elements with an [[Anchor.href]] property """
- def __init__(self, cAnchor, browser):
- Element.__init__(self, cAnchor, browser)
- self._ = sip.cast(self._, _DOM.HTMLAnchorElement)
+ def __init__(self, cAnchor, browser, cast=_DOM.HTMLAnchorElement):
+ Element.__init__(self, cAnchor, browser, cast)
@property
def href(self):
@@ -146,9 +212,8 @@
class Form(Element):
""" Form elements contain input elements. You can submit a form with [[Form.submit]] """
- def __init__(self, cForm, browser):
- Element.__init__(self, cForm, browser)
- self._ = sip.cast(self._, _DOM.HTMLFormElement)
+ def __init__(self, cForm, browser, cast=_DOM.HTMLFormElement):
+ Element.__init__(self, cForm, browser, cast)
def _getAction(self):
return str(self._.action().string())
@@ -169,22 +234,41 @@
# submit
self._.submit()
- def submit(self):
- """ Submit the form to the page specified in the action """
- self._.submit()
+ def reset(self):
+ """ Reset the data in the form """
+ self._.reset()
registerElement("FORM", Form)
class Input(Element):
""" Input elements that you can set the value/name of """
- def __init__(self, cInput, browser):
- Element.__init__(self, cInput, browser)
- self._ = sip.cast(self._, _DOM.HTMLInputElement)
+ def __init__(self, cInput, browser, cast=_DOM.HTMLInputElement):
+ Element.__init__(self, cInput, browser, cast)
@property
def form(self):
""" Get the form element this input resides in """
return Form(self._.form())
+
+ def _getName(self):
+ return str(self._.name().string())
+ def _setName(self, name):
+ self._.setName(DOMString(name))
+ name = property(_getName, _setName, None, "The HTML name of this input")
+
+ def click(self):
+ """ Simulate a click on the input element. If this is a submit button, for example, this clicks it. """
+ self._.click()
+
+ def focus(self):
+ """ Focus the input """
+ self._.focus()
+
+ def _getValue(self):
+ return str(self._.value().string())
+ def _setValue(self, value):
+ self._.setValue(DOMString(value))
+ value = property(_getValue, _setValue, None, "The value of this input -- for example, for textareas and inputs of type 'text', this is its content. For buttons, this is the text inside the button, etc.")
registerElement("INPUT", Input)
#class Title(Element):
@@ -203,13 +287,17 @@
@property
def serialized(self):
""" Return a string that represents the DOM structure of this document, much like what is returned via innerHTML in JavaScript """
- return str(self._d.toString().string())
+ return unicode(self._d.toString().string())
## DOM things
def getElementById(self, id):
""" Get a reference to an element in the page by its id attribute """
n = self._d.getElementById(DOMString(id))
- return nodeCast(n, self.browser)
+ # no element with the given ID
+ if n.isNull():
+ raise ElementNotFoundError("No element with an ID of '%s' found" % id)
+ cast = nodeCast(n, self.browser)
+ return cast
def getElementsByTagName(self, name):
""" Get elements by tag name. Returns a generator that you can loop over or flatten into a list with list() """
@@ -240,7 +328,7 @@
If the `stripSpace` attribute is True, when searching for a string match all whitespace is stripped from the item we are matching against """
_dlinks = self._d.links()
possibleLinks = [_dlinks.item(i) for i in xrange(_dlinks.length())]
- print "possible links:", len(possibleLinks)
+ #print "possible links:", len(possibleLinks)
correctLink = None
# check for text first
if text is not None:
=== modified file 'todo.txt'
--- todo.txt 2007-02-15 20:31:06 +0000
+++ todo.txt 2007-03-06 22:25:50 +0000
@@ -2,7 +2,7 @@
* PyKHTML Todo List *
**********************
- + distutils-based installer
+ + Event dispatching (document.createEvent, initMouseEvent with type "click", then dispatch on an element)
+ Make the examples available online
+ Big friendly 'Download (latest)' button
+ Add Contact section to the website