=== modified file 'doc/pykhtml.dom.htm' --- doc/pykhtml.dom.htm 2007-02-10 20:22:22 +0000 +++ doc/pykhtml.dom.htm 2007-02-11 19:31:51 +0000 @@ -4,8 +4,8 @@ pykhtml.dom module documentation - - + +
@@ -15,12 +15,13 @@

pykhtml.dom

None

class Document (inherits object)

-
Document object for accessing the DOM tree. Don't keep this object around, when the page changes it's invalidated. Just access it through pykhtml.Browser.document whenever you want it.
+
Document object for accessing the DOM tree. Don't keep this object around: when the page changes it is invalidated. Just access it through pykhtml.Browser.document whenever you want it.

__init__(self, htmlDocument, browser)

getElementById(self, id)

Get a reference to an element in the page by its id attribute.

getElementsByClass(self, className, tagName='*')

Get elements in the document that have a certain class.

getElementsByTagName(self, name)

Get elements by tag name. Returns a generator that you can loop over or flatten into a list with list().

getElementsByTagNameNS(self, ns, name)

Get elements by tag name given a certain namespace.
+

serialized (read-only property)

Return a string that represents the DOM structure of this document, much like what is returned via innerHTML in JavaScript.

visit(self, text=True, callback=None, attributes=None, stripSpace=None)

Visit a page pointed to by a certain link. This function searches for all links in the document that either:
If the `stripSpace` attribute is True, when searching for a string match all whitespace is stripped from the item we are matching against.

class Node (inherits object)

Node and all of its subclasses provide you with read-only access to the page's DOM. Instantiating the classes themselves won't do you much good.
@@ -35,7 +36,7 @@

childNodes(self)

Inherited from Node
For those that can't live without JavaScript DOM-compatible method names.

children (read-only property)

Inherited from Node
Get the children nodes of this node.

getElementById(self, id)

Get a reference to an element in the page by its id attribute.
-

getElementsByClass(self, className, tagName='*')

Get elements in the document that have a certain class.
+

getElementsByClass(self, className, tagName='*')

Get elements in the document (optionally with a given tag name) that have a certain class.

getElementsByTagName(self, name)

Get elements by tag name. Returns a generator that you can loop over or flatten into a list with list().

getElementsByTagNameNS(self, ns, name)

Get elements by tag name given a certain namespace.

isA(self, klass)

Inherited from Node
Syntactic sugar for isinstance.
@@ -50,7 +51,7 @@

childNodes(self)

Inherited from Node
For those that can't live without JavaScript DOM-compatible method names.

children (read-only property)

Inherited from Node
Get the children nodes of this node.

getElementById(self, id)

Inherited from Element
Get a reference to an element in the page by its id attribute.
-

getElementsByClass(self, className, tagName='*')

Inherited from Element
Get elements in the document that have a certain class.
+

getElementsByClass(self, className, tagName='*')

Inherited from Element
Get elements in the document (optionally with a given tag name) that have a certain class.

getElementsByTagName(self, name)

Inherited from Element
Get elements by tag name. Returns a generator that you can loop over or flatten into a list with list().

getElementsByTagNameNS(self, ns, name)

Inherited from Element
Get elements by tag name given a certain namespace.

href (read-only property)

The anchor's 'href' value. Returns the full URL pointed to by the href attribute of this element.
=== modified file 'doc/pykhtml.htm' --- doc/pykhtml.htm 2007-02-10 20:22:22 +0000 +++ doc/pykhtml.htm 2007-02-11 19:31:51 +0000 @@ -4,8 +4,8 @@ pykhtml module documentation - - + +
@@ -18,13 +18,18 @@
A Browser is the main class you use to navigate around and visit different pages. Have a look at Browser.load and Browser.document to access basic use.

__init__(self)

Create a new Browser.

document (read-only property)

Get a reference to the document (see dom.Document) for the currently loaded page. It contains all the tasty methods for walking the DOM tree like getElementById / getElementsByTagName, and methods for browsing to other linked pages.
-

load(self, uri, callback)

Load a webpage in the browser. It takes as parameters the URI of the page to load, and a callable object to call when the page has loaded.
-

location (property)

Browse to a new location. You probably don't want to set this directly as you'll receive no notification when the page has loaded. Have a look at Browser.load instead.
+

load(self, uri, callback)

Load a webpage in the browser. It takes as parameters the URI of the page to load, and a callable object to call when the page has loaded. This callback will be passed a reference to the browser object unless you set Browser.passReferenceToCallbacks to False.
+

location (property)

Browse to a new location. You probably don't want to set this directly as you'll receive no notification when the page has loaded. Have a look at Browser.load instead.
+

passReferenceToCallbacks (property)

Set whether callbacks passed to functions such as Browser.load or dom.Document.visit will have a reference to this browser object passed as a parameter. Default is True.
+

setHtml(self, source, url=None)

Set the HTML of the browser. Parses the HTML and generates the DOM tree so you can navigate it as usual. As well as the `source` parameter, a `url` parameter allows you to specify a URL with which this source code is linked so that, e.g., any scripts/images referenced in the HTML will be found.

source (read-only property)

startEventLoop()

Starts the PyKHTML event loop. PyKHTML works with an asynchronous callback mechanism – a little like Twisted does. Calls to open a new webpage aren't synchronous, as with urllib, for example.

stopEventLoop()

Stop the event loop and hence exit the scraper.
-

timer(time, func)

Call the given function after the alloted time. Requires that the PyKHTML event loop is running.
+

timer(time, func)

Call the given function after the allotted time. The PyKHTML event loop needs to be running.

init(display=1, _sleep=1)

Initiate the system if necessary (start Xvfb if it's not running, connect to it, start our program instance). This is called automatically when you create a Browser instance, so you shouldn't have to worry about it. You can specify use of a certain display by setting the `display` parameter.
+

class curry

+
Partial application of parameters. This is used internally but is also very useful with Browser.load as it allows you to pass data to other functions.
EXAMPLE.
+

__init__(self, fun)

pathSearch(name)

Utility function to search for and get the full path of a file in $PATH.

running(name)

Check whether a process of the given name is running.
=== modified file 'makedocs.py' --- makedocs.py 2007-02-10 20:22:22 +0000 +++ makedocs.py 2007-02-11 19:31:51 +0000 @@ -46,16 +46,17 @@ classToClassList = {} # dirty hack so that qt/kdecore/khtml aren't loaded when pykhtml calls 'import ...' -sys.modules["qt"] = sys.modules["kdecore"] = {} +sys.modules["kdecore"] = {} # fake the KHTML interface class Dummy(object): pass -khtml = Dummy() +root = Dummy() dom = Dummy() dom.DOMString = None dom.EventListener = int -khtml.DOM = dom -sys.modules["khtml"] = khtml +root.DOM = dom +root.QTimer = int +sys.modules["khtml"] = sys.modules["qt"] = root # end dirty hack def autocrop(image, backgroundColor=(255, 255, 255)): === modified file 'pykhtml/__init__.py' --- pykhtml/__init__.py 2007-02-11 18:58:20 +0000 +++ pykhtml/__init__.py 2007-02-11 19:31:51 +0000 @@ -9,8 +9,16 @@ import dom +# use Xvfb to run a virtual X session if it's present +useXvfb = False +# whether or not Xvfb is being used at the moment. If +# it is None things have not yet been initialised. +usingXvfb = None + # set to true to see what's happening visually -debugWithGUI = True +debugWithGUI = False + + class curry: """ Partial application of parameters. This is used internally but is also very useful with [[Browser.load]] as it allows you to pass data to other functions. 
@@ -26,7 +34,6 @@ kw.update(kwargs) else: kw = kwargs or self.kwargs - return self.fun(*(self.pending + args), **kw) class _Dummy(object): @@ -70,7 +77,6 @@ os_path_join = os.path.join return [os_path_join(x, name) for x in path if os_path_exists(os_path_join(x, name))] - def _startKApplication(): global application, dialog kdecore.KCmdLineArgs.init(sys.argv, "PyKHTML", "PyKHTML Library", "9.9") @@ -87,26 +93,28 @@ global initSuccessful if not initSuccessful: # check that Xvfb is in the path - if pathSearch("Xvfb"): - raise OSError("Xvfb not installed") - global xvfb - if not xvfb: - # start xvfb - xvfb = subprocess.Popen(["Xvfb", ":%s" % display, "-ac", "-screen", "0", "640x480x8"], stderr=subprocess.PIPE) - xvfb.display = ":%s" % display - # pause for a little to check it hasn't terminated straight away and to give it some time to startup - time.sleep(_sleep) - if xvfb.poll(): - raise OSError("Xvfb failed to run correctly\nXvfb output:\n%s" % xvfb.stderr.read()) - else: - # started successfully - lets not start a new Xvfb each time - f = file(os.path.join(temp, "PyKHTMLXvfb"), "w") - f.write("%s\n" % xvfb.display) - f.close() + if useXvfb and pathSearch("Xvfb"): + global xvfb + if not xvfb: + # start xvfb + xvfb = subprocess.Popen(["Xvfb", ":%s" % display, "-ac", "-screen", "0", "640x480x8"], stderr=subprocess.PIPE) + xvfb.display = ":%s" % display + # pause for a little to check it hasn't terminated straight away and to give it some time to startup + time.sleep(_sleep) + if xvfb.poll(): + raise OSError("Xvfb failed to run correctly\nXvfb output:\n%s" % xvfb.stderr.read()) + else: + # started successfully - lets not start a new Xvfb each time + f = file(os.path.join(temp, "PyKHTMLXvfb"), "w") + f.write("%s\n" % xvfb.display) + f.close() + #else: + #print "Re-using server" + if not debugWithGUI: + os.environ["DISPLAY"] = xvfb.display + usingXvfb = True else: - print "Re-using server" - if not debugWithGUI: - os.environ["DISPLAY"] = xvfb.display + usingXvfb = False 
_startKApplication() initSuccessful = True @@ -119,7 +127,6 @@ """ Stop the event loop and hence exit the scraper """ dialog.deleteLater() application.quit() - #sys.exit(0) class _Timer(qt.QTimer):