""" Scrapes all of the headlines from Google News and then recursively visits the links to try to find the writer of the article """ import sys sys.path.append("..") import pykhtml pykhtml.debugWithGUI = True def discoverJournalist(browser, headline): print headline, browser.document pykhtml.stopEventLoop() def scrapeHeadlines(browser): #anchors = browser.document.getElementsByTagName("a") #for anchor in list(anchors)[:35]: for anchor in browser.document.getElementsByTagName("a"): if anchor["id"]: boldElements = list(anchor.getElementsByTagName("b")) if boldElements: title = boldElements[0].text newBrowser = pykhtml.Browser() func = pykhtml.partial(discoverJournalist, title) newBrowser.load(anchor["href"], func) pykhtml.timer(9.0, pykhtml.stopEventLoop) return #print anchor.getElementsByTagName("b").next().text #print #print anchor["id"], anchor["href"] def main(): browser = pykhtml.Browser() browser.load("http://news.google.co.uk/news", scrapeHeadlines) # kick things off pykhtml.startEventLoop() if __name__ == "__main__": main()