""" Myspace is a notoriously crufty website. Walking the DOM (as opposed to scraggling through markup) with PyKHTML is a fair bit nicer! """ # in case pykhtml isn't already installed import sys sys.path.append("..") import re import pykhtml def login(email, password, browser): #print "Here" document = browser.document # First, check if we're already logged in header = document.getElementById("header") for anchor in header.getElementsByTagName("a"): if anchor.text == "SignOut": # We are. Sign out and then quit print "Already signed in to Myspace! Signing out..." browser.load(anchor.href, signedOut) return # Log in # Set the text in the email/password boxes document.getElementById("email").value = email document.getElementById("password").value = password # And then get a reference to the submit image button: submit = None # the ID regularly seems to change, so fuzzy match: for element in document.getElementsByTagName("input"): if element.id.count("loginbutton"): submit = element break assert submit is not None # and then click on it. We have to do this instead # of just submitting the form because it runs a muck # of javascript onclick that submits the form # # There is another problem, however: when we click # on the button, there will be an implicit change of # page. We won't know when the new page has loaded # because of the asynchronous design of PyKHTML. # To fix this, we set browser.onNextLoad to our # callback function. The function specified will be # called when the next browsed-to page has finished # loading browser.onNextLoad = displayAccountInformation print "Logging in..." submit.click() #print "Clicked" def displayAccountInformation(browser): document = browser.document # all elements with a class of 'heading' and a tag name of 'h4' headings = document.getElementsByClass("heading", "h4") # parse out the nick name if headings: # should be of the form " Hello, NickName! " welcomeMessage = list(headings)[0].text nickName = welcomeMessage.split(",")[1].strip().rstrip("!") print "Nickname:", nickName else: print "Oops, couldn't find the header" # then the number of friends for anchor in document.getElementsByTagName("a"): # to do that, we find all links to the page that lets # you view your friends... if anchor.href.count("index.cfm?fuseaction=user.viewfriends"): # and that also consists of nothing but digits (once # stripped of the superfluous whitespace that # Myspace's code is full of, of course) if anchor.text.strip().isdigit(): print "Number of friends:", anchor.text.strip() break # and finally (and quite trickily, this is pretty fragile) the # date of the last login. # firstly, we create a regular expression that will match a # date of the form (1 or 2 numbers)/(2 numbers)/(4 numbers) dateExpression = re.compile("\d{1,2}/\d{2}/\d{4}") # we know the date is in a span in a div with a class of # 'section', so: for div in document.getElementsByClass("section", "div"): for span in div.getElementsByTagName("span"): # if there is text for this span: if span.text: # and the regular expression matches: match = dateExpression.search(span.text) if match: print "Last login:", match.group() break pykhtml.stopEventLoop() def signedOut(browser): # We were signed in before. When this func is called, we've # signed out and so can exit. print "Signed out. Bye bye" pykhtml.stopEventLoop() return def main(): if len(sys.argv) != 3: print "Usage: myspace.py email@email.com password" return email, password = sys.argv[1:] browser = pykhtml.Browser() # the browser is passed as a parameter to `login` # when it is called (when the page has loaded) print "Connecting to myspace.com..." browser.load("http://www.myspace.com/", pykhtml.partial(login, email, password)) # kick things off pykhtml.startEventLoop() if __name__ == "__main__": main()