"""
Browser: an automated web browser, to aid you in automating interaction
with websites.
You can use Browser to navigate to pages, follow links, fill in and
submit forms.
Example:
>>> from browser import Browser
>>> b=Browser()
>>> b.get('http://www.yahoo.com/')
>>> b.follow_link('Advanced')
>>> b.dump_forms()
Form f0
Action: http://search.yahoo.com/search/validate
Method: GET
Hidden: _adv_prop web
Hidden: x op
Hidden: ei ISO-8859-1
Hidden: prev_vm p
Button: (no name): Yahoo! Search
Textbox: va (no value)
Textbox: vp (no value)
Textbox: vo (no value)
Textbox: ve (no value)
Textbox: vs (no value)
Radio button: vm r (off)
Radio button: vm p (on)
Radio button: fl 0 (on)
Radio button: fl 1 (off)
Checkbox: vl lang_ar (off)
Checkbox: vl lang_bg (off)
Checkbox: vl lang_ca (off)
Checkbox: vl lang_zh-CN (off)
Checkbox: vl lang_zh-TW (off)
Checkbox: vl lang_hr (off)
Checkbox: vl lang_cs (off)
Checkbox: vl lang_da (off)
Checkbox: vl lang_nl (off)
Checkbox: vl lang_en (off)
Checkbox: vl lang_et (off)
Checkbox: vl lang_fi (off)
Checkbox: vl lang_fr (off)
Checkbox: vl lang_de (off)
Checkbox: vl lang_el (off)
Checkbox: vl lang_iw (off)
Checkbox: vl lang_hu (off)
Checkbox: vl lang_is (off)
Checkbox: vl lang_id (off)
Checkbox: vl lang_it (off)
Checkbox: vl lang_ja (off)
Checkbox: vl lang_ko (off)
Checkbox: vl lang_lv (off)
Checkbox: vl lang_lt (off)
Checkbox: vl lang_no (off)
Checkbox: vl lang_pl (off)
Checkbox: vl lang_pt (off)
Checkbox: vl lang_ro (off)
Checkbox: vl lang_ru (off)
Checkbox: vl lang_sr (off)
Checkbox: vl lang_sk (off)
Checkbox: vl lang_sl (off)
Checkbox: vl lang_es (off)
Checkbox: vl lang_sv (off)
Checkbox: vl lang_tr (off)
Button: (no name): Yahoo! Search
Dropdown: va_vt
Dropdown: vp_vt
Dropdown: vo_vt
Dropdown: ve_vt
Dropdown: vd
Dropdown: vc
Dropdown: n
>>> b.form('f0')
>>> b.field('va','kryogenix')
>>> b.submit()
>>> b.follow_link('kryogenix.org')
>>> print b._Browser__uri
http://drs.yahoo.com/S=2766679/K=kryogenix/v=2/SID=w/l=WS1/R=1/H=0/*-http://www.kryogenix.org/
"""
import urllib,re,urlparse,sys
from xml.dom.ext.reader import HtmlLib
from xml.dom import minidom
import ClientCookie # http://wwwsearch.sourceforge.net/ClientCookie/
# Patch HTMLElement to correctly grab attribute names case-insensitively
from xml.dom.html import HTMLElement
from xml.dom.Element import Element
def newSetAttributeNS(self, ns, qname, value):
    """Replacement for HTMLElement.setAttributeNS that stores attribute
    names in upper case.

    @param ns: the namespace of the attribute (may be empty/None).
    @param qname: the attribute name, optionally carrying a "prefix:".
    @param value: the attribute value, passed through unchanged.
    """
    colon = qname.find(':')
    # Without a namespace the prefix is meaningless, so drop everything
    # up to and including the first colon.
    if colon != -1 and not ns:
        qname = qname[colon + 1:]
    # Upper-case the stored name so later getAttribute lookups work.
    Element.setAttributeNS(self, ns, qname.upper(), value)
# Install the replacement on the HTML element class (module-level side
# effect: it applies to every HTMLElement from the moment this module is
# imported).
HTMLElement.HTMLElement.setAttributeNS = newSetAttributeNS
class BrowserError(Exception):
    """Common ancestor of every exception defined by this module."""
class UnknownMethodError(BrowserError):
    """Exception raised on attempting to fetch a URI with an unknown
    HTTP method."""

    def __init__(self, method):
        """@param method: the HTTP method name that was rejected."""
        # Pass the value to the base class as well so that e.args and
        # repr(e) carry it instead of being empty.
        BrowserError.__init__(self, method)
        self.__method = method

    def __str__(self):
        # Wrap in a 1-tuple so a tuple-valued argument is formatted
        # rather than being unpacked by the % operator.
        return "Bad method %s (must be POST or GET)" % (self.__method,)
class LinkNotFoundError(BrowserError):
    """Exception raised on attempting to navigate to a link that
    doesn't exist."""

    def __init__(self, linkdata):
        """@param linkdata: the selector (index, text, ...) that matched
        no link."""
        # Pass the value to the base class as well so that e.args and
        # repr(e) carry it instead of being empty.
        BrowserError.__init__(self, linkdata)
        self.__linkdata = linkdata

    def __str__(self):
        # Wrap in a 1-tuple so a tuple-valued argument is formatted
        # rather than being unpacked by the % operator.
        return "Link %s not found" % (self.__linkdata,)
class LinkNoHrefError(BrowserError):
    """Exception raised on attempting to navigate to a link
    without an href attribute."""

    def __init__(self, link):
        """@param link: the DOM element of the link that has no href."""
        # Pass the value to the base class as well so that e.args and
        # repr(e) carry it instead of being empty.
        BrowserError.__init__(self, link)
        self.__link = link

    def __str__(self):
        # Wrap in a 1-tuple so the attribute collection is always
        # formatted as a single value.
        return "Link does not have an href attribute (link attributes were %s)" % (self.__link._get_attributes(),)
class FormNotFoundError(BrowserError):
    """Exception raised on attempting to specify a form that
    doesn't exist."""

    def __init__(self, formdata):
        """@param formdata: the selector (index or name) that matched
        no form."""
        # Pass the value to the base class as well so that e.args and
        # repr(e) carry it instead of being empty.
        BrowserError.__init__(self, formdata)
        self.__formdata = formdata

    def __str__(self):
        # Wrap in a 1-tuple so a tuple-valued argument is formatted
        # rather than being unpacked by the % operator.
        return "Form %s not found" % (self.__formdata,)
class FieldNotFoundError(BrowserError):
    """Exception raised on attempting to specify a field that
    doesn't exist in the current form."""

    def __init__(self, fielddata):
        """@param fielddata: the field name that matched no field."""
        # Pass the value to the base class as well so that e.args and
        # repr(e) carry it instead of being empty.
        BrowserError.__init__(self, fielddata)
        self.__fielddata = fielddata

    def __str__(self):
        # Wrap in a 1-tuple so a tuple-valued argument is formatted
        # rather than being unpacked by the % operator.
        return "Field %s not found" % (self.__fielddata,)
class NoFormSpecifiedError(BrowserError):
    """Raised when a form is submitted although no form has been
    selected yet."""
class Browser:
    """An automated web browser.

    Keeps track of the current URI, the raw page data, the parsed DOM of
    the page and the currently selected form.  Pages are fetched through
    ClientCookie.urlopen.
    """

    # Both byte strings and unicode strings count as "a string" wherever
    # a selector may be given as text.
    __TEXT_TYPES = (str, type(u''))
    # Tags that contribute controls to a form, in the order in which the
    # controls are collected (all inputs, then textareas, then selects).
    __FIELD_TAGS = ('input', 'textarea', 'select')
    # DOM nodeType of a text node.
    __TEXT_NODE = 3

    def __init__(self):
        self.__uri = None         # URI of the current page
        self.__data = None        # raw data of the current page
        self.__htmldom = None     # parsed DOM of the current page
        self.__parsedData = None
        self.__form = None        # currently selected form element
        self.__reader = HtmlLib.Reader()

    def __getInnerText(self, node):
        """Return the concatenated text of all text nodes at or below
        node; elements without text contribute the empty string."""
        if node.hasChildNodes():
            return ''.join([self.__getInnerText(child)
                            for child in node.childNodes])
        if node.nodeType == self.__TEXT_NODE:
            return node.nodeValue
        return ''

    def __formFields(self, form):
        """Return a list of the input, textarea and select elements found
        in form (grouped by tag, in that order)."""
        fields = []
        for tag in self.__FIELD_TAGS:
            fields.extend(form.getElementsByTagName(tag))
        return fields

    def __attrIsSet(self, node, name):
        """Return True if the boolean-style attribute called name is
        present on node with the value 'true' or its own name
        (compared case-insensitively)."""
        value = node.getAttribute(name)
        if not value:
            return False
        return value.lower() in ('true', name)

    def __nth(self, nodes, position):
        """Return a one-element list holding the node at the 1-based
        position, or an empty list when position is out of range.
        Zero and negative positions are out of range."""
        if 1 <= position <= len(nodes):
            return [nodes[position - 1]]
        return []

    def __setFormDefaultValues(self, f):
        """Compute the values form f would submit as it stands in the
        page and store them as the dictionary f.fieldValues.

        Only named controls are recorded.  Unchecked checkboxes and radio
        buttons, and inputs of other types (buttons, files, ...) are left
        out.
        """
        values = {}
        for fl in self.__formFields(f):
            name = fl.getAttribute('name')
            if not name:
                continue
            # Compare upper-cased so that parsers which keep the source
            # case of tag names are handled as well.
            tag = fl.nodeName.upper()
            if tag == 'TEXTAREA':
                values[name] = self.__getInnerText(fl)
            elif tag == 'SELECT':
                options = fl.getElementsByTagName('option')
                chosen = [o for o in options
                          if self.__attrIsSet(o, 'selected')]
                if chosen:
                    # With several selected options the last one wins.
                    values[name] = chosen[-1].getAttribute('value')
                elif len(options) > 0:
                    # Nothing marked selected: default to the first option.
                    values[name] = options[0].getAttribute('value')
                # A select without any options contributes no value.
            elif tag == 'INPUT':
                # An input without a type attribute is a text box.
                typ = (fl.getAttribute('type') or 'text').lower()
                if typ in ('text', 'password', 'hidden'):
                    values[name] = fl.getAttribute('value')
                elif typ in ('checkbox', 'radio'):
                    if self.__attrIsSet(fl, 'checked'):
                        # A checked control submits its value attribute;
                        # 'on' is only the default when it has none.
                        values[name] = fl.getAttribute('value') or 'on'
        f.fieldValues = values

    def get(self, uri, method='GET', data=None):
        """
        get -- have the Browser navigate to a URI.
        Any previously selected form is forgotten once the new page has
        been fetched.
        @param uri: the URI to which to navigate (resolved relative to
            the current URI).
        @type uri: string
        @param method: the HTTP method to use (must be GET or POST).
        @param data: a dictionary of keys and values to send as data
            to the remote server.  For GET it replaces the query string
            of the URI; for POST it is sent as the request body.
        @type data: dictionary
        @raise UnknownMethodError: If you passed an unknown HTTP method (must
            be GET or POST).
        """
        newuri = urlparse.urljoin(self.__uri, uri)
        verb = method.upper()
        if verb == 'GET':
            if data:
                # Keep scheme, host and path; replace params, query and
                # fragment with the encoded data.
                bits = list(urlparse.urlsplit(newuri)[:3])
                bits += ['', urllib.urlencode(data), '']
                newuri = urlparse.urlunparse(tuple(bits))
            fp = ClientCookie.urlopen(newuri)
        elif verb == 'POST':
            # Always pass a body (possibly empty) so that a POST with no
            # data is still sent as a POST.
            fp = ClientCookie.urlopen(newuri, urllib.urlencode(data or {}))
        else:
            raise UnknownMethodError(method)
        try:
            self.__data = fp.read()
        finally:
            fp.close()
        self.__uri = newuri
        # The selected form belonged to the previous page.
        self.__form = None
        try:
            self.__htmldom = self.__reader.fromString(self.__data)
        except Exception:
            # Attempt to parse with minidom
            print("Failed to parse with HtmlLib: trying minidom")
            self.__htmldom = minidom.parseString(self.__data)

    def follow_link(self, linkdata):
        """Direct the Browser to follow a specified link on the
        current page.
        @param linkdata: Pass either an integer, a string, or a compiled regex.
            An integer will follow that link on the page (first link
            is 1, not 0!).
            A string will follow the first link it finds with that
            text therein (as the full string, not a substring).
            A regex will follow the first link it finds whose text the
            regex's search() matches.
        @type linkdata: string, int, compiled regex
        @raise LinkNotFoundError: If you specified a link that is not on the
            page (a number outside 1..number of links, or a string or regex
            that doesn't match any of the page links).
        @raise LinkNoHrefError: If you specified a link that does not have
            a href attribute (and hence cannot be navigated).
        @raise TypeError: If linkdata is none of the supported types.
        """
        links = self.__htmldom.getElementsByTagName('a')
        if hasattr(linkdata, 'search'):
            # Compiled regular expression.
            candidates = [x for x in links
                          if linkdata.search(self.__getInnerText(x))]
        elif isinstance(linkdata, self.__TEXT_TYPES):
            candidates = [x for x in links
                          if self.__getInnerText(x) == linkdata]
        elif isinstance(linkdata, int):
            candidates = self.__nth(links, linkdata)
        else:
            raise TypeError("linkdata must be an integer, a string or "
                            "a compiled regex")
        if not candidates:
            raise LinkNotFoundError(linkdata)
        link = candidates[0]
        href = link.getAttribute('href')
        if not href:
            raise LinkNoHrefError(link)
        self.get(href)

    def form(self, formdata):
        """Choose a form on the page to which further form manipulations
        (L{field}, L{submit}) will apply.
        @param formdata: Pass either an integer or a string.
            An integer will select that form on the page (first form
            is 1, not 0!).
            A string will attempt to select a form by name, matching on
            the form's name attribute.
        @type formdata: string, int
        @raise FormNotFoundError: If you specified a form that is not on the
            page (a number outside 1..number of forms, or a string that
            doesn't match any of the page's forms' name attributes).
        @raise TypeError: If formdata is neither an integer nor a string.
        """
        forms = self.__htmldom.getElementsByTagName('form')
        if isinstance(formdata, self.__TEXT_TYPES):
            candidates = [x for x in forms
                          if x.getAttribute('name') == formdata]
        elif isinstance(formdata, int):
            candidates = self.__nth(forms, formdata)
        else:
            raise TypeError("formdata must be an integer or a string")
        if not candidates:
            raise FormNotFoundError(formdata)
        self.__form = candidates[0]
        self.__setFormDefaultValues(self.__form)

    def field(self, fieldname, fieldvalue):
        """Change the value in a form field in the currently selected form.
        Defaults to the first form on the page if no L{form} has been
        specified.
        @param fieldname: The name of the field to alter, matching its
            name attribute.
        @type fieldname: string
        @param fieldvalue: The value to set in the field.
        @type fieldvalue: string
        @raise FieldNotFoundError: If you specified a field that is not in
            the form.
        """
        if self.__form is None:
            self.form(1)
        names = [x.getAttribute('name')
                 for x in self.__formFields(self.__form)]
        if fieldname not in names:
            raise FieldNotFoundError(fieldname)
        # FIXME check is a valid value for a select
        self.__form.fieldValues[fieldname] = fieldvalue

    def submit(self):
        """Submit the currently selected L{form}.
        The method defaults to GET and the action to the current URI when
        the form does not specify them.
        @raise NoFormSpecifiedError: If no form is currently selected.
        """
        if self.__form is None:
            raise NoFormSpecifiedError()
        method = self.__form.getAttribute('method') or 'GET'
        action = self.__form.getAttribute('action') or self.__uri
        self.get(action, method, self.__form.fieldValues)

    def dump_forms(self):
        """Dump details of all forms on the page. Useful for debugging."""
        labels = {
            'text': ' Textbox:',
            'password': ' Password:',
            'radio': ' Radio button:',
            'checkbox': ' Checkbox:',
            'hidden': ' Hidden:',
            'submit': ' Button:',
            'image': ' Button:',
        }
        for f in self.__htmldom.getElementsByTagName('form'):
            method = (f.getAttribute('method') or 'GET').upper()
            print("Form %s" % (f.getAttribute('name') or '(no name)',))
            print(" Action: %s" % (f.getAttribute('action') or self.__uri,))
            print(" Method: %s" % (method,))
            for fl in self.__formFields(f):
                tag = fl.nodeName.upper()
                name = fl.getAttribute('name')
                if tag == 'SELECT':
                    print(' Dropdown: %s' % (name or '(no name)',))
                elif tag == 'INPUT':
                    rawtype = fl.getAttribute('type') or 'text'
                    typ = rawtype.lower()
                    # Unknown types are shown under their own name.
                    parts = [labels.get(typ, ' ' + rawtype + ':'),
                             name or '(no name):']
                    if typ in ('text', 'password', 'hidden',
                               'image', 'submit'):
                        parts.append(fl.getAttribute('value') or '(no value)')
                    elif typ in ('checkbox', 'radio'):
                        parts.append(fl.getAttribute('value') or '(no value)')
                        if fl.getAttribute('checked'):
                            parts.append('(on)')
                        else:
                            parts.append('(off)')
                    else:
                        parts.append('.')
                    print(' '.join(parts))
                elif tag == 'TEXTAREA':
                    print(' Textarea: %s' % (name or '(no name)',))
                    print(self.__getInnerText(fl) or '(no text)')

    # Getters
    def content(self):
        "Returns a string containing the current data in the page."
        return self.__data

    def uri(self):
        "Returns a string containing the current URI."
        return self.__uri

    def current_form(self):
        "Returns a HTMLFormElement representing the current form."
        return self.__form

    def fieldValues(self):
        """Returns a dictionary containing the values set in the current form.
        @raise NoFormSpecifiedError: If no form is currently selected.
        """
        if self.__form is None:
            raise NoFormSpecifiedError()
        return self.__form.fieldValues