"""
Browser: an automated web browser, to aid you in automating interaction
with websites.

You can use Browser to navigate to pages, follow links, and fill in and
submit forms.

Example:
>>> from browser import Browser
>>> b=Browser()
>>> b.get('http://www.yahoo.com/')
>>> b.follow_link('Advanced')
>>> b.dump_forms()
Form f0
  Action: http://search.yahoo.com/search/validate
  Method: GET
    Hidden: _adv_prop web
    Hidden: x op
    Hidden: ei ISO-8859-1
    Hidden: prev_vm p
    Button: (no name): Yahoo! Search
    Textbox: va (no value)
    Textbox: vp (no value)
    Textbox: vo (no value)
    Textbox: ve (no value)
    Textbox: vs (no value)
    Radio button: vm r (off)
    Radio button: vm p (on)
    Radio button: fl 0 (on)
    Radio button: fl 1 (off)
    Checkbox: vl lang_ar (off)
    Checkbox: vl lang_bg (off)
    Checkbox: vl lang_ca (off)
    Checkbox: vl lang_zh-CN (off)
    Checkbox: vl lang_zh-TW (off)
    Checkbox: vl lang_hr (off)
    Checkbox: vl lang_cs (off)
    Checkbox: vl lang_da (off)
    Checkbox: vl lang_nl (off)
    Checkbox: vl lang_en (off)
    Checkbox: vl lang_et (off)
    Checkbox: vl lang_fi (off)
    Checkbox: vl lang_fr (off)
    Checkbox: vl lang_de (off)
    Checkbox: vl lang_el (off)
    Checkbox: vl lang_iw (off)
    Checkbox: vl lang_hu (off)
    Checkbox: vl lang_is (off)
    Checkbox: vl lang_id (off)
    Checkbox: vl lang_it (off)
    Checkbox: vl lang_ja (off)
    Checkbox: vl lang_ko (off)
    Checkbox: vl lang_lv (off)
    Checkbox: vl lang_lt (off)
    Checkbox: vl lang_no (off)
    Checkbox: vl lang_pl (off)
    Checkbox: vl lang_pt (off)
    Checkbox: vl lang_ro (off)
    Checkbox: vl lang_ru (off)
    Checkbox: vl lang_sr (off)
    Checkbox: vl lang_sk (off)
    Checkbox: vl lang_sl (off)
    Checkbox: vl lang_es (off)
    Checkbox: vl lang_sv (off)
    Checkbox: vl lang_tr (off)
    Button: (no name): Yahoo! Search
    Dropdown: va_vt
    Dropdown: vp_vt
    Dropdown: vo_vt
    Dropdown: ve_vt
    Dropdown: vd
    Dropdown: vc
    Dropdown: n
>>> b.form('f0')
>>> b.field('va','kryogenix')
>>> b.submit()
>>> b.follow_link('kryogenix.org')
>>> print b.uri()
http://www.kryogenix.org/

"""
import urllib,re,urlparse,sys
from xml.dom.ext.reader import HtmlLib
from xml.dom import minidom
import ClientCookie   # http://wwwsearch.sourceforge.net/ClientCookie/

# Patch HTMLElement to correctly grab attribute names case-insensitively
from xml.dom.html import HTMLElement
from xml.dom.Element import Element
def newSetAttributeNS(self,ns,qname,value):
  """Replacement for HTMLElement.setAttributeNS which stores every
  attribute under an upper-cased name.

  @param ns: the namespace URI of the attribute (may be empty/None).
  @param qname: the (possibly prefixed) attribute name.
  @param value: the attribute value, passed through untouched.
  """
  colon = qname.find(':')
  # A prefix is meaningless when no namespace was supplied, so keep only
  # the part of the name after the first colon.
  if colon != -1 and not ns:
    qname = qname[colon+1:]
  # Store under the upper-cased attribute name so that getAttribute works
  # regardless of how the page author capitalised the attribute.
  Element.setAttributeNS(self,ns,qname.upper(),value)
HTMLElement.HTMLElement.setAttributeNS = newSetAttributeNS

class BrowserError(Exception):
  """Base class for every exception raised by this module.

     Catch this to handle any Browser failure with a single except
     clause."""

class UnknownMethodError(BrowserError):
  """Exception raised on attempting to fetch a URI with an unknown
     HTTP method.

     @param method: the offending method name, exactly as the caller
     passed it.
  """
  def __init__(self,method):
    # Hand the value to the base class as well, so that e.args (and with
    # it repr() and pickling) carries the method instead of being empty.
    BrowserError.__init__(self,method)
    self.__method = method
  def __str__(self):
    # The one-element tuple keeps the formatting correct even if the
    # stored value is itself a tuple.
    return "Bad method %s (must be POST or GET)" % (self.__method,)

class LinkNotFoundError(BrowserError):
  """Exception raised on attempting to navigate to a link that
     doesn't exist.

     @param linkdata: whatever the caller used to identify the link.
  """
  def __init__(self,linkdata):
    # Hand the value to the base class as well, so that e.args (and with
    # it repr() and pickling) carries the link data instead of being empty.
    BrowserError.__init__(self,linkdata)
    self.__linkdata = linkdata
  def __str__(self):
    # linkdata is caller-supplied and may be any object; the one-element
    # tuple keeps the formatting correct even if it is itself a tuple.
    return "Link %s not found" % (self.__linkdata,)

class LinkNoHrefError(BrowserError):
  """Exception raised on attempting to navigate to a link
     without an href attribute.

     @param link: the DOM element of the link that could not be followed.
  """
  def __init__(self,link):
    # Hand the element to the base class as well, so that e.args is not
    # left empty.
    BrowserError.__init__(self,link)
    self.__link = link
  def __str__(self):
    # Show the attributes the link does have, to help identify it. The
    # one-element tuple keeps the formatting correct whatever type
    # _get_attributes() returns.
    return "Link does not have an href attribute (link attributes were %s)" % (self.__link._get_attributes(),)

class FormNotFoundError(BrowserError):
  """Exception raised on attempting to specify a form that
     doesn't exist.

     @param formdata: whatever the caller used to identify the form.
  """
  def __init__(self,formdata):
    # Hand the value to the base class as well, so that e.args (and with
    # it repr() and pickling) carries the form data instead of being empty.
    BrowserError.__init__(self,formdata)
    self.__formdata = formdata
  def __str__(self):
    # The one-element tuple keeps the formatting correct even if the
    # stored value is itself a tuple.
    return "Form %s not found" % (self.__formdata,)

class FieldNotFoundError(BrowserError):
  """Exception raised on attempting to specify a field that
     doesn't exist in the current form.

     @param fielddata: the field name the caller asked for.
  """
  def __init__(self,fielddata):
    # Hand the value to the base class as well, so that e.args (and with
    # it repr() and pickling) carries the field name instead of being empty.
    BrowserError.__init__(self,fielddata)
    self.__fielddata = fielddata
  def __str__(self):
    # The one-element tuple keeps the formatting correct even if the
    # stored value is itself a tuple.
    return "Field %s not found" % (self.__fielddata,)

class NoFormSpecifiedError(BrowserError):
  """Exception raised when a form is submitted although no form
     has been specified yet."""


class Browser:
  """An automated web browser.

     A Browser remembers the page it last fetched (URI, raw content and
     parsed DOM) and, optionally, one selected form on that page. The
     values to be submitted for the selected form are kept in a
     dictionary stored on the form element as its fieldValues attribute.
  """

  # Human-readable labels used by dump_forms for the known <input> types.
  __INPUT_LABELS = {
    'text': 'Textbox',
    'password': 'Password',
    'radio': 'Radio button',
    'checkbox': 'Checkbox',
    'hidden': 'Hidden',
    'submit': 'Button',
    'image': 'Button',
  }

  def __init__(self):
    self.__uri = None
    self.__data = None
    self.__htmldom = None
    self.__parsedData = None # NOTE: not read anywhere in this class
    self.__form = None
    self.__reader = HtmlLib.Reader()

  def __getInnerText(self,node):
    """Return the text of node and all its descendants, concatenated."""
    if node.hasChildNodes():
      return ''.join([self.__getInnerText(child) for child in node.childNodes])
    if node.nodeType == 3: # is a text node
      return node.nodeValue
    return '' # Empty element

  def __formFields(self,form):
    """Return the input, textarea and select elements inside form as one
       list (all inputs first, then textareas, then selects)."""
    return list(form.getElementsByTagName('input')) + \
           list(form.getElementsByTagName('textarea')) + \
           list(form.getElementsByTagName('select'))

  def __setFormDefaultValues(self,f):
    """Work out what form f would submit if left untouched and store it
       as the dictionary f.fieldValues. Fields without a name are ignored.
    """
    values = {}
    for fl in self.__formFields(f):
      name = fl.getAttribute('name')
      if not name:
        continue
      # Compare node names case-insensitively: HtmlLib reports them in
      # upper case, the minidom fallback reports them as written.
      kind = fl.nodeName.upper()
      if kind == 'TEXTAREA':
        values[name] = self.__getInnerText(fl)
      elif kind == 'SELECT':
        options = fl.getElementsByTagName('option')
        chosen = None
        for o in options:
          selected = o.getAttribute('selected')
          if selected and selected.lower() in ('selected','true'):
            chosen = o # if several are selected, the last one wins
        # With nothing selected the first option is the default; a select
        # without any options contributes no value at all.
        if chosen is None and len(options) > 0:
          chosen = options[0]
        if chosen is not None:
          values[name] = chosen.getAttribute('value')
      elif kind == 'INPUT':
        # An input without a type attribute is a text box.
        typ = (fl.getAttribute('type') or 'text').lower()
        if typ in ('text','password','hidden'):
          values[name] = fl.getAttribute('value')
        elif typ in ('checkbox','radio'):
          checked = fl.getAttribute('checked')
          if checked and checked.lower() in ('true','checked'):
            # A checked box submits its own value; 'on' is only the
            # fallback for a box that has no value attribute.
            values[name] = fl.getAttribute('value') or 'on'
    f.fieldValues = values

  def get(self,uri,method='GET',data=None):
    """
    get -- have the Browser navigate to a URI.

    Any previously selected form is forgotten, since it belonged to the
    page being left.

    @param uri: the URI to which to navigate (may be relative to the
    current page).
    @type uri: string
    @param method: the HTTP method to use (must be GET or POST).
    @param data: a dictionary of keys and values to send as data
    to the remote server. For GET it replaces the query string of the
    URI; for POST it is sent as the request body (an empty body if there
    is no data).
    @type data: dictionary
    @raise UnknownMethodError: If you passed an unknown HTTP method (must
    be GET or POST).
    """
    newuri = urlparse.urljoin(self.__uri,uri)
    verb = method.upper()
    if verb == 'GET':
      if data:
        # convert data to URL pairs and use them as the query string
        bits = list(urlparse.urlsplit(newuri)[:3])
        bits += ['',urllib.urlencode(data),'']
        newuri = urlparse.urlunparse(tuple(bits))
      fp = ClientCookie.urlopen(newuri)
    elif verb == 'POST':
      # Always pass a body (possibly empty) so the request really is a POST.
      fp = ClientCookie.urlopen(newuri,urllib.urlencode(data or {}))
    else:
      raise UnknownMethodError(method)
    try:
      self.__data = fp.read()
      self.__uri = fp.url
    finally:
      fp.close()
    # The selected form belonged to the previous page.
    self.__form = None
    try:
      self.__htmldom = self.__reader.fromString(self.__data)
    except Exception:
      # Attempt to parse with minidom
      print("Failed to parse with HtmlLib: trying minidom")
      self.__htmldom = minidom.parseString(self.__data)

  def __findLink(self,linkdata):
    """Return the <a> element on the current page identified by linkdata
       (see follow_link), raising LinkNotFoundError if there is none."""
    links = self.__htmldom.getElementsByTagName('a')
    if isinstance(linkdata,int):
      # Link numbers are 1-based; 0 and negatives must not wrap around.
      if linkdata < 1 or linkdata > len(links):
        raise LinkNotFoundError(linkdata)
      return links[linkdata-1]
    if hasattr(linkdata,'search'):
      # A compiled regex: the first link whose text it matches anywhere.
      for candidate in links:
        if linkdata.search(self.__getInnerText(candidate)):
          return candidate
    else:
      # A string: the first link whose text is exactly that string.
      for candidate in links:
        if self.__getInnerText(candidate) == linkdata:
          return candidate
    raise LinkNotFoundError(linkdata)

  def follow_link(self,linkdata):
    """Direct the Browser to follow a specified link on the
       current page.

       @param linkdata: Pass either an integer, a string, or a compiled regex.
       An integer will follow that link on the page (first link
       is 1, not 0!).
       A string will follow the first link it finds with that
       text therein (as the full string, not a substring).
       A regex will follow the first link it finds whose text the regex
       matches (using its search method, so anchor the pattern if the
       whole text must match).
       @type linkdata: string, int, compiled regex
       @raise LinkNotFoundError: If you specified a link that is not on the
       page (a number below 1 or too high, or a string or regex that
       doesn't match any of the page links).
       @raise LinkNoHrefError: If you specified a link that does not have
       a href attribute (and hence cannot be navigated).
    """
    link = self.__findLink(linkdata)
    href = link.getAttribute('href')
    if not href: raise LinkNoHrefError(link)
    self.get(href)

  def form(self,formdata):
    """Choose a form on the page to which further form manipulations
       (L{field}, L{submit}) will apply. The form's values are reset to
       the defaults given in the page.

       @param formdata: Pass either an integer or a string.
       An integer will select that form on the page (first form
       is 1, not 0!).
       A string will attempt to select a form by name, matching on
       the form's name attribute.

       @type formdata: string, int
       @raise FormNotFoundError: If you specified a form that is not on the
       page (a number below 1 or too high, or a string that doesn't match
       any of the page's forms' name attributes).
    """
    forms = self.__htmldom.getElementsByTagName('form')
    thisform = None
    if isinstance(formdata,int):
      # Form numbers are 1-based; 0 and negatives must not wrap around.
      if 1 <= formdata <= len(forms):
        thisform = forms[formdata-1]
    else:
      for candidate in forms:
        if candidate.getAttribute('name') == formdata:
          thisform = candidate
          break
    if thisform is None:
      raise FormNotFoundError(formdata)
    self.__form = thisform
    self.__setFormDefaultValues(thisform)

  def field(self,fieldname,fieldvalue):
    """Change the value in a form field in the currently selected form.
       Defaults to the first form on the page if no L{form} has been
       specified.

       @param fieldname: The name of the field to alter, matching its
       name attribute.
       @type fieldname: string
       @param fieldvalue: The value to set in the field.
       @type fieldvalue: string
       @raise FieldNotFoundError: If you specified a field that is not in
       the form.
       @raise FormNotFoundError: If no form was selected and the page has
       no forms at all.
    """
    if self.__form is None:
      self.form(1)
    for candidate in self.__formFields(self.__form):
      if candidate.getAttribute('name') == fieldname:
        break
    else:
      raise FieldNotFoundError(fieldname)
    # FIXME check is a valid value for a select
    self.__form.fieldValues[fieldname] = fieldvalue

  def submit(self):
    """Submit the currently selected L{form}, navigating to the result.

       The form's method defaults to GET and its action to the current
       URI when the corresponding attribute is missing.

       @raise NoFormSpecifiedError: If no form is currently selected.
    """
    if self.__form is None: raise NoFormSpecifiedError()
    method = self.__form.getAttribute('method') or 'GET'
    action = self.__form.getAttribute('action') or self.__uri
    self.get(action,method,self.__form.fieldValues)

  def __describeField(self,fl):
    """Return the list of output lines dump_forms shows for one field
       (empty for an element that is not a form field)."""
    kind = fl.nodeName.upper()
    name = fl.getAttribute('name')
    if kind == 'SELECT':
      return ['    Dropdown: %s' % (name or '(no name)',)]
    if kind == 'TEXTAREA':
      return ['    Textarea: %s' % (name or '(no name)',),
              self.__getInnerText(fl) or '(no text)']
    if kind != 'INPUT':
      return []
    # An input without a type attribute is a text box; unknown types are
    # shown under their own (lower-cased) name.
    typ = (fl.getAttribute('type') or 'text').lower()
    parts = ['    ' + self.__INPUT_LABELS.get(typ,typ) + ':',
             name or '(no name):']
    if typ in ('text','password','hidden','image','submit'):
      parts.append(fl.getAttribute('value') or '(no value)')
    elif typ in ('checkbox','radio'):
      parts.append(fl.getAttribute('value') or '(no value)')
      if fl.getAttribute('checked'):
        parts.append('(on)')
      else:
        parts.append('(off)')
    else:
      parts.append('.')
    return [' '.join(parts)]

  def __formLines(self):
    """Return, as a list of strings, the lines dump_forms prints."""
    lines = []
    for f in self.__htmldom.getElementsByTagName('form'):
      lines.append('Form %s' % (f.getAttribute('name') or '(no name)',))
      lines.append('  Action: %s' % (f.getAttribute('action') or self.__uri,))
      lines.append('  Method: %s' % (f.getAttribute('method').upper() or 'GET',))
      for fl in self.__formFields(f):
        lines.extend(self.__describeField(fl))
    return lines

  def dump_forms(self):
    """Dump details of all forms on the page. Useful for debugging."""
    for line in self.__formLines():
      print(line)


  # Getters
  def content(self):
    "Returns a string containing the current data in the page."
    return self.__data
  def uri(self):
    "Returns a string containing the current URI."
    return self.__uri
  def current_form(self):
    """Returns a HTMLFormElement representing the current form, or None
       if no form is selected."""
    return self.__form
  def fieldValues(self):
    """Returns a dictionary containing the values set in the current form.

       @raise NoFormSpecifiedError: If no form is currently selected.
    """
    if self.__form is None: raise NoFormSpecifiedError()
    return self.__form.fieldValues




