diff --git a/mechanize/__init__.py b/mechanize/__init__.py new file mode 100644 index 0000000..c4429be --- /dev/null +++ b/mechanize/__init__.py @@ -0,0 +1,211 @@ +__all__ = [ + 'AbstractBasicAuthHandler', + 'AbstractDigestAuthHandler', + 'BaseHandler', + 'Browser', + 'BrowserStateError', + 'CacheFTPHandler', + 'ContentTooShortError', + 'Cookie', + 'CookieJar', + 'CookiePolicy', + 'DefaultCookiePolicy', + 'DefaultFactory', + 'FTPHandler', + 'Factory', + 'FileCookieJar', + 'FileHandler', + 'FormNotFoundError', + 'FormsFactory', + 'HTTPBasicAuthHandler', + 'HTTPCookieProcessor', + 'HTTPDefaultErrorHandler', + 'HTTPDigestAuthHandler', + 'HTTPEquivProcessor', + 'HTTPError', + 'HTTPErrorProcessor', + 'HTTPHandler', + 'HTTPPasswordMgr', + 'HTTPPasswordMgrWithDefaultRealm', + 'HTTPProxyPasswordMgr', + 'HTTPRedirectDebugProcessor', + 'HTTPRedirectHandler', + 'HTTPRefererProcessor', + 'HTTPRefreshProcessor', + 'HTTPResponseDebugProcessor', + 'HTTPRobotRulesProcessor', + 'HTTPSClientCertMgr', + 'HeadParser', + 'History', + 'LWPCookieJar', + 'Link', + 'LinkNotFoundError', + 'LinksFactory', + 'LoadError', + 'MSIECookieJar', + 'MozillaCookieJar', + 'OpenerDirector', + 'OpenerFactory', + 'ParseError', + 'ProxyBasicAuthHandler', + 'ProxyDigestAuthHandler', + 'ProxyHandler', + 'Request', + 'RobotExclusionError', + 'RobustFactory', + 'RobustFormsFactory', + 'RobustLinksFactory', + 'RobustTitleFactory', + 'SeekableResponseOpener', + 'TitleFactory', + 'URLError', + 'USE_BARE_EXCEPT', + 'UnknownHandler', + 'UserAgent', + 'UserAgentBase', + 'XHTMLCompatibleHeadParser', + '__version__', + 'build_opener', + 'install_opener', + 'lwp_cookie_str', + 'make_response', + 'request_host', + 'response_seek_wrapper', # XXX deprecate in public interface? + 'seek_wrapped_response', # XXX should probably use this internally in place of response_seek_wrapper() + 'str2time', + 'urlopen', + 'urlretrieve', + 'urljoin', + + # ClientForm API + 'AmbiguityError', + 'ControlNotFoundError', + 'FormParser', + 'ItemCountError', + 'ItemNotFoundError', + 'LocateError', + 'Missing', + 'ParseFile', + 'ParseFileEx', + 'ParseResponse', + 'ParseResponseEx', + 'ParseString', + 'XHTMLCompatibleFormParser', + # deprecated + 'CheckboxControl', + 'Control', + 'FileControl', + 'HTMLForm', + 'HiddenControl', + 'IgnoreControl', + 'ImageControl', + 'IsindexControl', + 'Item', + 'Label', + 'ListControl', + 'PasswordControl', + 'RadioControl', + 'ScalarControl', + 'SelectControl', + 'SubmitButtonControl', + 'SubmitControl', + 'TextControl', + 'TextareaControl', + ] + +import logging +import sys + +from _version import __version__ + +# high-level stateful browser-style interface +from _mechanize import \ + Browser, History, \ + BrowserStateError, LinkNotFoundError, FormNotFoundError + +# configurable URL-opener interface +from _useragent import UserAgentBase, UserAgent +from _html import \ + Link, \ + Factory, DefaultFactory, RobustFactory, \ + FormsFactory, LinksFactory, TitleFactory, \ + RobustFormsFactory, RobustLinksFactory, RobustTitleFactory + +# urllib2 work-alike interface. This is a superset of the urllib2 interface. 
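+# A minimal usage sketch of that interface (illustrative only -- the URL
+# is a placeholder, not part of the original module):
+#
+#     import mechanize
+#     response = mechanize.urlopen(mechanize.Request("http://example.com/"))
+#     print response.read()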
+from _urllib2 import * +import _urllib2 +if hasattr(_urllib2, "HTTPSHandler"): + __all__.append("HTTPSHandler") +del _urllib2 + +# misc +from _http import HeadParser +from _http import XHTMLCompatibleHeadParser +from _opener import ContentTooShortError, OpenerFactory, urlretrieve +from _response import \ + response_seek_wrapper, seek_wrapped_response, make_response +from _rfc3986 import urljoin +from _util import http2time as str2time + +# cookies +from _clientcookie import Cookie, CookiePolicy, DefaultCookiePolicy, \ + CookieJar, FileCookieJar, LoadError, request_host_lc as request_host, \ + effective_request_host +from _lwpcookiejar import LWPCookieJar, lwp_cookie_str +# 2.4 raises SyntaxError due to generator / try/finally use +if sys.version_info[:2] > (2,4): + try: + import sqlite3 + except ImportError: + pass + else: + from _firefox3cookiejar import Firefox3CookieJar +from _mozillacookiejar import MozillaCookieJar +from _msiecookiejar import MSIECookieJar + +# forms +from _form import ( + AmbiguityError, + ControlNotFoundError, + FormParser, + ItemCountError, + ItemNotFoundError, + LocateError, + Missing, + ParseError, + ParseFile, + ParseFileEx, + ParseResponse, + ParseResponseEx, + ParseString, + XHTMLCompatibleFormParser, + # deprecated + CheckboxControl, + Control, + FileControl, + HTMLForm, + HiddenControl, + IgnoreControl, + ImageControl, + IsindexControl, + Item, + Label, + ListControl, + PasswordControl, + RadioControl, + ScalarControl, + SelectControl, + SubmitButtonControl, + SubmitControl, + TextControl, + TextareaControl, + ) + +# If you hate the idea of turning bugs into warnings, do: +# import mechanize; mechanize.USE_BARE_EXCEPT = False +USE_BARE_EXCEPT = True + +logger = logging.getLogger("mechanize") +if logger.level is logging.NOTSET: + logger.setLevel(logging.CRITICAL) +del logger diff --git a/mechanize/__init__.pyc b/mechanize/__init__.pyc new file mode 100644 index 0000000..ed78626 Binary files /dev/null and b/mechanize/__init__.pyc differ diff --git a/mechanize/_auth.py b/mechanize/_auth.py new file mode 100644 index 0000000..900e201 --- /dev/null +++ b/mechanize/_auth.py @@ -0,0 +1,68 @@ +"""HTTP Authentication and Proxy support. + + +Copyright 2006 John J. Lee + +This code is free software; you can redistribute it and/or modify it under +the terms of the BSD or ZPL 2.1 licenses (see the file COPYING.txt +included with the distribution). 
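+
+A rough usage sketch (illustrative only; the realm, proxy authority and
+credentials are placeholders):
+
+    mgr = HTTPProxyPasswordMgr()
+    mgr.add_password("Some Realm", "proxy.example.com:3128", "joe", "secret")
+    user, password = mgr.find_user_password("Some Realm",
+                                            "proxy.example.com:3128")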
+ +""" + +from _urllib2_fork import HTTPPasswordMgr + + +# TODO: stop deriving from HTTPPasswordMgr +class HTTPProxyPasswordMgr(HTTPPasswordMgr): + # has default realm and host/port + def add_password(self, realm, uri, user, passwd): + # uri could be a single URI or a sequence + if uri is None or isinstance(uri, basestring): + uris = [uri] + else: + uris = uri + passwd_by_domain = self.passwd.setdefault(realm, {}) + for uri in uris: + for default_port in True, False: + reduced_uri = self.reduce_uri(uri, default_port) + passwd_by_domain[reduced_uri] = (user, passwd) + + def find_user_password(self, realm, authuri): + attempts = [(realm, authuri), (None, authuri)] + # bleh, want default realm to take precedence over default + # URI/authority, hence this outer loop + for default_uri in False, True: + for realm, authuri in attempts: + authinfo_by_domain = self.passwd.get(realm, {}) + for default_port in True, False: + reduced_authuri = self.reduce_uri(authuri, default_port) + for uri, authinfo in authinfo_by_domain.iteritems(): + if uri is None and not default_uri: + continue + if self.is_suburi(uri, reduced_authuri): + return authinfo + user, password = None, None + + if user is not None: + break + return user, password + + def reduce_uri(self, uri, default_port=True): + if uri is None: + return None + return HTTPPasswordMgr.reduce_uri(self, uri, default_port) + + def is_suburi(self, base, test): + if base is None: + # default to the proxy's host/port + hostport, path = test + base = (hostport, "/") + return HTTPPasswordMgr.is_suburi(self, base, test) + + +class HTTPSClientCertMgr(HTTPPasswordMgr): + # implementation inheritance: this is not a proper subclass + def add_key_cert(self, uri, key_file, cert_file): + self.add_password(None, uri, key_file, cert_file) + def find_key_cert(self, authuri): + return HTTPPasswordMgr.find_user_password(self, None, authuri) diff --git a/mechanize/_auth.pyc b/mechanize/_auth.pyc new file mode 100644 index 0000000..6c6e05f Binary files /dev/null and b/mechanize/_auth.pyc differ diff --git a/mechanize/_beautifulsoup.py b/mechanize/_beautifulsoup.py new file mode 100644 index 0000000..0040140 --- /dev/null +++ b/mechanize/_beautifulsoup.py @@ -0,0 +1,1077 @@ +"""Beautiful Soup +Elixir and Tonic +"The Screen-Scraper's Friend" +v2.1.1 +http://www.crummy.com/software/BeautifulSoup/ + +Beautiful Soup parses arbitrarily invalid XML- or HTML-like substance +into a tree representation. It provides methods and Pythonic idioms +that make it easy to search and modify the tree. + +A well-formed XML/HTML document will yield a well-formed data +structure. An ill-formed XML/HTML document will yield a +correspondingly ill-formed data structure. If your document is only +locally well-formed, you can use this library to find and process the +well-formed part of it. The BeautifulSoup class has heuristics for +obtaining a sensible parse tree in the face of common HTML errors. + +Beautiful Soup has no external dependencies. It works with Python 2.2 +and up. + +Beautiful Soup defines classes for four different parsing strategies: + + * BeautifulStoneSoup, for parsing XML, SGML, or your domain-specific + language that kind of looks like XML. + + * BeautifulSoup, for parsing run-of-the-mill HTML code, be it valid + or invalid. + + * ICantBelieveItsBeautifulSoup, for parsing valid but bizarre HTML + that trips up BeautifulSoup. 
+ + * BeautifulSOAP, for making it easier to parse XML documents that use + lots of subelements containing a single string, where you'd prefer + they put that string into an attribute (such as SOAP messages). + +You can subclass BeautifulStoneSoup or BeautifulSoup to create a +parsing strategy specific to an XML schema or a particular bizarre +HTML document. Typically your subclass would just override +SELF_CLOSING_TAGS and/or NESTABLE_TAGS. +""" #" +from __future__ import generators + +__author__ = "Leonard Richardson (leonardr@segfault.org)" +__version__ = "2.1.1" +__date__ = "$Date: 2004/10/18 00:14:20 $" +__copyright__ = "Copyright (c) 2004-2005 Leonard Richardson" +__license__ = "PSF" + +from _sgmllib_copy import SGMLParser, SGMLParseError +import types +import re +import _sgmllib_copy as sgmllib + +class NullType(object): + + """Similar to NoneType with a corresponding singleton instance + 'Null' that, unlike None, accepts any message and returns itself. + + Examples: + >>> Null("send", "a", "message")("and one more", + ... "and what you get still") is Null + True + """ + + def __new__(cls): return Null + def __call__(self, *args, **kwargs): return Null +## def __getstate__(self, *args): return Null + def __getattr__(self, attr): return Null + def __getitem__(self, item): return Null + def __setattr__(self, attr, value): pass + def __setitem__(self, item, value): pass + def __len__(self): return 0 + # FIXME: is this a python bug? otherwise ``for x in Null: pass`` + # never terminates... + def __iter__(self): return iter([]) + def __contains__(self, item): return False + def __repr__(self): return "Null" +Null = object.__new__(NullType) + +class PageElement: + """Contains the navigational information for some part of the page + (either a tag or a piece of text)""" + + def setup(self, parent=Null, previous=Null): + """Sets up the initial relations between this element and + other elements.""" + self.parent = parent + self.previous = previous + self.next = Null + self.previousSibling = Null + self.nextSibling = Null + if self.parent and self.parent.contents: + self.previousSibling = self.parent.contents[-1] + self.previousSibling.nextSibling = self + + def findNext(self, name=None, attrs={}, text=None): + """Returns the first item that matches the given criteria and + appears after this Tag in the document.""" + return self._first(self.fetchNext, name, attrs, text) + firstNext = findNext + + def fetchNext(self, name=None, attrs={}, text=None, limit=None): + """Returns all items that match the given criteria and appear + before after Tag in the document.""" + return self._fetch(name, attrs, text, limit, self.nextGenerator) + + def findNextSibling(self, name=None, attrs={}, text=None): + """Returns the closest sibling to this Tag that matches the + given criteria and appears after this Tag in the document.""" + return self._first(self.fetchNextSiblings, name, attrs, text) + firstNextSibling = findNextSibling + + def fetchNextSiblings(self, name=None, attrs={}, text=None, limit=None): + """Returns the siblings of this Tag that match the given + criteria and appear after this Tag in the document.""" + return self._fetch(name, attrs, text, limit, self.nextSiblingGenerator) + + def findPrevious(self, name=None, attrs={}, text=None): + """Returns the first item that matches the given criteria and + appears before this Tag in the document.""" + return self._first(self.fetchPrevious, name, attrs, text) + + def fetchPrevious(self, name=None, attrs={}, text=None, limit=None): + """Returns all 
items that match the given criteria and appear + before this Tag in the document.""" + return self._fetch(name, attrs, text, limit, self.previousGenerator) + firstPrevious = findPrevious + + def findPreviousSibling(self, name=None, attrs={}, text=None): + """Returns the closest sibling to this Tag that matches the + given criteria and appears before this Tag in the document.""" + return self._first(self.fetchPreviousSiblings, name, attrs, text) + firstPreviousSibling = findPreviousSibling + + def fetchPreviousSiblings(self, name=None, attrs={}, text=None, + limit=None): + """Returns the siblings of this Tag that match the given + criteria and appear before this Tag in the document.""" + return self._fetch(name, attrs, text, limit, + self.previousSiblingGenerator) + + def findParent(self, name=None, attrs={}): + """Returns the closest parent of this Tag that matches the given + criteria.""" + r = Null + l = self.fetchParents(name, attrs, 1) + if l: + r = l[0] + return r + firstParent = findParent + + def fetchParents(self, name=None, attrs={}, limit=None): + """Returns the parents of this Tag that match the given + criteria.""" + return self._fetch(name, attrs, None, limit, self.parentGenerator) + + #These methods do the real heavy lifting. + + def _first(self, method, name, attrs, text): + r = Null + l = method(name, attrs, text, 1) + if l: + r = l[0] + return r + + def _fetch(self, name, attrs, text, limit, generator): + "Iterates over a generator looking for things that match." + if not hasattr(attrs, 'items'): + attrs = {'class' : attrs} + + results = [] + g = generator() + while True: + try: + i = g.next() + except StopIteration: + break + found = None + if isinstance(i, Tag): + if not text: + if not name or self._matches(i, name): + match = True + for attr, matchAgainst in attrs.items(): + check = i.get(attr) + if not self._matches(check, matchAgainst): + match = False + break + if match: + found = i + elif text: + if self._matches(i, text): + found = i + if found: + results.append(found) + if limit and len(results) >= limit: + break + return results + + #Generators that can be used to navigate starting from both + #NavigableTexts and Tags. + def nextGenerator(self): + i = self + while i: + i = i.next + yield i + + def nextSiblingGenerator(self): + i = self + while i: + i = i.nextSibling + yield i + + def previousGenerator(self): + i = self + while i: + i = i.previous + yield i + + def previousSiblingGenerator(self): + i = self + while i: + i = i.previousSibling + yield i + + def parentGenerator(self): + i = self + while i: + i = i.parent + yield i + + def _matches(self, chunk, howToMatch): + #print 'looking for %s in %s' % (howToMatch, chunk) + # + # If given a list of items, return true if the list contains a + # text element that matches. + if isList(chunk) and not isinstance(chunk, Tag): + for tag in chunk: + if isinstance(tag, NavigableText) and self._matches(tag, howToMatch): + return True + return False + if callable(howToMatch): + return howToMatch(chunk) + if isinstance(chunk, Tag): + #Custom match methods take the tag as an argument, but all other + #ways of matching match the tag name as a string + chunk = chunk.name + #Now we know that chunk is a string + if not isinstance(chunk, basestring): + chunk = str(chunk) + if hasattr(howToMatch, 'match'): + # It's a regexp object. 
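+            # (Illustrative note: a caller might pass re.compile("^b") as
+            # the match criterion here; its .search method would then match
+            # tag names such as 'b' and 'body'.)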
+ return howToMatch.search(chunk) + if isList(howToMatch): + return chunk in howToMatch + if hasattr(howToMatch, 'items'): + return howToMatch.has_key(chunk) + #It's just a string + return str(howToMatch) == chunk + +class NavigableText(PageElement): + + def __getattr__(self, attr): + "For backwards compatibility, text.string gives you text" + if attr == 'string': + return self + else: + raise AttributeError, "'%s' object has no attribute '%s'" % (self.__class__.__name__, attr) + +class NavigableString(str, NavigableText): + pass + +class NavigableUnicodeString(unicode, NavigableText): + pass + +class Tag(PageElement): + + """Represents a found HTML tag with its attributes and contents.""" + + def __init__(self, name, attrs=None, parent=Null, previous=Null): + "Basic constructor." + self.name = name + if attrs == None: + attrs = [] + self.attrs = attrs + self.contents = [] + self.setup(parent, previous) + self.hidden = False + + def get(self, key, default=None): + """Returns the value of the 'key' attribute for the tag, or + the value given for 'default' if it doesn't have that + attribute.""" + return self._getAttrMap().get(key, default) + + def __getitem__(self, key): + """tag[key] returns the value of the 'key' attribute for the tag, + and throws an exception if it's not there.""" + return self._getAttrMap()[key] + + def __iter__(self): + "Iterating over a tag iterates over its contents." + return iter(self.contents) + + def __len__(self): + "The length of a tag is the length of its list of contents." + return len(self.contents) + + def __contains__(self, x): + return x in self.contents + + def __nonzero__(self): + "A tag is non-None even if it has no contents." + return True + + def __setitem__(self, key, value): + """Setting tag[key] sets the value of the 'key' attribute for the + tag.""" + self._getAttrMap() + self.attrMap[key] = value + found = False + for i in range(0, len(self.attrs)): + if self.attrs[i][0] == key: + self.attrs[i] = (key, value) + found = True + if not found: + self.attrs.append((key, value)) + self._getAttrMap()[key] = value + + def __delitem__(self, key): + "Deleting tag[key] deletes all 'key' attributes for the tag." + for item in self.attrs: + if item[0] == key: + self.attrs.remove(item) + #We don't break because bad HTML can define the same + #attribute multiple times. + self._getAttrMap() + if self.attrMap.has_key(key): + del self.attrMap[key] + + def __call__(self, *args, **kwargs): + """Calling a tag like a function is the same as calling its + fetch() method. Eg. tag('a') returns a list of all the A tags + found within this tag.""" + return apply(self.fetch, args, kwargs) + + def __getattr__(self, tag): + if len(tag) > 3 and tag.rfind('Tag') == len(tag)-3: + return self.first(tag[:-3]) + elif tag.find('__') != 0: + return self.first(tag) + + def __eq__(self, other): + """Returns true iff this tag has the same name, the same attributes, + and the same contents (recursively) as the given tag. + + NOTE: right now this will return false if two tags have the + same attributes in a different order. 
Should this be fixed?"""
+        if not hasattr(other, 'name') or not hasattr(other, 'attrs') or not hasattr(other, 'contents') or self.name != other.name or self.attrs != other.attrs or len(self) != len(other):
+            return False
+        for i in range(0, len(self.contents)):
+            if self.contents[i] != other.contents[i]:
+                return False
+        return True
+
+    def __ne__(self, other):
+        """Returns true iff this tag is not identical to the other tag,
+        as defined in __eq__."""
+        return not self == other
+
+    def __repr__(self):
+        """Renders this tag as a string."""
+        return str(self)
+
+    def __unicode__(self):
+        return self.__str__(1)
+
+    def __str__(self, needUnicode=None, showStructureIndent=None):
+        """Returns a string or Unicode representation of this tag and
+        its contents.
+
+        NOTE: since Python's HTML parser consumes whitespace, this
+        method is not certain to reproduce the whitespace present in
+        the original string."""
+
+        attrs = []
+        if self.attrs:
+            for key, val in self.attrs:
+                attrs.append('%s="%s"' % (key, val))
+        close = ''
+        closeTag = ''
+        if self.isSelfClosing():
+            close = ' /'
+        else:
+            closeTag = '</%s>' % self.name
+        indentIncrement = None
+        if showStructureIndent != None:
+            indentIncrement = showStructureIndent
+            if not self.hidden:
+                indentIncrement += 1
+        contents = self.renderContents(indentIncrement, needUnicode=needUnicode)
+        if showStructureIndent:
+            space = '\n%s' % (' ' * showStructureIndent)
+        if self.hidden:
+            s = contents
+        else:
+            s = []
+            attributeString = ''
+            if attrs:
+                attributeString = ' ' + ' '.join(attrs)
+            if showStructureIndent:
+                s.append(space)
+            s.append('<%s%s%s>' % (self.name, attributeString, close))
+            s.append(contents)
+            if closeTag and showStructureIndent != None:
+                s.append(space)
+            s.append(closeTag)
+            s = ''.join(s)
+        isUnicode = type(s) == types.UnicodeType
+        if needUnicode and not isUnicode:
+            s = unicode(s)
+        elif isUnicode and needUnicode==False:
+            s = str(s)
+        return s
+
+    def prettify(self, needUnicode=None):
+        return self.__str__(needUnicode, showStructureIndent=True)
+
+    def renderContents(self, showStructureIndent=None, needUnicode=None):
+        """Renders the contents of this tag as a (possibly Unicode)
+        string."""
+        s=[]
+        for c in self:
+            text = None
+            if isinstance(c, NavigableUnicodeString) or type(c) == types.UnicodeType:
+                text = unicode(c)
+            elif isinstance(c, Tag):
+                s.append(c.__str__(needUnicode, showStructureIndent))
+            elif needUnicode:
+                text = unicode(c)
+            else:
+                text = str(c)
+            if text:
+                if showStructureIndent != None:
+                    if text[-1] == '\n':
+                        text = text[:-1]
+                s.append(text)
+        return ''.join(s)
+
+    #Soup methods
+
+    def firstText(self, text, recursive=True):
+        """Convenience method to retrieve the first piece of text matching the
+        given criteria. 'text' can be a string, a regular expression object,
+        a callable that takes a string and returns whether or not the
+        string 'matches', etc."""
+        return self.first(recursive=recursive, text=text)
+
+    def fetchText(self, text, recursive=True, limit=None):
+        """Convenience method to retrieve all pieces of text matching the
+        given criteria.
+        'text' can be a string, a regular expression object,
+        a callable that takes a string and returns whether or not the
+        string 'matches', etc."""
+        return self.fetch(recursive=recursive, text=text, limit=limit)
+
+    def first(self, name=None, attrs={}, recursive=True, text=None):
+        """Return only the first child of this
+        Tag matching the given criteria."""
+        r = Null
+        l = self.fetch(name, attrs, recursive, text, 1)
+        if l:
+            r = l[0]
+        return r
+    findChild = first
+
+    def fetch(self, name=None, attrs={}, recursive=True, text=None,
+              limit=None):
+        """Extracts a list of Tag objects that match the given
+        criteria.  You can specify the name of the Tag and any
+        attributes you want the Tag to have.
+
+        The value of a key-value pair in the 'attrs' map can be a
+        string, a list of strings, a regular expression object, or a
+        callable that takes a string and returns whether or not the
+        string matches for some custom definition of 'matches'. The
+        same is true of the tag name."""
+        generator = self.recursiveChildGenerator
+        if not recursive:
+            generator = self.childGenerator
+        return self._fetch(name, attrs, text, limit, generator)
+    fetchChildren = fetch
+
+    #Utility methods
+
+    def isSelfClosing(self):
+        """Returns true iff this is a self-closing tag as defined in the HTML
+        standard.
+
+        TODO: This is specific to BeautifulSoup and its subclasses, but it's
+        used by __str__"""
+        return self.name in BeautifulSoup.SELF_CLOSING_TAGS
+
+    def append(self, tag):
+        """Appends the given tag to the contents of this tag."""
+        self.contents.append(tag)
+
+    #Private methods
+
+    def _getAttrMap(self):
+        """Initializes a map representation of this tag's attributes,
+        if not already initialized."""
+        if not getattr(self, 'attrMap'):
+            self.attrMap = {}
+            for (key, value) in self.attrs:
+                self.attrMap[key] = value
+        return self.attrMap
+
+    #Generator methods
+    def childGenerator(self):
+        for i in range(0, len(self.contents)):
+            yield self.contents[i]
+        raise StopIteration
+
+    def recursiveChildGenerator(self):
+        stack = [(self, 0)]
+        while stack:
+            tag, start = stack.pop()
+            if isinstance(tag, Tag):
+                for i in range(start, len(tag.contents)):
+                    a = tag.contents[i]
+                    yield a
+                    if isinstance(a, Tag) and tag.contents:
+                        if i < len(tag.contents) - 1:
+                            stack.append((tag, i+1))
+                        stack.append((a, 0))
+                        break
+        raise StopIteration
+
+
+def isList(l):
+    """Convenience method that works with all 2.x versions of Python
+    to determine whether or not something is listlike."""
+    return hasattr(l, '__iter__') \
+           or (type(l) in (types.ListType, types.TupleType))
+
+def buildTagMap(default, *args):
+    """Turns a list of maps, lists, or scalars into a single map.
+    Used to build the SELF_CLOSING_TAGS and NESTABLE_TAGS maps out
+    of lists and partial maps."""
+    built = {}
+    for portion in args:
+        if hasattr(portion, 'items'):
+            #It's a map. Merge it.
+            for k,v in portion.items():
+                built[k] = v
+        elif isList(portion):
+            #It's a list. Map each item to the default.
+            for k in portion:
+                built[k] = default
+        else:
+            #It's a scalar. Map it to the default.
+            built[portion] = default
+    return built
+
+class BeautifulStoneSoup(Tag, SGMLParser):
+
+    """This class contains the basic parser and fetch code. It defines
+    a parser that knows nothing about tag behavior except for the
+    following:
+
+      You can't close a tag without closing all the tags it encloses.
+      That is, "<foo><bar></foo>" actually means
+      "<foo><bar></bar></foo>".
+
+      [Another possible explanation is "<foo><bar /></foo>", but since
+      this class defines no SELF_CLOSING_TAGS, it will never use that
+      explanation.]
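+
+      For example, a doctest-style sketch of the behaviour just described
+      (illustrative, not part of the original docstring):
+
+      >>> str(BeautifulStoneSoup("<foo><bar></foo>"))
+      '<foo><bar></bar></foo>'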
+
+    This class is useful for parsing XML or made-up markup languages,
+    or when BeautifulSoup makes an assumption counter to what you were
+    expecting."""
+
+    SELF_CLOSING_TAGS = {}
+    NESTABLE_TAGS = {}
+    RESET_NESTING_TAGS = {}
+    QUOTE_TAGS = {}
+
+    #As a public service we will by default silently replace MS smart quotes
+    #and similar characters with their HTML or ASCII equivalents.
+    MS_CHARS = { '\x80' : '€',
+                 '\x81' : ' ',
+                 '\x82' : '‚',
+                 '\x83' : 'ƒ',
+                 '\x84' : '„',
+                 '\x85' : '…',
+                 '\x86' : '†',
+                 '\x87' : '‡',
+                 '\x88' : '⁁',
+                 '\x89' : '%',
+                 '\x8A' : 'Š',
+                 '\x8B' : '<',
+                 '\x8C' : 'Œ',
+                 '\x8D' : '?',
+                 '\x8E' : 'Z',
+                 '\x8F' : '?',
+                 '\x90' : '?',
+                 '\x91' : '‘',
+                 '\x92' : '’',
+                 '\x93' : '“',
+                 '\x94' : '”',
+                 '\x95' : '•',
+                 '\x96' : '–',
+                 '\x97' : '—',
+                 '\x98' : '˜',
+                 '\x99' : '™',
+                 '\x9a' : 'š',
+                 '\x9b' : '>',
+                 '\x9c' : 'œ',
+                 '\x9d' : '?',
+                 '\x9e' : 'z',
+                 '\x9f' : 'Ÿ',}
+
+    PARSER_MASSAGE = [(re.compile('(<[^<>]*)/>'),
+                       lambda(x):x.group(1) + ' />'),
+                      (re.compile('<!\s+([^<>]*)>'),
+                       lambda(x):'<!' + x.group(1) + '>'),
+                      (re.compile("([\x80-\x9f])"),
+                       lambda(x): BeautifulStoneSoup.MS_CHARS.get(x.group(1)))
+                      ]
+
+    ROOT_TAG_NAME = '[document]'
+
+    def __init__(self, text=None, avoidParserProblems=True,
+                 initialTextIsEverything=True):
+        """Initialize this as the 'root tag' and feed in any text to
+        the parser.
+
+        NOTE about avoidParserProblems: sgmllib will process most bad
+        HTML, and BeautifulSoup has tricks for dealing with some HTML
+        that kills sgmllib, but Beautiful Soup can nonetheless choke
+        or lose data if your data uses self-closing tags or
+        declarations incorrectly. By default, Beautiful Soup sanitizes
+        its input to avoid the vast majority of these problems. The
+        problems are relatively rare, even in bad HTML, so feel free
+        to pass in False to avoidParserProblems if they don't apply to
+        you, and you'll get better performance. The only reason I have
+        this turned on by default is so I don't get so many tech
+        support questions.
+
+        The two most common instances of invalid HTML that will choke
+        sgmllib are fixed by the default parser massage techniques:
+
+         <br/> (No space between name of closing tag and tag close)
+         <! --Comment--> (Extraneous whitespace in declaration)
+
+        You can pass in a custom list of (RE object, replace method)
+        tuples to get Beautiful Soup to scrub your input the way you
+        want."""
+        Tag.__init__(self, self.ROOT_TAG_NAME)
+        if avoidParserProblems \
+           and not isList(avoidParserProblems):
+            avoidParserProblems = self.PARSER_MASSAGE
+        self.avoidParserProblems = avoidParserProblems
+        SGMLParser.__init__(self)
+        self.quoteStack = []
+        self.hidden = 1
+        self.reset()
+        if hasattr(text, 'read'):
+            #It's a file-type object.
+            text = text.read()
+        if text:
+            self.feed(text)
+            if initialTextIsEverything:
+                self.done()
+
+    def __getattr__(self, methodName):
+        """This method routes method call requests to either the SGMLParser
+        superclass or the Tag superclass, depending on the method name."""
+        if methodName.find('start_') == 0 or methodName.find('end_') == 0 \
+               or methodName.find('do_') == 0:
+            return SGMLParser.__getattr__(self, methodName)
+        elif methodName.find('__') != 0:
+            return Tag.__getattr__(self, methodName)
+        else:
+            raise AttributeError
+
+    def feed(self, text):
+        if self.avoidParserProblems:
+            for fix, m in self.avoidParserProblems:
+                text = fix.sub(m, text)
+        SGMLParser.feed(self, text)
+
+    def done(self):
+        """Called when you're done parsing, so that the unclosed tags can be
+        correctly processed."""
+        self.endData() #NEW
+        while self.currentTag.name != self.ROOT_TAG_NAME:
+            self.popTag()
+
+    def reset(self):
+        SGMLParser.reset(self)
+        self.currentData = []
+        self.currentTag = None
+        self.tagStack = []
+        self.pushTag(self)
+
+    def popTag(self):
+        tag = self.tagStack.pop()
+        # Tags with just one string-owning child get the child as a
+        # 'string' property, so that soup.tag.string is shorthand for
+        # soup.tag.contents[0]
+        if len(self.currentTag.contents) == 1 and \
+           isinstance(self.currentTag.contents[0], NavigableText):
+            self.currentTag.string = self.currentTag.contents[0]
+
+        #print "Pop", tag.name
+        if self.tagStack:
+            self.currentTag = self.tagStack[-1]
+        return self.currentTag
+
+    def pushTag(self, tag):
+        #print "Push", tag.name
+        if self.currentTag:
+            self.currentTag.append(tag)
+        self.tagStack.append(tag)
+        self.currentTag = self.tagStack[-1]
+
+    def endData(self):
+        currentData = ''.join(self.currentData)
+        if currentData:
+            if not currentData.strip():
+                if '\n' in currentData:
+                    currentData = '\n'
+                else:
+                    currentData = ' '
+            c = NavigableString
+            if type(currentData) == types.UnicodeType:
+                c = NavigableUnicodeString
+            o = c(currentData)
+            o.setup(self.currentTag, self.previous)
+            if self.previous:
+                self.previous.next = o
+            self.previous = o
+            self.currentTag.contents.append(o)
+        self.currentData = []
+
+    def _popToTag(self, name, inclusivePop=True):
+        """Pops the tag stack up to and including the most recent
+        instance of the given tag.
+        If inclusivePop is false, pops the tag
+        stack up to but *not* including the most recent instance of
+        the given tag."""
+        if name == self.ROOT_TAG_NAME:
+            return
+
+        numPops = 0
+        mostRecentTag = None
+        for i in range(len(self.tagStack)-1, 0, -1):
+            if name == self.tagStack[i].name:
+                numPops = len(self.tagStack)-i
+                break
+        if not inclusivePop:
+            numPops = numPops - 1
+
+        for i in range(0, numPops):
+            mostRecentTag = self.popTag()
+        return mostRecentTag
+
+    def _smartPop(self, name):
+
+        """We need to pop up to the previous tag of this type, unless
+        one of this tag's nesting reset triggers comes between this
+        tag and the previous tag of this type, OR unless this tag is a
+        generic nesting trigger and another generic nesting trigger
+        comes between this tag and the previous tag of this type.
+
+        Examples:
+         <p>Foo<b>Bar<p> should pop to 'p', not 'b'.
+         <p>Foo<table>Bar<p> should pop to 'table', not 'p'.
+         <p>Foo<table><tr>Bar<p> should pop to 'tr', not 'p'.
+         <p>Foo<b>Bar<p> should pop to 'p', not 'b'.
+
+         <li><ul><li> *<li>* should pop to 'ul', not the first 'li'.
+         <tr><table><tr> *<tr>* should pop to 'table', not the first 'tr'
+         <td><tr><td> *<td>* should pop to 'tr', not the first 'td'
+        """
+
+        nestingResetTriggers = self.NESTABLE_TAGS.get(name)
+        isNestable = nestingResetTriggers != None
+        isResetNesting = self.RESET_NESTING_TAGS.has_key(name)
+        popTo = None
+        inclusive = True
+        for i in range(len(self.tagStack)-1, 0, -1):
+            p = self.tagStack[i]
+            if (not p or p.name == name) and not isNestable:
+                #Non-nestable tags get popped to the top or to their
+                #last occurrence.
+                popTo = name
+                break
+            if (nestingResetTriggers != None
+                and p.name in nestingResetTriggers) \
+                or (nestingResetTriggers == None and isResetNesting
+                    and self.RESET_NESTING_TAGS.has_key(p.name)):
+
+                #If we encounter one of the nesting reset triggers
+                #peculiar to this tag, or we encounter another tag
+                #that causes nesting to reset, pop up to but not
+                #including that tag.
+
+                popTo = p.name
+                inclusive = False
+                break
+            p = p.parent
+        if popTo:
+            self._popToTag(popTo, inclusive)
+
+    def unknown_starttag(self, name, attrs, selfClosing=0):
+        #print "Start tag %s" % name
+        if self.quoteStack:
+            #This is not a real tag.
+            #print "<%s> is not real!" % name
+            attrs = ''.join(map(lambda(x, y): ' %s="%s"' % (x, y), attrs))
+            self.handle_data('<%s%s>' % (name, attrs))
+            return
+        self.endData()
+        if not name in self.SELF_CLOSING_TAGS and not selfClosing:
+            self._smartPop(name)
+        tag = Tag(name, attrs, self.currentTag, self.previous)
+        if self.previous:
+            self.previous.next = tag
+        self.previous = tag
+        self.pushTag(tag)
+        if selfClosing or name in self.SELF_CLOSING_TAGS:
+            self.popTag()
+        if name in self.QUOTE_TAGS:
+            #print "Beginning quote (%s)" % name
+            self.quoteStack.append(name)
+            self.literal = 1
+
+    def unknown_endtag(self, name):
+        if self.quoteStack and self.quoteStack[-1] != name:
+            #This is not a real end tag.
+            #print "</%s> is not real!" % name
+            self.handle_data('</%s>' % name)
+            return
+        self.endData()
+        self._popToTag(name)
+        if self.quoteStack and self.quoteStack[-1] == name:
+            self.quoteStack.pop()
+            self.literal = (len(self.quoteStack) > 0)
+
+    def handle_data(self, data):
+        self.currentData.append(data)
+
+    def handle_pi(self, text):
+        "Propagate processing instructions right through."
+        self.handle_data("<?%s?>" % text)
+
+    def handle_comment(self, text):
+        "Propagate comments right through."
+        self.handle_data("<!--%s-->" % text)
+
+    def handle_charref(self, ref):
+        "Propagate char refs right through."
+        self.handle_data('&#%s;' % ref)
+
+    def handle_entityref(self, ref):
+        "Propagate entity refs right through."
+        self.handle_data('&%s;' % ref)
+
+    def handle_decl(self, data):
+        "Propagate DOCTYPEs and the like right through."
+        self.handle_data('<!%s>' % data)
+
+    def parse_declaration(self, i):
+        """Treat a bogus SGML declaration as raw data. Treat a CDATA
+        declaration as regular data."""
+        j = None
+        if self.rawdata[i:i+9] == '<![CDATA[':
+            k = self.rawdata.find(']]>', i)
+            if k == -1:
+                k = len(self.rawdata)
+            self.handle_data(self.rawdata[i+9:k])
+            j = k+3
+        else:
+            try:
+                j = SGMLParser.parse_declaration(self, i)
+            except SGMLParseError:
+                toHandle = self.rawdata[i:]
+                self.handle_data(toHandle)
+                j = i + len(toHandle)
+        return j
+
+class BeautifulSoup(BeautifulStoneSoup):
+
+    """This parser knows the following facts about HTML:
+
+    * Some tags have no closing tag and should be interpreted as being
+      closed as soon as they are encountered.
+
+    * The text inside some tags (ie. 'script') may contain tags which
+      are not really part of the document and which should be parsed
+      as text, not tags. If you want to parse the text as tags, you can
+      always fetch it and parse it explicitly.
+
+    * Tag nesting rules:
+
+      Most tags can't be nested at all. For instance, the occurrence of
+      a <p> tag should implicitly close the previous <p> tag.
+
+       <p>Para1<p>Para2
+        should be transformed into:
+       <p>Para1</p><p>Para2
+
+      Some tags can be nested arbitrarily. For instance, the occurrence
+      of a <blockquote> tag should _not_ implicitly close the previous
+      <blockquote> tag.
+
+       Alice said: <blockquote>Bob said: <blockquote>Blah
+        should NOT be transformed into:
+       Alice said: <blockquote>Bob said: </blockquote><blockquote>Blah
+
+      Some tags can be nested, but the nesting is reset by the
+      interposition of other tags. For instance, a <tr> tag should
+      implicitly close the previous <tr> tag within the same <table>,
+      but not close a <tr> tag in another table.
+
+       <table><tr>Blah<tr>Blah
+        should be transformed into:
+       <table><tr>Blah</tr><tr>Blah
+       but,
+       <tr>Blah<table><tr>Blah
+        should NOT be transformed into
+       <tr>Blah<table></tr><tr>Blah
+
+      Differing assumptions about tag nesting rules are a major source
+      of problems with the BeautifulSoup class. If BeautifulSoup is not
+      treating as nestable a tag your page author treats as nestable,
+      try ICantBelieveItsBeautifulSoup before writing your own
+      subclass."""
+
+    SELF_CLOSING_TAGS = buildTagMap(None, ['br' , 'hr', 'input', 'img', 'meta',
+                                           'spacer', 'link', 'frame', 'base'])
+
+    QUOTE_TAGS = {'script': None}
+
+    #According to the HTML standard, each of these inline tags can
+    #contain another tag of the same type. Furthermore, it's common
+    #to actually use these tags this way.
+    NESTABLE_INLINE_TAGS = ['span', 'font', 'q', 'object', 'bdo', 'sub', 'sup',
+                            'center']
+
+    #According to the HTML standard, these block tags can contain
+    #another tag of the same type. Furthermore, it's common
+    #to actually use these tags this way.
+    NESTABLE_BLOCK_TAGS = ['blockquote', 'div', 'fieldset', 'ins', 'del']
+
+    #Lists can contain other lists, but there are restrictions.
+    NESTABLE_LIST_TAGS = { 'ol' : [],
+                           'ul' : [],
+                           'li' : ['ul', 'ol'],
+                           'dl' : [],
+                           'dd' : ['dl'],
+                           'dt' : ['dl'] }
+
+    #Tables can contain other tables, but there are restrictions.
+    NESTABLE_TABLE_TAGS = {'table' : [],
+                           'tr' : ['table', 'tbody', 'tfoot', 'thead'],
+                           'td' : ['tr'],
+                           'th' : ['tr'],
+                           }
+
+    NON_NESTABLE_BLOCK_TAGS = ['address', 'form', 'p', 'pre']
+
+    #If one of these tags is encountered, all tags up to the next tag of
+    #this type are popped.
+    RESET_NESTING_TAGS = buildTagMap(None, NESTABLE_BLOCK_TAGS, 'noscript',
+                                     NON_NESTABLE_BLOCK_TAGS,
+                                     NESTABLE_LIST_TAGS,
+                                     NESTABLE_TABLE_TAGS)
+
+    NESTABLE_TAGS = buildTagMap([], NESTABLE_INLINE_TAGS, NESTABLE_BLOCK_TAGS,
+                                NESTABLE_LIST_TAGS, NESTABLE_TABLE_TAGS)
+
+class ICantBelieveItsBeautifulSoup(BeautifulSoup):
+
+    """The BeautifulSoup class is oriented towards skipping over
+    common HTML errors like unclosed tags. However, sometimes it makes
+    errors of its own. For instance, consider this fragment:
+
+     <b>Foo<b>Bar</b></b>
+
+    This is perfectly valid (if bizarre) HTML. However, the
+    BeautifulSoup class will implicitly close the first b tag when it
+    encounters the second 'b'. It will think the author wrote
+    "<b>Foo<b>Bar", and didn't close the first 'b' tag, because
+    there's no real-world reason to bold something that's already
+    bold. When it encounters '</b></b>' it will close two more 'b'
+    tags, for a grand total of three tags closed instead of two. This
+    can throw off the rest of your document structure. The same is
+    true of a number of other tags, listed below.
+
+    It's much more common for someone to forget to close (eg.) a 'b'
+    tag than to actually use nested 'b' tags, and the BeautifulSoup
+    class handles the common case. This class handles the
+    not-so-common case: where you can't believe someone wrote what
+    they did, but it's valid HTML and BeautifulSoup screwed up by
+    assuming it wouldn't be.
+
+    If this doesn't do what you need, try subclassing this class or
+    BeautifulSoup, and providing your own list of NESTABLE_TAGS."""
+
+    I_CANT_BELIEVE_THEYRE_NESTABLE_INLINE_TAGS = \
+     ['em', 'big', 'i', 'small', 'tt', 'abbr', 'acronym', 'strong',
+      'cite', 'code', 'dfn', 'kbd', 'samp', 'strong', 'var', 'b',
+      'big']
+
+    I_CANT_BELIEVE_THEYRE_NESTABLE_BLOCK_TAGS = ['noscript']
+
+    NESTABLE_TAGS = buildTagMap([], BeautifulSoup.NESTABLE_TAGS,
+                                I_CANT_BELIEVE_THEYRE_NESTABLE_BLOCK_TAGS,
+                                I_CANT_BELIEVE_THEYRE_NESTABLE_INLINE_TAGS)
+
+class BeautifulSOAP(BeautifulStoneSoup):
+    """This class will push a tag with only a single string child into
+    the tag's parent as an attribute. The attribute's name is the tag
+    name, and the value is the string child. An example should give
+    the flavor of the change:
+
+    <foo><bar>baz</bar></foo>
+     =>
+    <foo bar="baz"><bar>baz</bar></foo>
+
+    You can then access fooTag['bar'] instead of fooTag.barTag.string.
+
+    This is, of course, useful for scraping structures that tend to
+    use subelements instead of attributes, such as SOAP messages. Note
+    that it modifies its input, so don't print the modified version
+    out.
+
+    I'm not sure how many people really want to use this class; let me
+    know if you do. Mainly I like the name."""
+
+    def popTag(self):
+        if len(self.tagStack) > 1:
+            tag = self.tagStack[-1]
+            parent = self.tagStack[-2]
+            parent._getAttrMap()
+            if (isinstance(tag, Tag) and len(tag.contents) == 1 and
+                isinstance(tag.contents[0], NavigableText) and
+                not parent.attrMap.has_key(tag.name)):
+                parent[tag.name] = tag.contents[0]
+        BeautifulStoneSoup.popTag(self)
+
+#Enterprise class names! It has come to our attention that some people
+#think the names of the Beautiful Soup parser classes are too silly
+#and "unprofessional" for use in enterprise screen-scraping. We feel
+#your pain! For such-minded folk, the Beautiful Soup Consortium And
+#All-Night Kosher Bakery recommends renaming this file to
+#"RobustParser.py" (or, in cases of extreme enterprisitude,
+#"RobustParserBeanInterface.class") and using the following
+#enterprise-friendly class aliases:
+class RobustXMLParser(BeautifulStoneSoup):
+    pass
+class RobustHTMLParser(BeautifulSoup):
+    pass
+class RobustWackAssHTMLParser(ICantBelieveItsBeautifulSoup):
+    pass
+class SimplifyingSOAPParser(BeautifulSOAP):
+    pass
+
+###
+
+
+#By default, act as an HTML pretty-printer.
+if __name__ == '__main__':
+    import sys
+    soup = BeautifulStoneSoup(sys.stdin.read())
+    print soup.prettify()
diff --git a/mechanize/_beautifulsoup.pyc b/mechanize/_beautifulsoup.pyc
new file mode 100644
index 0000000..e5095b8
Binary files /dev/null and b/mechanize/_beautifulsoup.pyc differ
diff --git a/mechanize/_clientcookie.py b/mechanize/_clientcookie.py
new file mode 100644
index 0000000..2ed4c87
--- /dev/null
+++ b/mechanize/_clientcookie.py
@@ -0,0 +1,1725 @@
+"""HTTP cookie handling for web clients.
+
+This module originally developed from my port of Gisle Aas' Perl module
+HTTP::Cookies, from the libwww-perl library.
+
+Docstrings, comments and debug strings in this code refer to the
+attributes of the HTTP cookie system as cookie-attributes, to distinguish
+them clearly from Python attributes.
+
+                        CookieJar____
+                        /     \      \
+            FileCookieJar      \      \
+             /    |   \         \      \
+ MozillaCookieJar | LWPCookieJar \      \
+                  |               |      \
+                  |   ---MSIEBase |       \
+                  |  /      |     |        \
+                  | /   MSIEDBCookieJar BSDDBCookieJar
+                  |/
+               MSIECookieJar
+
+Comments to John J Lee <jjl@pobox.com>.
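+
+For instance (an illustrative sketch -- the filename is a placeholder),
+persistent cookies are typically managed with one of the FileCookieJar
+subclasses shown above:
+
+    jar = LWPCookieJar("cookies.txt")
+    # ... use the jar with an opener, then write it out:
+    jar.save(ignore_discard=False, ignore_expires=False)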
+ + +Copyright 2002-2006 John J Lee +Copyright 1997-1999 Gisle Aas (original libwww-perl code) +Copyright 2002-2003 Johnny Lee (original MSIE Perl code) + +This code is free software; you can redistribute it and/or modify it +under the terms of the BSD or ZPL 2.1 licenses (see the file +COPYING.txt included with the distribution). + +""" + +import sys, re, copy, time, urllib, types, logging +try: + import threading + _threading = threading; del threading +except ImportError: + import dummy_threading + _threading = dummy_threading; del dummy_threading + +MISSING_FILENAME_TEXT = ("a filename was not supplied (nor was the CookieJar " + "instance initialised with one)") +DEFAULT_HTTP_PORT = "80" + +from _headersutil import split_header_words, parse_ns_headers +from _util import isstringlike +import _rfc3986 + +debug = logging.getLogger("mechanize.cookies").debug + + +def reraise_unmasked_exceptions(unmasked=()): + # There are a few catch-all except: statements in this module, for + # catching input that's bad in unexpected ways. + # This function re-raises some exceptions we don't want to trap. + import mechanize, warnings + if not mechanize.USE_BARE_EXCEPT: + raise + unmasked = unmasked + (KeyboardInterrupt, SystemExit, MemoryError) + etype = sys.exc_info()[0] + if issubclass(etype, unmasked): + raise + # swallowed an exception + import traceback, StringIO + f = StringIO.StringIO() + traceback.print_exc(None, f) + msg = f.getvalue() + warnings.warn("mechanize bug!\n%s" % msg, stacklevel=2) + + +IPV4_RE = re.compile(r"\.\d+$") +def is_HDN(text): + """Return True if text is a host domain name.""" + # XXX + # This may well be wrong. Which RFC is HDN defined in, if any (for + # the purposes of RFC 2965)? + # For the current implementation, what about IPv6? Remember to look + # at other uses of IPV4_RE also, if change this. + return not (IPV4_RE.search(text) or + text == "" or + text[0] == "." or text[-1] == ".") + +def domain_match(A, B): + """Return True if domain A domain-matches domain B, according to RFC 2965. + + A and B may be host domain names or IP addresses. + + RFC 2965, section 1: + + Host names can be specified either as an IP address or a HDN string. + Sometimes we compare one host name with another. (Such comparisons SHALL + be case-insensitive.) Host A's name domain-matches host B's if + + * their host name strings string-compare equal; or + + * A is a HDN string and has the form NB, where N is a non-empty + name string, B has the form .B', and B' is a HDN string. (So, + x.y.com domain-matches .Y.com but not Y.com.) + + Note that domain-match is not a commutative operation: a.b.c.com + domain-matches .c.com, but not the reverse. + + """ + # Note that, if A or B are IP addresses, the only relevant part of the + # definition of the domain-match algorithm is the direct string-compare. + A = A.lower() + B = B.lower() + if A == B: + return True + if not is_HDN(A): + return False + i = A.rfind(B) + has_form_nb = not (i == -1 or i == 0) + return ( + has_form_nb and + B.startswith(".") and + is_HDN(B[1:]) + ) + +def liberal_is_HDN(text): + """Return True if text is a sort-of-like a host domain name. + + For accepting/blocking domains. + + """ + return not IPV4_RE.search(text) + +def user_domain_match(A, B): + """For blocking/accepting domains. + + A and B may be host domain names or IP addresses. 
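+
+    For instance (illustrative values, following the rules this function
+    implements):
+
+    >>> user_domain_match("www.acme.com", ".acme.com")
+    True
+    >>> user_domain_match("192.168.1.2", "192.168.1.2")
+    True
+    >>> user_domain_match("193.168.1.2", ".168.1.2")
+    False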
+ + """ + A = A.lower() + B = B.lower() + if not (liberal_is_HDN(A) and liberal_is_HDN(B)): + if A == B: + # equal IP addresses + return True + return False + initial_dot = B.startswith(".") + if initial_dot and A.endswith(B): + return True + if not initial_dot and A == B: + return True + return False + +cut_port_re = re.compile(r":\d+$") +def request_host(request): + """Return request-host, as defined by RFC 2965. + + Variation from RFC: returned value is lowercased, for convenient + comparison. + + """ + url = request.get_full_url() + host = _rfc3986.urlsplit(url)[1] + if host is None: + host = request.get_header("Host", "") + # remove port, if present + return cut_port_re.sub("", host, 1) + +def request_host_lc(request): + return request_host(request).lower() + +def eff_request_host(request): + """Return a tuple (request-host, effective request-host name).""" + erhn = req_host = request_host(request) + if req_host.find(".") == -1 and not IPV4_RE.search(req_host): + erhn = req_host + ".local" + return req_host, erhn + +def eff_request_host_lc(request): + req_host, erhn = eff_request_host(request) + return req_host.lower(), erhn.lower() + +def effective_request_host(request): + """Return the effective request-host, as defined by RFC 2965.""" + return eff_request_host(request)[1] + +def request_path(request): + """Return path component of request-URI, as defined by RFC 2965.""" + url = request.get_full_url() + path = escape_path(_rfc3986.urlsplit(url)[2]) + if not path.startswith("/"): + path = "/" + path + return path + +def request_port(request): + host = request.get_host() + i = host.find(':') + if i >= 0: + port = host[i+1:] + try: + int(port) + except ValueError: + debug("nonnumeric port: '%s'", port) + return None + else: + port = DEFAULT_HTTP_PORT + return port + +def request_is_unverifiable(request): + try: + return request.is_unverifiable() + except AttributeError: + if hasattr(request, "unverifiable"): + return request.unverifiable + else: + raise + +# Characters in addition to A-Z, a-z, 0-9, '_', '.', and '-' that don't +# need to be escaped to form a valid HTTP URL (RFCs 2396 and 1738). +HTTP_PATH_SAFE = "%/;:@&=+$,!~*'()" +ESCAPED_CHAR_RE = re.compile(r"%([0-9a-fA-F][0-9a-fA-F])") +def uppercase_escaped_char(match): + return "%%%s" % match.group(1).upper() +def escape_path(path): + """Escape any invalid characters in HTTP URL, and uppercase all escapes.""" + # There's no knowing what character encoding was used to create URLs + # containing %-escapes, but since we have to pick one to escape invalid + # path characters, we pick UTF-8, as recommended in the HTML 4.0 + # specification: + # http://www.w3.org/TR/REC-html40/appendix/notes.html#h-B.2.1 + # And here, kind of: draft-fielding-uri-rfc2396bis-03 + # (And in draft IRI specification: draft-duerst-iri-05) + # (And here, for new URI schemes: RFC 2718) + if isinstance(path, types.UnicodeType): + path = path.encode("utf-8") + path = urllib.quote(path, HTTP_PATH_SAFE) + path = ESCAPED_CHAR_RE.sub(uppercase_escaped_char, path) + return path + +def reach(h): + """Return reach of host h, as defined by RFC 2965, section 1. + + The reach R of a host name H is defined as follows: + + * If + + - H is the host domain name of a host; and, + + - H has the form A.B; and + + - A has no embedded (that is, interior) dots; and + + - B has at least one embedded dot, or B is the string "local". + then the reach of H is .B. + + * Otherwise, the reach of H is H. 
+ + >>> reach("www.acme.com") + '.acme.com' + >>> reach("acme.com") + 'acme.com' + >>> reach("acme.local") + '.local' + + """ + i = h.find(".") + if i >= 0: + #a = h[:i] # this line is only here to show what a is + b = h[i+1:] + i = b.find(".") + if is_HDN(h) and (i >= 0 or b == "local"): + return "."+b + return h + +def is_third_party(request): + """ + + RFC 2965, section 3.3.6: + + An unverifiable transaction is to a third-party host if its request- + host U does not domain-match the reach R of the request-host O in the + origin transaction. + + """ + req_host = request_host_lc(request) + # the origin request's request-host was stuffed into request by + # _urllib2_support.AbstractHTTPHandler + return not domain_match(req_host, reach(request.origin_req_host)) + + +try: + all +except NameError: + # python 2.4 + def all(iterable): + for x in iterable: + if not x: + return False + return True + + +class Cookie: + """HTTP Cookie. + + This class represents both Netscape and RFC 2965 cookies. + + This is deliberately a very simple class. It just holds attributes. It's + possible to construct Cookie instances that don't comply with the cookie + standards. CookieJar.make_cookies is the factory function for Cookie + objects -- it deals with cookie parsing, supplying defaults, and + normalising to the representation used in this class. CookiePolicy is + responsible for checking them to see whether they should be accepted from + and returned to the server. + + version: integer; + name: string; + value: string (may be None); + port: string; None indicates no attribute was supplied (e.g. "Port", rather + than eg. "Port=80"); otherwise, a port string (eg. "80") or a port list + string (e.g. "80,8080") + port_specified: boolean; true if a value was supplied with the Port + cookie-attribute + domain: string; + domain_specified: boolean; true if Domain was explicitly set + domain_initial_dot: boolean; true if Domain as set in HTTP header by server + started with a dot (yes, this really is necessary!) + path: string; + path_specified: boolean; true if Path was explicitly set + secure: boolean; true if should only be returned over secure connection + expires: integer; seconds since epoch (RFC 2965 cookies should calculate + this value from the Max-Age attribute) + discard: boolean, true if this is a session cookie; (if no expires value, + this should be true) + comment: string; + comment_url: string; + rfc2109: boolean; true if cookie arrived in a Set-Cookie: (not + Set-Cookie2:) header, but had a version cookie-attribute of 1 + rest: mapping of other cookie-attributes + + Note that the port may be present in the headers, but unspecified ("Port" + rather than"Port=80", for example); if this is the case, port is None. 
+ + """ + + + _attrs = ("version", "name", "value", + "port", "port_specified", + "domain", "domain_specified", "domain_initial_dot", + "path", "path_specified", + "secure", "expires", "discard", "comment", "comment_url", + "rfc2109", "_rest") + + def __init__(self, version, name, value, + port, port_specified, + domain, domain_specified, domain_initial_dot, + path, path_specified, + secure, + expires, + discard, + comment, + comment_url, + rest, + rfc2109=False, + ): + + if version is not None: version = int(version) + if expires is not None: expires = int(expires) + if port is None and port_specified is True: + raise ValueError("if port is None, port_specified must be false") + + self.version = version + self.name = name + self.value = value + self.port = port + self.port_specified = port_specified + # normalise case, as per RFC 2965 section 3.3.3 + self.domain = domain.lower() + self.domain_specified = domain_specified + # Sigh. We need to know whether the domain given in the + # cookie-attribute had an initial dot, in order to follow RFC 2965 + # (as clarified in draft errata). Needed for the returned $Domain + # value. + self.domain_initial_dot = domain_initial_dot + self.path = path + self.path_specified = path_specified + self.secure = secure + self.expires = expires + self.discard = discard + self.comment = comment + self.comment_url = comment_url + self.rfc2109 = rfc2109 + + self._rest = copy.copy(rest) + + def has_nonstandard_attr(self, name): + return self._rest.has_key(name) + def get_nonstandard_attr(self, name, default=None): + return self._rest.get(name, default) + def set_nonstandard_attr(self, name, value): + self._rest[name] = value + def nonstandard_attr_keys(self): + return self._rest.keys() + + def is_expired(self, now=None): + if now is None: now = time.time() + return (self.expires is not None) and (self.expires <= now) + + def __eq__(self, other): + return all(getattr(self, a) == getattr(other, a) for a in self._attrs) + + def __ne__(self, other): + return not (self == other) + + def __str__(self): + if self.port is None: p = "" + else: p = ":"+self.port + limit = self.domain + p + self.path + if self.value is not None: + namevalue = "%s=%s" % (self.name, self.value) + else: + namevalue = self.name + return "" % (namevalue, limit) + + def __repr__(self): + args = [] + for name in ["version", "name", "value", + "port", "port_specified", + "domain", "domain_specified", "domain_initial_dot", + "path", "path_specified", + "secure", "expires", "discard", "comment", "comment_url", + ]: + attr = getattr(self, name) + args.append("%s=%s" % (name, repr(attr))) + args.append("rest=%s" % repr(self._rest)) + args.append("rfc2109=%s" % repr(self.rfc2109)) + return "Cookie(%s)" % ", ".join(args) + + +class CookiePolicy: + """Defines which cookies get accepted from and returned to server. + + May also modify cookies. + + The subclass DefaultCookiePolicy defines the standard rules for Netscape + and RFC 2965 cookies -- override that if you want a customised policy. + + As well as implementing set_ok and return_ok, implementations of this + interface must also supply the following attributes, indicating which + protocols should be used, and how. These can be read and set at any time, + though whether that makes complete sense from the protocol point of view is + doubtful. 
+ + Public attributes: + + netscape: implement netscape protocol + rfc2965: implement RFC 2965 protocol + rfc2109_as_netscape: + WARNING: This argument will change or go away if is not accepted into + the Python standard library in this form! + If true, treat RFC 2109 cookies as though they were Netscape cookies. The + default is for this attribute to be None, which means treat 2109 cookies + as RFC 2965 cookies unless RFC 2965 handling is switched off (which it is, + by default), and as Netscape cookies otherwise. + hide_cookie2: don't add Cookie2 header to requests (the presence of + this header indicates to the server that we understand RFC 2965 + cookies) + + """ + def set_ok(self, cookie, request): + """Return true if (and only if) cookie should be accepted from server. + + Currently, pre-expired cookies never get this far -- the CookieJar + class deletes such cookies itself. + + cookie: mechanize.Cookie object + request: object implementing the interface defined by + CookieJar.extract_cookies.__doc__ + + """ + raise NotImplementedError() + + def return_ok(self, cookie, request): + """Return true if (and only if) cookie should be returned to server. + + cookie: mechanize.Cookie object + request: object implementing the interface defined by + CookieJar.add_cookie_header.__doc__ + + """ + raise NotImplementedError() + + def domain_return_ok(self, domain, request): + """Return false if cookies should not be returned, given cookie domain. + + This is here as an optimization, to remove the need for checking every + cookie with a particular domain (which may involve reading many files). + The default implementations of domain_return_ok and path_return_ok + (return True) leave all the work to return_ok. + + If domain_return_ok returns true for the cookie domain, path_return_ok + is called for the cookie path. Otherwise, path_return_ok and return_ok + are never called for that cookie domain. If path_return_ok returns + true, return_ok is called with the Cookie object itself for a full + check. Otherwise, return_ok is never called for that cookie path. + + Note that domain_return_ok is called for every *cookie* domain, not + just for the *request* domain. For example, the function might be + called with both ".acme.com" and "www.acme.com" if the request domain + is "www.acme.com". The same goes for path_return_ok. + + For argument documentation, see the docstring for return_ok. + + """ + return True + + def path_return_ok(self, path, request): + """Return false if cookies should not be returned, given cookie path. + + See the docstring for domain_return_ok. + + """ + return True + + +class DefaultCookiePolicy(CookiePolicy): + """Implements the standard rules for accepting and returning cookies. + + Both RFC 2965 and Netscape cookies are covered. RFC 2965 handling is + switched off by default. + + The easiest way to provide your own policy is to override this class and + call its methods in your overriden implementations before adding your own + additional checks. + + import mechanize + class MyCookiePolicy(mechanize.DefaultCookiePolicy): + def set_ok(self, cookie, request): + if not mechanize.DefaultCookiePolicy.set_ok( + self, cookie, request): + return False + if i_dont_want_to_store_this_cookie(): + return False + return True + + In addition to the features required to implement the CookiePolicy + interface, this class allows you to block and allow domains from setting + and receiving cookies. 
+    There are also some strictness switches that allow you to tighten up the
+    rather loose Netscape protocol rules a little bit (at the cost of
+    blocking some benign cookies).
+
+    A domain blacklist and whitelist is provided (both off by default).  Only
+    domains not in the blacklist and present in the whitelist (if the whitelist
+    is active) participate in cookie setting and returning.  Use the
+    blocked_domains constructor argument, and blocked_domains and
+    set_blocked_domains methods (and the corresponding argument and methods for
+    allowed_domains).  If you set a whitelist, you can turn it off again by
+    setting it to None.
+
+    Domains in block or allow lists that do not start with a dot must
+    string-compare equal.  For example, "acme.com" matches a blacklist entry of
+    "acme.com", but "www.acme.com" does not.  Domains that do start with a dot
+    are matched by more specific domains too.  For example, both "www.acme.com"
+    and "www.munitions.acme.com" match ".acme.com" (but "acme.com" itself does
+    not).  IP addresses are an exception, and must match exactly.  For example,
+    if blocked_domains contains "192.168.1.2" and ".168.1.2", 192.168.1.2 is
+    blocked, but 193.168.1.2 is not.
+
+    Additional Public Attributes:
+
+    General strictness switches
+
+    strict_domain: don't allow sites to set two-component domains with
+        country-code top-level domains like .co.uk, .gov.uk, .co.nz, etc.
+        This is far from perfect and isn't guaranteed to work!
+
+    RFC 2965 protocol strictness switches
+
+    strict_rfc2965_unverifiable: follow RFC 2965 rules on unverifiable
+        transactions (usually, an unverifiable transaction is one resulting from
+        a redirect or an image hosted on another site); if this is false, cookies
+        are NEVER blocked on the basis of verifiability
+
+    Netscape protocol strictness switches
+
+    strict_ns_unverifiable: apply RFC 2965 rules on unverifiable transactions
+        even to Netscape cookies
+    strict_ns_domain: flags indicating how strict to be with domain-matching
+        rules for Netscape cookies:
+        DomainStrictNoDots: when setting cookies, host prefix must not contain a
+            dot (e.g. www.foo.bar.com can't set a cookie for .bar.com, because
+            www.foo contains a dot)
+        DomainStrictNonDomain: cookies that did not explicitly specify a Domain
+            cookie-attribute can only be returned to a domain that string-compares
+            equal to the domain that set the cookie (e.g. rockets.acme.com won't
+            be returned cookies from acme.com that had no Domain cookie-attribute)
+        DomainRFC2965Match: when setting cookies, require a full RFC 2965
+            domain-match
+        DomainLiberal and DomainStrict are the most useful combinations of the
+            above flags, for convenience
+    strict_ns_set_initial_dollar: ignore cookies in Set-Cookie: headers that
+        have names starting with '$'
+    strict_ns_set_path: don't allow setting cookies whose path doesn't
+        path-match request URI
+
+    """
+
+    DomainStrictNoDots = 1
+    DomainStrictNonDomain = 2
+    DomainRFC2965Match = 4
+
+    DomainLiberal = 0
+    DomainStrict = DomainStrictNoDots|DomainStrictNonDomain
+
+    def __init__(self,
+                 blocked_domains=None, allowed_domains=None,
+                 netscape=True, rfc2965=False,
+                 # WARNING: this argument will change or go away if it is not
+                 # accepted into the Python standard library in this form!
+                 # default, ie. treat 2109 as netscape iff not rfc2965
+                 rfc2109_as_netscape=None,
+                 hide_cookie2=False,
+                 strict_domain=False,
+                 strict_rfc2965_unverifiable=True,
+                 strict_ns_unverifiable=False,
+                 strict_ns_domain=DomainLiberal,
+                 strict_ns_set_initial_dollar=False,
+                 strict_ns_set_path=False,
+                 ):
+        """
+        Constructor arguments should be used as keyword arguments only.
+
+        blocked_domains: sequence of domain names that we never accept cookies
+           from, nor return cookies to
+        allowed_domains: if not None, this is a sequence of the only domains
+           for which we accept and return cookies
+
+        For other arguments, see CookiePolicy.__doc__ and
+        DefaultCookiePolicy.__doc__.
+
+        """
+        self.netscape = netscape
+        self.rfc2965 = rfc2965
+        self.rfc2109_as_netscape = rfc2109_as_netscape
+        self.hide_cookie2 = hide_cookie2
+        self.strict_domain = strict_domain
+        self.strict_rfc2965_unverifiable = strict_rfc2965_unverifiable
+        self.strict_ns_unverifiable = strict_ns_unverifiable
+        self.strict_ns_domain = strict_ns_domain
+        self.strict_ns_set_initial_dollar = strict_ns_set_initial_dollar
+        self.strict_ns_set_path = strict_ns_set_path
+
+        if blocked_domains is not None:
+            self._blocked_domains = tuple(blocked_domains)
+        else:
+            self._blocked_domains = ()
+
+        if allowed_domains is not None:
+            allowed_domains = tuple(allowed_domains)
+        self._allowed_domains = allowed_domains
+
+    def blocked_domains(self):
+        """Return the sequence of blocked domains (as a tuple)."""
+        return self._blocked_domains
+    def set_blocked_domains(self, blocked_domains):
+        """Set the sequence of blocked domains."""
+        self._blocked_domains = tuple(blocked_domains)
+
+    def is_blocked(self, domain):
+        for blocked_domain in self._blocked_domains:
+            if user_domain_match(domain, blocked_domain):
+                return True
+        return False
+
+    def allowed_domains(self):
+        """Return None, or the sequence of allowed domains (as a tuple)."""
+        return self._allowed_domains
+    def set_allowed_domains(self, allowed_domains):
+        """Set the sequence of allowed domains, or None."""
+        if allowed_domains is not None:
+            allowed_domains = tuple(allowed_domains)
+        self._allowed_domains = allowed_domains
+
+    def is_not_allowed(self, domain):
+        if self._allowed_domains is None:
+            return False
+        for allowed_domain in self._allowed_domains:
+            if user_domain_match(domain, allowed_domain):
+                return False
+        return True
+
+    def set_ok(self, cookie, request):
+        """
+        If you override set_ok, be sure to call this method.  If it returns
+        false, so should your subclass (assuming your subclass wants to be more
+        strict about which cookies to accept).
+
+        """
+        debug(" - checking cookie %s", cookie)
+
+        assert cookie.name is not None
+
+        for n in "version", "verifiability", "name", "path", "domain", "port":
+            fn_name = "set_ok_"+n
+            fn = getattr(self, fn_name)
+            if not fn(cookie, request):
+                return False
+
+        return True
+
+    def set_ok_version(self, cookie, request):
+        if cookie.version is None:
+            # Version is always set to 0 by parse_ns_headers if it's a Netscape
+            # cookie, so this must be an invalid RFC 2965 cookie.
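+            # (for example, a hypothetical "Set-Cookie2: spam=eggs" header
+            # that carries no Version cookie-attribute)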
+ debug(" Set-Cookie2 without version attribute (%s)", cookie) + return False + if cookie.version > 0 and not self.rfc2965: + debug(" RFC 2965 cookies are switched off") + return False + elif cookie.version == 0 and not self.netscape: + debug(" Netscape cookies are switched off") + return False + return True + + def set_ok_verifiability(self, cookie, request): + if request_is_unverifiable(request) and is_third_party(request): + if cookie.version > 0 and self.strict_rfc2965_unverifiable: + debug(" third-party RFC 2965 cookie during " + "unverifiable transaction") + return False + elif cookie.version == 0 and self.strict_ns_unverifiable: + debug(" third-party Netscape cookie during " + "unverifiable transaction") + return False + return True + + def set_ok_name(self, cookie, request): + # Try and stop servers setting V0 cookies designed to hack other + # servers that know both V0 and V1 protocols. + if (cookie.version == 0 and self.strict_ns_set_initial_dollar and + cookie.name.startswith("$")): + debug(" illegal name (starts with '$'): '%s'", cookie.name) + return False + return True + + def set_ok_path(self, cookie, request): + if cookie.path_specified: + req_path = request_path(request) + if ((cookie.version > 0 or + (cookie.version == 0 and self.strict_ns_set_path)) and + not req_path.startswith(cookie.path)): + debug(" path attribute %s is not a prefix of request " + "path %s", cookie.path, req_path) + return False + return True + + def set_ok_countrycode_domain(self, cookie, request): + """Return False if explicit cookie domain is not acceptable. + + Called by set_ok_domain, for convenience of overriding by + subclasses. + + """ + if cookie.domain_specified and self.strict_domain: + domain = cookie.domain + # since domain was specified, we know that: + assert domain.startswith(".") + if domain.count(".") == 2: + # domain like .foo.bar + i = domain.rfind(".") + tld = domain[i+1:] + sld = domain[1:i] + if (sld.lower() in [ + "co", "ac", + "com", "edu", "org", "net", "gov", "mil", "int", + "aero", "biz", "cat", "coop", "info", "jobs", "mobi", + "museum", "name", "pro", "travel", + ] and + len(tld) == 2): + # domain like .co.uk + return False + return True + + def set_ok_domain(self, cookie, request): + if self.is_blocked(cookie.domain): + debug(" domain %s is in user block-list", cookie.domain) + return False + if self.is_not_allowed(cookie.domain): + debug(" domain %s is not in user allow-list", cookie.domain) + return False + if not self.set_ok_countrycode_domain(cookie, request): + debug(" country-code second level domain %s", cookie.domain) + return False + if cookie.domain_specified: + req_host, erhn = eff_request_host_lc(request) + domain = cookie.domain + if domain.startswith("."): + undotted_domain = domain[1:] + else: + undotted_domain = domain + embedded_dots = (undotted_domain.find(".") >= 0) + if not embedded_dots and domain != ".local": + debug(" non-local domain %s contains no embedded dot", + domain) + return False + if cookie.version == 0: + if (not erhn.endswith(domain) and + (not erhn.startswith(".") and + not ("."+erhn).endswith(domain))): + debug(" effective request-host %s (even with added " + "initial dot) does not end end with %s", + erhn, domain) + return False + if (cookie.version > 0 or + (self.strict_ns_domain & self.DomainRFC2965Match)): + if not domain_match(erhn, domain): + debug(" effective request-host %s does not domain-match " + "%s", erhn, domain) + return False + if (cookie.version > 0 or + (self.strict_ns_domain & self.DomainStrictNoDots)): + 
host_prefix = req_host[:-len(domain)] + if (host_prefix.find(".") >= 0 and + not IPV4_RE.search(req_host)): + debug(" host prefix %s for domain %s contains a dot", + host_prefix, domain) + return False + return True + + def set_ok_port(self, cookie, request): + if cookie.port_specified: + req_port = request_port(request) + if req_port is None: + req_port = "80" + else: + req_port = str(req_port) + for p in cookie.port.split(","): + try: + int(p) + except ValueError: + debug(" bad port %s (not numeric)", p) + return False + if p == req_port: + break + else: + debug(" request port (%s) not found in %s", + req_port, cookie.port) + return False + return True + + def return_ok(self, cookie, request): + """ + If you override return_ok, be sure to call this method. If it returns + false, so should your subclass (assuming your subclass wants to be more + strict about which cookies to return). + + """ + # Path has already been checked by path_return_ok, and domain blocking + # done by domain_return_ok. + debug(" - checking cookie %s", cookie) + + for n in ("version", "verifiability", "secure", "expires", "port", + "domain"): + fn_name = "return_ok_"+n + fn = getattr(self, fn_name) + if not fn(cookie, request): + return False + return True + + def return_ok_version(self, cookie, request): + if cookie.version > 0 and not self.rfc2965: + debug(" RFC 2965 cookies are switched off") + return False + elif cookie.version == 0 and not self.netscape: + debug(" Netscape cookies are switched off") + return False + return True + + def return_ok_verifiability(self, cookie, request): + if request_is_unverifiable(request) and is_third_party(request): + if cookie.version > 0 and self.strict_rfc2965_unverifiable: + debug(" third-party RFC 2965 cookie during unverifiable " + "transaction") + return False + elif cookie.version == 0 and self.strict_ns_unverifiable: + debug(" third-party Netscape cookie during unverifiable " + "transaction") + return False + return True + + def return_ok_secure(self, cookie, request): + if cookie.secure and request.get_type() != "https": + debug(" secure cookie with non-secure request") + return False + return True + + def return_ok_expires(self, cookie, request): + if cookie.is_expired(self._now): + debug(" cookie expired") + return False + return True + + def return_ok_port(self, cookie, request): + if cookie.port: + req_port = request_port(request) + if req_port is None: + req_port = "80" + for p in cookie.port.split(","): + if p == req_port: + break + else: + debug(" request port %s does not match cookie port %s", + req_port, cookie.port) + return False + return True + + def return_ok_domain(self, cookie, request): + req_host, erhn = eff_request_host_lc(request) + domain = cookie.domain + + # strict check of non-domain cookies: Mozilla does this, MSIE5 doesn't + if (cookie.version == 0 and + (self.strict_ns_domain & self.DomainStrictNonDomain) and + not cookie.domain_specified and domain != erhn): + debug(" cookie with unspecified domain does not string-compare " + "equal to request domain") + return False + + if cookie.version > 0 and not domain_match(erhn, domain): + debug(" effective request-host name %s does not domain-match " + "RFC 2965 cookie domain %s", erhn, domain) + return False + if cookie.version == 0 and not ("."+erhn).endswith(domain): + debug(" request-host %s does not match Netscape cookie domain " + "%s", req_host, domain) + return False + return True + + def domain_return_ok(self, domain, request): + # Liberal check of domain. 
This is here as an optimization to avoid + # having to load lots of MSIE cookie files unless necessary. + + # Munge req_host and erhn to always start with a dot, so as to err on + # the side of letting cookies through. + dotted_req_host, dotted_erhn = eff_request_host_lc(request) + if not dotted_req_host.startswith("."): + dotted_req_host = "."+dotted_req_host + if not dotted_erhn.startswith("."): + dotted_erhn = "."+dotted_erhn + if not (dotted_req_host.endswith(domain) or + dotted_erhn.endswith(domain)): + #debug(" request domain %s does not match cookie domain %s", + # req_host, domain) + return False + + if self.is_blocked(domain): + debug(" domain %s is in user block-list", domain) + return False + if self.is_not_allowed(domain): + debug(" domain %s is not in user allow-list", domain) + return False + + return True + + def path_return_ok(self, path, request): + debug("- checking cookie path=%s", path) + req_path = request_path(request) + if not req_path.startswith(path): + debug(" %s does not path-match %s", req_path, path) + return False + return True + + +def vals_sorted_by_key(adict): + keys = adict.keys() + keys.sort() + return map(adict.get, keys) + +class MappingIterator: + """Iterates over nested mapping, depth-first, in sorted order by key.""" + def __init__(self, mapping): + self._s = [(vals_sorted_by_key(mapping), 0, None)] # LIFO stack + + def __iter__(self): return self + + def next(self): + # this is hairy because of lack of generators + while 1: + try: + vals, i, prev_item = self._s.pop() + except IndexError: + raise StopIteration() + if i < len(vals): + item = vals[i] + i = i + 1 + self._s.append((vals, i, prev_item)) + try: + item.items + except AttributeError: + # non-mapping + break + else: + # mapping + self._s.append((vals_sorted_by_key(item), 0, item)) + continue + return item + + +# Used as second parameter to dict.get method, to distinguish absent +# dict key from one with a None value. +class Absent: pass + +class CookieJar: + """Collection of HTTP cookies. + + You may not need to know about this class: try mechanize.urlopen(). + + The major methods are extract_cookies and add_cookie_header; these are all + you are likely to need. + + CookieJar supports the iterator protocol: + + for cookie in cookiejar: + # do something with cookie + + Methods: + + add_cookie_header(request) + extract_cookies(response, request) + get_policy() + set_policy(policy) + cookies_for_request(request) + make_cookies(response, request) + set_cookie_if_ok(cookie, request) + set_cookie(cookie) + clear_session_cookies() + clear_expired_cookies() + clear(domain=None, path=None, name=None) + + Public attributes + + policy: CookiePolicy object + + """ + + non_word_re = re.compile(r"\W") + quote_re = re.compile(r"([\"\\])") + strict_domain_re = re.compile(r"\.?[^.]*") + domain_re = re.compile(r"[^.]*") + dots_re = re.compile(r"^\.+") + + def __init__(self, policy=None): + """ + See CookieJar.__doc__ for argument documentation. 
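+
+        A policy may be supplied to customise behaviour; for example (a
+        sketch -- note that RFC 2965 support is off by default):
+
+        >>> policy = DefaultCookiePolicy(rfc2965=True)
+        >>> jar = CookieJar(policy)
+        >>> jar.get_policy() is policy
+        True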
+ + """ + if policy is None: + policy = DefaultCookiePolicy() + self._policy = policy + + self._cookies = {} + + # for __getitem__ iteration in pre-2.2 Pythons + self._prev_getitem_index = 0 + + def get_policy(self): + return self._policy + + def set_policy(self, policy): + self._policy = policy + + def _cookies_for_domain(self, domain, request): + cookies = [] + if not self._policy.domain_return_ok(domain, request): + return [] + debug("Checking %s for cookies to return", domain) + cookies_by_path = self._cookies[domain] + for path in cookies_by_path.keys(): + if not self._policy.path_return_ok(path, request): + continue + cookies_by_name = cookies_by_path[path] + for cookie in cookies_by_name.values(): + if not self._policy.return_ok(cookie, request): + debug(" not returning cookie") + continue + debug(" it's a match") + cookies.append(cookie) + return cookies + + def cookies_for_request(self, request): + """Return a list of cookies to be returned to server. + + The returned list of cookie instances is sorted in the order they + should appear in the Cookie: header for return to the server. + + See add_cookie_header.__doc__ for the interface required of the + request argument. + + New in version 0.1.10 + + """ + self._policy._now = self._now = int(time.time()) + cookies = self._cookies_for_request(request) + # add cookies in order of most specific (i.e. longest) path first + def decreasing_size(a, b): return cmp(len(b.path), len(a.path)) + cookies.sort(decreasing_size) + return cookies + + def _cookies_for_request(self, request): + """Return a list of cookies to be returned to server.""" + # this method still exists (alongside cookies_for_request) because it + # is part of an implied protected interface for subclasses of cookiejar + # XXX document that implied interface, or provide another way of + # implementing cookiejars than subclassing + cookies = [] + for domain in self._cookies.keys(): + cookies.extend(self._cookies_for_domain(domain, request)) + return cookies + + def _cookie_attrs(self, cookies): + """Return a list of cookie-attributes to be returned to server. + + The $Version attribute is also added when appropriate (currently only + once per request). + + >>> jar = CookieJar() + >>> ns_cookie = Cookie(0, "foo", '"bar"', None, False, + ... "example.com", False, False, + ... "/", False, False, None, True, + ... None, None, {}) + >>> jar._cookie_attrs([ns_cookie]) + ['foo="bar"'] + >>> rfc2965_cookie = Cookie(1, "foo", "bar", None, False, + ... ".example.com", True, False, + ... "/", False, False, None, True, + ... None, None, {}) + >>> jar._cookie_attrs([rfc2965_cookie]) + ['$Version=1', 'foo=bar', '$Domain="example.com"'] + + """ + version_set = False + + attrs = [] + for cookie in cookies: + # set version of Cookie header + # XXX + # What should it be if multiple matching Set-Cookie headers have + # different versions themselves? + # Answer: there is no answer; was supposed to be settled by + # RFC 2965 errata, but that may never appear... 
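+            # In practice, for the single RFC 2965 cookie in the doctest
+            # above, the attributes eventually joined into the Cookie header
+            # are:  $Version=1; foo=bar; $Domain="example.com"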
+ version = cookie.version + if not version_set: + version_set = True + if version > 0: + attrs.append("$Version=%s" % version) + + # quote cookie value if necessary + # (not for Netscape protocol, which already has any quotes + # intact, due to the poorly-specified Netscape Cookie: syntax) + if ((cookie.value is not None) and + self.non_word_re.search(cookie.value) and version > 0): + value = self.quote_re.sub(r"\\\1", cookie.value) + else: + value = cookie.value + + # add cookie-attributes to be returned in Cookie header + if cookie.value is None: + attrs.append(cookie.name) + else: + attrs.append("%s=%s" % (cookie.name, value)) + if version > 0: + if cookie.path_specified: + attrs.append('$Path="%s"' % cookie.path) + if cookie.domain.startswith("."): + domain = cookie.domain + if (not cookie.domain_initial_dot and + domain.startswith(".")): + domain = domain[1:] + attrs.append('$Domain="%s"' % domain) + if cookie.port is not None: + p = "$Port" + if cookie.port_specified: + p = p + ('="%s"' % cookie.port) + attrs.append(p) + + return attrs + + def add_cookie_header(self, request): + """Add correct Cookie: header to request (mechanize.Request object). + + The Cookie2 header is also added unless policy.hide_cookie2 is true. + + The request object (usually a mechanize.Request instance) must support + the methods get_full_url, get_host, is_unverifiable, get_type, + has_header, get_header, header_items and add_unredirected_header, as + documented by urllib2. + """ + debug("add_cookie_header") + cookies = self.cookies_for_request(request) + + attrs = self._cookie_attrs(cookies) + if attrs: + if not request.has_header("Cookie"): + request.add_unredirected_header("Cookie", "; ".join(attrs)) + + # if necessary, advertise that we know RFC 2965 + if self._policy.rfc2965 and not self._policy.hide_cookie2: + for cookie in cookies: + if cookie.version != 1 and not request.has_header("Cookie2"): + request.add_unredirected_header("Cookie2", '$Version="1"') + break + + self.clear_expired_cookies() + + def _normalized_cookie_tuples(self, attrs_set): + """Return list of tuples containing normalised cookie information. + + attrs_set is the list of lists of key,value pairs extracted from + the Set-Cookie or Set-Cookie2 headers. + + Tuples are name, value, standard, rest, where name and value are the + cookie name and value, standard is a dictionary containing the standard + cookie-attributes (discard, secure, version, expires or max-age, + domain, path and port) and rest is a dictionary containing the rest of + the cookie-attributes. + + """ + cookie_tuples = [] + + boolean_attrs = "discard", "secure" + value_attrs = ("version", + "expires", "max-age", + "domain", "path", "port", + "comment", "commenturl") + + for cookie_attrs in attrs_set: + name, value = cookie_attrs[0] + + # Build dictionary of standard cookie-attributes (standard) and + # dictionary of other cookie-attributes (rest). + + # Note: expiry time is normalised to seconds since epoch. V0 + # cookies should have the Expires cookie-attribute, and V1 cookies + # should have Max-Age, but since V1 includes RFC 2109 cookies (and + # since V0 cookies may be a mish-mash of Netscape and RFC 2109), we + # accept either (but prefer Max-Age). 
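+            # For example (a sketch), a cookie_attrs list of
+            #   [("spam", "eggs"), ("max-age", "3600"), ("Path", "/")]
+            # normalises to the tuple
+            #   ("spam", "eggs", {"expires": self._now + 3600, "path": "/"}, {})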
+ max_age_set = False + + bad_cookie = False + + standard = {} + rest = {} + for k, v in cookie_attrs[1:]: + lc = k.lower() + # don't lose case distinction for unknown fields + if lc in value_attrs or lc in boolean_attrs: + k = lc + if k in boolean_attrs and v is None: + # boolean cookie-attribute is present, but has no value + # (like "discard", rather than "port=80") + v = True + if standard.has_key(k): + # only first value is significant + continue + if k == "domain": + if v is None: + debug(" missing value for domain attribute") + bad_cookie = True + break + # RFC 2965 section 3.3.3 + v = v.lower() + if k == "expires": + if max_age_set: + # Prefer max-age to expires (like Mozilla) + continue + if v is None: + debug(" missing or invalid value for expires " + "attribute: treating as session cookie") + continue + if k == "max-age": + max_age_set = True + if v is None: + debug(" missing value for max-age attribute") + bad_cookie = True + break + try: + v = int(v) + except ValueError: + debug(" missing or invalid (non-numeric) value for " + "max-age attribute") + bad_cookie = True + break + # convert RFC 2965 Max-Age to seconds since epoch + # XXX Strictly you're supposed to follow RFC 2616 + # age-calculation rules. Remember that zero Max-Age is a + # is a request to discard (old and new) cookie, though. + k = "expires" + v = self._now + v + if (k in value_attrs) or (k in boolean_attrs): + if (v is None and + k not in ["port", "comment", "commenturl"]): + debug(" missing value for %s attribute" % k) + bad_cookie = True + break + standard[k] = v + else: + rest[k] = v + + if bad_cookie: + continue + + cookie_tuples.append((name, value, standard, rest)) + + return cookie_tuples + + def _cookie_from_cookie_tuple(self, tup, request): + # standard is dict of standard cookie-attributes, rest is dict of the + # rest of them + name, value, standard, rest = tup + + domain = standard.get("domain", Absent) + path = standard.get("path", Absent) + port = standard.get("port", Absent) + expires = standard.get("expires", Absent) + + # set the easy defaults + version = standard.get("version", None) + if version is not None: + try: + version = int(version) + except ValueError: + return None # invalid version, ignore cookie + secure = standard.get("secure", False) + # (discard is also set if expires is Absent) + discard = standard.get("discard", False) + comment = standard.get("comment", None) + comment_url = standard.get("commenturl", None) + + # set default path + if path is not Absent and path != "": + path_specified = True + path = escape_path(path) + else: + path_specified = False + path = request_path(request) + i = path.rfind("/") + if i != -1: + if version == 0: + # Netscape spec parts company from reality here + path = path[:i] + else: + path = path[:i+1] + if len(path) == 0: path = "/" + + # set default domain + domain_specified = domain is not Absent + # but first we have to remember whether it starts with a dot + domain_initial_dot = False + if domain_specified: + domain_initial_dot = bool(domain.startswith(".")) + if domain is Absent: + req_host, erhn = eff_request_host_lc(request) + domain = erhn + elif not domain.startswith("."): + domain = "."+domain + + # set default port + port_specified = False + if port is not Absent: + if port is None: + # Port attr present, but has no value: default to request port. + # Cookie should then only be sent back on that port. + port = request_port(request) + else: + port_specified = True + port = re.sub(r"\s+", "", port) + else: + # No port attr present. 
Cookie can be sent back on any port. + port = None + + # set default expires and discard + if expires is Absent: + expires = None + discard = True + + return Cookie(version, + name, value, + port, port_specified, + domain, domain_specified, domain_initial_dot, + path, path_specified, + secure, + expires, + discard, + comment, + comment_url, + rest) + + def _cookies_from_attrs_set(self, attrs_set, request): + cookie_tuples = self._normalized_cookie_tuples(attrs_set) + + cookies = [] + for tup in cookie_tuples: + cookie = self._cookie_from_cookie_tuple(tup, request) + if cookie: cookies.append(cookie) + return cookies + + def _process_rfc2109_cookies(self, cookies): + if self._policy.rfc2109_as_netscape is None: + rfc2109_as_netscape = not self._policy.rfc2965 + else: + rfc2109_as_netscape = self._policy.rfc2109_as_netscape + for cookie in cookies: + if cookie.version == 1: + cookie.rfc2109 = True + if rfc2109_as_netscape: + # treat 2109 cookies as Netscape cookies rather than + # as RFC2965 cookies + cookie.version = 0 + + def _make_cookies(self, response, request): + # get cookie-attributes for RFC 2965 and Netscape protocols + headers = response.info() + rfc2965_hdrs = headers.getheaders("Set-Cookie2") + ns_hdrs = headers.getheaders("Set-Cookie") + + rfc2965 = self._policy.rfc2965 + netscape = self._policy.netscape + + if ((not rfc2965_hdrs and not ns_hdrs) or + (not ns_hdrs and not rfc2965) or + (not rfc2965_hdrs and not netscape) or + (not netscape and not rfc2965)): + return [] # no relevant cookie headers: quick exit + + try: + cookies = self._cookies_from_attrs_set( + split_header_words(rfc2965_hdrs), request) + except: + reraise_unmasked_exceptions() + cookies = [] + + if ns_hdrs and netscape: + try: + # RFC 2109 and Netscape cookies + ns_cookies = self._cookies_from_attrs_set( + parse_ns_headers(ns_hdrs), request) + except: + reraise_unmasked_exceptions() + ns_cookies = [] + self._process_rfc2109_cookies(ns_cookies) + + # Look for Netscape cookies (from Set-Cookie headers) that match + # corresponding RFC 2965 cookies (from Set-Cookie2 headers). + # For each match, keep the RFC 2965 cookie and ignore the Netscape + # cookie (RFC 2965 section 9.1). Actually, RFC 2109 cookies are + # bundled in with the Netscape cookies for this purpose, which is + # reasonable behaviour. + if rfc2965: + lookup = {} + for cookie in cookies: + lookup[(cookie.domain, cookie.path, cookie.name)] = None + + def no_matching_rfc2965(ns_cookie, lookup=lookup): + key = ns_cookie.domain, ns_cookie.path, ns_cookie.name + return not lookup.has_key(key) + ns_cookies = filter(no_matching_rfc2965, ns_cookies) + + if ns_cookies: + cookies.extend(ns_cookies) + + return cookies + + def make_cookies(self, response, request): + """Return sequence of Cookie objects extracted from response object. + + See extract_cookies.__doc__ for the interface required of the + response and request arguments. + + """ + self._policy._now = self._now = int(time.time()) + return [cookie for cookie in self._make_cookies(response, request) + if cookie.expires is None or not cookie.expires <= self._now] + + def set_cookie_if_ok(self, cookie, request): + """Set a cookie if policy says it's OK to do so. + + cookie: mechanize.Cookie instance + request: see extract_cookies.__doc__ for the required interface + + """ + self._policy._now = self._now = int(time.time()) + + if self._policy.set_ok(cookie, request): + self.set_cookie(cookie) + + def set_cookie(self, cookie): + """Set a cookie, without checking whether or not it should be set. 
+
+        cookie: mechanize.Cookie instance
+        """
+        c = self._cookies
+        if not c.has_key(cookie.domain): c[cookie.domain] = {}
+        c2 = c[cookie.domain]
+        if not c2.has_key(cookie.path): c2[cookie.path] = {}
+        c3 = c2[cookie.path]
+        c3[cookie.name] = cookie
+
+    def extract_cookies(self, response, request):
+        """Extract cookies from response, where allowable given the request.
+
+        Look for allowable Set-Cookie: and Set-Cookie2: headers in the response
+        object passed as argument.  Any of these headers that are found are
+        used to update the state of the object (subject to the policy.set_ok
+        method's approval).
+
+        The response object (usually the result of a call to
+        mechanize.urlopen, or similar) should support an info method, which
+        returns a mimetools.Message object (in fact, the 'mimetools.Message
+        object' may be any object that provides a getheaders method).
+
+        The request object (usually a mechanize.Request instance) must support
+        the methods get_full_url, get_type, get_host, and is_unverifiable, as
+        documented by mechanize, and the port attribute (the port number).  The
+        request is used to set default values for cookie-attributes as well as
+        for checking that the cookie is OK to be set.
+
+        """
+        debug("extract_cookies: %s", response.info())
+        self._policy._now = self._now = int(time.time())
+
+        for cookie in self._make_cookies(response, request):
+            if cookie.expires is not None and cookie.expires <= self._now:
+                # An expiry date in the past is a request to delete the
+                # cookie.  This can't be in DefaultCookiePolicy, because it
+                # can't delete cookies there.
+                try:
+                    self.clear(cookie.domain, cookie.path, cookie.name)
+                except KeyError:
+                    pass
+                debug("Expiring cookie, domain='%s', path='%s', name='%s'",
+                      cookie.domain, cookie.path, cookie.name)
+            elif self._policy.set_ok(cookie, request):
+                debug(" setting cookie: %s", cookie)
+                self.set_cookie(cookie)
+
+    def clear(self, domain=None, path=None, name=None):
+        """Clear some cookies.
+
+        Invoking this method without arguments will clear all cookies.  If
+        given a single argument, only cookies belonging to that domain will be
+        removed.  If given two arguments, cookies belonging to the specified
+        path within that domain are removed.  If given three arguments, then
+        the cookie with the specified name, path and domain is removed.
+
+        Raises KeyError if no matching cookie exists.
+
+        """
+        if name is not None:
+            if (domain is None) or (path is None):
+                raise ValueError(
+                    "domain and path must be given to remove a cookie by name")
+            del self._cookies[domain][path][name]
+        elif path is not None:
+            if domain is None:
+                raise ValueError(
+                    "domain must be given to remove cookies by path")
+            del self._cookies[domain][path]
+        elif domain is not None:
+            del self._cookies[domain]
+        else:
+            self._cookies = {}
+
+    def clear_session_cookies(self):
+        """Discard all session cookies.
+
+        Discards all cookies held by the object which had either no Max-Age or
+        Expires cookie-attribute or an explicit Discard cookie-attribute, or
+        which otherwise have ended up with a true discard attribute.  For
+        interactive browsers, the end of a session usually corresponds to
+        closing the browser window.
+
+        Note that the save method won't save session cookies anyway, unless you
+        ask otherwise by passing a true ignore_discard argument.
+
+        """
+        for cookie in self:
+            if cookie.discard:
+                self.clear(cookie.domain, cookie.path, cookie.name)
+
+    def clear_expired_cookies(self):
+        """Discard all expired cookies.
+ + You probably don't need to call this method: expired cookies are never + sent back to the server (provided you're using DefaultCookiePolicy), + this method is called by CookieJar itself every so often, and the save + method won't save expired cookies anyway (unless you ask otherwise by + passing a true ignore_expires argument). + + """ + now = time.time() + for cookie in self: + if cookie.is_expired(now): + self.clear(cookie.domain, cookie.path, cookie.name) + + def __getitem__(self, i): + if i == 0: + self._getitem_iterator = self.__iter__() + elif self._prev_getitem_index != i-1: raise IndexError( + "CookieJar.__getitem__ only supports sequential iteration") + self._prev_getitem_index = i + try: + return self._getitem_iterator.next() + except StopIteration: + raise IndexError() + + def __iter__(self): + return MappingIterator(self._cookies) + + def __len__(self): + """Return number of contained cookies.""" + i = 0 + for cookie in self: i = i + 1 + return i + + def __repr__(self): + r = [] + for cookie in self: r.append(repr(cookie)) + return "<%s[%s]>" % (self.__class__, ", ".join(r)) + + def __str__(self): + r = [] + for cookie in self: r.append(str(cookie)) + return "<%s[%s]>" % (self.__class__, ", ".join(r)) + + +class LoadError(Exception): pass + +class FileCookieJar(CookieJar): + """CookieJar that can be loaded from and saved to a file. + + Additional methods + + save(filename=None, ignore_discard=False, ignore_expires=False) + load(filename=None, ignore_discard=False, ignore_expires=False) + revert(filename=None, ignore_discard=False, ignore_expires=False) + + Additional public attributes + + filename: filename for loading and saving cookies + + Additional public readable attributes + + delayload: request that cookies are lazily loaded from disk; this is only + a hint since this only affects performance, not behaviour (unless the + cookies on disk are changing); a CookieJar object may ignore it (in fact, + only MSIECookieJar lazily loads cookies at the moment) + + """ + + def __init__(self, filename=None, delayload=False, policy=None): + """ + See FileCookieJar.__doc__ for argument documentation. + + Cookies are NOT loaded from the named file until either the load or + revert method is called. + + """ + CookieJar.__init__(self, policy) + if filename is not None and not isstringlike(filename): + raise ValueError("filename must be string-like") + self.filename = filename + self.delayload = bool(delayload) + + def save(self, filename=None, ignore_discard=False, ignore_expires=False): + """Save cookies to a file. + + filename: name of file in which to save cookies + ignore_discard: save even cookies set to be discarded + ignore_expires: save even cookies that have expired + + The file is overwritten if it already exists, thus wiping all its + cookies. Saved cookies can be restored later using the load or revert + methods. If filename is not specified, self.filename is used; if + self.filename is None, ValueError is raised. + + """ + raise NotImplementedError() + + def load(self, filename=None, ignore_discard=False, ignore_expires=False): + """Load cookies from a file. + + Old cookies are kept unless overwritten by newly loaded ones. + + Arguments are as for .save(). + + If filename is not specified, self.filename is used; if self.filename + is None, ValueError is raised. The named file must be in the format + understood by the class, or LoadError will be raised. 
This format will + be identical to that written by the save method, unless the load format + is not sufficiently well understood (as is the case for MSIECookieJar). + + """ + if filename is None: + if self.filename is not None: filename = self.filename + else: raise ValueError(MISSING_FILENAME_TEXT) + + f = open(filename) + try: + self._really_load(f, filename, ignore_discard, ignore_expires) + finally: + f.close() + + def revert(self, filename=None, + ignore_discard=False, ignore_expires=False): + """Clear all cookies and reload cookies from a saved file. + + Raises LoadError (or IOError) if reversion is not successful; the + object's state will not be altered if this happens. + + """ + if filename is None: + if self.filename is not None: filename = self.filename + else: raise ValueError(MISSING_FILENAME_TEXT) + + old_state = copy.deepcopy(self._cookies) + self._cookies = {} + try: + self.load(filename, ignore_discard, ignore_expires) + except (LoadError, IOError): + self._cookies = old_state + raise diff --git a/mechanize/_clientcookie.pyc b/mechanize/_clientcookie.pyc new file mode 100644 index 0000000..7cf573b Binary files /dev/null and b/mechanize/_clientcookie.pyc differ diff --git a/mechanize/_debug.py b/mechanize/_debug.py new file mode 100644 index 0000000..8243969 --- /dev/null +++ b/mechanize/_debug.py @@ -0,0 +1,28 @@ +import logging + +from _response import response_seek_wrapper +from _urllib2_fork import BaseHandler + + +class HTTPResponseDebugProcessor(BaseHandler): + handler_order = 900 # before redirections, after everything else + + def http_response(self, request, response): + if not hasattr(response, "seek"): + response = response_seek_wrapper(response) + info = logging.getLogger("mechanize.http_responses").info + try: + info(response.read()) + finally: + response.seek(0) + info("*****************************************************") + return response + + https_response = http_response + +class HTTPRedirectDebugProcessor(BaseHandler): + def http_request(self, request): + if hasattr(request, "redirect_dict"): + info = logging.getLogger("mechanize.http_redirects").info + info("redirecting to %s", request.get_full_url()) + return request diff --git a/mechanize/_debug.pyc b/mechanize/_debug.pyc new file mode 100644 index 0000000..2870b1a Binary files /dev/null and b/mechanize/_debug.pyc differ diff --git a/mechanize/_firefox3cookiejar.py b/mechanize/_firefox3cookiejar.py new file mode 100644 index 0000000..a64d70f --- /dev/null +++ b/mechanize/_firefox3cookiejar.py @@ -0,0 +1,248 @@ +"""Firefox 3 "cookies.sqlite" cookie persistence. + +Copyright 2008 John J Lee + +This code is free software; you can redistribute it and/or modify it +under the terms of the BSD or ZPL 2.1 licenses (see the file +COPYING.txt included with the distribution). + +""" + +import logging +import time + +from _clientcookie import CookieJar, Cookie, MappingIterator +from _util import isstringlike, experimental +debug = logging.getLogger("mechanize.cookies").debug + + +class Firefox3CookieJar(CookieJar): + + """Firefox 3 cookie jar. + + The cookies are stored in Firefox 3's "cookies.sqlite" format. 
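+
+    Example usage (a sketch -- the profile path is hypothetical):
+
+    import mechanize
+    jar = mechanize.Firefox3CookieJar(
+        "/home/user/.mozilla/firefox/abcd1234.default/cookies.sqlite")
+    opener = mechanize.build_opener(mechanize.HTTPCookieProcessor(jar))
+    response = opener.open("http://example.com/")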
+
+    Constructor arguments:
+
+    filename: filename of cookies.sqlite (typically found at the top level
+     of a firefox profile directory)
+    autoconnect: as a convenience, connect to the SQLite cookies database at
+     Firefox3CookieJar construction time (default True)
+    policy: an object satisfying the mechanize.CookiePolicy interface
+
+    Note that this is NOT a FileCookieJar, and there are no .load(),
+    .save() or .revert() methods.  The database is in sync with the
+    cookiejar object's state after each public method call.
+
+    Following Firefox's own behaviour, session cookies are never saved to
+    the database.
+
+    The file is created, and an sqlite database written to it, if it does
+    not already exist.  The moz_cookies database table is created if it does
+    not already exist.
+    """
+
+    # XXX
+    # handle DatabaseError exceptions
+    # add a FileCookieJar (explicit .save() / .revert() / .load() methods)
+
+    def __init__(self, filename, autoconnect=True, policy=None):
+        experimental("Firefox3CookieJar is experimental code")
+        CookieJar.__init__(self, policy)
+        if filename is not None and not isstringlike(filename):
+            raise ValueError("filename must be string-like")
+        self.filename = filename
+        self._conn = None
+        if autoconnect:
+            self.connect()
+
+    def connect(self):
+        import sqlite3  # not available in Python 2.4 stdlib
+        self._conn = sqlite3.connect(self.filename)
+        self._conn.isolation_level = "DEFERRED"
+        self._create_table_if_necessary()
+
+    def close(self):
+        self._conn.close()
+
+    def _transaction(self, func):
+        try:
+            cur = self._conn.cursor()
+            try:
+                result = func(cur)
+            finally:
+                cur.close()
+        except:
+            self._conn.rollback()
+            raise
+        else:
+            self._conn.commit()
+        return result
+
+    def _execute(self, query, params=()):
+        return self._transaction(lambda cur: cur.execute(query, params))
+
+    def _query(self, query, params=()):
+        # XXX should we bother with a transaction?
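+        # (reads currently go straight to a cursor, while writes go through
+        # _transaction above, so a failed write is rolled back)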
+ cur = self._conn.cursor() + try: + cur.execute(query, params) + return cur.fetchall() + finally: + cur.close() + + def _create_table_if_necessary(self): + self._execute("""\ +CREATE TABLE IF NOT EXISTS moz_cookies (id INTEGER PRIMARY KEY, name TEXT, + value TEXT, host TEXT, path TEXT,expiry INTEGER, + lastAccessed INTEGER, isSecure INTEGER, isHttpOnly INTEGER)""") + + def _cookie_from_row(self, row): + (pk, name, value, domain, path, expires, + last_accessed, secure, http_only) = row + + version = 0 + domain = domain.encode("ascii", "ignore") + path = path.encode("ascii", "ignore") + name = name.encode("ascii", "ignore") + value = value.encode("ascii", "ignore") + secure = bool(secure) + + # last_accessed isn't a cookie attribute, so isn't added to rest + rest = {} + if http_only: + rest["HttpOnly"] = None + + if name == "": + name = value + value = None + + initial_dot = domain.startswith(".") + domain_specified = initial_dot + + discard = False + if expires == "": + expires = None + discard = True + + return Cookie(version, name, value, + None, False, + domain, domain_specified, initial_dot, + path, False, + secure, + expires, + discard, + None, + None, + rest) + + def clear(self, domain=None, path=None, name=None): + CookieJar.clear(self, domain, path, name) + where_parts = [] + sql_params = [] + if domain is not None: + where_parts.append("host = ?") + sql_params.append(domain) + if path is not None: + where_parts.append("path = ?") + sql_params.append(path) + if name is not None: + where_parts.append("name = ?") + sql_params.append(name) + where = " AND ".join(where_parts) + if where: + where = " WHERE " + where + def clear(cur): + cur.execute("DELETE FROM moz_cookies%s" % where, + tuple(sql_params)) + self._transaction(clear) + + def _row_from_cookie(self, cookie, cur): + expires = cookie.expires + if cookie.discard: + expires = "" + + domain = unicode(cookie.domain) + path = unicode(cookie.path) + name = unicode(cookie.name) + value = unicode(cookie.value) + secure = bool(int(cookie.secure)) + + if value is None: + value = name + name = "" + + last_accessed = int(time.time()) + http_only = cookie.has_nonstandard_attr("HttpOnly") + + query = cur.execute("""SELECT MAX(id) + 1 from moz_cookies""") + pk = query.fetchone()[0] + if pk is None: + pk = 1 + + return (pk, name, value, domain, path, expires, + last_accessed, secure, http_only) + + def set_cookie(self, cookie): + if cookie.discard: + CookieJar.set_cookie(self, cookie) + return + + def set_cookie(cur): + # XXX + # is this RFC 2965-correct? + # could this do an UPDATE instead? + row = self._row_from_cookie(cookie, cur) + name, unused, domain, path = row[1:5] + cur.execute("""\ +DELETE FROM moz_cookies WHERE host = ? AND path = ? AND name = ?""", + (domain, path, name)) + cur.execute("""\ +INSERT INTO moz_cookies VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?) 
+""", row)
+        self._transaction(set_cookie)
+
+    def __iter__(self):
+        # session (non-persistent) cookies
+        for cookie in MappingIterator(self._cookies):
+            yield cookie
+        # persistent cookies
+        for row in self._query("""\
+SELECT * FROM moz_cookies ORDER BY name, path, host"""):
+            yield self._cookie_from_row(row)
+
+    def _cookies_for_request(self, request):
+        session_cookies = CookieJar._cookies_for_request(self, request)
+        def get_cookies(cur):
+            query = cur.execute("SELECT host from moz_cookies")
+            domains = [row[0] for row in query.fetchall()]
+            cookies = []
+            for domain in domains:
+                cookies += self._persistent_cookies_for_domain(domain,
+                                                               request, cur)
+            return cookies
+        persistent_cookies = self._transaction(get_cookies)
+        return session_cookies + persistent_cookies
+
+    def _persistent_cookies_for_domain(self, domain, request, cur):
+        cookies = []
+        if not self._policy.domain_return_ok(domain, request):
+            return []
+        debug("Checking %s for cookies to return", domain)
+        query = cur.execute("""\
+SELECT * from moz_cookies WHERE host = ? ORDER BY path""",
+                            (domain,))
+        cookies = [self._cookie_from_row(row) for row in query.fetchall()]
+        last_path = None
+        r = []
+        for cookie in cookies:
+            if (cookie.path != last_path and
+                not self._policy.path_return_ok(cookie.path, request)):
+                last_path = cookie.path
+                continue
+            if not self._policy.return_ok(cookie, request):
+                debug("   not returning cookie")
+                continue
+            debug("   it's a match")
+            r.append(cookie)
+        return r
diff --git a/mechanize/_firefox3cookiejar.pyc b/mechanize/_firefox3cookiejar.pyc
new file mode 100644
index 0000000..eb5bd73
Binary files /dev/null and b/mechanize/_firefox3cookiejar.pyc differ
diff --git a/mechanize/_form.py b/mechanize/_form.py
new file mode 100644
index 0000000..d45bdfc
--- /dev/null
+++ b/mechanize/_form.py
@@ -0,0 +1,3280 @@
+"""HTML form handling for web clients.
+
+HTML form handling for web clients: useful for parsing HTML forms, filling them
+in and returning the completed forms to the server.  This code developed from a
+port of Gisle Aas' Perl module HTML::Form, from the libwww-perl library, but
+the interface is not the same.
+
+The most useful docstring is the one for HTMLForm.
+
+RFC 1866: HTML 2.0
+RFC 1867: Form-based File Upload in HTML
+RFC 2388: Returning Values from Forms: multipart/form-data
+HTML 3.2 Specification, W3C Recommendation 14 January 1997 (for ISINDEX)
+HTML 4.01 Specification, W3C Recommendation 24 December 1999
+
+
+Copyright 2002-2007 John J. Lee <jjl@pobox.com>
+Copyright 2005 Gary Poster
+Copyright 2005 Zope Corporation
+Copyright 1998-2000 Gisle Aas.
+
+This code is free software; you can redistribute it and/or modify it
+under the terms of the BSD or ZPL 2.1 licenses (see the file
+COPYING.txt included with the distribution).
+
+"""
+
+# TODO:
+# Clean up post the merge into mechanize
+#  * Remove code that was duplicated in ClientForm and mechanize
+#  * Remove weird import stuff
+#  * Remove pre-Python 2.4 compatibility cruft
+#  * Clean up tests
+#  * Later release: Remove the ClientForm 0.1 backwards-compatibility switch
+# Remove parser testing hack
+# Clean action URI
+# Switch to unicode throughout
+#  See Wichert Akkerman's 2004-01-22 message to c.l.py.
+# Apply recommendations from google code project CURLIES
+# Apply recommendations from HTML 5 spec
+# Add charset parameter to Content-type headers?  How to find value??
+# Functional tests to add:
+#  Single and multiple file upload
+#  File upload with missing name (check standards)
+# mailto: submission & enctype text/plain??
+
+# Replace by_label etc. with moniker / selector concept.  Allows, e.g., a
+# choice between selection by value / id / label / element contents.  Or
+# choice between matching labels exactly or by substring.  etc.
+
+
+__all__ = ['AmbiguityError', 'CheckboxControl', 'Control',
+           'ControlNotFoundError', 'FileControl', 'FormParser', 'HTMLForm',
+           'HiddenControl', 'IgnoreControl', 'ImageControl', 'IsindexControl',
+           'Item', 'ItemCountError', 'ItemNotFoundError', 'Label',
+           'ListControl', 'LocateError', 'Missing', 'ParseError', 'ParseFile',
+           'ParseFileEx', 'ParseResponse', 'ParseResponseEx', 'PasswordControl',
+           'RadioControl', 'ScalarControl', 'SelectControl',
+           'SubmitButtonControl', 'SubmitControl', 'TextControl',
+           'TextareaControl', 'XHTMLCompatibleFormParser']
+
+import HTMLParser
+from cStringIO import StringIO
+import inspect
+import logging
+import random
+import re
+import sys
+import urllib
+import urlparse
+import warnings
+
+import _beautifulsoup
+import _request
+
+# from Python itself, for backwards compatibility of raised exceptions
+import sgmllib
+# bundled copy of sgmllib
+import _sgmllib_copy
+
+
+VERSION = "0.2.11"
+
+CHUNK = 1024  # size of chunks fed to parser, in bytes
+
+DEFAULT_ENCODING = "latin-1"
+
+_logger = logging.getLogger("mechanize.forms")
+OPTIMIZATION_HACK = True
+
+def debug(msg, *args, **kwds):
+    if OPTIMIZATION_HACK:
+        return
+
+    caller_name = inspect.stack()[1][3]
+    extended_msg = '%%s %s' % msg
+    extended_args = (caller_name,)+args
+    _logger.debug(extended_msg, *extended_args, **kwds)
+
+def _show_debug_messages():
+    global OPTIMIZATION_HACK
+    OPTIMIZATION_HACK = False
+    _logger.setLevel(logging.DEBUG)
+    handler = logging.StreamHandler(sys.stdout)
+    handler.setLevel(logging.DEBUG)
+    _logger.addHandler(handler)
+
+
+def deprecation(message, stack_offset=0):
+    warnings.warn(message, DeprecationWarning, stacklevel=3+stack_offset)
+
+
+class Missing: pass
+
+_compress_re = re.compile(r"\s+")
+def compress_text(text): return _compress_re.sub(" ", text.strip())
+
+def normalize_line_endings(text):
+    return re.sub(r"(?:(?<!\r)\n)|(?:\r(?!\n))", "\r\n", text)
+
+
+class MimeWriter:
+
+    """Generic MIME writer.
+
+    Usage:
+
+    f = <open the output file>
+    w = MimeWriter(f)
+    ...call w.addheader(key, value) 0 or more times...
+
+    followed by either:
+
+    f = w.startbody(content_type)
+    ...call f.write(data) for body data...
+
+    or:
+
+    w.startmultipartbody(subtype)
+    for each part:
+        subwriter = w.nextpart()
+        ...use the subwriter's methods to create the subpart...
+    w.lastpart()
+
+    The subwriter is another MimeWriter instance, and should be
+    treated in the same way as the toplevel MimeWriter.  This way,
+    writing recursive body parts is easy.
+
+    Warning: don't forget to call lastpart()!
+
+    XXX There should be more state so calls made in the wrong order
+    are detected.
+
+    Some special cases:
+
+    - startbody() just returns the file passed to the constructor;
+      but don't use this knowledge, as it may be changed.
+
+    - startmultipartbody() actually returns a file as well;
+      this can be used to write the initial 'if you can read this your
+      mailer is not MIME-aware' message.
+
+    - If you call flushheaders(), the headers accumulated so far are
+      written out (and forgotten); this is useful if you don't need a
+      body part at all, e.g. for a subpart of type message/rfc822
+      that's (mis)used to store some header-like information.
+ + - Passing a keyword argument 'prefix=' to addheader(), + start*body() affects where the header is inserted; 0 means + append at the end, 1 means insert at the start; default is + append for addheader(), but insert for start*body(), which use + it to determine where the Content-type header goes. + + """ + + def __init__(self, fp, http_hdrs=None): + self._http_hdrs = http_hdrs + self._fp = fp + self._headers = [] + self._boundary = [] + self._first_part = True + + def addheader(self, key, value, prefix=0, + add_to_http_hdrs=0): + """ + prefix is ignored if add_to_http_hdrs is true. + """ + lines = value.split("\r\n") + while lines and not lines[-1]: del lines[-1] + while lines and not lines[0]: del lines[0] + if add_to_http_hdrs: + value = "".join(lines) + # 2.2 urllib2 doesn't normalize header case + self._http_hdrs.append((key.capitalize(), value)) + else: + for i in range(1, len(lines)): + lines[i] = " " + lines[i].strip() + value = "\r\n".join(lines) + "\r\n" + line = key.title() + ": " + value + if prefix: + self._headers.insert(0, line) + else: + self._headers.append(line) + + def flushheaders(self): + self._fp.writelines(self._headers) + self._headers = [] + + def startbody(self, ctype=None, plist=[], prefix=1, + add_to_http_hdrs=0, content_type=1): + """ + prefix is ignored if add_to_http_hdrs is true. + """ + if content_type and ctype: + for name, value in plist: + ctype = ctype + ';\r\n %s=%s' % (name, value) + self.addheader("Content-Type", ctype, prefix=prefix, + add_to_http_hdrs=add_to_http_hdrs) + self.flushheaders() + if not add_to_http_hdrs: self._fp.write("\r\n") + self._first_part = True + return self._fp + + def startmultipartbody(self, subtype, boundary=None, plist=[], prefix=1, + add_to_http_hdrs=0, content_type=1): + boundary = boundary or choose_boundary() + self._boundary.append(boundary) + return self.startbody("multipart/" + subtype, + [("boundary", boundary)] + plist, + prefix=prefix, + add_to_http_hdrs=add_to_http_hdrs, + content_type=content_type) + + def nextpart(self): + boundary = self._boundary[-1] + if self._first_part: + self._first_part = False + else: + self._fp.write("\r\n") + self._fp.write("--" + boundary + "\r\n") + return self.__class__(self._fp) + + def lastpart(self): + if self._first_part: + self.nextpart() + boundary = self._boundary.pop() + self._fp.write("\r\n--" + boundary + "--\r\n") + + +class LocateError(ValueError): pass +class AmbiguityError(LocateError): pass +class ControlNotFoundError(LocateError): pass +class ItemNotFoundError(LocateError): pass + +class ItemCountError(ValueError): pass + +# for backwards compatibility, ParseError derives from exceptions that were +# raised by versions of ClientForm <= 0.2.5 +# TODO: move to _html +class ParseError(sgmllib.SGMLParseError, + HTMLParser.HTMLParseError): + + def __init__(self, *args, **kwds): + Exception.__init__(self, *args, **kwds) + + def __str__(self): + return Exception.__str__(self) + + +class _AbstractFormParser: + """forms attribute contains HTMLForm instances on completion.""" + # thanks to Moshe Zadka for an example of sgmllib/htmllib usage + def __init__(self, entitydefs=None, encoding=DEFAULT_ENCODING): + if entitydefs is None: + entitydefs = get_entitydefs() + self._entitydefs = entitydefs + self._encoding = encoding + + self.base = None + self.forms = [] + self.labels = [] + self._current_label = None + self._current_form = None + self._select = None + self._optgroup = None + self._option = None + self._textarea = None + + # forms[0] will contain all controls that are 
outside of any form + # self._global_form is an alias for self.forms[0] + self._global_form = None + self.start_form([]) + self.end_form() + self._current_form = self._global_form = self.forms[0] + + def do_base(self, attrs): + debug("%s", attrs) + for key, value in attrs: + if key == "href": + self.base = self.unescape_attr_if_required(value) + + def end_body(self): + debug("") + if self._current_label is not None: + self.end_label() + if self._current_form is not self._global_form: + self.end_form() + + def start_form(self, attrs): + debug("%s", attrs) + if self._current_form is not self._global_form: + raise ParseError("nested FORMs") + name = None + action = None + enctype = "application/x-www-form-urlencoded" + method = "GET" + d = {} + for key, value in attrs: + if key == "name": + name = self.unescape_attr_if_required(value) + elif key == "action": + action = self.unescape_attr_if_required(value) + elif key == "method": + method = self.unescape_attr_if_required(value.upper()) + elif key == "enctype": + enctype = self.unescape_attr_if_required(value.lower()) + d[key] = self.unescape_attr_if_required(value) + controls = [] + self._current_form = (name, action, method, enctype), d, controls + + def end_form(self): + debug("") + if self._current_label is not None: + self.end_label() + if self._current_form is self._global_form: + raise ParseError("end of FORM before start") + self.forms.append(self._current_form) + self._current_form = self._global_form + + def start_select(self, attrs): + debug("%s", attrs) + if self._select is not None: + raise ParseError("nested SELECTs") + if self._textarea is not None: + raise ParseError("SELECT inside TEXTAREA") + d = {} + for key, val in attrs: + d[key] = self.unescape_attr_if_required(val) + + self._select = d + self._add_label(d) + + self._append_select_control({"__select": d}) + + def end_select(self): + debug("") + if self._select is None: + raise ParseError("end of SELECT before start") + + if self._option is not None: + self._end_option() + + self._select = None + + def start_optgroup(self, attrs): + debug("%s", attrs) + if self._select is None: + raise ParseError("OPTGROUP outside of SELECT") + d = {} + for key, val in attrs: + d[key] = self.unescape_attr_if_required(val) + + self._optgroup = d + + def end_optgroup(self): + debug("") + if self._optgroup is None: + raise ParseError("end of OPTGROUP before start") + self._optgroup = None + + def _start_option(self, attrs): + debug("%s", attrs) + if self._select is None: + raise ParseError("OPTION outside of SELECT") + if self._option is not None: + self._end_option() + + d = {} + for key, val in attrs: + d[key] = self.unescape_attr_if_required(val) + + self._option = {} + self._option.update(d) + if (self._optgroup and self._optgroup.has_key("disabled") and + not self._option.has_key("disabled")): + self._option["disabled"] = None + + def _end_option(self): + debug("") + if self._option is None: + raise ParseError("end of OPTION before start") + + contents = self._option.get("contents", "").strip() + self._option["contents"] = contents + if not self._option.has_key("value"): + self._option["value"] = contents + if not self._option.has_key("label"): + self._option["label"] = contents + # stuff dict of SELECT HTML attrs into a special private key + # (gets deleted again later) + self._option["__select"] = self._select + self._append_select_control(self._option) + self._option = None + + def _append_select_control(self, attrs): + debug("%s", attrs) + controls = self._current_form[2] + name = 
+
+    def start_textarea(self, attrs):
+        debug("%s", attrs)
+        if self._textarea is not None:
+            raise ParseError("nested TEXTAREAs")
+        if self._select is not None:
+            raise ParseError("TEXTAREA inside SELECT")
+        d = {}
+        for key, val in attrs:
+            d[key] = self.unescape_attr_if_required(val)
+        self._add_label(d)
+
+        self._textarea = d
+
+    def end_textarea(self):
+        debug("")
+        if self._textarea is None:
+            raise ParseError("end of TEXTAREA before start")
+        controls = self._current_form[2]
+        name = self._textarea.get("name")
+        controls.append(("textarea", name, self._textarea))
+        self._textarea = None
+
+    def start_label(self, attrs):
+        debug("%s", attrs)
+        if self._current_label:
+            self.end_label()
+        d = {}
+        for key, val in attrs:
+            d[key] = self.unescape_attr_if_required(val)
+        taken = bool(d.get("for"))  # empty id is invalid
+        d["__text"] = ""
+        d["__taken"] = taken
+        if taken:
+            self.labels.append(d)
+        self._current_label = d
+
+    def end_label(self):
+        debug("")
+        label = self._current_label
+        if label is None:
+            # something is ugly in the HTML, but we're ignoring it
+            return
+        self._current_label = None
+        # if it is staying around, it is True in all cases
+        del label["__taken"]
+
+    def _add_label(self, d):
+        #debug("%s", d)
+        if self._current_label is not None:
+            if not self._current_label["__taken"]:
+                self._current_label["__taken"] = True
+                d["__label"] = self._current_label
+
+    def handle_data(self, data):
+        debug("%s", data)
+
+        if self._option is not None:
+            # self._option is a dictionary of the OPTION element's HTML
+            # attributes, but it has two special keys: the "contents" key,
+            # which contains the text between the OPTION tags, and the
+            # "__select" key (see the _end_option method)
+            map = self._option
+            key = "contents"
+        elif self._textarea is not None:
+            map = self._textarea
+            key = "value"
+            data = normalize_line_endings(data)
+        # not if within option or textarea
+        elif self._current_label is not None:
+            map = self._current_label
+            key = "__text"
+        else:
+            return
+
+        if data and not map.has_key(key):
+            # according to
+            # http://www.w3.org/TR/html4/appendix/notes.html#h-B.3.1 line break
+            # immediately after start tags or immediately before end tags must
+            # be ignored, but real browsers only ignore a line break after a
+            # start tag, so we'll do that.
+            if data[0:2] == "\r\n":
+                data = data[2:]
+            elif data[0:1] in ["\n", "\r"]:
+                data = data[1:]
+            map[key] = data
+        else:
+            map[key] = map[key] + data
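+
+    # For instance (illustrative): feeding "<option>\r\nBig</option>" leaves
+    # contents of "Big" -- the line break immediately after the start tag is
+    # dropped, as described above, and _end_option() then strips the contents
+    # and copies them into the "value" and "label" keys if those are absent.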
+
+    def do_button(self, attrs):
+        debug("%s", attrs)
+        d = {}
+        d["type"] = "submit"  # default
+        for key, val in attrs:
+            d[key] = self.unescape_attr_if_required(val)
+        controls = self._current_form[2]
+
+        type = d["type"]
+        name = d.get("name")
+        # we don't want to lose information, so use a type string that
+        # doesn't clash with INPUT TYPE={SUBMIT,RESET,BUTTON}
+        # e.g. type for BUTTON/RESET is "resetbutton"
+        # (type for INPUT/RESET is "reset")
+        type = type+"button"
+        self._add_label(d)
+        controls.append((type, name, d))
+
+    def do_input(self, attrs):
+        debug("%s", attrs)
+        d = {}
+        d["type"] = "text"  # default
+        for key, val in attrs:
+            d[key] = self.unescape_attr_if_required(val)
+        controls = self._current_form[2]
+
+        type = d["type"]
+        name = d.get("name")
+        self._add_label(d)
+        controls.append((type, name, d))
+
+    def do_isindex(self, attrs):
+        debug("%s", attrs)
+        d = {}
+        for key, val in attrs:
+            d[key] = self.unescape_attr_if_required(val)
+        controls = self._current_form[2]
+
+        self._add_label(d)
+        # isindex doesn't have type or name HTML attributes
+        controls.append(("isindex", None, d))
+
+    def handle_entityref(self, name):
+        #debug("%s", name)
+        self.handle_data(unescape(
+            '&%s;' % name, self._entitydefs, self._encoding))
+
+    def handle_charref(self, name):
+        #debug("%s", name)
+        self.handle_data(unescape_charref(name, self._encoding))
+
+    def unescape_attr(self, name):
+        #debug("%s", name)
+        return unescape(name, self._entitydefs, self._encoding)
+
+    def unescape_attrs(self, attrs):
+        #debug("%s", attrs)
+        escaped_attrs = {}
+        for key, val in attrs.items():
+            try:
+                val.items
+            except AttributeError:
+                escaped_attrs[key] = self.unescape_attr(val)
+            else:
+                # e.g. "__select" -- yuck!
+                escaped_attrs[key] = self.unescape_attrs(val)
+        return escaped_attrs
+
+    def unknown_entityref(self, ref): self.handle_data("&%s;" % ref)
+    def unknown_charref(self, ref): self.handle_data("&#%s;" % ref)
+
+
+class XHTMLCompatibleFormParser(_AbstractFormParser, HTMLParser.HTMLParser):
+    """Good for XHTML, bad for tolerance of incorrect HTML."""
+    # thanks to Michael Howitz for this!
+    def __init__(self, entitydefs=None, encoding=DEFAULT_ENCODING):
+        HTMLParser.HTMLParser.__init__(self)
+        _AbstractFormParser.__init__(self, entitydefs, encoding)
+
+    def feed(self, data):
+        try:
+            HTMLParser.HTMLParser.feed(self, data)
+        except HTMLParser.HTMLParseError, exc:
+            raise ParseError(exc)
+
+    def start_option(self, attrs):
+        _AbstractFormParser._start_option(self, attrs)
+
+    def end_option(self):
+        _AbstractFormParser._end_option(self)
+
+    def handle_starttag(self, tag, attrs):
+        try:
+            method = getattr(self, "start_" + tag)
+        except AttributeError:
+            try:
+                method = getattr(self, "do_" + tag)
+            except AttributeError:
+                pass  # unknown tag
+            else:
+                method(attrs)
+        else:
+            method(attrs)
+
+    def handle_endtag(self, tag):
+        try:
+            method = getattr(self, "end_" + tag)
+        except AttributeError:
+            pass  # unknown tag
+        else:
+            method()
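+
+    # Dispatch sketch (illustrative): handle_starttag("form", ...) resolves
+    # to start_form above; handle_starttag("input", ...) finds no start_input
+    # and falls back to do_input; a tag with neither method is silently
+    # ignored.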
+    def unescape(self, name):
+        # Use the entitydefs passed into constructor, not
+        # HTMLParser.HTMLParser's entitydefs.
+        return self.unescape_attr(name)
+
+    def unescape_attr_if_required(self, name):
+        return name  # HTMLParser.HTMLParser already did it
+    def unescape_attrs_if_required(self, attrs):
+        return attrs  # ditto
+
+    def close(self):
+        HTMLParser.HTMLParser.close(self)
+        self.end_body()
+
+
+class _AbstractSgmllibParser(_AbstractFormParser):
+
+    def do_option(self, attrs):
+        _AbstractFormParser._start_option(self, attrs)
+
+    # we override this attr to decode hex charrefs
+    entity_or_charref = re.compile(
+        '&(?:([a-zA-Z][-.a-zA-Z0-9]*)|#(x?[0-9a-fA-F]+))(;?)')
+    def convert_entityref(self, name):
+        return unescape("&%s;" % name, self._entitydefs, self._encoding)
+    def convert_charref(self, name):
+        return unescape_charref("%s" % name, self._encoding)
+    def unescape_attr_if_required(self, name):
+        return name  # sgmllib already did it
+    def unescape_attrs_if_required(self, attrs):
+        return attrs  # ditto
+
+
+class FormParser(_AbstractSgmllibParser, _sgmllib_copy.SGMLParser):
+    """Good for tolerance of incorrect HTML, bad for XHTML."""
+    def __init__(self, entitydefs=None, encoding=DEFAULT_ENCODING):
+        _sgmllib_copy.SGMLParser.__init__(self)
+        _AbstractFormParser.__init__(self, entitydefs, encoding)
+
+    def feed(self, data):
+        try:
+            _sgmllib_copy.SGMLParser.feed(self, data)
+        except _sgmllib_copy.SGMLParseError, exc:
+            raise ParseError(exc)
+
+    def close(self):
+        _sgmllib_copy.SGMLParser.close(self)
+        self.end_body()
+
+
+class _AbstractBSFormParser(_AbstractSgmllibParser):
+
+    bs_base_class = None
+
+    def __init__(self, entitydefs=None, encoding=DEFAULT_ENCODING):
+        _AbstractFormParser.__init__(self, entitydefs, encoding)
+        self.bs_base_class.__init__(self)
+
+    def handle_data(self, data):
+        _AbstractFormParser.handle_data(self, data)
+        self.bs_base_class.handle_data(self, data)
+
+    def feed(self, data):
+        try:
+            self.bs_base_class.feed(self, data)
+        except _sgmllib_copy.SGMLParseError, exc:
+            raise ParseError(exc)
+
+    def close(self):
+        self.bs_base_class.close(self)
+        self.end_body()
+
+
+class RobustFormParser(_AbstractBSFormParser, _beautifulsoup.BeautifulSoup):
+
+    """Tries to be highly tolerant of incorrect HTML."""
+
+    bs_base_class = _beautifulsoup.BeautifulSoup
+
+
+class NestingRobustFormParser(_AbstractBSFormParser,
+                              _beautifulsoup.ICantBelieveItsBeautifulSoup):
+
+    """Tries to be highly tolerant of incorrect HTML.
+
+    Different from RobustFormParser in that it more often guesses nesting
+    above missing end tags (see BeautifulSoup docs).
+    """
+
+    bs_base_class = _beautifulsoup.ICantBelieveItsBeautifulSoup
+
+
+#FormParser = XHTMLCompatibleFormParser  # testing hack
+#FormParser = RobustFormParser  # testing hack
+
+
+def ParseResponseEx(response,
+                    select_default=False,
+                    form_parser_class=FormParser,
+                    request_class=_request.Request,
+                    entitydefs=None,
+                    encoding=DEFAULT_ENCODING,
+
+                    # private
+                    _urljoin=urlparse.urljoin,
+                    _urlparse=urlparse.urlparse,
+                    _urlunparse=urlparse.urlunparse,
+                    ):
+    """Identical to ParseResponse, except that:
+
+    1. The returned list contains an extra item.  The first form in the list
+    contains all controls not contained in any FORM element.
+
+    2. The arguments ignore_errors and backwards_compat have been removed.
+
+    3. Backwards-compatibility mode (backwards_compat=True) is not available.
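+
+    Example (an illustrative sketch -- the URL here is assumed):
+
+      import mechanize
+      response = mechanize.urlopen("http://example.com/form.html")
+      forms = mechanize.ParseResponseEx(response)
+      global_form = forms[0]  # controls not inside any FORM element
+      first_form = forms[1]   # the first FORM in the document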
+ """ + return _ParseFileEx(response, response.geturl(), + select_default, + False, + form_parser_class, + request_class, + entitydefs, + False, + encoding, + _urljoin=_urljoin, + _urlparse=_urlparse, + _urlunparse=_urlunparse, + ) + +def ParseFileEx(file, base_uri, + select_default=False, + form_parser_class=FormParser, + request_class=_request.Request, + entitydefs=None, + encoding=DEFAULT_ENCODING, + + # private + _urljoin=urlparse.urljoin, + _urlparse=urlparse.urlparse, + _urlunparse=urlparse.urlunparse, + ): + """Identical to ParseFile, except that: + + 1. The returned list contains an extra item. The first form in the list + contains all controls not contained in any FORM element. + + 2. The arguments ignore_errors and backwards_compat have been removed. + + 3. Backwards-compatibility mode (backwards_compat=True) is not available. + """ + return _ParseFileEx(file, base_uri, + select_default, + False, + form_parser_class, + request_class, + entitydefs, + False, + encoding, + _urljoin=_urljoin, + _urlparse=_urlparse, + _urlunparse=_urlunparse, + ) + +def ParseString(text, base_uri, *args, **kwds): + fh = StringIO(text) + return ParseFileEx(fh, base_uri, *args, **kwds) + +def ParseResponse(response, *args, **kwds): + """Parse HTTP response and return a list of HTMLForm instances. + + The return value of mechanize.urlopen can be conveniently passed to this + function as the response parameter. + + mechanize.ParseError is raised on parse errors. + + response: file-like object (supporting read() method) with a method + geturl(), returning the URI of the HTTP response + select_default: for multiple-selection SELECT controls and RADIO controls, + pick the first item as the default if none are selected in the HTML + form_parser_class: class to instantiate and use to pass + request_class: class to return from .click() method (default is + mechanize.Request) + entitydefs: mapping like {"&": "&", ...} containing HTML entity + definitions (a sensible default is used) + encoding: character encoding used for encoding numeric character references + when matching link text. mechanize does not attempt to find the encoding + in a META HTTP-EQUIV attribute in the document itself (mechanize, for + example, does do that and will pass the correct value to mechanize using + this parameter). + + backwards_compat: boolean that determines whether the returned HTMLForm + objects are backwards-compatible with old code. If backwards_compat is + true: + + - ClientForm 0.1 code will continue to work as before. + + - Label searches that do not specify a nr (number or count) will always + get the first match, even if other controls match. If + backwards_compat is False, label searches that have ambiguous results + will raise an AmbiguityError. + + - Item label matching is done by strict string comparison rather than + substring matching. + + - De-selecting individual list items is allowed even if the Item is + disabled. + + The backwards_compat argument will be removed in a future release. + + Pass a true value for select_default if you want the behaviour specified by + RFC 1866 (the HTML 2.0 standard), which is to select the first item in a + RADIO or multiple-selection SELECT control if none were selected in the + HTML. Most browsers (including Microsoft Internet Explorer (IE) and + Netscape Navigator) instead leave all items unselected in these cases. 
+
+def ParseResponse(response, *args, **kwds):
+    """Parse HTTP response and return a list of HTMLForm instances.
+
+    The return value of mechanize.urlopen can be conveniently passed to this
+    function as the response parameter.
+
+    mechanize.ParseError is raised on parse errors.
+
+    response: file-like object (supporting read() method) with a method
+     geturl(), returning the URI of the HTTP response
+    select_default: for multiple-selection SELECT controls and RADIO controls,
+     pick the first item as the default if none are selected in the HTML
+    form_parser_class: class to instantiate and use to pass markup to
+    request_class: class to return from .click() method (default is
+     mechanize.Request)
+    entitydefs: mapping like {"&amp;": "&", ...} containing HTML entity
+     definitions (a sensible default is used)
+    encoding: character encoding used for encoding numeric character
+     references when matching link text.  mechanize does not itself attempt
+     to find the encoding in a META HTTP-EQUIV attribute in the document
+     (higher-level mechanize code does that, and passes the correct value in
+     using this parameter)
+
+    backwards_compat: boolean that determines whether the returned HTMLForm
+    objects are backwards-compatible with old code.  If backwards_compat is
+    true:
+
+    - ClientForm 0.1 code will continue to work as before.
+
+    - Label searches that do not specify a nr (number or count) will always
+      get the first match, even if other controls match.  If
+      backwards_compat is False, label searches that have ambiguous results
+      will raise an AmbiguityError.
+
+    - Item label matching is done by strict string comparison rather than
+      substring matching.
+
+    - De-selecting individual list items is allowed even if the Item is
+      disabled.
+
+    The backwards_compat argument will be removed in a future release.
+
+    Pass a true value for select_default if you want the behaviour specified
+    by RFC 1866 (the HTML 2.0 standard), which is to select the first item in
+    a RADIO or multiple-selection SELECT control if none were selected in the
+    HTML.  Most browsers (including Microsoft Internet Explorer (IE) and
+    Netscape Navigator) instead leave all items unselected in these cases.
+    The W3C HTML 4.0 standard leaves this behaviour undefined in the case of
+    multiple-selection SELECT controls, but insists that at least one RADIO
+    button should be checked at all times, in contradiction to browser
+    behaviour.
+
+    There is a choice of parsers.  mechanize.XHTMLCompatibleFormParser (uses
+    HTMLParser.HTMLParser) works best for XHTML; mechanize.FormParser (the
+    default, using a bundled copy of sgmllib.SGMLParser) works better for
+    ordinary grubby HTML.  Note that HTMLParser is only available in Python
+    2.2 and later.  You can pass your own class in here as a hack to work
+    around bad HTML, but at your own risk: there is no well-defined interface.
+
+    """
+    return _ParseFileEx(response, response.geturl(), *args, **kwds)[1:]
+
+def ParseFile(file, base_uri, *args, **kwds):
+    """Parse HTML and return a list of HTMLForm instances.
+
+    mechanize.ParseError is raised on parse errors.
+
+    file: file-like object (supporting read() method) containing HTML with
+     zero or more forms to be parsed
+    base_uri: the URI of the document (note that the base URI used to submit
+     the form will be that given in the BASE element if present, not that of
+     the document)
+
+    For the other arguments and further details, see ParseResponse.__doc__.
+
+    """
+    return _ParseFileEx(file, base_uri, *args, **kwds)[1:]
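+
+# Illustrative sketch of ParseFile (the filename is assumed):
+#
+#     forms = ParseFile(open("page.html"), "http://example.com/page.html")
+#     for form in forms:
+#         print form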
AttributeError("text attribute is read-only") + self.__dict__[name] = value + + def __str__(self): + return "" % (self.id, self.text) + + +def _get_label(attrs): + text = attrs.get("__label") + if text is not None: + return Label(text) + else: + return None + +class Control: + """An HTML form control. + + An HTMLForm contains a sequence of Controls. The Controls in an HTMLForm + are accessed using the HTMLForm.find_control method or the + HTMLForm.controls attribute. + + Control instances are usually constructed using the ParseFile / + ParseResponse functions. If you use those functions, you can ignore the + rest of this paragraph. A Control is only properly initialised after the + fixup method has been called. In fact, this is only strictly necessary for + ListControl instances. This is necessary because ListControls are built up + from ListControls each containing only a single item, and their initial + value(s) can only be known after the sequence is complete. + + The types and values that are acceptable for assignment to the value + attribute are defined by subclasses. + + If the disabled attribute is true, this represents the state typically + represented by browsers by 'greying out' a control. If the disabled + attribute is true, the Control will raise AttributeError if an attempt is + made to change its value. In addition, the control will not be considered + 'successful' as defined by the W3C HTML 4 standard -- ie. it will + contribute no data to the return value of the HTMLForm.click* methods. To + enable a control, set the disabled attribute to a false value. + + If the readonly attribute is true, the Control will raise AttributeError if + an attempt is made to change its value. To make a control writable, set + the readonly attribute to a false value. + + All controls have the disabled and readonly attributes, not only those that + may have the HTML attributes of the same names. + + On assignment to the value attribute, the following exceptions are raised: + TypeError, AttributeError (if the value attribute should not be assigned + to, because the control is disabled, for example) and ValueError. + + If the name or value attributes are None, or the value is an empty list, or + if the control is disabled, the control is not successful. + + Public attributes: + + type: string describing type of control (see the keys of the + HTMLForm.type2class dictionary for the allowable values) (readonly) + name: name of control (readonly) + value: current value of control (subclasses may allow a single value, a + sequence of values, or either) + disabled: disabled state + readonly: readonly state + id: value of id HTML attribute + + """ + def __init__(self, type, name, attrs, index=None): + """ + type: string describing type of control (see the keys of the + HTMLForm.type2class dictionary for the allowable values) + name: control name + attrs: HTML attributes of control's HTML element + + """ + raise NotImplementedError() + + def add_to_form(self, form): + self._form = form + form.controls.append(self) + + def fixup(self): + pass + + def is_of_kind(self, kind): + raise NotImplementedError() + + def clear(self): + raise NotImplementedError() + + def __getattr__(self, name): raise NotImplementedError() + def __setattr__(self, name, value): raise NotImplementedError() + + def pairs(self): + """Return list of (key, value) pairs suitable for passing to urlencode. 
+ """ + return [(k, v) for (i, k, v) in self._totally_ordered_pairs()] + + def _totally_ordered_pairs(self): + """Return list of (key, value, index) tuples. + + Like pairs, but allows preserving correct ordering even where several + controls are involved. + + """ + raise NotImplementedError() + + def _write_mime_data(self, mw, name, value): + """Write data for a subitem of this control to a MimeWriter.""" + # called by HTMLForm + mw2 = mw.nextpart() + mw2.addheader("Content-Disposition", + 'form-data; name="%s"' % name, 1) + f = mw2.startbody(prefix=0) + f.write(value) + + def __str__(self): + raise NotImplementedError() + + def get_labels(self): + """Return all labels (Label instances) for this control. + + If the control was surrounded by a