From c3418098cfee2a4a7ae5efcbd904c1de12850e7f Mon Sep 17 00:00:00 2001 From: Rajesh Majumdar Date: Mon, 27 Feb 2017 01:16:47 +0530 Subject: [PATCH] Added some important files. --- mechanize/__init__.py | 211 ++ mechanize/__init__.pyc | Bin 0 -> 5044 bytes mechanize/_auth.py | 68 + mechanize/_auth.pyc | Bin 0 -> 2857 bytes mechanize/_beautifulsoup.py | 1077 ++++++++++ mechanize/_beautifulsoup.pyc | Bin 0 -> 42887 bytes mechanize/_clientcookie.py | 1725 ++++++++++++++++ mechanize/_clientcookie.pyc | Bin 0 -> 59678 bytes mechanize/_debug.py | 28 + mechanize/_debug.pyc | Bin 0 -> 1528 bytes mechanize/_firefox3cookiejar.py | 248 +++ mechanize/_firefox3cookiejar.pyc | Bin 0 -> 9003 bytes mechanize/_form.py | 3280 ++++++++++++++++++++++++++++++ mechanize/_form.pyc | Bin 0 -> 115144 bytes mechanize/_gzip.py | 105 + mechanize/_gzip.pyc | Bin 0 -> 4645 bytes mechanize/_headersutil.py | 241 +++ mechanize/_headersutil.pyc | Bin 0 -> 8302 bytes mechanize/_html.py | 629 ++++++ mechanize/_html.pyc | Bin 0 -> 22594 bytes mechanize/_http.py | 447 ++++ mechanize/_http.pyc | Bin 0 -> 16608 bytes mechanize/_lwpcookiejar.py | 185 ++ mechanize/_lwpcookiejar.pyc | Bin 0 -> 6310 bytes mechanize/_markupbase.py | 393 ++++ mechanize/_mechanize.py | 669 ++++++ mechanize/_mechanize.pyc | Bin 0 -> 24844 bytes mechanize/_mozillacookiejar.py | 161 ++ mechanize/_mozillacookiejar.pyc | Bin 0 -> 5113 bytes mechanize/_msiecookiejar.py | 388 ++++ mechanize/_msiecookiejar.pyc | Bin 0 -> 12443 bytes mechanize/_opener.py | 442 ++++ mechanize/_opener.pyc | Bin 0 -> 12793 bytes mechanize/_pullparser.py | 391 ++++ mechanize/_pullparser.pyc | Bin 0 -> 17658 bytes mechanize/_request.py | 40 + mechanize/_request.pyc | Bin 0 -> 1583 bytes mechanize/_response.py | 525 +++++ mechanize/_response.pyc | Bin 0 -> 17286 bytes mechanize/_rfc3986.py | 245 +++ mechanize/_rfc3986.pyc | Bin 0 -> 4614 bytes mechanize/_sgmllib_copy.py | 559 +++++ mechanize/_sgmllib_copy.pyc | Bin 0 -> 16285 bytes mechanize/_sockettimeout.py 
| 6 + mechanize/_sockettimeout.pyc | Bin 0 -> 276 bytes mechanize/_testcase.py | 162 ++ mechanize/_urllib2.py | 50 + mechanize/_urllib2.pyc | Bin 0 -> 1661 bytes mechanize/_urllib2_fork.py | 1414 +++++++++++++ mechanize/_urllib2_fork.pyc | Bin 0 -> 46355 bytes mechanize/_useragent.py | 367 ++++ mechanize/_useragent.pyc | Bin 0 -> 14156 bytes mechanize/_util.py | 305 +++ mechanize/_util.pyc | Bin 0 -> 10278 bytes mechanize/_version.py | 2 + mechanize/_version.pyc | Bin 0 -> 232 bytes 56 files changed, 14363 insertions(+) create mode 100644 mechanize/__init__.py create mode 100644 mechanize/__init__.pyc create mode 100644 mechanize/_auth.py create mode 100644 mechanize/_auth.pyc create mode 100644 mechanize/_beautifulsoup.py create mode 100644 mechanize/_beautifulsoup.pyc create mode 100644 mechanize/_clientcookie.py create mode 100644 mechanize/_clientcookie.pyc create mode 100644 mechanize/_debug.py create mode 100644 mechanize/_debug.pyc create mode 100644 mechanize/_firefox3cookiejar.py create mode 100644 mechanize/_firefox3cookiejar.pyc create mode 100644 mechanize/_form.py create mode 100644 mechanize/_form.pyc create mode 100644 mechanize/_gzip.py create mode 100644 mechanize/_gzip.pyc create mode 100644 mechanize/_headersutil.py create mode 100644 mechanize/_headersutil.pyc create mode 100644 mechanize/_html.py create mode 100644 mechanize/_html.pyc create mode 100644 mechanize/_http.py create mode 100644 mechanize/_http.pyc create mode 100644 mechanize/_lwpcookiejar.py create mode 100644 mechanize/_lwpcookiejar.pyc create mode 100644 mechanize/_markupbase.py create mode 100644 mechanize/_mechanize.py create mode 100644 mechanize/_mechanize.pyc create mode 100644 mechanize/_mozillacookiejar.py create mode 100644 mechanize/_mozillacookiejar.pyc create mode 100644 mechanize/_msiecookiejar.py create mode 100644 mechanize/_msiecookiejar.pyc create mode 100644 mechanize/_opener.py create mode 100644 mechanize/_opener.pyc create mode 100644 mechanize/_pullparser.py 
create mode 100644 mechanize/_pullparser.pyc create mode 100644 mechanize/_request.py create mode 100644 mechanize/_request.pyc create mode 100644 mechanize/_response.py create mode 100644 mechanize/_response.pyc create mode 100644 mechanize/_rfc3986.py create mode 100644 mechanize/_rfc3986.pyc create mode 100644 mechanize/_sgmllib_copy.py create mode 100644 mechanize/_sgmllib_copy.pyc create mode 100644 mechanize/_sockettimeout.py create mode 100644 mechanize/_sockettimeout.pyc create mode 100644 mechanize/_testcase.py create mode 100644 mechanize/_urllib2.py create mode 100644 mechanize/_urllib2.pyc create mode 100644 mechanize/_urllib2_fork.py create mode 100644 mechanize/_urllib2_fork.pyc create mode 100644 mechanize/_useragent.py create mode 100644 mechanize/_useragent.pyc create mode 100644 mechanize/_util.py create mode 100644 mechanize/_util.pyc create mode 100644 mechanize/_version.py create mode 100644 mechanize/_version.pyc diff --git a/mechanize/__init__.py b/mechanize/__init__.py new file mode 100644 index 0000000..c4429be --- /dev/null +++ b/mechanize/__init__.py @@ -0,0 +1,211 @@ +__all__ = [ + 'AbstractBasicAuthHandler', + 'AbstractDigestAuthHandler', + 'BaseHandler', + 'Browser', + 'BrowserStateError', + 'CacheFTPHandler', + 'ContentTooShortError', + 'Cookie', + 'CookieJar', + 'CookiePolicy', + 'DefaultCookiePolicy', + 'DefaultFactory', + 'FTPHandler', + 'Factory', + 'FileCookieJar', + 'FileHandler', + 'FormNotFoundError', + 'FormsFactory', + 'HTTPBasicAuthHandler', + 'HTTPCookieProcessor', + 'HTTPDefaultErrorHandler', + 'HTTPDigestAuthHandler', + 'HTTPEquivProcessor', + 'HTTPError', + 'HTTPErrorProcessor', + 'HTTPHandler', + 'HTTPPasswordMgr', + 'HTTPPasswordMgrWithDefaultRealm', + 'HTTPProxyPasswordMgr', + 'HTTPRedirectDebugProcessor', + 'HTTPRedirectHandler', + 'HTTPRefererProcessor', + 'HTTPRefreshProcessor', + 'HTTPResponseDebugProcessor', + 'HTTPRobotRulesProcessor', + 'HTTPSClientCertMgr', + 'HeadParser', + 'History', + 'LWPCookieJar', + 
'Link', + 'LinkNotFoundError', + 'LinksFactory', + 'LoadError', + 'MSIECookieJar', + 'MozillaCookieJar', + 'OpenerDirector', + 'OpenerFactory', + 'ParseError', + 'ProxyBasicAuthHandler', + 'ProxyDigestAuthHandler', + 'ProxyHandler', + 'Request', + 'RobotExclusionError', + 'RobustFactory', + 'RobustFormsFactory', + 'RobustLinksFactory', + 'RobustTitleFactory', + 'SeekableResponseOpener', + 'TitleFactory', + 'URLError', + 'USE_BARE_EXCEPT', + 'UnknownHandler', + 'UserAgent', + 'UserAgentBase', + 'XHTMLCompatibleHeadParser', + '__version__', + 'build_opener', + 'install_opener', + 'lwp_cookie_str', + 'make_response', + 'request_host', + 'response_seek_wrapper', # XXX deprecate in public interface? + 'seek_wrapped_response', # XXX should probably use this internally in place of response_seek_wrapper() + 'str2time', + 'urlopen', + 'urlretrieve', + 'urljoin', + + # ClientForm API + 'AmbiguityError', + 'ControlNotFoundError', + 'FormParser', + 'ItemCountError', + 'ItemNotFoundError', + 'LocateError', + 'Missing', + 'ParseFile', + 'ParseFileEx', + 'ParseResponse', + 'ParseResponseEx', + 'ParseString', + 'XHTMLCompatibleFormParser', + # deprecated + 'CheckboxControl', + 'Control', + 'FileControl', + 'HTMLForm', + 'HiddenControl', + 'IgnoreControl', + 'ImageControl', + 'IsindexControl', + 'Item', + 'Label', + 'ListControl', + 'PasswordControl', + 'RadioControl', + 'ScalarControl', + 'SelectControl', + 'SubmitButtonControl', + 'SubmitControl', + 'TextControl', + 'TextareaControl', + ] + +import logging +import sys + +from _version import __version__ + +# high-level stateful browser-style interface +from _mechanize import \ + Browser, History, \ + BrowserStateError, LinkNotFoundError, FormNotFoundError + +# configurable URL-opener interface +from _useragent import UserAgentBase, UserAgent +from _html import \ + Link, \ + Factory, DefaultFactory, RobustFactory, \ + FormsFactory, LinksFactory, TitleFactory, \ + RobustFormsFactory, RobustLinksFactory, RobustTitleFactory + +# 
urllib2 work-alike interface. This is a superset of the urllib2 interface. +from _urllib2 import * +import _urllib2 +if hasattr(_urllib2, "HTTPSHandler"): + __all__.append("HTTPSHandler") +del _urllib2 + +# misc +from _http import HeadParser +from _http import XHTMLCompatibleHeadParser +from _opener import ContentTooShortError, OpenerFactory, urlretrieve +from _response import \ + response_seek_wrapper, seek_wrapped_response, make_response +from _rfc3986 import urljoin +from _util import http2time as str2time + +# cookies +from _clientcookie import Cookie, CookiePolicy, DefaultCookiePolicy, \ + CookieJar, FileCookieJar, LoadError, request_host_lc as request_host, \ + effective_request_host +from _lwpcookiejar import LWPCookieJar, lwp_cookie_str +# 2.4 raises SyntaxError due to generator / try/finally use +if sys.version_info[:2] > (2,4): + try: + import sqlite3 + except ImportError: + pass + else: + from _firefox3cookiejar import Firefox3CookieJar +from _mozillacookiejar import MozillaCookieJar +from _msiecookiejar import MSIECookieJar + +# forms +from _form import ( + AmbiguityError, + ControlNotFoundError, + FormParser, + ItemCountError, + ItemNotFoundError, + LocateError, + Missing, + ParseError, + ParseFile, + ParseFileEx, + ParseResponse, + ParseResponseEx, + ParseString, + XHTMLCompatibleFormParser, + # deprecated + CheckboxControl, + Control, + FileControl, + HTMLForm, + HiddenControl, + IgnoreControl, + ImageControl, + IsindexControl, + Item, + Label, + ListControl, + PasswordControl, + RadioControl, + ScalarControl, + SelectControl, + SubmitButtonControl, + SubmitControl, + TextControl, + TextareaControl, + ) + +# If you hate the idea of turning bugs into warnings, do: +# import mechanize; mechanize.USE_BARE_EXCEPT = False +USE_BARE_EXCEPT = True + +logger = logging.getLogger("mechanize") +if logger.level is logging.NOTSET: + logger.setLevel(logging.CRITICAL) +del logger diff --git a/mechanize/__init__.pyc b/mechanize/__init__.pyc new file mode 100644 
index 0000000000000000000000000000000000000000..ed78626482b244224b5c43c9555e70225871b5f1 GIT binary patch literal 5044 zcmdT{>2?#z75-$I&Dhvr_OK6&*%uGXFl zykFiSkB|pQ>Q<{Id3?w@`IoSKuDVrq>n`8D)z1HOWN7*ylgo8<{wVYFnfagSL4eG^ z9B2TP2MvODfOdj*fp&xTfQCSOLBpUC&_2*8XbiL;bO1CCItV%hIt)4jItn@lIu80B z=mh8_=oIKQ=nUvA=p5(=(0R}W&_&QC&;;l*=nCj6=o;uc=mzK}=oaWU=niNS^dsml z=pN`k=mF>_&_mGAphuv`peLZGpl6^d&~s1$Gz}_(W7CtaS-Yd#387|5Qm|T zKpcTO3UM^mk{~#SYRA%J5XT^oLz*;d{5pX6#()IYOg_ythg=A}H*3cBHi8pSCm~Lz zdcN5CDTq_){40XfP-h^{K%Ip+3v~|ST)MV_;0LJl5a*#TKwN;j2yro8+eB~)Y64;+ zoqt1c8R`ne)pY)Q1lORhLtKZt0dWKBCd5ssTM)OPZbRILx&v_sY7*i{sJjq%)02Nd za1ZJ}#C@m-5D$=(16VVJ`XkQJU$H)jxRBp8+)lIh5aJ=0!u=SOKVy9e@y&cLm-}4E zFY|^!A$SD!7~-)F3X6V0@C529#MAV?KO=YsH3c!1&i{hoIaC3n05uIU4ON6FLd`(T zK)rx?0W}LT3pEEZ2lW!-CDbd3S5PI064X4zJk$cj0@Q1W*HC4MGSnN0H&70Q166^j zKvf~CP>T?YP;VjLLM=foK`lcpL%oA|2PGf`)O(2cP%98CP#+*ZKz)Sx2<1Y!P@f<^ zL3t1!R1KmARfnjX$Pt8u%wf$`!mmsX(SU4Vt!afm)^{UTgDhFIlJzcqOx=X2O#IDs zAAQuoR4vi5SJ%(I#IH}ab#vAWDUh)~*0(tAH)NvwSNED#b3|slckw)qz9#16{@%Ev zJuQoI9L0LnNDE%QDQBwX?9@S)MWL3Vu0~O%X||=CciM$$-Isb#clgqa^>BAqjsm~F zsmG0ATCRHSKo=rD(2?)!$!CmvQM{>#jHvH&HaS}w;jtgcR0Ute4zeUOm7SW2Vzm(I znW!C7YA2h+(j-&F9?e#(jul^$8rR*#@$YS7&v;XI3nzx>WiLs-MlsDdVtsnMY{}Qnj**V^0;MxojpmKb z{=>&RhaE|NEbDq&*4mBU{aGZk<;;#upCeagEc*=ZTZm=S>}l#ikv8faS->|P>Xv=;xMP{W(WiMtXGeu_o#IlD|U0Ukoj~QZ~Zj}6R-SV9W z+o{3QE;TqgrO4|rhZ#3tnJe}fH^!3an;!&TPslWW-IAe4~ep4rHc;&8$ed?$rY6$TYX9$F}~r?A>klE;^;u9;WhQrRYANa*A$oxlk-u zHAiqUTo0qKVdffVV$tMxs$o)ZiIAFVMLo?%8>e!cGx@&8ZA6kp2RLyU+eZ856 zl8cU8a~uU*d2PF>$!(`ZIJ|ROs)A{Xo#xH0l0{q3ao9>xz55<8zvlfU@xzAg@m3#q zTx)4}R@|_Wvh$1!_AS{WF}qm5l~s&`Mkl{tzeyr@YP+TL8Z~8oy%ueB+_cfk#;h%U z5NrrEhm{$@Dy%`XK9LNw?OX_RjWCMM3z8n-dYV&Sqbsn1GTxHxUdSDU1EqJELCLGh zzVDIL>8;hYa@^?lU|vU~kiWIoKO~$R46nMf z7d)M1D6-6YQOZV^i%qRtwt^-&?~ZMS+%z0(x|MM{oht?`8T-rD4eFXZm|R^o@%O)Q zh1)i2i#iR)MLs=kz~|Wb9Iw5MJ-FvEQ&a72?FUZ8+UHot#pKrQ=S=#{{Ab=eEj~HJ z@VR^3yiw`dH7|B9@*0!W)?4=A`Qd70JxM+XzLt}=WzDHp_uHKxb$7MH;hwgb6*0YC 
zm44*(mYCW04RTfD%buKhp1Qhf+V1}c*}0Ufaa%f%*m!Ol4XI0p&*s(UoHMn}kQ-)T zp5ed>7hYE@#j4G-`TQtZId_?YGgqA}OqC|g7(Gb1rWYTsls9!V3Rj-TZ7r88mFvn& zw2W7}WwGMAe&}o0z0umV${(qS%y;k;ZaT?v)A;ha|Ixqs@7gu+-IYFH_T-0i!vkaa zi5-V?GrNXz!@1GiST3I*Gk79*V!-^&Kg=H<7&m9eawEAxGh*4_FYO!{%^k>%=l%_y C#j3dg literal 0 HcmV?d00001 diff --git a/mechanize/_auth.py b/mechanize/_auth.py new file mode 100644 index 0000000..900e201 --- /dev/null +++ b/mechanize/_auth.py @@ -0,0 +1,68 @@ +"""HTTP Authentication and Proxy support. + + +Copyright 2006 John J. Lee + +This code is free software; you can redistribute it and/or modify it under +the terms of the BSD or ZPL 2.1 licenses (see the file COPYING.txt +included with the distribution). + +""" + +from _urllib2_fork import HTTPPasswordMgr + + +# TODO: stop deriving from HTTPPasswordMgr +class HTTPProxyPasswordMgr(HTTPPasswordMgr): + # has default realm and host/port + def add_password(self, realm, uri, user, passwd): + # uri could be a single URI or a sequence + if uri is None or isinstance(uri, basestring): + uris = [uri] + else: + uris = uri + passwd_by_domain = self.passwd.setdefault(realm, {}) + for uri in uris: + for default_port in True, False: + reduced_uri = self.reduce_uri(uri, default_port) + passwd_by_domain[reduced_uri] = (user, passwd) + + def find_user_password(self, realm, authuri): + attempts = [(realm, authuri), (None, authuri)] + # bleh, want default realm to take precedence over default + # URI/authority, hence this outer loop + for default_uri in False, True: + for realm, authuri in attempts: + authinfo_by_domain = self.passwd.get(realm, {}) + for default_port in True, False: + reduced_authuri = self.reduce_uri(authuri, default_port) + for uri, authinfo in authinfo_by_domain.iteritems(): + if uri is None and not default_uri: + continue + if self.is_suburi(uri, reduced_authuri): + return authinfo + user, password = None, None + + if user is not None: + break + return user, password + + def reduce_uri(self, uri, default_port=True): + if 
uri is None: + return None + return HTTPPasswordMgr.reduce_uri(self, uri, default_port) + + def is_suburi(self, base, test): + if base is None: + # default to the proxy's host/port + hostport, path = test + base = (hostport, "/") + return HTTPPasswordMgr.is_suburi(self, base, test) + + +class HTTPSClientCertMgr(HTTPPasswordMgr): + # implementation inheritance: this is not a proper subclass + def add_key_cert(self, uri, key_file, cert_file): + self.add_password(None, uri, key_file, cert_file) + def find_key_cert(self, authuri): + return HTTPPasswordMgr.find_user_password(self, None, authuri) diff --git a/mechanize/_auth.pyc b/mechanize/_auth.pyc new file mode 100644 index 0000000000000000000000000000000000000000..6c6e05f6a015c8d8d5cec1df2b3f234958a387f0 GIT binary patch literal 2857 zcmbtWTWcFv5T0FKEXy)6aU9x)mW4D#f!0o4NFgz~*e0n%>WY<%6JxVk?;creuXg1< zXR%adC^&g7{T+SmV_*9V`WyNa3Vm(AnYHB7%R{ix8O^!P%$e^yb5#0$w!Z$`=Pv^~ z|CI3i7)?Kg2#IzeEedUlE$Z2{W0Mt@NZHiEXqny+oe{m-DN$G%k5(t6WeTUL-K?>_ zibeG0)>g}TlAEqdOceMgN)yLVLZ_9bgQ27Iem~93V!d8pOZ&qtI_Mf_`R?6&&hxaJ zIL{ZI7fLxRhllawe!8Cy7K60+uwLKlM%oF|P&xSRWLTopjydr&^}rdXxfA$_lc_M$ zCX4oSgJp)DE~S~%OT(x$ydQdYj%@v{NDA#e-FH+s!ie z1Qm#Bv?|&fn*IS|gxy3(HPSc8Sll%>o!OM#X8pjT(+VY3)}f4u4oh@crjbPvD1gB# z4W7}cLhE~v=+vgcJu($CRqEInYSQVHY=X3DG$lJ~G%C@k%qvDPT%(oaUD`h0plt-w zmJDqQcMM0pnI_8Ap+q`LwDFUm=nG8-YS;5| zYZm)MvF(SUCr&|`t89g%Ap3^Zu&dUBHEYdVSFL~}=ScYW3Ce#!0bT$A8Apl>!}vsuW3QZA3UoEmqEg`mnAJ5a_IF0hCI0$NWS z>1j{r`%D&2tIGs(8Ck;?7$4>doAeF#^oTy7n`>hC_o{}c~izDuaaTT%O%byHkV;Tb73H> zVU>%h8SF{a_tCkC`Zlwc*;V2z0RDq)U=ZjJO8(bjgjtv%S}MX9Vxdd!0z{L2300(K zaIq4_pk@iO-BcT?5gbt8H{Ht|m_2@hrfU#5Fo3#ZHJP5`%kvVyr#w$;pXcHA%VTt1 z=7IYm3%Lu-^&)sEmjZy(!XQdKReQQIeXUWETfnbgdo7OeM_^551`l7r&iKQdR?+ka z5bA&4H#}vW=zz1vjnlk_CY!i)$e)*jbU^W}jK3`xkBVR+wPUd3CVj95guti{cE|p+0dkozvj4I{$R$}RuEj$xLsB@Z_FVb;(qB<8VUYj; literal 0 HcmV?d00001 diff --git a/mechanize/_beautifulsoup.py b/mechanize/_beautifulsoup.py new file mode 100644 index 0000000..0040140 --- /dev/null +++ 
b/mechanize/_beautifulsoup.py @@ -0,0 +1,1077 @@ +"""Beautiful Soup +Elixir and Tonic +"The Screen-Scraper's Friend" +v2.1.1 +http://www.crummy.com/software/BeautifulSoup/ + +Beautiful Soup parses arbitrarily invalid XML- or HTML-like substance +into a tree representation. It provides methods and Pythonic idioms +that make it easy to search and modify the tree. + +A well-formed XML/HTML document will yield a well-formed data +structure. An ill-formed XML/HTML document will yield a +correspondingly ill-formed data structure. If your document is only +locally well-formed, you can use this library to find and process the +well-formed part of it. The BeautifulSoup class has heuristics for +obtaining a sensible parse tree in the face of common HTML errors. + +Beautiful Soup has no external dependencies. It works with Python 2.2 +and up. + +Beautiful Soup defines classes for four different parsing strategies: + + * BeautifulStoneSoup, for parsing XML, SGML, or your domain-specific + language that kind of looks like XML. + + * BeautifulSoup, for parsing run-of-the-mill HTML code, be it valid + or invalid. + + * ICantBelieveItsBeautifulSoup, for parsing valid but bizarre HTML + that trips up BeautifulSoup. + + * BeautifulSOAP, for making it easier to parse XML documents that use + lots of subelements containing a single string, where you'd prefer + they put that string into an attribute (such as SOAP messages). + +You can subclass BeautifulStoneSoup or BeautifulSoup to create a +parsing strategy specific to an XML schema or a particular bizarre +HTML document. Typically your subclass would just override +SELF_CLOSING_TAGS and/or NESTABLE_TAGS. 
+""" #" +from __future__ import generators + +__author__ = "Leonard Richardson (leonardr@segfault.org)" +__version__ = "2.1.1" +__date__ = "$Date: 2004/10/18 00:14:20 $" +__copyright__ = "Copyright (c) 2004-2005 Leonard Richardson" +__license__ = "PSF" + +from _sgmllib_copy import SGMLParser, SGMLParseError +import types +import re +import _sgmllib_copy as sgmllib + +class NullType(object): + + """Similar to NoneType with a corresponding singleton instance + 'Null' that, unlike None, accepts any message and returns itself. + + Examples: + >>> Null("send", "a", "message")("and one more", + ... "and what you get still") is Null + True + """ + + def __new__(cls): return Null + def __call__(self, *args, **kwargs): return Null +## def __getstate__(self, *args): return Null + def __getattr__(self, attr): return Null + def __getitem__(self, item): return Null + def __setattr__(self, attr, value): pass + def __setitem__(self, item, value): pass + def __len__(self): return 0 + # FIXME: is this a python bug? otherwise ``for x in Null: pass`` + # never terminates... 
+ def __iter__(self): return iter([]) + def __contains__(self, item): return False + def __repr__(self): return "Null" +Null = object.__new__(NullType) + +class PageElement: + """Contains the navigational information for some part of the page + (either a tag or a piece of text)""" + + def setup(self, parent=Null, previous=Null): + """Sets up the initial relations between this element and + other elements.""" + self.parent = parent + self.previous = previous + self.next = Null + self.previousSibling = Null + self.nextSibling = Null + if self.parent and self.parent.contents: + self.previousSibling = self.parent.contents[-1] + self.previousSibling.nextSibling = self + + def findNext(self, name=None, attrs={}, text=None): + """Returns the first item that matches the given criteria and + appears after this Tag in the document.""" + return self._first(self.fetchNext, name, attrs, text) + firstNext = findNext + + def fetchNext(self, name=None, attrs={}, text=None, limit=None): + """Returns all items that match the given criteria and appear + before after Tag in the document.""" + return self._fetch(name, attrs, text, limit, self.nextGenerator) + + def findNextSibling(self, name=None, attrs={}, text=None): + """Returns the closest sibling to this Tag that matches the + given criteria and appears after this Tag in the document.""" + return self._first(self.fetchNextSiblings, name, attrs, text) + firstNextSibling = findNextSibling + + def fetchNextSiblings(self, name=None, attrs={}, text=None, limit=None): + """Returns the siblings of this Tag that match the given + criteria and appear after this Tag in the document.""" + return self._fetch(name, attrs, text, limit, self.nextSiblingGenerator) + + def findPrevious(self, name=None, attrs={}, text=None): + """Returns the first item that matches the given criteria and + appears before this Tag in the document.""" + return self._first(self.fetchPrevious, name, attrs, text) + + def fetchPrevious(self, name=None, attrs={}, 
text=None, limit=None): + """Returns all items that match the given criteria and appear + before this Tag in the document.""" + return self._fetch(name, attrs, text, limit, self.previousGenerator) + firstPrevious = findPrevious + + def findPreviousSibling(self, name=None, attrs={}, text=None): + """Returns the closest sibling to this Tag that matches the + given criteria and appears before this Tag in the document.""" + return self._first(self.fetchPreviousSiblings, name, attrs, text) + firstPreviousSibling = findPreviousSibling + + def fetchPreviousSiblings(self, name=None, attrs={}, text=None, + limit=None): + """Returns the siblings of this Tag that match the given + criteria and appear before this Tag in the document.""" + return self._fetch(name, attrs, text, limit, + self.previousSiblingGenerator) + + def findParent(self, name=None, attrs={}): + """Returns the closest parent of this Tag that matches the given + criteria.""" + r = Null + l = self.fetchParents(name, attrs, 1) + if l: + r = l[0] + return r + firstParent = findParent + + def fetchParents(self, name=None, attrs={}, limit=None): + """Returns the parents of this Tag that match the given + criteria.""" + return self._fetch(name, attrs, None, limit, self.parentGenerator) + + #These methods do the real heavy lifting. + + def _first(self, method, name, attrs, text): + r = Null + l = method(name, attrs, text, 1) + if l: + r = l[0] + return r + + def _fetch(self, name, attrs, text, limit, generator): + "Iterates over a generator looking for things that match." 
+ if not hasattr(attrs, 'items'): + attrs = {'class' : attrs} + + results = [] + g = generator() + while True: + try: + i = g.next() + except StopIteration: + break + found = None + if isinstance(i, Tag): + if not text: + if not name or self._matches(i, name): + match = True + for attr, matchAgainst in attrs.items(): + check = i.get(attr) + if not self._matches(check, matchAgainst): + match = False + break + if match: + found = i + elif text: + if self._matches(i, text): + found = i + if found: + results.append(found) + if limit and len(results) >= limit: + break + return results + + #Generators that can be used to navigate starting from both + #NavigableTexts and Tags. + def nextGenerator(self): + i = self + while i: + i = i.next + yield i + + def nextSiblingGenerator(self): + i = self + while i: + i = i.nextSibling + yield i + + def previousGenerator(self): + i = self + while i: + i = i.previous + yield i + + def previousSiblingGenerator(self): + i = self + while i: + i = i.previousSibling + yield i + + def parentGenerator(self): + i = self + while i: + i = i.parent + yield i + + def _matches(self, chunk, howToMatch): + #print 'looking for %s in %s' % (howToMatch, chunk) + # + # If given a list of items, return true if the list contains a + # text element that matches. + if isList(chunk) and not isinstance(chunk, Tag): + for tag in chunk: + if isinstance(tag, NavigableText) and self._matches(tag, howToMatch): + return True + return False + if callable(howToMatch): + return howToMatch(chunk) + if isinstance(chunk, Tag): + #Custom match methods take the tag as an argument, but all other + #ways of matching match the tag name as a string + chunk = chunk.name + #Now we know that chunk is a string + if not isinstance(chunk, basestring): + chunk = str(chunk) + if hasattr(howToMatch, 'match'): + # It's a regexp object. 
+ return howToMatch.search(chunk) + if isList(howToMatch): + return chunk in howToMatch + if hasattr(howToMatch, 'items'): + return howToMatch.has_key(chunk) + #It's just a string + return str(howToMatch) == chunk + +class NavigableText(PageElement): + + def __getattr__(self, attr): + "For backwards compatibility, text.string gives you text" + if attr == 'string': + return self + else: + raise AttributeError, "'%s' object has no attribute '%s'" % (self.__class__.__name__, attr) + +class NavigableString(str, NavigableText): + pass + +class NavigableUnicodeString(unicode, NavigableText): + pass + +class Tag(PageElement): + + """Represents a found HTML tag with its attributes and contents.""" + + def __init__(self, name, attrs=None, parent=Null, previous=Null): + "Basic constructor." + self.name = name + if attrs == None: + attrs = [] + self.attrs = attrs + self.contents = [] + self.setup(parent, previous) + self.hidden = False + + def get(self, key, default=None): + """Returns the value of the 'key' attribute for the tag, or + the value given for 'default' if it doesn't have that + attribute.""" + return self._getAttrMap().get(key, default) + + def __getitem__(self, key): + """tag[key] returns the value of the 'key' attribute for the tag, + and throws an exception if it's not there.""" + return self._getAttrMap()[key] + + def __iter__(self): + "Iterating over a tag iterates over its contents." + return iter(self.contents) + + def __len__(self): + "The length of a tag is the length of its list of contents." + return len(self.contents) + + def __contains__(self, x): + return x in self.contents + + def __nonzero__(self): + "A tag is non-None even if it has no contents." 
+ return True + + def __setitem__(self, key, value): + """Setting tag[key] sets the value of the 'key' attribute for the + tag.""" + self._getAttrMap() + self.attrMap[key] = value + found = False + for i in range(0, len(self.attrs)): + if self.attrs[i][0] == key: + self.attrs[i] = (key, value) + found = True + if not found: + self.attrs.append((key, value)) + self._getAttrMap()[key] = value + + def __delitem__(self, key): + "Deleting tag[key] deletes all 'key' attributes for the tag." + for item in self.attrs: + if item[0] == key: + self.attrs.remove(item) + #We don't break because bad HTML can define the same + #attribute multiple times. + self._getAttrMap() + if self.attrMap.has_key(key): + del self.attrMap[key] + + def __call__(self, *args, **kwargs): + """Calling a tag like a function is the same as calling its + fetch() method. Eg. tag('a') returns a list of all the A tags + found within this tag.""" + return apply(self.fetch, args, kwargs) + + def __getattr__(self, tag): + if len(tag) > 3 and tag.rfind('Tag') == len(tag)-3: + return self.first(tag[:-3]) + elif tag.find('__') != 0: + return self.first(tag) + + def __eq__(self, other): + """Returns true iff this tag has the same name, the same attributes, + and the same contents (recursively) as the given tag. + + NOTE: right now this will return false if two tags have the + same attributes in a different order. 
Should this be fixed?""" + if not hasattr(other, 'name') or not hasattr(other, 'attrs') or not hasattr(other, 'contents') or self.name != other.name or self.attrs != other.attrs or len(self) != len(other): + return False + for i in range(0, len(self.contents)): + if self.contents[i] != other.contents[i]: + return False + return True + + def __ne__(self, other): + """Returns true iff this tag is not identical to the other tag, + as defined in __eq__.""" + return not self == other + + def __repr__(self): + """Renders this tag as a string.""" + return str(self) + + def __unicode__(self): + return self.__str__(1) + + def __str__(self, needUnicode=None, showStructureIndent=None): + """Returns a string or Unicode representation of this tag and + its contents. + + NOTE: since Python's HTML parser consumes whitespace, this + method is not certain to reproduce the whitespace present in + the original string.""" + + attrs = [] + if self.attrs: + for key, val in self.attrs: + attrs.append('%s="%s"' % (key, val)) + close = '' + closeTag = '' + if self.isSelfClosing(): + close = ' /' + else: + closeTag = '' % self.name + indentIncrement = None + if showStructureIndent != None: + indentIncrement = showStructureIndent + if not self.hidden: + indentIncrement += 1 + contents = self.renderContents(indentIncrement, needUnicode=needUnicode) + if showStructureIndent: + space = '\n%s' % (' ' * showStructureIndent) + if self.hidden: + s = contents + else: + s = [] + attributeString = '' + if attrs: + attributeString = ' ' + ' '.join(attrs) + if showStructureIndent: + s.append(space) + s.append('<%s%s%s>' % (self.name, attributeString, close)) + s.append(contents) + if closeTag and showStructureIndent != None: + s.append(space) + s.append(closeTag) + s = ''.join(s) + isUnicode = type(s) == types.UnicodeType + if needUnicode and not isUnicode: + s = unicode(s) + elif isUnicode and needUnicode==False: + s = str(s) + return s + + def prettify(self, needUnicode=None): + return 
self.__str__(needUnicode, showStructureIndent=True) + + def renderContents(self, showStructureIndent=None, needUnicode=None): + """Renders the contents of this tag as a (possibly Unicode) + string.""" + s=[] + for c in self: + text = None + if isinstance(c, NavigableUnicodeString) or type(c) == types.UnicodeType: + text = unicode(c) + elif isinstance(c, Tag): + s.append(c.__str__(needUnicode, showStructureIndent)) + elif needUnicode: + text = unicode(c) + else: + text = str(c) + if text: + if showStructureIndent != None: + if text[-1] == '\n': + text = text[:-1] + s.append(text) + return ''.join(s) + + #Soup methods + + def firstText(self, text, recursive=True): + """Convenience method to retrieve the first piece of text matching the + given criteria. 'text' can be a string, a regular expression object, + a callable that takes a string and returns whether or not the + string 'matches', etc.""" + return self.first(recursive=recursive, text=text) + + def fetchText(self, text, recursive=True, limit=None): + """Convenience method to retrieve all pieces of text matching the + given criteria. 'text' can be a string, a regular expression object, + a callable that takes a string and returns whether or not the + string 'matches', etc.""" + return self.fetch(recursive=recursive, text=text, limit=limit) + + def first(self, name=None, attrs={}, recursive=True, text=None): + """Return only the first child of this + Tag matching the given criteria.""" + r = Null + l = self.fetch(name, attrs, recursive, text, 1) + if l: + r = l[0] + return r + findChild = first + + def fetch(self, name=None, attrs={}, recursive=True, text=None, + limit=None): + """Extracts a list of Tag objects that match the given + criteria. You can specify the name of the Tag and any + attributes you want the Tag to have. 
+ + The value of a key-value pair in the 'attrs' map can be a + string, a list of strings, a regular expression object, or a + callable that takes a string and returns whether or not the + string matches for some custom definition of 'matches'. The + same is true of the tag name.""" + generator = self.recursiveChildGenerator + if not recursive: + generator = self.childGenerator + return self._fetch(name, attrs, text, limit, generator) + fetchChildren = fetch + + #Utility methods + + def isSelfClosing(self): + """Returns true iff this is a self-closing tag as defined in the HTML + standard. + + TODO: This is specific to BeautifulSoup and its subclasses, but it's + used by __str__""" + return self.name in BeautifulSoup.SELF_CLOSING_TAGS + + def append(self, tag): + """Appends the given tag to the contents of this tag.""" + self.contents.append(tag) + + #Private methods + + def _getAttrMap(self): + """Initializes a map representation of this tag's attributes, + if not already initialized.""" + if not getattr(self, 'attrMap'): + self.attrMap = {} + for (key, value) in self.attrs: + self.attrMap[key] = value + return self.attrMap + + #Generator methods + def childGenerator(self): + for i in range(0, len(self.contents)): + yield self.contents[i] + raise StopIteration + + def recursiveChildGenerator(self): + stack = [(self, 0)] + while stack: + tag, start = stack.pop() + if isinstance(tag, Tag): + for i in range(start, len(tag.contents)): + a = tag.contents[i] + yield a + if isinstance(a, Tag) and tag.contents: + if i < len(tag.contents) - 1: + stack.append((tag, i+1)) + stack.append((a, 0)) + break + raise StopIteration + + +def isList(l): + """Convenience method that works with all 2.x versions of Python + to determine whether or not something is listlike.""" + return hasattr(l, '__iter__') \ + or (type(l) in (types.ListType, types.TupleType)) + +def buildTagMap(default, *args): + """Turns a list of maps, lists, or scalars into a single map. 
+ Used to build the SELF_CLOSING_TAGS and NESTABLE_TAGS maps out + of lists and partial maps.""" + built = {} + for portion in args: + if hasattr(portion, 'items'): + #It's a map. Merge it. + for k,v in portion.items(): + built[k] = v + elif isList(portion): + #It's a list. Map each item to the default. + for k in portion: + built[k] = default + else: + #It's a scalar. Map it to the default. + built[portion] = default + return built + +class BeautifulStoneSoup(Tag, SGMLParser): + + """This class contains the basic parser and fetch code. It defines + a parser that knows nothing about tag behavior except for the + following: + + You can't close a tag without closing all the tags it encloses. + That is, "" actually means + "". + + [Another possible explanation is "", but since + this class defines no SELF_CLOSING_TAGS, it will never use that + explanation.] + + This class is useful for parsing XML or made-up markup languages, + or when BeautifulSoup makes an assumption counter to what you were + expecting.""" + + SELF_CLOSING_TAGS = {} + NESTABLE_TAGS = {} + RESET_NESTING_TAGS = {} + QUOTE_TAGS = {} + + #As a public service we will by default silently replace MS smart quotes + #and similar characters with their HTML or ASCII equivalents. 
+ MS_CHARS = { '\x80' : '€', + '\x81' : ' ', + '\x82' : '‚', + '\x83' : 'ƒ', + '\x84' : '„', + '\x85' : '…', + '\x86' : '†', + '\x87' : '‡', + '\x88' : '⁁', + '\x89' : '%', + '\x8A' : 'Š', + '\x8B' : '<', + '\x8C' : 'Œ', + '\x8D' : '?', + '\x8E' : 'Z', + '\x8F' : '?', + '\x90' : '?', + '\x91' : '‘', + '\x92' : '’', + '\x93' : '“', + '\x94' : '”', + '\x95' : '•', + '\x96' : '–', + '\x97' : '—', + '\x98' : '˜', + '\x99' : '™', + '\x9a' : 'š', + '\x9b' : '>', + '\x9c' : 'œ', + '\x9d' : '?', + '\x9e' : 'z', + '\x9f' : 'Ÿ',} + + PARSER_MASSAGE = [(re.compile('(<[^<>]*)/>'), + lambda(x):x.group(1) + ' />'), + (re.compile(']*)>'), + lambda(x):''), + (re.compile("([\x80-\x9f])"), + lambda(x): BeautifulStoneSoup.MS_CHARS.get(x.group(1))) + ] + + ROOT_TAG_NAME = '[document]' + + def __init__(self, text=None, avoidParserProblems=True, + initialTextIsEverything=True): + """Initialize this as the 'root tag' and feed in any text to + the parser. + + NOTE about avoidParserProblems: sgmllib will process most bad + HTML, and BeautifulSoup has tricks for dealing with some HTML + that kills sgmllib, but Beautiful Soup can nonetheless choke + or lose data if your data uses self-closing tags or + declarations incorrectly. By default, Beautiful Soup sanitizes + its input to avoid the vast majority of these problems. The + problems are relatively rare, even in bad HTML, so feel free + to pass in False to avoidParserProblems if they don't apply to + you, and you'll get better performance. The only reason I have + this turned on by default is so I don't get so many tech + support questions. + + The two most common instances of invalid HTML that will choke + sgmllib are fixed by the default parser massage techniques: + +
(No space between name of closing tag and tag close) + (Extraneous whitespace in declaration) + + You can pass in a custom list of (RE object, replace method) + tuples to get Beautiful Soup to scrub your input the way you + want.""" + Tag.__init__(self, self.ROOT_TAG_NAME) + if avoidParserProblems \ + and not isList(avoidParserProblems): + avoidParserProblems = self.PARSER_MASSAGE + self.avoidParserProblems = avoidParserProblems + SGMLParser.__init__(self) + self.quoteStack = [] + self.hidden = 1 + self.reset() + if hasattr(text, 'read'): + #It's a file-type object. + text = text.read() + if text: + self.feed(text) + if initialTextIsEverything: + self.done() + + def __getattr__(self, methodName): + """This method routes method call requests to either the SGMLParser + superclass or the Tag superclass, depending on the method name.""" + if methodName.find('start_') == 0 or methodName.find('end_') == 0 \ + or methodName.find('do_') == 0: + return SGMLParser.__getattr__(self, methodName) + elif methodName.find('__') != 0: + return Tag.__getattr__(self, methodName) + else: + raise AttributeError + + def feed(self, text): + if self.avoidParserProblems: + for fix, m in self.avoidParserProblems: + text = fix.sub(m, text) + SGMLParser.feed(self, text) + + def done(self): + """Called when you're done parsing, so that the unclosed tags can be + correctly processed.""" + self.endData() #NEW + while self.currentTag.name != self.ROOT_TAG_NAME: + self.popTag() + + def reset(self): + SGMLParser.reset(self) + self.currentData = [] + self.currentTag = None + self.tagStack = [] + self.pushTag(self) + + def popTag(self): + tag = self.tagStack.pop() + # Tags with just one string-owning child get the child as a + # 'string' property, so that soup.tag.string is shorthand for + # soup.tag.contents[0] + if len(self.currentTag.contents) == 1 and \ + isinstance(self.currentTag.contents[0], NavigableText): + self.currentTag.string = self.currentTag.contents[0] + + #print "Pop", tag.name + if 
self.tagStack: + self.currentTag = self.tagStack[-1] + return self.currentTag + + def pushTag(self, tag): + #print "Push", tag.name + if self.currentTag: + self.currentTag.append(tag) + self.tagStack.append(tag) + self.currentTag = self.tagStack[-1] + + def endData(self): + currentData = ''.join(self.currentData) + if currentData: + if not currentData.strip(): + if '\n' in currentData: + currentData = '\n' + else: + currentData = ' ' + c = NavigableString + if type(currentData) == types.UnicodeType: + c = NavigableUnicodeString + o = c(currentData) + o.setup(self.currentTag, self.previous) + if self.previous: + self.previous.next = o + self.previous = o + self.currentTag.contents.append(o) + self.currentData = [] + + def _popToTag(self, name, inclusivePop=True): + """Pops the tag stack up to and including the most recent + instance of the given tag. If inclusivePop is false, pops the tag + stack up to but *not* including the most recent instqance of + the given tag.""" + if name == self.ROOT_TAG_NAME: + return + + numPops = 0 + mostRecentTag = None + for i in range(len(self.tagStack)-1, 0, -1): + if name == self.tagStack[i].name: + numPops = len(self.tagStack)-i + break + if not inclusivePop: + numPops = numPops - 1 + + for i in range(0, numPops): + mostRecentTag = self.popTag() + return mostRecentTag + + def _smartPop(self, name): + + """We need to pop up to the previous tag of this type, unless + one of this tag's nesting reset triggers comes between this + tag and the previous tag of this type, OR unless this tag is a + generic nesting trigger and another generic nesting trigger + comes between this tag and the previous tag of this type. + + Examples: +

FooBar

should pop to 'p', not 'b'. +

FooBar

should pop to 'table', not 'p'. +

Foo

Bar

should pop to 'tr', not 'p'. +

FooBar

should pop to 'p', not 'b'. + +

    • *
    • * should pop to 'ul', not the first 'li'. +
  • ** should pop to 'table', not the first 'tr' + tag should + implicitly close the previous tag within the same
    ** should pop to 'tr', not the first 'td' + """ + + nestingResetTriggers = self.NESTABLE_TAGS.get(name) + isNestable = nestingResetTriggers != None + isResetNesting = self.RESET_NESTING_TAGS.has_key(name) + popTo = None + inclusive = True + for i in range(len(self.tagStack)-1, 0, -1): + p = self.tagStack[i] + if (not p or p.name == name) and not isNestable: + #Non-nestable tags get popped to the top or to their + #last occurance. + popTo = name + break + if (nestingResetTriggers != None + and p.name in nestingResetTriggers) \ + or (nestingResetTriggers == None and isResetNesting + and self.RESET_NESTING_TAGS.has_key(p.name)): + + #If we encounter one of the nesting reset triggers + #peculiar to this tag, or we encounter another tag + #that causes nesting to reset, pop up to but not + #including that tag. + + popTo = p.name + inclusive = False + break + p = p.parent + if popTo: + self._popToTag(popTo, inclusive) + + def unknown_starttag(self, name, attrs, selfClosing=0): + #print "Start tag %s" % name + if self.quoteStack: + #This is not a real tag. + #print "<%s> is not real!" % name + attrs = ''.join(map(lambda(x, y): ' %s="%s"' % (x, y), attrs)) + self.handle_data('<%s%s>' % (name, attrs)) + return + self.endData() + if not name in self.SELF_CLOSING_TAGS and not selfClosing: + self._smartPop(name) + tag = Tag(name, attrs, self.currentTag, self.previous) + if self.previous: + self.previous.next = tag + self.previous = tag + self.pushTag(tag) + if selfClosing or name in self.SELF_CLOSING_TAGS: + self.popTag() + if name in self.QUOTE_TAGS: + #print "Beginning quote (%s)" % name + self.quoteStack.append(name) + self.literal = 1 + + def unknown_endtag(self, name): + if self.quoteStack and self.quoteStack[-1] != name: + #This is not a real end tag. + #print " is not real!" 
% name + self.handle_data('' % name) + return + self.endData() + self._popToTag(name) + if self.quoteStack and self.quoteStack[-1] == name: + self.quoteStack.pop() + self.literal = (len(self.quoteStack) > 0) + + def handle_data(self, data): + self.currentData.append(data) + + def handle_pi(self, text): + "Propagate processing instructions right through." + self.handle_data("" % text) + + def handle_comment(self, text): + "Propagate comments right through." + self.handle_data("" % text) + + def handle_charref(self, ref): + "Propagate char refs right through." + self.handle_data('&#%s;' % ref) + + def handle_entityref(self, ref): + "Propagate entity refs right through." + self.handle_data('&%s;' % ref) + + def handle_decl(self, data): + "Propagate DOCTYPEs and the like right through." + self.handle_data('' % data) + + def parse_declaration(self, i): + """Treat a bogus SGML declaration as raw data. Treat a CDATA + declaration as regular data.""" + j = None + if self.rawdata[i:i+9] == '', i) + if k == -1: + k = len(self.rawdata) + self.handle_data(self.rawdata[i+9:k]) + j = k+3 + else: + try: + j = SGMLParser.parse_declaration(self, i) + except SGMLParseError: + toHandle = self.rawdata[i:] + self.handle_data(toHandle) + j = i + len(toHandle) + return j + +class BeautifulSoup(BeautifulStoneSoup): + + """This parser knows the following facts about HTML: + + * Some tags have no closing tag and should be interpreted as being + closed as soon as they are encountered. + + * The text inside some tags (ie. 'script') may contain tags which + are not really part of the document and which should be parsed + as text, not tags. If you want to parse the text as tags, you can + always fetch it and parse it explicitly. + + * Tag nesting rules: + + Most tags can't be nested at all. For instance, the occurance of + a

    tag should implicitly close the previous

    tag. + +

    Para1

    Para2 + should be transformed into: +

    Para1

    Para2 + + Some tags can be nested arbitrarily. For instance, the occurance + of a

    tag should _not_ implicitly close the previous +
    tag. + + Alice said:
    Bob said:
    Blah + should NOT be transformed into: + Alice said:
    Bob said:
    Blah + + Some tags can be nested, but the nesting is reset by the + interposition of other tags. For instance, a
    , + but not close a tag in another table. + +
    BlahBlah + should be transformed into: +
    BlahBlah + but, + Blah
    Blah + should NOT be transformed into + Blah
    Blah + + Differing assumptions about tag nesting rules are a major source + of problems with the BeautifulSoup class. If BeautifulSoup is not + treating as nestable a tag your page author treats as nestable, + try ICantBelieveItsBeautifulSoup before writing your own + subclass.""" + + SELF_CLOSING_TAGS = buildTagMap(None, ['br' , 'hr', 'input', 'img', 'meta', + 'spacer', 'link', 'frame', 'base']) + + QUOTE_TAGS = {'script': None} + + #According to the HTML standard, each of these inline tags can + #contain another tag of the same type. Furthermore, it's common + #to actually use these tags this way. + NESTABLE_INLINE_TAGS = ['span', 'font', 'q', 'object', 'bdo', 'sub', 'sup', + 'center'] + + #According to the HTML standard, these block tags can contain + #another tag of the same type. Furthermore, it's common + #to actually use these tags this way. + NESTABLE_BLOCK_TAGS = ['blockquote', 'div', 'fieldset', 'ins', 'del'] + + #Lists can contain other lists, but there are restrictions. + NESTABLE_LIST_TAGS = { 'ol' : [], + 'ul' : [], + 'li' : ['ul', 'ol'], + 'dl' : [], + 'dd' : ['dl'], + 'dt' : ['dl'] } + + #Tables can contain other tables, but there are restrictions. + NESTABLE_TABLE_TAGS = {'table' : [], + 'tr' : ['table', 'tbody', 'tfoot', 'thead'], + 'td' : ['tr'], + 'th' : ['tr'], + } + + NON_NESTABLE_BLOCK_TAGS = ['address', 'form', 'p', 'pre'] + + #If one of these tags is encountered, all tags up to the next tag of + #this type are popped. + RESET_NESTING_TAGS = buildTagMap(None, NESTABLE_BLOCK_TAGS, 'noscript', + NON_NESTABLE_BLOCK_TAGS, + NESTABLE_LIST_TAGS, + NESTABLE_TABLE_TAGS) + + NESTABLE_TAGS = buildTagMap([], NESTABLE_INLINE_TAGS, NESTABLE_BLOCK_TAGS, + NESTABLE_LIST_TAGS, NESTABLE_TABLE_TAGS) + +class ICantBelieveItsBeautifulSoup(BeautifulSoup): + + """The BeautifulSoup class is oriented towards skipping over + common HTML errors like unclosed tags. However, sometimes it makes + errors of its own. 
For instance, consider this fragment: + + FooBar + + This is perfectly valid (if bizarre) HTML. However, the + BeautifulSoup class will implicitly close the first b tag when it + encounters the second 'b'. It will think the author wrote + "FooBar", and didn't close the first 'b' tag, because + there's no real-world reason to bold something that's already + bold. When it encounters '' it will close two more 'b' + tags, for a grand total of three tags closed instead of two. This + can throw off the rest of your document structure. The same is + true of a number of other tags, listed below. + + It's much more common for someone to forget to close (eg.) a 'b' + tag than to actually use nested 'b' tags, and the BeautifulSoup + class handles the common case. This class handles the + not-co-common case: where you can't believe someone wrote what + they did, but it's valid HTML and BeautifulSoup screwed up by + assuming it wouldn't be. + + If this doesn't do what you need, try subclassing this class or + BeautifulSoup, and providing your own list of NESTABLE_TAGS.""" + + I_CANT_BELIEVE_THEYRE_NESTABLE_INLINE_TAGS = \ + ['em', 'big', 'i', 'small', 'tt', 'abbr', 'acronym', 'strong', + 'cite', 'code', 'dfn', 'kbd', 'samp', 'strong', 'var', 'b', + 'big'] + + I_CANT_BELIEVE_THEYRE_NESTABLE_BLOCK_TAGS = ['noscript'] + + NESTABLE_TAGS = buildTagMap([], BeautifulSoup.NESTABLE_TAGS, + I_CANT_BELIEVE_THEYRE_NESTABLE_BLOCK_TAGS, + I_CANT_BELIEVE_THEYRE_NESTABLE_INLINE_TAGS) + +class BeautifulSOAP(BeautifulStoneSoup): + """This class will push a tag with only a single string child into + the tag's parent as an attribute. The attribute's name is the tag + name, and the value is the string child. An example should give + the flavor of the change: + + baz + => + baz + + You can then access fooTag['bar'] instead of fooTag.barTag.string. + + This is, of course, useful for scraping structures that tend to + use subelements instead of attributes, such as SOAP messages. 
Note + that it modifies its input, so don't print the modified version + out. + + I'm not sure how many people really want to use this class; let me + know if you do. Mainly I like the name.""" + + def popTag(self): + if len(self.tagStack) > 1: + tag = self.tagStack[-1] + parent = self.tagStack[-2] + parent._getAttrMap() + if (isinstance(tag, Tag) and len(tag.contents) == 1 and + isinstance(tag.contents[0], NavigableText) and + not parent.attrMap.has_key(tag.name)): + parent[tag.name] = tag.contents[0] + BeautifulStoneSoup.popTag(self) + +#Enterprise class names! It has come to our attention that some people +#think the names of the Beautiful Soup parser classes are too silly +#and "unprofessional" for use in enterprise screen-scraping. We feel +#your pain! For such-minded folk, the Beautiful Soup Consortium And +#All-Night Kosher Bakery recommends renaming this file to +#"RobustParser.py" (or, in cases of extreme enterprisitude, +#"RobustParserBeanInterface.class") and using the following +#enterprise-friendly class aliases: +class RobustXMLParser(BeautifulStoneSoup): + pass +class RobustHTMLParser(BeautifulSoup): + pass +class RobustWackAssHTMLParser(ICantBelieveItsBeautifulSoup): + pass +class SimplifyingSOAPParser(BeautifulSOAP): + pass + +### + + +#By default, act as an HTML pretty-printer. +if __name__ == '__main__': + import sys + soup = BeautifulStoneSoup(sys.stdin.read()) + print soup.prettify() diff --git a/mechanize/_beautifulsoup.pyc b/mechanize/_beautifulsoup.pyc new file mode 100644 index 0000000000000000000000000000000000000000..e5095b8becf930b545cac986439514be941c1546 GIT binary patch literal 42887 zcmdU&e{dbwb>H^^AV`3qNRi?%NtTwRD1bBxO0rDbGEIrUBr&EydI4J&CHm#z?E-k@ z!+XH)14$@SW1C77r}^RhcG{$Aqck&VleB4*$uynjhdZ4~+$K$?<2X&yHkr({oo1$! 
z8Bg+88~5`)XLsKNf5?jXl(Z!FzPoqt-XG_ld(OG%oO^fhtD8no|H=D4Sr77mgZw_j zm2DUZf;b2+5>|qEAZS*C#ev}BKv0PXgNuVfJQQ3U3gUIa#dSeE99$d@;`PDB^_F`> z(A*F#jszD+g2mC`;wbM>#z@fI7+lj;0;;=~FQ>H!~#CwYp_LZq&5I+(m@3C_Cm#KS# z_|YJF$Wq74R9@c~#1p}lfgl|k$cN&Q;35S)5G0QTm-h#k9~Ha>;_)CE4=zvW_POU< zfO;gDo;Vp%I*oQ~GCbP}m(uptMohVjNoS!QQ$BsVuu7b6 zg^jq;Ud%>23$;$TSfkuVCroPDYDk$`QcLR#mVdDwH|9tL2C8Rrbo4}cEon9n&$ZLV z#M*a6Z42XeePxmQ!)uLZGhA&X&6u|JW{qo|+Gy5ESL&UWG?@%fv_i7Jy@E#T?Uep4 zwOetcH81#d-&DC_Pq}C3!qxUl+HFxI3)`*c>S(iFuQhqMr%{KLHLTZK;Yvmi7s%Rd z%rXWBhq(q26Vw1@J;^eGVYKIM4O}N|&jGv1P-s$)tHwF3H*37JP~%Eg(ni*4)Uyz{ zjkaeywML7M03pUKYs@wiM*<(rM#}&`SF0x~9aLIuw+scsBu(3C#xRssISQ$kRy$0t zcapSKYld;M1b!y1dLzjU8?Lp}D;a~+S;%o9d}8v6QFU%*>DJ=ngua71)&+iPHJ5$d6PBKqDPmPX-2YX=9X}1yq;*h;wTX z=9qF}7&dFI`IXwd#smbtqJaPs&32nk*fgLRn!TpJTdGP|T8G8)#Pj^D}jA$cYK=8u5`lL#;Y}u)M`PbJ0SR+2t(3LGzS^C zHk6=5F-;i=2W@H19}2H6Fn|L5xDY#GIK8#WYPdu_r#)0+54sA%fU^vsEq=gwH-CbT;5TF1PBc9&eI`rh=V~j>&SX2CKai~>yD^XKL1K@b1_)1uPdxV6lSdwZ z?8xIE3?F;!smGsu>WRm~N37&i?WNVUF~86WC+Y_*-(fPpKfJZZ&LHJon0|HwamO2} z*>*uZChgoqT6ud$#AK?ofwZZWW)pBFb>j9#D6Y1kqgBuVM5ks#h+kx_ABr64?wT4XGUrYAP4#sTV{ZZKmRS%!WJ zGNPtA@v4E_J**%U4QeJ}7|5FQMdn^Ot`3bGG9L<8T1GNd(4nwauO~}Pky>lDz+VkD z92Bg>wZ)~TNSp4CA3q+d+=;zV?Rf8@aBodPJ`469nAod=s3lx% zr^((!_R{3!q}~n5%)@J%xL{*ApLCd!P^G;GLb#5Kwd^zLip}bidPmB(6rTfHF>2v zTZloK=+n|_XFYFLtF7c(wR(bR#uZqbgkGxcJlE~o6bZ8@l&hoG37QtErSn+_fh)Lz zOf=u0MhbPSRn75g)v%TOpf1GMrB>bva^6biid2ZV5Hi;n*a?lRRfbrEvsyjPt;6q~ zwye=f7W>;Wl~}EwDO;xTDn_=1S=_}o9{ZpgxS}y`I~r*C-%8TQrey`to~3XH+F!A0 zB5~0cg8sJ30rp(kDj`>KbpcjkdTNlT8U%R@l*^#jmj?N~Y$mO0^?3^J&P!FJs;x2N zP0)&YTLIldG3ZNb^rL^x+CYWXstT@FKT7=VClxmk<>S;qp*c zt)45}Rt%ngZ!>^SAb1wME0EHHty+DdXwAeXV#bM~xwXZlS~c;aT4e#Y($uYwJIkzU zHBvkhk%{zLv{}Ku3brcPreM2*oeFjlbU|sccg_|fk@TZn*$#r?k>P=n%7`xiKd9f1 zT6O`s@QjP2b#CpS=*3B{>=|I5!2AoDA{h+$lVpKD27%(ffayah5d+oCaFC#mL7FJj zd4$k`z6QynOho!1^N92qC}_IrWJVq#GqOgR8Nr0isB6j;vLU39ZBvk-$+@TJ5i$#z z(cEkfE^g^b=M|(FUO|iF2_ypu&ljk3r$aN?eUO@p>y)ZqAb713 
zyvF<@GGLKP(DBG%5g95X>y+Vg5TtP)xjeik&gkz}SwniPtV2O{UC;s#2B4{`Ih!Jw zPLSQSa-j7ra5@dTnI=u^W5%+sa}7lePm!te)sUn+>!XCQw)L`DCrz3(KQ*EGWTFXc z5Y|sN4_O*rZM0W1qXQ-$w17iLbFFwZ&B7h=)9_sni~9}a5yjC`%v$a=?L8z%4-s@{ z<2#>-Mm1rUD@!w!n2Fy7Hv4=gZm(>s)HPRiY2Mk8?Iz-*ppdcSkMSA|ZVUlq5yFfS z3Zh&T7L4Z_Da#}=KNF_W6?IsHnJOflZ(Ie|^;A^9QCkD&+R_qAB&6&)p80UhfLOU^ zqfi!2`Wi(-VbvT{}6Vy852SUZ?igoFPTJQMxsAA&^1TuqUlVw(402uA3$T_Y8aPI{7zDaHC!M%F3 zjb4;>lFu_O25n*s5~il;LB3iouYh_IEZ3mkUk?G)vZWUV^IAXC z1K4+niRn$S?GF)rdk{6m)Do;f%)DEFo%s1as{dAjQ6w-J+FCJTUcgj>i8W6UL0|Ct zpb0mHWQa_DV=#Cf`DQ3cClw-$?3PpmL%iu<%B;}yXHqlx=xJk6>=*85+W=k|4qwYiIFdUzAO`p0gi z6!-#HK?F58T36|J4T`Z4?2(*>Vne~}!@+AT2SG862SPDh`MoB2dXV?0dhkI5?7A71 z?yCcV-mTm>NoLUpl;?v4{Xhoe%s)n9jsgsSjsmLx9D+5HuOJC=Rk&~3dQbFar=4JiKg(ctCjQbWO&bxOIg ze6F(mc);I_Jef8jDARbJWnGEMjQN5EFRWot$qjL4BxY4^3J(ZRYrc2kHEGKpEoU=} znW=KhwQs^F?C4ru+Tv2CnCM{=ZB;ehX)k%DI9NnaDOrS|fdR_xPG)^!Y2sSEd}$JW zL>Yy{SOX&I%uLUqbzUpBbTqjW!kwEZrV!R%+0?wKACR|xEgnUT493meuOO!Sm0 zEVSxfOE0o%w7&c|#U}X#BaUHW{GuLjr zHdnUtyMar;TPtIPn<~318!Kj75@FQ*u5)Gk3BaS*E5Y?$!s0Ykc-c~je4?^E61*69 zEfhu=G+aGu%Y%2WZHeAX8Nb1mZBS{nT56U)wRsOEz8h^mLb?Be>)mLxsj-YT z3m{S7;4C@h%kNT~L&|+O;QW5d{eyR@&C(gZ8*TnD<^J(4Z5Ex9kRX0`N>2xwRN`as z5GLR!#6yUq4gMHNCtqco7mf4INWD zr8X7u8m4>8Prp^Ea4IMOOe22;yMUekBN1PJCw~ejDVS~gYl>fnmsqo>oe`_5uPBLU zSJVK1!`t>%djn>dzd_;FIPP&V%!QEX1wdj1-3Yc3@ak5ktS znz-n+o;{<$$iL=`4AwQCo3vtVWVR+vmmGAADG-gGCZpBi?s!J{VNca7$!b>@kQ4Kj z))gyep?z(p{eo1qR!pZh-xunqF8p_{>=A-s-Gk8VJy7u>==W%4^WYd%pZ^1!{Jyej zU~0mY-O@Kl_bYfnff!S?Pl0$vbb!DWzS1#E^J-eeXj17?Xq&c0>G_(;R65b)dRI$s zgSF);v=-5mO1`gnwMZ^r*E%Trkg{wnvzUk+Nq9Bs&a_rW?IV%TNc1Su1dlXhY%IGF zwb)%Sl?N&E0#~MH4-X6vY#!S>G(50pJ zalGTR%rC9^^lB!#`dPfAW@~l1bKqTym*NsunAt|N(OEqdN;5y1yOBvHo0->(QXQua z*RwqoGrm6?58JbslX}OTC2?cy`kU#YuW!tC=>)QwIii{Ck|7VqUHMsJ6&1Gi&`QZY z-LblhVUj9t80cjpi^0HHWoW=xTwU~5mTp~lix(QzAyuf68XmB|gU{DST7eyh=x8t!&=MaW2Ng@^8MCg3y+3D~&prP}4}BGNPkR-H?Y{^)P}!1W@!A#S(>`18ZnVRv>4u5f zJ>6_OfZoEgY})}aDCg#2Y|3s9#*XafVDU@lM>z8n=jna+2vhQdQm5pJYzDw)jcfn?!9{G!y^nZ>-2rTYFc<%ec=h`p 
z0UY^vS(owMw+#XpLv0RFcks}+%>ftpipq%YXIk_`g-1Ge7uI4dOx*t#p5&fC^^kfeqR%u){+ z$HGk~+e)z~YZ(sl)D5}9IlY~tQPEn^bqU6*#&x?_!9E4$I&Ps6<79nX8=En}zFB)U zyK%3<;9afm6jp1=7>F_0N@Rq!mMQfFuM%czCLLzUbi-)5;AiqqZtMLV&wEu+=F8>! zx+EM&&KU0wP`;eTA-jeTF;48Vrt6=K#QvgN-sCDij{mqkUB| zcL_@1xH|{oG7o{s%QWgz=|b8CiRnJy8kSv9FabIXY5SVy0-pSGQ$^L_AY{u)t1ahj ze77gvA!&}Pd)OUV-@}6L0v4MoXuO2QV4bKf%1c;?*&!kb94;=q=YT}ATtR~NC0EO6 zwxez7QPpVLm-F1)FM-Wwmv>CK-j`98A>&ZllYzJ06HWOU&>GduJo_aW#G1Lq;8UWZ z?Ix=)P2_)Nxv5Xjy-R(PmE^IWK50~n%P>`=1#Qa?OZJ*JaI;)Ckh#)8;UDCkmLh+O5OVS%+-4$kDwJ1iL_0LD4e={lyVm3E*DGszBvKNAit`)yqB+Krm=*B#X)S-@F6a-H5#=s3WPkA45i_xaFhzdr^Ab@68`dt7=r7?38WVRDaQk zJkO3Ht>lIoX=U)2>PQ<}5-EI!9F5-&wJ(ViuTLUCAm~kKdPX8XMukPdMW~G>3R)>U zcaq4^-?-*8%Kj`t50@0JG8f(GoML^`9*LWOSR*Kk5)6*AP93g1T-k{;lw!LpM%V}S zjLXI}CUX5!KCGuNMFHA*Mz&{wP;oGB{5ki`fzCZ60D|k`puE+9JtLB$Izy%N>w83BRW9C>Uhl_rZ2TN{z+ik^IepvyBO z1GdD4eoXXyzR+Un(5uz6g0@y#!N3+|kZr&m^P(oJAgq~lo+}g8Dp`gj@C~gcl`;EK zf>ZQeXj(I(VB0m28dq9%XMv8f84h6DdnEOVGDCh#1T};8#DRRX<79Ydeo{qEjMv5w zl#IDnQfq-)P&FddQ;Nx{+9hYM86x-~zHuqjw4k?du3F6NQzu<^hs z$y1;XrWElMs6E*Llp;13g5i?r|Md0a0tdkLOt`y}&zQj@Mr0@+`ya+m73eX&{5gAh z`E#gA*8_Xa^=tz@ECqF}Qpkk4Q62O&UMXC*=^^KFv@gYNDPL#48%YwoR$66-g2RRr zX;NQdOXt<3xq3jeOj%r{*BtQuPuEr|EXL6y;Vbc|R414yLw$nICC&TFl+o)$1uwiqqaXpEj?y>Oh1vbW% zoF_?P+VfQSXlyBJDyGaXKo}x#;MQ=|Jc9eif{Y?ptI2Y;`i(+MBaOpNRHYAB?nh;6 zsHv%=%TQA*9&iXGV3LmIB=I7NcOG!yt;GP_7F+Bo#sS(KBiSp=kl}300<9#AjHB=6 zjRj+G*rd-qwZLiD3bB67#=P?_wks@_#OpQ-2$4zW1|l)e1Z=siEz39!+kgwRX?-o< zx0)|{X4F*5_Lldf(Y8M#Et`9XRf32CT2Y(g#SR}e!`fnkP_>Hspjz#3<8pSp;!3q@ z^=jEpU+#;oi$WMC{UHxw_6UyN+VoU#9W%SN{Pd-LX6nb)0woCY3IreGhSai7lIIQv zqO!ajZv#{P;+Oyv!7D*8o0tP%fN7B4LXG9Z?O?Ej9PHl2OF;0CDj>9GvfE{z|(-Hcp>Q@$x8q)h1O46|4^+_ z1?C;7<%SAx@F%$#%3TR|@<4A&H&nr!K_q-s4@^|Hd-QQC7%)-Ba4<(l^gPd;`y;Fm z(#sO@^KKEpZCSW#=v^A57P#}oMs4KrI2!89z=h_TZ7Kc5%4?f2oc*Hq$dtKTdje9^ z-;?VG9uVTyzgNMy(ak`AHZoB}Dzd*IC`dywIkaITSG*$Y;AvK#1W|fl*>vjJ2(e}v zelu~$r=gzY=LDd6ux-`&C$WLp)MOm=0L5QG0?d{;qd@GY(Uqax5l5R^MWpI%I>zll 
zBuXt~Hxng`wRGRtE_5{*3RTiArqy6tgHbvT3%gp2GK> zN_%vGS}B+j0_QWx)Tw-fhlw|r+iaDWaMod?vauCKGHXiS3~gaHLXl5qXH7=WG>e@Q zf0K%ku$*DZLx=8XWt`PX4jp&hXM#(hu#J~ik$Kyh&xyrma6pj;2YHKk&bHWIuOk7B z{aJn+GK{IzVHryaf$He6*;E=?-srv%*z44_D#G;*M1l?So!C)%i1pa8w3k>7^{dn^ z=1+C6G@AI3=>HC`EtU7MdfUqHCajFxE!;}1WTmB0EWl}I6B>D=mynm7Tt{#4)J0G= z$VGuJ4;1A4(c5D05UcLF)t|bFTuYGFW#sx#i8#5w{$(}$^rhdFk^t%oiKgigXE(H? zPL~L2QaVTmSrv&mC*!au%3F?Ge+7cX(`!cLAw%Rh@+wkm>w=2N4LQDJ?fYY9QMWq8 zA#lDjibZQAlzvGmFpS+0I#piWhvZ%Uk`Y{Z3e}m$qq4jy;O|8^hCIBCF6WZ$Svnn+ zY(6QqW^3t`*#5*)JCmdCYC+i!ge7&z34-n3%7n_J(W``06B4)FxO`Hvx`IzD_>6+j zD!8FQ17I^gGF;FsjJ}z`mO4Hag)Wh>BKmd$n=p0TIMWX-iT)O4{0&4Nc{ zMXx+aY%^9y%?hol@9S#zOI9y!i(|UDI9$q>B{zbBT=VniJcmG`Pve*(2yRZ3jR-T% zQM4Hi_U#Nyy&L^}gO^s&hK?;#|TxJL}`tfn%a* zmIQ;jb^Xtj{jG;j3Je=K_ZX($|IbIxM#BcoeUfd|mHKQuCH4R%yujO@$*dgi&k73%`%wyc3v050cE4-MWyn`2Wc8(HUbh87gdZu7~p}~nX zxu<-I;_&h{gL>13%iDU%M+P!A+{xIZij6BUxEbE}*C|>v>vsgWQ6eH`4?w!`ix=S5 z0Psl?9QWn;j7H^w9FkgA_#-1@&@(^^E0Hg7xux>URB`4yhfUSdUw0LGf{_y#ts9{x zMh^_MMzl&`7W>#Zjio%Q$PS$H)6Jwe@c~Fel_@0&N5qt`mtvATvRvbQD62$yA*<4G z^yo-t5MN9EdM(5{c-U{3Y8)wWYa%O3 z*)2qh(mv7IoXmE&k=HpsslDjh1-4;}&d+tJv7Lw3?TmmcWvX;nhUJic04JbN8ddJd zfK<}$Q<@Itz6BGyZHAOd=BDLF`VG9;qxI=x`5`@co}eEj-lo+b2+LJpuyMrLbDe~> zw$UvlIF2JRuyigBaM>^Mq*%8Z97-J?fT?i~TyyTQBzR-JLclee8?A@54C{iFGXP^8 zuX>U0{OR*gaSk2j@z;BL*2&^bot1jfoCpg)@dMGzwsvw8mfaf0^-Y8QEH(T^*)dl?3E1VTMj|tOhrWr+ z;QV@0N>6U9c*c(#w#mY{+w?o^-GF-XSFosK%X>|$;|V!5K1Gpd{qO`1e-iqM2Hjb6 zfDWKb|GI)4DqcxtwI)Zn#X7Unj$@$2c+xCUKC{IG@}rR4J|%9{BZ|#)+qd(2%nbIw z1y1Kin{Ba)2(6i_(GytfT-k>Rt{)O)0jt(dK+G0MfcW&Kt@1kdB>4PpHXEQEV-@yU z7M3!JsHpsmAC>4T=I0dKtKbBIEi`-cU<7#bZzZmzzpIet@r}Tc1+|oWZ4cxt&2(#U{fBI^NjfYTtyb5B{i>O; zEeZClEn2YXPiWT?zrIpUqx>C3SibfY!3t&96-fLqSQFs?#^g!e*<;CeLx`<3^VAE= zQNZ7e%Uc8fWD+SxV31z=2$I0^qtP=8KCj?=6iBek zmo}}~Sh8_Jce|FzY-wo2j5zypdQLA2OC$BR3pR_r^A(mY*tdb{*>Q@MtMt~{8QW*j8(P8lSOpM8_E0 zf4azVt2eK9by^RaRT;Y7%}8ePNgHtYr7(lnr)h_&H%#-S-3X>>93gpTg|iEF=e$@8 znC^(&#b8{Zpa+D=f?zO+Dz2bGLQqk`?Ar+7QZS?I-=rCT`ch`Z2Fhd(G7wobSl3_H 
zE&LjSbo$bc9t#D;#R$Z)`03z{Km;aF ze8v(_tW7*=iKp_!dEC=+H$QFXn!OYx99sq?&IE6ej*I!T$|e`{=PZJY`STV5K4&e0 zi}^QL1WIHKp=Y+B;O9Y1JgZM)3UmkzV5`k!B5e_Mvwg{b_u?Vi@ zAGZi_{)9zvB7e~$IFWzSBB5NyFA6mnYzs^tIn;ANBGunY*$$~g=H}5teVpb8rq`UT^;A1~$*qY!gGu=;Cw0WW zoWnJaVrn|incKx$oE+vrx5Zj|g`eW!@7!O74CsJcHysY zTJ1T{eZ#ZN#$}c_Od;^-0%zhkmORt9cv4)OpHEUx`rDp#x+m#dy{LK(UAJfYHY4Br ztu3GP7p8gKZu#SHH+K7IGtd2Z{QdKMuwdRZe22aNw7vhG7Qblm@3Qc_E&RN{)y&Ec ze2=H3-IVXm^K?7;eR-a4%J(~DW;wRlEB^t{f=(h^@RT3)l*Mk!4|xiw9>|>m8rZJO20loi;9rdB=amy7r^i|{z!#r{9 zLe!}d?Oe1&skfbx8a#pg-R`9UC z$W|uA9c$JWXXD!OHf@s-?%NNJ?Rh18??m2#Zu=!MoKhYw1Za8Q1zXDQgniNn@!u&^ zoE+QZCUSM$I}-gA(f;-!`o|QxT(*6^DtP(RhkxwSffDFtqj&QVNg5H5#kZR7(xDAN zvC39s-3ag#%DsW$h3V?4=TAh_z7Beqx*&)-_(!So<>Ju7OQlsua5d2XQ?<)_K^N2u zpwE6IgwON|TTa_(jB%rf7 zSf#}0&)P~FmlHb?K?-!F<1bk5V(YW{ykNflCziU&${-_HksF$RnJ#n{LGVmSb?*8% zp0>5&q&7cpvg#ZT6;fR5*mN2Hkb`s97#fdu?%bn|l$&7a9JhA0-H0zh_LKBN+Q#m; zm^~F{^NWzoS!XH52R0VlY^Ip4bcav0-UWutI-i3OPl*%0mI6<) z?_1a@r)!j@E(>tjI!aHk)|p^grhi03N_chx$mGbG?=Z!M_LZ&;lxoDxL;KzcsxSMd zfZf2{DEKA?E;4_2lT@4_*02f2R)9Fy=m`6Ir@1;Ao?MmqqRmo=x?S|vad9-Nmfub*k9!|clhUJ9A-MJxqh970#t+bT3vjR%I9YE*%+OiDwofYkDlPk z<6AJWMbvz|-k~sSBW@+IZa)7~HiSjc61-OkVJA0hRdkzCHYd;pJ}eQpwf)Pyr+X0x z5c=Q}8E2u4novi$ijDJ`9R_w$AG)#jyTuGeqrRzd*6c0a)(CBNdOs-2Q?%AONKFsk zCcD?K;^_{B3Ww_pWdYgBlGah-@(MmX8lxUOo?)3MM^;14{8KE&kuSEIFFw^{UQ&i> zCMB&!o&wtibn|0XE46|wTAE9@9L%pTYFP7H4YfyKRVn@*n@x`#4=1MhM2{S=%HwVe z<5eMtGHY$l=+`9B13hKz2@fAW#V2L(3Oan4(#)vVN;v8k!Kv`+(#ZB8?SXD1Udq4o zQH*~r%q>BMooXUFQ<(gvQf{iD?tjp21;^Gh!-O>&lbfa?sTp5Un)Ml$<2#_bR$En4 z*%jH~+y-i{S5iXf4qYP3rYAF9RdoLRjK1DcojUQt8Cz1DMd-c*8XOt$6AWCTwkS@8|c>T|mOSZ0DSuU&d z+3XAgH6Lbzg7stVZau#)=JgO21@~{OXvwaPqMPJer6U_SutAPh!(AsT^~QLaG=Z)q zF2x``ezK>FMAO2+g4QJZ4g&WLIxpp!b92gpQ`t5HxQ?lk^(GvtBeu9E6CjgnR~yzO zF_;F)))2Yx3bK~=$mDmjY=T`jVGWP+^Q$$b%I}+W=`RyJNquDr8CRwmrbajQ5Xi0* znZ!e&Aj*(eWF|^;Z03}wv{RL!=i^NFswte~b~VyI&?(o-Nik?x6KQjTyBd8Xv93M9 z(8{5g_&rDk7#AG~|AL#FW`RKsDrf?GU%qP* zsg|Er_az+$Ao&jM_LAhdM#g`V1n=$*o?PE&HwCsLdkQs=SRxor4)bCwjfkJ(?NcBg 
zY{nL);#$yi>YaYy8rffN+TYOA;C2(ozUWyqN9s&{7Bxqt@reFq=6FnPLz0;+sIM zuc?%c89MP0oQ)pkHu_lt!%1u%B)a&np7#Znex8DUUa*vjBp4jUHgPK?Wj-rK-X6IX zUo-3sG}g=$kuV5}PQ~p?$jlY{DTb-Jl1C8M?Oc36bknNHS9G?NGkWA7r4p2)UsLcy3Z7FS@VQCyq+%aaP$lRR zBQtvS1#f-9rTX){tykyE&Aeqv%OqyG{8S67rrtm`6l$Y~g3i7JBML$di;8 z+|s^TR!4A?t`*k4$R(QNy?s_Of#Kcg;}*8t?R7nUqY zt9Mi8WNZ4BQPY~a_xv~WDEd_blZfP{v=q%M?b8ZAqu{d&>I(KLFr<7)v4<7xRiIJp zVH3ZqwC^G?9?|X#e*SBU`-;XcBobhIm4~f7(B;gt9iA};*BCLo><;(26#-PwZA`^t z7zlwAQwyVj-W=ATVKb!#zT4MlWT=9@DU;KI!C)6TVs2!zdY`!udLicGwmbLcetOMY z7}R%B=!NB{1O8;h%08!|0W*}&=)k(DN$A6X&$u9dtf(va7 zQDRGwQapAMWbH#5`naB!_Vwn9$v-CiXdTWH6{b^)+r^@%aNz3h4Ghn6%7@<9?m=qN zvdcCB9zrqQt+Ontw{AHe9Yl+L@XjsZN!D4`N|36{Le$J|6M8%WoERE!`B{4HY^14} zQA8tk75X>7ldXx{CLahul4qF`r>2`vL70}TA9f7=E}sJ9BRX4(|{KZT^` zB(QEXhfQfC#G|0*Hueu9$q(!h$stRimvk9=Zm&480rLtv3WWf5z&*qo&_+mU{K?5l zcdvnYu3rC_5-I4fvT-~t6TjlP3x)}Izff!lgT$=I|AU_mY^41b{fOW=$NNEH(y#k)I zUEy9MbZEYq2F7DW#OQlI0m8u`c#+TJNR7n;6Hw>KFFe9k-M;l=Goy3>!*Fij4n#C7 zu}-31nE~Ax#wDo?ST@*FP&z4T{So#s-mm>0O<9JfpQhZ`srh`Q4;M-gWokLSCGA{U zj2#KvsFw}MtG1$Rxn1ZEdCD?wZKZ+W>$a{(KeF}{iKeMIwsid2cKg`u@sqWbc*tfZ zeQ;AS2lV4h`0C4VYD51#NkAX7&kZ(lDFV~Et@ zN`vkOWC+}Un(Gk^{tY9b@OVg}2)fZn1ObU6c52KHt-d6r>5(s6 z^^6-fim{Eg?G%|Gl30ehJPmOat+zt-ppYAHV4XlRA%#P_f7~4LEsmThQf`l`E?#cb zttyA&XTND>k(EjjXMf*{;OKA!ExqRr+GNkx8KoxcJw;<6)6`hqtSZzKlUZEctOFO5 ztD)b)G?y_(a=&8YG7y1Hc{lK$*pb_eOgscke3qn}eCS=qE7xcu09m(Q=peBMuHgh=vf^aBchN5LrtH;Kw$ zRGQ=#n++fPI5rmhF0qNvCadUNmbBJ%Rn!J$H(6&~?`VN7GsP))Gw<8V&kk$l2wI#i zTDXfdF<@3l1dzSd$SP^@0nXqVLh&fk<$+wr-sF2ZIXA|H8NsfUwLi~|aFBIyv4fpq zwj-0!(0Gy=!Wj8A_-_lV{7fF68FqR(-;ozsl!MGsB;~UvU#gp$f(drTk<#|27*eWr zQ@-a=Q`dJK6A64Hf(`JOx%Gkkb;Y_1@vkVK`q(9P{lGmVg9YeO83Fib1$d!cPFi#r6t5iyfXX&E~YmXQ5C^*D0>-KN4Vr$^|W#ENmsba}TnZv_7~q z!%$3z=TaD=nHJY*XIN;H!Oj5mJg^^E*XY)zDk~B?wv!k3sm zaDJMVLkz~-aY~ITlxfy7tZsCb5V_=b4d)Tk+#^470I_^kP>r~+NbE;3}#X+ z@!@QH9)qt;WxaM@Ie(hy`36Xi;y$1H5w^9|n;VI~_QK$%mam+iHe z&28$^rQ^mp?GUm22tivwNKHe9CjVeg$aOTv5=0$sYnhqGt3B05j9~9^f*(~*k@3sX 
zKU8#sg7)(!TJ>YHM1MkoYPxOT))v?MkzFM0KY%)B5UFO9{lAaB8=})U@7?$TrTvD2 zFDdw41wTz-;wbKa0sE!_? z-y*rx;Arjczt~XSRplVg)^el4Qw=$gqtbY(XvIVWpQ3SmOd3laPKR8?uO#<3_g3Nt zQ#uE!bSbMKg>ufO`{l>83x_~sa`sN4|DhiSr^1g-O{FES_7Wg)TQ>)?(D#wcn+oqZ zSI(=+dVy?*%!|2RE6zo#HSvK#x30}sa>o{T^*7~gj_Y@$-jG*W2?BJ6rGApn$akHI zUXX&{hJ?~_&7#H1D4URopDqzdJ(dZPak2r-QvsJ*n#68ldh*DKb!?87gYmgu8!R{<_aEz#f>@0n|B(JJ(d z-7)G>&k7=X!ysd-Zi2Uis$C7Q6>)?zZYhVR6|J;pd1a;!za?*8=N$$=huAHptf|&A zo#qe%zTM7Zxx14Bp}pKEQ;cdV;$f6lTg7oSg_^FUb$4cAI7_EebHEiooLqC{fVo~7 z(Oa7iwlEsjl#YZ=Z;2t3om){eJiyOJ+eYS-u(ra68yvWlm-e{zML8XEopd!kdkVL} zlWY#amGx{VTU%fL&DLu;%Bm_W7^Rgppg0a;(!{da)P5FH(<+-2qOC3)i}R-bz4(M5vB}Bg)@94$$Cy|xvvE86CHSu0 z;HGBJWsRk#QPR6J)^fy+t0wf$HIimbh2B!sAA{uOtY+&<+kfyhmSl~dP(QKX#j)S{ z#{pfzt~F6-wjKL<#~pSl-_)(v2CfQ~uxvJg(gbmK>fG5WS9_Y%X8GXcx$~z!YU)t4OO)y7 z&Q6>E=k(g_Gbc`-%Zu4bB~$08%!pyB-J)D%m+;;{Ia!h^%fBF#>S}nBFFsYa4{Tb8 z3&OhXgOZguadCJ#@oo0Ib!eH=5k;m~jo+ zToHwAf0z}MzY2+x?>tleW_E+%C+Sd5p)`z%J1@mr;dG4+vunvbGRqda+2hUQt?y|gbkQ1hJE)k?_-%y=w;RqHB{a9gh7-(E<~2E( z{Reayvto zy$<$1kJCivZ=LH|YaV9HJ0@PD^3TW0JKN^g2F12s$|aKA`B5QOf^tlTFKTz6q}M`% z&%$v*S=YE8xHG-&yl^*s1ad6L-UySCp)z4DoJUR+LUlMWl;K7tn&gX)eBB^n7RqH0 zueG(QAV0KJBBFhGg-3c(+ai#TDa80gDdxL2d4&(K*xJI*u+EDp4mPTV_=?PeO}tzP z6B6U)8xOkAChRQDC5y9y$Knct{9>Cf<@m0?2x{^(0E*noL7tco%0x0h$>H3+0HDEY zKVCe=22zHo5at1thK?2#inuknXF&8>=?XGOrMxY5c$b5T+K~^~bRlDv<*_$m-G}RK z{%oo$`%`j^r8fXh?yk15Fiu={$}o^z`NmP8&2EiG?1H3DAj;>FO?)+aO@p;DAh`x4 z*#b1{hjW?WBGMsTk)MIrT0l8JC%yOrQQWgTRK6grY>*iW)~{lW4tRg_4LoaF;E+V# zx!N#qsX~OXA=a>$Y=<2O(=JOSi^l(E8}2Y-!Br$R$g-$|4mtM=%@R!;c@1zLv{&4)XYH_0)-} znd-?i=gyw_1f1~sGcQGFx;(I#&mQ>t*?U;#o8&h2iz&VO$2aezH(wcqy-0aIeE;y^ zE?kpw--9a->cHgQLkP~FxKJm-{s|Vw2*u}bVuWwl5d`opNix~LBQ(nH2w=2l314HT zoE9V|-L_lk0cv4>IE4Hi!5TZB%+{w)m>?p&05-Echo4APWtNtQC^-cwZ^>s{c;F`@ zKfLMl3n{WYbNv9hJWFr2li>+{FN)FUu!?+TAkVn6C38(Swir!P>2;>@e3Emx?mqO{ z+N;pqaznbk9Dd}u{VEq*!qSiICD&fxw|;Ye%lJ~hIf#1bZCzgN&@lP5y*$qQ*OD*tO?c+(?>ozuKmuw9 
zx_P#^IPk@q!q%079*r;B`ZI%#gq-p0o6(k%w#H736x=FwLh3-58hx1WPSFYn|K{5_PW#4t+9w9$?RydT=@VJ@|3)+AuW_KK zncHdRv@Is)R^d(}ZGZEyJU}Dg+tWy486OIPa&f$*1PuE~w5d0$)wvb+^YR^$h;(;$ zs#bjsQ>~gG6W_w{6ScValCUB2f{6iq6S%#!YUbuLyELt>tjv*b$hC~V2g%65XVi}W zraC2x<(Z;CS1L^pBBlB2Jgnomx9E-3b>k`bxZ`WV2. + + +Copyright 2002-2006 John J Lee +Copyright 1997-1999 Gisle Aas (original libwww-perl code) +Copyright 2002-2003 Johnny Lee (original MSIE Perl code) + +This code is free software; you can redistribute it and/or modify it +under the terms of the BSD or ZPL 2.1 licenses (see the file +COPYING.txt included with the distribution). + +""" + +import sys, re, copy, time, urllib, types, logging +try: + import threading + _threading = threading; del threading +except ImportError: + import dummy_threading + _threading = dummy_threading; del dummy_threading + +MISSING_FILENAME_TEXT = ("a filename was not supplied (nor was the CookieJar " + "instance initialised with one)") +DEFAULT_HTTP_PORT = "80" + +from _headersutil import split_header_words, parse_ns_headers +from _util import isstringlike +import _rfc3986 + +debug = logging.getLogger("mechanize.cookies").debug + + +def reraise_unmasked_exceptions(unmasked=()): + # There are a few catch-all except: statements in this module, for + # catching input that's bad in unexpected ways. + # This function re-raises some exceptions we don't want to trap. + import mechanize, warnings + if not mechanize.USE_BARE_EXCEPT: + raise + unmasked = unmasked + (KeyboardInterrupt, SystemExit, MemoryError) + etype = sys.exc_info()[0] + if issubclass(etype, unmasked): + raise + # swallowed an exception + import traceback, StringIO + f = StringIO.StringIO() + traceback.print_exc(None, f) + msg = f.getvalue() + warnings.warn("mechanize bug!\n%s" % msg, stacklevel=2) + + +IPV4_RE = re.compile(r"\.\d+$") +def is_HDN(text): + """Return True if text is a host domain name.""" + # XXX + # This may well be wrong. 
def domain_match(A, B):
    """Return True if domain A domain-matches domain B, according to RFC 2965.

    A and B may be host domain names or IP addresses.  Comparison is
    case-insensitive.  Host A domain-matches host B if either:

      * their host name strings compare equal; or

      * A is an HDN of the form NB, where N is a non-empty name string and
        B has the form .B', with B' itself an HDN (so x.y.com matches
        .Y.com but not Y.com).

    Note that domain-match is not commutative: a.b.c.com domain-matches
    .c.com, but not the reverse.  For IP addresses only the direct
    string-compare branch is relevant.
    """
    host_a = A.lower()
    host_b = B.lower()
    if host_a == host_b:
        return True
    if not is_HDN(host_a):
        return False
    # host_a must contain host_b at an index > 0 (i.e. with a non-empty
    # prefix N), and host_b must be ".B'" where B' is itself an HDN.
    pos = host_a.rfind(host_b)
    if pos <= 0:
        return False
    return host_b.startswith(".") and is_HDN(host_b[1:])
def request_is_unverifiable(request):
    """Return whether *request* is unverifiable, in the RFC 2965 sense.

    Prefers the urllib2-style ``is_unverifiable()`` method; falls back to
    a plain ``unverifiable`` attribute.  Re-raises AttributeError when
    neither is available.
    """
    try:
        return request.is_unverifiable()
    except AttributeError:
        # Duck-typed request objects may expose a plain attribute rather
        # than the method.
        if not hasattr(request, "unverifiable"):
            raise
        return request.unverifiable
# Characters (in addition to A-Z, a-z, 0-9, '_', '.' and '-') that don't
# need to be escaped to form a valid HTTP URL (RFCs 2396 and 1738).
HTTP_PATH_SAFE = "%/;:@&=+$,!~*'()"
ESCAPED_CHAR_RE = re.compile(r"%([0-9a-fA-F][0-9a-fA-F])")
def uppercase_escaped_char(m):
    """Normalise a single %xx escape to upper-case hex digits."""
    return "%%%s" % m.group(1).upper()
def escape_path(path):
    """Escape any invalid characters in HTTP URL, and uppercase all escapes.

    There's no knowing what character encoding was used to create URLs
    containing %-escapes, but since we have to pick one to escape invalid
    path characters, we pick UTF-8, as recommended in the HTML 4.0
    specification (and draft-fielding-uri-rfc2396bis-03, the draft IRI
    specification draft-duerst-iri-05, and RFC 2718 for new URI schemes).
    """
    if isinstance(path, types.UnicodeType):
        path = path.encode("utf-8")
    quoted = urllib.quote(path, HTTP_PATH_SAFE)
    return ESCAPED_CHAR_RE.sub(uppercase_escaped_char, quoted)
class Cookie:
    """HTTP Cookie.

    This class represents both Netscape and RFC 2965 cookies.

    This is deliberately a very simple class.  It just holds attributes.
    It's possible to construct Cookie instances that don't comply with the
    cookie standards.  CookieJar.make_cookies is the factory function for
    Cookie objects -- it deals with cookie parsing, supplying defaults, and
    normalising to the representation used in this class.  CookiePolicy is
    responsible for checking them to see whether they should be accepted
    from and returned to the server.

    version: integer;
    name: string;
    value: string (may be None);
    port: string; None indicates no attribute was supplied (e.g. "Port",
       rather than e.g. "Port=80"); otherwise, a port string (e.g. "80") or
       a port list string (e.g. "80,8080")
    port_specified: boolean; true if a value was supplied with the Port
       cookie-attribute
    domain: string;
    domain_specified: boolean; true if Domain was explicitly set
    domain_initial_dot: boolean; true if Domain as set in HTTP header by
       server started with a dot (yes, this really is necessary!)
    path: string;
    path_specified: boolean; true if Path was explicitly set
    secure: boolean; true if should only be returned over secure connection
    expires: integer; seconds since epoch (RFC 2965 cookies should calculate
       this value from the Max-Age attribute)
    discard: boolean, true if this is a session cookie; (if no expires
       value, this should be true)
    comment: string;
    comment_url: string;
    rfc2109: boolean; true if cookie arrived in a Set-Cookie: (not
       Set-Cookie2:) header, but had a version cookie-attribute of 1
    rest: mapping of other cookie-attributes

    Note that the port may be present in the headers, but unspecified
    ("Port" rather than "Port=80", for example); if this is the case,
    port is None.

    """

    # Attributes compared by __eq__ (note the private _rest, not rest).
    _attrs = ("version", "name", "value",
              "port", "port_specified",
              "domain", "domain_specified", "domain_initial_dot",
              "path", "path_specified",
              "secure", "expires", "discard", "comment", "comment_url",
              "rfc2109", "_rest")

    def __init__(self, version, name, value,
                 port, port_specified,
                 domain, domain_specified, domain_initial_dot,
                 path, path_specified,
                 secure,
                 expires,
                 discard,
                 comment,
                 comment_url,
                 rest,
                 rfc2109=False,
                 ):
        if version is not None: version = int(version)
        if expires is not None: expires = int(expires)
        if port is None and port_specified is True:
            raise ValueError("if port is None, port_specified must be false")

        self.version = version
        self.name = name
        self.value = value
        self.port = port
        self.port_specified = port_specified
        # normalise case, as per RFC 2965 section 3.3.3
        self.domain = domain.lower()
        self.domain_specified = domain_specified
        # We need to know whether the domain given in the cookie-attribute
        # had an initial dot, in order to follow RFC 2965 (as clarified in
        # draft errata).  Needed for the returned $Domain value.
        self.domain_initial_dot = domain_initial_dot
        self.path = path
        self.path_specified = path_specified
        self.secure = secure
        self.expires = expires
        self.discard = discard
        self.comment = comment
        self.comment_url = comment_url
        self.rfc2109 = rfc2109

        # copy so later mutation of the caller's mapping can't change us
        self._rest = copy.copy(rest)

    def has_nonstandard_attr(self, name):
        # "in" instead of the Python-2-only dict.has_key (same behaviour).
        return name in self._rest
    def get_nonstandard_attr(self, name, default=None):
        return self._rest.get(name, default)
    def set_nonstandard_attr(self, name, value):
        self._rest[name] = value
    def nonstandard_attr_keys(self):
        # list() so callers always receive a list (Python 3 returns a view)
        return list(self._rest.keys())

    def is_expired(self, now=None):
        """Return whether the cookie has expired as of *now* (default:
        current time).  Session cookies (expires is None) never expire."""
        if now is None: now = time.time()
        return (self.expires is not None) and (self.expires <= now)

    def __eq__(self, other):
        return all(getattr(self, a) == getattr(other, a) for a in self._attrs)

    def __ne__(self, other):
        return not (self == other)

    def __str__(self):
        if self.port is None: p = ""
        else: p = ":"+self.port
        limit = self.domain + p + self.path
        if self.value is not None:
            namevalue = "%s=%s" % (self.name, self.value)
        else:
            namevalue = self.name
        # BUG FIX: the format string had been corrupted to "" (angle-
        # bracketed text stripped), so '"" % (namevalue, limit)' raised
        # TypeError.  Restore the upstream "<Cookie ... for ...>" form.
        return "<Cookie %s for %s>" % (namevalue, limit)

    def __repr__(self):
        args = []
        for name in ["version", "name", "value",
                     "port", "port_specified",
                     "domain", "domain_specified", "domain_initial_dot",
                     "path", "path_specified",
                     "secure", "expires", "discard", "comment", "comment_url",
                     ]:
            attr = getattr(self, name)
            args.append("%s=%s" % (name, repr(attr)))
        args.append("rest=%s" % repr(self._rest))
        args.append("rfc2109=%s" % repr(self.rfc2109))
        return "Cookie(%s)" % ", ".join(args)
+ + The subclass DefaultCookiePolicy defines the standard rules for Netscape + and RFC 2965 cookies -- override that if you want a customised policy. + + As well as implementing set_ok and return_ok, implementations of this + interface must also supply the following attributes, indicating which + protocols should be used, and how. These can be read and set at any time, + though whether that makes complete sense from the protocol point of view is + doubtful. + + Public attributes: + + netscape: implement netscape protocol + rfc2965: implement RFC 2965 protocol + rfc2109_as_netscape: + WARNING: This argument will change or go away if is not accepted into + the Python standard library in this form! + If true, treat RFC 2109 cookies as though they were Netscape cookies. The + default is for this attribute to be None, which means treat 2109 cookies + as RFC 2965 cookies unless RFC 2965 handling is switched off (which it is, + by default), and as Netscape cookies otherwise. + hide_cookie2: don't add Cookie2 header to requests (the presence of + this header indicates to the server that we understand RFC 2965 + cookies) + + """ + def set_ok(self, cookie, request): + """Return true if (and only if) cookie should be accepted from server. + + Currently, pre-expired cookies never get this far -- the CookieJar + class deletes such cookies itself. + + cookie: mechanize.Cookie object + request: object implementing the interface defined by + CookieJar.extract_cookies.__doc__ + + """ + raise NotImplementedError() + + def return_ok(self, cookie, request): + """Return true if (and only if) cookie should be returned to server. + + cookie: mechanize.Cookie object + request: object implementing the interface defined by + CookieJar.add_cookie_header.__doc__ + + """ + raise NotImplementedError() + + def domain_return_ok(self, domain, request): + """Return false if cookies should not be returned, given cookie domain. 
+ + This is here as an optimization, to remove the need for checking every + cookie with a particular domain (which may involve reading many files). + The default implementations of domain_return_ok and path_return_ok + (return True) leave all the work to return_ok. + + If domain_return_ok returns true for the cookie domain, path_return_ok + is called for the cookie path. Otherwise, path_return_ok and return_ok + are never called for that cookie domain. If path_return_ok returns + true, return_ok is called with the Cookie object itself for a full + check. Otherwise, return_ok is never called for that cookie path. + + Note that domain_return_ok is called for every *cookie* domain, not + just for the *request* domain. For example, the function might be + called with both ".acme.com" and "www.acme.com" if the request domain + is "www.acme.com". The same goes for path_return_ok. + + For argument documentation, see the docstring for return_ok. + + """ + return True + + def path_return_ok(self, path, request): + """Return false if cookies should not be returned, given cookie path. + + See the docstring for domain_return_ok. + + """ + return True + + +class DefaultCookiePolicy(CookiePolicy): + """Implements the standard rules for accepting and returning cookies. + + Both RFC 2965 and Netscape cookies are covered. RFC 2965 handling is + switched off by default. + + The easiest way to provide your own policy is to override this class and + call its methods in your overriden implementations before adding your own + additional checks. + + import mechanize + class MyCookiePolicy(mechanize.DefaultCookiePolicy): + def set_ok(self, cookie, request): + if not mechanize.DefaultCookiePolicy.set_ok( + self, cookie, request): + return False + if i_dont_want_to_store_this_cookie(): + return False + return True + + In addition to the features required to implement the CookiePolicy + interface, this class allows you to block and allow domains from setting + and receiving cookies. 
There are also some strictness switches that allow + you to tighten up the rather loose Netscape protocol rules a little bit (at + the cost of blocking some benign cookies). + + A domain blacklist and whitelist is provided (both off by default). Only + domains not in the blacklist and present in the whitelist (if the whitelist + is active) participate in cookie setting and returning. Use the + blocked_domains constructor argument, and blocked_domains and + set_blocked_domains methods (and the corresponding argument and methods for + allowed_domains). If you set a whitelist, you can turn it off again by + setting it to None. + + Domains in block or allow lists that do not start with a dot must + string-compare equal. For example, "acme.com" matches a blacklist entry of + "acme.com", but "www.acme.com" does not. Domains that do start with a dot + are matched by more specific domains too. For example, both "www.acme.com" + and "www.munitions.acme.com" match ".acme.com" (but "acme.com" itself does + not). IP addresses are an exception, and must match exactly. For example, + if blocked_domains contains "192.168.1.2" and ".168.1.2" 192.168.1.2 is + blocked, but 193.168.1.2 is not. + + Additional Public Attributes: + + General strictness switches + + strict_domain: don't allow sites to set two-component domains with + country-code top-level domains like .co.uk, .gov.uk, .co.nz. etc. + This is far from perfect and isn't guaranteed to work! 
+ + RFC 2965 protocol strictness switches + + strict_rfc2965_unverifiable: follow RFC 2965 rules on unverifiable + transactions (usually, an unverifiable transaction is one resulting from + a redirect or an image hosted on another site); if this is false, cookies + are NEVER blocked on the basis of verifiability + + Netscape protocol strictness switches + + strict_ns_unverifiable: apply RFC 2965 rules on unverifiable transactions + even to Netscape cookies + strict_ns_domain: flags indicating how strict to be with domain-matching + rules for Netscape cookies: + DomainStrictNoDots: when setting cookies, host prefix must not contain a + dot (e.g. www.foo.bar.com can't set a cookie for .bar.com, because + www.foo contains a dot) + DomainStrictNonDomain: cookies that did not explicitly specify a Domain + cookie-attribute can only be returned to a domain that string-compares + equal to the domain that set the cookie (e.g. rockets.acme.com won't + be returned cookies from acme.com that had no Domain cookie-attribute) + DomainRFC2965Match: when setting cookies, require a full RFC 2965 + domain-match + DomainLiberal and DomainStrict are the most useful combinations of the + above flags, for convenience + strict_ns_set_initial_dollar: ignore cookies in Set-Cookie: headers that + have names starting with '$' + strict_ns_set_path: don't allow setting cookies whose path doesn't + path-match request URI + + """ + + DomainStrictNoDots = 1 + DomainStrictNonDomain = 2 + DomainRFC2965Match = 4 + + DomainLiberal = 0 + DomainStrict = DomainStrictNoDots|DomainStrictNonDomain + + def __init__(self, + blocked_domains=None, allowed_domains=None, + netscape=True, rfc2965=False, + # WARNING: this argument will change or go away if is not + # accepted into the Python standard library in this form! + # default, ie. 
treat 2109 as netscape iff not rfc2965 + rfc2109_as_netscape=None, + hide_cookie2=False, + strict_domain=False, + strict_rfc2965_unverifiable=True, + strict_ns_unverifiable=False, + strict_ns_domain=DomainLiberal, + strict_ns_set_initial_dollar=False, + strict_ns_set_path=False, + ): + """ + Constructor arguments should be used as keyword arguments only. + + blocked_domains: sequence of domain names that we never accept cookies + from, nor return cookies to + allowed_domains: if not None, this is a sequence of the only domains + for which we accept and return cookies + + For other arguments, see CookiePolicy.__doc__ and + DefaultCookiePolicy.__doc__.. + + """ + self.netscape = netscape + self.rfc2965 = rfc2965 + self.rfc2109_as_netscape = rfc2109_as_netscape + self.hide_cookie2 = hide_cookie2 + self.strict_domain = strict_domain + self.strict_rfc2965_unverifiable = strict_rfc2965_unverifiable + self.strict_ns_unverifiable = strict_ns_unverifiable + self.strict_ns_domain = strict_ns_domain + self.strict_ns_set_initial_dollar = strict_ns_set_initial_dollar + self.strict_ns_set_path = strict_ns_set_path + + if blocked_domains is not None: + self._blocked_domains = tuple(blocked_domains) + else: + self._blocked_domains = () + + if allowed_domains is not None: + allowed_domains = tuple(allowed_domains) + self._allowed_domains = allowed_domains + + def blocked_domains(self): + """Return the sequence of blocked domains (as a tuple).""" + return self._blocked_domains + def set_blocked_domains(self, blocked_domains): + """Set the sequence of blocked domains.""" + self._blocked_domains = tuple(blocked_domains) + + def is_blocked(self, domain): + for blocked_domain in self._blocked_domains: + if user_domain_match(domain, blocked_domain): + return True + return False + + def allowed_domains(self): + """Return None, or the sequence of allowed domains (as a tuple).""" + return self._allowed_domains + def set_allowed_domains(self, allowed_domains): + """Set the sequence of 
allowed domains, or None.""" + if allowed_domains is not None: + allowed_domains = tuple(allowed_domains) + self._allowed_domains = allowed_domains + + def is_not_allowed(self, domain): + if self._allowed_domains is None: + return False + for allowed_domain in self._allowed_domains: + if user_domain_match(domain, allowed_domain): + return False + return True + + def set_ok(self, cookie, request): + """ + If you override set_ok, be sure to call this method. If it returns + false, so should your subclass (assuming your subclass wants to be more + strict about which cookies to accept). + + """ + debug(" - checking cookie %s", cookie) + + assert cookie.name is not None + + for n in "version", "verifiability", "name", "path", "domain", "port": + fn_name = "set_ok_"+n + fn = getattr(self, fn_name) + if not fn(cookie, request): + return False + + return True + + def set_ok_version(self, cookie, request): + if cookie.version is None: + # Version is always set to 0 by parse_ns_headers if it's a Netscape + # cookie, so this must be an invalid RFC 2965 cookie. + debug(" Set-Cookie2 without version attribute (%s)", cookie) + return False + if cookie.version > 0 and not self.rfc2965: + debug(" RFC 2965 cookies are switched off") + return False + elif cookie.version == 0 and not self.netscape: + debug(" Netscape cookies are switched off") + return False + return True + + def set_ok_verifiability(self, cookie, request): + if request_is_unverifiable(request) and is_third_party(request): + if cookie.version > 0 and self.strict_rfc2965_unverifiable: + debug(" third-party RFC 2965 cookie during " + "unverifiable transaction") + return False + elif cookie.version == 0 and self.strict_ns_unverifiable: + debug(" third-party Netscape cookie during " + "unverifiable transaction") + return False + return True + + def set_ok_name(self, cookie, request): + # Try and stop servers setting V0 cookies designed to hack other + # servers that know both V0 and V1 protocols. 
+ if (cookie.version == 0 and self.strict_ns_set_initial_dollar and + cookie.name.startswith("$")): + debug(" illegal name (starts with '$'): '%s'", cookie.name) + return False + return True + + def set_ok_path(self, cookie, request): + if cookie.path_specified: + req_path = request_path(request) + if ((cookie.version > 0 or + (cookie.version == 0 and self.strict_ns_set_path)) and + not req_path.startswith(cookie.path)): + debug(" path attribute %s is not a prefix of request " + "path %s", cookie.path, req_path) + return False + return True + + def set_ok_countrycode_domain(self, cookie, request): + """Return False if explicit cookie domain is not acceptable. + + Called by set_ok_domain, for convenience of overriding by + subclasses. + + """ + if cookie.domain_specified and self.strict_domain: + domain = cookie.domain + # since domain was specified, we know that: + assert domain.startswith(".") + if domain.count(".") == 2: + # domain like .foo.bar + i = domain.rfind(".") + tld = domain[i+1:] + sld = domain[1:i] + if (sld.lower() in [ + "co", "ac", + "com", "edu", "org", "net", "gov", "mil", "int", + "aero", "biz", "cat", "coop", "info", "jobs", "mobi", + "museum", "name", "pro", "travel", + ] and + len(tld) == 2): + # domain like .co.uk + return False + return True + + def set_ok_domain(self, cookie, request): + if self.is_blocked(cookie.domain): + debug(" domain %s is in user block-list", cookie.domain) + return False + if self.is_not_allowed(cookie.domain): + debug(" domain %s is not in user allow-list", cookie.domain) + return False + if not self.set_ok_countrycode_domain(cookie, request): + debug(" country-code second level domain %s", cookie.domain) + return False + if cookie.domain_specified: + req_host, erhn = eff_request_host_lc(request) + domain = cookie.domain + if domain.startswith("."): + undotted_domain = domain[1:] + else: + undotted_domain = domain + embedded_dots = (undotted_domain.find(".") >= 0) + if not embedded_dots and domain != ".local": + 
debug(" non-local domain %s contains no embedded dot", + domain) + return False + if cookie.version == 0: + if (not erhn.endswith(domain) and + (not erhn.startswith(".") and + not ("."+erhn).endswith(domain))): + debug(" effective request-host %s (even with added " + "initial dot) does not end end with %s", + erhn, domain) + return False + if (cookie.version > 0 or + (self.strict_ns_domain & self.DomainRFC2965Match)): + if not domain_match(erhn, domain): + debug(" effective request-host %s does not domain-match " + "%s", erhn, domain) + return False + if (cookie.version > 0 or + (self.strict_ns_domain & self.DomainStrictNoDots)): + host_prefix = req_host[:-len(domain)] + if (host_prefix.find(".") >= 0 and + not IPV4_RE.search(req_host)): + debug(" host prefix %s for domain %s contains a dot", + host_prefix, domain) + return False + return True + + def set_ok_port(self, cookie, request): + if cookie.port_specified: + req_port = request_port(request) + if req_port is None: + req_port = "80" + else: + req_port = str(req_port) + for p in cookie.port.split(","): + try: + int(p) + except ValueError: + debug(" bad port %s (not numeric)", p) + return False + if p == req_port: + break + else: + debug(" request port (%s) not found in %s", + req_port, cookie.port) + return False + return True + + def return_ok(self, cookie, request): + """ + If you override return_ok, be sure to call this method. If it returns + false, so should your subclass (assuming your subclass wants to be more + strict about which cookies to return). + + """ + # Path has already been checked by path_return_ok, and domain blocking + # done by domain_return_ok. 
+ debug(" - checking cookie %s", cookie) + + for n in ("version", "verifiability", "secure", "expires", "port", + "domain"): + fn_name = "return_ok_"+n + fn = getattr(self, fn_name) + if not fn(cookie, request): + return False + return True + + def return_ok_version(self, cookie, request): + if cookie.version > 0 and not self.rfc2965: + debug(" RFC 2965 cookies are switched off") + return False + elif cookie.version == 0 and not self.netscape: + debug(" Netscape cookies are switched off") + return False + return True + + def return_ok_verifiability(self, cookie, request): + if request_is_unverifiable(request) and is_third_party(request): + if cookie.version > 0 and self.strict_rfc2965_unverifiable: + debug(" third-party RFC 2965 cookie during unverifiable " + "transaction") + return False + elif cookie.version == 0 and self.strict_ns_unverifiable: + debug(" third-party Netscape cookie during unverifiable " + "transaction") + return False + return True + + def return_ok_secure(self, cookie, request): + if cookie.secure and request.get_type() != "https": + debug(" secure cookie with non-secure request") + return False + return True + + def return_ok_expires(self, cookie, request): + if cookie.is_expired(self._now): + debug(" cookie expired") + return False + return True + + def return_ok_port(self, cookie, request): + if cookie.port: + req_port = request_port(request) + if req_port is None: + req_port = "80" + for p in cookie.port.split(","): + if p == req_port: + break + else: + debug(" request port %s does not match cookie port %s", + req_port, cookie.port) + return False + return True + + def return_ok_domain(self, cookie, request): + req_host, erhn = eff_request_host_lc(request) + domain = cookie.domain + + # strict check of non-domain cookies: Mozilla does this, MSIE5 doesn't + if (cookie.version == 0 and + (self.strict_ns_domain & self.DomainStrictNonDomain) and + not cookie.domain_specified and domain != erhn): + debug(" cookie with unspecified domain does not 
def vals_sorted_by_key(adict):
    """Return a list of adict's values, ordered by their keys.

    FIX: the previous implementation called ``adict.keys().sort()`` and
    ``map(...)``, which only works on Python 2 (where dict.keys() returns
    a list and map() returns a list).  Using sorted() gives the identical
    result on Python 2 and also works on Python 3.
    """
    return [adict[key] for key in sorted(adict)]
try: + vals, i, prev_item = self._s.pop() + except IndexError: + raise StopIteration() + if i < len(vals): + item = vals[i] + i = i + 1 + self._s.append((vals, i, prev_item)) + try: + item.items + except AttributeError: + # non-mapping + break + else: + # mapping + self._s.append((vals_sorted_by_key(item), 0, item)) + continue + return item + + +# Used as second parameter to dict.get method, to distinguish absent +# dict key from one with a None value. +class Absent: pass + +class CookieJar: + """Collection of HTTP cookies. + + You may not need to know about this class: try mechanize.urlopen(). + + The major methods are extract_cookies and add_cookie_header; these are all + you are likely to need. + + CookieJar supports the iterator protocol: + + for cookie in cookiejar: + # do something with cookie + + Methods: + + add_cookie_header(request) + extract_cookies(response, request) + get_policy() + set_policy(policy) + cookies_for_request(request) + make_cookies(response, request) + set_cookie_if_ok(cookie, request) + set_cookie(cookie) + clear_session_cookies() + clear_expired_cookies() + clear(domain=None, path=None, name=None) + + Public attributes + + policy: CookiePolicy object + + """ + + non_word_re = re.compile(r"\W") + quote_re = re.compile(r"([\"\\])") + strict_domain_re = re.compile(r"\.?[^.]*") + domain_re = re.compile(r"[^.]*") + dots_re = re.compile(r"^\.+") + + def __init__(self, policy=None): + """ + See CookieJar.__doc__ for argument documentation. 
+ + """ + if policy is None: + policy = DefaultCookiePolicy() + self._policy = policy + + self._cookies = {} + + # for __getitem__ iteration in pre-2.2 Pythons + self._prev_getitem_index = 0 + + def get_policy(self): + return self._policy + + def set_policy(self, policy): + self._policy = policy + + def _cookies_for_domain(self, domain, request): + cookies = [] + if not self._policy.domain_return_ok(domain, request): + return [] + debug("Checking %s for cookies to return", domain) + cookies_by_path = self._cookies[domain] + for path in cookies_by_path.keys(): + if not self._policy.path_return_ok(path, request): + continue + cookies_by_name = cookies_by_path[path] + for cookie in cookies_by_name.values(): + if not self._policy.return_ok(cookie, request): + debug(" not returning cookie") + continue + debug(" it's a match") + cookies.append(cookie) + return cookies + + def cookies_for_request(self, request): + """Return a list of cookies to be returned to server. + + The returned list of cookie instances is sorted in the order they + should appear in the Cookie: header for return to the server. + + See add_cookie_header.__doc__ for the interface required of the + request argument. + + New in version 0.1.10 + + """ + self._policy._now = self._now = int(time.time()) + cookies = self._cookies_for_request(request) + # add cookies in order of most specific (i.e. 
longest) path first + def decreasing_size(a, b): return cmp(len(b.path), len(a.path)) + cookies.sort(decreasing_size) + return cookies + + def _cookies_for_request(self, request): + """Return a list of cookies to be returned to server.""" + # this method still exists (alongside cookies_for_request) because it + # is part of an implied protected interface for subclasses of cookiejar + # XXX document that implied interface, or provide another way of + # implementing cookiejars than subclassing + cookies = [] + for domain in self._cookies.keys(): + cookies.extend(self._cookies_for_domain(domain, request)) + return cookies + + def _cookie_attrs(self, cookies): + """Return a list of cookie-attributes to be returned to server. + + The $Version attribute is also added when appropriate (currently only + once per request). + + >>> jar = CookieJar() + >>> ns_cookie = Cookie(0, "foo", '"bar"', None, False, + ... "example.com", False, False, + ... "/", False, False, None, True, + ... None, None, {}) + >>> jar._cookie_attrs([ns_cookie]) + ['foo="bar"'] + >>> rfc2965_cookie = Cookie(1, "foo", "bar", None, False, + ... ".example.com", True, False, + ... "/", False, False, None, True, + ... None, None, {}) + >>> jar._cookie_attrs([rfc2965_cookie]) + ['$Version=1', 'foo=bar', '$Domain="example.com"'] + + """ + version_set = False + + attrs = [] + for cookie in cookies: + # set version of Cookie header + # XXX + # What should it be if multiple matching Set-Cookie headers have + # different versions themselves? + # Answer: there is no answer; was supposed to be settled by + # RFC 2965 errata, but that may never appear... 
+ version = cookie.version + if not version_set: + version_set = True + if version > 0: + attrs.append("$Version=%s" % version) + + # quote cookie value if necessary + # (not for Netscape protocol, which already has any quotes + # intact, due to the poorly-specified Netscape Cookie: syntax) + if ((cookie.value is not None) and + self.non_word_re.search(cookie.value) and version > 0): + value = self.quote_re.sub(r"\\\1", cookie.value) + else: + value = cookie.value + + # add cookie-attributes to be returned in Cookie header + if cookie.value is None: + attrs.append(cookie.name) + else: + attrs.append("%s=%s" % (cookie.name, value)) + if version > 0: + if cookie.path_specified: + attrs.append('$Path="%s"' % cookie.path) + if cookie.domain.startswith("."): + domain = cookie.domain + if (not cookie.domain_initial_dot and + domain.startswith(".")): + domain = domain[1:] + attrs.append('$Domain="%s"' % domain) + if cookie.port is not None: + p = "$Port" + if cookie.port_specified: + p = p + ('="%s"' % cookie.port) + attrs.append(p) + + return attrs + + def add_cookie_header(self, request): + """Add correct Cookie: header to request (mechanize.Request object). + + The Cookie2 header is also added unless policy.hide_cookie2 is true. + + The request object (usually a mechanize.Request instance) must support + the methods get_full_url, get_host, is_unverifiable, get_type, + has_header, get_header, header_items and add_unredirected_header, as + documented by urllib2. 
+ """ + debug("add_cookie_header") + cookies = self.cookies_for_request(request) + + attrs = self._cookie_attrs(cookies) + if attrs: + if not request.has_header("Cookie"): + request.add_unredirected_header("Cookie", "; ".join(attrs)) + + # if necessary, advertise that we know RFC 2965 + if self._policy.rfc2965 and not self._policy.hide_cookie2: + for cookie in cookies: + if cookie.version != 1 and not request.has_header("Cookie2"): + request.add_unredirected_header("Cookie2", '$Version="1"') + break + + self.clear_expired_cookies() + + def _normalized_cookie_tuples(self, attrs_set): + """Return list of tuples containing normalised cookie information. + + attrs_set is the list of lists of key,value pairs extracted from + the Set-Cookie or Set-Cookie2 headers. + + Tuples are name, value, standard, rest, where name and value are the + cookie name and value, standard is a dictionary containing the standard + cookie-attributes (discard, secure, version, expires or max-age, + domain, path and port) and rest is a dictionary containing the rest of + the cookie-attributes. + + """ + cookie_tuples = [] + + boolean_attrs = "discard", "secure" + value_attrs = ("version", + "expires", "max-age", + "domain", "path", "port", + "comment", "commenturl") + + for cookie_attrs in attrs_set: + name, value = cookie_attrs[0] + + # Build dictionary of standard cookie-attributes (standard) and + # dictionary of other cookie-attributes (rest). + + # Note: expiry time is normalised to seconds since epoch. V0 + # cookies should have the Expires cookie-attribute, and V1 cookies + # should have Max-Age, but since V1 includes RFC 2109 cookies (and + # since V0 cookies may be a mish-mash of Netscape and RFC 2109), we + # accept either (but prefer Max-Age). 
+ max_age_set = False + + bad_cookie = False + + standard = {} + rest = {} + for k, v in cookie_attrs[1:]: + lc = k.lower() + # don't lose case distinction for unknown fields + if lc in value_attrs or lc in boolean_attrs: + k = lc + if k in boolean_attrs and v is None: + # boolean cookie-attribute is present, but has no value + # (like "discard", rather than "port=80") + v = True + if standard.has_key(k): + # only first value is significant + continue + if k == "domain": + if v is None: + debug(" missing value for domain attribute") + bad_cookie = True + break + # RFC 2965 section 3.3.3 + v = v.lower() + if k == "expires": + if max_age_set: + # Prefer max-age to expires (like Mozilla) + continue + if v is None: + debug(" missing or invalid value for expires " + "attribute: treating as session cookie") + continue + if k == "max-age": + max_age_set = True + if v is None: + debug(" missing value for max-age attribute") + bad_cookie = True + break + try: + v = int(v) + except ValueError: + debug(" missing or invalid (non-numeric) value for " + "max-age attribute") + bad_cookie = True + break + # convert RFC 2965 Max-Age to seconds since epoch + # XXX Strictly you're supposed to follow RFC 2616 + # age-calculation rules. Remember that zero Max-Age is a + # is a request to discard (old and new) cookie, though. 
+ k = "expires" + v = self._now + v + if (k in value_attrs) or (k in boolean_attrs): + if (v is None and + k not in ["port", "comment", "commenturl"]): + debug(" missing value for %s attribute" % k) + bad_cookie = True + break + standard[k] = v + else: + rest[k] = v + + if bad_cookie: + continue + + cookie_tuples.append((name, value, standard, rest)) + + return cookie_tuples + + def _cookie_from_cookie_tuple(self, tup, request): + # standard is dict of standard cookie-attributes, rest is dict of the + # rest of them + name, value, standard, rest = tup + + domain = standard.get("domain", Absent) + path = standard.get("path", Absent) + port = standard.get("port", Absent) + expires = standard.get("expires", Absent) + + # set the easy defaults + version = standard.get("version", None) + if version is not None: + try: + version = int(version) + except ValueError: + return None # invalid version, ignore cookie + secure = standard.get("secure", False) + # (discard is also set if expires is Absent) + discard = standard.get("discard", False) + comment = standard.get("comment", None) + comment_url = standard.get("commenturl", None) + + # set default path + if path is not Absent and path != "": + path_specified = True + path = escape_path(path) + else: + path_specified = False + path = request_path(request) + i = path.rfind("/") + if i != -1: + if version == 0: + # Netscape spec parts company from reality here + path = path[:i] + else: + path = path[:i+1] + if len(path) == 0: path = "/" + + # set default domain + domain_specified = domain is not Absent + # but first we have to remember whether it starts with a dot + domain_initial_dot = False + if domain_specified: + domain_initial_dot = bool(domain.startswith(".")) + if domain is Absent: + req_host, erhn = eff_request_host_lc(request) + domain = erhn + elif not domain.startswith("."): + domain = "."+domain + + # set default port + port_specified = False + if port is not Absent: + if port is None: + # Port attr present, but 
has no value: default to request port. + # Cookie should then only be sent back on that port. + port = request_port(request) + else: + port_specified = True + port = re.sub(r"\s+", "", port) + else: + # No port attr present. Cookie can be sent back on any port. + port = None + + # set default expires and discard + if expires is Absent: + expires = None + discard = True + + return Cookie(version, + name, value, + port, port_specified, + domain, domain_specified, domain_initial_dot, + path, path_specified, + secure, + expires, + discard, + comment, + comment_url, + rest) + + def _cookies_from_attrs_set(self, attrs_set, request): + cookie_tuples = self._normalized_cookie_tuples(attrs_set) + + cookies = [] + for tup in cookie_tuples: + cookie = self._cookie_from_cookie_tuple(tup, request) + if cookie: cookies.append(cookie) + return cookies + + def _process_rfc2109_cookies(self, cookies): + if self._policy.rfc2109_as_netscape is None: + rfc2109_as_netscape = not self._policy.rfc2965 + else: + rfc2109_as_netscape = self._policy.rfc2109_as_netscape + for cookie in cookies: + if cookie.version == 1: + cookie.rfc2109 = True + if rfc2109_as_netscape: + # treat 2109 cookies as Netscape cookies rather than + # as RFC2965 cookies + cookie.version = 0 + + def _make_cookies(self, response, request): + # get cookie-attributes for RFC 2965 and Netscape protocols + headers = response.info() + rfc2965_hdrs = headers.getheaders("Set-Cookie2") + ns_hdrs = headers.getheaders("Set-Cookie") + + rfc2965 = self._policy.rfc2965 + netscape = self._policy.netscape + + if ((not rfc2965_hdrs and not ns_hdrs) or + (not ns_hdrs and not rfc2965) or + (not rfc2965_hdrs and not netscape) or + (not netscape and not rfc2965)): + return [] # no relevant cookie headers: quick exit + + try: + cookies = self._cookies_from_attrs_set( + split_header_words(rfc2965_hdrs), request) + except: + reraise_unmasked_exceptions() + cookies = [] + + if ns_hdrs and netscape: + try: + # RFC 2109 and Netscape cookies + 
ns_cookies = self._cookies_from_attrs_set( + parse_ns_headers(ns_hdrs), request) + except: + reraise_unmasked_exceptions() + ns_cookies = [] + self._process_rfc2109_cookies(ns_cookies) + + # Look for Netscape cookies (from Set-Cookie headers) that match + # corresponding RFC 2965 cookies (from Set-Cookie2 headers). + # For each match, keep the RFC 2965 cookie and ignore the Netscape + # cookie (RFC 2965 section 9.1). Actually, RFC 2109 cookies are + # bundled in with the Netscape cookies for this purpose, which is + # reasonable behaviour. + if rfc2965: + lookup = {} + for cookie in cookies: + lookup[(cookie.domain, cookie.path, cookie.name)] = None + + def no_matching_rfc2965(ns_cookie, lookup=lookup): + key = ns_cookie.domain, ns_cookie.path, ns_cookie.name + return not lookup.has_key(key) + ns_cookies = filter(no_matching_rfc2965, ns_cookies) + + if ns_cookies: + cookies.extend(ns_cookies) + + return cookies + + def make_cookies(self, response, request): + """Return sequence of Cookie objects extracted from response object. + + See extract_cookies.__doc__ for the interface required of the + response and request arguments. + + """ + self._policy._now = self._now = int(time.time()) + return [cookie for cookie in self._make_cookies(response, request) + if cookie.expires is None or not cookie.expires <= self._now] + + def set_cookie_if_ok(self, cookie, request): + """Set a cookie if policy says it's OK to do so. + + cookie: mechanize.Cookie instance + request: see extract_cookies.__doc__ for the required interface + + """ + self._policy._now = self._now = int(time.time()) + + if self._policy.set_ok(cookie, request): + self.set_cookie(cookie) + + def set_cookie(self, cookie): + """Set a cookie, without checking whether or not it should be set. 
+ + cookie: mechanize.Cookie instance + """ + c = self._cookies + if not c.has_key(cookie.domain): c[cookie.domain] = {} + c2 = c[cookie.domain] + if not c2.has_key(cookie.path): c2[cookie.path] = {} + c3 = c2[cookie.path] + c3[cookie.name] = cookie + + def extract_cookies(self, response, request): + """Extract cookies from response, where allowable given the request. + + Look for allowable Set-Cookie: and Set-Cookie2: headers in the response + object passed as argument. Any of these headers that are found are + used to update the state of the object (subject to the policy.set_ok + method's approval). + + The response object (usually be the result of a call to + mechanize.urlopen, or similar) should support an info method, which + returns a mimetools.Message object (in fact, the 'mimetools.Message + object' may be any object that provides a getheaders method). + + The request object (usually a mechanize.Request instance) must support + the methods get_full_url, get_type, get_host, and is_unverifiable, as + documented by mechanize, and the port attribute (the port number). The + request is used to set default values for cookie-attributes as well as + for checking that the cookie is OK to be set. + + """ + debug("extract_cookies: %s", response.info()) + self._policy._now = self._now = int(time.time()) + + for cookie in self._make_cookies(response, request): + if cookie.expires is not None and cookie.expires <= self._now: + # Expiry date in past is request to delete cookie. This can't be + # in DefaultCookiePolicy, because can't delete cookies there. + try: + self.clear(cookie.domain, cookie.path, cookie.name) + except KeyError: + pass + debug("Expiring cookie, domain='%s', path='%s', name='%s'", + cookie.domain, cookie.path, cookie.name) + elif self._policy.set_ok(cookie, request): + debug(" setting cookie: %s", cookie) + self.set_cookie(cookie) + + def clear(self, domain=None, path=None, name=None): + """Clear some cookies. 
+ + Invoking this method without arguments will clear all cookies. If + given a single argument, only cookies belonging to that domain will be + removed. If given two arguments, cookies belonging to the specified + path within that domain are removed. If given three arguments, then + the cookie with the specified name, path and domain is removed. + + Raises KeyError if no matching cookie exists. + + """ + if name is not None: + if (domain is None) or (path is None): + raise ValueError( + "domain and path must be given to remove a cookie by name") + del self._cookies[domain][path][name] + elif path is not None: + if domain is None: + raise ValueError( + "domain must be given to remove cookies by path") + del self._cookies[domain][path] + elif domain is not None: + del self._cookies[domain] + else: + self._cookies = {} + + def clear_session_cookies(self): + """Discard all session cookies. + + Discards all cookies held by object which had either no Max-Age or + Expires cookie-attribute or an explicit Discard cookie-attribute, or + which otherwise have ended up with a true discard attribute. For + interactive browsers, the end of a session usually corresponds to + closing the browser window. + + Note that the save method won't save session cookies anyway, unless you + ask otherwise by passing a true ignore_discard argument. + + """ + for cookie in self: + if cookie.discard: + self.clear(cookie.domain, cookie.path, cookie.name) + + def clear_expired_cookies(self): + """Discard all expired cookies. + + You probably don't need to call this method: expired cookies are never + sent back to the server (provided you're using DefaultCookiePolicy), + this method is called by CookieJar itself every so often, and the save + method won't save expired cookies anyway (unless you ask otherwise by + passing a true ignore_expires argument). 
+ + """ + now = time.time() + for cookie in self: + if cookie.is_expired(now): + self.clear(cookie.domain, cookie.path, cookie.name) + + def __getitem__(self, i): + if i == 0: + self._getitem_iterator = self.__iter__() + elif self._prev_getitem_index != i-1: raise IndexError( + "CookieJar.__getitem__ only supports sequential iteration") + self._prev_getitem_index = i + try: + return self._getitem_iterator.next() + except StopIteration: + raise IndexError() + + def __iter__(self): + return MappingIterator(self._cookies) + + def __len__(self): + """Return number of contained cookies.""" + i = 0 + for cookie in self: i = i + 1 + return i + + def __repr__(self): + r = [] + for cookie in self: r.append(repr(cookie)) + return "<%s[%s]>" % (self.__class__, ", ".join(r)) + + def __str__(self): + r = [] + for cookie in self: r.append(str(cookie)) + return "<%s[%s]>" % (self.__class__, ", ".join(r)) + + +class LoadError(Exception): pass + +class FileCookieJar(CookieJar): + """CookieJar that can be loaded from and saved to a file. + + Additional methods + + save(filename=None, ignore_discard=False, ignore_expires=False) + load(filename=None, ignore_discard=False, ignore_expires=False) + revert(filename=None, ignore_discard=False, ignore_expires=False) + + Additional public attributes + + filename: filename for loading and saving cookies + + Additional public readable attributes + + delayload: request that cookies are lazily loaded from disk; this is only + a hint since this only affects performance, not behaviour (unless the + cookies on disk are changing); a CookieJar object may ignore it (in fact, + only MSIECookieJar lazily loads cookies at the moment) + + """ + + def __init__(self, filename=None, delayload=False, policy=None): + """ + See FileCookieJar.__doc__ for argument documentation. + + Cookies are NOT loaded from the named file until either the load or + revert method is called. 
+ + """ + CookieJar.__init__(self, policy) + if filename is not None and not isstringlike(filename): + raise ValueError("filename must be string-like") + self.filename = filename + self.delayload = bool(delayload) + + def save(self, filename=None, ignore_discard=False, ignore_expires=False): + """Save cookies to a file. + + filename: name of file in which to save cookies + ignore_discard: save even cookies set to be discarded + ignore_expires: save even cookies that have expired + + The file is overwritten if it already exists, thus wiping all its + cookies. Saved cookies can be restored later using the load or revert + methods. If filename is not specified, self.filename is used; if + self.filename is None, ValueError is raised. + + """ + raise NotImplementedError() + + def load(self, filename=None, ignore_discard=False, ignore_expires=False): + """Load cookies from a file. + + Old cookies are kept unless overwritten by newly loaded ones. + + Arguments are as for .save(). + + If filename is not specified, self.filename is used; if self.filename + is None, ValueError is raised. The named file must be in the format + understood by the class, or LoadError will be raised. This format will + be identical to that written by the save method, unless the load format + is not sufficiently well understood (as is the case for MSIECookieJar). + + """ + if filename is None: + if self.filename is not None: filename = self.filename + else: raise ValueError(MISSING_FILENAME_TEXT) + + f = open(filename) + try: + self._really_load(f, filename, ignore_discard, ignore_expires) + finally: + f.close() + + def revert(self, filename=None, + ignore_discard=False, ignore_expires=False): + """Clear all cookies and reload cookies from a saved file. + + Raises LoadError (or IOError) if reversion is not successful; the + object's state will not be altered if this happens. 
+ + """ + if filename is None: + if self.filename is not None: filename = self.filename + else: raise ValueError(MISSING_FILENAME_TEXT) + + old_state = copy.deepcopy(self._cookies) + self._cookies = {} + try: + self.load(filename, ignore_discard, ignore_expires) + except (LoadError, IOError): + self._cookies = old_state + raise diff --git a/mechanize/_clientcookie.pyc b/mechanize/_clientcookie.pyc new file mode 100644 index 0000000000000000000000000000000000000000..7cf573b12122a08fda6371525d439e5adc6669ca GIT binary patch literal 59678 zcmeIb3zVE!a^Ls$3^2e91`h(ni;u+zkRS#mFeE`Pmjnqe2QLy_a)|8#2?_+a)6@OU zpn;k0L4Q32h6dNxlE*7WwxUf|)@xVVwI!`&%a$W6QKBf3>^vm-#EulpCy%2@i5;I~ zCsvM~96NTRoXziFb-zdV0Nh=2Hcrk7fWG)1_q&g}b*t)D)vY`Hzil2n{r8`Evk`?q z>-ql$e(4VmMNu<~E|FHE=1{ayi7s*B*F({zq3n7%x-^_!uZu3N%dXc)m)2+3BhjUi z?0Q3VX+yNKH@n>!UD_BWq@&SW8>5TuhoW`KSafYH>i$GjsZ>IRo1#mbqJ)%Oo1%6l zdb3Hr<580uH(OEbqlGQe;@0TW)@X5CbZJ{;pN>QecSVcaqf6WUo__3zF71et4bj!R zqvnR_{YrFoXLM<2G)G4_Mjz12T~RU=UELiuNA>p7o~St%eE=kA@+P~uCu)w{#l2B; zvt8VmeX_+qxj$-dwdbFRn%nGRU(~$IF5;-U-7fY=%^h}eAZp%i7Y{_uop$kH)ZAqk z2czch=+Z-3;d`v`Ls4_Dy`6}f_t?eZsClnlJRCLevx`Tf6@Ypqs$6W}13Zr!JpT}I zR+{%)h9eHcPguA2>CyX9^gWj*qh@T6_UDftjhY9d<3^`*t(C-A>h0!2t34mjb-MA5WHxRrw32o&og5pxaHW;Ti=F23LK1hn zt@&2FzOb+oHqYJAV9> z)t4UeuX5}K{FN9D_{ z^vW3DE&`9F-lf~t+4C#CE1h;+d_#ldJN)quzg+Lu_;bfxkJ{IlSC?_NwUFdR`0Lx` zitYPbyf0p6SFd*7Z7nR+i$ZS2uUtG|+-CV!)ve;ne+KXKcY1O3=+Rea&Yd|~Pm}nL z8TH0{8uHvZCmEDeWu87+H1_1o>C)|Oz?p}oAM!;h`{U0%^QogG&lJ-kp2((8 zAI=MP50}2oDtf{yYOh$+ici%QpI82&&ridD=D}Q-ex#kb-i>-Uc{X0@EXR#{o0%bm z_W2OEdYVX&GSQjUt+^E*j4ijDnmWvqUeaC6rwrg`&c~O|zY-ste4J)Cl6K0Bnn#j@lzw>s^^z_j&E{!FKzg$&j0`*wXX ziEl6!+nrvVE-x)XdYbV>Ta<{WswN*h%1?Xsb|c}U)oaxkT4`2Sr=1+`!EU3cADhsB zz3n1&6xORf43$IFe;tzl(+e<dT)JnYkhQe zOdr2D65U!CUEQQs+#1oh75=-r+1_o4Zo!MUL`}-w5bbJG#(KCQd@OywFMFWmLNQ_m83UHk^_mxM-jN!7Uk~g{p&MlYA2_vXKH8O zJay*$g`QygnPg?QQ|~s0)ApD~(z=4Pwc+eD^Ulf_PVOFkH*CjfFa+rgrqY^+qyFBQ58QF_3ew*{da(Nw3Do0H6MXEtY3K>Ah25 
zSWfJz8a|;q?A3C6v7TN_npH`2RY8eCqyFB?Qc~TbXV!nA!<@BnF`b{_CbA@blB?H` zUv^$``D7PX^ybXWQOcy#mve|+t~sl9?3!F!>D|kR-K1LwwQE@$)bM1f#}rDR;&r-* zBwDwpGFlm_?5J$3Y^{t8?Wv4&HC{>8)XS5Xn~ywbxT*%K;fAZnx#&gF)rw(QC0cq~ z7ogX*q3jwo1W}6nb=m#+%!JBGA10|Lz2$B@zR+D}D#BkA&3!m@e5I52;$~;D4r|uT zo}@OWm6D!XGHICRGjs#@=gz;)3g$$Pit1s=(~0Lf=8PA zCfo~-m+g8;pN=!bnckf*SjmX?!^+LA3WIK}i}rCJ0R4Z_ya7N5$W8?igHFB16)19e ze)33MZ#1BDlI7rT)w8GKW6wPG$snLinFw_ zSysxCI87Qt!uavyF7dpi5;V*RQt%ndOqY^IYfcnAfdjx)AmO|zN>v^8n6U^#+@vw< zna*O;YpI+Yi3C0FGaOdgJ7Ywgq_IE`42gDi)j0plbuTXQ98?%`MrsMU2?*>{=j>Hix-I*GAw<;A=?Ri}5src7r}8@w8`dciKmj#iia#pBPiB`$&8eiQQSF z6)}185Ce~NeDcs>eFmw{k-nmCPOeD88HB*(_t;Z}W@E*6H+k5*Gu`QleZMI}ucK{{z^%Xkhz ztqGvD_Uu~F9SIcP=bFva*SSx3kwm-4p+4i4b@tm@**i1>32&%T`iUwTH4>L8Q)?)& z-lt8H0m4vF;1q^=v@>^9A`BxlILQNO&uSVrjH$HeADvz3G_HwecyqIfZ)CbEhE%jJnW^b+RB;CO;)fgVn*H8$Zo=-$es}&F_|X146vZlX}^=Sr8F^8=UhR2DI;Z? zxJInRg{phGo1TyWWXlf3(AwPc!a{AiyI_M_>&`Wvc;@M+ToUar*g~c%_!~J>eqYtt zFgKQaHK|`}v{T?kD4x*&Jyj68ZZA~4U@tJN#T*ue3+2~T=8tfnj*&zg#u@wZ%FdzH zR81=|sG84MB+>V^3K4jwo!)$kOGA05H&1fA7HyTZUT$k`q46_3a)RT;S1{A>l%Bsr zk~ec5>pI6p8Vq7WJitN(y6#Dlg{y{qJRIjw}|9G-MOs3o+Cv6Im4`ht~Q71(TW#q>%Wgs~oIsE+Cw9=i8!|(-#Zc| z7kbqDs`|aQb3(#M-k!eU`)j=Mso^tNkk;>%o}VG1oB7HJsvRbQ&kXvWb5}9jACU&IsHzisN|Y!bX)J$Gjq38iR66g_H`zdEUVF+lV+ z^`)eyVylmGUwur;10+_p&5PO4r$Kyi>(?ia)G_Q>ZiI+XY zjd2$=EXJMQkVeHdH|m|K?45Usft@~ccIx$4F4UxtubqFbdcnz;C{$I-jp{=rJ{+*L zH6ISj{BL>teYh}YwDyo0*(Pxjp@X(1z9z3%D^A8`IIGjqb zIXLmwV@IEzdiL$o<>4H%IVDl0kV=QF(+Q{fyPP&m8@)Yv@L=k6&?dUZWxm@% zai}Xa#!1&eVYjx@S_`ZZZM3o_<<*>!G;3&syZ?XmWN1dS2s$b%0mgaMh*fGIYSj^jFz#^M`hne5N$PMwWi)cp@`a| z?G~=rtFO2&Aa8q&!7ldpr;{d(FZbq-K0P5hEof0QwYou23eME1=j-j3EDINSU<$gV zty{Z9l4S8_`L8dd958(KDLr-JrP|EY*)xXLXJ$@Koj-HBcIu_6Dtf-^6I5E2)bGeU zr7M0Pb1_+cfg*mNpHtbf?J#KlHH6d9h|(IBy~ahrHse*u98goqa!}TJz0$2wR%2H1 zxG-wAIML8lEMxW&>p;n3&7R+uCuF2`5y#L`Nt zwy=P>VA>IVT-E1cVQ*TYm$V?s7BsAF<~j>3Ptu^H!;JQD9G@#|wc?{F!}O8l5o?Gq z=v*p{r+IZ(U4-Cnleme5%0c zc&Z(j*=3CBr=3CyrJWXHcbFl#26UT8Q6=|Q9Lup(wD2RyL`oYdp^8m`*Fi`0&(df5 
zYzR8e0wZkV`E8+1h+h-wAo`WKSFeMx5HX#$a! zHETd0jXJ~B4I)UxKx8Xe;ftBE@nNHrs#QOulTta|+@oPS{q{Dk%ow(GgT`{^`iAHt z7w8`JQ+Sb8-%rn&lF7T*BO99xk(3H{$7} zmvQ=LuAE{~Waj7c=SKBih1yGSPN~NTmMd}ET9jJB3P=5at`}cj7O&|nG_l2GB4ui? z4yA^+OPw@rF;<#4XtE%w%Z*+zyMx|ta~HcX13>#w&(>&EnVb2&c~+QmH!<(32JbWO zFO!S)Ye_A1A|u^%OjwM(@!WFTXRb79{)v^;nZ1fCB*nW#UJK0x#lF|*<;1e%qqT-g z$ZcT76xhjJeR-h=o1qa_2#y4cauf(AF`vHy;;!dIrP=gJoQ+A|^79>>9+0;jObinX zQ-_o({jMagHmA@>`^=stb4L?ANCB=4QB73jrD@=Hd~Fl+@oO9 zIa0*(xF~!ulF9kW_`rF-K5!()rbD-^2||Q+oMbOgJpI^#!}_D^$mV3e>^a^oiwo8;y<@Pz>Wdy@0)ZHAKJ6? zWH3M;mN-f7bgFg?fV4hT$7^qHs%?Gt;pPEme~*>s$)J8COm2TKOj zOYS|%QU^(QBC|Ya)GwP)C_Nj?3!tw<$N=GZU=DhkNw3y#9-W$J-em;Ofr2|y1KN}i z2~f;@801LEzdB8U8X`wH&O94WAU8;q^)@@g1&gZHTugG6<@F#L%4xA5{^6mt{QWP` z_8)uvv1jsD75E(}UA@~yBWzTv9?vAbqo<4=9G9O|!0_VfvEzqhMoHk!{0sskN0Tr}s#5FnBVIAXCaMZdLyMmR|@FEPI?=8Jihg@>itU7gB{>ziIyoMXQ=D4>sB2Q0MffD}Ec zD@zDs?*d*}+Yt}bUPE4O`VI3t#V;L28q$(Aimr`D-Ft(C%_0?g$%?i3~O)8kqn{SuhHfClJ^T(UV>3OVF~VM*$4TVmKL4!x5zz%y>~U5!7l<-U1WS zLz#-QDou+?vemP?no}~b&P6&~VzLG+CItbA{h=T~?-$#wuM40bD|N4evXADr;{`3ZLXbw0S&Y z5y)>(W$Vx=K56!MXlrG3eSVieZDgyAe5D~ZA|pum(ENeP3~1ldLNKz)An%N8$%-2Z z-{Gl^%Ubm7ZVFTBXuLDwwI;stMfFh)jVcXIxa?;WZwC*`?PhwgmR;R+eT1&u?G1Ue zoea3Jx>j9~Mt-mh-~VA<&?3oaHP8q2&)FQ}3rw*}9Y589YTj_02W!=0gr8L{{((}9 z)rjn8K;O)R9O&CGAT{#bdCK|)iN&f=%m5YTrW5r`9&$!iiaB70mv-mIT6A)ZDo_$Q zqiX(;dM3xn$R>9_an&UK(wGF0GfBRP3oIR}M1MMpzR%|IX=braWWi**p+VA>6i83- zGYZW(zWspM6oq`BPyHdgHlQ(=(5Svqxehlf##sf~)B)EuLtWjVq9-3SMMK@6s3MbI z3~>!OAK4=}9i_16=94yn=|2C%*?^C@uA@rpq{D|YszGTnG*!S7kWUTtDWqo~xg}mr z4K-2i%ULrZO)5S68Qw{h0J=h@T)Q^y7}DR8wZIzddaa$*YClqHlLksCD@?IA-Sjp= zXaJZ54Yq{2kb?5W^?~@sfeo78HVt;`N{$r%)}<9>oVxV$dS^lh_GNHy7v{{ii55KN z+D)F&HoXf*PIJ1cT4RJB2h$S=(^Sm&ITxnmgHoM$`R{pmyE0}k8d;^)fxBPPmE-|e z=83)--nlJmp|yzny6{CJr%9V@0pbl5SgWy4uGM~AFc!;^5mcG+r?*rok16nd_D7$D@lO!(DwUUV9tfY|q}T6Me8Z=-*JgX*7*y zFkGUx(`zURnbFI~TcDIoq_G#h3lesA>zFWcFgrM5E$ zS;%x(JFRvJFO;j^olgy4>>DFlJpGvFE_V@QL9hfy{IoD)mkLE0I?hj<0`?+)?XTR! 
z-w7cwn%gJ}Dhk7&q&Kt&FB8G%EsR)V*5GjGMKZZ(iiqr%|)hsMV_9 zM`m}V6k)2qtHh={X4k0tc_n{J$@@yaU&+6&2zL!(2Z!{h5lhxt2Z9t4i1uA6M|nf(#PJ;^UUMnb8D#=Bz5YNq~> zT|fYa?E*5e&MroztQJj7H7wC*JH}hlJ4?VwivLo z>SUYD8UyNURGnexPE37L*_JyMteI}0%!IDE%N|rG(V2?eR0qQ}G7X!IU;2U@k{7XIgaWaHI^8pxU`s>N0m&GNqVM z7Skh#ZF50aY*tPb+=;I=4HmAZrade>%|@%DMH76ds!XG%mDOW|zV&aZ+v#;09dzG? zqDd-iQ*oo#zk#;;0-Hdj5wSfOs02~?02NwI@m#LN5*=M<#opEBd9-}FnwlI^&TB8M z`h%mtc1_SkZR|DOPG|rNYI-Hb&^xWRc9v&*IIji>o?o6tBUOq9lr!eTyDB4Evzk_JS?N{+YUPVl)oC^*9tSeg>2>GvWTNdi z*b6JKiT1p9GspAjzHfl3G7N^T3=R3jo2IST$qF+ywGu%i?EcNMGHhCQZ5O2=LM-n0 zFVCT{LL(?0V*-?|7c@9%)M%Cv+(v_fF{aoikPIf-jIqJ>^04u1rl_qFR7@F`qmS$l z=rg1T(}ohw-9=R8RAH4%znE3*rBQnh*H9ViIi^_YdA>26D&=L)W}_wuFc(jF4VYyx z>3eyotwRG3J8DuvHc_*h3nPIW>aQ?AgH`C*aa7wfEwTM57=exj-J!Y`jJ9k(b(ECF zjLFrR^U4htL4RJ@Hu|h8z>7h+k4 zufaLz4Kf7xj!c<6q3LG|@z&ho(D~Bz^{oPc(&kv7(MM=@YPs75TNhT)T6L47VL_I| zu&qrpMv$%7bM>w)WCDxRP|R)KABu7lV>8jbW(a-Hlv8v};V7KKv{kjwm~ zb~{{NW*Ca&;dwEkMYq(9g7$$B!OcGZx@hDi`}^d+l$ix5YiullITyYpu7>DI)q=YD zTt+yOra8nFDixE>y!qzRS#al%%$Ti6C;PuE_N;|dwCUT%i9aOwcZL&pWTpQ~hK5NP zh9zt?xYOWNe~g|MBT@Zvb^J*^S2(q&D3;#P0^1-GB6m~}aK$fijll?3@b2c<~|W(~|AZ43SRPo^Ee}FXY_`x}^PVaTV^V?; z%x+aRUX)l6CR%4%G6jnMgv?MltqoQ;16X6_TL@rO<>5Q!N8)*nN5R_MUxM{*b$NBh z%?W2;v2cXG_E0h%87MJyBL|T)^}G)8<4|aBc|PTxbjm5OBS@`5fS?C#^vR1d{OIpa zU%4OJWCKNsS`KW|ETCFyp+>=QF@D%P{%}4LxhWBZGMABT#2yY5Kg&vc&#D-=%z^ui zPVU38Xz?;;3xB9GXw1RoK<+Ddzy_<|pU%yE`RYN2wH7F2^Ep!z8ZSGbPblQ9dnZ2dC3N%n9JG zZ$$uY73=f(qY=OX)S3waKl(v4*uZZEp((*?tsu08!up3Jz1BeqSU`opU4&nfv4 zB|o9$D@wkq%qB* z`Q^d6Q1H5E9WlQ=I2SR$JUACIzdShCeG1N{X3!V+sUzB#`_vKb%YEwN%%^UD7Mbfn z?p${iZD%eP-cemp?945Et|Y5E0nBzgpTuC0F9?hr#DTU$f~r`^h`cFbuE$+Tr$v5gnYL0g@IRX z4S99NS)FU6Ch{zkx54_u$L&K2EC?$>sX{QJg*ip}l2m}3neQPiWmR_MHwkPXTKc3* z>l1k$`cQwqoeGrCDSd451NsV~_7+PM0?0^xRO@xn#REdQ)22a6o)~EGhn5#Ax`0&V z-1{8S!~*Cw;KonU7~nYN-pcq&HT5DNO3P9PytOx9cZnqzsA9VTO#4LOwlDvdutA8r=l+21^g!iEV0W}7`v$_LbVH&bo!Q>R_If~hPSW+4;ZdRIeK35sJ z`}aV(13pkiHF73X+w&(@9+k^^{hj2noLX57v|w#oNQZfmjf}O~(T7o?MXv)linBrt 
z3`sNW&X+zfc=DEH9@oqa=FF&xX3*ffjIfwW!=a9LLXNOTKXPG8_>$#?z*viH_=^?^ zghZK$_>bx>JndnI2W2Lc`=uB4`f?64xlmhz&rKQxTCVcnK@{rioEXl z)-SYzj7)7v8dYTNw+TZ#2ixe7WWITw7PElZSj;+)wbRJwaIe!DY_bd7eH&K1JVwi! zw79HDYG6u9D7T8@vY0+0-@T$s*@8yBu;S>64Q?rTkdp}9gtO39j(X;I4h zCR=$-l0iHKn|q)G;{W5%5RLAsrzanuJQfxZ#ck;wGup}u1^F;M{>&3uX3cnwK%fr= z>kBIEsa!E1RM;=#0zkBq+fh3&LpX#w%ZWp-&NNgc3K0Vi;tKJ5H#&xBsA|zkXS76E zX$@xLE(?{&BkM(H>8J%=%QISJ7xo@aE@Ok3obSBj7d&XcI~gavM$ijnItwWXjl8o_ zQn3Y$h_q7aTIZM9K!^*jvllJz_j^}z*(RsO-nl*+-`DG)w(r};dYmP%bw4l8Z8$jB zmUr)XuS;8a(DE*)#8Z+66q-=Zi!PyWr;nW3AxEgi#irCvFRjkgCVZ3biu_3os?Ymw zH#$oL>Wn=q1RRcg1J0$kV#*{E$00-J=`(Mfsb+YoB18>mk>t$sgS=&%dRs876^G1f zhW4$1NKC?VXNjPsjSYpQd0X(6K@ytF3ZLddCI_qu2AhDU)RkfC+(La`1T?okVO0uf zu2RsG8OikQzAq{h6K-Rb3p|Srh?0Ub0ykrmW4d#iO@GH_ero4TEy&?Vd?OaDbFOvM z&1Q^|*qA1MTo1)D4-f-vZ-VIB97J!H4JKHxWB>!HowsOLjsaU9otz}Quscr z0w%1&-Gh6u-R;xHXO3sRWAqVln(V?=Bhk_48v%?gYdemAkyYjmI$9WP#O_u(%>!;V z!UE`74H~WXe8!l<_{Oot8`Izj3H$Pkz?g7kYBmQ{m-tSgLQdn+LyWhFgm-a8{0 z8Nc(s%8wjBc<)unSahc!YD{<;@m>|1ywhk))Mtj$U#Y}$B+pvqopNjo&%EkaJZhAY zF@NH?Ws_80UzA+PFjFkeT%09lv9UPGioDM%sLx7WYq&oU#HZnTiirYBncpiRN0~Il z)9qrOoM&rMrj|7EW2WSoAH3cTa$eyTX;RJc!dblNl})Qd5BgTn{_3RL7~ml;o##Vz zgMEv5-jfdmNJP)QB0UM47sF~QCVGaGB-=)B;X7OC$UQJ)LrMt!WaBu+Fa3Fv;N{H) zuHX0{;hE)L-uV6C`ppykf!x2F4DR1e2KVnKgZp=r!Tr0*;QrlYFyoor)o&_Fhr9i# zK-|)CM>2OWxr&=hp|X9&vRM?7Ja&lkqr{LAZdgtJ8T2RFz+z%rE&1k}f3RKVb7}GM z;8avt5XbGq5OO5^k{WA32w~TTmZxrb%x0Yrsm)c4i>|PBjYBq4dAp219bpn>YQ-!E z6FJW`AT)z$zW8;^t}X1eTO1q+X$KZ%?m0n-Ftu55@hO}4&HJTn zGwum(!@TjXTm01UZ`N2|^K@-rucI8xK zQJ$>q!L^k?92oBjb;&f!wl{bH1MS(s7uB!ni?1vB^Gg1LlAl)cGfI9|$v2ezoD$9L zLhbpH?W0xsP0dQ<8gmidyUNwSg6mRcJI+grv0~nd%uw^@HWDLeEaosu^I)@+UbyNi zYhiPFq6HX)35Ref1Dr?CIy~vR_`k&0e)W#7_TW;c#xN{1LMRW(v`{Jy8ai*UbQ}`Hn%6J0c+3^n@hsV0ZjVxK;MR`PAtJj=MgS$ zde{=}0N-oApys3PzM$PP`Ekx%A7d-{a2Ua9cYRbZsz;Q>O7@cs7)fmJRex*NqMy>d zqmK&(|ERjRkw=lmN8b4Dpx_B|2NZN-3oRVP#W43i3>SZaV%8QH{|Y7kapi7Y7!ynAeq zYIh(mrc72`mu?=Ht;bxvF)<;T$;*D7HlYDP#63Rr;PQO!kV+3L%;wDX!$4{TNNu8? 
z^zV}7+&}2`a$ALlP(G4KTd6hxvr;izg~>0jCq(hXSLpLxp;-tmVF51fvhSZaBSr2o z3i(f$7bQOTKbLaebS%^<1zFxU=jk!>K3XgnhJxIqX1P$5>4_x+S@<)8-Q><2VJ&$u zHK%kFI{KFTzC&;dc=%R!=aX|Dm! znsxOTg{8JTVZ-L3okN=|W8f*6I+WY1)hWZ&E*C;hkkzMathrbquGmU=&rYCqQTIig z-RlUk`J&~ty$6;4Y=mHHTekINYg;zkXJFY3aW%Q8ZTk`94E+lmsgK!8Oz08Dr*L46 zwHu(tXHohf37_T4tPF`{3@O|EvW$b;7h9l~Gy2fZQLNJ%8)O^8uj|>@l*A+@s(1L2CGSmSBOh)7D2%i?E=f{!SWmU4w5CRf|$lgCh<_Q4RnXE2~Ad61uhfr*>TQk~pU zeVE>+h=C#dRz=ux5!KgdO0}-UhSH?&Vkm!BFJhAK)+lPYicu_-kiV19Mz-eesf-bq zS?PviGiyGMCn+M#Hb3IP$BR&BT-Z3W6U#>}jG%7@VjN*I8b_O5n%zD;Mrt19;1e4p zysndZH0L5YoE{@1Oo-%y?&EMNvZMp>=lIHL-2=KRKpAuU-GZXAG-Fyu8Gm2ElSmm2 z?;(=hHYS0gzq+`ApP~OE7Y?>=tHAxRQ7Xgz^tk{+h?|1MwX&(e#c((>g56I{h;fy| z&MY(_z#L@?6JQwdZtw}P5ym@AjbQ?4cPSIV^go4C@gSP0;L0xNVQEs)dS!Yb%X~1M z80A~vsjiWCsQc11Wei{c%tTWxAzoC~a4QqG^>BDW{M-QDa%O7|B{eo4-nxsqqhDnW$kf zr-C{87M%Q~4?XlSbAUdCnbTFHV}b(aI^iEGRr^0_F5CZ|?!5hvCddBAB4z(CwiYa$ z0>EY?T*59AeLdTH*ZwE^kiCG3F4={h>Z7bzJF}@hS?pl(mu!pCFj-zSl3?G;D5dAv zf;f$6!U&+Zv-&r9T>X!g{3lBOQzieIlHXMFTO_GkfvZIIxAo+2W>08O^*`5*nxM8+ z|F+)$7fOC7doBW2{aro!FG)-c_V^D!t!D@`VFvvQS5QU43R7P7fLi!+hw=FU>idJ>7>2kxoyE%*DZ=5gLr16g+W z<&n^fN}M-*CSnhSkapC1FL_;*UabgbratFLuNS&MR6+g^pK!Vuw-kcD{X+}+#Zn<# zDdd|KNE;eRZ|BnqBfG8=fyHF8aotlFopd;a24DYXMH1Q$D4f=) z)3%urTH-2HZLEfnVZvh?hXXS>o&?_vq4;1nS<|wtwzBTa+GiT~m{ItsLj;PtoPbp* zy2a0x1|BFsebnn~bHa{A&MZEH?gsaQa?nzYfp$a%ZOp@@XS~$Tl2F$s%-HHaB^K&5 z1P7w<-eD5#{$?S>j487Ig z)5rHJxlc(<;=)J&%Cv~zZw)jpH`}y2J+|Tn%4DnVEDS8qdLE-#P^dW+)`B>0rh!40 zcCc@PTcj6ws-XXp0?X+rccTa2msq1NOjk+NF*9&dNJ$tzb45wV}n{d*TnsQL~-Yh{SFj&rgSUC<~j!`oZbjBdEaZRv~{V-LZ8O( zPB9E5JX$0LjiXZ#H;Wibe!#<$=%+k`qTj!#XaByEUnN-sq_iCm)51hk%#Cgy8ip1H z#ay8ZP{WM^X&O5Pg3L(qb|@^T#<>#n54xK|Kj+KZY0D=4+;kAtOvOVssRs-QVd459 zH9#B&v^ALYKC&9k069moZUrr-|JU>^Ci!k@4MdL$#0g5Yx}d#T-pNjW8QtYkY0qOKzeW4I|9{4UPP69 zO=WD_GJ%JMp@T^C-a(}0j4u&mgN$9fl3tL}7t?kSG4Ke02%F%&_*MN-L$;Cydy*VioQP%72MXx22Lw50=^W0xp)P+y^TmEJ;f=0snk1&RCiPq zAloucot=Ho*cqTTZN&>+?s+Y{5?3fDIPims{kfp8HCBh3R(ZS#80VKvAIP||ve#*n z+@n~eNPnc!>iYy8!f%wYe7c(X{Q71 
zhlI##gVva%!&*P?lDTGRpRE(uRUQcc8%^z_E`ODeZiCY3u+19@yhL39N<$=eYt0Du zvqH?@y=?EmH`a-I$9JZPxV_@{SzcKO-F{@*r4WR_?LbbmIOC_TI=- z_Ffnt2-7_@eeD%RE>aYFjEAYkMQ$yvoofBEu40nX%FvMLNuHVRA!r^zPCJpq7Ixv8 zPEcF}M6f-`KdXT^ZR^N(sS(0&>+m|XuCl&b>eh2n^PH)?poW$ghpfAutTD*tQN;M9 ze2kF~gQ$l@@>~G16T4QsI>`Q_pmaXhszmK`rw(Z-$A^5ph%x1F?NkKC`@qp_2VI_Y~$&V=cZ96SOmz4an65C+@AL#1Wl>A3ZenW{V*#0|R z{XHdLQSv`3`3Fk=p_2be$^We6kCc2>$xo6Lpt7rv?yvb%6gG-vl}>Xpa()EG1Aj@! zhWUTpFlve7@Av_N&8-_9AsrdrIJ#qW%jo#%=;)@=dq*cncXB_<{Rn@zjP4q}r%|Cu z32h9AbJ~Zv5a+aDNeb!-UxX$=;1GoMA|nb}0AFEt^-q$RSf?|rK~oGr^`+{c;Qu13 zD?v~iJh=fmCA^qX@5hSp0K3)rQT#4H{*C z)ODdg9#X3|@PEL@b~Bsn!eljFtk>>!^*7abUSNXfIg3ZU$5RC#3o-bQuuXc2**4cA zT$Q53vBA}jg1~28iu4sZuunDw`9A2IrHKJBDh%;&;^Q+M&6}b~PJr^c)dwy*_$uL_ zaqrqSy{V-|99u}X4TMewSOb`wy28y**dj{~jJq`#&CKegi_*7Qdi_|KGW?X|{w3c8r^IL`J)5-eg7~XV`n_ z_$iOI-y=iy-_R$55YSsptFP*rP^aLeHi0##sv-gw3>WqMI|BEjz0<`S;E|O@Oy=I- z)d9J09hY>io6yd$AKr(a-X**NzF=hiU-EIa0(6RY+0{oTEY*Kb ziGN!S+sK6|EtGV~Y}Qk=I_s&yWBZfS4`oas4;7IIz495DH`kxp3nd@DX+u=`zo==N zf1Cwt%Ole^5I8PxmnPv!)pM5Y-Y)0{2WSz11PcIE1%!i^gpk$l9B0!19B_TkG$0Z>^iN_@&K#9jCAwY@8B_TkG$0Z>^32pO# z1SlDfE)l831C$V{!~>KNsl)@6kj)SMin9PE`xTHyZ9Wh+KVcUSXjEgt^X~!nQ*0sg z({SXRVW)%nvO5d7@jYyjSMV>hnS|ZB*U)tYA69dswLl=p+5EXuii^R~G95w`Ph?v0 z3v8uWtY5`8nC;D%mUSTTp6#St74eNjXe_FzKEM!bHUn+l*S)qNvo96zSw%VvPAR7{ zB*L*|$&}#cJ!8xesz_Nb-8ik0fbHq=|Fe0y>gz-w=2Es zAx}batJ41ilNaBdlyt@w|h6?3(ph2 z&B26yztoc6o`xWptHQZD=p7WtLoCPh9<3&XiS$N`xLm$y& zTc4YJ`(caR>{lW+pSwKyNG{0e+f$k=pCW;O!q+@vJgc1dEuy*Bs+l+gLxUBv8&S(# zE+9Yo(c-5mAjc;Ku{xFEXtp)Y?B>~4HxRBA5v=5EWlVG0nVeMqRY}t(7zN~TnId3? 
zZMl(A`kfm1B!0){Q8;T!rjq6+pcPjicP%6)wi__I!{YjJ#8w53HjzdQMn40nL^+om zGU-aMs_RJCR>YDtQOu=-YMVwiy3J_?vaN-|WTjfjeMQUkrRx?qsG%yyl8?*-P1q^T z>{9cFDSRWpTnd!Za2}*XdUJuxRx?^`<9f4CuV$p&x2Mu_D3?ik67z2U8I$4Qf7(X% zdD~WziEYv%dMWeK^|L`W!^6FJB;7o3-h~_ut?y2nKaziEj!EIWpD&peu7BQ~3~7l= zk5U347j7_7l9v~$;vx@&_9fjPurHFOx?Z zEPjcb9{;k{Fe21C;meS1rZys`+icOmA_Khoz>yD=lIei?ro#A#ZCtMjxb91;Fi9Q{ zDL_i>xxBTX+f|{4B_H_6vh3{tp7o}JBy*^sytl&26HcuialQcMst9Qsw(97FgnCzN zRpmQnHeSQs`@$I^82_1OnBc+8ie-N!KSse^=|Tt@Y0GFbMb@BzE@W^AO8_oy}L>JFgl1X z)&G?jC2if4`fQU>Ya7X0v{!Q}o4_TR>4e%WUJ;G&v(v*i7qcd(z*6Rl0uMzuA1jEk zQO12nmzf&5Rw+{~X_{Q$1$IeCg&Zmk)C8uFVzK|vH6MDf!`^9=`WHEsseljB|O2tqHTYt-~Mt%iUw2n%MBKG z^;t0t^1)>wEZ4A6eb%(arljKGE`!=Ls5)paQ9J0 z!1GWQPPXRU$|v%(wdX36^M8Ib)U1bN#-&iEh-=XpZ#EvSBX)p~T2<7=DS&*kDe7*P z2dlEvIi8Gr^CcQ`w>i4X$`J;NonqEJb2l3o<<(aCs8nE$MGm`V3Azu*Ulrix8vity zghDMerXo^&qr&P`9Z!IisU4w30kUj`1XXbntd%*{ALNPB#(FQBXBj zBB9n#4ApAU%!lnxyQW|>s0+*&+E$F$mxf|7Ge)TxpK zwRp6Z9z4$!e#F$X+7Ood&R=WQ6@6?4u9ZKAmTi4A;4N{g=Xe_Jlv!!>1|3ADBdK<; z*I`umvLM}B*)epF7N?cn<~c1dL_fkz$5!24+3jIA#Q7z|7Y;*Xy|tLvKEu#3idGdThj@!GRj40t<SDYc( z=rh6n#*H2dMexw@d+LC#!f`12{PkNA|1OFPP(J*P)dKuw##TD%PcUp#9COm> zK!1r)Hy|cM;e%ktY3})99935{g;O`mH2Fddf8X?BEUeV=JD-wG=@WbjuodV zn6KBWMTi~MWm*i5qGkYH|Kme<%xOj7z$Xqoeqh=?*Iy)GbvwDMdo)!=YKorpw7kAU zd#hj4mnOP@T~~iWiKepox7=RAr+G6di|$t4YqTP+?xnDw(lp;gP0<+B7e9p&Ch@-E zy-ehNIIe8+DXjI0I%`w-1r#_EOHE>>yyiU8r>iQxJU+!=4F9etBq(<8u7o1<%Ab(q9ySb<6sytoC$yi6PC-%h*fc7aXlZZIjV;WA}~h1arR$BGgu{S@KV7VErbb_eN7 z+c|lrSQ#ZkXFO6I(}*BsDl1a|TTD;P>qksi#V%`X@R{9&f{Q?-2s|QjGkGit#TzZ| zGK!p8b|fENSmvBHs1=43DXfmfRtI6$P`5PeT~2nxIssWkU=LZ0vUyAN6&2?6HFVK@ z@kIXy%2MCzgsCLDG;!Ur6@`s%0+iaF)j3dJdHpdF4rNpvI93d5udg(YXxlQ%7hr{b zsebb)5mpO|?br9*A3)m69Y#<2@ zbLnSF?@f}r&Xv()RuwuM4lW?!%6C1d2f3nBjtG=~#erJhVkg}3Q~P8BFXaRhoY({U zg^{q2GBD*|$!te}cNI-6vtYyt*qZH%TwQnQNwL67KYho_*>2T7nr*i#wSRRP`}@mq zER&`Vt#IC8Q(+l6vlBWcpgBIcOe0VSys&(s&*dR7pBhJt;(SmEKVHfvADIDc3t&xSWRU2H=d7~|dg^&?#G;mtU^$oUFqj4dj6 z=zjNVlmxV$pUDYdrQ3)Emtjb-J5t}Bz22e(na&m2T3fUI3GOwfd=+E#Cv0f;C+0nU 
zl~n^#t^5f*#q+>t@k*l{7!hsQD}YiG5ngoTQFwb{gmw^TB(w72^@FYE>>W49jA7s#L2 z67qksf(+)bV8h53o_)Yc@uyz>#_WTGpe^6oGcsnJY-5tHR zGrF}?!`8m9@7whMs89%{?RKS`I=M>$WCL8z3~{aTq0mD^+VSq4AC0bmMy@nu#xoCn z8=fB0y^YE@9s!JP{d~m*a)gTF^SL2C;g+vVB16IlOb9i|IDB}EdVx6G-03G!VplvO z4`tF5>ZO$|>$b6a(`tTtkduhhxc|9vpC#RjyBHPoQ^!T8Ti@hQ!D8GCewY`R(??v1 zAWz4t+;g0k6~s#<+7$~g=<1}B14@36#2leSrri71T%TXoa}kkDwUBTcyPt`)? z+#`2hk4-`Fkm?pO`nJ2+Rpvz{4=KUTEodvA)zxt&>hhXVvOvBhe)W;Eee0oQYD_e| zeP}z%1^y0ihw3;j+DIf)f3`>K6{A|BU*kiYhwiFuk2qjiRLLk+*wT$xEboVvchB(1 zu$`=JdHjU!!dKRu^2MCi@iRJB$g{GEsrs-6YEp@@G}Z14zkPc4q;6wfNk3FQs$`9pfVN}}kjMK) z!;o{(>l%YC)J^O#ASEZfk?8Kipd+L+_-KYWd{`xg8GaypG8~PwP-Js^o`c()&x$de ze)|ct_n^l?+2Z?jJ}L49P-`q5SSejGCrEDBMc1!H*FUL$7pY>k2c*Hw*4KZY=J<^J zx~{ay_Q{rVRWk4xFfH?10aMFp{6sp~Skqb-@7Pg1eOJ)bZV?MSK+sF=4nUE+(m5H$bc2|lv;qMDV5dPS@u{~K9^-ff<-cp}gh%>#Ikykpl# z-9=epNQ=)QzNSAT-eUn-7bGpMLv**f=Bat4Zud z5PZc@A>Aw=+2^irt6bzITEPzh)3{rwXU5Fpz9AYcyw`7rp9%Ml*-A4jGVKA&HHM6y4b*rx-&IYC90S0pDgoErwoQc1 z4G#bxu@RH|@)EKPoHJQj3&OjDqS~Dr>*t0X@eEOk?f*+cP_-_CY)%MMtAC5c)>LTd zLIC`d3hQgi@~DUrKNMn01fS|-dM4I7&`;|RN1ft8SlvOMQfTarv^v>Kx zLY=mkBeN-7Y2rMu$qtxw?OVjZrrI>2t+_b_)9Nv`Mf1D*q-x~ClaKCeq^Szbh=}D^tqwv z`Z3U7BbWXvNzliXqgjDL|5AzE=gs2~)P&W#0}tGa^!k9@pd z`;44+%q6!pWX`Q_b7**G78pZ0Vt>L&=nRE#J<7^#ZaulDPhr^eXu7v^aIqa&Hlx$r zwf4KY3BIVs9e97UG<3vHGwQFIso}GslcT0nf027eh#AOCHHx1;A{{pm{mFA;K-tY+ zxgQ!E*Gu#8V}dF>&+Un2&u(mo2}RrZg9XnOLoJ!6Jk)3zId@8tZ}Vu!s2!o+90S3A zBn9K*L>&*qtXR%Tc3_sq^xGf&;5%!6G^UnKV^@MMVq^a;Ur=?;RHd_CZ3nm)C6O5glD)JkO2gZU?D9G!&sAi;+ zlr2LVVg#Xry#7Kaw{mPNN(JPKU$^l@>LM#je!NtEJz9zF7O{rnS$&e)A{DXaeKu}h zY~O`Wf~7q=iJy{8B{@j#14~waunq&~87q(V1fmVNVIo=gjP=Rz2}#HNRS|F8Ym_he zcab)FP^a|ogSuyK5@!)BWe`mLEU}D;XIg3t)usrQu^*%^-Ex@mC2H{sjKMvwbu5@= z2N^>UW=^|;U>EntGQUBK9p?o)WzT6!uum@4359JZx@2S?KawfVNq46}PRR+90V;2u zN{I^HV@z?Cmt}{-K;f2`nsSa&eJJR-@{S_U$*MfTsLI5^gFi3KEuQj> zmA?%-)j@l02EV-6aNwCdHbYlA(@b?klmRZ)l?rcgo3ldb0C0D6;0wptMcV# zdHzGOcCOG39aJBRFt7^{oN}wW3i&WJYc+UmPeu0qch3NAR_2l^S}3^69DPUAac&Lw 
z!wnrD2xXS*&Etvu8Or$Cq^n~RFOf38pTVs#HI(0-P|#etRppGj6c`2A0) zNRjvNs;DN@!2p#0Q0(L$o=11@sO%fsTiHK6T!~>S>k!rV;th^Rs|0lYhWVZ1m;NjX z6qlu^Siwu;6uL(0eo$1}ue(o3iii0i!V|1m;($qYWZp}ry%cbwjg;>BDd$fW<#apt zt;rR)`St{c;ZP^v{a8`(3EVX$Rj^{QV#7w|-vzLzGzNacZeRo@x_++x4&jQ-qzIBZ z@yi8^j3z7OH`r=}sU2k*Lt#doo6CuavyVFGGPdX8CfUc3IF9DZK7qU}blUS)qsXvM z;`7KLR>f>V)w{_eJF9hEH?i&lYr4Uq57~)d`8kh+MMC}keAT3xw@tWi-GU;PD-}bz zT}4;A@@6lpVlKAx!b%g-$`ZQP=%2gFsmmSXvc?gh_Ba+|;$qqDxP3M~C7KAi>e5Y>$nrNT8$}S%u zbC#-YV@PSb4oHOL_?Rp2>M7869V;oHpX1}`DTYz%?v!HJuI}Rh%|i{c*#>%%8k?We zp-We*%{S<3UH(Ji;NqI!x@hhaH?9iEXV*7G{JU7l+6uRDx|KFC=^MTe*m+B15puyA zi)n$(vS8u80}u6sOLPchg2b+10iF0&jP6t7)PCNwWOAd{po2rdU>6eS5d|>I+Ci0Z z*08~Rk|lB+^*Zf5xEml@Y-bqa`;M^13Eqabb@Ly?!yxoCGmJCi&X!8DbsRzjXI#g# z-Odd*exxo?C@8LpIT?5|Nyxn8B=B*peM$OhEOg8!ulW*6q9p{2y-@<_G=7ynMQvK? z=*5gG>Euh6wLHlx%n%iULihbJ$XX7CFz3rH z@->ch--eh2?5x-uxMr&QomAzHVxgm%-hQ9v1V7Ban^7qn{);!|`2QL=EG&XTnFE8K z^SSV(X(_X1D}1Soz08@CX_Vnb(m9Z##MO*&kN09me2J-+A)Zj(odhyJoicUO4rm0% zS=I#Ws!PzoW2#G?!yltxy za{w%oq(Mv$MQ-Cal-gP8g9w?IrnQjxVn1SfiO`pTm}Pl8=u^IzL9f2AM09fyJ%t}` zV)}xv)=2qPy`QQc6FN$oi^kl_VR{qk7juM^WmE!-+k8G@NN3g#Nh-)jh%=dN#AUYdm!B=?{4-~3F*NX0m z;>=p1QM2F|&fWEmVa*z}xGIpntY)f`s32QHO>5LaEQRFr&i1iuw#tG*XE4PNwb6+)%VDm_@C! 
z=MJWC9ZcVT-oSe#cF9cUsOjnP1|n=z8@tdbU>N@CPaT&B2~ePduo~^GGlr(Ci5H8q(l1x71ols&}g) zJM~oBO$B&t^g`@>c9_`sxxlVF={OZbUwco?@+Qr6b)$;lq>sq zJP#h#Gc(;ir>hf6KCMK{@ah>Qr<5pwjM4e(OS;l5uAbAChPkR97sSD?KC-E;I!v+u zg!ySFdyVWH!AC9p9U9+41fJp1?ZhtIGp4%ApA$-m>fv@KbK+UXJ7+(6+e92r@=M0wu!oZeMRopUojuv85H>V~II1)yuimBX z21^T;jSBz_(&>lCBiBpLF`11s-f43XhuQklGI!*KAi<@+m{slxE~2ZW#pb;9$j%Q;JDrrEV@Q&$8+&)PkYISznJEbj*cY3Ox0}L^y|_ zE(0@iIIS)iXX2)5({sgRsG*rG)K^sA@k|>Z#>*sC$%719{oNKqc6sbr$6QnVAEaCU zN%P|JWj)3NfYnn=J4EfZX=Unjb7;g%_LH;BV4GOcT%~jNx=(ujo83|t7Cuh1ZjIm+Ls4<`N zs5JjBrF*Mcugm7$H=p|~-kH4466p4kX}r5WkkhYS=o>{XeT7s^NG27_+Fpw#mn#qS zP9t7k?K|FQ3*~%eQ(DB@!5$gb`hG9zl0BHPf-H9)a={Ub8W zNNa7HXFifX{_HVekrzrYPP*mfM!||YghMSAGnFaQjaOj& z60e*TaPn|>8a4F%WnB1nait%NavTINJ_J|HmRF{+nJ(N61hO(~+gV;1#~Pvb#K>ye zB4NI*r_6jSI65i@6Gn!Qs)JT|QOvv$icejI7GO|?tfPW*#TM+X1Wnaogv4i3uG!4z z+(7%XskJ;e*J`vVn6WSmW2xbszOBh(jjy8;)*$rp-G0@Q78=e3jG&+8)^@PIdTwUs z-1LjJv*%tpGd=a{nc9UjZ(cBLSNuZr;A3rqW^JV=HWkc(oir)Aidcyk#3Ei-q8);3 zHRV+2zojXswN$i)(?=tI^2e*(Gn7w2GWZ1iH@T2};M)zp!wIMv$GkI#Z>C|ezl}sI zyX`qf2V2i;GNMzx*nR*n5O$A@DmIFAR)1DHE6rKNjFIRIcsgoVr_>xwPxf!Bk!HLr z8y95p>0(4%J$Z#NE-{qmUa&D0NIJo=pL@*^_OLjGk#3!Ap3~>$hTNLj?rni2^EnWl z1mI`pTw%zrISMY<<>^V?6lb#K6`UuMc9{fB`=&2jvF7+xy{OS&rkf^SG?Qeh(OFvY zz+-Reoz_scG}o* z(x^)5V32ly$v5@vbrJ*aLT6sWyTP!B>C+Z5=8ab?ZdJ^=MQ!*q-0^7p%ZrOEMIIl| zmsG6U6E^Ut%Sf*pb9Wjxf>5((!+Q3pvX7FWihYHK;4*A(27C1K6*c=_U424{pjADr z8s;LNYRR z0o-F-OSAzEul=tre?z<2us1%meRzBa!ttijgM@#-n=7eLHV_hi9rs&CH)+#9Attwt pJ}|m%xUvx|>gEkQzJJFXyS^Fy_0jb^;vGXfz8QTpTG}=Ae*;2Ee!Ktx literal 0 HcmV?d00001 diff --git a/mechanize/_debug.py b/mechanize/_debug.py new file mode 100644 index 0000000..8243969 --- /dev/null +++ b/mechanize/_debug.py @@ -0,0 +1,28 @@ +import logging + +from _response import response_seek_wrapper +from _urllib2_fork import BaseHandler + + +class HTTPResponseDebugProcessor(BaseHandler): + handler_order = 900 # before redirections, after everything else + + def http_response(self, request, response): + if not hasattr(response, 
"seek"): + response = response_seek_wrapper(response) + info = logging.getLogger("mechanize.http_responses").info + try: + info(response.read()) + finally: + response.seek(0) + info("*****************************************************") + return response + + https_response = http_response + +class HTTPRedirectDebugProcessor(BaseHandler): + def http_request(self, request): + if hasattr(request, "redirect_dict"): + info = logging.getLogger("mechanize.http_redirects").info + info("redirecting to %s", request.get_full_url()) + return request diff --git a/mechanize/_debug.pyc b/mechanize/_debug.pyc new file mode 100644 index 0000000000000000000000000000000000000000..2870b1a1461b9adf6b11224177df99d67924c608 GIT binary patch literal 1528 zcmbu8-)j>=5XWcllGOf=rB)G?z6giVJ}4rnh_rp^gOswVSVLgB-ffagFL!ZwgS50y z>3`~<;2)sh*-N5=Algf3x;s0&GduIy@Z0Lr!Pn=f2~EEc|KCAu2P(y%pq!{(jRVRB z4FhTiBzP^PJfvZb+BM8;q-)d_l-B8nXhQUpVYRYapQ|q>9*wlAmY1=-7h8Q87iq4Idx-hr$;q*tZaL6r~yU9v5lQG62r=nm%UzA^xPs0J?-9Oih6TW zS)UZs) zIZypr6-KHwsWx&24sH#KL)dwde`f4Q|DLvFrugo*0o4$VU~%~ua?{M{#QlxjyVyD> z_r}8uS{dwwkLNb9>_hBpC2ee5_PypZ+=p21N<;V;=I`!b(j2Plp>c4& ne4d>>Q{B>>OKzy=Me;$nPRGB1Hx=`5`D|Z-m7pn_;a2kp + +This code is free software; you can redistribute it and/or modify it +under the terms of the BSD or ZPL 2.1 licenses (see the file +COPYING.txt included with the distribution). + +""" + +import logging +import time + +from _clientcookie import CookieJar, Cookie, MappingIterator +from _util import isstringlike, experimental +debug = logging.getLogger("mechanize.cookies").debug + + +class Firefox3CookieJar(CookieJar): + + """Firefox 3 cookie jar. + + The cookies are stored in Firefox 3's "cookies.sqlite" format. 
+ + Constructor arguments: + + filename: filename of cookies.sqlite (typically found at the top level + of a firefox profile directory) + autoconnect: as a convenience, connect to the SQLite cookies database at + Firefox3CookieJar construction time (default True) + policy: an object satisfying the mechanize.CookiePolicy interface + + Note that this is NOT a FileCookieJar, and there are no .load(), + .save() or .restore() methods. The database is in sync with the + cookiejar object's state after each public method call. + + Following Firefox's own behaviour, session cookies are never saved to + the database. + + The file is created, and an sqlite database written to it, if it does + not already exist. The moz_cookies database table is created if it does + not already exist. + """ + + # XXX + # handle DatabaseError exceptions + # add a FileCookieJar (explicit .save() / .revert() / .load() methods) + + def __init__(self, filename, autoconnect=True, policy=None): + experimental("Firefox3CookieJar is experimental code") + CookieJar.__init__(self, policy) + if filename is not None and not isstringlike(filename): + raise ValueError("filename must be string-like") + self.filename = filename + self._conn = None + if autoconnect: + self.connect() + + def connect(self): + import sqlite3 # not available in Python 2.4 stdlib + self._conn = sqlite3.connect(self.filename) + self._conn.isolation_level = "DEFERRED" + self._create_table_if_necessary() + + def close(self): + self._conn.close() + + def _transaction(self, func): + try: + cur = self._conn.cursor() + try: + result = func(cur) + finally: + cur.close() + except: + self._conn.rollback() + raise + else: + self._conn.commit() + return result + + def _execute(self, query, params=()): + return self._transaction(lambda cur: cur.execute(query, params)) + + def _query(self, query, params=()): + # XXX should we bother with a transaction? 
+ cur = self._conn.cursor() + try: + cur.execute(query, params) + return cur.fetchall() + finally: + cur.close() + + def _create_table_if_necessary(self): + self._execute("""\ +CREATE TABLE IF NOT EXISTS moz_cookies (id INTEGER PRIMARY KEY, name TEXT, + value TEXT, host TEXT, path TEXT,expiry INTEGER, + lastAccessed INTEGER, isSecure INTEGER, isHttpOnly INTEGER)""") + + def _cookie_from_row(self, row): + (pk, name, value, domain, path, expires, + last_accessed, secure, http_only) = row + + version = 0 + domain = domain.encode("ascii", "ignore") + path = path.encode("ascii", "ignore") + name = name.encode("ascii", "ignore") + value = value.encode("ascii", "ignore") + secure = bool(secure) + + # last_accessed isn't a cookie attribute, so isn't added to rest + rest = {} + if http_only: + rest["HttpOnly"] = None + + if name == "": + name = value + value = None + + initial_dot = domain.startswith(".") + domain_specified = initial_dot + + discard = False + if expires == "": + expires = None + discard = True + + return Cookie(version, name, value, + None, False, + domain, domain_specified, initial_dot, + path, False, + secure, + expires, + discard, + None, + None, + rest) + + def clear(self, domain=None, path=None, name=None): + CookieJar.clear(self, domain, path, name) + where_parts = [] + sql_params = [] + if domain is not None: + where_parts.append("host = ?") + sql_params.append(domain) + if path is not None: + where_parts.append("path = ?") + sql_params.append(path) + if name is not None: + where_parts.append("name = ?") + sql_params.append(name) + where = " AND ".join(where_parts) + if where: + where = " WHERE " + where + def clear(cur): + cur.execute("DELETE FROM moz_cookies%s" % where, + tuple(sql_params)) + self._transaction(clear) + + def _row_from_cookie(self, cookie, cur): + expires = cookie.expires + if cookie.discard: + expires = "" + + domain = unicode(cookie.domain) + path = unicode(cookie.path) + name = unicode(cookie.name) + value = unicode(cookie.value) 
+ secure = bool(int(cookie.secure)) + + if value is None: + value = name + name = "" + + last_accessed = int(time.time()) + http_only = cookie.has_nonstandard_attr("HttpOnly") + + query = cur.execute("""SELECT MAX(id) + 1 from moz_cookies""") + pk = query.fetchone()[0] + if pk is None: + pk = 1 + + return (pk, name, value, domain, path, expires, + last_accessed, secure, http_only) + + def set_cookie(self, cookie): + if cookie.discard: + CookieJar.set_cookie(self, cookie) + return + + def set_cookie(cur): + # XXX + # is this RFC 2965-correct? + # could this do an UPDATE instead? + row = self._row_from_cookie(cookie, cur) + name, unused, domain, path = row[1:5] + cur.execute("""\ +DELETE FROM moz_cookies WHERE host = ? AND path = ? AND name = ?""", + (domain, path, name)) + cur.execute("""\ +INSERT INTO moz_cookies VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?) +""", row) + self._transaction(set_cookie) + + def __iter__(self): + # session (non-persistent) cookies + for cookie in MappingIterator(self._cookies): + yield cookie + # persistent cookies + for row in self._query("""\ +SELECT * FROM moz_cookies ORDER BY name, path, host"""): + yield self._cookie_from_row(row) + + def _cookies_for_request(self, request): + session_cookies = CookieJar._cookies_for_request(self, request) + def get_cookies(cur): + query = cur.execute("SELECT host from moz_cookies") + domains = [row[0] for row in query.fetchall()] + cookies = [] + for domain in domains: + cookies += self._persistent_cookies_for_domain(domain, + request, cur) + return cookies + persistent_coookies = self._transaction(get_cookies) + return session_cookies + persistent_coookies + + def _persistent_cookies_for_domain(self, domain, request, cur): + cookies = [] + if not self._policy.domain_return_ok(domain, request): + return [] + debug("Checking %s for cookies to return", domain) + query = cur.execute("""\ +SELECT * from moz_cookies WHERE host = ? 
ORDER BY path""", + (domain,)) + cookies = [self._cookie_from_row(row) for row in query.fetchall()] + last_path = None + r = [] + for cookie in cookies: + if (cookie.path != last_path and + not self._policy.path_return_ok(cookie.path, request)): + last_path = cookie.path + continue + if not self._policy.return_ok(cookie, request): + debug(" not returning cookie") + continue + debug(" it's a match") + r.append(cookie) + return r diff --git a/mechanize/_firefox3cookiejar.pyc b/mechanize/_firefox3cookiejar.pyc new file mode 100644 index 0000000000000000000000000000000000000000..eb5bd73b6742d48ec967ab256b2670e3e2eb805d GIT binary patch literal 9003 zcmb_iTXP&o74F$xue7oxD@vS@#37x8kTodE#ax}MEv*wL_S(!Uv8-~K8qLf~BhAjN zr)O<#N_rqp5mUVK1Vt760-oW8U%)HH1H}U=egTgZ1>bjicD1%c@gQ58R$otdpZ?Bw zPS3dd&$*e^FVCz8s{B{Q-*@qteI%h$8%P}$R#f7s4P{nCCloljiisOz_7venXW_$XKem=;u-8eGK=4KKX(ev7tbwA2YY>FriqUD*Hm8?I^ z@1_LhUJ_x8-p+IDOpB#bovc5`$wS+}_D z=g}K_m<@E`r#g>9$j;-L7jAxFs z^fa~>{|EotO&(IwK(ZqG_`uJL`nXh_K=!iV@5kxZMXbm#vb1D;a<|MB;Jjr zBHDxY;$D;%eqtU$k6sjP`)T}X1RaUC3)Ys0aArK?7%?@pp2uT;izEVKaF|FdDypjR z1X1!i;Bt(IiW3Gn!m4DZh!ECHt8js!_#6mpsPGYjf_WfCKtY&HOHUvJ0_|@@eO^@? zb1IyXrg;?}mke5GB?Fmrl7YZ^$t)_j(_G|I??Quzuqsis%#XnGJ}C}eC6s9|%~%IC?aelI#J7+{W|DQ7Ift((QL z9|wMt40SgH#H>Il0vWL zgUBwTpTPu%m_OCo<__mGei57Q5NMQn4}oVvc2x#I>i}`r4cq z*07@s*tZ=Ab6BF4QYa3O2bgAhIm!I6d3s6qux$KWQS&tIb2*RXcv0#_#da2&Web4u zzBx;(%`gqdhABeqc>y)$egFwm_*k9a#fo+02iv+o*u<_&$)J-YDdBM;OOot1w_d6f zqqEzo-i)^WTX8nP7C;t;3N3MAH-dxDp}1RsW0A=+NF}4SPT*H?fTag{gf)d$BhbC2 z&hZF+JC6%6Cl5a^mUP@D$AwvBM0lEk@Dhj!hdSB=4=#&w^s-OA(XvK!Yrq81JDl?W z%d%=Df$)J%suf!2E|Nn04*Y~uz*kfh+|hnj-Ki*NU-)IWq4M*ps3{cDYnBD!7!*Ep zREq8n2YKKdu&%=_HYG-NWa=*n4l8^ml5Vi6=%cbW?GWLUE zG|plQxzb9T94|sllyqIrR2)Y#VdLTqthqJOq9idtMDC-rH?9r=^7O{}9PVtr(|Nh) z!*S*}#>RKUE6L={7~jkNp-cbr8XmKNL^(6gg7c(v!fD{A?ga4CiY|ta9N`44{Du@dp8e_^M5pO3dS#4ctxo&H<$z_N(ER4^% z+`1^ir6$Gq#U@L@ELrLaO%_j~!Lw%I2}ZrR>w)lKUVc6l*XYtUy4>1BQy3RUEf^lY za%P>ActlZD79UX*eJl5lUjTWueKxo4%4AzaLXc!8YO+OE`_LBpC?uZ4V|aMV3HXN? 
z<0DJFfz0r=dscZ+QJ*e|g~@RqRMj;c8|;h63nA;*(kGPTIOWb@xTyS0sdQegmvuF% zgH3m2Pwgi6Xb+Kx91L=k*hcWyiwm&?(S?{pcaR1})eeFEw`$ zc;7*V)u6R6K=sqF)0NPtfQW!s`V^D}q#eOAN*12$K=nxBV|=GM85zA?Ma?Y)-1$o_eer?>Q?2!jowW{bIEPNY8Hf5}d#&|e%hgxiiq}aK*R|IA znhmUO(UoZ_>+KAV(PsNTE;N#bGmZ1%NR}Oxzy+NP$j69k%2^PJm=1uA;BBJj{i5h! zNf8m3b*C-xp2bFNjy(RbEZ*f+;m3Fk5v81ZRk)E}f?5^zo`M+yMmHJ?ln87c)YQQ- zbx>CaQ|e$^9iSf97)Ko(SNq4v^6+_~#4+f$U%$7eF1VcRQ&s!5d;8QB;-UQpj}pRT zIM-RVKcn(rNSK2QQcb(4ez>9_ZDv{GQlLalh_&;=@Tx45FW~xW zyYb55Qc9CYWgP^0Az5;0m=Gja_OgYWzf?DWO$@^}!0Rvvbom+lDIoDj0v?$sP(}~<~ars>5Q!b6F7K96{VqA$W9igu7nS4 zu;Ih2%<;0-1M{?jIs6=^k0aiL zoj8kCiStMe4Lsrz{)G%c45|m|GUKTHX>kEn;ZX@Y7;J#RLFTq7kObPwqM!{^V`KsAv!ZRHA zT_zIWzKooOCNf6^q@{0Aw&iZSrr%KPHz`~$;G}B%bF!rFMdt+S7692sWzk;gs^~p> z%)2O%Bf)^r(BmQ)zB{^L2;G1MC+{c-ZqQD8w!{lqOx{_jEgHXY**ayTv&aLffx+wx z0N@hQ&vQ2rYD~mRq&+l1w8c|Jb%9657!OhH{V&(U;TS}#uCs)2oXqGzGKYqE-?}IqJ{?P z00HhHhV3|c^YA{Jn|O{27Gd|ODBV5ddoxe5wMD$Gq4!>ZPq%jz+LmSI51AA4B|a?q zazQQ|WL5XA^oSHov46m0?=C>_f>Wz3&IUC85kdpDv*X?=#uOF{Ag$F;j+bcq*txl& zuI;@jZE}~E=En#wb{}CC{a%r`6C)Q&V=W2kTMsx-JvaJgc8PD{)nAG&SNcM1|mqs`@^9-F!z~!xBbsWlv$k z^0dT~ztV45)jy4{u0~QYobtvW1xMy!Jba)*?9CW|!bmxTSgMV_rP)Zne8VjkduRgE zlt;LN8~bwUXyzb+&DPjv&0nw_Yqa3>7bMJ>bsll-7SX1}p^e}tTPhc0ZI~#qHvA)Y zrZX)+B#g$mH2Z?bd9RA(W1mGVVSXs&Xd84p5cX$0<(p7kHl|NFK@BwoG1t4o)_)_h zK(PShJp#Y-g2p_{rVipEz^?QMz!Hp{?~w5_O**~VjbTL%6eJNEE}XBWZc}Bs7P#L zN5+>+{u{$L8(TTH6U!3mIf$3=AsjNWCN^F}MMvI%whakoWjhLX`Elq)e3r&XsuGF( zNP|1`AWzNr&}+!IKH4O;tIUOQ4E{MCHGyFngoT#p2S;3 zv5mp5GIwaL94Z%bij=nox%)oqEP|0aBImkoi1?CXzC~fM#FI`9TuV1P>ooC~SAY|6 zto#IW5}!oscph=uhYM)Z$gI2Y7;WI|4|nHz_|F4_gvBW-hFRcw?zfS*+9KWBFB=pf zcJ1;VmvuIa%#q*ieM$lkx=Qyl6Iz=rePS7?I;t{grd*66x7NQ_ub-G{)Em{t{A_i$ zf!s@rCmT;S9&I!lb2g0j&`-9PWLsN6nA>6nBWy27@Zq_zh_fLGUB4^7mGgLmB2MJ< un0!N%w<}=COAr+0F5^p=-5%-&<&@sE=khk8Wf+*%5l_!nPT?B3Sp6@NsLVeA literal 0 HcmV?d00001 diff --git a/mechanize/_form.py b/mechanize/_form.py new file mode 100644 index 0000000..d45bdfc --- /dev/null +++ b/mechanize/_form.py @@ -0,0 +1,3280 @@ +"""HTML 
form handling for web clients. + +HTML form handling for web clients: useful for parsing HTML forms, filling them +in and returning the completed forms to the server. This code developed from a +port of Gisle Aas' Perl module HTML::Form, from the libwww-perl library, but +the interface is not the same. + +The most useful docstring is the one for HTMLForm. + +RFC 1866: HTML 2.0 +RFC 1867: Form-based File Upload in HTML +RFC 2388: Returning Values from Forms: multipart/form-data +HTML 3.2 Specification, W3C Recommendation 14 January 1997 (for ISINDEX) +HTML 4.01 Specification, W3C Recommendation 24 December 1999 + + +Copyright 2002-2007 John J. Lee +Copyright 2005 Gary Poster +Copyright 2005 Zope Corporation +Copyright 1998-2000 Gisle Aas. + +This code is free software; you can redistribute it and/or modify it +under the terms of the BSD or ZPL 2.1 licenses (see the file +COPYING.txt included with the distribution). + +""" + +# TODO: +# Clean up post the merge into mechanize +# * Remove code that was duplicated in ClientForm and mechanize +# * Remove weird import stuff +# * Remove pre-Python 2.4 compatibility cruft +# * Clean up tests +# * Later release: Remove the ClientForm 0.1 backwards-compatibility switch +# Remove parser testing hack +# Clean action URI +# Switch to unicode throughout +# See Wichert Akkerman's 2004-01-22 message to c.l.py. +# Apply recommendations from google code project CURLIES +# Apply recommendations from HTML 5 spec +# Add charset parameter to Content-type headers? How to find value?? +# Functional tests to add: +# Single and multiple file upload +# File upload with missing name (check standards) +# mailto: submission & enctype text/plain?? + +# Replace by_label etc. with moniker / selector concept. Allows, e.g., a +# choice between selection by value / id / label / element contents. Or +# choice between matching labels exactly or by substring. etc. 
+ + +__all__ = ['AmbiguityError', 'CheckboxControl', 'Control', + 'ControlNotFoundError', 'FileControl', 'FormParser', 'HTMLForm', + 'HiddenControl', 'IgnoreControl', 'ImageControl', 'IsindexControl', + 'Item', 'ItemCountError', 'ItemNotFoundError', 'Label', + 'ListControl', 'LocateError', 'Missing', 'ParseError', 'ParseFile', + 'ParseFileEx', 'ParseResponse', 'ParseResponseEx','PasswordControl', + 'RadioControl', 'ScalarControl', 'SelectControl', + 'SubmitButtonControl', 'SubmitControl', 'TextControl', + 'TextareaControl', 'XHTMLCompatibleFormParser'] + +import HTMLParser +from cStringIO import StringIO +import inspect +import logging +import random +import re +import sys +import urllib +import urlparse +import warnings + +import _beautifulsoup +import _request + +# from Python itself, for backwards compatibility of raised exceptions +import sgmllib +# bundled copy of sgmllib +import _sgmllib_copy + + +VERSION = "0.2.11" + +CHUNK = 1024 # size of chunks fed to parser, in bytes + +DEFAULT_ENCODING = "latin-1" + +_logger = logging.getLogger("mechanize.forms") +OPTIMIZATION_HACK = True + +def debug(msg, *args, **kwds): + if OPTIMIZATION_HACK: + return + + caller_name = inspect.stack()[1][3] + extended_msg = '%%s %s' % msg + extended_args = (caller_name,)+args + _logger.debug(extended_msg, *extended_args, **kwds) + +def _show_debug_messages(): + global OPTIMIZATION_HACK + OPTIMIZATION_HACK = False + _logger.setLevel(logging.DEBUG) + handler = logging.StreamHandler(sys.stdout) + handler.setLevel(logging.DEBUG) + _logger.addHandler(handler) + + +def deprecation(message, stack_offset=0): + warnings.warn(message, DeprecationWarning, stacklevel=3+stack_offset) + + +class Missing: pass + +_compress_re = re.compile(r"\s+") +def compress_text(text): return _compress_re.sub(" ", text.strip()) + +def normalize_line_endings(text): + return re.sub(r"(?:(? + w = MimeWriter(f) + ...call w.addheader(key, value) 0 or more times... 
+ + followed by either: + + f = w.startbody(content_type) + ...call f.write(data) for body data... + + or: + + w.startmultipartbody(subtype) + for each part: + subwriter = w.nextpart() + ...use the subwriter's methods to create the subpart... + w.lastpart() + + The subwriter is another MimeWriter instance, and should be + treated in the same way as the toplevel MimeWriter. This way, + writing recursive body parts is easy. + + Warning: don't forget to call lastpart()! + + XXX There should be more state so calls made in the wrong order + are detected. + + Some special cases: + + - startbody() just returns the file passed to the constructor; + but don't use this knowledge, as it may be changed. + + - startmultipartbody() actually returns a file as well; + this can be used to write the initial 'if you can read this your + mailer is not MIME-aware' message. + + - If you call flushheaders(), the headers accumulated so far are + written out (and forgotten); this is useful if you don't need a + body part at all, e.g. for a subpart of type message/rfc822 + that's (mis)used to store some header-like information. + + - Passing a keyword argument 'prefix=' to addheader(), + start*body() affects where the header is inserted; 0 means + append at the end, 1 means insert at the start; default is + append for addheader(), but insert for start*body(), which use + it to determine where the Content-type header goes. + + """ + + def __init__(self, fp, http_hdrs=None): + self._http_hdrs = http_hdrs + self._fp = fp + self._headers = [] + self._boundary = [] + self._first_part = True + + def addheader(self, key, value, prefix=0, + add_to_http_hdrs=0): + """ + prefix is ignored if add_to_http_hdrs is true. 
+ """ + lines = value.split("\r\n") + while lines and not lines[-1]: del lines[-1] + while lines and not lines[0]: del lines[0] + if add_to_http_hdrs: + value = "".join(lines) + # 2.2 urllib2 doesn't normalize header case + self._http_hdrs.append((key.capitalize(), value)) + else: + for i in range(1, len(lines)): + lines[i] = " " + lines[i].strip() + value = "\r\n".join(lines) + "\r\n" + line = key.title() + ": " + value + if prefix: + self._headers.insert(0, line) + else: + self._headers.append(line) + + def flushheaders(self): + self._fp.writelines(self._headers) + self._headers = [] + + def startbody(self, ctype=None, plist=[], prefix=1, + add_to_http_hdrs=0, content_type=1): + """ + prefix is ignored if add_to_http_hdrs is true. + """ + if content_type and ctype: + for name, value in plist: + ctype = ctype + ';\r\n %s=%s' % (name, value) + self.addheader("Content-Type", ctype, prefix=prefix, + add_to_http_hdrs=add_to_http_hdrs) + self.flushheaders() + if not add_to_http_hdrs: self._fp.write("\r\n") + self._first_part = True + return self._fp + + def startmultipartbody(self, subtype, boundary=None, plist=[], prefix=1, + add_to_http_hdrs=0, content_type=1): + boundary = boundary or choose_boundary() + self._boundary.append(boundary) + return self.startbody("multipart/" + subtype, + [("boundary", boundary)] + plist, + prefix=prefix, + add_to_http_hdrs=add_to_http_hdrs, + content_type=content_type) + + def nextpart(self): + boundary = self._boundary[-1] + if self._first_part: + self._first_part = False + else: + self._fp.write("\r\n") + self._fp.write("--" + boundary + "\r\n") + return self.__class__(self._fp) + + def lastpart(self): + if self._first_part: + self.nextpart() + boundary = self._boundary.pop() + self._fp.write("\r\n--" + boundary + "--\r\n") + + +class LocateError(ValueError): pass +class AmbiguityError(LocateError): pass +class ControlNotFoundError(LocateError): pass +class ItemNotFoundError(LocateError): pass + +class ItemCountError(ValueError): 
pass + +# for backwards compatibility, ParseError derives from exceptions that were +# raised by versions of ClientForm <= 0.2.5 +# TODO: move to _html +class ParseError(sgmllib.SGMLParseError, + HTMLParser.HTMLParseError): + + def __init__(self, *args, **kwds): + Exception.__init__(self, *args, **kwds) + + def __str__(self): + return Exception.__str__(self) + + +class _AbstractFormParser: + """forms attribute contains HTMLForm instances on completion.""" + # thanks to Moshe Zadka for an example of sgmllib/htmllib usage + def __init__(self, entitydefs=None, encoding=DEFAULT_ENCODING): + if entitydefs is None: + entitydefs = get_entitydefs() + self._entitydefs = entitydefs + self._encoding = encoding + + self.base = None + self.forms = [] + self.labels = [] + self._current_label = None + self._current_form = None + self._select = None + self._optgroup = None + self._option = None + self._textarea = None + + # forms[0] will contain all controls that are outside of any form + # self._global_form is an alias for self.forms[0] + self._global_form = None + self.start_form([]) + self.end_form() + self._current_form = self._global_form = self.forms[0] + + def do_base(self, attrs): + debug("%s", attrs) + for key, value in attrs: + if key == "href": + self.base = self.unescape_attr_if_required(value) + + def end_body(self): + debug("") + if self._current_label is not None: + self.end_label() + if self._current_form is not self._global_form: + self.end_form() + + def start_form(self, attrs): + debug("%s", attrs) + if self._current_form is not self._global_form: + raise ParseError("nested FORMs") + name = None + action = None + enctype = "application/x-www-form-urlencoded" + method = "GET" + d = {} + for key, value in attrs: + if key == "name": + name = self.unescape_attr_if_required(value) + elif key == "action": + action = self.unescape_attr_if_required(value) + elif key == "method": + method = self.unescape_attr_if_required(value.upper()) + elif key == "enctype": + enctype 
= self.unescape_attr_if_required(value.lower()) + d[key] = self.unescape_attr_if_required(value) + controls = [] + self._current_form = (name, action, method, enctype), d, controls + + def end_form(self): + debug("") + if self._current_label is not None: + self.end_label() + if self._current_form is self._global_form: + raise ParseError("end of FORM before start") + self.forms.append(self._current_form) + self._current_form = self._global_form + + def start_select(self, attrs): + debug("%s", attrs) + if self._select is not None: + raise ParseError("nested SELECTs") + if self._textarea is not None: + raise ParseError("SELECT inside TEXTAREA") + d = {} + for key, val in attrs: + d[key] = self.unescape_attr_if_required(val) + + self._select = d + self._add_label(d) + + self._append_select_control({"__select": d}) + + def end_select(self): + debug("") + if self._select is None: + raise ParseError("end of SELECT before start") + + if self._option is not None: + self._end_option() + + self._select = None + + def start_optgroup(self, attrs): + debug("%s", attrs) + if self._select is None: + raise ParseError("OPTGROUP outside of SELECT") + d = {} + for key, val in attrs: + d[key] = self.unescape_attr_if_required(val) + + self._optgroup = d + + def end_optgroup(self): + debug("") + if self._optgroup is None: + raise ParseError("end of OPTGROUP before start") + self._optgroup = None + + def _start_option(self, attrs): + debug("%s", attrs) + if self._select is None: + raise ParseError("OPTION outside of SELECT") + if self._option is not None: + self._end_option() + + d = {} + for key, val in attrs: + d[key] = self.unescape_attr_if_required(val) + + self._option = {} + self._option.update(d) + if (self._optgroup and self._optgroup.has_key("disabled") and + not self._option.has_key("disabled")): + self._option["disabled"] = None + + def _end_option(self): + debug("") + if self._option is None: + raise ParseError("end of OPTION before start") + + contents = 
self._option.get("contents", "").strip() + self._option["contents"] = contents + if not self._option.has_key("value"): + self._option["value"] = contents + if not self._option.has_key("label"): + self._option["label"] = contents + # stuff dict of SELECT HTML attrs into a special private key + # (gets deleted again later) + self._option["__select"] = self._select + self._append_select_control(self._option) + self._option = None + + def _append_select_control(self, attrs): + debug("%s", attrs) + controls = self._current_form[2] + name = self._select.get("name") + controls.append(("select", name, attrs)) + + def start_textarea(self, attrs): + debug("%s", attrs) + if self._textarea is not None: + raise ParseError("nested TEXTAREAs") + if self._select is not None: + raise ParseError("TEXTAREA inside SELECT") + d = {} + for key, val in attrs: + d[key] = self.unescape_attr_if_required(val) + self._add_label(d) + + self._textarea = d + + def end_textarea(self): + debug("") + if self._textarea is None: + raise ParseError("end of TEXTAREA before start") + controls = self._current_form[2] + name = self._textarea.get("name") + controls.append(("textarea", name, self._textarea)) + self._textarea = None + + def start_label(self, attrs): + debug("%s", attrs) + if self._current_label: + self.end_label() + d = {} + for key, val in attrs: + d[key] = self.unescape_attr_if_required(val) + taken = bool(d.get("for")) # empty id is invalid + d["__text"] = "" + d["__taken"] = taken + if taken: + self.labels.append(d) + self._current_label = d + + def end_label(self): + debug("") + label = self._current_label + if label is None: + # something is ugly in the HTML, but we're ignoring it + return + self._current_label = None + # if it is staying around, it is True in all cases + del label["__taken"] + + def _add_label(self, d): + #debug("%s", d) + if self._current_label is not None: + if not self._current_label["__taken"]: + self._current_label["__taken"] = True + d["__label"] = 
self._current_label + + def handle_data(self, data): + debug("%s", data) + + if self._option is not None: + # self._option is a dictionary of the OPTION element's HTML + # attributes, but it has two special keys, one of which is the + # special "contents" key contains text between OPTION tags (the + # other is the "__select" key: see the end_option method) + map = self._option + key = "contents" + elif self._textarea is not None: + map = self._textarea + key = "value" + data = normalize_line_endings(data) + # not if within option or textarea + elif self._current_label is not None: + map = self._current_label + key = "__text" + else: + return + + if data and not map.has_key(key): + # according to + # http://www.w3.org/TR/html4/appendix/notes.html#h-B.3.1 line break + # immediately after start tags or immediately before end tags must + # be ignored, but real browsers only ignore a line break after a + # start tag, so we'll do that. + if data[0:2] == "\r\n": + data = data[2:] + elif data[0:1] in ["\n", "\r"]: + data = data[1:] + map[key] = data + else: + map[key] = map[key] + data + + def do_button(self, attrs): + debug("%s", attrs) + d = {} + d["type"] = "submit" # default + for key, val in attrs: + d[key] = self.unescape_attr_if_required(val) + controls = self._current_form[2] + + type = d["type"] + name = d.get("name") + # we don't want to lose information, so use a type string that + # doesn't clash with INPUT TYPE={SUBMIT,RESET,BUTTON} + # e.g. 
type for BUTTON/RESET is "resetbutton" + # (type for INPUT/RESET is "reset") + type = type+"button" + self._add_label(d) + controls.append((type, name, d)) + + def do_input(self, attrs): + debug("%s", attrs) + d = {} + d["type"] = "text" # default + for key, val in attrs: + d[key] = self.unescape_attr_if_required(val) + controls = self._current_form[2] + + type = d["type"] + name = d.get("name") + self._add_label(d) + controls.append((type, name, d)) + + def do_isindex(self, attrs): + debug("%s", attrs) + d = {} + for key, val in attrs: + d[key] = self.unescape_attr_if_required(val) + controls = self._current_form[2] + + self._add_label(d) + # isindex doesn't have type or name HTML attributes + controls.append(("isindex", None, d)) + + def handle_entityref(self, name): + #debug("%s", name) + self.handle_data(unescape( + '&%s;' % name, self._entitydefs, self._encoding)) + + def handle_charref(self, name): + #debug("%s", name) + self.handle_data(unescape_charref(name, self._encoding)) + + def unescape_attr(self, name): + #debug("%s", name) + return unescape(name, self._entitydefs, self._encoding) + + def unescape_attrs(self, attrs): + #debug("%s", attrs) + escaped_attrs = {} + for key, val in attrs.items(): + try: + val.items + except AttributeError: + escaped_attrs[key] = self.unescape_attr(val) + else: + # e.g. "__select" -- yuck! + escaped_attrs[key] = self.unescape_attrs(val) + return escaped_attrs + + def unknown_entityref(self, ref): self.handle_data("&%s;" % ref) + def unknown_charref(self, ref): self.handle_data("&#%s;" % ref) + + +class XHTMLCompatibleFormParser(_AbstractFormParser, HTMLParser.HTMLParser): + """Good for XHTML, bad for tolerance of incorrect HTML.""" + # thanks to Michael Howitz for this! 
+ def __init__(self, entitydefs=None, encoding=DEFAULT_ENCODING): + HTMLParser.HTMLParser.__init__(self) + _AbstractFormParser.__init__(self, entitydefs, encoding) + + def feed(self, data): + try: + HTMLParser.HTMLParser.feed(self, data) + except HTMLParser.HTMLParseError, exc: + raise ParseError(exc) + + def start_option(self, attrs): + _AbstractFormParser._start_option(self, attrs) + + def end_option(self): + _AbstractFormParser._end_option(self) + + def handle_starttag(self, tag, attrs): + try: + method = getattr(self, "start_" + tag) + except AttributeError: + try: + method = getattr(self, "do_" + tag) + except AttributeError: + pass # unknown tag + else: + method(attrs) + else: + method(attrs) + + def handle_endtag(self, tag): + try: + method = getattr(self, "end_" + tag) + except AttributeError: + pass # unknown tag + else: + method() + + def unescape(self, name): + # Use the entitydefs passed into constructor, not + # HTMLParser.HTMLParser's entitydefs. + return self.unescape_attr(name) + + def unescape_attr_if_required(self, name): + return name # HTMLParser.HTMLParser already did it + def unescape_attrs_if_required(self, attrs): + return attrs # ditto + + def close(self): + HTMLParser.HTMLParser.close(self) + self.end_body() + + +class _AbstractSgmllibParser(_AbstractFormParser): + + def do_option(self, attrs): + _AbstractFormParser._start_option(self, attrs) + + # we override this attr to decode hex charrefs + entity_or_charref = re.compile( + '&(?:([a-zA-Z][-.a-zA-Z0-9]*)|#(x?[0-9a-fA-F]+))(;?)') + def convert_entityref(self, name): + return unescape("&%s;" % name, self._entitydefs, self._encoding) + def convert_charref(self, name): + return unescape_charref("%s" % name, self._encoding) + def unescape_attr_if_required(self, name): + return name # sgmllib already did it + def unescape_attrs_if_required(self, attrs): + return attrs # ditto + + +class FormParser(_AbstractSgmllibParser, _sgmllib_copy.SGMLParser): + """Good for tolerance of incorrect HTML, 
bad for XHTML.""" + def __init__(self, entitydefs=None, encoding=DEFAULT_ENCODING): + _sgmllib_copy.SGMLParser.__init__(self) + _AbstractFormParser.__init__(self, entitydefs, encoding) + + def feed(self, data): + try: + _sgmllib_copy.SGMLParser.feed(self, data) + except _sgmllib_copy.SGMLParseError, exc: + raise ParseError(exc) + + def close(self): + _sgmllib_copy.SGMLParser.close(self) + self.end_body() + + +class _AbstractBSFormParser(_AbstractSgmllibParser): + + bs_base_class = None + + def __init__(self, entitydefs=None, encoding=DEFAULT_ENCODING): + _AbstractFormParser.__init__(self, entitydefs, encoding) + self.bs_base_class.__init__(self) + + def handle_data(self, data): + _AbstractFormParser.handle_data(self, data) + self.bs_base_class.handle_data(self, data) + + def feed(self, data): + try: + self.bs_base_class.feed(self, data) + except _sgmllib_copy.SGMLParseError, exc: + raise ParseError(exc) + + def close(self): + self.bs_base_class.close(self) + self.end_body() + + +class RobustFormParser(_AbstractBSFormParser, _beautifulsoup.BeautifulSoup): + + """Tries to be highly tolerant of incorrect HTML.""" + + bs_base_class = _beautifulsoup.BeautifulSoup + + +class NestingRobustFormParser(_AbstractBSFormParser, + _beautifulsoup.ICantBelieveItsBeautifulSoup): + + """Tries to be highly tolerant of incorrect HTML. + + Different from RobustFormParser in that it more often guesses nesting + above missing end tags (see BeautifulSoup docs). + """ + + bs_base_class = _beautifulsoup.ICantBelieveItsBeautifulSoup + + +#FormParser = XHTMLCompatibleFormParser # testing hack +#FormParser = RobustFormParser # testing hack + + +def ParseResponseEx(response, + select_default=False, + form_parser_class=FormParser, + request_class=_request.Request, + entitydefs=None, + encoding=DEFAULT_ENCODING, + + # private + _urljoin=urlparse.urljoin, + _urlparse=urlparse.urlparse, + _urlunparse=urlparse.urlunparse, + ): + """Identical to ParseResponse, except that: + + 1. 
The returned list contains an extra item. The first form in the list + contains all controls not contained in any FORM element. + + 2. The arguments ignore_errors and backwards_compat have been removed. + + 3. Backwards-compatibility mode (backwards_compat=True) is not available. + """ + return _ParseFileEx(response, response.geturl(), + select_default, + False, + form_parser_class, + request_class, + entitydefs, + False, + encoding, + _urljoin=_urljoin, + _urlparse=_urlparse, + _urlunparse=_urlunparse, + ) + +def ParseFileEx(file, base_uri, + select_default=False, + form_parser_class=FormParser, + request_class=_request.Request, + entitydefs=None, + encoding=DEFAULT_ENCODING, + + # private + _urljoin=urlparse.urljoin, + _urlparse=urlparse.urlparse, + _urlunparse=urlparse.urlunparse, + ): + """Identical to ParseFile, except that: + + 1. The returned list contains an extra item. The first form in the list + contains all controls not contained in any FORM element. + + 2. The arguments ignore_errors and backwards_compat have been removed. + + 3. Backwards-compatibility mode (backwards_compat=True) is not available. + """ + return _ParseFileEx(file, base_uri, + select_default, + False, + form_parser_class, + request_class, + entitydefs, + False, + encoding, + _urljoin=_urljoin, + _urlparse=_urlparse, + _urlunparse=_urlunparse, + ) + +def ParseString(text, base_uri, *args, **kwds): + fh = StringIO(text) + return ParseFileEx(fh, base_uri, *args, **kwds) + +def ParseResponse(response, *args, **kwds): + """Parse HTTP response and return a list of HTMLForm instances. + + The return value of mechanize.urlopen can be conveniently passed to this + function as the response parameter. + + mechanize.ParseError is raised on parse errors. 
+ + response: file-like object (supporting read() method) with a method + geturl(), returning the URI of the HTTP response + select_default: for multiple-selection SELECT controls and RADIO controls, + pick the first item as the default if none are selected in the HTML + form_parser_class: class to instantiate and use to pass + request_class: class to return from .click() method (default is + mechanize.Request) + entitydefs: mapping like {"&": "&", ...} containing HTML entity + definitions (a sensible default is used) + encoding: character encoding used for encoding numeric character references + when matching link text. mechanize does not attempt to find the encoding + in a META HTTP-EQUIV attribute in the document itself (mechanize, for + example, does do that and will pass the correct value to mechanize using + this parameter). + + backwards_compat: boolean that determines whether the returned HTMLForm + objects are backwards-compatible with old code. If backwards_compat is + true: + + - ClientForm 0.1 code will continue to work as before. + + - Label searches that do not specify a nr (number or count) will always + get the first match, even if other controls match. If + backwards_compat is False, label searches that have ambiguous results + will raise an AmbiguityError. + + - Item label matching is done by strict string comparison rather than + substring matching. + + - De-selecting individual list items is allowed even if the Item is + disabled. + + The backwards_compat argument will be removed in a future release. + + Pass a true value for select_default if you want the behaviour specified by + RFC 1866 (the HTML 2.0 standard), which is to select the first item in a + RADIO or multiple-selection SELECT control if none were selected in the + HTML. Most browsers (including Microsoft Internet Explorer (IE) and + Netscape Navigator) instead leave all items unselected in these cases. 
The + W3C HTML 4.0 standard leaves this behaviour undefined in the case of + multiple-selection SELECT controls, but insists that at least one RADIO + button should be checked at all times, in contradiction to browser + behaviour. + + There is a choice of parsers. mechanize.XHTMLCompatibleFormParser (uses + HTMLParser.HTMLParser) works best for XHTML, mechanize.FormParser (uses + bundled copy of sgmllib.SGMLParser) (the default) works better for ordinary + grubby HTML. Note that HTMLParser is only available in Python 2.2 and + later. You can pass your own class in here as a hack to work around bad + HTML, but at your own risk: there is no well-defined interface. + + """ + return _ParseFileEx(response, response.geturl(), *args, **kwds)[1:] + +def ParseFile(file, base_uri, *args, **kwds): + """Parse HTML and return a list of HTMLForm instances. + + mechanize.ParseError is raised on parse errors. + + file: file-like object (supporting read() method) containing HTML with zero + or more forms to be parsed + base_uri: the URI of the document (note that the base URI used to submit + the form will be that given in the BASE element if present, not that of + the document) + + For the other arguments and further details, see ParseResponse.__doc__. 
+ + """ + return _ParseFileEx(file, base_uri, *args, **kwds)[1:] + +def _ParseFileEx(file, base_uri, + select_default=False, + ignore_errors=False, + form_parser_class=FormParser, + request_class=_request.Request, + entitydefs=None, + backwards_compat=True, + encoding=DEFAULT_ENCODING, + _urljoin=urlparse.urljoin, + _urlparse=urlparse.urlparse, + _urlunparse=urlparse.urlunparse, + ): + if backwards_compat: + deprecation("operating in backwards-compatibility mode", 1) + fp = form_parser_class(entitydefs, encoding) + while 1: + data = file.read(CHUNK) + try: + fp.feed(data) + except ParseError, e: + e.base_uri = base_uri + raise + if len(data) != CHUNK: break + fp.close() + if fp.base is not None: + # HTML BASE element takes precedence over document URI + base_uri = fp.base + labels = [] # Label(label) for label in fp.labels] + id_to_labels = {} + for l in fp.labels: + label = Label(l) + labels.append(label) + for_id = l["for"] + coll = id_to_labels.get(for_id) + if coll is None: + id_to_labels[for_id] = [label] + else: + coll.append(label) + forms = [] + for (name, action, method, enctype), attrs, controls in fp.forms: + if action is None: + action = base_uri + else: + action = _urljoin(base_uri, action) + # would be nice to make HTMLForm class (form builder) pluggable + form = HTMLForm( + action, method, enctype, name, attrs, request_class, + forms, labels, id_to_labels, backwards_compat) + form._urlparse = _urlparse + form._urlunparse = _urlunparse + for ii in range(len(controls)): + type, name, attrs = controls[ii] + # index=ii*10 allows ImageControl to return multiple ordered pairs + form.new_control( + type, name, attrs, select_default=select_default, index=ii*10) + forms.append(form) + for form in forms: + form.fixup() + return forms + + +class Label: + def __init__(self, attrs): + self.id = attrs.get("for") + self._text = attrs.get("__text").strip() + self._ctext = compress_text(self._text) + self.attrs = attrs + self._backwards_compat = False # maintained by 
HTMLForm + + def __getattr__(self, name): + if name == "text": + if self._backwards_compat: + return self._text + else: + return self._ctext + return getattr(Label, name) + + def __setattr__(self, name, value): + if name == "text": + # don't see any need for this, so make it read-only + raise AttributeError("text attribute is read-only") + self.__dict__[name] = value + + def __str__(self): + return "" % (self.id, self.text) + + +def _get_label(attrs): + text = attrs.get("__label") + if text is not None: + return Label(text) + else: + return None + +class Control: + """An HTML form control. + + An HTMLForm contains a sequence of Controls. The Controls in an HTMLForm + are accessed using the HTMLForm.find_control method or the + HTMLForm.controls attribute. + + Control instances are usually constructed using the ParseFile / + ParseResponse functions. If you use those functions, you can ignore the + rest of this paragraph. A Control is only properly initialised after the + fixup method has been called. In fact, this is only strictly necessary for + ListControl instances. This is necessary because ListControls are built up + from ListControls each containing only a single item, and their initial + value(s) can only be known after the sequence is complete. + + The types and values that are acceptable for assignment to the value + attribute are defined by subclasses. + + If the disabled attribute is true, this represents the state typically + represented by browsers by 'greying out' a control. If the disabled + attribute is true, the Control will raise AttributeError if an attempt is + made to change its value. In addition, the control will not be considered + 'successful' as defined by the W3C HTML 4 standard -- ie. it will + contribute no data to the return value of the HTMLForm.click* methods. To + enable a control, set the disabled attribute to a false value. 
+ + If the readonly attribute is true, the Control will raise AttributeError if + an attempt is made to change its value. To make a control writable, set + the readonly attribute to a false value. + + All controls have the disabled and readonly attributes, not only those that + may have the HTML attributes of the same names. + + On assignment to the value attribute, the following exceptions are raised: + TypeError, AttributeError (if the value attribute should not be assigned + to, because the control is disabled, for example) and ValueError. + + If the name or value attributes are None, or the value is an empty list, or + if the control is disabled, the control is not successful. + + Public attributes: + + type: string describing type of control (see the keys of the + HTMLForm.type2class dictionary for the allowable values) (readonly) + name: name of control (readonly) + value: current value of control (subclasses may allow a single value, a + sequence of values, or either) + disabled: disabled state + readonly: readonly state + id: value of id HTML attribute + + """ + def __init__(self, type, name, attrs, index=None): + """ + type: string describing type of control (see the keys of the + HTMLForm.type2class dictionary for the allowable values) + name: control name + attrs: HTML attributes of control's HTML element + + """ + raise NotImplementedError() + + def add_to_form(self, form): + self._form = form + form.controls.append(self) + + def fixup(self): + pass + + def is_of_kind(self, kind): + raise NotImplementedError() + + def clear(self): + raise NotImplementedError() + + def __getattr__(self, name): raise NotImplementedError() + def __setattr__(self, name, value): raise NotImplementedError() + + def pairs(self): + """Return list of (key, value) pairs suitable for passing to urlencode. + """ + return [(k, v) for (i, k, v) in self._totally_ordered_pairs()] + + def _totally_ordered_pairs(self): + """Return list of (key, value, index) tuples. 
+ + Like pairs, but allows preserving correct ordering even where several + controls are involved. + + """ + raise NotImplementedError() + + def _write_mime_data(self, mw, name, value): + """Write data for a subitem of this control to a MimeWriter.""" + # called by HTMLForm + mw2 = mw.nextpart() + mw2.addheader("Content-Disposition", + 'form-data; name="%s"' % name, 1) + f = mw2.startbody(prefix=0) + f.write(value) + + def __str__(self): + raise NotImplementedError() + + def get_labels(self): + """Return all labels (Label instances) for this control. + + If the control was surrounded by a