
K!Fc           @   s  d  Z  d d g Z d g Z d Z d Z d Z d Z d d g Z d	 d
 k Z d	 d
 k	 Z	 d	 d
 k
 Z
 d	 d
 k Z y< d	 d
 k Z e o d	 d
 k Z d e i _ n d   Z Wn d
 Z d   Z n Xd e i f d     YZ d e f d     YZ d d
 d
 e d  Z h d d 6d d 6d d 6d d 6d d 6Z h d d 6d d 6d d  6d d! 6d d" 6d# d$ 6Z d
 a d%   Z d&   Z e d'  Z e d
 d(  Z d
 S()   s7   
sanitize: bringing sanitiy to world of messed-up data
s'   Mark Pilgrim <http://diveintomark.org/>s&   Aaron Swartz <http://www.aaronsw.com/>s#   Sam Ruby <http://intertwingly.net/>t   BSDs   0.33i    t   uTidyt   mxTidyiNi   c         C   s   t  i |   d  S(   t   encoding(   t   chardett   detect(   t   data(    (    s   sanitize.pyt   <lambda>    s    c         C   s   d  S(   N(   t   None(   R   (    (    s   sanitize.pyR   #   s    t   _BaseHTMLProcessorc           B   s  e  Z d  d d d d d d d d d	 d
 d d g Z e i d e i  Z e i d  Z e i d  Z d   Z	 d   Z
 d   Z d   Z d   Z d   Z d   Z d   Z d   Z d   Z d   Z d   Z d   Z e i d  i Z d   Z d   Z RS(    t   areat   baset   basefontt   brt   colt   framet   hrt   imgt   inputt   isindext   linkt   metat   params   <!((?!DOCTYPE|--|\[))s   &(?!#\d+;|#x[0-9a-fA-F]+;|\w+;)s   <([^<\s]+?)\s*/>c         C   s?   | |  _  t o t i i d |  i   n t i i |   d  S(   Ns(   entering BaseHTMLProcessor, encoding=%s
(   R   t   _debugt   syst   stderrt   writet   sgmllibt
   SGMLParsert   __init__(   t   selfR   (    (    s   sanitize.pyR   -   s    	 c         C   s   g  |  _  t i i |   d  S(   N(   t   piecesR   R   t   reset(   R   (    (    s   sanitize.pyR    2   s    	c         C   sD   | i  d  } | |  i j o d | d Sd | d | d Sd  S(   Ni   t   <s    />s   ></t   >(   t   groupt   elements_no_end_tag(   R   t   matcht   tag(    (    s   sanitize.pyt   _shorttag_replace6   s    c         C   s   t  o t i i d t |   n |  i i d |  } |  i i d |  } |  i i |  i	 |  } |  i
 o/ t |  t d  j o | i |  i
  } n t i i |  |  d  S(   Ns"   _BaseHTMLProcessor, feed, data=%s
s   &lt;!\1s   &amp;u    (   R   R   R   R   t   reprt   _r_barebangt   subt
   _r_bareampt   _r_shorttagR'   R   t   typet   encodeR   R   t   feed(   R   R   (    (    s   sanitize.pyR/   =   s     #c      	   C   s~   g  } | D] \ } } | | i    | f q ~ } g  } | D]3 \ } } | | | d j o | i    p | f q> ~ } | S(   Nt   relR-   (   R0   s   type(   t   lower(   R   t   attrst   _[1]t   kt   vt   _[2](    (    s   sanitize.pyt   normalize_attrsF   s    3Gc            sB  t  o t i i d |  n   f d   } g  } x | D]} \ } } g  } xL | D]D } t |  d j o d t t |   d } n | i |  qT W| i | d i |  f  q; Wd i g  }	 | D]# \ } } |	 d | | |  f q ~	  }
 |   i j o   i	 i d t
    n   i	 i d	 t
    d  S(
   Ns-   _BaseHTMLProcessor, unknown_starttag, tag=%s
c            s+     i  i d |   }  |  i d d  }  |  S(   Ns   &amp;t   "s   &quot;(   R+   R*   t   replace(   R   (   R   (    s   sanitize.pyt	   attrquoteR   s    i   s   &#t   ;t    s    %s="%s"s   <%(tag)s%(strattrs)s />s   <%(tag)s%(strattrs)s>(   R   R   R   R   t   ordt   strt   appendt   joinR$   R   t   locals(   R   R&   R2   R:   t   newattrst   keyt   valuet   newvaluet   cR3   t   strattrs(    (   R   s   sanitize.pyt   unknown_starttagL   s"        @c         C   s/   | |  i  j o |  i i d t    n d  S(   Ns
   </%(tag)s>(   R$   R   R?   RA   (   R   R&   (    (    s   sanitize.pyt   unknown_endtagg   s    c         C   s   |  i  i d t    d  S(   Ns
   &#%(ref)s;(   R   R?   RA   (   R   t   ref(    (    s   sanitize.pyt   handle_charrefm   s    c         C   s   |  i  i d t    d  S(   Ns	   &%(ref)s;(   R   R?   RA   (   R   RJ   (    (    s   sanitize.pyt   handle_entityrefr   s    c         C   s3   t  o t i i d |  n |  i i |  d  S(   Ns)   _BaseHTMLProcessor, handle_text, text=%s
(   R   R   R   R   R   R?   (   R   t   text(    (    s   sanitize.pyt   handle_dataw   s     c         C   s   |  i  i d t    d  S(   Ns   <!--%(text)s-->(   R   R?   RA   (   R   RM   (    (    s   sanitize.pyt   handle_comment~   s    c         C   s   |  i  i d t    d  S(   Ns   <?%(text)s>(   R   R?   RA   (   R   RM   (    (    s   sanitize.pyt	   handle_pi   s    c         C   s   |  i  i d t    d  S(   Ns   <!%(text)s>(   R   R?   RA   (   R   RM   (    (    s   sanitize.pyt   handle_decl   s    s   [a-zA-Z][-_.a-zA-Z0-9:]*\s*c         C   s   |  i  } t |  } | | j o d S|  i | |  } | oK | i   } | i   } | t |  | j o d S| i   | i   f S|  i |  d Sd  S(   Ni(   Ni(   Ni(   Ni(	   t   rawdatat   lenR   t   _new_declname_matchR#   t   stripR1   t   endRN   (   R   t   it   declstartposRR   t   nt   mt   st   name(    (    s   sanitize.pyt
   _scan_name   s    	c         C   s   d i  |  i  S(   s(   Return processed HTML as a single stringR<   (   R@   R   (   R   (    (    s   sanitize.pyt   output   s    (   t   __name__t
   __module__R$   t   ret   compilet
   IGNORECASER)   R+   R,   R   R    R'   R/   R7   RH   RI   RK   RL   RN   RO   RP   RQ   R%   RT   R]   R^   (    (    (    s   sanitize.pyR	   %   s*   															t   _HTMLSanitizerc        G   B   s  e  Z d  d d d d d d d d d	 d
 d d d d d d d d d d d d d d d d d d d d d d  d! d" d# d$ d% d& d' d( d) d* d+ d, d- d. d/ d0 d1 d2 d3 d4 d5 d6 d7 d8 d9 d: d; d< d= d> d? d@ dA dB dC dD dE dF gG Z d dG dH dI dJ dK dL dM dN dO dP dQ dR dS dT d dU dV dW dX dY dZ d[ d\ d d] d^ d_ d` da db dc dd de df dg d' dh di dj dk dl dm dn do dp dq dr ds dt du dv dw dx dy dz d{ d| d6 d} d~ d d d d d d d d d d gG Z d d d d d d d d d d d d d d d d d d d d d d d d d d d d d d d d d d d d d d d g' Z d d d g Z d d d d d d d d d d d d d d d d d d d d d d d d d g Z d d  Z d   Z	 d   Z
 d   Z d   Z d   Z d   Z d   Z d   Z RS(   t   at   abbrt   acronymt   addressR
   t   bt   bigt
   blockquoteR   t   buttont   captiont   centert   citet   codeR   t   colgroupt   ddt   delt   dfnt   dirt   divt   dlt   dtt   emt   fieldsett   fontt   formt   h1t   h2t   h3t   h4t   h5t   h6R   RW   R   R   t   inst   kbdt   labelt   legendt   lit   mapt   menut   olt   optgroupt   optiont   pt   pret   qR[   t   sampt   selectt   smallt   spant   striket   strongR*   t   supt   tablet   textareat   tbodyt   tdt   tfoott   tht   theadt   trt   ttt   ut   ult   vart   accepts   accept-charsett	   accesskeyt   actiont   alignt   altt   axist   bordert   cellpaddingt   cellspacingt   chart   charofft   charsett   checkedt   classt   cleart   colst   colspant   colort   compactt   coordst   datetimet   disabledt   enctypet   forR   t   headerst   heightt   hreft   hreflangt   hspacet   idt   ismapt   langt   longdesct	   maxlengtht   mediat   methodt   multipleR\   t   nohreft   noshadet   nowrapt   promptt   readonlyR0   t   revt   rowst   rowspant   rulest   scopet   selectedt   shapet   sizet   srct   startt   summaryt   tabindext   targett   titleR-   t   usemapt   valignRD   t   vspacet   widtht   cidt   cridR   t   davt   dictt   dnst   faxt   ftpt   got   gophert   h323t   httpt   httpst   imt   imapt   infot   ipps	   iris.beept   ldapt   mailtot   midt   modemt   newst   nfst   nntpt   prest   rtspt   sipt   sipst   snmpR&   t   telt   telnett   tftpt   urnt   aimt   ircR/   t   webcalt   scriptt   applett   stylet   codebaset   bodyt
   backgroundt   iframet   headt   profileR   t   objectt   classidc         C   s/   t  i |  |  | |  _ | |  _ h  t _ d  S(   N(   R	   R   t   baseurit   required_attributest   urlparset   _parse_cache(   R   R  R   R  (    (    s   sanitize.pyR      s    		c         C   sm   d | j o: | i  d d  \ } } | |  i j o d | } qG n |  i o t i |  i |  S| Sd  S(   Nt   :i   t   #(   t   splitt   acceptable_uri_schemesR  R  t   urljoin(   R   t   urit   schemet   rest(    (    s   sanitize.pyt
   resolveURI   s    
c         C   s#   t  i |   g  |  _ d |  _ d  S(   Ni    (   R	   R    t	   tag_stackt   ignore_level(   R   (    (    s   sanitize.pyR       s    	c         C   s?   t  i |  |  x( |  i o t  i |  |  i i    q Wd  S(   N(   R	   R/   R  RI   t   pop(   R   R   (    (    s   sanitize.pyR/      s     
c         C   s  | |  i  j o |  i d 7_ d  S|  i o d  S| |  i j os|  i |  } g  } | D]- \ } } | |  i j o | | | f q] q] ~ } g  } | D]? \ } } | | | | f |  i j o |  i |  p | f q ~ } |  i o | |  i j o{ g  } | D]R \ } } | g  } |  i | D] \ }	 }
 | |	 q,~ j o | | | f qq~ } | |  i | 7} n | |  i j o |  i	 i
 |  n t i |  | |  n d  S(   Ni   (   t   ignorable_elementsR  t   acceptable_elementsR7   t   acceptable_attributest   relative_urisR  R  R$   R  R?   R	   RH   (   R   R&   R2   R3   RC   RD   R6   t   _[3]t   _[4]R4   R5   (    (    s   sanitize.pyRH     s    
ASfc         C   s   | |  i  j o |  i d 8_ d  S|  i o d  S| |  i j o~ | |  i j on t } xF |  i o; |  i i   } | | j o t } Pn t i	 |  |  q\ W| o t i	 |  |  q n d  S(   Ni   (
   R  R  R  R$   t   FalseR  R  t   TrueR	   RI   (   R   R&   R%   t   top(    (    s   sanitize.pyRI     s     
  
c         C   s   d  S(   N(    (   R   RM   (    (    s   sanitize.pyRP   ,  s    c         C   s   d  S(   N(    (   R   RM   (    (    s   sanitize.pyRQ   /  s    c         C   s4   |  i  p& | i d d  } t i |  |  n d  S(   NR!   R<   (   R  R9   R	   RN   (   R   RM   (    (    s   sanitize.pyRN   2  s    
(   Re   R   (   R  R  (   s   areaR   (   Rk   Ro   (   R  R  (   Rs   Ro   (   R|   s   action(   s   frameR   (   s   frameR   (   R  R   (   R  R   (   s   headR
  (   s   imgR   (   s   imgR   (   s   imgR   (   s   inputR   (   s   inputR   (   R   Ro   (   s   linkR   (   s   objectR  (   s   objectR  (   s   objects   data(   s   objectR   (   R   Ro   (   R  R   N(   R_   R`   R  R  R  R  R   R   R   R  R    R/   RH   RI   RP   RQ   RN   (    (    (    s   sanitize.pyRd      sx   $	
	
						t   utf8c   
         s  | p
 h  } n | o d g | d <n t  | | |  } | i |   | i   } t od  } x~ t D]v } yf | d j o$ d d k l     f d   } Pn2 | d j o$ d d	 k l	   f d
   } Pn Wqh qh Xqh W| o t
 |  t
 d  j }	 |	 o | i d  } n | | d d d d d d d d } |	 o t | d  } n | i d  oD | i d d  d } | i d  o | i d d  d } qn | i d  o | i d d  d } qqn | i   i d d  } | S(   NR0   t   nofollowRe   R   i(   t   parseStringc            s   t    |  |   S(   N(   R>   (   R   t   kwargs(   t   _utidy(    s   sanitize.pyt   _tidyG  s    R   (   t   Tidyc            s"     i  |  |  \ } } }  } |  S(   N(   t   tidy(   R   R)  t   nerrorst	   nwarningst	   errordata(   t   _mxtidy(    s   sanitize.pyR+  L  s    u    s   utf-8t   output_xhtmli   t   numeric_entitiest   wrapi    t   char_encodingR&  s   <bodyR"   s   </bodys   
s   
(   s   relR'  (   Rd   R/   R^   t   TIDY_MARKUPR   t   PREFERRED_TIDY_INTERFACESR-  R(  t   mx.TidyR,  R-   R.   t   unicodet   countR  RU   R9   (
   t
   htmlSourceR   R  R  t   addnofollowR   R   R+  t   tidy_interfaceR&  (    (   R*  R1  s   sanitize.pyt   HTML7  sJ    
 	$"s   utf-32bet     s   utf-32les     s   utf-16bes   ##s   utf-16les   ##s   utf-8s   bft      <s   <   t    < ?s   < ? s   <?xmt   ebcdics   Loc         C   sf   t  pR d} dd  k } | i di t t t d   di t t |    a  n |  i t   S(  Ni    i   i   i   i   i	   i   i   i   i   i   i   i   i   i   i   i   i   i   i   i   i   i   i   i   i   i   i   i   i   i   i   i   i   i   i   i   i
   i   i   i   i   i   i   i   i   i   i   i   i   i   i   i   i   i   i   i   i   i   i   i   i   i   i   i    i   i   i   i   i   i   i   i   i   i[   i.   i<   i(   i+   i!   i&   i   i   i   i   i   i   i   i   i   i]   i$   i*   i)   i;   i^   i-   i/   i   i   i   i   i   i   i   i   i|   i,   i%   i_   i>   i?   i   i   i   i   i   i   i   i   i   i`   i:   i#   i@   i'   i=   i"   i   ia   ib   ic   id   ie   if   ig   ih   ii   i   i   i   i   i   i   i   ij   ik   il   im   in   io   ip   iq   ir   i   i   i   i   i   i   i   i~   is   it   iu   iv   iw   ix   iy   iz   i   i   i   i   i   i   i   i   i   i   i   i   i   i   i   i   i   i   i   i   i   i   i{   iA   iB   iC   iD   iE   iF   iG   iH   iI   i   i   i   i   i   i   i}   iJ   iK   iL   iM   iN   iO   iP   iQ   iR   i   i   i   i   i   i   i\   i   iS   iT   iU   iV   iW   iX   iY   iZ   i   i   i   i   i   i   i0   i1   i2   i3   i4   i5   i6   i7   i8   i9   i   i   i   i   i   i   iR<   i   (   i    i   i   i   i   i	   i   i   i   i   i   i   i   i   i   i   i   i   i   i   i   i   i   i   i   i   i   i   i   i   i   i   i   i   i   i   i   i
   i   i   i   i   i   i   i   i   i   i   i   i   i   i   i   i   i   i   i   i   i   i   i   i   i   i   i    i   i   i   i   i   i   i   i   i   i[   i.   i<   i(   i+   i!   i&   i   i   i   i   i   i   i   i   i   i]   i$   i*   i)   i;   i^   i-   i/   i   i   i   i   i   i   i   i   i|   i,   i%   i_   i>   i?   i   i   i   i   i   i   i   i   i   i`   i:   i#   i@   i'   i=   i"   i   ia   ib   ic   id   ie   if   ig   ih   ii   i   i   i   i   i   i   i   ij   ik   il   im   in   io   ip   iq   ir   i   i   i   i   i   i   i   i~   is   it   iu   iv   iw   ix   iy   iz   i   i   i   i   i   i   i   i   i   i   i   i   i   i   i   i   i   i   i   i   i   i   i{   iA   iB   iC   iD   iE   iF   iG   iH   iI   i   i   i   i   i   i   i}   iJ   iK   iL   iM   iN   iO   iP   iQ   iR   i   i   i   i   i   i   i\   i   iS   iT   iU   iV   iW   iX   iY   iZ   i   i   i   i   i   i   i0   i1   i2   i3   i4   i5   i6   i7   i8   i9   i   i   i   i   i   i   (   t   _ebcdic_to_ascii_mapt   stringt	   maketransR@   R   t   chrt   ranget	   translate(   R[   t   emapRD  (    (    s   sanitize.pyt   _ebcdic_to_asciis  s*                   :c         C   s^   xW t  |  D]I \ } } | d j o |  | d j o t Sq |  | | j o t Sq Wt S(   NR  t    (   t	   enumerateR#  R$  (   RM   t   bomRW   RF   (    (    s   sanitize.pyt   _startswithbom  s     		c         C   s6   x/ | i    D]! \ } } t |  |  o | Sq Wd  S(   N(   t	   iteritemsRN  R   (   RM   t   bom_mapRM  R   (    (    s   sanitize.pyt
   _detectbom  s
     	c            s   g      f d   } | |  pg | t      pT | o | t    t   p7 | t     p$ | d  p | d  p
 | d  S(   sd   
    Takes a string text of unknown encoding and tries to 
    provide a Unicode string for it.
    c            sh   |  o] |   j oP |  d j o t     Sy t   |   SWn t j
 o n X i |   n d  S(   NRB  (   RJ  R9  t   UnicodeDecodeErrorR?   (   R   (   RM   t   _triedEncodings(    s   sanitize.pyt   tryEncoding  s    R&  s   windows-1252s
   iso-8859-1(   RQ  t   xml_bom_mapt   _chardet(   RM   t   isXMLt   guessRT  (    (   RM   RS  s   sanitize.pyt
   characters  s    (   t   __doc__t
   __author__t   __contributors__t   __license__t   __version__R   R6  R7  R   Ra   R  R   R   t   chardet.constantst	   constantsRV  R   R   R	   Rd   R#  R>  t   unicode_bom_mapRU  RC  RJ  RN  RQ  RY  (    (    (    s   sanitize.pyt   <module>   sN   		0+

		
