
    Dg.                        d Z ddlZddlmZ ddlmZmZmZmZm	Z	m
Z
 ddlmZ ddlmZ ddlmZ ddlmZ  ej        d	ej                  Z ej        d
ej                  Z ej        dej                  Z ej        dej        ej        z            Z ej        dej        ej        z            Z ej        dej                  ZdZ	 	 	 d-dedee         dededef
dZ d.dedee         defdZ!	 d/dededee         defdZ" ej        dej                  Z#d.dedee         defdZ$	 	 	 d0dedee         dee         dee         def
d Z%	 d1dedee         dee         defd!Z&	 	 	 d2dedee         d#edee         def
d$Z'	 	 	 d3dedee         dedee         def
d%Z(	 d4ded&ededefd'Z)	 	 	 d5ded&eded)ee         de
e	d*         e	e*ef         f         f
d+Z+dedefd,Z,dS )6z(
Functions for dealing with markup text
    N)name2codepoint)IterableMatchOptionalPatternTupleUnion)urljoin)
StrOrBytes)safe_url_string)
to_unicodezI&((?P<named>[a-z\d]+)|#(?P<dec>\d+)|#x(?P<hex>[a-f\d]+))(?P<semicolon>;?)z<[a-zA-Z\/!].*?>z5<base\s[^>]*href\s*=\s*[\"\']\s*([^\"\'\s]+)\s*[\"\']z}<meta\s[^>]*http-equiv[^>]*refresh[^>]*content\s*=\s*(?P<quote>["\'])(?P<int>(\d*\.)?\d+)\s*;\s*url=\s*(?P<url>.*?)(?P=quote)z<meta\s[^>]*content\s*=\s*(?P<quote>["\'])(?P<int>(\d*\.)?\d+)\s*;\s*url=\s*(?P<url>.*?)(?P=quote)[^>]*?\shttp-equiv\s*=[^>]*refreshz<((?P<cdata_s><!\[CDATA\[)(?P<cdata_d>.*?)(?P<cdata_e>\]\]>))z 	
 Tutf-8textkeepremove_illegalencodingreturnc                     dt           t                   dt          ffd}t                              |t	          | |                    S )u  Remove entities from the given `text` by converting them to their
    corresponding unicode character.

    `text` can be a unicode string or a byte string encoded in the given
    `encoding` (which defaults to 'utf-8').

    If `keep` is passed (with a list of entity names) those entities will
    be kept (they won't be removed).

    It supports both numeric entities (``&#nnnn;`` and ``&#hhhh;``)
    and named entities (such as ``&nbsp;`` or ``&gt;``).

    If `remove_illegal` is ``True``, entities that can't be converted are removed.
    If `remove_illegal` is ``False``, entities that can't be converted are kept "as
    is". For more information see the tests.

    Always returns a unicode string (with the entities removed).

    >>> import w3lib.html
    >>> w3lib.html.replace_entities(b'Price: &pound;100')
    'Price: \xa3100'
    >>> print(w3lib.html.replace_entities(b'Price: &pound;100'))
    Price: £100
    >>>

    mr   c                    |                                  }d }|                    d          rt          |d         d          }n|                    d          rt          |d         d          }n|                    d          rm|d         }|                                v r|                     d          S t          j        |          p%t          j        |                                          }|Z	 d|cxk    rdk    r&n n#t          |f                              d	          S t          |          S # t          t          f$ r Y nw xY wr|                    d
          rdn|                     d          S )Ndec
   hex   namedr         cp1252	semicolon )	groupdictgetintlowergroupr   bytesdecodechr
ValueErrorOverflowError)r   groupsnumberentity_namer   r   s       ?/var/www/sysmax/venv/lib/python3.11/site-packages/w3lib/html.pyconvert_entityz(replace_entities.<locals>.convert_entityE   s   ::e 	++FFZZ 		++FFZZ   	 /K  ""d**wwqzz!'+K88 N<N%%''= = 
6))))T))))) &++228<<<v;;&.    $O

;(?(?OrrQWWQZZOs   52D7 (D7 7E
E)r   str_ent_resubr   )r   r   r   r   r0   s    ``  r/   replace_entitiesr4   $   sc    BP%* P P P P P P P P< ;;~z$'A'ABBB    c                 l    t          t                              t          | |                              S N)boolr2   searchr   )r   r   s     r/   has_entitiesr:   f   s&    z$99::;;;r5   r!   tokenc                 T    t                               |t          | |                    S )ac  Replace all markup tags found in the given `text` by the given token.
    By default `token` is an empty string so it just removes all tags.

    `text` can be a unicode string or a regular string encoded as `encoding`
    (or ``'utf-8'`` if `encoding` is not given.)

    Always returns a unicode string.

    Examples:

    >>> import w3lib.html
    >>> w3lib.html.replace_tags('This text contains <a>some tag</a>')
    'This text contains some tag'
    >>> w3lib.html.replace_tags('<p>Je ne parle pas <b>fran\xe7ais</b></p>', ' -- ', 'latin-1')
    ' -- Je ne parle pas  -- fran\xe7ais --  -- '
    >>>

    )_tag_rer3   r   )r   r;   r   s      r/   replace_tagsr>   j   s"    , ;;ujx88999r5   z<!--.*?(?:-->|$)c                 X    t          | |          }t                              d|          S )zRemove HTML Comments.

    >>> import w3lib.html
    >>> w3lib.html.remove_comments(b"test <!--textcoment--> whatever")
    'test  whatever'
    >>>

    r!   )r   _REMOVECOMMENTS_REr3   )r   r   utexts      r/   remove_commentsrB      s)     tX&&E!!"e,,,r5   
which_onesc                 j   rrt          d          d D             d D             dt          dt          ffddt          t                   dt          ffd}d	}t	          j        |t          j        t          j        z            }|                    |t          | |                    S )
a;  Remove HTML Tags only.

    `which_ones` and `keep` are both tuples, there are four cases:

    ==============  ============= ==========================================
    ``which_ones``  ``keep``      what it does
    ==============  ============= ==========================================
    **not empty**   empty         remove all tags in ``which_ones``
    empty           **not empty** remove all tags except the ones in ``keep``
    empty           empty         remove all tags
    **not empty**   **not empty** not allowed
    ==============  ============= ==========================================


    Remove all tags:

    >>> import w3lib.html
    >>> doc = '<div><p><b>This is a link:</b> <a href="http://www.example.com">example</a></p></div>'
    >>> w3lib.html.remove_tags(doc)
    'This is a link: example'
    >>>

    Keep only some tags:

    >>> w3lib.html.remove_tags(doc, keep=('div',))
    '<div>This is a link: example</div>'
    >>>

    Remove only specific tags:

    >>> w3lib.html.remove_tags(doc, which_ones=('a','b'))
    '<div><p>This is a link: example</p></div>'
    >>>

    You can't remove some and keep some:

    >>> w3lib.html.remove_tags(doc, which_ones=('a',), keep=('p',))
    Traceback (most recent call last):
        ...
    ValueError: Cannot use both which_ones and keep
    >>>

    z#Cannot use both which_ones and keepc                 6    h | ]}|                                 S r   r%   .0tags     r/   	<setcomp>zremove_tags.<locals>.<setcomp>   s     444##))++444r5   c                 6    h | ]}|                                 S r   rF   rG   s     r/   rJ   zremove_tags.<locals>.<setcomp>   s     (((CCIIKK(((r5   rI   r   c                 @    |                                  } r| v S | vS r7   rF   )rI   r   rC   s    r/   will_removez remove_tags.<locals>.will_remove   s-    iikk 	#*$$d?"r5   r   c                 r    |                      d          } |          rdn|                      d          S )N   r!   r   )r&   )r   rI   rM   s     r/   
remove_tagzremove_tags.<locals>.remove_tag   s5    ggajj [%%5rr1771::5r5   z</?([^ >/]+).*?>)
r*   r1   r8   r   recompileDOTALL
IGNORECASEr3   r   )r   rC   r   r   rP   regexretagsrM   s    ``    @r/   remove_tagsrW      s    b  @d @>???44444J((4(((D# # # # # # # # #6eCj 6S 6 6 6 6 6 6 EZry2=899F::j*T8"<"<===r5   c                     t          | |          }|rad                    d |D                       }t          j        |t          j        t          j        z            }|                    d|          }|S )a  Remove tags and their content.

    `which_ones` is a tuple of which tags to remove including their content.
    If is empty, returns the string unmodified.

    >>> import w3lib.html
    >>> doc = '<div><p><b>This is a link:</b> <a href="http://www.example.com">example</a></p></div>'
    >>> w3lib.html.remove_tags_with_content(doc, which_ones=('b',))
    '<div><p> <a href="http://www.example.com">example</a></p></div>'
    >>>

    |c           	      &    g | ]}d | d| d| dS )<z\b.*?</z>|<z\s*/>r   rG   s     r/   
<listcomp>z,remove_tags_with_content.<locals>.<listcomp>   s4    SSS<c<<#<<#<<<SSSr5   r!   )r   joinrQ   rR   rS   rT   r3   )r   rC   r   rA   tagsrV   s         r/   remove_tags_with_contentr_      sm      tX&&E &xxSS
SSSTTD")bm";<<

2u%%Lr5   
	
replace_byc                 x    t          | |          }|D ]&}|                    |t          ||                    }'|S )a$  Remove escape characters.

    `which_ones` is a tuple of which escape characters we want to remove.
    By default removes ``\n``, ``\t``, ``\r``.

    `replace_by` is the string to replace the escape characters by.
    It defaults to ``''``, meaning the escape characters are removed.

    )r   replace)r   rC   rd   r   rA   ecs         r/   replace_escape_charsrh      sH      tX&&E D Db*Z"B"BCCLr5   c           
      p   dt           dt          t                    dt          t          t           t          t                    f                  fd}t          | |          }d} ||t                    D ]E}t          |t                     r|t          |||          z  }-||	                    d          z  }F|S )a`  
    This function receives markup as a text (always a unicode string or
    a UTF-8 encoded string) and does the following:

    1. removes entities (except the ones in `keep`) from any part of it
        that is not inside a CDATA
    2. searches for CDATAs and extracts their text (if any) without modifying it.
    3. removes the found CDATAs

    txtpatternr   c              3      K   d}|                     |           D ],}|                    d          \  }}| ||         V  |V  |}-| |d          V  d S )Nr   rO   )finditerspan)rj   rk   offsetmatchmatch_smatch_es         r/   _get_fragmentsz&unquote_markup.<locals>._get_fragments  s}       %%c** 	 	E$zz!}}GWfWn%%%%KKKFF&''lr5   r!   )r   r   cdata_d)
r1   r   r   r	   r   r   	_cdata_re
isinstancer4   r&   )r   r   r   r   rs   rA   ret_textfragments           r/   unquote_markupry   
  s    "		"3<		%U3Z(	)	 	 	 	 tX&&EH"N5)44 2 2h$$ 	2(tN   HH
 y111HHOr5   baseurlc                     t          | |          }t                              |          }|r?t          t	          |          t	          |                    d          |                    S t	          |          S )zReturn the base url if declared in the given HTML `text`,
    relative to the given base url.

    If no base url is found, the given `baseurl` is returned.

    )r   rO   )rB   _baseurl_rer9   r
   r   r&   )r   rz   r   rA   r   s        r/   get_base_urlr}   4  sw     !999E5!!A (G$$oaggajj8&T&T&T
 
 	
 w'''r5   scriptnoscriptignore_tagsNNc                    	 t          | |          }n# t          $ r t          |             w xY wt          ||          }t	          t          |                    }t                              |          pt                              |          }|rlt          |
                    d                    }t          |
                    d                              d          |          }t          ||          }||fS dS )aX  Return the http-equiv parameter of the HTML meta element from the given
    HTML text and return a tuple ``(interval, url)`` where interval is an integer
    containing the delay in seconds (or zero if not present) and url is a
    string with the absolute url to redirect.

    If no meta redirect is found, ``(None, None)`` is returned.

    r$   urlz "'r   )r   UnicodeDecodeErrorprintr_   rB   r4   _meta_refresh_rer9   _meta_refresh_re2floatr&   r   stripr
   )r   rz   r   r   rA   r   intervalr   s           r/   get_meta_refreshr   H  s    4**   d %UK88E,U3344E&&I*;*B*B5*I*IA ((aggenn226::HEEgs##}zs    .c                 6    |                      t                    S )a  
    Strip all leading and trailing space characters (as defined in
    https://www.w3.org/TR/html5/infrastructure.html#space-character).

    Such stripping is useful e.g. for processing HTML element attributes which
    contain URLs, like ``href``, ``src`` or form ``action`` - HTML5 standard
    defines them as "valid URL potentially surrounded by spaces"
    or "valid non-empty URL potentially surrounded by spaces".

    >>> strip_html5_whitespace(' hello\n')
    'hello'
    )r   HTML5_WHITESPACE)r   s    r/   strip_html5_whitespacer   h  s     ::&'''r5   )r   Tr   r7   )r!   N)r   r   N)r   N)r`   r!   N)r   TN)r!   r   )r!   r   r~   )-__doc__rQ   html.entitiesr   typingr   r   r   r   r   r	   urllib.parser
   w3lib._typesr   	w3lib.urlr   
w3lib.utilr   rR   rT   r2   rS   r=   Ir|   r   r   ru   r   r1   r8   r4   r:   r>   r@   rB   rW   r_   rh   ry   r}   r   r   r   r   r5   r/   <module>r      s    
			 ( ( ( ( ( ( C C C C C C C C C C C C C C C C             # # # # # # % % % % % % ! ! ! ! ! !
"*PM  "*(")
4
4bjQSUSWXX2: EI   BJ LI  
 BJCRY 	 ! 
 	?C ?C
?C
3-?C ?C 	?C
 	?C ?C ?C ?CD< <z <Xc] <d < < < <
 BF: :
: :19#:: : : :2  RZ 2BI>> - -* - - - - - -  !#"	E> E>
E>E> 3-E> sm	E>
 	E> E> E> E>R QU 
"*3-@H   4 !3"	 
  sm	
 	   0 "	' '
'
3-' ' sm	'
 	' ' ' 'V AH( (
()(:=(( ( ( (, !7	 
  #	
 5eE3J//0   @( ( ( ( ( ( ( (r5   