
    %$}gD                        d Z 	 ddlZdZn# e$ r dZY nw xY wddlZddlZ	 ddlZdZn# e$ r dZY nw xY wddlm	Z	 ddl
mZ ddlmZmZmZmZmZmZ ddlmZ 	 ddlZdZn# e$ r dZY nw xY w	 ddlZdZn# e$ r dZY nw xY w	 ddlZdZn# e$ r dZY nw xY w	 dd	lmZ n# e$ r dZY nw xY wdd
lmZ ddl m!Z! ddl"m#Z#m$Z$m%Z% ddl&m'Z'  ej(        e)          Z*ddhZ+ ej,        dej-                  Z. ej,        dej-                  Z/ ej,        d          Z0 e$ddddd          Z1 ej,        dej2        ej3        z            Z4 ej,        d          Z5 ej,        d          Z6h dZ7ddhZ8dZ9 ej,        d          Z: ej,        dej;                  Z<de=de=fd Z>d!e=de?fd"Z@d#e=deeA         fd$ZBdee=eAf         deAfd%ZCd&eAde?fd'ZDd(eAd&eAdeAfd)ZEd*eAdee#         fd+ZFd*edee#         fd,ZG e	d-.          d/eAdeAfd0            ZHd1eAdeAfd2ZIdUd1eAd4ed5         deAfd6ZJ e	d7.          dVd8eAd9e?d:e?deeA         fd;            ZKdVd<eAd9e?d:e?deeA         fd=ZLd>e!de!fd?ZM e	d7.          d1eAdeAfd@            ZNdAe!de?fdBZOdCeeA         de?fdDZPdEedFeQdefdGZRdHeQdIede?fdJZSdWd>e#dKeAdLe?de?fdMZTdNeAdOeAdeeA         fdPZUdNeAdOeAdKeAdQedee?ef         f
dRZVdAe!de?fdSZWd1eeA         de?fdTZXdS )Xzj
Module bundling functions related to HTML and text processing,
content filtering and language detection.
    NTF)	lru_cache)islice)AnyListLiteralOptionalTupleUnion	normalize)detect)
from_bytes)_Element)HtmlElement
HTMLParser
fromstring)HTTPResponseutf-8utf_8z^< ?! ?DOCTYPE.+?/ ?>z(<html.*?)\s*/>z(<!--.*?-->|<[^>]*>))collect_idsdefault_doctypeencodingremove_comments
remove_pisz(?<![p{P}>])\n)flagsz^https?://|/+$z3[^\s]+\.(avif|bmp|gif|hei[cf]|jpe?g|png|webp)(\b|$)>   phitdrefcellheaditemquotecodepre)zhttp-equiv="content-language"zproperty="og:locale"z
([a-z]{2})z\W*(Drucken|E-?Mail|Facebook|Flipboard|Google|Instagram|Linkedin|Mail|PDF|Pinterest|Pocket|Print|QQ|Reddit|Twitter|WeChat|WeiBo|Whatsapp|Xing|Mehr zum Thema:?|More on this.{,8}$)$filecontentreturnc                 R   t          | t                    s| S t          rM| dd         dk    r?	 t          j        |           S # t
          $ r t                              d           Y nw xY wt          rR| dd         dk    rD	 t          j        |           S # t          j
        $ r t                              d           Y nw xY wt          r*	 t          j        |           S # t          j        $ r Y nw xY wt          r*	 t          j        |           S # t          j        $ r Y nw xY w| S )z
    Don't trust response headers and try to decompress a binary string
    with a cascade of installed packages. Use magic numbers when available.
    N   s   zinvalid GZ file   s   (/zinvalid ZSTD file)
isinstancebytesHAS_GZIPgzip
decompress	ExceptionLOGGERwarningHAS_ZSTD	zstandard	ZstdError
HAS_BROTLIbrotlierrorHAS_ZLIBzlib)r&   s    Q/var/www/py-google-trends/myenv/lib/python3.11/site-packages/trafilatura/utils.pyhandle_compressed_filer<   ^   sv   
 k5))   .KO66	.?;/// 	. 	. 	.NN,-----	.  0KO':::	0'444" 	0 	0 	0NN./////	0  	$[111| 	 	 	D	  	?;///z 	 	 	D	 sG   A $A)(A)B )CCC! !C32C3>D D$#D$datac                 V    	 |                      d           n# t          $ r Y dS w xY wdS )zLSimple heuristic to determine if a bytestring uses standard unicode encodingzUTF-8FT)decodeUnicodeDecodeError)r=   s    r;   isutf8rA      sC    G   uu4s    
&&bytesobjectc                    t          |           rdgS g }t          >t          |           d         }|'|                    |                                           t	          |           dk     rt          |           }n1t          | dd         | dd         z             pt          |           }t	          |          dk    r|                    d |D                        d	 |D             S )
z="Read all input or first chunk and return a list of encodingsr   Nr   i'  i  ixr   c                     g | ]	}|j         
S  )r   ).0rs     r;   
<listcomp>z#detect_encoding.<locals>.<listcomp>   s    >>>q
>>>    c                 $    g | ]}|t           v|S rE   )UNICODE_ALIASES)rF   gs     r;   rH   z#detect_encoding.<locals>.<listcomp>   s"    ;;;!!?":":A":":":rI   )rA   cchardet_detectappendlowerlenr   extend)rB   guessescchardet_guessdetection_resultss       r;   detect_encodingrU      s    k yG"(55jA%NN>//11222
;%&{33&{5D5'9K<O'OPP 4&{33 	 !!>>,=>>>???;;w;;;;rI   c                 4   t          | t                    r| S d}t          |           } t          |           D ]M}	 |                     |          } n4# t
          t          f$ r  t                              d|           d}Y Jw xY w|pt          | dd          S )zCheck if the bytestring could be GZip and eventually decompress it,
       guess bytestring encoding and try to decode to Unicode string.
       Resort to destructive conversion otherwise.Nzwrong encoding detected: %sr   replace)r   errors)	r+   strr<   rU   r?   LookupErrorr@   r1   r2   )r&   htmltextguessed_encodings      r;   decode_filer]      s     +s## H )55K+K88  	"))*:;;H
 E	 /0 	 	 	NN8:JKKKHHH	 Ks;KKKKs   A.BB	beginningc                 
    d| vS )zOAssess if the object is proper HTML (awith a corresponding tag or declaration).htmlrE   )r^   s    r;   is_dubious_htmlra      s    ""rI   
htmlstringc                 n   d|v r<|                      d          \  }}}t                              d|d          dz   |z   } t          t	          |                                                     D ]E\  }}d|v r4|                    d          rt                              d| d          }  n	|d	k    r nF| S )
z>Repair faulty HTML strings to make then palatable for libxml2.doctype
    )countz<htmlz/>z\1>   )	partitionDOCTYPE_TAGsub	enumerateiter
splitlinesendswithFAULTY_HTML)rb   r^   	firstline_restilines          r;   repair_faulty_htmlrw      s     I'11$77	1d __R!_<<tCdJ
T*"7"7"9"9::;;  4d??t}}T22?$1EEJEq55E rI   
htmlobjectc                     d}	 t          |                     dd          t                    }n2# t          $ r%}t                              d|           Y d}~nd}~ww xY w|S )z!Try to pass bytes to LXML parser.Nutf8surrogatepassparserzlxml parser bytestring %s)r   encodeHTML_PARSERr0   r1   r8   )rx   treeerrs      r;   fromstring_bytesr      sx    D7*++FODD[YYY 7 7 70#666666667Ks   */ 
AAAc                 8   t          | t                    r| S t          | t                    st          | d          r| j        } t          | t
          t          f          st          dt          |                     d}t          |           } | dd         
                                }t          |          }t          | |          } d}	 t          | t                    }nN# t          $ r t!          |           }d}Y n1t"          $ r%}t$                              d|           Y d}~nd}~ww xY w|t)          |          d	k     r|st!          |           }|A|du r=t)          |          d
k     r*t$                              dt)          |                     d}|S )zLoad object given as input and validate its type
    (accepted: lxml.html tree, trafilatura/urllib3 response, bytestring and string)
    r=   zincompatible input typeN2   Fr|   Tzlxml parsing failed: %srg   ri   z9parsed tree length: %s, wrong data type or not valid HTML)r+   r   r   hasattrr=   r,   rY   	TypeErrortyper]   rO   ra   rw   r   r   
ValueErrorr   r0   r1   r8   rP   )rx   r   r^   
check_flagfallback_parser   s         r;   	load_htmlr      s   
 *k** *l++ %wz6/J/J %_
j5#,// E14
3C3CDDDDZ((J3B3%%''I ++J#J	::JN5*[999   
++ 5 5 5.444444445 	D		A~
++ J$..3t99q==GT	
 	
 	
 Ks   C# #D. 	D.	D))D.i @  )maxsizecharc                 Z    |                                  s|                                 r| ndS )z3Return a character if it belongs to certain classesrf   )isprintableisspace)r   s    r;   return_printables_and_spacesr   
  s,     ##%%??44R?rI   stringc                 R    d                     t          t          |                     S )z6Prevent non-printable and XML invalid character errorsrf   )joinmapr   r   s    r;   remove_control_charactersr     s    7733V<<===rI   NFCunicodeform)r   NFDNFKCNFKDc                 "    t          ||           S )z;Normalize the given string to the specified unicode format.r   )r   r   s     r;   normalize_unicoder     s    [&)))rI   i   rv   preserve_spacetrailing_spacec                    t          |                     dd                              dd                              dd                    }|st          t                              d|                    }t          t          t          j        |                    rd}nV|rT| d	                                         rdnd
}| d                                         rdnd
}d
	                    |||g          }|S )zmRemove HTML space entities, then discard incompatible unicode
       and invalid XML characters on line levelz&#13;z&#10;re   z&nbsp;     Nr   rf   )
r   rW   trimLINES_TRIMMINGrl   allr   rY   r   r   )rv   r   r   new_linespace_beforespace_afters         r;   line_processingr     s     )gt)D)D)L)LWVZ)[)[)c)cdlnv)w)wxxH 
F **4::;;s3;))** 	FHH 	F"&q'//"3"3;33L!%b!1!1!3!3;##KwwhDEEHOrI   textc                     |rt          | d          S 	 d                    t          dfd|                                 D                                                     dd          S # t
          $ r Y dS w xY w)z<Convert text and discard incompatible and invalid charactersTre   Nc              3   8   K   | ]}t          |          V  d S )N)r   )rF   lr   s     r;   	<genexpr>zsanitize.<locals>.<genexpr>6  s-      &e&eaq.'I'I&e&e&e&e&e&erI   u   ␤rf   )r   r   filterro   rW   AttributeError)r   r   r   s    ` r;   sanitizer   /  s      ;t^T:::yy&e&e&e&eSWSbSbSdSd&e&e&effggoopxz|}}}   tts   AA+ +
A98A9r   c                 
   |                                  D ]}|                                }||j        nd}|j        t          v p|t          v }|j        t          v p
|t          v p|}|j        D ]P}d|v rJ|j        |         r#|                    dd          d         | j        vr|j                            |           Q|j	        rt          |j	        ||          |_	        |j        rt          |j        ||          |_        | S )z?Trims spaces, removes control characters and normalizes unicodeNrf   :rg   r   )rn   	getparenttagSPACING_PROTECTEDFORMATTING_PROTECTEDattribsplitnsmappopr   r   tail)r   elemparent
parent_tagr   r   	attributes          r;   sanitize_treer   ;  s   		 L L!!#)#5VZZ2
 %66Y*HY:Y%99qZK_=_qcq  	/ 	/Ii{9- /a1H1H1KSWS]1]1]KOOI...9 	L NNKKDI9 	L NNKKDIKrI   c                     	 d                     |                                                                           S # t          t          f$ r Y dS w xY w)z/Remove unnecessary spaces within a text string.r   rf   )r   r   stripr   r   r   s    r;   r   r   S  sT    xx''--///I&   rrs   8; AAelementc                     dD ]*}|                      |d          }t          |          r dS +| j                                        D ],\  }}|                    d          rt          |          r dS -dS )z*Check if an element is a valid img element)data-srcsrcrf   Tr   F)getis_image_filer   items
startswith)r   attrr   values       r;   is_image_elementr   ]  s    #  kk$## 	44	 #>//11 	 	KD%z** }U/C/C tt5rI   imagesrcc                 ~    | t          |           dk    rdS t          t                              |                     S )zCheck if the observed string corresponds to a valid image extension.
       Use a length threshold and apply a regex on the content.Ni    F)rP   boolIMAGE_EXTENSIONsearch)r   s    r;   r   r   k  s;     3x==4//u&&x00111rI   iterablenc              #      K   t          |           }t          t          ||                    x}r%|V  t          t          ||                    x}#dS dS )zChunk data into smaller pieces.N)rn   tupler   )r   r   iteratorbatchs       r;   make_chunksr   s  sp       H~~H!,,--
-%  !,,--
-%     rI   my_lenoptionsc                     | |j         k     r"t                              d|j                   dS | |j        k    r#t                              d| |j                   dS dS )z=Check if the document length is within acceptable boundaries.ztoo small/incorrect for URL %sFztoo large: length %s for URL %sT)min_file_sizer1   r8   urlmax_file_size)r   r   s     r;   is_acceptable_lengthr   {  s]    %%%5w{CCCu%%%6LLLu4rI   target_languagestrictc                    t           D ]Y}|                     d| d          }|r<t          fd|D                       r dS t                              d|            dS Z|rP|                     d          }|r9t          fd|D                       rdS t                              d	           dS t                              d
           dS )zrCheck HTML meta-elements for language information and split
       the result in case there are several languages.z	.//meta[@z][@content]c              3      K   | ]F}t                               |                    d d                                                    v V  GdS )contentrf   NRE_HTML_LANGr   r   rO   rF   r   r   s     r;   r   z"check_html_lang.<locals>.<genexpr>  sQ      ll^b?l&8&8)R9P9P9V9V9X9X&Y&YYllllllrI   Tz%s lang attr failedFz//html[@lang]c              3      K   | ]F}t                               |                    d d                                                    v V  GdS )langrf   Nr   r   s     r;   r   z"check_html_lang.<locals>.<genexpr>  sQ      ii[_?l&8&8&"9M9M9S9S9U9U&V&VViiiiiirI   zHTML lang failedzNo relevant lang elements found)TARGET_LANG_ATTRSfindallanyr1   debugxpath)r   r   r   r   elemss    `   r;   check_html_langr     s    "  ::::;; 	llllfklllll ttLL.55555		  

?++ 	iiiichiiiii tLL+,,,5
LL23334rI   	temp_texttemp_commentsc                     t           du rLt          |           t          |          k    rt          j        |           nt          j        |          \  }}nt                              d           d}|S )zARun external component (if installed) for language identificationTz3Language detector not installed, skipping detectionN)LANGID_FLAGrP   	py3langidclassifyr1   r2   )r   r   resultrs   s       r;   language_classifierr     ss    d 9~~M 2 222 y)))#M22 	 	LMMMMrI   docmetac                     |Qt          | |          |_        |j        5|j        |k    r*t                              d|j        |j                   d|fS d|fS )zFFilter text based on language detection and store relevant informationNzwrong language: %s %sTF)r   languager1   r2   r   )r   r   r   r   s       r;   language_filterr    sd     ".y-HH 'G,<,O,ONN2G4DgkRRR= '>rI   c                     | j         | j        n| j         }| pL|                                p8t          t	          t
          j        |                                                    S )zFilter out unwanted text)r   r   r   r   r   	RE_FILTERmatchro   )r   testtexts     r;   
textfilterr    sT    &|3w||H<a8++--aS(J]J]J_J_5`5`1a1aarI   c                 J    t          |           o|                                  S )zJDetermine if a string is only composed of spaces and/or control characters)r   r   r   s    r;   text_chars_testr    s"     <<0 0 000rI   )r   )FF)F)Y__doc__r.   r-   ImportErrorloggingrer:   r9   	functoolsr   	itertoolsr   typingr   r   r   r   r	   r
   unicodedatar   r7   r6   r4   r3   r   r   cchardetr   rM   charset_normalizerr   
lxml.etreer   	lxml.htmlr   r   r   urllib3.responser   	getLogger__name__r1   rK   compileIrk   rq   HTML_STRIP_TAGSr   UNICODE	MULTILINEr   URL_BLACKLIST_REGEXr   r   r   r   r   
IGNORECASEr  r,   r<   r   rA   rY   rU   r]   ra   rw   r   r   r   r   r   r   r   r   r   r   r   intr   r   r   r   r  r  r  rE   rI   r;   <module>r      s   
KKKHH   HHH  				KKKHH   HHH              = = = = = = = = = = = = = = = = ! ! ! ! ! !MMMJJ   JJJHH   HHHKK   KKK2222222   OOO * ) ) ) ) )       9 9 9 9 9 9 9 9 9 9 ) ) ) ) ) ) 
	8	$	$G$bj0"$77bj+RT22"*455 jUEGeivz{{{-RZ5LMMM bj!233  "*STTPPP UO  N rz-(( BJ [  ], , ,	" "% " " " "J 4    < <49 < < < <2LU5#:. L3 L L L L2#s #t # # # #
3 3 3      +)>    *# *(;"7 * * * *Z 5@s @s @ @ @ @
>c >c > > > >
* *c *8T0U *be * * * *
 4 # t T ^fgj^k    (	 	3 	 	d 	W_`cWd 	 	 	 	 X    0 4     h 4    2HSM 2d 2 2 2 2# # #     s t     +  T VZ    03 s x}    s 3  WZ _deiknen_o    "b bT b b b b1HSM 1d 1 1 1 1 1 1s`    ( 22A A)(A)-A4 4A>=A>B	 	BBB B('B(