
    %$}gP9                        d Z ddlZddlmZ ddlmZmZmZ ddlm	Z	m
Z
 ddlmZmZmZmZmZmZ ddlmZ dd	lmZ dd
lmZmZmZmZmZ ddlmZmZmZ ddl m!Z!m"Z"  ej#        e$          Z%ddddddddddddZ&d e&'                                D             Z(h dZ)dededefdZ*dXdede+defdZ,	 dYdedee         de-defd Z.d!ee         dee/e/e/ee+         f         fd"Z0	 dYd#ed$e+d%e-dee-ee+         f         fd&Z1d#ede-fd'Z2	 	 dZd(ed)e+d*e-d%e-def
d+Z3	 	 d[d-eded.e-d/e-dee         f
d0Z4d-ededee         fd1Z5d-eddfd2Z6d-eddfd3Z7d-eddfd4Z8d-eddfd5Z9d-eddfd6Z:d-eddfd7Z;i d8e6d9e6d:e6d;e8d<e8d=e8d>e8d?e8d@e8dAe9dBe9dCe7dDe7dEe7dFe:dGe:dHe:dIe;iZ<d-edJee+         ddfdKZ=	 d\dededLee+         defdMZ>d:dNdDdCdO dAdPdQdR dS	Z?dedefdTZ@dYdUedVe-de+fdWZAdS )]z*
Functions to process nodes in HTML code.
    N)deepcopy)ListOptionalTuple)fix_relative_urlsget_base_url)_ElementElement
SubElementXPath
strip_tagstostring)HtmlElement   )duplicate_test)Document	ExtractorCUT_EMPTY_ELEMSMANUALLY_CLEANEDMANUALLY_STRIPPED)
textfiltertrimis_image_element)META_ATTRIBUTESdelete_element#iz#bz#uz#tz#subz#sup)emibstrongukbdsampttvarsubsupc                     i | ]\  }}||	S  r)   ).0kvs      Z/var/www/py-google-trends/myenv/lib/python3.11/site-packages/trafilatura/htmlprocessing.py
<dictcomp>r.   +   s    >>>TQAq>>>    >   figuresourcepicturetreeoptionsreturnc                    t          j                    t          j                    }}|j        s|                    g d           n|                     d          D ]	}d|_        
|j        r!d |D             }|                    d           t          | |           |j
        dk    rh|                     d          St          |           }|D ])}|                     |          D ]}t          |           *|                     d          |} n,|D ])}|                     |          D ]}t          |           *t          | |j
                  S )z/Prune the tree by discarding unwanted elements.)tabletdthtrz.//figure[descendant::table]divc                 $    g | ]}|t           v|S r)   )PRESERVE_IMG_CLEANINGr*   es     r-   
<listcomp>z!tree_cleaning.<locals>.<listcomp><   s#    TTTqQ>S5S5S5S5S5Sr/   imgrecallz.//p)r   copyr   tablesextendxpathtagimagesremover   focusfindr   iterr   
prune_html)r3   r4   cleaning_liststripping_listelemtcopy
expressionelements           r-   tree_cleaningrT   0   s    %5$9$;$;=N=S=U=U>M> 8889999 JJ=>> 	 	DDHH~ %TTMTTTe$$$ t^$$$ }  TYYv%6%6%B' 	( 	(J99Z00 ( (w''''(99V$D ( 	( 	(J99Z00 ( (w''''( dGM***r/   balancedrJ   c                     |dk    }|                      d          D ]!}|j        t          v rt          ||           "| S )zADelete selected empty elements to save space and processing time.	precisionz-.//processing-instruction()|.//*[not(node())])	keep_tail)rF   rG   r   r   )r3   rJ   tailsrS   s       r-   rM   rM   S   sN    [ E::MNN 5 5;/))7e4444Kr/   Fnodelistwith_backupc                    |r0t          |                                           }t          |           }|D ]} ||           D ]u}|j        E|                                }||                                }||j        pddz   |j        z   |_        |                                                    |           v|r.t          |                                           }||dz  k    r| n|S | S )z2Prune the HTML tree by removing unwanted sections.N     )lentext_contentr   tailgetprevious	getparentrI   )	r3   rZ   r[   old_lenbackuprR   subtreeprevnew_lens	            r-   prune_unwanted_nodesrj   ]   s      d''))**$ 0 0
!z$'' 
	0 
	0G|'**,,<",,..D#!%bC 7', FDI&&w////
	0  9d''))**1,,tt&8Kr/   links_xpathc                     d d | D             D             }t          t          t          |                    }t          d |D                       }t          |          t          |          ||fS )zCollect heuristics on link textc                     g | ]}||S r)   r)   r>   s     r-   r@   z%collect_link_info.<locals>.<listcomp>}   s    RRRAPQRaRRRr/   c              3   X   K   | ]%}t          |                                          V  &d S N)r   ra   )r*   rP   s     r-   	<genexpr>z$collect_link_info.<locals>.<genexpr>}   s6      LL$t002233LLLLLLr/   c              3   &   K   | ]}|d k     dV  dS )
   r   Nr)   )r*   ls     r-   rp   z$collect_link_info.<locals>.<genexpr>   s&      2211r66Q666622r/   )listmapr`   sum)rk   mylistlengths
shortelemss       r-   collect_link_inforz   y   su     SRLLLLLRRRF3sF##$$G2222222Jw<<Vj&88r/   rS   textfavor_precisionc                    |                      d          }|sdg fS g }t          |          dk    rg|rdnd}t          |d                                                   }t          |          |k    r't          |          t          |          dz  k    rdg fS | j        d	k    r|                                 dnd}n|                                 d}nd}t          |          }||k     rXt          |          \  }	}
}}|
dk    rd|fS t                              d|	|||
           |	|dz  k    s|
dk    r||
z  dk    rd|fS d|fS )z>Remove sections which are rich in links (probably boilerplate).//refFr   rr   d   r   g?TpN<      i,  u8   list link text/total: %s/%s – short elems/total: %s/%s皙?)	findallr`   r   ra   rG   getnextrz   LOGGERdebug)rS   r{   r|   rk   rw   len_threshold	link_textlimitlenelemlenlinklenelemnumry   s               r-   link_density_testr      s    //(++K byF
;1-63Q446677	y>>M))c)nns4yy3.N.N8O{c **222??$HH H$iiG/@/M/M,*fa<<<F	
 	
 	
 Ws]""w{{zG7Kc7Q7Q<&=r/   c                 @   |                      d          }|sdS t          t          |                                                     }|dk     rdS t	          |          \  }}}}|dk    rdS t
                              d||           |dk     r	|d|z  k    n|d	|z  k    S )
z=Remove tables which are rich in links (probably boilerplate).r~   F   r   Tztable link text: %s / total: %si  r   g      ?)r   r`   r   ra   rz   r   r   )rS   rk   r   r   r   _s         r-   link_density_test_tablesr      s    //(++K u$w++--..//G}}u.{;;GWa!||t
LL2GWEEE&-nn7S7]""'C'M:QQr/   rg   tagnamebacktrackingc                    g }|rdnd}|rdnd}|                      |          D ]}t          |                                          }t          |||          \  }	}
|	s4|rG|
rEdt	          |          cxk     r|k     r+n Zt	          |          |k    r|                    |           t                              |          D ]}t          |           | S )z{Determine the link density of elements with respect to their length,
    and remove the elements identified as boilerplate.r   r   r      r   )	rL   r   ra   r   r`   appenddictfromkeysr   )rg   r   r   r|   	deletionsr   depth_thresholdrP   elemtextresulttemplists              r-   delete_by_link_densityr      s
    I*3CCM*1aaOW%% 	# 	#))++,,,T8_MM 	#	#	# CMM1111M11111D		_,,T""" i((  tNr/   TrP   comments_fixpreserve_spacesc                 R   | j         dk    rt          |           r| S | j         dk    s!t          |           dk    r| j        s	| j        sdS |s*| j         dk    r|st          | j                  pd| _        | S | j        s;t          |           dk    r(| j        dc| _        | _        |r| j         dk    rd| _         |s=t          | j                  pd| _        | j        rt          | j                  pd| _        | j        st          |           s|j        rt          | |          rdS | S )z3Convert, format, and probe potential text elements.graphicdoner   Nlbr]   r   )	rG   r   r`   r{   rb   r   r   dedupr   )rP   r4   r   r   s       r-   handle_textnoder      sT    x9!1$!7!7x6c$ii1nnTYntynt  DH,, 	0TY/4DI 9 Ta  $y"	49 	DH,,DH  0OO+t	9 	0TY/4DI
 It M -T7;;
 tKr/   c                    | j         dk    s!t          |           dk    r| j        s	| j        sdS t	          | j                  pdt	          | j                  pdc| _        | _        | j         dk    r"| j        s| j        r| j        dc| _        | _        | j        s| j        r(t          |           s|j        rt          | |          rdS | S )zBConvert, format, and probe potential text elements (light format).r   r   Nr   )rG   r`   r{   rb   r   r   r   r   )rP   r4   s     r-   process_noder     s    x6c$ii1nnTYntynt  	??2dDOO4KtDIty x4	di#y$	49 y DI d 	 	.w2O2O 	4Kr/   c                    |                      d| j                   d| _        d}|                     ddd          D ]O}|j        dv r=|                     dt          |j                   d|            |j        dk    r|dz  }d	|_        Pd
S )zGConvert <ul> and <ol> to <list> and underlying <li> elements to <item>.rendrt   r   dddtli)r   r   -itemN)setrG   rL   str)rP   r   subelems      r-   convert_listsr      s    HHVTXDH	A99T4..  ;,&&KK3w{#3#3 9 9a 9 9:::{d""Q r/   c                     d}| j         dk    r]t          |           dk    r| d         j         dk    rd}|                     d          }|r d}|D ]}|j                                         |rdnd	| _         d
S )z?Convert quoted elements while accounting for nested structures.Fprer   r   spanTz#.//span[starts-with(@class,'hljs')]codequoteN)rG   r`   rF   attribclear)rP   	code_flag
code_elemsr   s       r-   convert_quotesr   0  s    Ix5 t99>>d1gkV33IZZ EFF
 	'I% ' '$$&&&&"/vvDHHHr/   c                 |    | j                                          |                     d| j                   d| _        dS )z$Add head tags and delete attributes.r   headN)r   r   r   rG   rP   s    r-   convert_headingsr   A  s7    KHHVTXDHHHr/   c                     d| _         dS )zConvert <br> and <hr> to <lb>r   N)rG   r   s    r-   convert_line_breaksr   H  s    DHHHr/   c                 @    d| _         |                     dd           dS )z7Convert <del>, <s>, <strike> to <del rend="overstrike">delr   
overstrikeN)rG   r   r   s    r-   convert_deletionsr   M  s#    DHHHV\"""""r/   c                 R    d| _         |                     d          D ]	}d|_         
dS )zHandle details and summary.r;   summaryr   N)rG   rL   )rP   r   s     r-   convert_detailsr   S  s7    DH99Y''   r/   dlolulh1h2h3h4h5h6brhr
blockquoter   qr   sstrikedetailsbase_urlc                     d| _         |                     d          }| j                                         |r*|rt	          ||          }|                     d|           dS dS )z7Replace link tags and href attributes, delete the rest.refhreftargetN)rG   getr   r   r   r   )rP   r   r   s      r-   convert_linkr   q  sq    DHXXfFK # 	9&x88F6"""""	# #r/   urlc                 $   |j         s>d}|j        r|dz  }|                     |          D ]	}d|_        
t	          | d           n:|ot          |          }|                     dd          D ]}t          ||           |j        rv|                     t          
                                          D ]H}|j                                         |                    dt          |j                            d|_        In$t	          | gt          
                                R   |                     t          
                                          D ]}t          |j                 |           |j        r|                     d          D ]	}d|_        
| S )	zBSimplify markup and convert relevant HTML tags to an XML standard.z).//*[self::div or self::li or self::p]//az|.//table//ar   ar   hirA   r   )linksrD   rF   rG   r   r   rL   r   
formattingREND_TAG_MAPPINGkeysr   r   r   CONVERSIONSrH   )r3   r4   r   
xpath_exprrP   r   s         r-   convert_tagsr   }  s   
 = )@
> 	).(JJJz** 	 	DDHH4 ,<,,IIc5)) 	) 	)Dx(((( 3II.335566 	 	DKHHV-dh7888DHH	
 	42*//112222 		+**,,-- $ $DHd####~ !IIe$$ 	! 	!D DHHKr/   r   c                 ^    dt          |                     dd          dd                     S )Nhr   r   r   )intr   r   s    r-   <lambda>r     s.    >S&$!7!7!;<<>> r/   r   r   c                 D    t           |                     dd                   S )Nr   r   )HTML_TAG_MAPPINGr   r   s    r-   r   r     s    '(>(>? r/   )	rt   r   r   r   r   r   rA   r   r   c                    |                      t                                                    D ]}t          t          |j                           }t          |          r ||          |_        n||_        |j        dk    r0|                    d|j                            dd                     |j        	                                 d| _        t          d          }|                    |            |S )zConvert XML to simplified HTML.r   r   r   r]   bodyhtml)rL   HTML_CONVERSIONSr   r   rG   callabler   r   popr   r
   r   )r3   rP   
conversionroots       r-   convert_to_htmlr     s    		*//1122    %c$(mm4
J 	"!z$''DHH!DH8s??HHVT[__Xr::;;;;KDH6??DKKKr/   documentwith_metadatac                    t          | j                  }|rTt          d          }t          D ]'}t	          | |          x}rt          |d||           (|                    d|           t          |dd                                          S )z1Convert the document to HTML and return a string.r   meta)namecontentr   Tunicode)pretty_printencoding)	r   r   r
   r   getattrr   insertr   strip)r   r   	html_treer   r   values         r-   build_html_outputr    s    ..I "v# 	C 	CD$///u C4dEBBBBD!!!ID9EEEKKMMMr/   )rU   )F)FF)TFro   )B__doc__loggingrC   r   typingr   r   r   courlan.urlutilsr   r   
lxml.etreer	   r
   r   r   r   r   	lxml.htmlr   deduplicationr   settingsr   r   r   r   r   utilsr   r   r   xmlr   r   	getLogger__name__r   r   itemsr   r=   rT   r   rM   boolrj   r   rz   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r  r)   r/   r-   <module>r     s           ( ( ( ( ( ( ( ( ( ( < < < < < < < < Q Q Q Q Q Q Q Q Q Q Q Q Q Q Q Q ! ! ! ! ! ! ) ) ) ) ) )              6 5 5 5 5 5 5 5 5 5 0 0 0 0 0 0 0 0 
	8	$	$ 			
   ?>%5%;%;%=%=>>> 777  +  +i  +K  +  +  +  +F [  k     CH 
!%e;?   89k"9
3S$s)#$9 9 9 9 >C% %% #%6:%
4c?% % % %PRk Rd R R R R, !	   	
    D !	+ +
++ + 	+
 h+ + + +\x ) 8J    ( T     0 0d 0 0 0 0"8     h 4    
#H # # # # #( t    -- 	- 	
	
 	
 	
 	
 	
 	
 	
 	
 . 
>  
  	!" #$ % .	#{ 	#hsm 	# 	# 	# 	# 	# AE$ $
$ )$08$$ $ $ $P >>

?
?
 
 ( x    (N N N N# N N N N N Nr/   