
    %$}gs                     Z   d Z ddlZddlZddlmZ ddlmZmZmZm	Z	m
Z
 ddlmZmZmZmZmZmZ ddlmZ ddlmZmZmZmZmZ dd	lmZmZ dd
lmZmZm Z m!Z! ddl"m#Z# ddl$m%Z%m&Z&m'Z'm(Z(m)Z)m*Z*m+Z+  ej,        e-          Z.ddhZ/ddhZ0h dZ1h dZ2ddhZ3ddhZ4de5dedee
e6e5f                  ddfdZ7dededee         fdZ8dededee         fdZ9ded ed!eddfd"Z:d#edededdfd$Z;d%ed&eddfd'Z<d%ede=fd(Z>d)ed*eddfd+Z?dededee         fd,Z@dede=fd-ZAdedefd.ZBdededee         fd/ZCded0e	e5         dedee         fd1ZDded0e	e5         dedee         fd2ZEd3e=defd4ZFd5ed0e	e5         dedee         fd6ZGdee         dee         fd7ZHded0e	e5         dedee         fd8ZIefd9ed:eded0edef
d;ZJd9ed0e	e5         dedefd<ZKd9ededeee5e	e5         f         fd=ZLd>ededeee5eMf         fd?ZNd%ed0e	e5         dedee         fd@ZOd9ededeee5eMef         fdAZPdS )Bz6
Functions related to the main Trafilatura extractor.
    N)deepcopy)AnyOptionalTupleSetUnion)_ElementElement
SubElementstrip_elements
strip_tagstostring)HtmlElement   )delete_by_link_densityhandle_textnodelink_density_test_tablesprocess_nodeprune_unwanted_nodes)TAG_CATALOG	Extractor)FORMATTING_PROTECTEDis_image_filetext_chars_testtrim)delete_element)
BODY_XPATHCOMMENTS_DISCARD_XPATHCOMMENTS_XPATHDISCARD_IMAGE_ELEMENTSOVERALL_DISCARD_XPATHPRECISION_DISCARD_XPATHTEASER_DISCARD_XPATHhireftdth>   r$   r&   r'   >   r$   r%   spancodequoteheadmsgtagtextreturnc                 b    t                               d| |t          |pd          pd           dS )z/Format extraction event for debugging purposes.z	%s: %s %s NoneN)LOGGERdebugr   )r,   r-   r.   s      Z/var/www/py-google-trends/myenv/lib/python3.11/site-packages/trafilatura/main_extractor.py
_log_eventr6   &   s1    
LLc3TZR(8(8(BFCCCCC    elementoptionsc                 d   t          |           dk    rt          | |          }nQt          |           }t          |           D ]2}t	          ||d          }||                    |           d|_        3|8t          d                    |	                                                    du r|S dS )zProcess head elements (titles)r   Fcomments_fixNdoner1   T)
lenr   r   listr   appendr-   r   joinitertext)r8   r9   titlechildprocessed_childs        r5   handle_titlesrF   +   s    
7||q
 Wg.. !! ']] 	 	E .eW5QQQO*_---EII_RWWU^^5E5E-F-FGG4OO4r7   c                     t          | |          }|dS |                                 }||                                 }||j        t          vr&t          d          }|                    d|           n|}|S )z[Process formatting elements (b, i, etc. converted to hi) found
       outside of paragraphsNpr   )r   	getparentgetpreviousr-   r   r
   insert)r8   r9   
formattingparentprocessed_elements        r5   handle_formattingrO   E   s     gw//JtD   F~$$&&~+???#CLL  J////&r7   new_child_elemsubelemprocessed_subchildc                     t          | |j                  }|j        |j        c|_        |_        |j        D ]#}|                    ||j        |                    $dS )z/Add a sub-element to an existing child element.N)r   r-   r.   tailattribset)rP   rQ   rR   sub_child_elemattrs        r5   add_sub_elementrY   w   si    0B0FGGN/A/FHZH_,N, 7 74!566667 7r7   rD   c                    | j         |_         |                     d          D ]a}|j        dk    r(t          ||          }||                    |           n%t          ||d          }|t          |||           d|_        bdS )z<Iterate through an element child and rewire its descendants.*r?   NFr;   r=   )r.   iterdescendantsr-   handle_listsr@   r   rY   )rD   rP   r9   rQ   rR   s        r5   process_nested_elementsr^      s    *N((-- 	 	;&  !-gw!?!?!-%%&8999!0'PU!V!V!V!-9KLLL	 	r7   elemnew_elemc                 d    |                      d          x}r|                    d|           dS dS )z>Copy the rend attribute from an existing element to a new one.rendN)getrV   )r_   r`   	rend_attrs      r5   update_elem_renditionre      s?    HHV$$$y (VY'''''( (r7   c                 v    | duo5t          d                    |                                                     du S )z"Find if the element contains text.Nr1   T)r   rA   rB   )r_   s    r5   is_text_elementrg      s3    tQ0H0H I IT QQr7   processed_elem	orig_elemc                 j    | 0t          || j                  }| j        | j        c|_        |_        dS dS )z&Create a new sub-element if necessary.N)r   r-   r.   rT   )rh   ri   	childelems      r5   define_newelemrl      s<    !y.*<==	)7)<n>Q&		 "!r7   c                    t          | j                  }| j        5| j                                        rt	          |d          }| j        |_        |                     d          D ]e}t          d          }t          |          dk    rnt          ||          }|[|j        pd|_        |j        r1|j                                        r|xj        d|j        z   z  c_        |	                    |           nt          |||           |j        t|j                                        r[d |D             }|rM|d         }|j        |j                                        s|j        |_        n|xj        d|j        z   z  c_        |j        st          |          dk    r%t          ||           |	                    |           d|_        gd| _        t          |          rt          | |           |S dS )	z3Process lists elements including their descendants.Nitemr   r1    c                 (    g | ]}|j         d k    |S )r=   r-   .0els     r5   
<listcomp>z handle_lists.<locals>.<listcomp>   s%    *[*[*[""&TZJZJZ2JZJZJZr7   r=   )r
   r-   r.   stripr   r\   r>   r   rT   r@   r^   re   rg   )r8   r9   rN   rP   rD   rE   new_child_elem_childrenlast_subchilds           r5   r]   r]      s   ,,|GL$6$6$8$8#$5v>>%l ((00   u::??*5'::O*&5&:&@b#"' FO,@,F,F,H,H F"''31E+EE''!((888#E>7CCCz%%**:*:*<*<%*[*[*[*[*['* ?$;B$?M$)19K9Q9Q9S9S1-2Z**%**cEJ.>>** 	5#n"5"5"9"9!%888$$^444		GK()) !g'8999  4r7   c                    |                      d          s| j        dk    rdS |                                 }|d|                     dd          v rdS |                     d          }|t	          |           dk    rdS d	S )
zECheck if it is a code element according to common structural markers.langr)   TN	highlightclassr1   r   F)rc   r-   rI   findr>   )r8   rM   r)   s      r5   is_code_block_elementr      s     {{6 gkV33t  FkVZZ-D-DDDt<<DCLLA--t5r7   c                 p    t          |           }|                     d          D ]	}d|_        
d|_        |S )z/Turn element into a properly tagged code block.r[   r=   r)   )r   iterr-   )r8   rN   rD   s      r5   handle_code_blocksr      sC     ))c""  		"r7   c                 .   t          |           rt          |           S t          | j                  }|                     d          D ]+}t          ||          }|t          ||           d|_        ,t          |          rt          |d           |S dS )zProcess quotes elements.r[   Nr=   r*   )	r   r   r
   r-   r   r   rl   rg   r   )r8   r9   rN   rD   rE   s        r5   handle_quotesr      s    W%% +!'***,,c""  &ug66&?,=>>>		()) !$g...  4r7   potential_tagsc                    | j         dk    r'd|                     dd          v rt          |           S | j         |vr(| j         dk    rt          d| j         | j                   dS | j         dk    rXt          | |dd	
          }|Ct          |j                  d	u r-|j                                         |j         dk    rd|_         |S dS )zAHandle diverse or unknown elements in the scope of relevant tags.divzw3-coder}   r1   r=   zdiscarding elementNFTr<   preserve_spacesrH   )	r-   rc   r   r6   r.   r   r   rU   clear)r8   r   r9   rN   s       r5   handle_other_elementsr      s     {e	W[["-E-E E E!'*** {.((;&  +W[',GGGt{e ,GW5bfggg(_=N=S-T-TX\-\-\$**,,, $--(+!%$$4r7   c                 l   | j                                          t          |           dk    rt          | |          S t	          | j                  }|                     d          D ]}|j        |vr'|j        dk    rt          d|j        |j                   3t          ||dd          }||j        d	k    rLt          d
d	|j                   |j        r|xj        d|j        pdz   z  c_        n|j        |_        d|_        t	          |j                  }|j        t          v rt          |          dk    r?|D ]<}t          |j                  du rd|j        z   |_        t          ||j                   =|j        dk    r+|                    d|                    dd                     nJ|j        dk    r?|                    d          *|                    d|                    dd                     |j        |j        c|_        |_        |j        dk    rt!          |          }||}|                    |           d|_        t          |          dk    r+|d         }	|	j        dk    r|	j        t%          |	           |S |j        r|S t          dd	t'          |                     dS )zIProcess paragraphs along with their children, trim and clean the content.r   r[   r=   zunexpected in pFTr   NrH   z
extra in pro   r1   r$   rb   r%   targetgraphicrv   lbzdiscarding element:)rU   r   r>   r   r
   r-   r   r6   r.   r   P_FORMATTINGr   r   rV   rc   rT   handle_imager@   r   r   )
r8   r   r9   rN   rD   rE   newsubrn   
image_elem	last_elems
             r5   handle_paragraphsr     s   N 7||qGW---  ,,c"" 9 99N**uyF/B/B(%)UZ@@@ *%u^bccc&"c))<o.BCCC$) B%**c_5I5OR.PP***-<-A%*"	UY''F"l22''!++ / > >*4955==(+diDI"?DH====9$$JJvuyy'<'<====Y%''yy**6

8UYYx-D-DEEE& (7';_=Q$FK"i//)/::
)'F$$V,,,		
!!%b)	=D  Y^%;9%%%   !  $c84E+F+FGGG4r7   	is_headerc                 T    t          d          }| r|                    dd           |S )z1Determine cell element type and mint new element.cellroler+   )r
   rV   )r   cell_elements     r5   define_cell_typer   b  s3     6??L )(((r7   
table_elemc           
      ^   t          d          }t          | ddd           d}|                     d          D ]A}t          |t	          d |                    t
                    D                                 }Bd}d}|d	k    rt          |          nd
}t          d          }	|r|	                    d|           |                                 D ]}
|
j	        dk    rUt          |	          dk    r@|                    |	           t          d          }	|r|	                    d|           |p|}n|
j	        t
          v r||
j	        dk    o| }|p|}t          |          }t          |
          dk    r,t          |
|          }||j        |j        c|_        |_        n|
j        |
j        c|_        |_        d|
_	        |
                                D ]}|j	        t           v r)|j	        t
          v rd|_	        t#          ||dd          }ne|j	        dk    r5|j        dk    r*t'          ||          }||                    |           d}n%t)          ||                    dg          |          }|t-          ||           d|_	        |j        st          |          dk    r|	                    |           n|
j	        dk    r n	d|
_	        |	j                            dd           t          |	          dk    r|                    |	           t          |          dk    r|S dS )zProcess single table element.tabletheadtbodytfootr   trc              3   \   K   | ]'}t          |                    d d                    V  (dS )colspanr   N)intrc   )rs   r&   s     r5   	<genexpr>zhandle_table.<locals>.<genexpr>u  s8      $^$^2S	1)=)=%>%>$^$^$^$^$^$^r7   Fr   r1   rowr(   r'   Nr=   r   T)r   r<   r?   recallr   )r
   r   r   maxsumTABLE_ELEMSstrrV   r\   r-   r>   r@   r   r   r.   rT   	TABLE_ALLr   focusr]   handle_textelemunionrl   rU   pop)r   r   r9   newtablemax_colsr   seen_header_rowseen_header	span_attrnewrow
subelementr   rP   processed_cellrD   rR   s                   r5   handle_tabler   k  sh   wH z7GW555 Hood## ` `x$^$^Q\I]I]$^$^$^!^!^__ OK!)AH2IU^^F &

69%%% 0022 1  1 
>T!!6{{Q'''  2JJvy111"1"@[^{**"$.F3FI%2K-i88N:!##!-j'!B!B!-?M?RTbTg<N')< <F?JO8#^%8!'
'7799 ' 'EyI-- 933(.EI-<UG]apt-u-u-u**f,,(1J1J-9%-I-I*-9*112DEEE15. .=UNDXDXZ_Y`DaDacj-k-k*)5&'9>JJJ &EII" .c.&9&9A&=&=n---^w&&E
 Mfd### 6{{Q
8}}q4r7   c                 
   | dS t          | j                  }dD ]?}|                     |d          }t          |          r|                    d|            n\@| j                                        D ]A\  }}|                    d          r't          |          r|                    d|            nB|                     d          x}r|                    d|           |                     d          x}r|                    d|           |j        r|                    d          sdS |                    dd          }|                    d          s*|                    dt          j	        d	d
|                     |S )z5Process image elements and their relevant attributes.N)data-srcsrcr1   r   r   altrC   httpz^//zhttp://)
r
   r-   rc   r   rV   rU   items
startswithresub)r8   rN   rX   r   valuealt_attr
title_attrsrc_attrs           r5   r   r     s   t,,# 
 
kk$## 	!!%---E	
 #>//11 	 	KD%z** }U/C/C !%%eU333 ;;u%%%x /eX...[[)))z 3gz222 # +<+@+@+G+G t !$$UB//Hv&& JeRVFIx%H%HIIIr7   c                 r   d}| j         dk    rt          | |          }n| j         t          v rt          | |          }n| j         dk    rt	          | |          }n| j         dk    rt          | ||          }n| j         dk    rDt          | j                  du r-t          | |          }|t          d          }|j        |_
        np| j         t          v rt          | |          }nQ| j         dk    rd|v rt          | ||          }n0| j         dk    rd|v rt          |           }nt          | ||          }|S )	z?Process text element and determine how to deal with its contentNr?   r+   rH   r   Tr   r   )r-   r]   CODES_QUOTESr   rF   r   r   rT   r   r
   r.   
FORMATTINGrO   r   r   r   )r8   r   r9   new_elementthis_elements        r5   r   r     sV   K{f"7G44		$	$#GW55			#GW55			'II			7<((D00'99L'%cll#/#4 	
	"	"'99			G~$=$="7NGDD			!	!i>&A&A"7++ ,G^WMMr7   treeresult_bodyc                    t                               d           d}j        dk    r                    ddg           |dz  }t	          |           }dvrt          |ddd	           nt          |d	           |                    |          }|                    t          d
 fd|D                                  |S )zLook for all previously unconsidered wild elements, including outside of the determined
       frame and throughout the document to recover potentially missing text partszRecovering wild text elementsz\.//blockquote|.//code|.//p|.//pre|.//q|.//quote|.//table|.//div[contains(@class, 'w3-code')]r   r   r   z|.//div|.//lb|.//listr%   ar(   c                 
    | d uS N xs    r5   <lambda>z#recover_wild_text.<locals>.<lambda>  s
     r7   c              3   :   K   | ]}t          |          V  d S r   r   rs   er9   r   s     r5   r   z$recover_wild_text.<locals>.<genexpr>  sE       8* 8* 9H>[b8c8c 8* 8* 8* 8* 8* 8*r7   )	r3   r4   r   updateprune_unwanted_sectionsr   xpathextendfilter)r   r   r9   r   search_exprsearch_treesubelemss     ``   r5   recover_wild_textr      s     LL0111rK}  udm,,,..)$HHKN"";UF3333;'''  --Hv55 8* 8* 8* 8* 8* (8* 8* 8* + + , , ,r7   c                    |j         dk    }t          | t          d          } d|vrt          | t                    } |j         dk    r,t          | t                    } |rt          | t
                    } t          d          D ];}t          | dd|          } t          | d	d
|          } t          | dd
|          } <d|v s|r:|                     d          D ]$}t          |          du rt          |d
           %|rt          |           dk    rL| d         j        dk    r;t          | d         d
           t          |           dk    r| d         j        dk    ;t          | dd
d          } t          | dd
d          } | S )z1Rule-based deletion of targeted document sections	precisionT)with_backupr   r      r   )backtrackingfavor_precisionr?   FrH   r   	keep_tailr   rv   r+   r*   )r   r   r!   r    r#   r"   ranger   r   r   r   r>   r-   )r   r   r9   r   _r_   s         r5   r   r     s   m{2O&;NNND&&#D*@AA}  #D*>?? 	G'.EFFD1XX f f%dEVefff%dFXghhh%dCeUdeee.  O IIg&& 	6 	6D'--55tu5555 _$ii!mmb!7!748u5555 $ii!mmb!7!7%dFX\]]]%dG%Y]^^^Kr7   c                   	 t          t                    	j        du r	                    g d           j        du r	                    d           j        du r	                    d           t          d          }t          D ]
}t          d  ||           D             d           }|)t          |	          }t          |          dk    rN|                    d          }j        d	k    rd
}nd}|r.t          d                    |                    j        |z  k     r	                    d           d	vrt!          |d           d	vrt!          |d           t"                              t'          	                     |                    d          }d |D             dhk    r|g}|                    d 	fd|D             D                        t          |          dk    rR|d         j        t,          v r>t/          |d         d           t          |          dk    r|d         j        t,          v >t          |          d
k    r6t"                              t1          t3          |                                nd                    |                                                                          }||	fS )NT)r   r&   r'   r   r   r%   bodyc              3      K   | ]}||V  	d S r   r   rs   ss     r5   r   z_extract.<locals>.<genexpr>D  "      ??a??r7   r   z//p//text()r   r      r1   r   r(   .//*c                     h | ]	}|j         
S r   rq   )rs   r   s     r5   	<setcomp>z_extract.<locals>.<setcomp>]  s    $$$aAE$$$r7   r   c                     g | ]}||S r   r   rr   s     r5   ru   z_extract.<locals>.<listcomp>`  s    {{{2lnlzBlzlzlzr7   c              3   :   K   | ]}t          |          V  d S r   r   r   s     r5   r   z_extract.<locals>.<genexpr>`  s0      )h)hZ[/!^W*U*U)h)h)h)h)h)hr7   rv   Fr   ro   )rV   r   tablesr   imagesaddlinksr
   r   nextr   r>   r   r   rA   min_extracted_sizer   r3   r4   sortedr   r-   NOT_AT_THE_ENDr   r   r   rB   rw   )
r   r9   r   exprsubtreeptestfactorr   	temp_textr   s
    `       @r5   _extractr  7  s   %%N~999:::~9%%%}5!!!&//K % %??44::???FF?)'>7KKw<<1m,,=K''FFF 	&BGGENN++g.H6.QQQu%%%&&w&&&''w'''VN++,,,==(($$8$$$..yH{{)h)h)h)h)h_g)h)h)h{{{|||+""B(;~(M(M;r?e<<<< +""B(;~(M(M {aLLc$ii)))E   --//006688I	>11r7   cleaned_treec                    t          |           }t          | |          \  }}}t          |          dk    st          |          |j        k     rKt	          ||||          }d                    |                                                                          }t          |d           t          |d           ||t          |          fS )zFind the main content of a page using a set of XPath expressions,
       then extract relevant elements, strip them of unwanted subparts and
       convert themr   ro   r=   r   )
r   r  r>   r   r   rA   rB   rw   r   r   )r  r9   backup_treer   r  r   s         r5   extract_contentr  l  s    
 <((K-5lG-L-L*KN ;1I1K K K'['>ZZHH[113344::<<	;'''{E"""	3y>>11r7   c                 v    | j         |v r/t          | |d          }||j                                         |S dS )z?Process comment node and determine how to deal with its contentTr;   N)r-   r   rU   r   )r_   r   r9   rN   s       r5   process_comments_noder    sL    x>!!+D'MMM($**,,, %$4r7   c           
         t          d          }t          t                    t          D ]}t	          d  ||           D             d          }|(t          |t                    }t          |ddd           |                    t          d fd|
                    d	          D                                  t          |          d
k    r-t                              |           t          |d            nd                    |                                                                          }||t          |          | fS )z>Try to extract comments out of potential sections in the HTML.r   c              3      K   | ]}||V  	d S r   r   r   s     r5   r   z#extract_comments.<locals>.<genexpr>  r   r7   Nr   r%   r(   c                 
    | d uS r   r   r   s    r5   r   z"extract_comments.<locals>.<lambda>  s
    atm r7   c              3   :   K   | ]}t          |          V  d S r   )r  r   s     r5   r   z#extract_comments.<locals>.<genexpr>  sL        >P  >Ptu>STUWegn>o>o  >P  >P  >P  >P  >P  >Pr7   r   r   Fr   ro   )r
   rV   r   r   r   r   r   r   r   r   r   r>   r3   r4   r   rA   rB   rw   )r   r9   comments_bodyr  r  temp_commentsr   s    `    @r5   extract_commentsr    s   FOOM%%N  ??44::???FF?&w0FGG7C/// 	V$;$;  >P  >P  >P  >P  >P  zA  zG  zG  HN  zO  zO  >P  >P  >P  Q  Q  	R  	R  	R}!!LL7e4444E	 " HH]335566<<>>M-]););TAAr7   )Q__doc__loggingr   copyr   typingr   r   r   r   r   
lxml.etreer	   r
   r   r   r   r   	lxml.htmlr   htmlprocessingr   r   r   r   r   settingsr   r   utilsr   r   r   r   xmlr   xpathsr   r   r   r    r!   r"   r#   	getLogger__name__r3   r   r   r   r   r   r  r   bytesr6   rF   rO   rY   r^   re   boolrg   rl   r]   r   r   r   r   r   r   r   r   r   r   r   r  r   r  r  r  r   r7   r5   <module>r$     s     				       3 3 3 3 3 3 3 3 3 3 3 3 3 3 Z Z Z Z Z Z Z Z Z Z Z Z Z Z Z Z ! ! ! ! ! !3 3 3 3 3 3 3 3 3 3 3 3 3 3 - , , , , , , , M M M M M M M M M M M M      D D D D D D D D D D D D D D D D D D
 
	8	$	$ e}Tl	"""
 %DC Dc D%s
2C)D D D D D D
8 i HX<N    4/x /) /@R / / / /d7H 7x 7U] 7bf 7 7 7 78 X PY ^b     ( (H ( ( ( ( (R( Rt R R R R
R8 R RT R R R R&( &Y &8H;M & & & &R8       X    8 i HX<N    $8 SX PY ^fgo^p    6Ox OS OI OZbckZl O O O Od     OX Os3x O) OX`aiXj O O O Od"(8, "(1C " " " "JX s3x ) X`aiXj    < kv  K h  dg   {C    *+ s3x R[ `k    D22; 22 22uXsCPSH=T7U 22 22 22 22j2+ 2	 2eHVY[^L^F_ 2 2 2 2. #c( Y [cdl[m    B; B BuXsTWYdEd?e B B B B B Br7   