
    %$}g                        d Z ddlZddlmZmZ ddlmZmZmZ ddl	m
Z
mZ ddlmZmZmZmZ ddlmZ dd	lmZ dd
lmZmZmZ ddlmZ ddlmZ ddlmZm Z  ddl!m"Z" ddl#m$Z$  ej%        e&          Z'da(dZ)dedefdZ*dededede+de,dedeee+e,f         fdZ-dee+         fdZ.dedee+         defdZ/dede+de+defd Z0dededeee+e,f         fd!Z1dededeee+e,f         fd"Z2dS )#z.
Functions grounding on third-party software.
    N)AnyTuple)ParagraphMakerclassify_paragraphsrevise_paragraph_classification)get_stoplistget_stoplists)_ElementElement
strip_tagstostring)HtmlElement   )basic_cleaning)convert_tagsprune_unwanted_nodestree_cleaning)Document)JUSTEXT_LANGUAGES)fromstring_bytestrimTEI_VALID_TAGS)OVERALL_DISCARD_XPATHz.//aside|.//audio|.//button|.//fieldset|.//figure|.//footer|.//iframe|.//input|.//label|.//link|.//nav|.//noindex|.//noscript|.//object|.//option|.//select|.//source|.//svg|.//time	htmlinputreturnc                    	 t          | dd          }t          |                                          }||nt                      S # t          $ r3}t
                              d|           t                      cY d}~S d}~ww xY w)z6Safety net: try with the generic algorithm readability      )min_text_lengthretry_lengthNzreadability_lxml failed: %s)ReadabilityDocumentr   summaryr   	ExceptionLOGGERwarning)r   docr#   errs       T/var/www/py-google-trends/myenv/lib/python3.11/site-packages/trafilatura/external.pytry_readabilityr*       s    !)RcRRR"3;;==11!-ww;==@   4c:::}}s   AA 
B(A?9B?Btreebackup_treebodytextlen_textoptionsc                    |j         dk    r||j        dz  k    r|||fS d\  }}|j         dk    rt          |t                    }t	          |          }t          t          |dd                              d                    }	t          |	          }
t          
                    d|
|           |
d	|fv rd
}n|d	k    r
|
d	k    rd}n|d|
z  k    rd
}n |
d|z  k    r|	                    d          sd}n|                    d          s|
|j        dz  k    rd}nt          |                    d                    t          |                    d                    k    r|
|j        dz  k    rd}nb|j         dk    r3|                    d          s|                    d          r	|
|k    rd}n$t          
                    d||
|j                   d
}|r'||	|
}}}t          
                    d|j                   n t          
                    d|j                   |                    t                    s||j        k     rot          
                    d|j                   t!          | |          \  }}}t#          |          }|r*|d|z  k    s!t          
                    d|           |||}}}|r|st%          ||          \  }}}|||fS )zZDecide whether to choose own or external extraction
       based on a series of heuristicsrecall
   )FF	precisionr.   zutf-8)methodencodingz0extracted length: %s (algorithm) %s (extraction)r   FT   {z.//p//text()z.//tablez.//pz.//headz.//h2|.//h3|.//h4zextraction values: %s %s for %szusing generic algorithm: %szusing custom extraction: %sz3unclean document triggering justext examination: %s   zusing justext, length: %s)focusmin_extracted_sizer   r   r*   r   r   decodelenr%   debug
startswithxpathfindallsourceSANITIZED_XPATHjustext_rescueboolsanitize_tree)r+   r,   r-   r.   r/   r0   use_readability	jt_resulttemppost_algo	algo_textlen_algobody2text2	len_text2s                 r)   compare_extractionrO   -   s    }  X0JR0O%O%OT8##!-OY}##*;8MNN $K00MXmFWMMMTTU\]]^^I9~~H LLCXxXXXAx=  	Q8a<<	AL	 	 	AL	 	 )=)=c)B)B	 ZZ'' 	 Hw7QTU7U,U,U	T\\*%%	&	&T\\&-A-A)B)B	B	BxRYRlopRpGpGp	(	"	"4::i+@+@	"]EXEXYlEmEm	"rz  ~F  sF  sF6(GN[[[  D,iHd2GNCCCC2GNCCC zz/"" ;h1K&K&KJGN[[["0w"?"?uiKK	 	;AiK//LL4i@@@#(%$D  <y <,T7;;dHx    c                      t                      } t                      D ]$}|                     t          |                     %t	          |           at
          S )z8Retrieve and return the content of all JusText stoplists)setr	   updater   tupleJT_STOPLIST)stoplistlanguages     r)   jt_stoplist_initrX   o   sN     uuH!OO 0 0X..//////KrP   rV   c           
      z    t          j        |           }t          ||dddddd           t          |d           |S )z(Customized version of JusText processing2      g?g?g      ?T)r   make_paragraphsr   r   )r+   rV   
paragraphss      r)   custom_justextr^   y   sD    /55J
Hb#sCtLLL#J444rP   urltarget_languagec                    t          d          }|t          v rt          t          |                   }nt          pt	                      }	 t          | |          }|D ];}|j        r
t          d          |j        c}|_        |                    |           <n3# t          $ r&}t                              d||           Y d}~nd}~ww xY w|S )z9Second safety net: try with the generic algorithm justextr-   pzjustext %s %sN)r   r   r   rU   rX   r^   is_boilerplater.   appendr$   r%   error)	r+   r_   r`   result_bodyjustext_stoplistr]   	paragraphelemr(   s	            r)   try_justextrj      s     &//K+++'(9/(JKK&<*:*<*<
%#D*:;;
 $ 	% 	%I' %cllINOD$)t$$$$	%  0 0 0_c3////////0 s   
B 
C	#CC	c                     t          |           } t          | |j        |j                  }t	          d                    |                                                    }||t          |          fS )z1Try to use justext algorithm as a second fallback )r   rj   r_   langr   joinitertextr=   )r+   r0   rI   	temp_texts       r)   rD   rD      s^     $Dgk7<@@MSXXm44667788I)S^^33rP   c                 b   t          | |          }|j        du rt          |d           t          |d           t          ||          }|                    ddd          D ]F}|j        dk    rd|_        |j        dv r(|j        dk    r|                    d	d
           d|_        Gd d t          |                    d                    D             D             }t          |g|R   t          d                    |	                                                    }||t          |          fS )zLConvert and sanitize the output from the generic algorithm (post-processing)Faspantdthtrrow)rt   ru   roleheadcellc                 $    g | ]}|t           v|S  r   ).0tagnames     r)   
<listcomp>z!sanitize_tree.<locals>.<listcomp>   s-       .(( 	(((rP   c                     g | ]	}|j         
S r|   )tag)r}   elements     r)   r   z!sanitize_tree.<locals>.<listcomp>   s    OOOOOOrP   *rl   )r   linksr   r   iterr   rR   r   rn   ro   r=   )r+   r0   cleaned_treeri   sanitization_listr.   s         r)   rF   rF      sR    !w//L}<%%%|V$$$g66L!!$d33   8tDHHX%%x4(((DH OO3|7H7H7M7M3N3NOOO  
 |0/0000..001122Ds4yy((rP   )3__doc__loggingtypingr   r   justext.corer   r   r   justext.utilsr   r	   
lxml.etreer
   r   r   r   	lxml.htmlr   baseliner   htmlprocessingr   r   r   readability_lxmlr   r"   settingsr   utilsr   r   xmlr   xpathsr   	getLogger__name__r%   rU   rC   r*   strintrO   rX   r^   rj   rD   rF   r|   rP   r)   <module>r      s             ^ ] ] ] ] ] ] ] ] ] 5 5 5 5 5 5 5 5 > > > > > > > > > > > > ! ! ! ! ! ! % $ $ $ $ $ M M M M M M M M M M = = = = = = ' ' ' ' ' ' ) ) ) ) ) ) ) )       ) ) ) ) ) )		8	$	$ I
{ 
{ 
 
 
 
? [ ? { ? ( ? Z] ? il ? wz ?   @E  FN  PS  UX  FX  @Y ?  ?  ?  ? D%*     c
 s    k  c h    04 4s 4uXsC=O7P 4 4 4 4) )c )eKc<Q6R ) ) ) ) ) )rP   