
    %$}g                         d Z ddlZddlmZmZ ddlmZmZmZ ddl	m
Z
 ddlmZ ddlmZmZ dd	lmZ d
e
de
fdZdedeeeef         fdZddededefdZdS )z<
Module regrouping baseline and basic extraction functions.
    N)AnyTuple)_ElementElement
SubElement)HtmlElement   )BASIC_CLEAN_XPATH)	load_htmltrim)delete_elementtreereturnc                 H    t          |           D ]}t          |           | S )z-Remove a few section types from the document.)r
   r   )r   elems     T/var/www/py-google-trends/myenv/lib/python3.11/site-packages/trafilatura/baseline.pybasic_cleaningr      s/    !$''  tK    filecontentc                 z   t          |           }t          d          }||ddfS d}|                    d          D ]}|j        rd|j        v r	 t	          j        |j                                      dd          }n# t          $ r d}Y nw xY w|rid|v r5t          |          }|!t          |	                                          nd}nt          |          }|t          |d          _        ||rd	|z   n|z  }t          |          d
k    r||t          |          fS t          |          }d}|                    d          D ]W}t          |	                                          }t          |          d
k    r!|t          |d          _        ||rd	|z   n|z  }Xt          |          dk    r||t          |          fS t                      }	d}|                    dddddd          D ]]}
t          |
	                                          }||	vr6|t          |d          _        ||rd	|z   n|z  }|	                    |           ^t          |          d
k    r||t          |          fS t          d          }|                    d          }|nt          |d          }d |                                D             }d                    d |D                       |_        ||j        t          |j                  fS t'          |d          }|t          |d          _        ||t          |          fS )a)  Use baseline extraction function targeting text paragraphs and/or JSON metadata.

    Args:
        filecontent: HTML code as binary string or string.

    Returns:
        A LXML <body> element containing the extracted paragraphs,
        the main text as string, and its length as integer.

    bodyN r   z&.//script[@type="application/ld+json"]articleBodyz<p>p d   z
.//article
blockquotecodepreqquote.//bodyc                 ,    g | ]}t          |          S  )r   .0es     r   
<listcomp>zbaseline.<locals>.<listcomp>^   s    <<<!d1gg<<<r   
c                     g | ]}||S r$   r$   r%   s     r   r(   zbaseline.<locals>.<listcomp>_   s     < < <q! < < < <r   F)clean)r   r   iterfindtextjsonloadsget	Exceptionr   text_contentr   lenr   setiteraddfinditertextjoinhtml2txt)r   r   postbody	temp_textr   	json_bodyparsedr-   article_elemresultselemententry	body_elemp_elem
text_elemss                  r   baselinerF      su    [!!DvH|Q IFGG ? ?9 	?$)33 Jty1155mRHH		   			 ?I%%&y11F:@:L4 3 3 5 5666RTDD	??D15
8S)).9>S4ZZ$>	
9~~C	NN22$D Il33 ; ;L--//00t99s??-1Jx%%*y:td:I
8}}qC	NN22 eeGI99\63sGLL  W))++,,-2Jx%%*	<uu<IKK
9~~C	NN22 vH		)$$IHc**<<y'9'9';';<<<
ii < <J < < <==c&+&6&666 D&&&D%)Jx"T3t99$$s   -A>>BBTcontentr+   c                    t          |           }|dS |                    d          }|dS |rt          |          }d                    |                                                                                                          S )zRun basic html2txt on a document.

    Args:
        content: HTML document as string or LXML element.
        clean: remove potentially undesirable elements.

    Returns:
        The extracted text in the form of a string or an empty string.

    Nr   r"   r   )r   r7   r   r9   r2   splitstrip)rG   r+   r   r   s       r   r:   r:   h   s     WD|r99YD|r $d##88D%%''--//0066888r   )T)__doc__r.   typingr   r   
lxml.etreer   r   r   	lxml.htmlr   settingsr
   utilsr   r   xmlr   r   strintrF   boolr:   r$   r   r   <module>rU      s-   
          4 4 4 4 4 4 4 4 4 4 ! ! ! ! ! ! ' ' ' ' ' ' " " " " " " " "           L%# L%%#s(:"; L% L% L% L%^9 9c 9$ 9# 9 9 9 9 9 9r   