
    !}g                     v    d Z dZdZdZdZddlmZ ddlZdd	lm	Z	  ej
        e          Z G d
 de          ZdS )zI
Output formatting to text via lxml xpath nodes abstracted in this file.
	newspaperzLucas Ou-YangMITzCopyright 2014, Lucas Ou-Yang    )unescapeN   )	innerTrimc                   \    e Zd Zd Zd Zd Zd Zd Zd Zd Z	d Z
d	 Zd
 Zd Zd Zd ZdS )OutputFormatterc                     d | _         || _        | j                                        | _        |j        | _        |j        | _        d S N)top_nodeconfig
get_parserparserlanguagestopwords_class)selfr   s     Z/var/www/py-google-trends/myenv/lib/python3.11/site-packages/newspaper/outputformatters.py__init__zOutputFormatter.__init__   s?    k,,..%5    c                 Z    |r(|| _         | j                            |          | _        dS dS )zRequired to be called before the extraction process in some
        cases because the stopwords_class has to set incase the lang
        is not latin based
        N)r   r   get_stopwords_classr   )r   	meta_langs     r   update_languagezOutputFormatter.update_language   s>    
  	;%DM//	::    	; 	;r   c                     | j         S r   )r   r   s    r   get_top_nodezOutputFormatter.get_top_node&   s
    }r   c                    || _         d\  }}|                                  | j        j        r|                                 }|                                  |                                  |                                  |                                  | 	                                 | 
                                 |                                 }||fS )zReturns the body text of an article, and also the body article
        html if specified. Returns in (text, html) form
        ) r   )r   remove_negativescores_nodesr   keep_article_htmlconvert_to_htmllinks_to_textadd_newline_to_bradd_newline_to_lireplace_with_textremove_empty_tagsremove_trailing_media_divconvert_to_text)r   r   htmltexts       r   get_formattedzOutputFormatter.get_formatted)   s     !
d((***;( 	*''))D            &&(((##%%d|r   c                    g }t          |                                           D ]}	 | j                            |          }n:# t          $ r-}t
                              dt          |           d }Y d }~nd }~ww xY w|rRt          |          }t          |          
                    d          }d |D             }|                    |           d                    |          S )Nz%s ignoring lxml node error: %s\nc                 8    g | ]}|                     d           S ) )strip).0ns     r   
<listcomp>z3OutputFormatter.convert_to_text.<locals>.<listcomp>K   s"    999A1773<<999r   z

)listr   r   getText
ValueErrorloginfo	__title__r   r   splitextendjoin)r   txtsnodetxterrtxt_liss         r   r(   zOutputFormatter.convert_to_text?   s    **,,-- 	% 	%Dk))$//   :IsKKK  %smm#C....u5599999G$$${{4   s   A
A9#A44A9c                     | j                             |                                           }| j                             |          S r   )r   clean_article_htmlr   nodeToString)r   cleaned_nodes     r   r!   zOutputFormatter.convert_to_htmlO   s9    {55d6G6G6I6IJJ{''555r   c                 \    | j                             | j        d          D ]	}d|_        
d S )Nbrtagr-   )r   getElementsByTagr   r*   )r   es     r   r#   z!OutputFormatter.add_newline_to_brS   s;    --dm-FF 	 	AAFF	 	r   c                 R   | j                             | j        d          D ]}| j                             |d          }|d d         D ][}| j                             |          dz   |_        | j                             |          D ]}| j                             |           \d S )NulrH   lir-   )r   rJ   r   r5   r*   getChildrenremove)r   rK   li_listrN   cs        r   r$   z!OutputFormatter.add_newline_to_liW   s    --dm-FF 	* 	*Ak221$2??Gcrcl * *+--b11E90044 * *AK&&q))))**	* 	*r   c                 `    | j                             |                                 d           dS )z[Cleans up and converts any nodes that should be considered
        text into text.
        aNr   	stripTagsr   r   s    r   r"   zOutputFormatter.links_to_text_   s.     	d//11377777r   c                    | j                             | j        d          }|D ]]}| j                             |d          }|rt	          |          nd}|dk     r'|                                                    |           ^dS )zvIf there are elements inside our top node that have a
        negative gravity score, let's give em the boot.
        z*[gravityScore]gravityScorer   r   N)r   
css_selectr   getAttributefloat	getparentrQ   )r   gravity_itemsitemscores       r   r   z+OutputFormatter.remove_negativescores_nodese   s     ..M,. .! 	. 	.DK,,T>BBE$)0E%LLLqEqyy  ''---		. 	.r   c                 h    | j                             |                                 ddddd           dS )a   
        Replace common tags with just text so we don't have any crazy
        formatting issues so replace <br>, <i>, <strong>, etc....
        With whatever text is inside them.
        code : http://lxml.de/api/lxml.etree-module.html#strip_tags
        bstrongirG   supNrV   r   s    r   r%   z!OutputFormatter.replace_with_textq   sF     	hT5	B 	B 	B 	B 	Br   c                     | j                             |                                 dg          }|                                 |D ]}| j                             |          }| j                             |          }|dk    s|dk    rv|stt          | j                             |d                    dk    rGt          | j                             |d                    dk    r| j                             |           dS )	zIt's common in top_node to exit tags that are filled with data
        within properties but not within the tags themselves, delete them
        *rG   z\robjectrH   r   embedN)	r   getElementsByTagsr   reversegetTagr5   lenrJ   rQ   )r   	all_nodeselrI   r*   s        r   r&   z!OutputFormatter.remove_empty_tags{   s#    K11#( (	 		' 		'B+$$R((C;&&r**Dttu}}   -DK88 9 * * + +./0 0DK88 9 ) ) * *-./ /""2&&&		' 		'r   c                      d fd	 j                                                                        }t          |          dk     rdS |d         } |          dk    r j                             |           dS dS )zPunish the *last top level* node in the top_node if it's
        DOM depth is too deep. Many media non-content links are
        eliminated: "related", "loading gallery", etc
        r   c                     j                             |           }|s|S d}|D ]} ||dz             }||k    r|}|S )zComputes depth of an lxml element via BFS, this would be
            in parser if it were used anywhere else besides this method
            r   r   )r   rP   )r>   depthchildren	max_depthrS   e_depth	get_depthr   s         r   rv   z<OutputFormatter.remove_trailing_media_div.<locals>.get_depth   sg     {..t44H I ( (#)Auqy11Y&& 'Ir      NrO      )r   )r   rP   r   rm   rQ   )r   top_level_nodes	last_noderv   s   `  @r   r'   z)OutputFormatter.remove_trailing_media_div   s    	 	 	 	 	 	 	 +11$2C2C2E2EFF!##F#B'	9Y1$$Ky))))) %$r   N)__name__
__module____qualname__r   r   r   r+   r(   r!   r#   r$   r"   r   r%   r&   r'    r   r   r	   r	      s        6 6 6; ; ;    ,! ! ! 6 6 6  * * *8 8 8
. 
. 
.B B B' ' '$* * * * *r   r	   )__doc__r9   
__author____license____copyright__r)   r   loggingr*   r   	getLoggerr{   r7   rh   r	   r~   r   r   <module>r      s     	
/              g!!T* T* T* T* T*f T* T* T* T* T*r   