
    !}g(                     :    d Z ddlZddlmZ  G d de          ZdS )zH
Holds the code for cleaning out unwanted tags from the lxml
dom xpath.
    N   )ReplaceSequencec                   h    e Zd Zd Zd Zd Zd Zd Zd Zd Z	d Z
d	 Zd
 Zd Zd Zd Zd Zd ZdS )DocumentCleanerc                    || _         | j                                         | _        d| _        d| _        d| j        z  | _        d| j        z  | _        d| j        z  | _        d| _        d| _	        d| _
        d	| _        d
| _        d| _        d| _        t                                          dd                              d                              d          | _        d| _        dS )zVSet appropriate tag names and regexes of tags to remove
        from the HTML
        af  ^side$|combx|retweet|mediaarticlerelated|menucontainer|navbar|storytopbar-bucket|utility-bar|inline-share-tools|comment|PopularQuestions|contact|foot|footer|Footer|footnote|cnn_strycaptiontxt|cnn_html_slideshow|cnn_strylftcntnt|links|meta$|shoutbox|sponsor|tags|socialnetworking|socialNetworking|cnnStryHghLght|cnn_stryspcvbx|^inset$|pagetools|post-attributes|welcome_form|contentTools2|the_answers|communitypromo|runaroundLeft|subscribe|vcard|articleheadings|date|^print$|popup|author-dropdown|tools|socialtools|byline|konafilter|KonaFilter|breadcrumbs|^fn$|wp-caption-text|legende|ajoutVideo|timestamp|js_repliesz$http://exslt.org/regular-expressionsz//*[re:test(@id, '%s', 'i')]z//*[re:test(@class, '%s', 'i')]z//*[re:test(@name, '%s', 'i')]z,<(a|blockquote|dl|div|img|ol|p|pre|table|ul)z	^caption$z google z^[^entry-]more.*$z[^-]facebookzfacebook-broadcastingz[^-]twitter
z

	z^\s+$z<.//article|.//*[@id="article"]|.//*[@itemprop="articleBody"]N)config
get_parserparserremove_nodes_reregexp_namespacenauthy_ids_renauthy_classes_renauthy_names_rediv_to_p_re
caption_re	google_re
entries_refacebook_refacebook_braodcasting_re
twitter_rer   createappendtablines_replacementscontains_article)selfr
   s     R/var/www/py-google-trends/myenv/lib/python3.11/site-packages/newspaper/cleaners.py__init__zDocumentCleaner.__init__   s     k,,..7 	 !G<"23"C"&"6#7 @ $ 4!5J%#-)(?%'%4%6%6VD&!!VD\\VH 	" !_    c                    |                      |          }|                     |          }|                     |          }|                     |          }|                     |          }|                     |          }|                     || j                  }|                     || j                  }|                     || j	                  }|                     || j
                  }|                     || j                  }|                     || j                  }|                     |          }|                     |d          }|                     |d          }|                     |d          }|S )z.Remove chunks of the DOM as specified
        divspansection)clean_body_classesclean_article_tagsclean_em_tagsremove_drop_capsremove_scripts_stylesclean_bad_tagsremove_nodes_regexr   r   r   r   r   r   clean_para_spansdiv_to_para)r   doc_to_cleans     r   cleanzDocumentCleaner.clean4   sa    ..|<<..|<<)),77,,\::11,??**<88..|T_MM..|T^LL..|T_MM..|T=MNN..|/3/LN N..|T_MM,,\::''e<<''f==''i@@r    c                     | j                             |d          }|r"| j                             |d         d           |S )zRemoves the `class` attribute from the <body> tag because
        if there is a bad match, the entire DOM will be empty!
        bodytagr   classattrr   getElementsByTagdelAttribute)r   docelementss      r   r%   z"DocumentCleaner.clean_body_classesJ   sK     ;///@@ 	@K$$Xa[w$???
r    c                     | j                             |d          }|D ]#}dD ]}| j                             ||           $|S )Narticler2   )idnamer4   r5   r7   )r   r:   articlesr=   r6   s        r   r&   z"DocumentCleaner.clean_article_tagsS   sb    ;///CC 	= 	=G/ = =((t(<<<<=
r    c                     | j                             |d          }|D ]K}| j                             |d          }t          |          dk    r| j                             |           L|S )Nemr2   imgr   )r   r8   lendrop_tag)r   r:   emsnodeimagess        r   r'   zDocumentCleaner.clean_em_tagsZ   sr    k**3D*99 	+ 	+D[11$E1BBF6{{a$$T***
r    c                 z    | j                             |d          }|D ]}| j                             |           |S )Nz+span[class~=dropcap], span[class~=drop_cap]r   
css_selectrE   )r   r:   itemsitems       r   r(   z DocumentCleaner.remove_drop_capsb   sO    &&s -? @ @ 	' 	'DK  &&&&
r    c                 d   | j                             |d          }|D ]}| j                             |           | j                             |d          }|D ]}| j                             |           | j                             |          }|D ]}| j                             |           |S )Nscriptr2   style)r   r8   removegetComments)r   r:   scriptsrM   stylescommentss         r   r)   z%DocumentCleaner.remove_scripts_stylesi   s    +..s.AA 	% 	%DKt$$$$--cw-?? 	% 	%DKt$$$$;**3// 	% 	%DKt$$$$
r    c                    | j                             || j                  }|D ]6}|                    | j                  s| j                             |           7| j                             || j                  }|D ]6}|                    | j                  s| j                             |           7| j                             || j                  }|D ]6}|                    | j                  s| j                             |           7|S N)r   xpath_rer   xpathr   rQ   r   r   )r   r:   naughty_listrG   naughty_classesnaughty_namess         r   r*   zDocumentCleaner.clean_bad_tagsy   s   {++C1CDD  	) 	)D::d344 )""4(((+..sD4JKK# 	) 	)D::d344 )""4(((,,S$2FGG! 	) 	)D::d344 )""4(((
r    c                     dD ]E}d|d|d}| j                             ||          }|D ]}| j                             |           F|S )N)r>   r4   z//*[re:test(@z, 'z', 'i')])r   rX   rQ   )r   r:   patternselectorregrZ   rG   s          r   r+   z"DocumentCleaner.remove_nodes_regex   sn    ' 	) 	)HH4<HHgggFC;//S99L$ ) )""4(((()
r    c                 z    | j                             |d          }|D ]}| j                             |           |S )Nzp spanrJ   )r   r:   spansrM   s       r   r,   z DocumentCleaner.clean_para_spans   sF    &&sH55 	' 	'DK  &&&&
r    c                 6    | j                             |          S rW   )r   
textToPara)r   replacement_textr:   s      r   get_flushed_bufferz"DocumentCleaner.get_flushed_buffer   s    {%%&6777r    c                    |}| j                             |          }t          |          dk    rJ| j                            |          }|| j                            |          dk    r| j                            |d          dk    rd| j                            |          z   dz   }|                    |           |                    |           | j        	                    |dd           | j                            |          }|=| j                            |          dk    r| j                            |d          dk    |                    |           | j        
                    |          }	|	| j                            |	          dk    r| j                            |	d          dk    rd| j                            |	          z   dz   }|                    |           |                    |	           | j        	                    |	dd           | j        
                    |	          }	|	C| j                            |	          dk    r+| j                            |	d          dk    d S d S d S d S d S d S d S )Nr   azgrv-usedalreadyyes )r6   value)r   
replaceAllrD   r   previousSiblinggetTaggetAttribute	outerHtmlr   setAttributenextSibling)
r   kidkid_textre   nodes_to_removekid_text_nodereplace_text	prev_nodeouter	next_nodes
             r   replace_walk_left_rightz'DocumentCleaner.replace_walk_left_right   s   1<<XFF|q  33MBBI'**955<<00!#46 69>? ?dk33I>>>D ''...&&y111((9J/4 ) 6 6 6 K77	BB	 '**955<<00!#46 69>? ? ##L111//>>I'**955<<00!#46 69>? ?dk33I>>>D ''...&&y111((9J/4 ) 6 6 6 K33I>>	 '**955<<00!#46 69>? ? ? ?# !  ('''<<? ? =<r    c                    g }g }g }| j                             |          }|D ]}| j                             |          dk    rit          |          dk    rV|                     d                    |          |          }|                    |           g }|                    |           | j                             |          r3| j                             |          }	| 	                    ||	||           |                    |           t          |          dk    r@|                     d                    |          |          }|                    |           g }|D ]}
| j         
                    |
           |S )Npr    )r   childNodesWithTextrn   rD   rf   joinr   
isTextNodegetTextr{   rQ   )r   r:   r"   re   nodes_to_returnru   kidsrs   new_nodert   ns              r   get_replacement_nodesz%DocumentCleaner.get_replacement_nodes   s   {--c22 	, 	,C{!!#&&#--#6F2G2G!2K2K22GG,--s4 4&&x000#% &&s++++'',, ,;..s33,,S(<L-<> > > >  &&s++++   1$$..rww7G/H/H#NNH""8,,,!  	" 	"AKq!!!!r    c                 <    | j                             |d           d S )Nr}   )r   
replaceTag)r   r:   r"   s      r   replace_with_paraz!DocumentCleaner.replace_with_para   s     sC(((((r    c                 V   d}d}| j                             ||          }g d}|D ] }| j                             ||          }|/t          |          dk    r|                     ||           |dz  }O||                     ||          }	d |	D             }	t          j        |j                  }
|	                                 t          |	          D ]\  }}|                    ||           |
                                D ]\  }}|                    ||           |dz  }|S )Nr   r2   )
rh   
blockquotedlr"   rC   olr}   pretableulr   c                     g | ]}||S rW    ).0r   s     r   
<listcomp>z/DocumentCleaner.div_to_para.<locals>.<listcomp>   s     K K KqQ]]]]r    )r   r8   getElementsByTagsrD   r   r   copydeepcopyattribclear	enumerateinsertrL   set)r   r:   dom_typebad_divs	else_divsdivstagsr"   rL   replace_nodesr   irG   r?   rk   s                  r   r-   zDocumentCleaner.div_to_para   sO   	{++CX+>>& & & 	 	CK11#t<<E3u::??&&sC000A $ : :3 D D K KM K K Ksz22		(77 ( (GAtJJq$''''#)<<>> ) )KD%GGD%((((Q	
r    N)__name__
__module____qualname__r   r/   r%   r&   r'   r(   r)   r*   r+   r,   rf   r{   r   r   r-   r   r    r   r   r   
   s        &_ &_ &_P  ,             $    8 8 8? ? ?<  @) ) )    r    r   )__doc__r   utilsr   objectr   r   r    r   <module>r      sj      " " " " " "h h h h hf h h h h hr    