
    %$}g^K              	          d Z ddlZddlZddlmZ ddlmZ ddlmZm	Z	m
Z
mZ ddlmZ ddlmZmZ dd	lmZmZ  ej        e          Z ej        d
          ZdedefdZh dZddhZh dZh dZh dZh dZ  ej        dej!                   ej        dej!                   ej        dej!                   ej        dej!                   ej        dej!                   ej        dej!                  dZ"ddhZ#ddhZ$d ede%fd!Z& G d" d#          Z' G d$ d%          Z( ej        d&ej!                   ej        d'ej!                  d(Z) ej        d)ej!                  Z*d*ede+fd+Z,i fded,ede+fd-Z-dS ).a  Minimalistic fork of readability-lxml code

This is a python port of a ruby port of arc90's readability project

http://lab.arc90.com/experiments/readability/

Given a html document, it pulls out the main body text and cleans it up.

Ruby port by starrhorne and iterationlabs
Python port by gfxmonk

For list of contributors see
https://github.com/timbertson/python-readability
https://github.com/buriy/python-readability

License of forked code: Apache-2.0.
    N)sqrt)
attrgetter)AnyDictOptionalSet)tostring)HtmlElementfragment_fromstring   )	load_htmltrimz\.( |$)stringreturnc                 0    t          | t          d          S )Nxml)encodingmethod)r	   str)r   s    \/var/www/py-google-trends/myenv/lib/python3.11/site-packages/trafilatura/readability_lxml.py	_tostringr   &   s    FS7777    >
   apdloluldivimgpretable
blockquoter   article>   tdr    r"   >	   ddr   dtlir   r   formasideaddress>
   h1h2h3h4h5h6thnavfooterheader>   r   r   r'   r   embedinputzcombx|comment|community|disqus|extra|foot|header|menu|remark|rss|shoutbox|sidebar|sponsor|ad-break|agegate|pagination|pager|popup|tweet|twitterz#and|article|body|column|main|shadowzKarticle|body|content|entry|hentry|main|page|pagination|post|text|blog|storyzbutton|combx|comment|com-|contact|figure|foot|footer|footnote|form|input|masthead|media|meta|outbrain|promo|related|scroll|shoutbox|sidebar|sponsor|shopping|tags|tool|widgetz.<(?:a|blockquote|dl|div|img|ol|p|pre|table|ul)z+https?:\/\/(?:www\.)?(?:youtube|vimeo)\.com)unlikelyCandidatesReokMaybeItsACandidateRe
positiveRe
negativeRedivToPElementsRevideoRebodyhtmlr   r   elemc                 ^    t          t          |                                                     S )z7Return the length of the element with all its contents.)lenr   text_content)r?   s    r   text_lengthrC   W   s$    tD%%''(()))r   c                   .    e Zd ZdZddgZdededdfdZdS )	Candidatez,Defines a class to score candidate elements.scorer?   r   Nc                 "    || _         || _        d S N)rF   r?   )selfrF   r?   s      r   __init__zCandidate.__init__a   s    !
!%			r   )__name__
__module____qualname____doc__	__slots__floatr
   rJ    r   r   rE   rE   \   sM        22&!I&e &; &4 & & & & & &r   rE   c            	          e Zd ZdZg dZddedededd	fd
ZdefdZ	de
eef         dedefdZde
eef         dee         fdZdedefdZde
eef         fdZdedefdZdedefdZddZddZdede
eef         defdZd	S )Documentz,Class to build a etree document out of html.docmin_text_lengthretry_length      rU   rV   rW   r   Nc                 0    || _         || _        || _        dS )a  Generate the document

        :param doc: string of the html content.
        :param min_text_length: Set to a higher value for more precise detection of longer texts.
        :param retry_length: Set to a lower value for better detection of very small texts.

        The Document class is not re-enterable.
        It is designed to create a new Document() for each HTML file to process it.

        API method:
        .summary() -- cleaned up content
        NrT   )rI   rU   rV   rW   s       r   rJ   zDocument.__init__k   s!     .(r   c                 x   | j                             dd          D ]}|                                 d}	 |r|                                  |                                  |                                 }|                     |          }|r|                     ||          }n`|du rd}t          	                    d           t          	                    d           | j         
                    d          }||n| j         }|                     ||          }t          |pd	          }|r|| j        k     rd}|S )
z
        Given a HTML file, extracts the text of the article.

        Warning: It mutates internal DOM representation of the HTML document,
        so it is better to call other API methods before this one.
        scriptstyleTFz5Ended up stripping too much - going for a safer parsez=Ruthless and lenient parsing did not work. Returning raw htmlr=   N )rU   iter	drop_treeremove_unlikely_candidates&transform_misused_divs_into_paragraphsscore_paragraphsselect_best_candidateget_articleLOGGERdebugfindsanitizerA   rW   )	rI   r?   ruthless
candidatesbest_candidater#   r=   cleaned_articlearticle_lengths	            r   summaryzDocument.summary|   sd    HMM(G44 	 	DNN	# 2//11177999..00J!77
CCN A**:~FFt##$HLLO   S   x}}V,,"&"2$$"mmGZ@@O !6B77N NT->>> ""r   rk   rl   c                    t          d|j        dz            }t          d          }|j                                        }|t          |          n|j        g}|D ]}d}||j        k    s||v r||         j        |k    rd}nl|j        dk    ra|                     |          }	|j        pd}
t          |
          }|dk    r|	d	k     s&|dk    r"|	d
k    rt                              |
          rd}|r|                    |           |S )N
   皙?z<div/>FTr   r^   P   g      ?r   )maxrF   r   r?   	getparentlisttagget_link_densitytextrA   	DOT_SPACEsearchappend)rI   rk   rl   sibling_score_thresholdoutputparentsiblingssiblingr|   link_densitynode_contentnode_lengths               r   re   zDocument.get_article   sG    #&b.*>*D"E"E$X..$..00#)#54<<<N<O;P 	' 	'G F.---:%%w'-1HHH###44W==&|1r!,//  "$$$t++#r))(A--%,,\:: . "F 'g&&& r   c                 \   |sd S t          |                                t          d          d          }t                              t
          j                  r8|d d         D ]-}t                              d|j        j	        |j
                   .t          t          |                    S )NrF   T)keyreverse   zTop 5: %s %s)sortedvaluesr   rf   isEnabledForloggingDEBUGrg   r?   rw   rF   nextr_   )rI   rk   sorted_candidates	candidates       r   rd   zDocument.select_best_candidate   s     	4"Z%8%8$
 
 
 w}-- 	R.rr2 R R	^Y^-?QQQQD*++,,,r   r?   c                     t          |          pd}t          d |                    d          D                       }||z  S )Nr   c              3   4   K   | ]}t          |          V  d S rH   )rC   ).0links     r   	<genexpr>z,Document.get_link_density.<locals>.<genexpr>   s*      MM+d++MMMMMMr   z.//a)rC   sumfindall)rI   r?   total_lengthlink_lengths       r   rx   zDocument.get_link_density   sG    "4((-AMMV8L8LMMMMM\))r   c                    i }| j                             ddd          D ]}|                                }||                                }t          |                                          }t          |          }|| j        k     ri||fD ] }|||vr|                     |          ||<   !dt          |                    d                    z   t          |dz  d          z   }||         xj
        |z  c_
        |||         xj
        |dz  z  c_
        |                                D ]+\  }}	|	xj
        d|                     |          z
  z  c_
        ,|S )	Nr   r    r$   r   ,d         )rU   r_   ru   r   rB   rA   rV   
score_nodesplitminrF   itemsrx   )
rI   rk   r?   parent_nodegrand_parent_node	elem_textelem_text_lennoderF   r   s
             r   rc   zDocument.score_paragraphs   s   
HMM#ud33 	A 	AD..**K" + 5 5 7 7T..0011I	NNM t333$&78 = =#J(>(>'+t'<'<Jt$IOOC00111C9Lq4Q4QQE {#))U2)) ,,-33uqy@33
  *//11 	? 	?OD)OOq4#8#8#>#>>>OOOr   c                    d}t          d |                    d          |                    d          f          D ]L}t          d                             |          r|dz  }t          d                             |          r|dz  }M|S )Nr   classidr:   rX   r9   )filtergetREGEXESr{   )rI   r?   weight	attributes       r   class_weightzDocument.class_weight  s    txx'8'8$((4..&IJJ 	 	I|$++I66 "|$++I66 "r   c                    |                      |          }t          |j                  }|                                }|t          v r|dz  }n,|t
          v r|dz  }n|t          v r|dz  }n|t          v r|dz  }t          ||          S )Nr   r   )	r   r   rw   lower
DIV_SCORESBLOCK_SCORESBAD_ELEM_SCORESSTRUCTURE_SCORESrE   )rI   r?   rF   rw   names        r   r   zDocument.score_node  s    !!$''$(mmyy{{:QJEE\!!QJEE_$$QJEE%%%QJE%%%r   c           
         | j                             d          D ]}d                    t          d |                    d          |                    d          f                    }t          |          dk     ra|j        t          vrTt          d         	                    |          r4t          d         	                    |          s|
                                 d S )Nz.//* r   r   r   r7   r8   )rU   r   joinr   r   rA   rw   
FRAME_TAGSr   r{   r`   )rI   r?   attrss      r   ra   z#Document.remove_unlikely_candidates  s    H$$V,, 
	! 
	!DHHVD488G+<+<dhhtnn*MNNOOE5zzA~~
**23::5AA + !9:AA%HH +
    
	! 
	!r   c                    | j                             d          D ]\}t          d                             d                    t          t          t          |                                        sd|_        ]| j                             d          D ]}|j	        rR|j	        
                                r9t          d          }|j	        d c|_	        |_	        |                    d|           t          t          |          d          D ]\  }}|j        rU|j        
                                r<t          d          }|j        d c|_	        |_        |                    |d	z   |           |j        d
k    r|                                 d S )Nz.//divr;   r^   r   z<p/>r   T)r   r   br)rU   r   r   r{   r   mapr   rv   rw   ry   stripr   insertr   	enumeratetailr`   )rI   r?   p_elemposchilds        r   rb   z/Document.transform_misused_divs_into_paragraphs)  s   H$$X.. 	 	D -.55ItDzz2233   H$$X.. 	& 	&Dy 'TY__.. ',V44)-D&TYAv&&&$Yt__dCCC & &
U: 1%*"2"2"4"4 1088F.3j$+FKKKa0009$$OO%%%&	& 	&r   r   c           	      P	   |                     dddddd          D ]H}|                     |          dk     s|                     |          dk    r|                                 I|                     d	d
          D ]                                 |                     d          D ]Rdj        v r3t
          d                             j        d                   rd_        >                                 St                      }t          |
                    d                    D ]a|v r|                               }|v r|         j        nd}||z   dk     r7t                              dj        ||                                            p                                                    d          dk     rd}fdt"          D             }|dxx         dz  cc<   |dxx         t%                              d                    z  cc<   t)                    }	|                               }
                                }|||v r||         j        nd}|d         r&|d         d|d         dz  z   k    rd|d          d}n|d         |d         k    rj        t,          vrd}n|d         |d         d z  k    rd!}n}|	| j        k     r|d         dk    rd"|	 d#}n^|	| j        k     r|d         d$k    rd"|	 d%}n?|d&k     r|
d'k    rd(|
d)d*| }n(|d&k    r|
d+k    rd(|
d)d*| }n|d,         dk    r|	d-k     s|d,         dk    rd.}n|	sd/}g }                                D ]*}t)          |          }|r|                    |            n+t%          |          dz   }                    d0          D ]=}t)          |          }|r*|                    |           t%          |          |k    r n>|r@t5          |          d1k    r-d2}|                                         d3d4d5d6                     nd2}|r9                                 t                              d7|j        ||pd8           c|| _        t;          | j                  S )9Nr+   r,   r-   r.   r/   r0   r   gQ?r(   textareaiframesrcr<   VIDEOz6//table|//ul|//div|//aside|//header|//footer|//sectionz+Removed %s with score %6.3f and weight %-3sr   rq   Tc           	      \    i | ](}|t                              d |                     )S )z.//)rA   r   )r   kindr?   s     r   
<dictcomp>z%Document.sanitize.<locals>.<dictcomp>g  sB       >BD#dll<<<8899  r   r'   r   r6   z.//input[@type="hidden"]r   r   r   g?ztoo many images ()zmore <li>s than <p>sr   zless than 3x <p>s than <input>sztoo short content length z without a single imager   z and too many imagesrX   rr   ztoo many links z.3fz for its weight g      ?r5   K   z<<embed>s with too short content length, or too many <embed>sz
no content)	precedingi  Fr!   r   r   sectionz0Removed %6.3f %s with weight %s cause it has %s.r^   )r_   r   rx   r`   attribr   r{   ry   setreversedxpathrF   rf   rg   rw   rB   countTEXT_CLEAN_ELEMSrA   r   rC   ru   	LIST_TAGSrV   itersiblingsr|   r   updaterU   r   )rI   r   rk   r4   allowedr   rF   	to_removecountscontent_lengthr   r   reasonr   sibsib_content_lengthlimitr?   s                    @r   ri   zDocument.sanitizeF  s   iidD$dCC 	# 	#F  ((1,,0E0Ef0M0MPT0T0T  """IIfj11 	 	DNNIIh'' 	! 	!D##	(:(A(A$+eBT(U(U##		    $'EEJJOPP
 
 ]	 ]	D w&&t,,F.2j.@.@Jt$**aE~!!AH	       ""$$**3//"44 	   FV   t#w3t||4N'O'O#P#PP "-T!2!2#44T::"nn..* '*44 #;/55  #; 0&6%=1vc{S7H3H#H#HAAAAFFD\F3K//DHI4M4M3FFG_sa88>FF#d&:::ve}PQ?Q?Q````FF#d&:::ve}q?P?PXNXXX F b[[\C%7%7T,TTTFTT F r\\lS&8&8T,TTTFTT F Wo**~/B/BvHH H W F ( &)F  "H#0022 " "-8-=-=*- "$OO,>???!E"  MMA-E#0040@@ & &-8-=-=*- &$OO,>???"8}}55 % SCMMD$8$8$)	tyy$y'Q'QRRR %I NN$$$LLJ"   """r   )rX   rY   )r   N)rK   rL   rM   rN   rO   r
   intrJ   r   ro   r   rE   re   r   rd   rP   rx   rc   r   r   ra   rb   ri   rQ   r   r   rS   rS   f   s       66:::I) )K )# )RU )`d ) ) ) )"*# *# *# *# *#X'd;	+A&B 'T] 'bm ' ' ' 'R	-[)5K0L 	-QYZcQd 	- 	- 	- 	-*[ *U * * * *
"${I'="> " " " "H     &{ &y & & & &! ! ! !& & & &:p#[ p#d;	;Q6R p#WZ p# p# p# p# p# p#r   rS   z-ad-|ai2html|banner|breadcrumbs|combx|comment|community|cover-wrap|disqus|extra|footer|gdpr|header|legends|menu|related|remark|replies|rss|shoutbox|sidebar|skyscraper|social|sponsor|supplemental|ad-break|agegate|pagination|pager|popup|yom-remotez+and|article|body|column|content|main|shadow)unlikelyCandidatesokMaybeItsACandidatezdisplay:\s*noner   c                     d| j         v r0t                              |                     dd                    rdS d| j         v rdS |                     d          dk    rd|                     dd          vrdS d	S )
zT
    Checks if the node is visible by considering style, attributes, and class.
    r]   r^   Fhiddenzaria-hiddentruezfallback-imager   T)r   DISPLAY_NONEr{   r   )r   s    r   is_node_visibler     s    
 $+,"5"5dhhw6K6K"L"Lu4;uxx&((-=TXXF F . . u4r   optionsc                 D   t          |           }|dS |                    dd          }|                    dd          }|                    dt                    }t          |                    d                    }|                    d	 |                    d
          D                        d}|D ]} ||          s|                    dd           d|                    dd           }	t          d                             |	          r!t          d                             |	          s~|                    d          rt          |	                                
                                          }
|
|k     r|t          |
|z
            z  }||k    r dS dS )z]
    Decides whether or not the document is reader-able without parsing the whole thing.
    NFmin_content_length   	min_score   visibility_checkerz.//p | .//pre | .//articlec              3   >   K   | ]}|                                 V  d S rH   )ru   )r   r   s     r   r   z)is_probably_readerable.<locals>.<genexpr>  s,      EEd!!EEEEEEr   z	.//div/brg        r   r^   r   r   r   r   z./parent::li/pT)r   r   r   r   r   r   REGEXPSr{   rA   rB   r   r   )r>   r   rU   r   r   r   nodesrF   r   class_and_idtext_content_lengths              r   is_probably_readerabler     s    D//C
{u %93??K,,I %9?KK		67788E	LLEEcii.D.DEEEEEEE  !!$'' 	((7B//FF$((42D2DFF'(//== 	g"G

&

	 ::&'' 	!$"3"3"5"5";";"="=>>!333),>>???944  5r   ).rN   r   remathr   operatorr   typingr   r   r   r   
lxml.etreer	   	lxml.htmlr
   r   utilsr   r   	getLoggerrK   rf   compilerz   r   r   DIV_TO_P_ELEMSr   r   r   r   r   Ir   r   r   r   rC   rE   rS   r   r   boolr   r   rQ   r   r   <module>r     s3   $  				             + + + + + + + + + + + +       6 6 6 6 6 6 6 6 " " " " " " " "		8	$	$ BJz""	8k 8c 8 8 8 8   Y
***RRRXXX <<<  'BJ 	[
  )bj)OQSQUVV"*V
  "* 	y
  #
924  rzH"$OO# ( f
4L	*k *c * * * *
& & & & & & & &P# P# P# P# P# P# P# P#r
 %"* 	A
  'BJ6   rz,bd33+ $      <> % % %s %$ % % % % % %r   