
    $$}gy3                     0   d Z ddlmZ ddlmZmZmZ ddlZddlZddl	Z	 ddl
mZ n# e$ r	 ddlmZ Y nw xY wddlmZ ddlmZ dd	lmZ dd
lmZmZ ddlmZ dZdZdZdZdZdZdZ e h d          Z!dZ"dZ# ej$        dej%                  Z&ddhZ' G d de(          Z) G d de)          Z*e"de#fdZ+e"de#fdZ,d Z- G d d e          Z. G d! d"e/          Z0 ed#$          d%             Z1eeeeeefd&Z2d' Z3d( Z4d) Z5efd*Z6eeeeeeede"e#e-fd+Z7dS ),zc
Copyright (c) 2011 Jan Pomikalek

This software is licensed as described in the file LICENSE.rst.
    )absolute_import)divisionprint_functionunicode_literalsN)	lru_cache)Cleaner)ContentHandler   )	Paragraph)unicodeignored)is_blankg?F      g333333?g{Gz?F>    pdddldth1h2h3h4h5h6litdthtrulcoldivprebodyformtabletfoottheadcenterlegendoptioncaptioncolgroupfieldsetoptgrouptextarea
blockquoteutf8replaces#   <meta[^>]+charset=["']?([^'"/>\s]+)goodbadc                       e Zd ZdZdS )JustextErrorz"Base class for jusText exceptions.N)__name__
__module____qualname____doc__     L/var/www/py-google-trends/myenv/lib/python3.11/site-packages/justext/core.pyr6   r6   1   s        ((((r<   r6   c                       e Zd ZdS )JustextInvalidOptionsN)r7   r8   r9   r;   r<   r=   r?   r?   5   s        Dr<   r?   c                    t          | t                    r| }|r|n|}|                     ||          } nt          | |||          }	 t          j                            |t          j                                                  }nM# t          $ r@ t          j                            | t          j                                                  }Y nw xY w|S )zConverts HTML to DOM.)parser)	
isinstancer   encodedecode_htmllxmlhtml
fromstring
HTMLParser
ValueError)rF   default_encodingencodingerrorsdecoded_htmlforced_encodingdoms          r=   html_to_domrP   9   s    $   M&.D((4D{{?F33"4)98VLLHi""<	8L8L8N8N"OO H H H i""4	0D0D0F0F"GGH
 Js   =B ACCc                 H   t          | t                    r| S |r|                     ||          S t                              |           }|rj|                    d                              d          }t          t                    5  |                     ||          cddd           S # 1 swxY w Y   	 |                     d          S # t          $ rK 	 |                     ||          cY S # t          $ r$}t          dt          |          z             d}~ww xY ww xY w)zv
    Converts a `html` containing an HTML page into Unicode.
    Tries to guess character encoding from meta tag.
    r
   ASCIINr1   z&Unable to decode the HTML to Unicode: )
rB   r   decodeCHARSET_META_TAG_PATTERNsearchgroupr   LookupErrorUnicodeDecodeErrorr6   )rF   rJ   rK   rL   matchdeclared_encodinges          r=   rD   rD   M   s   
 $    -{{8V,,,$++D11E :!KKNN11'::[!! 	: 	:;;0&99	: 	: 	: 	: 	: 	: 	: 	: 	: 	: 	: 	: 	: 	: 	: 	:V{{6""" V V V	V;;/88888! 	V 	V 	VG'RS**TUUU	V	VsB   B++B/2B/7C 
D!C/,D!/
D9DDD!c                 h    dddddddddddddddd}t          di |}|                    |           S )zRemoves unwanted parts of DOM.FT)head)processing_instructionsremove_unknown_tagssafe_attrs_onlypage_structureannoying_tagsframesmetalinks
javascriptscriptscommentsstyleembeddedforms	kill_tagsr;   )r   
clean_html)rO   optionscleaners      r=   preprocessorrp   k   sf     $)$  G"     Gc"""r<   c                   L    e Zd ZdZed             Zd Zd Zd Zd Z	d Z
d Zd	S )
ParagraphMakerzg
    A class for converting a HTML page represented as a DOM object into a list
    of paragraphs.
    c                 d     |             }t           j                            ||           |j        S )zConverts DOM into paragraphs.)rE   saxsaxify
paragraphs)clsroothandlers      r=   make_paragraphszParagraphMaker.make_paragraphs   s.     #%%g&&&!!r<   c                     t                      | _        g | _        d | _        d| _        d| _        |                                  d S NF)PathInfopathrv   	paragraphlinkbr_start_new_pragraphselfs    r=   __init__zParagraphMaker.__init__   sA    JJ		  """""r<   c                     | j         r8| j                                         r| j                            | j                    t	          | j                  | _         d S N)r   contains_textrv   appendr   r~   r   s    r=   r   z"ParagraphMaker._start_new_pragraph   sN    > 	3dn::<< 	3O""4>222"49--r<   c                    |d         }| j                             |           |t          v s|dk    r8| j        r1|dk    r| j        xj        dz  c_        |                                  d S t          |dk              | _        | j        r| j                            d           n|dk    rd| _	        | j        xj        dz  c_        d S )Nr
   r    aT)
r~   r   PARAGRAPH_TAGSr   r   
tags_countr   boolappend_textr   )r   nameqnameattrss       r=   startElementNSzParagraphMaker.startElementNS   s    Aw	>!!ddlltwlt|| ))Q.))$$&&&&&44<((DGw !**3//// 	N%%*%%%%r<   c                     |d         }| j                                          |t          v r|                                  |dk    r	d| _        d S d S )Nr
   r   F)r~   popr   r   r   )r   r   r   s      r=   endElementNSzParagraphMaker.endElementNS   sR    Aw	>!!$$&&&3;;DIII ;r<   c                 .    |                                   d S r   )r   r   s    r=   endDocumentzParagraphMaker.endDocument   s      """""r<   c                     t          |          rd S | j                            |          }| j        r"| j        xj        t          |          z  c_        d| _        d S r|   )r   r   r   r   chars_count_in_linkslenr   )r   contenttexts      r=   
characterszParagraphMaker.characters   s]    G 	F~))'229 	=N//3t99<//r<   N)r7   r8   r9   r:   classmethodrz   r   r   r   r   r   r   r;   r<   r=   rr   rr      s         
 " " ["# # #. . .+ + +&  # # #    r<   rr   c                   R    e Zd Zd Zed             Zed             Zd Zd Zd Z	dS )r}   c                     g | _         d S r   	_elementsr   s    r=   r   zPathInfo.__init__   s    r<   c                 J    d                     d | j        D                       S )N.c              3   &   K   | ]}|d          V  dS )r   Nr;   .0r[   s     r=   	<genexpr>zPathInfo.dom.<locals>.<genexpr>   s&      55!555555r<   joinr   r   s    r=   rO   zPathInfo.dom   s%    xx55dn555555r<   c                 P    dd                     d | j        D                       z   S )N/c              3   0   K   | ]}d |dd         z  V  dS )z%s[%d]N   r;   r   s     r=   r   z!PathInfo.xpath.<locals>.<genexpr>   s/      GG1h2A2.GGGGGGr<   r   r   s    r=   xpathzPathInfo.xpath   s*    SXXGGGGGGGGGr<   c                     |                                  }|                    |d          dz   }|||<   ||i f}| j                            |           | S )Nr   r
   )_get_childrengetr   r   )r   tag_namechildrenorder
xpath_parts        r=   r   zPathInfo.append   s\    %%''Xq))A-"r*
j)))r<   c                 :    | j         si S | j         d         d         S )Nr   r   r   s    r=   r   zPathInfo._get_children   s#    ~ 	I~b!!$$r<   c                 8    | j                                          | S r   )r   r   r   s    r=   r   zPathInfo.pop   s    r<   N)
r7   r8   r9   r   propertyrO   r   r   r   r   r;   r<   r=   r}   r}      s           6 6 X6 H H XH  % % %    r<   r}      )maxsizec                 8    t          d | D                       } | S )z7Lower-case all words in stoplist and create frozen set.c              3   >   K   | ]}|                                 V  d S r   )lower)r   ws     r=   r   z"define_stoplist.<locals>.<genexpr>   s*      55q555555r<   )	frozenset)stoplists    r=   define_stoplistr      s%     55H55555HOr<   c                    t          |          }| D ]}t          |          }	|                    |          }
|                                }t	          | o|j                  |_        ||k    rd|_        dd|j        v s	d|j        v rd|_        ~d|j	        v rd|_        |	|k     r|j
        dk    rd|_        d|_        |
|k    r|	|k    rd|_        d|_        |
|k    rd|_        d|_        d	S )
z&Context-free paragraph classification.r4      ©z&copyselectr   shortr3   neargoodN)r   r   stopwords_densitylinks_densityr   
is_headingheadingcf_classr   dom_pathr   )rv   r   
length_lowlength_highstopwords_lowstopwords_highmax_link_densityno_headingsr   lengthstopword_densitylink_densitys               r=   classify_paragraphsr      sD    x((H ' '	Y$66x@@ ..00 [!IY5IJJ	***!&I	&&Gy~,E,E!&I+++!&Ij  -11%*	""%,	""//##%+	""%/	""..!+I!&I3' 'r<   c                 x    | |z   |k    r0| |z  } ||          j         }|t          v r|S |dk    r|s|S | |z   |k    0dS )Nr   r4   )
class_typeGOOD_OR_BAD)irv   ignore_neargoodincboundarycs         r=   _get_neighbourr     s`    
c'X

	SqM$H
????H c'X

 5r<   c                 (    t          | ||dd          S )z
    Return the class of the paragraph at the top end of the short/neargood
    paragraphs block. If ignore_neargood is True, than only 'bad' or 'good'
    can be returned, otherwise 'neargood' can be returned, too.
    r   )r   r   rv   r   s      r=   get_prev_neighbourr   !  s     !Z"bAAAr<   c           	      B    t          | ||dt          |                    S )z
    Return the class of the paragraph at the bottom end of the short/neargood
    paragraphs block. If ignore_neargood is True, than only 'bad' or 'good'
    can be returned, otherwise 'neargood' can be returned, too.
    r
   )r   r   r   s      r=   get_next_neighbourr   *  s      !Z!S__MMMr<   c                    t          |           D ]\  }}|j        |_        |j        r|j        dk    s$|dz   }d}|t	          |           k     rZ||k    rT| |         j        dk    rd|_        n;|t	          | |         j                  z  }|dz  }|t	          |           k     r||k    Ti }t          |           D ]\  }}|j        dk    rt          || d          }t          || d          }|dk    r|dk    rd||<   G|dk    r|dk    rd||<   Y|dk    rt          || d	          dk    s|dk    rt          || d	          dk    rd||<   d||<   |                                D ]\  }}	|	| |         _        t          |           D ]L\  }}|j        dk    rt          || d          }t          || d          }||fd
k    rd|_        Ed|_        Mt          |           D ]\  }}|j        r|j        dk    r|j        dk    s#|dz   }d}|t	          |           k     rZ||k    rT| |         j        dk    rd|_        n;|t	          | |         j                  z  }|dz  }|t	          |           k     r||k    TdS )zr
    Context-sensitive paragraph classification. Assumes that classify_pragraphs
    has already been called.
    r   r
   r   r3   r   T)r   r4   F)r4   r4   N)		enumerater   r   r   r   r   r   r   items)
rv   max_heading_distancer   r   jdistancenew_classesprev_neighbournext_neighbourr   s
             r=   revise_paragraph_classificationr   3  sN    "*--  9(1	! 	i&:g&E&EE#j//!!h2F&F&F!}'611'1	$JqM.///HFA #j//!!h2F&F&F K!*-- # #97**+Az4PPP+Az4PPPV##&(@(@#KNNu$$5)@)@"KNN%%*<Q
\a*b*b*bfp*p*p%%*<Q
\a*b*b*bfp*p*p#KNN"KNN!!## % %1#$
1   "*-- * *9:--+Az4PPP+Az4PPPN+~==#(I  #)I   "*-- 
 
9! 	i&:e&C&C	HZ^cHcHcE#j//!!h2F&F&F!}'611'-	$JqM.///HFA #j//!!h2F&F&F
 
r<   c           
          t          | |
|	|          } ||          }t                              |          }t          ||||||||           t	          ||           |S )u   
    Converts an HTML page into a list of classified paragraphs. Each paragraph
    is represented as instance of class ˙˙justext.paragraph.Paragraph˙˙.
    )rP   rr   rz   r   r   )	html_textr   r   r   r   r   r   r   r   rK   rJ   
enc_errorsrp   rO   rv   s                  r=   justextr   v  sx     i!18Z
H
HC
,s

C//44J
Hj+~'7F F F#J0DEEEr<   )8r:   
__future__r   r   r   r   re	lxml.htmlrE   lxml.sax	functoolsr   ImportErrorbackports.functools_lru_cachelxml.html.cleanr   xml.sax.handlerr	   r   r   _compatr   r   utilsr   MAX_LINK_DENSITY_DEFAULTLENGTH_LOW_DEFAULTLENGTH_HIGH_DEFAULTSTOPWORDS_LOW_DEFAULTSTOPWORDS_HIGH_DEFAULTNO_HEADINGS_DEFAULTMAX_HEADING_DISTANCE_DEFAULTr   r   DEFAULT_ENCODINGDEFAULT_ENC_ERRORScompile
IGNORECASErT   r   	Exceptionr6   r?   rP   rD   rp   rr   objectr}   r   r   r   r   r   r   r   r;   r<   r=   <module>r     ss    ' & & & & & A A A A A A A A A A 				     8####### 8 8 8777777778 $ # # # # # * * * * * *             % % % % % % % %               #         %2:&QSUS`aa uo) ) ) ) )9 ) ) )	 	 	 	 	L 	 	 	 (8$Oa    ( (8$Oa V V V V<# # #4B B B B B^ B B BJ    v   D 3   :L'7L-@X' '  '  '  'F  B B BN N N Fb @ @ @ @F -?'7L-@X9GZ(8%L     s   ' 55