
    $$}g81                         d dl mZ d dl mZmZmZ d dlZd dlZd dlZd dlZddl	m
Z
mZmZmZ ddlT ddlmZmZ d Zej        d	fd
Zej        fdZej        fdZd Zedk    r e             dS dS )    )absolute_import)divisionprint_functionunicode_literalsN   )escapePY3URLErrorurllib)*)get_stoplistget_stoplistsc                     dt           j                            t           j                            t          j        d                             t
          t          t          t          t          t          t          t          d	z  S )Na  Usage: %(progname)s -s STOPLIST [OPTIONS] [HTML_FILE]
Convert HTML to plain text and remove boilerplate.

  -o OUTPUT_FILE   if not specified, output is written to stdout
  --encoding=...   default character encoding to be used if not specified
                   in the HTML meta tags (default: %(default_encoding)s)
  --enc-force      force specified encoding, ignore HTML meta tags
  --enc-errors=... errors handling for character encoding conversion:
                     strict: fail on error
                     ignore: ignore characters which can't be converted
                     replace: replace characters which can't be converted
                              with U+FFFD unicode replacement characters
                   (default: %(default_enc_errors)s)
  --format=...     output format; possible values:
                     default: one paragraph per line, each preceded with
                              <p> or <h> (headings)
                     boilerplate: same as default, except for boilerplate
                                  paragraphs are included, too, preceded
                                  with <b>
                     detailed: one paragraph per line, each preceded with
                               <p> tag containing detailed information
                               about classification as attributes
                     krdwrd: KrdWrd compatible format
  --no-headings    disable special handling of headings
  --list-stoplists print a list of inbuilt stoplists and exit
  -V, --version    print version information and exit
  -h, --help       display this help and exit

If no HTML_FILE specified, input is read from stdin.

STOPLIST must be one of the following:
  - one of the inbuilt stoplists; see:
      %(progname)s --list-stoplists
  - path to a file with the most frequent words for given language,
    one per line, in UTF-8 encoding
  - None - this activates a language-independent mode

Advanced options:
  --length-low=INT (default %(length_low)i)
  --length-high=INT (default %(length_high)i)
  --stopwords-low=FLOAT (default %(stopwords_low)f)
  --stopwords-high=FLOAT (default %(stopwords_high)f)
  --max-link-density=FLOAT (default %(max_link_density)f)
  --max-heading-distance=INT (default %(max_heading_distance)i)
r   )	progname
length_lowlength_highstopwords_lowstopwords_highmax_link_densitymax_heading_distancedefault_encodingdefault_enc_errors)ospathbasenamesysargvLENGTH_LOW_DEFAULTLENGTH_HIGH_DEFAULTSTOPWORDS_LOW_DEFAULTSTOPWORDS_HIGH_DEFAULTMAX_LINK_DENSITY_DEFAULTMAX_HEADING_DISTANCE_DEFAULTDEFAULT_ENCODINGDEFAULT_ENC_ERRORS     P/var/www/py-google-trends/myenv/lib/python3.11/site-packages/justext/__main__.pyusager)      s`    ,Z   !1!1#(1+!>!>??$&*,08(,
 
Y6 6r'   Tc           
          | D ]J}|j         dk    r|j        rd}nd}n|rd}t          d|dt          |j        d          |	           Kd
S )z
    Outputs the paragraphs as:
    <tag> text of the first paragraph
    <tag> text of the second paragraph
    ...
    where <tag> is <p>, <h> or <b> which indicates
    standard paragraph, heading or boilerplate respecitvely.
    goodhpb<z> FquotefileN)
class_typeheadingprintr   text)
paragraphsfpno_boilerplate	paragraphtags        r(   output_defaultr=   J   s       O O	6))   	C333y~U C C C CD2NNNNNO Or'   c           
          | D ]R}d|j         |j        t          |j                  |j        t          |j        d          fz  }t          ||           SdS )z
    Same as output_default, but only <p> tags are used and the following
    attributes are added: class, cfclass and heading.
    z6<p class="%s" cfclass="%s" heading="%i" xpath="%s"> %sFr0   r2   N)r4   cf_classintr5   xpathr   r7   r6   )r8   r9   r;   outputs       r(   output_detailedrC   a   sv    
    	I 	!""O9>///M
 
 	f2 r'   c                     | D ]L}|j         dv r|j        rd}nd}nd}|j        D ]*}t          d||                                fz  |           +MdS )a  
    Outputs the paragraphs in a KrdWrd compatible format:
    class<TAB>first text node
    class<TAB>second text node
    ...
    where class is 1, 2 or 3 which means
    boilerplate, undecided or good respectively. Headings are output as
    undecided.
    )r+   neargood      r   z%i	%sr2   N)r4   r5   
text_nodesr6   strip)r8   r9   r;   cls	text_nodes        r(   output_krdwrdrL   q   s       
@ 
@	#777  C"- 	@ 	@I(c9??#4#455B?????	@
@ 
@r'   c                     dd l } ddlm} 	 |                      t          j        dd          dg d          \  }}nm# | j        $ r`}t          |t          j                   t          t                      t          j                   t          j	        d           Y d }~nd }~ww xY wt          j        d          d         }t          j        }t          r |t          j        j                  }n |t          j                  }d }d	}	d
}
t           }t"          }t$          }t&          }t(          }t*          }d }t,          }d
}t.          }	 |D ]\  }}|dv r/t          t                                 t          j	        d           |dv rWt          t0          j                            t          j        d                   d|d           t          j	        d           |dk    rPt          d                    t9          t;                                                     t          j	        d           |dk    r@	 t          j        |dd          }	# t>          $ r}tA          d|d|          d }~ww xY w|dk    r\|!                                dk    rtE                      }_t0          j        #                    |          r	 t          j        |dd          }tE          d |D                       }|$                                 # t>          $ r}tA          d|d|          d }~wtJ          $ r}tA          d|z            d }~ww xY w|t;                      v rtM          |          }+tO          j(        d|          rAtA          d|dd                    t9          t;                                                    tA          d|z            |dk    r:	 |}d )                    |           # tT          $ r tA          d!|z            w xY w|d"k    rd#}|d$k    r>|!                                d%v r|!                                }tA          d&|z            |d'k    r|d(v r|}	/tA          d)|z            |d*k    rd#}
K|d+k    r6	 tW          |          }c# tX          $ r tA          d,|d-|d.          w xY w|d/k    r6	 tW          |          }# tX          $ r tA          d,|d-|d.          w xY w|d0k    r6	 t[          |          }# tX          $ r tA          d,|d-|d1          w xY w|d2k    r6	 t[          |          }# tX          $ r tA          d,|d-|d1          w xY w|d3k    r6	 t[          |          }S# tX          $ r tA          d,|d-|d1          w xY w|d4k    r6	 tW          |          }# tX          $ r tA          d,|d-|d.          w xY w|r|}|tA          d5          |sd}d}|r	 tO          j(        d6|d                   rt]          j/        |d                   }nt=          |d         d          }n4# t>          t`          f$ r }tA          d|d         d|          d }~ww xY w|1                                }|t          j        ur|$                                 t          |||||||||
|||          }|	d	k    rte          ||           d S |	d7k    rte          ||d
8           d S |	d9k    rtg          ||           d S |	d:k    rti          ||           d S tk          d;|	z            # tl          $ rl}t          t0          j                            t          j        d                   d<|t          j                   t          j	        d           Y d }~d S d }~ww xY w)=Nr   )__version__r   zo:s:hV)z	encoding=z	enc-forcezenc-errors=zformat=zno-headingshelpversionzlength-low=zlength-high=zstopwords-low=zstopwords-high=zmax-link-density=zmax-heading-distance=zlist-stoplistsr2   utf8defaultF)z-hz--help)z-Vz	--versionz: jusText vz<

Copyright (c) 2011 Jan Pomikalek <jan.pomikalek@gmail.com>z--list-stoplists
z-owzCan't open z for writing: z-snonerc                 6    g | ]}|                                 S r&   )rI   ).0ls     r(   
<listcomp>zmain.<locals>.<listcomp>   s     +K+K+K!AGGII+K+K+Kr'   z for reading: zLUnicode decoding error when reading the stoplist (probably not in UTF-8): %sz^\w*$zUnknown stoplist: z
Available stoplists:
zFile not found: %sz
--encoding zUknown character encoding: %sz--enc-forceTz--enc-errors)strictignorereplacezInvalid --enc-errors value: %sz--format)rS   boilerplatedetailedkrdwrdzUknown output format: %sz--no-headingsz--length-lowzInvalid value for z: 'z'. Integer expected.z--length-highz--stopwords-lowz'. Float expected.z--stopwords-highz--max-link-densityz--max-heading-distancezNo stoplist specified.z	[^:/]+://r`   )r:   ra   rb   zUnknown format: %sz: )7getoptjustextrN   r   r   GetoptErrorr6   stderrr)   exitcodecslookupstdinr	   stdoutbufferr   r   r    r!   r"   r#   r$   r%   r   r   r   joinsortedr   openIOErrorJustextInvalidOptionslowersetisfilecloseUnicodeDecodeErrorr   rematchencodeLookupErrorr@   
ValueErrorfloatr   urlopenr
   readr=   rC   rL   AssertionErrorJustextError)rc   VERSIONoptsargsestream_writerfp_infp_outstoplistformatno_headingsr   r   r   r   r   r   encodingr   force_default_encoding
enc_errorsoafp_stoplist	html_textr8   s                             r(   mainr      s	   MMM......	]]38ABB< <7 <7 <7 8 8
dd
    acj!!!!eggCJ''''
 M&))"-MIE
 +sz011sz**HFK#J%K)M+N/7H'"#JQ d	R d	RDAq$$$egg'''G$$SXa[1111777< = = =(((dii} 7 788999dB#[C88FF B B B//;<11aa@B B BB d7799&&"uuHHw~~a(( R
P*0+af*E*EK'*+K+K{+K+K+K'L'LH'--////& J J J"7"7CD11aa H#J #J J1 P P P"7!KMN!O#P #P PP moo--#/??8Ha00 R #8"7$%AAtyy1H1H'I'I'I!K#L #L L
 #88Lq8P"Q"QQl""U'($II.////" U U U/0ORS0STTTUm##)-&&n$$7799 ???!"JJ/0PST0TUUUjHHHFF/0JQ0NOOOo%%"n$$R!$QJJ! R R R//KL11aaaPR R RR o%%R"%a&&KK! R R R//KL11aaaPR R RR '''P$)!HHMM! P P P//IJAAANP P PP (((P%*1XXNN! P P P//IJAAANP P PP ***P',Qxx$$! P P P//IJAAANP P PP ...R+.q66((! R R R//KL11aaaPR R RR / " 	('H'(@AAA 	NM 	DD8L$q'22 /"N4733EE a#..EX& D D D++7;AwwwBD D DD JJLL		!!KKMMMY*k>+;=Q#3ZA A
 Y:v.....}$$:veDDDDDDz!!J/////x*f----- !!5!>???   "'**38A;7777;#*MMMMsn  .; 
B%AB  B%C2]" ;I]" 
I7I22I77A]" 	AL]" 
ML--M:MMB]" $O=;]" =PB]" R-+]" -!S	]" S)']" )!T

	]" T%#]" %!U	]" U!]" !!V	]" V]" !V>>	]" W]" !W::"]" AY* )]" *Z;ZZA)]" ]"  ]" 8]" ]" "
_,A!____main__)
__future__r   r   r   r   rh   r   rw   r   _compatr   r	   r
   r   coreutilsr   r   r)   rk   r=   rC   rL   r   __name__r&   r'   r(   <module>r      sE   ' & & & & & A A A A A A A A A A  				 				 



 2 2 2 2 2 2 2 2 2 2 2 2     . . . . . . . .7 7 7t #&*T O O O O. $':      "% @ @ @ @.t t tn zDFFFFF r'   