
    %$}gD                     l   d Z 	 ddlZdZn# e$ r dZY nw xY wddlZddlZddlZddlZddlZddl	Z	ddl
mZ ddlmZmZmZ ddlmZ ddlmZ dd	lmZmZmZmZ dd
lmZ ddlmZmZmZmZmZm Z  ddl!m"Z"m#Z#m$Z$ ddl%m&Z& ddl'm(Z( ddl)m*Z* ddl+m,Z, ddl-m.Z.m/Z/m0Z0m1Z1m2Z2 ddl3m4Z4 ddl5m6Z6 ddl7m8Z8m9Z9m:Z:m;Z; ddl<m=Z= ddl>m?Z?m@Z@mAZAmBZBmCZC  ejD        eE          ZF ejG        d           ejH        ejI        z   ZJ ejK        d          ZL ejK        d          ZM ejK        d          ZNg dZOddddd ZPd!ed"eeQ         fd#ZRd$eQd"eeQ         fd%ZSd!ed"e"fd&ZTd'eQd"eUfd(ZVd)eQd*eWd"eQfd+ZXd,eQd-eQd"e eQeQf         fd.ZYd/eQd"eQfd0ZZ	 	 dPd!ed2eQd/eQd3eWd4eeQ         d"e eQeQf         fd5Z[dQd6eQd!ed3eWd"eQfd7Z\	 	 	 dRd9eeQ         d!ed2eQd3eWd4eeQ         d"dfd:Z]d;eQd"eeQddf         fd<Z^	 dPd$eQd!ed3eWd=ee8         d"df
d>Z_d6eQd!ed3eWd=ee8         d"eWf
d?Z`d@e"d!ed3eWd=e8d"e eeQ         eWf         f
dAZad!ed"eWfdBZbd@e"dCeeQ         d!ed"e"fdDZc	 	 	 dSd!edFeWd@ee"         d=ee8         d"df
dGZdd!ed"dfdHZedIeeQ         dJeWd"eWfdKZfd!ed@e"d"eWfdLZgd!ed"dfdMZh	 	 dTd6ee         d!edNeeQ         d=ee8         d"eeQ         f
dOZidS )Uz1
Functions dedicated to command-line processing.
    NTF)urlsafe_b64encode)ProcessPoolExecutorThreadPoolExecutoras_completed)datetime)partial)makedirspathstatwalk)RLock)Any	GeneratorOptionalListSetTuple)UrlStoreextract_domainget_base_url)spider   )html2txt)extract)generate_bow_hash)Responseadd_to_compressed_dictbuffered_downloadsbuffered_response_downloadsload_download_buffer)find_feed_urls)reset_caches)	ExtractorFILENAME_LENMAX_FILES_PER_DIRECTORYargs_to_extractor)sitemap_search)LANGID_FLAGURL_BLACKLIST_REGEXis_acceptable_lengthlanguage_classifiermake_chunksiY  z[^/]+$z\.[a-z]{2,5}$z<[^<]+?>)URLcrawlexploreprobefeedsitemapz.csvz.jsonz.xml)csvjsonxmlxmlteiargsreturnc                    g }| j         rt	 t          | j         dd          5 }|                    d |D                        ddd           n# 1 swxY w Y   nQ# t          $ r t	          j        d           Y n1w xY wt          D ]%}t          | |          rt          | |          g} n&|st          	                    d           t          t                              |                    S )zGRead list of URLs to process or derive one from command-line arguments.rutf-8modeencodingc              3   >   K   | ]}|                                 V  d S N)strip.0lines     U/var/www/py-google-trends/myenv/lib/python3.11/site-packages/trafilatura/cli_utils.py	<genexpr>z"load_input_urls.<locals>.<genexpr>V   s*      !E!E4$**,,!E!E!E!E!E!E    Nz+ERROR: system, file type or buffer encodingzNo input provided)
input_fileopenextendUnicodeDecodeErrorsysexitINPUT_URLS_ARGSgetattrLOGGERwarninglistdictfromkeys)r7   
input_urls	inputfileargs       rE   load_input_urlsrX   N   sb   J 	DdoC'BBB Fi!!!E!E9!E!E!EEEEF F F F F F F F F F F F F F F! 	D 	D 	DHBCCCCC	D # 	 	CtS!! %dC001
  ,*+++ j))***s4   A  AA AA AA A;:A;filenamec                 r    t          | dd          5 }d |D             }ddd           n# 1 swxY w Y   |S )zRead list of unwanted URLs.r:   r;   )r>   c                 \    h | ])}t          j        d |                                          *S ) )r)   subrA   rB   s     rE   	<setcomp>z!load_blacklist.<locals>.<setcomp>j   s.    SSS4(,R>>SSSrG   N)rI   )rY   inputfh	blacklists      rE   load_blacklistra   f   s    	hg	.	.	. T'SS7SSS	T T T T T T T T T T T T T T T s   ,00c                     t          |           }t          || j        | j        o| j         | j        | j                  S )zGRead input list of URLs to process and build a domain-aware dictionary.)r`   compression
url_filterverbose)rX   r   r`   r2   rR   rd   re   )r7   	inputlists     rE   load_input_dictrg   n   sF    %%I!.\3$)m?   rG   	directoryc                     t          j        |           rt          j        |           sI	 t          | d           n6# t          $ r) t
          j                            d| z   dz              Y dS w xY wdS )z;Check if the output directory is within reach and writable.T)exist_okz0ERROR: Destination directory cannot be created: 
F)r
   existsisdirr	   OSErrorrL   stderrwrite)rh   s    rE   check_outputdir_statusrq   {   s     ;y!! I)>)> 
	Y..... 	 	 	 JBYNQUU   55	 4s   < /A/.A/dirnamecc                     |dk    r't          t          |t          z            dz             nd}t          j        | |          S )z7Return a destination directory based on a file counter.r   r   r\   )strintr%   r
   join)rr   rs   c_dirs      rE   determine_counter_dirry      sA    9:aCA//0014555RE9We$$$rG   destdir	extensionc                     d}|t          j        |          r_d                    d t          t                    D                       }t          j        | ||z             }|Kt          j        |          _||fS )zCFind a writable path and return it along with its random file name.Nr\   c              3   H   K   | ]}t          j        t                    V  d S r@   )randomchoice
CHAR_CLASS)rC   _s     rE   rF   z$get_writable_path.<locals>.<genexpr>   s,      RR6=44RRRRRRrG   )r
   rl   rw   ranger$   )rz   r{   output_pathrY   s       rE   get_writable_pathr      s~    K

[!9!9
77RReL>Q>QRRRRRiI)=>> 
[!9!9
   rG   contentc                     t          t          t                              d|           d                                                    S )zaCreate a filename-safe string by hashing the given content
    after deleting potential XML tags.r\      )r   r   	CLEAN_XMLr]   decode)r   s    rE   generate_hash_filenamer      s7     .y}}R/I/I2NNOOVVXXXrG   orig_filenamecounternew_filenamec                 v   t                               | j        d          }| j        rQt                              d|          }t          j        | j        |          }t                              d|          }n&t          | j        |          }|pt          |          }t          j        |||z             }	|	|fS )zPPick a directory based on selected options and a file name based on output type.z.txtr\   )EXTENSION_MAPPINGgetoutput_format	keep_dirs	STRIP_DIRr]   r
   rw   
output_dirSTRIP_EXTENSIONry   r   )
r7   r   r   r   r   r{   original_dirdestination_dirrY   r   s
             rE   determine_output_pathr      s     "%%d&8&AAI~ 	C }}R77)DO\BB"&&r=99/IIB#9'#B#B)OX	-ABBK''rG   
htmlstringc                 0   t          |j        |          }t          |d          \  }}t          |          du r\t          rUt          j        |d          5 }|                    |                     d                     ddd           n# 1 swxY w Y   |S )z-Write a copy of raw HTML in backup directory.z.html.gzTwbr;   N)	ry   
backup_dirr   rq   HAS_GZIPgziprI   rp   encode)r   r7   r   destination_directoryr   rY   
outputfiles          rE   archive_htmlr      s    1$/7KK-.CZPPK344<<<Y{D)) 	9ZZ..w77888	9 	9 	9 	9 	9 	9 	9 	9 	9 	9 	9 	9 	9 	9 	9Os   )BBBr\   resultc                 6   | dS |j         $t          j                            | dz              dS t	          ||| ||          \  }}t          |          du rBt          |dd          5 }|                    |            ddd           dS # 1 swxY w Y   dS dS )z-Deal with result (write to STDOUT or to file)Nrk   Twr;   r<   )r   rL   stdoutrp   r   rq   rI   )r   r7   r   r   r   destination_pathr   r   s           rE   write_resultr      s     ~
$''''',A-,-
 -
)/ "/22d::&S7CCC )z  ((() ) ) ) ) ) ) ) ) ) ) ) ) ) ) ) ) ) ;:s   )BBBinputdirc              #   n   K   t          |           D ]"\  }}}|D ]}t          j        ||          V  #dS )z2Walk the directory tree and output all file names.N)r   r
   rw   )r   rootr   
inputfilesfnames        rE   generate_filelistr      s[      #H~~ ) )a 	) 	)E)D%((((((	)) )rG   optionsc                    |st          |          }| |_        t          | d          5 }|                                }ddd           n# 1 swxY w Y   t	          |           }t          |j        |j                  }t          j	        |          
                    d          |j        d<   t          |||          }t          ||| |d           dS )z1Aggregated functions to process a file in a list.rbNz%Y-%m-%dmax_dater   )r   )r&   sourcerI   readr   minst_ctimest_mtimer   fromtimestampstrftimedate_paramsexaminer   )	rY   r7   r   r   inputfr   	file_statref_timestampr   s	            rE   file_processingr      s     *#D))GN	h		 #[[]]
# # # # # # # # # # # # # # # XI	*I,>??M&.&<]&K&K&T&T' 'G
# Zw777FxtDDDDDDs   A

AAc                     |j         rt          | ||          nd}t          | ||          }t          |||||           |dk    r|r|dz  }|S )zVExtract text and metadata from a download webpage and eventually write out the result.r\   r   )r   r   r   r   r   )r   r   r   r   )r   r7   r   r   fileslugr   s         rE   process_resultr      st    
 ;?/Q|Jg666rHZw777FHgH    !|||1NrG   	url_storec                    g }|j                             dd          }| j        st          | |          \  }} t	          ||j        |          D ]f\  }}|r/t          |t                    r||_        t          ||||          }6t                              d|           |                    |           g| j        ||fS )z?Implement a download queue consumer, single- or multi-threaded.DEFAULT
SLEEP_TIMEr   zNo result for URL: %s)configgetfloatdoner    r   parallel
isinstanceru   urlr   rP   rQ   append)	r   r7   r   r   errors
sleep_time
bufferlistr   r   s	            rE   download_queue_processingr     s     F((LAAJn # 4Y
 K K
I-w
 
 
 		# 		#KC  #*VS11 #!(wHH6<<<c"""" n # 7?rG   c           	        	
 t          |           }|                                }| j        r|                                 t	          |           }t          | j        rt          nt          | j	        |j
                            dd          |j
                            dd                    
t                      }t          | j                  5 		
fd|D             }t!          |          D ]}|                                |                    |                                           | j        rzt'          |                                          | j        k    rP|5  |                                 |                                 t-                       ddd           n# 1 swxY w Y   	 ddd           n# 1 swxY w Y   t/          | |          }| j        r#t3          |||           }t5          | ||           |S )	z/Group CLI functions dedicated to URL discovery.r   EXTERNAL_URLSr   )target_langexternalr   max_workersc              3   D   K   | ]}                     |          V  d S r@   )submit)rC   r   executorfuncs     rE   rF   z cli_discovery.<locals>.<genexpr>5  s1      DD#8??4--DDDDDDrG   N)r   r   )rg   	dump_urlsrR   resetr&   r   r1   r!   r'   target_languager   
getbooleanr   r   r   r   r   r   add_urlslenget_known_domainsprint_unvisited_urlsr"   url_processing_pipeliner/   build_exploration_dictcli_crawler)r7   r   rU   r   lockfuturesfuture	exit_codecontrol_dictr   r   s            @@rE   cli_discoveryr   #  sl   %%I$$&&Jy %%G)7(**9oFF>**9lCC	  D 77D 
	6	6	6 '(DDDDDDDD #7++ 	' 	'F}}*""6==??3339 'Y%@%@%B%B!C!Ct}!T!T ' '!66888!)))$' ' ' ' ' ' ' ' ' ' ' ' ' ' '	'	' ' ' ' ' ' ' ' ' ' ' ' ' ' ' (i88I | C-iTJJDL'BBBBs7   BF;7F"F;"F&&F;)F&*F;;F?F?rU   c                     d |D             }|d |                                  D             z
  fd|D             }t          ||j        |j        |j                  S )zMFind domains for which nothing has been found and add info to the crawl dict.c                 ,    h | ]}t          |          S  r   rC   us     rE   r^   z)build_exploration_dict.<locals>.<setcomp>R  s     ;;;1^A&&;;;rG   c                 ,    h | ]}t          |          S r   r   r   s     rE   r^   z)build_exploration_dict.<locals>.<setcomp>S  s-     & & &q& & &rG   c                 6    g | ]}t          |          v |S r   r   )rC   r   still_to_crawls     rE   
<listcomp>z*build_exploration_dict.<locals>.<listcomp>V  s+    SSSA~a/@/@N/R/Ra/R/R/RrG   )r`   rd   re   )r   r   r`   rd   re   )r   rU   r7   input_domainsnew_input_urlsr   s        @rE   r   r   N  s     <;
;;;M" & &#,#>#>#@#@& & & N TSSSSSSN!.?	   rG      nc                     |pt          |           }|j                            dd          }i }|-t          j                            t          |                      n|t          _        t          j                                        D ]_}t          j        j        |         j	        rAt          j        
                    |d          }|rt          j        || j                  ||<   `t          j        j        st          t          j        |          \  }t          _        t          || j        |          D ]D\  }	}
|
r=t#          |
t$                    r(t          j        |
|t)          |	                              Et+          fdt          j                                        D                       rnt          j        j        t/          d	                    d
 t          j                                        D                                  dS )z~Start a focused crawler which downloads a fixed number of URLs within a website
    and prints the links found in the process.r   r   NF)
as_visited)langr   c              3   $   K   | ]
}|k    V  d S r@   r   )rC   rs   r   s     rE   rF   zcli_crawler.<locals>.<genexpr>  s'      AA!qAvAAAAAArG   rk   c              3      K   | ]}|V  d S r@   r   r   s     rE   rF   zcli_crawler.<locals>.<genexpr>  s"      <<!A<<<<<<rG   )r&   r   r   r   	URL_STOREr   rX   r   urldicttuplesget_url
init_crawlr   r   r    r   r   r   r   process_responser   anyget_all_countsprintrw   r   )r7   r   r   r   r   
param_dicthostname	startpager   r   r   s    `         rE   r   r   _  s    0*400G((LAAJJ !!/$"7"78888$ $6688  #H-4 	(00e0LLI '-'8D$8( ( (
8$ # ';j(
 (
$
F$ 7w
 
 
 	O 	OKC  O*VX66 O'
<;L;L0MNNNAAAAv/>>@@AAAAA 	 #  
$))<<v/99;;<<<
<
<=====rG   c                 t   t          |           }t          |           }t          || j        |          D ]\  }}|zt	          |          }|rit          |          |j        k    rQt          d |D                       r8t          r | j	        rt          |d          | j	        k    rt          |d           dS )zBProbe websites for extractable content and print the fitting ones.r   Nc              3   >   K   | ]}|                                 V  d S r@   )isalpha)rC   rs   s     rE   rF   z!probe_homepage.<locals>.<genexpr>  s*      44		444444rG   r\   T)flush)rX   r&   r   r   r   r   min_extracted_sizer  r(   r   r+   r
  )r7   rU   r   r   r   s        rE   probe_homepager    s     &&J%%G)DM7   + +V f%%F
+KK'"<<<44V44444 = $+/+ +6266$:NNN#T****+ +rG   r   totalc                 R    |dk    rt          |           |z  nd}|dk    rdS | rdS dS )zvCompute exit code based on the number of errors:
    0 if there are no errors, 126 if there are too many, 1 otherwise.r   gGz?~   r   )r   )r   r  ratios      rE   _define_exit_coder    sA     $)199CKK%!Et||s q1rG   c                    | j         r|                                 dS t          |           }|                                }|t          k    rdnd}t          || ||          \  }}t                              dt          |          |           | j	        du rt                      }|                    d |D                        t          |                    d                    dk    rmt          || ||          \  }}t                              dt          |          t          |                     t          ||                                          S t          ||          S )	zKAggregated functions to show a list and download and process an input list.Fr   r   z%s / %s URLs could not be foundTc                     g | ]}d |z   S )zhttps://web.archive.org/web/20/r   )rC   es     rE   r   z+url_processing_pipeline.<locals>.<listcomp>  s    RRRa=ARRRrG   zhttps://web.archive.orgz-%s archived URLs out of %s could not be found)rR   r   r&   total_url_numberr%   r   rP   debugr   archivedr   r   find_known_urlsr  )r7   r   r   	url_countr   r   archived_errorsr   s           rE   r   r     s`   y &&(((u%%G**,,I666aaBG 0	4'RROFG
LL2CKKKKK}JJ	RR6RRRSSSy(()BCCDDqHH!:4'" "OQ LL?O$$F   %_i6P6P6R6RSSSVY///rG   c                    d}t          |           }|j                            dd          }t          | j                  5 }t          t          | j                  t                    D ]k}|dk     rt          |          t          k    rd}t          t          | ||          }|                    ||d|           |dk    r|t          |          z  }l	 d	d	d	           d	S # 1 swxY w Y   d	S )
zGDefine batches for parallel file processing and perform the extraction.r   r   EXTRACTION_TIMEOUTr   r   )r7   r   r   
   )	chunksizetimeoutN)r&   r   getintr   r   r,   r   	input_dirr%   r   r   r   map)r7   filecounterr   r&  r   	filebatchworkers          rE   file_processing_pipeliner-    sF   K%%Gn##I/CDDG 
	7	7	7 .8$dn--/F
 
 	. 	.I Q3y>>5L#L#LdK  F LLb'LJJJas9~~-	.. . . . . . . . . . . . . . . . . .s   BC$$C(+C(r   c                    d}|st          ||          }|  t          j                            d           nt	          t          |           |          s t          j                            d           nn	 t          | |          }n[# t          $ rN}t          j                            dt          |           dt          j
                     d           Y d}~nd}~ww xY w|S )z;Generic safeguards and triggers around extraction function.NzERROR: empty document
zERROR: file size
r   zERROR: rk   )r&   rL   ro   rp   r*   r   r   	Exceptionru   	traceback
format_exc)r   r7   r   r   r   errs         rE   r   r     s     F /#D#..
23333!#j//7;; O
-....	OZ999FF 	O 	O 	OJMs3xxMM93G3I3IMMMNNNNNNNN	OMs   5B 
CACC)r   N)r   )r\   r   N)r   NN)NN)j__doc__r   r   ImportErrorloggingr~   restringrL   r0  base64r   concurrent.futuresr   r   r   r   	functoolsr   osr	   r
   r   r   	threadingr   typingr   r   r   r   r   r   courlanr   r   r   trafilaturar   baseliner   corer   deduplicationr   	downloadsr   r   r   r   r    feedsr!   metar"   settingsr#   r$   r%   r&   sitemapsr'   utilsr(   r)   r*   r+   r,   	getLogger__name__rP   seedascii_lettersdigitsr   compiler   r   r   rN   r   ru   rX   ra   rg   boolrq   rv   ry   r   r   r   r   r   r   r   r   r   r   r   r   r  r  r   r-  r   r   rG   rE   <module>rP     s   KKKHH   HHH   				  



     $ $ $ $ $ $ T T T T T T T T T T             ) ) ) ) ) ) ) ) ) ) ) )       = = = = = = = = = = = = = = = = : : : : : : : : : :                   , , , , , ,              " ! ! ! ! !                  % $ $ $ $ $              
	8	$	$ C   !FM1
BJy!!	"*-..BJ{##	III 	  +# +$s) + + + +0S SX    
# 
( 
 
 
 
c d    $%3 %3 %3 % % % %!s !s !uS#X ! ! ! !YC YC Y Y Y Y "&( (
(( ( 	(
 3-( 38_( ( ( (4	 	S 	 	c 	3 	 	 	 	 "&) )SM)
) ) 	)
 3-) 
) ) ) ),) )	#tT/(B ) ) ) ) QUE EEE'*E:B9:ME	E E E E*),7?	7J   ""-0;D
49c>   .( ( ( ( ( (V%)#Y69   & $(#'	-> ->
->
-> !-> i 	->
 
-> -> -> ->`+ + + + + +.	d3i 	 	 	 	 	 	0# 0( 0s 0 0 0 0@.3 .4 . . . .4 #'	 
 
# i 	
 c]     s    