
    %$}gC                        d Z ddlZddlZddlZddlmZmZ ddlmZ ddl	m
Z
 ddlmZ ddlmZ ddlmZ dd	lmZmZmZmZmZmZmZmZmZ ddlZddlZdd
lmZ ddlm Z  ddl!m"Z"m#Z# ddl$m%Z%m&Z&m'Z'm(Z( 	 ddl)m*Z* ej+        ,                    d          Z-n# e.$ r dZ-Y nw xY w	 ddl/Z/ e/j0                    Z1e12                    e/j3        e/j4                   e12                    e/j3        e/j5                   dZ6n# e.$ r dZ6Y nw xY w ej7        e8          Z9 ej:        ej;        j<                   da=da>da?dedeej@        ef         fdZAejB        C                    d          ZDd ed          z   dz   ZEeEeDd<   g dZFh dZG G d d          ZHdedeeeeI                  eeI         f         fd ZJ	 dGded!eeeIeIf                  deeIeIf         fd"ZKdedejB        jL        fd#ZM	 dHded$eNdeej@        ef         fd%ZOd&eId$eNd'eNdedeeH         f
d(ZPd&eId)eHd*e#deNfd+ZQd&eId)eHd,eNd*e#deeeHeIf                  f
d-ZRde"dfd&eId$eNded*ee#         deeI         f
d.ZSddde"d/d&eId,eNd$eNd'eNdedeeH         fd0ZTd&eIdeNfd1ZUd&eIdeNfd2ZVd&eIdeNfd3ZW	 	 	 	 	 dId4eeI         d5eeeI                  d6eeI         d7ee         d8eNd9eNdefd:ZX	 dJd7ed<eYdeeeI         ef         fd=ZZ	 dKd?eeI         d@e[dAeeIgef         dBe[deeeIef         ddf         f
dCZ\	 dGd?eeI         d@e[d*ee#         deeeIeIf         ddf         fdDZ]	 dGd?eeI         d@e[d*ee#         deeeIeHf         ddf         fdEZ^d&eId$eNd'eNdedeeH         f
dFZ_dS )LzG
All functions needed to steer and execute downloads of web documents.
    N)ThreadPoolExecutoras_completed)ConfigParser)partial)version)BytesIO)sleep)	AnyCallableDict	GeneratorListOptionalSetTupleUnion)UrlStore)redirection_test   )DEFAULT_CONFIG	Extractor)URL_BLACKLIST_REGEXdecode_fileis_acceptable_lengthmake_chunks)SOCKSProxyManager
http_proxyTFargsreturnc                  x    t           rt          nt          j        }t           r	dt           ini }d|d<    |di || S )zCConfigure urllib3 download pool according to user-defined settings.	proxy_url2   	num_pools )	PROXY_URLr   urllib3PoolManager)r   manager_classmanager_argss      U/var/www/py-google-trends/myenv/lib/python3.11/site-packages/trafilatura/downloads.pycreate_poolr+   C   sN    )2K%%8KM/8@K++bL "L=00<04000    )accept_encodingztrafilatura/trafilaturaz( (+https://github.com/adbar/trafilatura)
User-Agent)i  i  i  i  i  i  i  i  i	  i
  i  i  i  i  i  i  iV  >   #   6   :   ;   <   @   B   M   R   S   [   c                       e Zd ZdZg dZdedededdfdZde	fd	Z
defd
Zdeeef         ddfdZde	ddfdZdeeef         fdZdS )Responsez5Store information gathered in a HTTP response object.dataheadershtmlstatusurlr>   rA   rB   r   Nc                 L    || _         d | _        d | _        || _        || _        d S Nr=   )selfr>   rA   rB   s       r*   __init__zResponse.__init__l   s)    	15#'	r,   c                     | j         d uS rD   )r>   rE   s    r*   __bool__zResponse.__bool__s   s    y$$r,   c                 8    | j         pt          | j                  S rD   )r@   r   r>   rH   s    r*   __repr__zResponse.__repr__v   s    y2K	222r,   
headerdictc                 L    d |                                 D             | _        dS )z#Store response headers if required.c                 >    i | ]\  }}|                                 |S r$   )lower).0kvs      r*   
<dictcomp>z*Response.store_headers.<locals>.<dictcomp>|   s&    DDDA		1DDDr,   N)itemsr?   )rE   rL   s     r*   store_headerszResponse.store_headersy   s*     ED1A1A1C1CDDDr,   decodec                 R    |r"| j         rt          | j                   | _        dS dS dS )z9Decode the bytestring in data and store a string in html.N)r>   r   r@   )rE   rV   s     r*   decode_datazResponse.decode_data~   s<     	/di 	/#DI..DIII	/ 	/ 	/ 	/r,   c                 *      fd j         D             S )z,Convert the response object to a dictionary.c                 2    i | ]}|t          |          S r$   )getattr)rP   attrrE   s     r*   rS   z$Response.as_dict.<locals>.<dictcomp>   s%    EEEdgdD))EEEr,   )	__slots__rH   s   `r*   as_dictzResponse.as_dict   s    EEEEdnEEEEr,   )__name__
__module____qualname____doc__r]   bytesintstrrF   boolrI   rK   r   rU   rX   r^   r$   r,   r*   r<   r<   h   s       ;;<<<IU C c d    %$ % % % %3# 3 3 3 3ES#X E4 E E E E
/$ /4 / / / /
Fc3h F F F F F Fr,   r<   configc                     |                      ddd                                          }|r|                                nd}|                      dd          pd}||fS )zARead and extract HTTP header strings from the configuration file.DEFAULTUSER_AGENTS )fallbackNCOOKIE)getstrip
splitlines)rg   myagents
agent_listmycookies       r*   _parse_configrt      sg     zz)]Rz@@FFHHH*2<$$&&&J zz)X..6$Hxr,   r?   c                     | t           k    r4t          |           \  }}i }|rt          j        |          |d<   |r||d<   |pt          S )z1Internal function to decide on user-agent string.r/   Cookie)r   rt   randomchoiceDEFAULT_HEADERS)rg   r?   rq   rs   s       r*   _determine_headersrz      s_     *622( 	<$*M($;$;GL! 	) (GH%o%r,   c           	          t           sht          j                            |                     dd          |                     dd          d|                     dd          dz  t
                    a t           S )z5Define a retry strategy according to the config file.ri   MAX_REDIRECTSr   DOWNLOAD_TIMEOUT   )totalredirectconnectbackoff_factorstatus_forcelist)RETRY_STRATEGYr&   utilRetrygetintFORCE_STATUSrg   s    r*   _get_retry_strategyr      sv      
 ++--	?;;]]?  !==4FGG!K) , 	
 	
 r,   no_sslc                     |rt           nt          }|sFt          |                     dd          |rdnt	          j                    |rdnd          }|r|a n|a|S )zXCreate a urllib3 pool manager according to options in the config file and HTTPS setting.ri   r}   N	CERT_NONECERT_REQUIRED)timeoutca_certs	cert_reqs)NO_CERT_POOL	HTTP_POOLr+   r   certifiwhere)rg   r   pools      r*   _initiate_poolr      sy    
 "0<<yD MM)-?@@#8TT%+@kk
 
 
  	LLIKr,   rB   with_headersc                 ,   	 t          ||          }|                    d| t          |          t          |          d          }t	                      }|                    d          D ]M}|                    |           t          |          |                    dd          k    rt          d          N|
                                 t          t          |          |j        |                                          }|r|                    |j                   |S # t"          j        j        $ r0 t(                              d	|            t-          | d
||          cY S t.          $ r&}	t(                              d| |	           Y d}	~	nd}	~	ww xY wdS )zPInternal function to robustly send a request (SSL or not) and return its result.)r   GETF)r?   retriespreload_contenti   ri   MAX_FILE_SIZEzMAX_FILE_SIZE exceededzretrying after SSLError: %sTzdownload error: %s %sN)r   requestrz   r   	bytearraystreamextendlenr   
ValueErrorrelease_connr<   rc   rA   geturlrU   r?   r&   
exceptionsSSLErrorLOGGERwarning_send_urllib_request	Exceptionerror)
rB   r   r   rg   pool_managerresponser>   chunkresperrs
             r*   r   r      s   8%fV<<<  ''&v..'//! ( 
 
 {{__U++ 	; 	;EKK4yy6==ODDDD !9::: E dX_hoo6G6GHH 	1x/000& E E E4c:::#C|VDDDDD 8 8 8,c3777777778 4s   DD AF"	F+FFr   optionsc                     t          |j        p|j        pd          }|j        dk    r#t                              d|j        |            dS t          ||          sdS dS )z2Check if the response conforms to formal criteria.rk      z!not a 200 response: %s for URL %sFT)r   r@   r>   rA   r   r   r   )rB   r   r   lentests       r*   _is_suitable_responser      sd    (-68=6B77G#8(/3OOOu11 u4r,   rV   c                 >    t          | ||          r|r|j        n|S dS )z:Internal function to run safety checks on response result.N)r   r@   )rB   r   rV   r   s       r*   _handle_responser      s.     S(G44 5 &4x}}H44r,   c                     |r|j         n|}t          | d||          }|r1|j        r*|st          |          }t	          | ||          r|j        S dS )a  Downloads a web page and seamlessly decodes the response.

    Args:
        url: URL of the page to fetch.
        no_ssl: Do not try to establish a secure connection (to prevent SSLError).
        config: Pass configuration values for output control.
        options: Extraction options (supersedes config).

    Returns:
        Unicode string or None in case of failed downloads and invalid results.

    T)rV   r   rg   r   N)rg   fetch_responser>   r   r   r@   )rB   r   rg   r   r   s        r*   	fetch_urlr     st    $  '2W^^FFc$vfMMMH !HM ! 	/v...G h88 	!= 4r,   )rV   r   r   rg   c                    t           st          nt          }t                              d|             || |||          }|st                              d|            dS |                    |           |S )a  Downloads a web page and returns a full response object.

    Args:
        url: URL of the page to fetch.
        decode: Use html attribute to decode the data (boolean).
        no_ssl: Don't try to establish a secure connection (to prevent SSLError).
        with_headers: Keep track of the response headers.
        config: Pass configuration values for output control.

    Returns:
        Response object or None in case of failed downloads and invalid results.

    zsending request: %szrequest failed: %sN)
HAS_PYCURLr   _send_pycurl_requestr   debugrX   )rB   rV   r   r   rg   dl_functionr   s          r*   r   r   #  sz    * /9R&&>RK
LL&,,,{3f==H )3///t   Or,   c                    d}t          j                    }|                    t           j        |                     d                     |                    t           j        d           |                    t           j        d           |                    t           j        d           |                    |j        d           t          r%|                    t           j
        t                     	 |                                 |                    |j                  dk     }n:# t           j        $ r(}t                              d| |           d}Y d}~nd}~ww xY w|                                 |S )	z+Send a basic HTTP HEAD request with pycurl.Futf-8
   r   Ti  zpycurl HEAD error: %s %sN)pycurlCurlsetoptURLencodeCONNECTTIMEOUTSSL_VERIFYPEERSSL_VERIFYHOSTNOBODYr%   	PRE_PROXYperformgetinfoRESPONSE_CODEr   r   r   close)rB   page_existscurlr   s       r*   _pycurl_is_live_pager   B  s7   K;==DKK
CJJw//000KK%r***KK%q)))KK%q)))KKT""" 1F$i000ll4#566<<   /c::: 	JJLLLs   12D$ $E3EEc                     	 t          |           }n4# t          $ r'}t                              d| |           Y d}~dS d}~ww xY wdS )zGUse courlan redirection test (based on urllib3) to send a HEAD request.zurllib3 HEAD error: %s %sNFT)r   r   r   r   )rB   _r   s      r*   _urllib3_is_live_pager   ^  s`    S!!   0#s;;;uuuuu 4s    
A>Ac                 T    t           rt          |           nd}|pt          |           S )zCSend a HTTP HEAD request without taking anything else into account.F)r   r   r   )rB   results     r*   is_live_pager   h  s.    *4?!#&&&%F/*3///r,   	inputlist	blacklist
url_filter	url_storecompressionverbosec                     |t          |d|          }t          t                              |                     } rfd| D             } rfd| D             } |                    |            |S )zMFilter, convert input URLs and add them to domain-aware processing dictionaryNF)
compressedstrictr   c                 B    g | ]}t          j        d |          v|S )rk   )r   sub)rP   ur   s     r*   
<listcomp>z*add_to_compressed_dict.<locals>.<listcomp>~  s8     
 
 
$7$;B$B$B)$S$SA$S$S$Sr,   c                 L    g | ]t          fd D                        S )c              3       K   | ]}|v V  	d S rD   r$   )rP   fr   s     r*   	<genexpr>z4add_to_compressed_dict.<locals>.<listcomp>.<genexpr>  s'      0L0LAa0L0L0L0L0L0Lr,   )any)rP   r   r   s    @r*   r   z*add_to_compressed_dict.<locals>.<listcomp>  s<    MMM1S0L0L0L0L0L0L0L-L-LMQMMMr,   )r   listdictfromkeysadd_urls)r   r   r   r   r   r   s    ``   r*   add_to_compressed_dictr   o  s     E7SSS	T]]9--..I 

 
 
 
 
 
 
	  NMMMM	MMM	y!!!r,         @
sleep_timec                 n    	 |                      |d          }|s| j        rnt          |           1|| fS )zRDetermine threading strategy and draw URLs respecting domain-based back-off rules.Ti )
time_limitmax_urls)get_download_urlsdoner	   )r   r   
bufferlists      r*   load_download_bufferr     sT    00JQV0WW
 	 	j	
 y  r,   '  r   download_threadsworker	chunksizec              #     K   t          |          5 t          | |          D ]A}fd|D             }t          |          D ] }||         |                                fV  !B	 ddd           dS # 1 swxY w Y   dS )z3Use a thread pool to perform a series of downloads.)max_workersc                 >    i | ]}                     |          |S r$   )submit)rP   rB   executorr   s     r*   rS   z'_buffered_downloads.<locals>.<dictcomp>  s)    PPP3X__VS993PPPr,   N)r   r   r   r   )r   r   r   r   r   future_to_urlfuturer  s     `    @r*   _buffered_downloadsr    s       
(8	9	9	9 =X Y77 	= 	=EPPPPP%PPPM&}55 = =#F+V]]__<<<<<=	== = = = = = = = = = = = = = = = = =s   AA66A:=A:c                 P    t          t          |          }t          | ||          S )z3Download queue consumer, single- or multi-threaded.)r   )r   r   r  )r   r   r   r   s       r*   buffered_downloadsr    s)     Y000Fz+;VDDDr,   c                 p    |r|j         nt          }t          t          |          }t	          | ||          S )z7Download queue consumer, returns full Response objects.r   )rg   r   r   r   r  )r   r   r   rg   r   s        r*   buffered_response_downloadsr
    s9      ':W^^NF^F333Fz+;VDDDr,   c                 Z   d t          |                                          D             }t          j                    }|                    t          j        |                     d                     |                    t          j        t                     |                    t          j	        |           |                    t          j
        d           |                    t          j        |                    dd                     |                    t          j        |                    dd                     |                    t          j        |                    dd                     |                    t          j        |                    dd                     |                    t          j        d           |du rA|                    t          j        d	           |                    t          j        d	           n1|                    t          j        t)          j                               |r3t-                      }|                    t          j        |j                   t2          r%|                    t          j        t2                     	 |                                }n# t          j        $ rr}t:                              d
| |           |du rG|j        d	         t>          v r3t:                               d| |           tC          | d||          cY d}~S Y d}~dS d}~ww xY wtE          ||#                    |j$                  |#                    |j%                            }	|&                                 |ri }
|'                                (                    dd          )                                D ]I}d|vr|*                    dd          \  }}|+                                |
|+                                <   J|	,                    |
           |	S )zDExperimental function using libcurl and pycurl to speed up downloadsc                 "    g | ]\  }}| d | S )z: r$   )rP   headercontents      r*   r   z(_send_pycurl_request.<locals>.<listcomp>  s7       #2676W  r,   r   r   ri   r|   r}   r   Tr   zpycurl error: %s %sFzretrying after SSL error: %s %sNz
iso-8859-1replace)errors:)-rz   rT   r   r   r   r   r   SHARE
CURL_SHARE
HTTPHEADERFOLLOWLOCATION	MAXREDIRSr   r   TIMEOUTMAXFILESIZENOSIGNALr   r   CAINFOr   r   r   HEADERFUNCTIONwriter%   r   
perform_rbr   r   r   CURL_SSL_ERRORSr   r   r<   r   r   EFFECTIVE_URLr   getvaluerV   rp   splitro   rU   )rB   r   r   rg   
headerlistr   headerbytesbufferbytesr   r   respheaderslinenamevalues                 r*   r   r     s    6H6P6P6V6V6X6X  J ;==DKK
CJJw//000KKj)))KK!:...KK%q)))KK &--	?"K"KLLLKK%v}}Y@R'S'STTTKKi9K L LMMMKK"FMM)_$M$MNNNKK###~~F)1---F)1----FM7=??333 >iiF);+<=== 1F$i000oo''<   *C555
 U??sx{o==LL:CEEE'T<HHHHHHHH ttttt  T\\$"455t||DDV7W7W D 	JJLLL (   "")),y)IITTVV
	6 
	6D
 $**S!,,KD%(-K

%%;'''Ks   J4 4L5A!L0$L50L5rD   )F)NNNFF)r   )r   )`rb   loggingosrw   concurrent.futuresr   r   configparserr   	functoolsr   importlib.metadatar   ior   timer	   typingr
   r   r   r   r   r   r   r   r   r   r&   courlanr   courlan.networkr   settingsr   r   utilsr   r   r   r   urllib3.contrib.socksr   environrn   r%   ImportErrorr   	CurlSharer  r   SH_SHARELOCK_DATA_DNSLOCK_DATA_SSL_SESSIONr   	getLoggerr_   r   disable_warningsr   InsecureRequestWarningr   r   r   r'   r+   r   make_headersry   
USER_AGENTr   r  r<   re   rt   rz   r   r   rf   r   r   r   r   r   r   r   r   r   r   floatr   rd   r  r  r
  r   r$   r,   r*   <module>rC     s     				  ? ? ? ? ? ? ? ? % % % % % %       & & & & & &            
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
         , , , , , , / / / / / / / / V V V V V V V V V V V V777777
|,,II   IIIMMM!!##J fov';<<<fov'CDDD JJ   JJJ 
	8	$	$  +B C C C	1 1g&93&> ? 1 1 1 1 ,++D+AAWW]+++.XX  !+   ( ?>>F F F F F F F FD ,  5$s)1Dhsm1S+T         ?C& &&#+DcN#;&	#s(^& & & & 1C    & */ "&
7#$   ."	""*."8D"h" " " "J	s 	h 	 	t 	 	 	 		 *.9BeHcM"#    )#'	 	  i 	
 c]   > )  	  	
   h   >c d    8s t    0c 0d 0 0 0 0 %) $$( CyC!  !	
      6 .1	! 	!	!%*	!
49h	! 	! 	! 	!  	= =S	== cUCZ = 	=
 uS#Xd*+= = = =" $(E ES	EE i E uS#Xd*+	E E E E $(	E 	ES		E	E i 	E uS(]#T4/0		E 	E 	E 	ES	SS*.S8DShS S S S S Ss%   4 B BB#AC9 9DD