
    %$}gq&                        d Z ddlZddlZddlmZ ddlmZ ddlmZm	Z	m
Z
mZmZ ddlmZmZmZmZmZmZ ddlmZ dd	lmZmZ dd
lmZmZ  ej        e          Z ej        d          Z  ej        dej!                  Z" ej        d          Z# ej        d          Z$ ej        d          Z% ej        d          Z& ej        d          Z' ej        d          Z( ej        d          Z)g dZ* G d d          Z+dddefde,dee,         de-de.de/de	e,         fdZ0de,d ee,         de-fd!Z1d"e,de	e,         fd#Z2d$ee,         d"e,de	e,         fd%Z3dS )&z#
Deriving link info from sitemaps.
    N)islice)sleep)CallableListSetOptionalPattern)	clean_urlextract_domainfilter_urlsfix_relative_urlsget_hostinfolang_filter   )is_similar_domain)	fetch_urlis_live_page)	MAX_LINKSMAX_SITEMAPS_SEENz.<loc>(?:<!\[CDATA\[)?(http.+?)(?:\]\]>)?</loc>z<xhtml:link.+?>zhref=["\'](.+?)["\']zg(?:blogger|blogpost|ghost|hubspot|livejournal|medium|typepad|squarespace|tumblr|weebly|wix|wordpress)\.z^.{0,5}<\?xml|<sitemap|<urlsetz\.xml(\..{2,4})?$|\.xml[?#]zhttps?://[^\s<"]+z
\?.*$|#.*$z\.xml\b)zsitemap.xmlzsitemap.xml.gzsitemapzsitemap_index.xmlzsitemap_news.xmlc                       e Zd ZdZg dZ	 	 ddededee         dee         d	ed
dfdZ	ddZ
ded
dfdZdee         dedeegdf         d
dfdZddZddZddZdS )SitemapObjectzCStore all necessary information on sitemap download and processing.)	base_urlcontentcurrent_urldomainexternalseensitemap_urlstarget_langurlsNFr   r   sitemapsurlsr    r   returnc                     || _         d| _        || _        || _        d| _        t                      | _        || _        || _        g | _	        d S )N )
r   r   r   r   r   setr   r   r    r!   )selfr   r   r"   r    r   s         T/var/www/py-google-trends/myenv/lib/python3.11/site-packages/trafilatura/sitemaps.py__init__zSitemapObject.__init__@   sN     &!& "!ee	'3*5!			    c                     t                               d| j                   t          | j                  pd| _        | j                            | j                   dS )z!Fetch a sitemap over the network.zfetching sitemap: %sr%   N)LOGGERdebugr   r   r   r   addr'   s    r(   fetchzSitemapObject.fetchR   sN    +T-=>>> !1228b	d&'''''r*   linkc                 \   || j         k    rdS t          | j        |          }t          || j                  pd}|rt          || j                  sdS t          |d          }|t                              d|           dS | j	        sRt                              |          s8t          | j        |          s#t                              d| j        |           dS t                              |          r| j                            |           dS | j                            |           dS )z^Examine a link and determine if it's valid and if it leads to
        a sitemap or a web page.Nr%   T)fastzcouldn't extract domain: %sz-link discarded, diverging domain names: %s %s)r   r   r   r
   r    r   r   r,   errorr   WHITELISTED_PLATFORMSsearchr   r   warningDETECT_SITEMAP_LINKr   appendr!   )r'   r1   	newdomains      r(   handle_linkzSitemapObject.handle_linkX   sA    4###F 55t/006B 	;tT-=>> 	F"4d333	LL6===F
 	)00;;	 &dk9==	
 NN?i   F%%d++ 	#$$T*****IT"""""r*   regexindexhandlerc                 "   fdt          |                    | j                  t                    D             D ]} ||           t                              dt          | j                  t          | j                  | j	                   dS )zJExtract links from the content using pre-defined regex, index and handler.c              3   (   K   | ]}|         V  d S )N ).0mr=   s     r(   	<genexpr>z.SitemapObject.extract_links.<locals>.<genexpr>~   s8       
 
AeH
 
 
 
 
 
r*   z%%s sitemaps and %s links found for %sN)
r   finditerr   r   r,   r-   lenr   r!   r   )r'   r<   r=   r>   matchs     `  r(   extract_linkszSitemapObject.extract_linksz   s    
 
 
 
$U^^DL%A%A9MM
 
 
 	 	E GENNNN3!""	NN		
 	
 	
 	
 	
r*   c                      d j         vrdS t          j        d j         dt          j                  dt
          ddf fd}                     t          d|           dS )	z7Extract links corresponding to a given target language.z	hreflang=Nzhreflang=[\"'](z.*?|x-default)[\"']attrsr#   c                                          |           r9t                               |           }|r                    |d                    dS dS dS )z!Examine language code attributes.r   N)r6   HREFLANG_REGEXr;   )rJ   
lang_match
lang_regexr'   s     r(   handle_lang_linkzASitemapObject.extract_sitemap_langlinks.<locals>.handle_lang_link   sf      '' 4+22599
 4$$Z]333334 44 4r*   r   )r   recompiler    DOTALLstrrH   XHTML_REGEX)r'   rO   rN   s   ` @r(   extract_sitemap_langlinksz'SitemapObject.extract_sitemap_langlinks   s    dl**FZDt/DDDbi
 

	4C 	4D 	4 	4 	4 	4 	4 	4 	4 	;+;<<<<<r*   c                 H    |                      t          d| j                   dS )z=Extract sitemap links and web page links from a sitemap file.r   N)rH   
LINK_REGEXr;   r/   s    r(   extract_sitemap_linksz#SitemapObject.extract_sitemap_links   s.    4+	
 	
 	
 	
 	
r*   c                 D   t          | j        | j                  }|sdS t                              | j                  s#|                     t          d| j                   dS | j        $| 	                                 | j
        s| j        rdS |                                  dS )z5Download a sitemap and extract the links it contains.Nr   )is_plausible_sitemapr   r   SITEMAP_FORMATrG   rH   DETECT_LINKSr;   r    rU   r   r!   rX   )r'   	plausibles     r(   processzSitemapObject.process   s    ()94<HH	 	F##DL11 	|Q0@AAAF'**,,,  DI ""$$$$$r*   )NF)r#   N)__name__
__module____qualname____doc__	__slots__rS   r   r   boolr)   r0   r;   r	   intr   rH   rU   rX   r^   rA   r*   r(   r   r   2   sD       II
 
 
I" &*" "" " 3i	"
 c]" " 
" " " "$( ( ( ( #  #  #  #  #  #D
S\
*-
8@#8M
	
 
 
 
= = = =$
 
 
 
% % % % % %r*   r   Fg       @urlr    r   
sleep_timemax_sitemapsr#   c                   	 t          |           \  }|t                              d|            g S t                    st                              d|            g S d}|                     d          r| g}n'g }t          |           t                    dz   k    r| }t          ||||          		j        s't                    pfdt          D             	_        	j        rt          	j
                  |k     r	j                                        	_        	                                 	                                 	fd	j        D             	_        t          	j
                  |k     rt          |           	j        rt          	j
                  |k     |rt!          	j        |          	_        t                              dt          	j                  |           	j        S )	ax  Look for sitemaps for the given URL and gather links.

    Args:
        url: Webpage or sitemap URL as string.
             Triggers URL-based filter if the webpage isn't a homepage.
        target_lang: Define a language to filter URLs based on heuristics
                     (two-letter string, ISO 639-1 format).
        external: Similar hosts only or external URLs
                  (boolean, defaults to False).
        sleep_time: Wait between requests on the same website.
        max_sitemaps: Maximum number of sitemaps to process.

    Returns:
        The extracted links as a list (sorted list of unique links).

    Nzinvalid URL: %sz*base URL unreachable, dropping sitemap: %s)z.gzr   z.xml   c                     g | ]	} d | 
S )/rA   )rB   gbaseurls     r(   
<listcomp>z"sitemap_search.<locals>.<listcomp>   s:     A
 A
 A
!"wA
 A
 A
r*   c                 &    g | ]}|j         v|S rA   )r   )rB   sr   s     r(   ro   z"sitemap_search.<locals>.<listcomp>   s,      
  
  
q/D/DA/D/D/Dr*   z%s sitemap links found for %s)r   r,   r7   r   endswithrF   r   r   find_robots_sitemapsGUESSESr   popr   r0   r^   r   r   r!   r-   )
rf   r    r   rg   rh   
domainname	urlfiltersitemapurlsrn   r   s
           @@r(   sitemap_searchry      s   . 's++J(#...	   CSIII	I
||.// es88c'llQ&&&IGZk8TTG  
3G<<  
 A
 A
 A
 A
&-A
 A
 A

 
 
3w|#4#4|#C#C%26688 
  
  
  
+ 
  
  
 w||++* 
 
3w|#4#4|#C#C  <"7<;;
LL0#gl2C2CZPPP<r*   contentsc                 L   |dS t                               d|           } t                              |           r/t	          |t
                    r8t                              |          rd|dd                                         v rt          
                    d|            dS dS )zLCheck if the sitemap corresponds to an expected format,
    i.e. TXT or XML.NFr%   z<html   znot a valid XML sitemap: %sT)SCRUB_REGEXsubPOTENTIAL_SITEMAPr6   
isinstancerS   r[   rG   lowerr,   r7   )rf   rz   s     r(   rZ   rZ      s     u //"c
"
"C 	  %%Hc**2@2F2Fx2P2P httn**,,,,4c:::u4r*   rn   c                 F    t          | dz             }t          ||           S )zUGuess the location of the robots.txt file and try to extract
    sitemap URLs from itz/robots.txt)r   extract_robots_sitemaps)rn   	robotstxts     r(   rs   rs     s&     'M122I"9g666r*   r   c                    | t          |           dk    rg S g }|                                 D ]}|                    d          }|dk    r
|d|         }|                                }|s>|                    dd          }t          |          dk    rh|d                                                                         |d<   |d         dk    r-|                    |d                                                    t          t          	                    |                    }fd	|D             }t                              d
t          |                     |S )z.Read a robots.txt file and find sitemap links.Ni'  #r   :r   rj   r   c                 4    g | ]}|t          |          S rA   )r   )rB   urn   s     r(   ro   z+extract_robots_sitemaps.<locals>.<listcomp>-  s)    JJJQJ$Wa00JJJr*   z%s sitemaps found in robots.txt)rF   
splitlinesfindstripsplitr   r9   listdictfromkeysr,   r-   )r   rn   
candidateslinei
line_partsrx   s    `     r(   r   r     sO    C	NNU22	J$$&& 9 9IIcNN668Dzz|| 	ZZQ''
z??a&qM//117799JqM!}	))!!*Q-"5"5"7"7888dmmJ//00JJJJJ*JJJK
LL2C4D4DEEEr*   )4rb   loggingrP   	itertoolsr   timer   typingr   r   r   r   r	   courlanr
   r   r   r   r   r   deduplicationr   	downloadsr   r   settingsr   r   	getLoggerr_   r,   rQ   rW   rR   rT   rL   r5   r[   r8   r\   r}   r   rt   r   rS   rd   floatre   ry   rZ   rs   r   rA   r*   r(   <module>r      s	     				             9 9 9 9 9 9 9 9 9 9 9 9 9 9                - , , , , , . . . . . . . . 2 2 2 2 2 2 2 2 
	8	$	$RZIJJ
bj+RY77344"
n   =>> bj!?@@ rz.//bj''BJz**   ~% ~% ~% ~% ~% ~% ~% ~%F "&)C C	C#C C 	C
 C 
#YC C C CLc Xc] t    *7# 7$s) 7 7 7 7x} s tCy      r*   