
    %$}gG                     d   d Z ddlZddlmZ ddlmZmZmZmZm	Z	m
Z
 ddlmZmZmZmZ ddlmZ  ej        d          Z ej        d          Z ej        d	          Z ej        d
ej                  Z	 d#dedede
e	d         e	eef         f         fdZ	 d$dedeee                  dedee         fdZdedefdZdedefdZdede	eef         fdZdede	ee         ef         fdZdededefdZ dee         dee         dee         fdZ!d%dedededefdZ"d ed!ee         defd"Z#dS )&zD
Functions related to URL manipulation and extraction of URL parts.
    N)unescape)AnyListOptionalSetTupleUnion)urljoinurlsplit
urlunsplitSplitResult)get_tldz{(?:(?:f|ht)tp)s?://(?:[^/?#]{,63}\.)?([^/?#.]{4,63}\.[^/?#]{2,63}|\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}|[0-9a-f:]{16,})(?:/|$)z(?<=\D):\d+z^www[0-9]*\.z(?:feed(?:burner|proxy))FurlfastreturnNNc                    | rt          | t                    sdS |rwt                              |           }|r[t                              d|d                             d          d                   }|                    d          d         }|r||fS t          | dd	          }|dS |j        t                              d|j
                  fS )
z0Cached function to extract top-level domain infor       @.r   T)	as_objectfail_silently)
isinstancestrDOMAIN_REGEXmatchSTRIP_PORT_REGEXsubsplitr   domainCLEAN_FLD_REGEXfld)r   r   domain_matchfull_domainclean_matchtldinfos         P/var/www/py-google-trends/myenv/lib/python3.11/site-packages/courlan/urlutils.pyget_tldinfor*      s      jc** z 0#))#.. 	0*..r<?3H3H3M3Mb3QRRK%++C003K 0"K//cT>>>Gz>?..r7;????    	blacklistc                 f    |t                      }t          | |          \  }}|r
||vr||vr|ndS )z;Extract domain name information using top-level domain infoNr   )setr*   )r   r,   r   r"   r&   s        r)   extract_domainr0   1   sX     EE	%c555FK 	!22{)7S7S 	r+   c                     t          | t                    rt          t          |                     }n5t          | t                    r| }nt          dt          |                     |S )z3Parse a string or use urllib.parse object directly.zwrong input type:)r   r   r   r   r   	TypeErrortype)r   
parsed_urls     r)   _parser5   A   s_    #s 8hsmm,,

	C	%	% 8

+T#YY777r+   c                 \    t          |           }|j        r|j        dz   }nd}||j        z   S )ziStrip URL of some of its parts to get base URL.
    Accepts strings and urllib.parse ParseResult objects.z://r   )r5   schemenetloc)r   r4   r7   s      r)   get_base_urlr9   L   s<     J "U*J%%%r+   c                     t          |           }t          |          }t          dd|j        |j        |j        g          }|dk    rd}|r|st          d|            ||fS )zvDecompose URL in two parts: protocol + host/domain and path.
    Accepts strings and urllib.parse ParseResult objects.r   /zincomplete URL: )r5   r9   r   pathqueryfragment
ValueError)r   r4   hostnamepathvals       r)   get_host_and_pathrB   W   s     JJ''H	R*"2J4GH G "}} 37 31C11222Wr+   c                 J    t          | d          }t          |           }||fS )zXConvenience function returning domain and host info (protocol + host/domain) from a URL.Tr.   )r0   r9   )r   
domainnamebase_urls      r)   get_hostinforF   g   s,    $///JC  Hxr+   baseurlc                    |                     d          r|S t          |           j        }t          |          }|j        |dfvr,|j        r|S t	          |                    d                    S t          | |          S )z8Prepend protocol and host information to relative links.{r   http)r7   )
startswithr   r8   r7   r   _replacer
   )rG   r   base_netloc	split_urls       r)   fix_relative_urlsrO   n   s    
~~c 
7##*KIR000 	J),,F,;;<<<7C   r+   	link_list	urlfilterc                     t          t          |                     S fd| D             }|sd | D             }t          t          |                    S )zDReturn a list of links corresponding to the given substring pattern.Nc                     g | ]}|v |	S  rT   ).0lrQ   s     r)   
<listcomp>zfilter_urls.<locals>.<listcomp>   s    <<<1Y!^^Q^^^r+   c                 F    g | ]}t                               |          |S rT   )FEED_WHITELIST_REGEXsearch)rU   rV   s     r)   rW   zfilter_urls.<locals>.<listcomp>   s,    PPPq1E1L1LQ1O1OPPPPr+   )sortedr/   )rP   rQ   filtered_lists    ` r)   filter_urlsr]   ~   sh    c)nn%%%<<<<	<<<M QPPIPPP#m$$%%%r+   T	referenceignore_suffixc                 n    t          |d          \  }}t          | d          \  }}|r||k    S ||k    S )zjDetermine if a link leads to another host, takes a reference URL and
    a URL as input, returns a booleanTr.   )r*   )r   r^   r_   stripped_refrefstripped_domainr"   s          r)   is_externalrd      sO     $ID999L#)#D999OV /,..S=r+   linkknown_linksc                 R   | |v rdS | d         dk    r|                      d          n| dz   }||v rdS |                     d          r_|                     d          rd| dd         z   nd| dd         z   }|d         dk    r|                     d          n|dz   }||v s||v rdS d	S )
zDCompare the link and its possible variants to the existing URL base.Tr   r;   rJ   httpsN      F)rstriprK   )re   rf   
slash_testprotocol_tests       r)   is_known_linkrn      s     {t &*"X__S!!!$*J[  t v 
!%!9!9QFT"1"Xwabb?Q 	
 R C''   %%%$ 	
 K'':+D+D45r+   )F)NF)T)$__doc__rehtmlr   typingr   r   r   r   r   r	   urllib.parser
   r   r   r   tldr   compiler   r   r#   IrY   r   boolr*   r0   r5   r9   rB   rF   rO   r]   rd   rn   rT   r+   r)   <module>rx      s    
			       9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 C C C C C C C C C C C C       rz  2:n-- "*_--!rz"=rtDD  !@ @	@@
5eCHo-.@ @ @ @. BG 	!#c(+:>c]         &c &c & & & &3 5c?      c  eHSM3$67        !s ! ! ! ! ! ! 	&49 	&# 	&49 	& 	& 	& 	& S S       #c( t      r+   