
    %$}gZ                         d Z ddlZddlZddlmZmZmZmZ ddlm	Z	 ddl
mZmZ ddlmZmZmZmZmZmZmZmZmZ ddlmZ dd	lmZ dd
lmZmZmZmZm Z   ej!        e"          Z# ej$        dej%                  Z& ej$        dej%                  Z' ej$        dej%                  Z(	 	 	 	 	 d&de)de*de*dee)         de*de*deee)e)f                  fdZ+	 	 d'dddddddddde)dee)         de*de*dee)         de*de*de*de*dee)         dee)         dee)         fdZ,ddddddd d!e)dee)         d"ee)         d#ee	         d$e*de*de*dee)         deee)         ee)         f         fd%Z-dS )(z0
Core functions needed to make the module work.
    N)ListOptionalSetTuple)RobotFileParser   )normalize_url	scrub_url)	basic_filterdomain_filterextension_filteris_navigation_pageis_not_crawlablelang_filterpath_filtertype_filtervalidate_url)redirection_test)	BLACKLIST)extract_domainget_base_urlfix_relative_urlsis_externalis_known_linkz<a [^<>]+?>zhreflang=["\']?([a-z-]+)zhref=["\']?([^ ]+?)(["\' >])FTurlstrictwith_redirectslanguagewith_navtrailing_slashreturnc                 ~   	 t          |           du r"t                              d|            t          t	          |           } |rt          |           } t          | ||          du r"t                              d|            t          |6t          | |||          du r"t                              d|            t          t          |           \  }}|du r"t                              d|            t          t          |j
                  du r"t                              d|            t          t          |j                  du r"t                              d	|            t          |r>t          |j
        |j                  du r"t                              d
|            t          t          ||||          } |rt!          | t"          d          }nt!          | d          }|t                              d	|            dS n3# t$          t          f$ r t                              d|            Y dS w xY w| |fS )a(  Check links for appropriateness and sanity
    Args:
        url: url to check
        strict: set to True for stricter filtering
        with_redirects: set to True for redirection test (per HTTP HEAD request)
        language: set target language (ISO 639-1 codes)
        with_nav: set to True to include navigation pages instead of discarding them
        trailing_slash: set to False to trim trailing slashes

    Returns:
        A tuple consisting of canonical URL and extracted domain

    Raises:
        ValueError, handled in exception.
    Fzrejected, basic filter: %s)r   r   zrejected, type filter: %sNzrejected, lang filter: %szrejected, validation test: %szrejected, extension filter: %szrejected, domain name: %szrejected, path filter: %sT)	blacklistfast)r$   zdiscarded URL: %s)r   LOGGERdebug
ValueErrorr
   r   r   r   r   r   pathr   netlocr   queryr	   r   r   AttributeError)	r   r   r   r   r   r    validation_test
parsed_urldomains	            L/var/www/py-google-trends/myenv/lib/python3.11/site-packages/courlan/core.py	check_urlr0   *   s^   4>%%LL5s;;; nn  	("3''C s6H===FFLL4c:::  C6>BBeKKLL4c::: '33&7&7#e##LL8#>>> JO,,55LL93??? *++u44LL4c:::  	k*/:3CDDMMLL4c::: J.II  	4#C94HHHFF#Cd333F>LL4c:::4 
 J'   (#...tt ;s   HH ,H87H8)	no_filterr   r   r    r   	redirects	referencebase_urlpagecontentexternal_boolr1   r2   r3   r4   c          	         |
rt          d          t          |          }
|p|
}t                      t                      }}| s|S |	p|
}	d t                              |           D             D ]}d|v rd|v r|du r|d|v r{t
                              |          }|r^|d                             |          s|d         d	k    r7t                              |          }|r|	                    |d                    t                              |          }|r|	                    |d                    |D ]}|                    d
          st          ||          }|du r7t          ||||||          }|C|d         }|t          ||	d          k    rbt          ||          rs|	                    |           t                              dt!          |          t!          |                     |S )az  Filter links in a HTML document using a series of heuristics
    Args:
        pagecontent: whole page in binary format
        url: full URL of the original page
        external_bool: set to True for external links only, False for
                  internal links only
        no_filter: override settings and bypass checks to return all possible URLs
        language: set target language (ISO 639-1 codes)
        strict: set to True for stricter filtering
        trailing_slash: set to False to trim trailing slashes
        with_nav: set to True to include navigation pages instead of discarding them
        redirects: set to True for redirection test (per HTTP HEAD request)
        reference: provide a host reference for external/internal evaluation

    Returns:
        A set containing filtered HTTP links checked for sanity and consistency.

    Raises:
        Nothing.
    ,'base_url' is deprecated, use 'url' instead.c              3   &   K   | ]}|d          V  dS )r   N ).0ms     r/   	<genexpr>z extract_links.<locals>.<genexpr>   s&      FF!1FFFFFF    relnofollowFNhreflangr   z	x-defaulthttp)r   r    r   r   r   r   T)r   r3   ignore_suffixu!   %s links found – %s valid links)r'   r   setFIND_LINKS_REGEXfinditerHREFLANG_REGEXsearch
startswith
LINK_REGEXaddr   r0   r   r   r%   infolen)r5   r   r6   r1   r   r   r    r   r2   r3   r4   
candidates
validlinkslink	langmatch	linkmatchcheckeds                    r/   extract_linksrT      s\   D  IGHHHC  H
/C UUCEE
J  %XI GF/88EEFFF - -D==Z4//("6:;M;M&--d33I 1!''1115>q\[5P5P&--d33	 1NN9Q<000 #))$//I -y|,,,   v&& 	0$S$//D-!(!  G 1:DIT! ! !   z** 	t
KK3S__c*ooVVVr>   )langrulesexternalr   r   r4   
htmlstringrU   rV   rW   c                (   |rt          d          g g }	}t          | |||||          D ]d}
t          |
          s||                    d|
          s*t	          |
          r|	                    |
           O|                    |
           e||	fS )zPFind links in a HTML document, filter and prioritize them for crawling purposes.r8   )r5   r   r6   r   r   r   N*)r'   rT   r   	can_fetchr   append)rX   r   rU   rV   rW   r   r   r4   linkslinks_priorityrP   s              r/   filter_linksr_      s      IGHHH>E     D!! 	eooc4&@&@d## 	!!$''''LL.  r>   )FFNFT)NF).__doc__loggingretypingr   r   r   r   urllib.robotparserr   cleanr	   r
   filtersr   r   r   r   r   r   r   r   r   networkr   settingsr   urlutilsr   r   r   r   r   	getLogger__name__r%   compileIrE   rG   rJ   strboolr0   rT   r_   r:   r>   r/   <module>rp      s   
  				 - - - - - - - - - - - - . . . . . . + + + + + + + +
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 & % % % % %                    
	8	$	$2:nbd33 7>>RZ7>>

  "Z Z	ZZ Z sm	Z
 Z Z eCHoZ Z Z Z~ ]
 "#"] ] ]]	#] ]
 ] sm] ] ] ] ] }] sm] 	X] ] ] ]H '+"%! %! %!%!	#%! 3-	%!
 O$%! %! %! %! sm%! 49d3i %! %! %! %! %! %!r>   