
    %$}gC                        d Z ddlZddlZddlmZmZ ddlmZmZm	Z	m
Z
mZ ddlmZ ddlmZmZmZ ddlmZ  ej        e          Z ej        d	          Z ej        d
          Z ej        d          Z ej        d          Z ej        d          Z ej        d          Z ej        d          Z ej        d          Z ej        d          Z  ej        d          Z!d&de"dee"         dee"         fdZ#de"de"fdZ$	 d'de"de%dee"         de"fdZ&de"de"fdZ'de"de"fdZ(d&d e"dee"         de"fd!Z)	 	 	 d(d#eee"f         de%dee"         d$e%de"f
d%Z*dS ))z0
Functions performing URL trimming and cleaning
    N)OptionalUnion)parse_qsquote	urlencode
urlunsplitSplitResult   )is_valid_url)ALLOWED_PARAMSLANG_PARAMSTARGET_LANGS)_parsez	https?://zZ(https?://[^">&? ]+?)(?:https?://)|(?:https?://[^/]+?/[^/]+?[&?]u(rl)?=)(https?://[^"> ]+)z)https?://.+?(https?://.+?)(?:https?://|$)z(?<=\w):(?:80|443)z/+z^(?:/\.\.(?![^/]))+z</?[a-z]{,4}?>|{.+?}z/\&$z(.*?)[<>"\s]z^(?:dc|fbc|gc|twc|yc|ysc)lid|^(?:click|gbra|msclk|igsh|partner|wbra)id|^(?:ads?|mc|ga|gs|itm|mc|mkt|ml|mtm|oly|pk|utm|vero)_|(?:\b|_)(?:aff|affi|affiliate|campaign|cl?id|eid|ga|gl|kwd|keyword|medium|ref|referr?er|session|source|uid|xtor)urllanguagereturnc                 p    	 t          t          |           d|          S # t          t          f$ r Y dS w xY w)z4Helper function: chained scrubbing and normalizationFN)normalize_url	scrub_urlAttributeError
ValueError)r   r   s     M/var/www/py-google-trends/myenv/lib/python3.11/site-packages/courlan/clean.py	clean_urlr   0   sF    Ys^^UH===J'   tts     55c                    d                     |                                                               d          } |                     d          r*|                     dd                              dd          } t
                              d|           } t                              d|                     dd                    } t          	                    |           }t          |          dk    rd| vrt                              d	t          |          |            t                              |           }|r9t          |d                   r$|d         } t                              d
|            nTt                               |           }|r8t          |d                   r#|d         } t                              d
|            t"                              |           }|r|d         } t          |           dk    r4t                              d| dd         dz   t          |                      |                     d          dk    s|                     d          dk    r|                     d          } | S )z@Strip unnecessary parts and make sure only one URL is considered z  	
z	<![CDATA[z]]>z&amp;&r
   zweb.archive.orgzdouble url: %s %sztaking url: %si  z$invalid-looking link %s of length %dN2   u   …/   z://)joinsplitstrip
startswithreplaceREMAINING_MARKUPsubTRAILING_AMP	PROTOCOLSfindalllenLOGGERdebug	SELECTIONmatchr   
MIDDLE_URLTRAILING_PARTScountrstrip)r   	protocolsr.   s      r   r   r   8   s0    ''#))++


$
$	K C ~~k"" >kk+r**225"== 

r3
'
'C 

2s{{7C88
9
9C !!#&&I
9~~/s::(#i..#>>>$$ 	4\%(++ 	4(CLL)3////$$S))E 4eAh// 4Ah-s333   %%E Ah
3xx#~~;S"X=MsSVxxXXX yy~~cii..22jjooJ    Fquerystringstrictc                    | sdS t          |           }i }t          |          D ]}|                                }|r|t          vr
|t          vr+nt
                              |          rG|t          v rT|t          v rKt          ||         d                   t          |         vr#t          
                    d||           t          ||         ||<   t          |d          S )zStrip unwanted query elementsr   r   zbad lang: %s %sT)doseq)r   sortedlowerr   r   TRACKERS_REsearchr   strr+   r,   r   r   )r5   r6   r   qdictnewqdictqelemteststrs          r   clean_queryrB   j   s      r[!!EH ' '++-- 	n,,1K1K(( 	 $$;&&E%LO$$L,BBBLL*He<<<,XT****r4   stringc                    d| vr| S g }|                      d          D ]}|                                                    d          rU	 |                    d                              d          }n+# t
          $ r t                              d|           Y nw xY w|                    |           d	                    |          S )z@Probe for punycode in lower-cased hostname and try to decode it.zxn--.utf8idnazinvalid utf/idna string: %s)
r!   r:   r#   encodedecodeUnicodeErrorr+   r,   appendr    )rC   partsparts      r   decode_punycoderN      s    VES!!  ::<<""6** 	BB{{6**11&99 B B B:DAAAAABT88E??s   (A11%BBurl_partc                 $    t          | d          S )zbNormalize URLs parts (specifically path and fragment) while
    accounting for certain characters.z/%!=:,-)safe)r   )rO   s    r   normalize_partrR      s     	****r4   fragmentc                     d| v r2d| v rt          | d|          } nt                              |           rd} t          |           S )zNLook for trackers in URL fragments using query analysis, normalize the output.=r   Fr   )rB   r;   r<   rR   )rS   r   s     r   normalize_fragmentrV      sP    
h(??"8UH==HH)) 	H(###r4   T
parsed_urltrailing_slashc           	         t          |           } | j                                        }t          | j                                                  }	 | j        dv rt                              d|          }n# t          $ r Y nw xY wt          t                              dt                              d| j                                      }t          | j        ||          pd}|r|sd}nA|s?|s=t          |          dk    r*|                    d          r|                    d          }|rdnt%          | j        |          }t)          |||||f          S )zFTakes a URL string or a parsed URL and returns a normalized URL string)P   i  r   r   r
   )r   schemer:   rN   netlocport	NETLOC_REr&   r   rR   PATH2PATH1pathrB   queryr*   endswithr2   rV   rS   r   )	rW   r6   r   rX   r[   r\   newpathnewquerynewfragments	            r   r   r      sd    
##J$$&&FZ.446677F?i'']]2v..F    UYYr599S*/+J+JKKLLG:+VX>>D"H & &&& LL1S!!  ..%%U""$6z7JH$U$UKvvw+FGGGs   $A5 5
BB)N)FN)FNT)+__doc__loggingretypingr   r   urllib.parser   r   r   r   r	   filtersr   settingsr   r   r   urlutilsr   	getLogger__name__r+   compiler(   r-   r/   r^   r`   r_   r%   r'   r0   r;   r=   r   r   boolrB   rN   rR   rV   r    r4   r   <module>rt      s     				 " " " " " " " " L L L L L L L L L L L L L L ! ! ! ! ! ! ? ? ? ? ? ? ? ? ? ?       
	8	$	$ BJ|$$	BJa 	 RZDEE
BJ,--	 	
5
)** 2:566 rz'""O,,
 bjA  3 (3- 8C=    /3 /3 / / / /f GK+ ++"+6>sm++ + + +BC C    $+S +S + + + +$ $ $ $ $ $ $ $ "	"H "Hk3&'"H"H sm"H 	"H
 	"H "H "H "H "H "Hr4   