
    !}g'                         d Z dZdZdZdZddlZddlZddlmZm	Z	m
Z
mZmZ ddlmZ  ej        e          Zd	Zd
ZdZeez   Zg dZg dZg dZg dZddZd ZddZddZd Zd Zd Zd Z d Z!dS )zt
Newspaper treats urls for news articles as critical components.
Hence, we have an entire module dedicated to them.
	newspaperzLucas Ou-YangMITzCopyright 2014, Lucas Ou-Yang    N)parse_qsurljoinurlparseurlsplit
urlunsplit)
tldextracti N  z(?<=\W)zs([\./\-_]{0,1}(19|20)\d{2})[\./\-_]{0,1}(([0-3]{0,1}[0-9][\./\-_])|(\w{3,5}[\./\-_]))([0-3]{0,1}[0-9][\./\-]{0,1})?)htmlhtmmdrstaspxjsprhtmlcgixhtmljhtmlaspshtml)storyarticlefeaturefeaturedslides	slideshowgallerynewsvideomediavradiopress)careerscontactaboutfaqtermsprivacyadvertpreferencesfeedbackinfobrowsehowtoaccount	subscribedonateshopadmin)amazondoubleclicktwitter Fc                     t          |           }d                    fd|j                            d          D                       }|r|dd         }nd}t	          |dd         |fz   |z             S )z0
    Remove all param arguments from a url.
    &c              3   F   K   | ]}|                               |V  d S N)
startswith).0qry_itemkeep_paramss     N/var/www/py-google-trends/myenv/lib/python3.11/site-packages/newspaper/urls.py	<genexpr>zremove_args.<locals>.<genexpr>.   sK        {++            N)    )r   joinquerysplitr	   )urlr@   fragsparsedfiltered_queryfrags    `    rA   remove_argsrO   )   s     c]]FXX    !'!3!3C!8!8    N  abbzfRaRjN#44t;<<<rC   c                     t          |           }|j        }|j        }||v s||v r| S t          |          }|                    d          r|d         d         S | S )z
    Some sites like Pinterest have api's that cause news
    args to direct to their site with the real news url as a
    GET param. This method catches that and returns our param.
    rJ   r   )r   netlocrH   r   get)rJ   source_domain
parse_datadomainrH   
query_items         rA   redirect_backrW   :   so     #JFE &M"9"9
%J~~e $% ##JrC   c           	         	 |5t          |          j        }t          ||           }t          ||          }n| }nF# t          $ r9}t
                              d| dt          |                     d}Y d}~nd}~ww xY w|S )zn
    Operations that purify a url, removes arguments,
    redirects, and merges relatives with absolutes.
    Nzurl z failed on err rE   )r   rQ   r   rW   
ValueErrorlogcriticalstr)rJ   
source_urlrS   
proper_urles        rA   prepare_urlr`   Q   s    
!$Z007M S11J&z=AAJJ J   ###s1vvv>???





 s   9< 
A?/A::A?c                    |rt          |           } | t          |           dk     r|rt          d| z             dS d| v }d| vod| v}|s|r|rt          d| z             dS t          |           j        }|                    d	          sdS |                    d	          r
|dd
         }d |                    d	          D             }t          |          dk    rit          |           }|r|t          vr|rt          d| z             dS |d
                             d          }t          |          dk    r|d         |d
<   d|v r|
                    d           t          j        |           }	|	j        }
|	j                                        }|r|d
         nd}|t           v r|rt          d| z             dS t          |          dk    rd\  }}n*|                    d          }|                    d          }|r|dk    s|dk    rz||k    r7|d |                    d          D             vr|rt          d| z             dS ||k    r7|d |                    d          D             vr|rt          d| z             dS t          |          dk    r|rt          d| z             dS t$          D ]#}||v s||
k    r|rt          d| z              dS $t'          j        t*          |           }||rt          d| z             dS t,          D ]9}|                                d |D             v r|rt          d | z              dS :|rt          d!| z             dS )"a3  
    Is this URL a valid news-article url?

    Perform a regex check on an absolute url.

    First, perform a few basic checks like making sure the format of the url
    is right, (scheme, domain, tld).

    Second, make sure that the url isn't some static resource, check the
    file type.

    Then, search of a YYYY/MM/DD pattern in the url. News sites
    love to use this pattern, this is a very safe bet.

    Separators can be [\.-/_]. Years can be 2 or 4 digits, must
    have proper digits 1900-2099. Months and days can be
    ambiguous 2 digit numbers, one is even optional, some sites are
    liberal with their formatting also matches snippets of GET
    queries with keywords inside them. ex: asdf.php?topic_id=blahlbah
    We permit alphanumeric, _ and -.

    Our next check makes sure that a keyword is within one of the
    separators in a url (subdomain or early path separator).
    cnn.com/story/blah-blah-blah would pass due to "story".

    We filter out articles in this stage by aggressively checking to
    see if any resemblance of the source& domain's name or tld is
    present within the article title. If it is, that's bad. It must
    be a company link, like 'cnn is hiring new interns'.

    We also filter out articles with a subdomain or first degree path
    on a registered bad keyword.
    N   z/	%s rejected because len of url is less than 11Fzmailto:zhttp://zhttps://z)	%s rejected because len of url structure/c                 8    g | ]}t          |          d k    |S r   lenr>   xs     rA   
<listcomp>zvalid_url.<locals>.<listcomp>   #    <<<Q!1rC   r   z 	%s rejected due to bad filetype.   indexrE   z%s caught for a bad tld)r   r   -_rD   c                 6    g | ]}|                                 S r8   lowerri   s     rA   rk   zvalid_url.<locals>.<listcomp>        BBB17799BBBrC   z%s verified for being a slugTc                 6    g | ]}|                                 S r8   rt   ri   s     rA   rk   zvalid_url.<locals>.<listcomp>   rv   rC   z#%s caught for path chunks too smallz%s caught for bad chunksz%s verified for datec                 6    g | ]}|                                 S r8   rt   )r>   ps     rA   rk   zvalid_url.<locals>.<listcomp>   s     ;;;!AGGII;;;rC   z%s verified for good pathz%s caught for default false)r`   rh   printr   pathr=   endswithrI   url_to_filetypeALLOWED_TYPESremover
   extract	subdomainrU   ru   BAD_DOMAINScount
BAD_CHUNKSresearch
DATE_REGEX
GOOD_PATHS)rJ   verbosetestr1r2r{   path_chunks	file_type
last_chunktld_datsubdtldurl_slug
dash_countunderscore_countb
match_dateGOODs                     rA   	valid_urlr   f   s@   H  # {c#hhmmSELsRSSSu
s
B
3
	;Zs%:B	 R MEFLMMMuC==D ??3 u }}S CRCy =<djjoo<<<K ;!#C((	  	-77HACGHHH5 _**3//
z??Q(nKO +7###  %%GD
.


 
 C"-5{22H
k:E3c9:::u
;1'+$
$$^^C((
#>>#..  
Z!^^'7!';';)))BBhnnS.A.ABBBBBGE"@3"FGGGtj((BBhnnS.A.ABBBBBGE"@3"FGGGt ;1FE?#EFFFu   qDyy?83>???55  ) :s++J 7E036777t  ::<<;;{;;;;;@9C?@@@44 < :3c9:::5rC   c                    t          |           j        }|                    d          r
|dd         }d |                    d          D             }|d                             d          }t	          |          dk     rdS |d         }t	          |          dk    s|                                t          v r|                                S dS )z
    Input a URL and output the filetype of the file
    specified by the url. Returns None for no filetype.
    'http://blahblah/images/car.jpg' -> 'jpg'
    'http://yahoo.com'               -> None
    rc   Nrd   c                 8    g | ]}t          |          d k    |S rf   rg   ri   s     rA   rk   z#url_to_filetype.<locals>.<listcomp>   rl   rC   rm         )r   r{   r|   rI   rh   ru   r~   )abs_urlr{   r   r   r   s        rA   r}   r}      s     G!D}}S CRCy<<djjoo<<<KR&&s++J
:t2I
9~~ioo//=@@   4rC   c                 .    | dS t          | fi |j        S )zc
    returns a url's domain, this method exists to
    encapsulate all url code into this file
    N)r   rQ   r   kwargss     rA   
get_domainr     s'    
 tG&&v&&--rC   c                 .    | dS t          | fi |j        S z
    N)r   schemer   s     rA   
get_schemer     s'     tG&&v&&--rC   c                 .    | dS t          | fi |j        S r   )r   r{   r   s     rA   get_pathr     s'     tG&&v&&++rC   c                     t          j        dt           j                  }t          j        |          }|                    |           duS )z2
    this regex was brought to you by django!
    z^(?:http|ftp)s?://(?:(?:[A-Z0-9](?:[A-Z0-9-]{0,61}[A-Z0-9])?\.)+(?:[A-Z]{2,6}\.?|[A-Z0-9-]{2,}\.?)|localhost|\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}|\[?[A-F0-9]*:[A-F0-9:]+\]?)(?::\d+)?(?:/?|[/?]\S+)$N)r   compile
IGNORECASEr   )rJ   regexc_regexs      rA   
is_abs_urlr   "  sI     J	 M+ +E jGNN3t+,rC   )r8   Fr<   )FF)"__doc__	__title__
__author____license____copyright__loggingr   urllib.parser   r   r   r   r	   r
   	getLogger__name__rZ   MAX_FILE_MEMO_STRICT_DATE_REGEX_PREFIXr   STRICT_DATE_REGEXr~   r   r   r   rO   rW   r`   r   r}   r   r   r   r   r8   rC   rA   <module>r      s    	
/  				 J J J J J J J J J J J J J J ! ! ! ! ! !g!! &  D
-
: 3 3 3% % %
A A A
 322= = = ="  .   *I I I IX  ,. . .. . ., , ,- - - - -rC   