
    %$}g%                        d Z ddlZddlZddlZddlmZ ddlmZ ddlm	Z	m
Z
 ddlmZmZmZmZmZmZ ddlmZ dd	lmZ dd
lmZ ddlmZ  ej        e          Zh dZ ej        d          Z ej        d          Z  ej        d          Z! ej        d          Z" ej        d          Z# ej        d          Z$ G d d          Z%de&de'fdZ(de	e&         de%de	e&         fdZ)de&de%de	e&         fdZ*de&de%de	e&         fdZ+de&de%de	e&         fdZ,de%de
e&         de	e&         fd Z-	 	 	 d*d#e&d$e
e&         d%e'd&e.de	e&         f
d'Z/d(e&d$e
e&         de	e&         fd)Z0dS )+z>
Examining feeds and extracting links for further processing.
    N)islice)sleep)ListOptional)	check_url	clean_urlfilter_urlsfix_relative_urlsget_hostinfois_valid_url   )is_similar_domain)	fetch_url)	MAX_LINKS)	load_html>   text/rdftext/rsstext/xml	text/atom
text/plaintext/rdf+xmltext/rss+xmltext/atom+xmlapplication/rdfapplication/rssapplication/xmlapplication/atomapplication/jsonapplication/rdf+xmlapplication/rss+xmlapplication/atom+xmlapplication/feed+jsonapplication/x-atom+xmlapplication/x.atom+xmlz<(feed|rss|\?xml)z<link .*?href=".+?"zhref="(.+?)"z:<link>(?:\s*)(?:<!\[CDATA\[)?(.+?)(?:\]\]>)?(?:\s*)</link>z\bcomments\bzn\.(?:atom|rdf|rss|xml)$|\b(?:atom|rss)\b|\?type=100$|feeds/posts/default/?$|\?feed=(?:atom|rdf|rss|rss2)|feed$c                   L    e Zd ZdZg dZ	 	 ddedededed	ee         d
dfdZdS )FeedParametersz.Store necessary information to proceed a feed.basedomainextlangrefFNbaseurlr)   	referenceexternaltarget_langreturnc                 L    || _         || _        || _        || _        || _        d S Nr'   )selfr-   r)   r.   r/   r0   s         Q/var/www/py-google-trends/myenv/lib/python3.11/site-packages/trafilatura/feeds.py__init__zFeedParameters.__init__M   s+     !	!!#.	!    )FN)	__name__
__module____qualname____doc__	__slots__strboolr   r6    r7   r5   r&   r&   I   s        44888I %)" "" " 	"
 " c]" 
" " " " " "r7   r&   feed_stringr1   c                 ^    t                               |           rdS | dd         }d|v pd|v S )z$Check if the string could be a feed.TNd   z<rssz<feed)FEED_OPENINGmatch)r@   	beginnings     r5   is_potential_feedrF   \   s@    +&& tDSD!IY6'Y"66r7   linklistparamsc                    g }t          t          |                     D ]}t          |j        |          }t	          ||j                  }|j|j        sGd|vrCt          |j        |d                   s(t          
                    d|j        |d                    }|                    |d                    d|v sd|v r|                    |           |S )	zGExamine links to determine if they are valid and
    lead to a web page)languageNfeedr   z'Rejected, diverging domain names: %s %sr   
feedburner	feedproxy)sortedsetr
   r(   r   r+   r*   r   r)   LOGGERwarningappend)rG   rH   output_linksitemlinkcheckeds         r5   handle_link_listrW   d   s     Ls8}}%% & & d33D6;777J	0$&&)&-DD ' =v}gVWj    ##GAJ////T!![D%8%8%%%r7   c                    t          |           s|                     d          ry	 d t          j        |                               dg           D             }d |D             S # t          j        j        $ r# t                              d|j	                   Y n$w xY wt                              d|j	                   g S d| v rAd d	 t          t                              |           t                    D             D             S d
| v rBd t          t                              | t          j                  t                    D             S g S )z<Try different feed types and return the corresponding links.{c                 b    g | ],}|                     d           p|                     d          -S )urlid)get).0rT   s     r5   
<listcomp>zfind_links.<locals>.<listcomp>   sA        HHUOO5txx~~  r7   itemsc                     g | ]}||S r3   r?   )r^   cs     r5   r_   zfind_links.<locals>.<listcomp>   s    ???ar7   zJSON decoding error: %szPossibly invalid feed: %sz<link c                 ^    g | ]*}d |vd|v
t                               |          d         +S )zatom+xmlz
rel="self"r   )	LINK_HREFsearchr^   rU   s     r5   r_   zfind_links.<locals>.<listcomp>   sO     
 
 
 %%,d*B*B	 T""1% +C*B*Br7   c              3   &   K   | ]}|d          V  dS )r   Nr?   r^   ms     r5   	<genexpr>zfind_links.<locals>.<genexpr>   s7        !     r7   z<link>c                 B    g | ]}|d                                           S )r   )striprh   s     r5   r_   zfind_links.<locals>.<listcomp>   s4     
 
 
 aDJJLL
 
 
r7   )rF   
startswithjsonloadsr]   decoderJSONDecodeErrorrP   debugr)   r   
LINK_ATTRSfinditerr   LINK_ELEMENTSreDOTALL)r@   rH   
candidatess      r5   
find_linksry      s   [)) !!#&& 	EG  $
; 7 7 ; ;GR H H  
 @?:????</ G G G6FFFFFG LL4fmDDD	 ;
 
 $Z%8%8%E%EyQQ  
 
 
 	
 ;
 
M22;	JJIVV
 
 
 	

 Is   =A$ $4BBc                 ~   | s"t                               dj                   g S t          |                                           }fdt          |          D             }|r7t                               dt          |          t          |                     n t                               dj                   |S )z7Extract and refine links from Atom, RSS and JSON feeds.zEmpty feed: %sc                 \    g | ](}|j         k    |                    d           dk    &|)S )/   )r,   count)r^   rU   rH   s     r5   r_   z!extract_links.<locals>.<listcomp>   sD       6:$**S//A"5"5 	"5"5"5r7   z!Links found: %s of which %s validzInvalid feed for %s)rP   rr   r)   ry   rl   rW   len)r@   rH   
feed_linksrS   s    `  r5   extract_linksr      s     %v}555	K--//88J   $Z88  L  ;/Z#lBSBS	
 	
 	
 	
 	*FM:::r7   
htmlstringc                 t   t          |           }|"t                              d|j                   g S d |                    d          D             }|sd |                    d          D             }g }t
                              |          D ]q}t          |j        |          }t          |          }|rI||j	        k    r>t          |          r/t                              |          s|                    |           rt                              dt          |          t          |                     |S )zxParse the HTML and try to extract feed URLs from the home page.
    Adapted from http://www.aaronsw.com/2002/feedfinder/NzInvalid HTML/Feed page: %sc                     g | ]b}|                     d           t          v s.t                              |                     dd                    L|                     dd          cS )typehref )r]   
FEED_TYPESLINK_VALIDATION_REre   rf   s     r5   r_   z"determine_feed.<locals>.<listcomp>   sk       88Fz))$$TXXfb%9%9:: * 	)))r7   z//link[@rel="alternate"][@href]c                     g | ]F}t                               |                    d d                    0|                    d d          GS )r   r   )r   re   r]   rf   s     r5   r_   z"determine_feed.<locals>.<listcomp>   sX     
 
 
!((&")=)=>>
HHVR  
 
 
r7   z
//a[@href]z%Feed URLs found: %s of which %s valid)r   rP   rr   r(   xpathdictfromkeysr
   r   r,   r   	BLACKLISTre   rR   r   )r   rH   tree	feed_urlsoutput_urlsrU   s         r5   determine_feedr      sV    Z  D|16;???	 JJ@AA  I  

 


<00
 
 
	 Ki(( 	% 	% d33	%
""T"" #$$T** # t$$$ LL/Y[AQAQ   r7   	urlfilterc                     | j         rrt          d| j         d| j          d          }|rPt          ||           }t	          ||          }t
                              dt          |          | j                   |S g S )z2Alternative way to gather feed links: Google News.z*https://news.google.com/rss/search?q=site:z&hl=z&scoring=n&num=100z!%s Google news links found for %s)r+   r   r)   r   r	   rP   rr   r   )rH   r   
downloadedr   s       r5   probe_gnewsr      s    { 
kkkFKkkk
 

  	&z6::J$Z;;JLL3S__fm   Ir7   F       @r[   r0   r/   
sleep_timec                 $   t          |           \  }}|t                              d|            g S t          ||| ||          }d}t	          |           }|t          ||          }	|	slt          ||          D ]6}
t	          |
          }|r#|	                    t          ||                     7t          |           t          |          dz   k    r| }|	r;t          |	|          }	t          
                    dt          |	          |           |	S t          
                    d|            nSt                              d|            |                     d          |k    rt          |           t          ||          S t          ||          S )a  Try to find feed URLs.

    Args:
        url: Webpage or feed URL as string.
             Triggers URL-based filter if the webpage isn't a homepage.
        target_lang: Define a language to filter URLs based on heuristics
                     (two-letter string, ISO 639-1 format).
        external: Similar hosts only or external URLs
                  (boolean, defaults to False).
        sleep_time: Wait between requests on the same website.

    Returns:
        The extracted links as a list (sorted list of unique links).

    NzInvalid URL: %sr}   z%s feed links found for %szNo usable feed links found: %szCould not download web page: %sr|   )r   rP   rQ   r&   r   r   r   extendr   r	   rr   errorrl   r   try_homepager   )r[   r0   r/   r   r)   r-   rH   r   r   r   rK   r@   s               r5   find_feed_urlsr      s   * #3''OFG~(#...	GVS(KHHFI3J":v66
 	 &z6:: J J'oo J%%mK&H&HIII3xx#g,,***	 	$Z;;JLL5s:OOO5s;;;;6<<<99S>>W$$*555vy)))r7   r-   c                 X    t                               d|            t          | |          S )zhShift into reverse and try the homepage instead of the particular feed
    page that was given as input.z&Probing homepage for feeds instead: %s)rP   rr   r   )r-   r0   s     r5   r   r   4  s)     LL97CCC';///r7   )NFr   )1r;   rn   loggingrv   	itertoolsr   timer   typingr   r   courlanr   r   r	   r
   r   r   deduplicationr   	downloadsr   settingsr   utilsr   	getLoggerr8   rP   r   compilerC   rs   rd   ru   r   r   r&   r=   r>   rF   rW   ry   r   r   r   floatr   r   r?   r7   r5   <module>r      si      				             ! ! ! ! ! ! ! !                - , , , , ,                        		8	$	$  
, rz.//RZ.//
BJ''	
A  BJ''	RZ  " " " " " " " "&73 74 7 7 7 7tCy . T#Y    6%C % %DI % % % %Ps N tCy    0)s )N )tCy ) ) ) )X 8C= T#Y    $ "&	6* 6*	6*#6* 6* 	6*
 
#Y6* 6* 6* 6*r0# 0HSM 0d3i 0 0 0 0 0 0r7   