
    !}g;                        d Z dZdZdZdZddlZddlmZmZm	Z	 ddl
m
Z
 d	d
lmZ d	dlmZ d	dlmZ d	dlmZ d	dlmZ d	dlmZ d	dlmZ  ej        e          Z G d de          Z G d de          ZdZ G d de          ZdS )ze
Source objects abstract online news source websites & domains.
www.cnn.com would be its own source.
	newspaperzLucas Ou-YangMITzCopyright 2014, Lucas Ou-Yang    N)urljoinurlsplit
urlunsplit)
tldextract   )network)urls)utils)Article)Configuration)ContentExtractor)ANCHOR_DIRECTORYc                       e Zd Zd ZdS )Categoryc                 0    || _         d | _        d | _        d S N)urlhtmldocselfr   s     P/var/www/py-google-trends/myenv/lib/python3.11/site-packages/newspaper/source.py__init__zCategory.__init__   s    	    N__name__
__module____qualname__r    r   r   r   r      s#            r   r   c                       e Zd Zd ZdS )Feedc                 "    || _         d | _        d S r   )r   rssr   s     r   r   zFeed.__init__#   s    r   Nr   r!   r   r   r#   r#   "   s#            r   r#      c                       e Zd ZdZd!dZd Zd Z ej        de	          d             Z
d	 Zd
 Zd Zd Zd Zd Zd Zd Zd Zd Zd Zd Zd Zd"dZd#dZd Zd Zd Zd Zd Zd Zd  Z dS )$SourceaE  Sources are abstractions of online news vendors like huffpost or cnn.
    domain     =  'www.cnn.com'
    scheme     =  'http'
    categories =  ['http://cnn.com/world', 'http://money.cnn.com']
    feeds      =  ['http://cnn.com/rss.atom', ..]
    articles   =  [<article obj>, <article obj>, ..]
    brand      =  'cnn'
    Nc                 p   |d|vs|dd         dk    rt          d          |pt                      | _        t          j        | j        |          | _        t          | j                  | _        || _        t          j	        |          | _        t          j
        | j                  | _        t          j        | j                  | _        g | _        g | _        g | _        d| _        d| _        d| _        d| _        t+          j        | j                  j        | _        d| _        d| _        d| _        dS )zThe config object for this source will be passed into all of this
        source's children articles unless specified otherwise or re-set.
        Nz://   httpzInput url is bad! F)	Exceptionr   configr   extend_configr   	extractorr   r   prepare_url
get_domaindomain
get_schemescheme
categoriesfeedsarticlesr   r   logo_urlfaviconr   extractbranddescription	is_parsedis_downloaded)r   r   r.   kwargss       r   r   zSource.__init__6   s    KU#--3rr7f3D3D/000/)$+v>>)$+66#C((odh//odh//
	'118
"r   c                 F   |                                   |                                  |                                  |                                  |                                  |                                  |                                  |                                  dS )zEncapsulates download and basic parsing with lxml. May be a
        good idea to split this into download() and parse() methods.
        N)downloadparseset_categoriesdownload_categoriesparse_categories	set_feedsdownload_feedsgenerate_articlesr   s    r   buildzSource.buildW   s     	

  """ 	     r   c                 d    |dk    rd |D             |dd<   n|dk    rd |D             |dd<   |S )aG  Delete rejected articles, if there is an articles param,
        purge from there, otherwise purge from source instance.

        Reference this StackOverflow post for some of the wonky
        syntax below:
        http://stackoverflow.com/questions/1207406/remove-items-from-a-
        list-while-iterating-in-python
        r   c                 :    g | ]}|                                 |S r!   )is_valid_url.0as     r   
<listcomp>z)Source.purge_articles.<locals>.<listcomp>r   s'    CCC!..2B2BC1CCCr   Nbodyc                 :    g | ]}|                                 |S r!   )is_valid_bodyrO   s     r   rR   z)Source.purge_articles.<locals>.<listcomp>t   s'    DDD!//2C2CD1DDDr   r!   )r   reasonr8   s      r   purge_articleszSource.purge_articlesh   sU     U??CChCCCHQQQKKvDDhDDDHQQQKr   iQ )secondscache_folderc                 L    | j                             | j        | j                  S )zThe domain param is **necessary**, see .utils.cache_disk for reasons.
        the boilerplate method is so we can use this decorator right.
        We are caching categories for 1 day.
        )r0   get_category_urlsr   r   )r   r3   s     r   _get_category_urlszSource._get_category_urlsw   s      ~//$(CCCr   c                 \    |                      | j                  }d |D             | _        d S )Nc                 .    g | ]}t          |           S r   r   rP   r   s     r   rR   z)Source.set_categories.<locals>.<listcomp>   s#    ===8,,,===r   )r\   r3   r6   )r   r   s     r   rD   zSource.set_categories   s0    &&t{33=====r   c                 l    g d} fd|D             }t           j                  }|j        dv rp|j                            d          rVd|j                            d          d         z   }|j        |j        |ddf}|                    t          |                     d	 |D             }d
 |D             }t          j
        | j                  }t          |          D ]A\  }}	||         j        }
|
r-|
j        r&t          j        |
j        |
          ||         _        Bd |D             }|D ]:}	 j                                                            |	j                  }||	_        ;d |D             } j        |z   } j                             j        |          }d |D              _        dS )zVDon't need to cache getting feed urls, it's almost
        instant with xpath
        )z/feedz/feedsz/rssc                 :    g | ]}t          j        |          S r!   )r   r   )rP   r   r   s     r   rR   z$Source.set_feeds.<locals>.<listcomp>   s%    OOOsGDHc22OOOr   )z
medium.comzwww.medium.comz/@z/feed//r	   r,   c                 .    g | ]}t          |           S r_   ra   rb   s     r   rR   z$Source.set_feeds.<locals>.<listcomp>   s#    )X)X)X(s*;*;*;)X)X)Xr   c                     g | ]	}|j         
S r!   r`   rP   cs     r   rR   z$Source.set_feeds.<locals>.<listcomp>   s    GGG1GGGr   responsec                      g | ]}|j         	|S r!   r   rh   s     r   rR   z$Source.set_feeds.<locals>.<listcomp>   s!    )^)^)^WXW])^!)^)^)^r   c                      g | ]}|j         	|S r   r   rh   s     r   rR   z$Source.set_feeds.<locals>.<listcomp>   s)     *= *= *=*+%*; +,*;*;*;r   c                 .    g | ]}t          |           S r_   )r#   rb   s     r   rR   z$Source.set_feeds.<locals>.<listcomp>   s     444dsmmm444r   N)r   r   netlocpath
startswithsplitr5   appendr   r
   multithread_requestr.   	enumeraterespokget_htmlr   
get_parser
fromstringr   r6   r0   get_feed_urlsr7   )r   common_feed_urlsrt   new_path	new_partscommon_feed_urls_as_categoriescategory_urlsrequestsindex_rk   r   categories_and_common_feed_urlsr   s   `             r   rG   zSource.set_feeds   s    766OOOO>NOOO""<;;;z$$T** ?#ej&6&6s&;&;A&>>!L%,"bH	 ''
9(=(=>>>)X)XGW)X)X)X&GG(FGGG.}dkJJ!"@AA 	5 	5HE1+H 5HK 5=D=ML8>5 >5 >5.u5: *_)^5S)^)^)^&/ 	 	A+((**55af==CAEE*= *=5S *= *= *=& +//<Z*Z'~++DH6UVV44t444


r   c                 R    | j                             | j                  }|| _        dS )z\Sets a blurb for this source, for now we just query the
        desc html attribute
        N)r0   get_meta_descriptionr   r=   )r   descs     r   set_descriptionzSource.set_description   s)     ~2248<<r   c                 N    t          j        | j        | j                  | _        dS )z!Downloads html of source
        N)r
   rz   r   r.   r   rJ   s    r   rB   zSource.download   s      $TXt{;;			r   c                    d | j         D             }t          j        || j                  }t	          | j                   D ]{\  }}||         }|j        1t          j        |j        |j                  | j         |         _        Et          
                    d| j         |         j        d| j        d           |d | j         D             | _         dS )z7Download all category html, can use mthreading
        c                     g | ]	}|j         
S r!   r`   rh   s     r   rR   z.Source.download_categories.<locals>.<listcomp>   s    8881888r   Nrj   zDeleting category  from source  due to download errorc                      g | ]}|j         	|S r!   rm   rh   s     r   rR   z.Source.download_categories.<locals>.<listcomp>   s    @@@@1@@@r   )r6   r
   rv   r.   rw   rx   rz   r   r   logwarning)r   r   r   r   r   reqs         r   rE   zSource.download_categories   s     98888.}dkJJ!$/22 	E 	EHE15/Cx#.5.>Gch/0 /0 /0&++ "oe4888$(((D E E E E A@do@@@r   c                    d | j         D             }t          j        || j                  }t	          | j                   D ]{\  }}||         }|j        1t          j        |j        |j                  | j         |         _        Et          
                    d| j        |         j        d| j        d           |d | j         D             | _         dS )z3Download all feed html, can use mthreading
        c                     g | ]	}|j         
S r!   r`   rP   fs     r   rR   z)Source.download_feeds.<locals>.<listcomp>   s    ///qQU///r   Nrj   zDeleting feed r   r   c                      g | ]}|j         	|S r!   )r%   r   s     r   rR   z)Source.download_feeds.<locals>.<listcomp>   s    555Aqu5a555r   )r7   r
   rv   r.   rw   rx   rz   r   r%   r   r   r6   )r   	feed_urlsr   r   r   r   s         r   rH   zSource.download_feeds   s     0/DJ///	.y$+FF!$*-- 	E 	EHE15/Cx#(/(8Gch)0 )0 )0
5!%% "oe4888$(((D E E E E 65555


r   c                     | j                                                             | j                  | _        | j        $t
                              d| j        z             dS |                                  dS )zfSets the lxml root, also sets lxml roots of all
        children links, also sets description
        NzSource %s parse error.)	r.   r{   r|   r   r   r   r   r   r   rJ   s    r   rC   zSource.parse   sh    
 ;))++66tyAA8KK048;<<<Fr   c                    t                               dt          | j                  z             | j        D ]:}| j                                                            |j                  }||_        ;d | j        D             | _        dS )z1Parse out the lxml root in each category
        z$We are extracting from %d categoriesc                      g | ]}|j         	|S r   ro   rh   s     r   rR   z+Source.parse_categories.<locals>.<listcomp>   s    KKK9J19J9J9Jr   N)	r   debuglenr6   r.   r{   r|   r   r   )r   categoryr   s      r   rF   zSource.parse_categories   s     			8do&&' 	( 	( 	( 	 	H+((**55hmDDCHLLKKdoKKKr   c                    | j                                                             |j                  }|d S | j                                                             |d          }t          d |D             | j                  |_        |S )Ntitle)tagc              3   2   K   | ]}|j         	|j         V  d S r   )text)rP   elements     r   	<genexpr>z,Source._map_title_to_feed.<locals>.<genexpr>   s+      PPG7<P7<PPPPPPr   )r.   r{   r|   r%   getElementsByTagnextr<   r   )r   feedr   elementss       r   _map_title_to_feedzSource._map_title_to_feed   s~    k$$&&11$(;;;4;))++<<Sg<NNPPxPPPRVR\]]
r   c                      t                               dt           j                  z              fd j        D              _        dS )zAdd titles to feeds
        zWe are parsing %d feedsc                 :    g | ]}                     |          S r!   )r   )rP   r   r   s     r   rR   z&Source.parse_feeds.<locals>.<listcomp>   s'    EEEQd--a00EEEr   N)r   r   r   r7   rJ   s   `r   parse_feedszSource.parse_feeds   sO     			+dj//" 	# 	# 	#EEEE$*EEE


r   c                    g }| j         D ]}| j                            |j        d          }g }t	          |          }|D ]3}t          ||j        | j                  }|                    |           4| 	                    d|          }t	          |          }| j        j
        rt          j
        | |          }t	          |          }	|                    |           t                              d|||	|j        fz             |S )z1Returns articles given the url of a feed
        T)regex)r   
source_urlr.   r   %d->%d->%d for %s)r7   r0   get_urlsr%   r   r   r   r.   ru   rW   memoize_articlesr   extendr   r   )
r   r8   r   r   cur_articlesbefore_purger   articleafter_purge
after_memos
             r   feeds_to_articleszSource.feeds_to_articles   s-    J 	I 	ID>**484*@@DLt99L - -!#x;( ( ( ##G,,,,..ulCCLl++K{+ J$5dLII\**JOOL)))II)#[*dhGH I I I Ir   c                 4   g }| j         D ]}g }| j                            |j        d          }t	          |          }|D ]D}|d         }|d         }t          ||j        || j                  }	|                    |	           E| 	                    d|          }t	          |          }
| j        j
        rt          j
        | |          }t	          |          }|                    |           t                              d||
||j        fz             |S )zTakes the categories, splays them into a big list of urls and churns
        the articles out of each url with the url_to_article method
        T)titlesr   r	   )r   r   r   r.   r   r   )r6   r0   r   r   r   r   r   r.   ru   rW   r   r   r   r   r   )r   r8   r   r   url_title_tupsr   tup	indiv_urlindiv_title_articler   r   s               r   categories_to_articleszSource.categories_to_articles  sD     	M 	MHL!^44X\$4OON~..L% 
. 
.F	!!f"!'|%;	   ##H----..ulCCLl++K{+ J$5dLII\**JOOL)))II)#[*hlKL M M M Mr   c                     |                                  }|                                 }||z   }d |D             }t          |                                          S )zGReturns a list of all articles, from both categories and feeds
        c                     i | ]
}|j         |S r!   r`   rP   r   s     r   
<dictcomp>z-Source._generate_articles.<locals>.<dictcomp>F  s    ===W===r   )r   r   listvalues)r   category_articlesfeed_articlesr8   uniqs        r   _generate_articleszSource._generate_articles?  s[     !7799..00 #44==H===DKKMM"""r     c                     |                                  }|d|         | _        t                              dt	          |          |           dS )zGSaves all current articles of news source, filter out bad urls
        Nz&%d articles generated and cutoff at %d)r   r8   r   r   r   )r   limitr8   s      r   rI   zSource.generate_articlesI  sS     **,, %(		:h--	( 	( 	( 	( 	(r   r	   c                    d | j         D             }g }|dk    rt          | j                   D ]j\  }}||         }t          j        || j                  }| j         |                             |           |s |                    | j         |                    kd | j         D             | _         n|t          k    r"t          	                    dt          z             t          j
        || j                  }t          |          D ]l\  }}	t          j        |	j        |	j                  }| j         |                             |           |	j        s |                    | j         |                    md | j         D             | _         d| _        t          |          d	k    r<t          	                    d
d                    d |D                       z             dS dS )z0Downloads all articles attached to self
        c                     g | ]	}|j         
S r!   r`   rO   s     r   rR   z,Source.download_articles.<locals>.<listcomp>U  s    ---!---r   r	   )r.   c                      g | ]}|j         	|S r!   rm   rO   s     r   rR   z,Source.download_articles.<locals>.<listcomp>_      @@@1@Q@@@r   zAUsing %s+ threads on a single source may result in rate limiting!rj   c                      g | ]}|j         	|S r!   rm   rO   s     r   rR   z,Source.download_articles.<locals>.<listcomp>k  r   r   Tr   z2The following article urls failed the download: %sz, c                     g | ]	}|j         
S r!   r`   rO   s     r   rR   z,Source.download_articles.<locals>.<listcomp>p  s    "B"B"BQ15"B"B"Br   N)r8   rw   r
   rz   r.   set_htmlru   !NUM_THREADS_PER_SOURCE_WARN_LIMITr   r   rv   r   rx   r?   r   join)
r   threadsr   failed_articlesr   r   r   r   filled_requestsr   s
             r   download_articleszSource.download_articlesQ  s    .-t}---a<<"+DM":": A Aw5k'DK@@@e$--d333 A#**4=+?@@@@@@@@DMM::: ;>_` a a a%9$LLO'88 A A
s'#(CCCe$--d333x A#**4=+?@@@@@@@@DM!!##KKL		"B"B/"B"B"BCCD E E E E E $#r   c                     t          | j                  D ]\  }}|                                 |                     d| j                  | _        d| _        dS )z0Parse all articles, delete if too small
        rS   TN)rw   r8   rC   rW   r>   )r   r   r   s      r   parse_articleszSource.parse_articlesr  sT     (66 	 	NE7MMOOOO++FDMBBr   c                 <    | j         dS t          | j                   S )z6Number of articles linked to this news source
        Nr   )r8   r   rJ   s    r   sizezSource.size{  s!     = 14=!!!r   c                 .    t          j        |            dS )zCClears the memoization cache for this specific news domain
        N)r   clear_memo_cacherJ   s    r   clean_memo_cachezSource.clean_memo_cache  s     	t$$$$$r   c                 $    d | j         D             S )z$Returns a list of feed urls
        c                     g | ]	}|j         
S r!   r`   )rP   r   s     r   rR   z$Source.feed_urls.<locals>.<listcomp>  s    000T000r   )r7   rJ   s    r   r   zSource.feed_urls  s     10TZ0000r   c                 $    d | j         D             S )z(Returns a list of category urls
        c                     g | ]	}|j         
S r!   r`   )rP   r   s     r   rR   z(Source.category_urls.<locals>.<listcomp>  s    ======r   )r6   rJ   s    r   r   zSource.category_urls  s     >=T_====r   c                 $    d | j         D             S )z'Returns a list of article urls
        c                     g | ]	}|j         
S r!   r`   r   s     r   rR   z'Source.article_urls.<locals>.<listcomp>  s    999999r   )r8   rJ   s    r   article_urlszSource.article_urls  s     :94=9999r   c                    t          d| j                   t          d| j                   t          d| j                   t          dt	          | j                             t          d| j        dd                    t          d           | j        dd	         D ]}t          d
d|j                   t          d|j                   t          dt	          |j                             t          d|j	                   t          dt	          |j
                             t          d           t          d|                                            t          d           t          d|                                            dS )z@Prints out a summary of the data in our source instance
        z[source url]:z[source brand]:z[source domain]:z[source len(articles)]:z[source description[:50]]:N2   z"printing out 10 sample articles...
   	z[url]:z		[title]:z	[len of text]:z	[keywords]:z	[len of html]:z	==============z
feed_urls:z
zcategory_urls:)printr   r<   r3   r   r8   r=   r   r   keywordsr   r   r   )r   rQ   s     r   print_summaryzSource.print_summary  sa    	otx(((,,, $+...'T]););<<<*D,<SbS,ABBB2333ss# 	& 	&A$!%(((,((($c!&kk222/1:...$c!&kk222$%%%%lDNN,,---f 2 2 4 455555r   r   )r   )r	   )!r   r   r    __doc__r   rK   rW   r   
cache_diskr   r\   rD   rG   r   rB   rE   rH   rC   rF   r   r   r   r   r   rI   r   r   r   r   r   r   r   r   r!   r   r   r(   r(   ,   s        # # # #B! ! !"   Uy8HIIID D JID> > >%5 %5 %5N     < < <
A A A"6 6 6"	 	 		L 	L 	L  F F F  :! ! !F# # #( ( ( (E E E EB  " " "% % %
1 1 1
> > >
: : :
6 6 6 6 6r   r(   )r   	__title__
__author____license____copyright__loggingurllib.parser   r   r   r   r,   r
   r   r   r   r   configurationr   
extractorsr   settingsr   	getLoggerr   r   objectr   r#   r   r(   r!   r   r   <module>r      s    	
/  6 6 6 6 6 6 6 6 6 6 ! ! ! ! ! !                         ( ( ( ( ( ( ( ( ( ( ( ( & & & & & &g!!    v       6    %& !6 6 6 6 6V 6 6 6 6 6r   