
    !}gH                     F   d Z dZdZdZddlZddlZddlZddlZddlZddl	m
Z
 ddl	mZ dd	l	mZ dd
l	mZ ddl	mZ ddlmZ ddlmZ ddlmZ ddlmZ ddlmZmZmZmZmZ ddlmZ  ej        e           Z! G d de"          Z# G d de$          Z% G d de"          Z&dS )	newspaperzLucas Ou-YangMITzCopyright 2014, Lucas Ou-Yang    N   )images)network)nlp)settings)urls)DocumentCleaner)Configuration)ContentExtractor)OutputFormatter)	URLHelper	RawHelperextend_configget_available_languagesextract_meta_refresh)VideoExtractorc                       e Zd ZdZdZdZdS )ArticleDownloadStater   r      N)__name__
__module____qualname__NOT_STARTEDFAILED_RESPONSESUCCESS     Q/var/www/py-google-trends/myenv/lib/python3.11/site-packages/newspaper/article.pyr   r      s        KOGGGr   r   c                       e Zd ZdS )ArticleExceptionN)r   r   r   r   r   r    r"   r"   %   s        Dr   r"   c                       e Zd ZdZd)dZd Zd*dZd Zd	 Zd
 Z	d Z
d Zd Zd Zd Zd Zd Zd Zd Zd Zd Zd Zd Zd Zd Zd Zd Zd Zd Zd Zd Zd  Zd! Z d" Z!d# Z"d$ Z#d% Z$d& Z%d' Z&d( Z'dS )+Articlez9Article objects abstract an online news article page
     Nc                    t          |t                    st          |t                    rt          d          |pt                      | _        t	          | j        |          | _        t          | j                  | _        |dk    r2t          j        |          }|d}|dz   t          j	        |          z   }||dk    rt          d          || _
        t          j        || j
                  | _        || _        dx| _        | _        d| _        g x| _        | _        g | _        d| _        g | _        g | _        t/                      | _        g | _        d| _        d| _        d| _        d| _        d| _        t>          j         | _!        d| _"        d| _#        d| _$        d| _%        i | _&        d| _'        d| _(        d| _)        d| _*        d| _+        i | _,        dS )znThe **kwargs argument may be filled with config values, which
        is added into the config object
        zmConfiguration object being passed incorrectly as title or source_url! Please verify `Article`s __init__() fn.r%   Nhttpz://zinput url bad formatF)-
isinstancer   r"   configr   r   	extractorr
   
get_scheme
get_domain
source_urlprepare_urlurltitletop_img	top_imagemeta_imgimgsr   moviestextkeywordsmeta_keywordssettagsauthorspublish_datesummaryhtmlarticle_html	is_parsedr   r   download_statedownload_exception_msgmeta_description	meta_langmeta_favicon	meta_datacanonical_linktop_nodeclean_top_nodedoc	clean_docadditional_data)selfr/   r0   r-   r)   kwargsschemes          r    __init__zArticle.__init__,   s    e]++ 	G:}55	G"FG G G /#DK88)$+66_S))F~%$/#*>*>>Jr!1!1"#9::: %#C99
 )+*t~  #%$	DK  	    EE	   	  2>&*# !#    ! 
 #    "r   c                 ~    |                                   |                                  |                                  dS )zBuild a lone article from a URL independent of the source (newspaper).
        Don't normally call this method b/c it's good to multithread articles
        on a source (newspaper) level.
        N)downloadparser   rM   s    r    buildzArticle.build   s/    
 	






r   r   c                 $   |	 t          j        | j        | j                  }ns# t          j        j        $ rZ}t          j        | _	        t          |          | _        t                              d| j        d| j                   Y d}~dS d}~ww xY w|}| j        j        rCt          |          }|r2|dk     r,|                     t          j        |          |dz             S |                     |           |                     |           dS )zDownloads the link's HTML content, don't use if you are batch async
        downloading articles

        recursion_counter (currently 1) stops refreshes that are potentially
        infinite
        NzDownload failed on URL z because of r   )
input_htmlrecursion_counter)r   get_html_2XX_onlyr/   r)   requests
exceptionsRequestExceptionr   r   rA   strrB   logdebugfollow_meta_refreshr   rR   get_htmlset_html	set_title)rM   rW   r0   rX   r>   emeta_refresh_urls          r    rR   zArticle.download   s7    04;GG&7   &:&J#.1!ff+			888T%@%@B C C C D;* 	=3D99 =$5$9$9}}&/0@AA&7!&; % = = = 	dus   $ BABBc                    |                                   | j                                                            | j                  | _        t          j        | j                  | _        | j        d S | 	                                }|j
        | _
        t          | j                  }t          | j                  }| j                            | j                  }|                     |           | j                            | j                  }|                     |           | j                            | j                  }|                     |           | j        j        r9| j                            | j                   |                    | j                   | j                            | j                  }|                     |           | j                            | j                  }|                     |           | j                            | j        | j                  }	|                     |	           | j                            | j                  }
|                     |
           | j                             | j                  }| !                    |           | j        "                    | j                  }| #                    |           | j        $                    | j        | j                  | _%        |&                    | j                  | _        | j        '                    | j                  | _(        | j(        tS          | j        | j(                  }| *                    |+                                           | j        ,                    | j(                  | _(        t          j        | j(                  | _-        |.                    | j(                  \  }}| /                    |           | 0                    |           | 1                                 d| _2        | 3                                 d S )NT)4throw_if_not_downloaded_verboser)   
get_parser
fromstringr>   rJ   copydeepcopyrK   get_parse_candidate	link_hashr   r   r*   	get_titlerc   get_authorsset_authorsget_meta_langset_meta_languageuse_meta_languageupdate_languagerD   get_faviconset_meta_faviconget_meta_descriptionset_meta_descriptionget_canonical_linkr/   set_canonical_linkextract_tagsset_tagsget_meta_keywordsset_meta_keywordsget_meta_dataset_meta_dataget_publishing_dater<   cleancalculate_best_noderH   r   
set_movies
get_videospost_cleanuprI   get_formattedset_article_htmlset_textfetch_imagesr@   release_resources)rM   parse_candidatedocument_cleaneroutput_formatterr0   r;   rD   rE   rC   rG   r:   r8   rF   video_extractorr6   r?   s                   r    rS   zArticle.parse   s   ,,...;))++66tyAAtx008F 2244(2*4;77*4;77((88u.,,T^<<!!!N00@@	y)));( 	=N**4>:::,,T^<<<~11$.AAl+++ N//?? 	!!"2333::Hdn& &///~**4>::d88N }---N00@@	9%%% N>>HN 
 $))$(33::48DD=$,T[$-HHOOOO6688999 N77FFDM"&-">">D!1!?!?" "D,!!,///MM$     r   c                    | j         | j                            | j        | j                   }|                     |           | j                            | j        | j                   }| j        r|                    | j                   |                     |           | j	        p| 
                                s\| j                            | j        | j	                  }| j        j        r|                     |           n|                     |           | 
                                s"| j        j        r|                                  d S d S d S N)rK   r*   get_meta_img_urlr/   set_meta_imgget_img_urlsr3   addset_imgsrI   has_top_imageget_first_img_urlr)   r   set_top_imgset_top_img_no_checkset_reddit_top_img)rM   meta_img_urlr4   	first_imgs       r    r   zArticle.fetch_images
  sH   >%>::$.* *Ll+++>..txHHD} ('''MM$*43E3E3G3G*88$-/ /I{' 5  ++++)))444!!## 	&(@ 	&##%%%%%	& 	& 	& 	&r   c                 *    | j         d uo
| j         dk    S )Nr%   )r1   rT   s    r    r   zArticle.has_top_image   s    |4'>DLB,>>r   c                 4    t          j        | j                  S )zoPerforms a check on the url of this link to determine if article
        is a real news article or not
        )r
   	valid_urlr/   rT   s    r    is_valid_urlzArticle.is_valid_url#  s     ~dh'''r   c                 D   | j         st          d          | j                            | j                  }| j                            d          }| j                            d          }|dk    rAt          |          | j        j	        k    r$t                              d| j        z             dS |                                 s+| j        s$t                              d| j        z             dS | j        +t          | j                            d                    d
k     r$t                              d| j        z             dS t          |          | j        j	        k     r$t                              d| j        z             dS t          |          | j        j        k     r$t                              d| j        z             dS | j        | j        dk    r$t                              d| j        z             dS t                              d| j        z             dS )zrIf the article's body text is long enough to meet
        standard article requirements, keep the article
        z]must parse article before checking                                     if it's body is valid! .articlez%s verified for article and wcTz%s caught for no media no textFNr   z%s caught for bad titlez%s caught for word cntz%s caught for sent cntr%   z%s caught for no htmlz%s verified for default true)r@   r"   r*   get_meta_typerK   r6   splitlenr)   MIN_WORD_COUNTr^   r_   r/   is_media_newsr0   MIN_SENT_COUNTr>   )rM   	meta_type	wordcount	sentcounts       r    is_valid_bodyzArticle.is_valid_body)  s    ~ 	>" $= > > >N00@@	IOOC((	IOOC((	""s9~~+(- (-II6ABBB4!!## 	DI 	II6ABBB5:TZ%5%5c%:%:!;!;a!?!?II/$(:;;;5y>>DK666II.9:::5y>>DK666II.9:::59	RII-89995		048;<<<tr   c                 0    g d}|D ]}|| j         v r dS dS )z^If the article is related heavily to media:
        gallery, video, big pictures, etc
        )z/videoz/slidez/galleryz/powerpointz/fashionz/glamourz/clothTF)r/   )rM   	safe_urlsss      r    r   zArticle.is_media_newsP  s>    7 7 7	 	 	ADH}}tt ur   c                    |                                   |                                  t          j        | j                                                   t          t          j        | j                  	                                          }t          t          j        | j
                  	                                          }t          t          ||z                       }|                     |           | j        j        }t          j        | j
        | j        |          }d                    |          }|                     |           dS )z#Keyword extraction wrapper
        )r0   r6   	max_sents
N)rg   throw_if_not_parsed_verboser   load_stopwordsr)   get_languagelistr7   r6   keysr0   r9   set_keywordsMAX_SUMMARY_SENT	summarizejoinset_summary)rM   
text_keywstitle_keywskeywsr   summary_sentsr=   s          r    r   zArticle.nlp[  s    	,,...((***4;3355666#,ty11668899
3<
3388::;;Sz12233%   K0	DJTYR[\\\))M**!!!!!r   c                     | j         rt          j        | j        | j                   S t	          j        | j                  S )zyA parse candidate is a wrapper object holding a link hash of this
        article and a final_url of the article
        )r>   r   get_parsing_candidater/   r   rT   s    r    rl   zArticle.get_parse_candidatem  s9     9 	H248TYGGG.tx888r   c                     |                                  }t          j                            |          st          j        |           dS dS )z6Must be called after computing HTML/final URL
        N)get_resource_pathospathexistsmkdir)rM   res_paths     r    build_resource_pathzArticle.build_resource_pathu  sK     ))++w~~h'' 	HX	 	r   c                    d}t           j                            t          j        |          }t           j                            |          st          j        |           t           j                            |d| j        z            }|S )zxEvery article object has a special directory to store data in from
        initialization to garbage collection
        article_resourcesz%s_)r   r   r   r	   TOP_DIRECTORYr   r   rm   )rM   
res_dir_fnresource_directorydir_paths       r    r   zArticle.get_resource_path|  sl     )
W\\(*@*MMw~~011 	)H'(((7<< 2EDN4JKKr   c                     |                                  }t          j        |          D ]'}	 t          j        |           # t          $ r Y $w xY wd S r   )r   globr   removeOSError)rM   r   fnames      r    r   zArticle.release_resources  sl    %%''Yt__ 	 	E	%       	 	s   A
AAc                    	 t          j        |           }|                     |                                           dS # t          $ r}d|j        d         v rt                              d|z             nPd|j        d         v rt                              d|z             n)t                              d|z             Y d}~dS Y d}~dS Y d}~dS d}~wt          $ r(}t                              d|z             Y d}~dS d}~ww xY w)	zWrapper for setting images. Queries known image attributes
        first, then uses Reddit's image algorithm as a fallback.
        z1Can't convert 'NoneType' object to str implicitlyr   z(No pictures found. Top image not set, %sz	timed outz4Download of picture timed out. Top image not set, %szsTypeError other than None type error. Cannot set top image using the Reddit algorithm. Possible error with PIL., %sNzZOther error with setting top image using the Reddit algorithm. Possible error with PIL, %s)
r   Scraperr   largest_image_url	TypeErrorargsr^   r_   critical	Exception)rM   r   rd   s      r    r   zArticle.set_reddit_top_img  s   	Nt$$AQ002233333 	L 	L 	LBafQiOO		DqHIIIIq	))		PSTTUUUU GIJK L L L L L L L L L	 JIIIIIUUUUUU
  	N 	N 	NLL IKLM N N N N N N N N N	Ns"   ;? 
D	A7CDDDc                 @    |r|d | j         j                 | _        d S d S r   )r)   	MAX_TITLEr0   )rM   input_titles     r    rc   zArticle.set_title  s/     	=$%;dk&;%;<DJJJ	= 	=r   c                 D    |d | j         j                 }|r	|| _        d S d S r   )r)   MAX_TEXTr6   )rM   r6   s     r    r   zArticle.set_text  s4    )T[))* 	DIII	 	r   c                     |r[t          |t                    r,| j                                                            |          }|| _        t          j        | _        dS dS )z&Encode HTML before setting it
        N)	r(   bytesr)   rh   get_unicode_htmlr>   r   r   rA   )rM   r>   s     r    rb   zArticle.set_html  sb      	?$&& G{--//@@FFDI"6">D		? 	?r   c                     |r	|| _         dS dS )z7Sets the HTML of just the article's `top_node`
        N)r?   )rM   r?   s     r    r   zArticle.set_article_html  s$      	- ,D	- 	-r   c                 >    || _         |                     |           d S r   )r3   r   rM   src_urls     r    r   zArticle.set_meta_img  s#    !!'*****r   c                     |@t          j        |           }|                    |          r|                     |           d S d S d S r   )r   r   satisfies_requirementsr   )rM   r   r   s      r    r   zArticle.set_top_img  sY    t$$A''00 3))'22222 3 3r   c                 "    || _         || _        dS )zeProvide 2 APIs for images. One at "top_img", "imgs"
        and one at "top_image", "images"
        N)r1   r2   r   s     r    r   zArticle.set_top_img_no_check  s      r   c                 "    || _         || _        dS )z{The motive for this method is the same as above, provide APIs
        for both `article.imgs` and `article.images`
        N)r   r4   )rM   r4   s     r    r   zArticle.set_imgs  s     			r   c                     t          |t                    st          d          |r|d| j        j                 | _        dS dS )z'Keys are stored in list format
        zKeyword input must be list!N)r(   r   r   r)   MAX_KEYWORDSr7   )rM   r7   s     r    r   zArticle.set_keywords  sU     (D)) 	;9::: 	@$%>dk&>%>?DMMM	@ 	@r   c                     t          |t                    st          d          |r|d| j        j                 | _        dS dS )zKAuthors are in ["firstName lastName", "firstName lastName"] format
        zauthors input must be list!N)r(   r   r   r)   MAX_AUTHORSr;   )rM   r;   s     r    rp   zArticle.set_authors  sR     '4(( 	;9::: 	="#;DK$;#;<DLLL	= 	=r   c                 8    |d| j         j                 | _        dS )z]Summary here refers to a paragraph of text from the
        title text and body text
        N)r)   MAX_SUMMARYr=   )rM   r=   s     r    r   zArticle.set_summary  s     7 778r   c                 z    |r4t          |          dk    r#|t                      v r|dd         | _        dS dS dS dS )z5Save langauges in their ISO 2-character form
        r   N)r   r   rD   )rM   rD   s     r    rr   zArticle.set_meta_language  sW      	+Y1,,/1111&rr]DNNN	+ 	+,,11r   c                 N    d |                     d          D             | _        dS )z$Store the keys in list form
        c                 6    g | ]}|                                 S r   )strip).0ks     r    
<listcomp>z-Article.set_meta_keywords.<locals>.<listcomp>  s     JJJAaggiiJJJr   ,N)r   r8   )rM   r8   s     r    r~   zArticle.set_meta_keywords  s/     KJ1D1DS1I1IJJJr   c                     || _         d S r   )rE   )rM   rE   s     r    rv   zArticle.set_meta_favicon  s    (r   c                     || _         d S r   )rC   )rM   rC   s     r    rx   zArticle.set_meta_description  s     0r   c                     || _         d S r   )rF   )rM   rF   s     r    r   zArticle.set_meta_data  s    "r   c                     || _         d S r   )rG   )rM   rG   s     r    rz   zArticle.set_canonical_link   s    ,r   c                     || _         d S r   )r:   )rM   r:   s     r    r|   zArticle.set_tags  s    			r   c                 ,    d |D             }|| _         dS )z*Trim video objects into just urls
        c                 .    g | ]}||j         |j         S r   )src)r   os     r    r   z&Article.set_movies.<locals>.<listcomp>	  s&    BBBaBAEBaeBBBr   N)r5   )rM   movie_objects
movie_urlss      r    r   zArticle.set_movies  s#     CB]BBB
 r   c                     | j         t          j        k    rt          d          | j         t          j        k    rt          d| j        d| j                  dS )zbParse ArticleDownloadState -> log readable status
        -> maybe throw ArticleException
        z'You must `download()` an article first!z!Article `download()` failed with z on URL N)rA   r   r   r"   r   rB   r/   rT   s    r    rg   z'Article.throw_if_not_downloaded_verbose  sl     "6"BBB"#LMMM $8$HHH""...$: ; ; ; IHr   c                 2    | j         st          d          dS )z`Parse `is_parsed` status -> log readable status
        -> maybe throw ArticleException
        z$You must `parse()` an article first!N)r@   r"   rT   s    r    r   z#Article.throw_if_not_parsed_verbose  s*     ~ 	K"#IJJJ	K 	Kr   )r%   r%   N)NNr   )(r   r   r   __doc__rP   rU   rR   rS   r   r   r   r   r   r   rl   r   r   r   r   rc   r   rb   r   r   r   r   r   r   rp   r   rr   r~   rv   rx   r   rz   r|   r   rg   r   r   r   r    r$   r$   )   s\        j" j" j" j"X     :J! J! J!X& & &,? ? ?( ( (% % %N	 	 	" " "$9 9 9  	 	 	  N N N(= = =  
? ? ?- - -+ + +3 3 3! ! !  @ @ @= = =9 9 9+ + +K K K
) ) )1 1 1# # #- - -  ! ! !; ; ;K K K K Kr   r$   )'	__title__
__author____license____copyright__loggingrj   r   r   rZ   r%   r   r   r   r	   r
   cleanersr   configurationr   
extractorsr   outputformattersr   utilsr   r   r   r   r   videos.extractorsr   	getLoggerr   r^   objectr   r   r"   r$   r   r   r    <module>r     s  	
/   				                                 % % % % % % ( ( ( ( ( ( ( ( ( ( ( ( - - - - - -C C C C C C C C C C C C C C - - - - - -g!!    6   	 	 	 	 	y 	 	 	rK rK rK rK rKf rK rK rK rK rKr   