
    !}gW/                     ,   d Z dZdZdZdZddlZddlZddlZddlZddl	Z	ddl
Z
ddlZddlZddlZddlZddlZddlmZ ddlmZ d	d
lmZ  ej        e          Ze                    ej                    G d de          Z G d de          Z G d de          Z G d de          Z G d de          Z G d de          Z  G d de          Z! G d de"          Z#d Z$d Z%d Z&d Z'd Z(d  Z)d-d#Z*d$ Z+d% Z,d& Z-d' Z.d( Z/d) Z0d* Z1d+ Z2d, Z3dS ).zO
Holds misc. utility methods which prove to be
useful throughout this library.
	newspaperzLucas Ou-YangMITzCopyright 2014, Lucas Ou-Yang    N)sha1)BeautifulSoup   )settingsc                   $    e Zd Zed             ZdS )
FileHelperc                    t           j                            |           sct           j                            t           j                            t
                              }t           j                            |d|           }n| }	 t          j        |dd          }|	                                }|
                                 |S # t          $ r t          d|z            w xY w)N	resourcesrutf-8zCouldn't open file %s)ospathisabsabspathdirname__file__joincodecsopenreadcloseIOError)filenamedirpathr   fcontents        O/var/www/py-google-trends/myenv/lib/python3.11/site-packages/newspaper/utils.pyloadResourceFilezFileHelper.loadResourceFile"   s    w}}X&& 	goobgooh&?&?@@G7<<h??DDD	:D#w//AffhhGGGIIIN 	: 	: 	:1D8999	:s   ?C C#N)__name__
__module____qualname__staticmethodr         r   r
   r
   !   s-        : : \: : :r&   r
   c                       e Zd Zd ZdS )ParsingCandidatec                 "    || _         || _        d S N)url	link_hash)selfr+   r,   s      r   __init__zParsingCandidate.__init__4   s    "r&   N)r!   r"   r#   r.   r%   r&   r   r(   r(   2   s#        # # # # #r&   r(   c                   $    e Zd Zed             ZdS )	RawHelperc                     t          |t                    r|                    dd          }t          j        |                                          dt          j                    }t          | |          S )Nr   replace.)
isinstancestrencodehashlibmd5	hexdigesttimer(   )r+   raw_htmlr,   s      r   get_parsing_candidatezRawHelper.get_parsing_candidate:   se    h$$ 	;w	::H&{844>>@@@@$)+++N	Y///r&   Nr!   r"   r#   r$   r<   r%   r&   r   r0   r0   9   s-        0 0 \0 0 0r&   r0   c                   $    e Zd Zed             ZdS )	URLHelperc                     d| v r|                      dd          n| }t          j        |                                          dt	          j                    }t          ||          S )Nz#!z?_escaped_fragment_=r3   )r2   r7   r8   r9   r:   r(   )url_to_crawl	final_urlr,   s      r   r<   zURLHelper.get_parsing_candidateC   sn     |## !((/EFFF)5 	&{955??AAAA49;;;O		9555r&   Nr=   r%   r&   r   r?   r?   B   s-        6 6 \6 6 6r&   r?   c                       e Zd Zd Zd ZdS )StringSplitterc                 8    t          j        |          | _        d S r*   )recompilepattern)r-   rH   s     r   r.   zStringSplitter.__init__M   s    z'**r&   c                 >    |sg S | j                             |          S r*   )rH   splitr-   strings     r   rJ   zStringSplitter.splitP   s%     	I|!!&)))r&   N)r!   r"   r#   r.   rJ   r%   r&   r   rD   rD   L   s2        + + +* * * * *r&   rD   c                       e Zd Zd Zd ZdS )StringReplacementc                 "    || _         || _        d S r*   )rH   replaceWithr-   rH   rP   s      r   r.   zStringReplacement.__init__W   s    &r&   c                 @    |sdS  |j         | j        | j                  S N )r2   rH   rP   rK   s     r   
replaceAllzStringReplacement.replaceAll[   s(     	2v~dlD,<===r&   N)r!   r"   r#   r.   rU   r%   r&   r   rN   rN   V   s2        ' ' '> > > > >r&   rN   c                   *    e Zd Zd ZddZddZd ZdS )ReplaceSequencec                     g | _         d S r*   )replacementsr-   s    r   r.   zReplaceSequence.__init__b   s    r&   Nc                 ^    t          ||pd          }| j                            |           | S rS   )rN   rY   append)r-   firstPatternrP   results       r   createzReplaceSequence.createe   s2    "<1BCC  (((r&   c                 .    |                      ||          S r*   )r_   rQ   s      r   r\   zReplaceSequence.appendj   s    {{7K000r&   c                 P    |sdS |}| j         D ]}|                    |          }|S rS   )rY   rU   )r-   rL   mutatedStringrps       r   rU   zReplaceSequence.replaceAllm   s?     	2# 	9 	9BMM-88MMr&   r*   )r!   r"   r#   r.   r_   r\   rU   r%   r&   r   rW   rW   a   sZ             
1 1 1 1    r&   rW   c                       e Zd ZdS )TimeoutErrorN)r!   r"   r#   r%   r&   r   re   re   w   s        Dr&   re   c                       fd}|S )z+Borrowed from web.py, rip Aaron Swartz
    c                       fd}|S )Nc                        G  fddt           j                  } |            }|                               |                                rt	                      |j        r! |j        d         |j        d                   |j        S )Nc                   $    e Zd Zd Z fdZdS )3timelimit.<locals>._1.<locals>._2.<locals>.Dispatchc                     t           j                            |            d | _        d | _        |                     d           |                                  d S )NT)	threadingThreadr.   r^   error	setDaemonstartrZ   s    r   r.   z<timelimit.<locals>._1.<locals>._2.<locals>.Dispatch.__init__   sJ    $--d333"&DK!%DJNN4(((JJLLLLLr&   c                 d    	  i | _         d S #  t          j                    | _        Y d S xY wr*   )r^   sysexc_inforn   )r-   argsfunctionkws    r   runz7timelimit.<locals>._1.<locals>._2.<locals>.Dispatch.run   s>    4&.h&;&;&;4%(\^^



s    /N)r!   r"   r#   r.   rw   )rt   ru   rv   s   r   Dispatchrj      sG        ! ! !4 4 4 4 4 4 4 4 4r&   rx   r   r   )rl   rm   r   isAlivere   rn   r^   )rt   rv   rx   cru   timeouts   ``  r   _2z!timelimit.<locals>._1.<locals>._2   s    4 4 4 4 4 4 4 4 49+ 4 4 4 

AFF7OOOyy{{ %"nn$w - agaj,,,8Or&   r%   )ru   r|   r{   s   ` r   _1ztimelimit.<locals>._1~   s)    	 	 	 	 	 	, 	r&   r%   )r{   r}   s   ` r   	timelimitr~   {   s#        0 Ir&   c                 h    |                      dd          }|d         dk    r
|dd         }|dz  }|S )zjAll '/' are turned into '-', no trailing. schema's
    are gone, only the raw domain + ".txt" remains
    /-Nz.txtr2   )domainr   s     r   domain_to_filenamer      sB     ~~c3''H|sCRC=HOr&   c                 >    |                      dd          dd         S )z[:-4] for the .txt at end
    r   r   Nr   )r   s    r   filename_to_domainr      s#     C%%crc**r&   c                 2    d }| D ]} ||          s dS dS )z'True if a word is only ascii chars
    c                 0    t          |           dk    rdS | S )N   rT   )ord)chars    r   	onlyasciizis_ascii.<locals>.onlyascii   s    t99s??2Kr&   FTr%   )wordr   rz   s      r   is_asciir      sC      
   y|| 	55	4r&   c                 r   t          | d          }|                    dddi          }|r	 |d                             d          \  }}|                                                    d          r2|d	d
                             dd                              dd          S d
S # t          $ r Y d
S w xY wd
S )ae   Parses html for a tag like:
    <meta http-equiv="refresh" content="0;URL='http://sfbay.craigslist.org/eby/cto/5617800926.html'" />
    Example can be found at: https://www.google.com/url?rct=j&sa=t&url=http://sfbay.craigslist.org/eby/cto/
    5617800926.html&ct=ga&cd=CAAYATIaYTc4ZTgzYjAwOTAwY2M4Yjpjb206ZW46VVM&usg=AFQjCNF7zAl6JPuEsV4PbEzBomJTUpX4Lg
    zhtml.parsermetaz
http-equivrefresh)attrsr   ;zurl=   N"rT   ')r   findrJ   lower
startswithr2   
ValueError)htmlsoupelement	wait_parturl_parts        r   extract_meta_refreshr      s     }--Dii|Y&?i@@G F	F"))"4":":3"?"?Ix ~~**622 F|++C44<<S"EEEF F  	 	 	 44	F Fs   B& &
B43B4c                 |    dt           j        t           j        d                    fd| D                       S )zZConverts arbitrary string (for us domain name)
    into a valid file name for caching
    z-_.() rT   c              3   $   K   | ]
}|v |V  d S r*   r%   ).0rz   valid_charss     r   	<genexpr>z$to_valid_filename.<locals>.<genexpr>   s-      441#3#31#3#3#3#344r&   )rL   ascii_lettersdigitsr   )sr   s    @r   to_valid_filenamer      sB      #)"6"6FK774444a444444r&   逗 /tmpc                       fd}|S )zACaching extracting category locations & rss feeds for 5 days
    c                       fd}|S )Nc                  D   t          t          | d                   t          |          z                       d                                                    }t          j                            |          }t          j                            |          r]t          j                            |          }t          j	                    |z
  }|	k     r"t          j        t          |d                    S  | i |}t          j        |t          |d                     |S )zCalculate a cache key based on the decorated method signature
            args[1] indicates the domain of the inputs, we hash on domain!
            r   r   rbwb)r   r5   r6   r9   r   r   r   existsgetmtimer:   pickleloadr   dump)
rt   kwargskeyfilepathmodifiedage_secondsr^   cache_folderru   secondss
          r   inner_functionz4cache_disk.<locals>.do_cache.<locals>.inner_function   s     DGF$%+VG__6 66?ikk w||L#66H w~~h'' =7++H55"ikkH4((!;tHd';';<<< Xt.v..FKXt 4 4555Mr&   r%   )ru   r   r   r   s   ` r   do_cachezcache_disk.<locals>.do_cache   s/    	 	 	 	 	 	 	* r&   r%   )r   r   r   s   `` r   
cache_diskr      s)         . Or&   c                       fd}|S )z;Prints out the runtime duration of a method in seconds
    c                      t          j                     } | i |}t          j                     }t          dj        ||z
  fz             |S )Nz%r %2.2f sec)r:   printr!   )rt   rv   tsr^   temethods        r   timedzprint_duration.<locals>.timed   sP    Y[[$$$Y[[nb99:::r&   r%   )r   r   s   ` r   print_durationr      s#         Lr&   c              #      K   t          t          |           |z            }t          d|dz
            D ]}| ||z  ||z  |z            V  | ||z  |z
  d         V  dS )z%Yield n successive chunks from l
    r   r   N)intlenrange)lnnewnis       r   chunksr      s       s1vvz??D1a!e__ * *DTD()))))
AHtO
r&   c                     t          j        |           D ]I}t          j        ||          r2t          j        t           j                            | |                     JdS )z+Delete files in a dir matching pattern
    N)r   listdirrF   searchremover   r   )fnrH   r   s      r   purger   	  sZ     Z^^ + +9Wa   	+Ibgll2q))***+ +r&   c                    t           j                            t          j        t          | j                            }t           j                            |          rt          j        |           dS t          d| j        d           dS )z?Clears the memoization cache for this specific news domain
    zmemo file forzhas already been deleted!N)
r   r   r   r   MEMO_DIRr   r   r   r   r   )sourced_pths     r   clear_memo_cacher     so     GLL*,>v},M,MNNE	w~~e K
	%ov}.IJJJJJr&   c                    | j         }| j        }t          |          dk    rg S i }d |D             }t          j                            t          j        t          |                    }t          j        	                    |          rt          j        |dd          }|                                }|                                 d |D             }d |D             }t          |                                          D ]\  }	}
|                    |	          r||	= t          |                                          t          |                                          z   }d                    d |D                       }n>d                    d	 t          |                                          D                       }t          |          |j        k    rt&                              d
           d}t          j        |dd          }|                    |           |                                 t          |                                          S )a
  When we parse the <a> links in an <html> page, on the 2nd run
    and later, check the <a> links of previous runs. If they match,
    it means the link must not be an article, because article urls
    change as time passes. This method also uniquifies articles.
    r   c                     i | ]
}|j         |S r%   )r+   )r   articles     r   
<dictcomp>z$memoize_articles.<locals>.<dictcomp>(  s    AAAWGKAAAr&   r   utf8c                 6    g | ]}|                                 S r%   strip)r   us     r   
<listcomp>z$memoize_articles.<locals>.<listcomp>/  s     (((a		(((r&   c                     i | ]}|d S )Tr%   )r   r+   s     r   r   z$memoize_articles.<locals>.<dictcomp>1  s    ***cT***r&   z
c                 6    g | ]}|                                 S r%   r   r   hrefs     r   r   z$memoize_articles.<locals>.<listcomp>:  s     333dTZZ\\333r&   c                 6    g | ]}|                                 S r%   r   r   s     r   r   z$memoize_articles.<locals>.<listcomp>>  s     @@@dTZZ\\@@@r&   zmemo overflow, dumpingrT   wr   )r   configr   r   r   r   r   r   r   r   r   r   	readlinesr   listitemsgetkeysMAX_FILE_MEMOlogcriticalwritevalues)r   articlessource_domainr   memocur_articlesr   r   urlsr+   r   
valid_urls	memo_textffs                 r   memoize_articlesr    s    MM]F
8}}	DAAAAALGLL*,>},M,MNNE	w~~e BKsF++{{}}				((4(((**T*** !3!3!5!566 	& 	&LCxx}} & %$))++&&l.?.?.A.A)B)BB
KK33z3335 5		 KK@@d<+<+<+>+>&?&?@@@B B	 4yy6'''-...	 
UC	)	)BHHYHHJJJ##%%&&&r&   c                     t          t          j        d          5 } |                                 }t	          j        dt          |          dz
            }||         }|                                cddd           S # 1 swxY w Y   dS )z:Uses generator to return next useragent in saved file
    r   r   r   N)r   r   
USERAGENTSr   randomrandintr   r   )r   agents	selectionagents       r   get_useragentr	  M  s     
h!3	'	' 1N1c&kkAo66	y!{{}}	                 s   AA==BBc                      t          j        t           j                            t          j                            } d | D             }|D ]}t          |          dk    sJ |S )zGReturns a list of available languages and their 2 char input codes
    c                 v    g | ]6}|                     d           d                              d          d         7S )r   r   r3   r   )rJ   )r   r   s     r   r   z+get_available_languages.<locals>.<listcomp>[  s9    KKKqQWWS\\!_**3//2KKKr&      )r   r   r   r   r   STOPWORDS_DIRr   )stopword_filestwo_dig_codesds      r   get_available_languagesr  W  sa     ZX-C D DEENKKNKKKM  1vv{{{{{r&   c                     i ddddddddd	d
dddddddddddddddddddddd d!d"i d#d$d%d&d'd(d)d*d+d,d-d.d/d0d1d2d3d4d5d6d7d8d9d:d;d<d=d>d?d@dAdBdCdDdEdFdG} t                      }t          dH           t          dI           |D ]}t          dJ|dK| |                    t                       dLS )Mz5Prints available languages with their full names
    arArabicruRussiannlDutchdeGermanenEnglishesSpanishfrFrenchheHebrewitItaliankoKoreanno	Norwegiannbu   Norwegian (Bokmål)faPersianplPolishpt
PortuguesesvSwedishhu	HungarianfiFinnishdaDanishzhChineseid
Indonesianvi
Vietnamesemk
MacedoniantrTurkishelGreekuk	UkrainianhiHindiswSwahilibg	BulgarianhrCroatianroRomaniansl	SloveniansrSerbianetEstonianJapanese
Belarusian)jabez
Your available languages are:z
input code		full namez  z			  N)r  r   )language_dictcodescodes      r   print_available_languagesr]  a  s3   %h%i% 	g% 	h	%
 	i% 	i% 	h% 	h% 	i% 	h% 	k% 	#% 	i% 	h% 	l%  	i!%" 	k#% %$ 	i%%& 	h'%( 	i)%* 	l+%, 	l-%. 	l/%0 	i1%2 	g3%4 	k5%6 	g7%8 	i9%: 	k;%< 	j=%> 	j?%@ 	kA%B 	iC%D 	jE% %F I% % %MN $%%E	
+,,,	
%&&& > >$$$d(;(;<====	GGGGGr&   c                     t          |                                          D ]&\  }}t          | |          rt          | ||           '| S )z
    We are handling config value setting like this for a cleaner api.
    Users just need to pass in a named param to this source and we can
    dynamically generate a config object for it.
    )r   r   hasattrsetattr)r   config_itemsr   vals       r   extend_configrc    sU     ++--.. & &S63 	&FC%%%Mr&   )r   r   )4__doc__	__title__
__author____license____copyright__r   r7   loggingr   r   r  rF   rL   rr   rl   r:   r   bs4r   rT   r   	getLoggerr!   r   setLevelDEBUGobjectr
   r(   r0   r?   rD   rN   rW   	Exceptionre   r~   r   r   r   r   r   r   r   r   r   r   r  r	  r  r]  rc  r%   r&   r   <module>rp     s>    	
/    				   				  



                       g!! W]   : : : : : : : :"# # # # #v # # #0 0 0 0 0 0 0 06 6 6 6 6 6 6 6* * * * *V * * *> > > > > > > >    f   ,	 	 	 	 	9 	 	 	  <  + + +  F F F.5 5 5   :	 	 	  + + +K K K/' /' /'d    / / /d
 
 
 
 
r&   