
    !}ga                     (   d Z dZdZdZdZddlZddlZddlZddlm	Z	  ej
        d	          Zd
 Z G d de          Z G d de          Z G d de          Z G d de          Z G d de          Z G d de          Z G d de          ZdS )z+
Stopword extraction and stopword classes.
	newspaperzLucas Ou-YangMITzCopyright 2014, Lucas Ou-Yang    N   )
FileHelperz[\s\t]+c                     t          | t                    rVt          j        t          d|           } d                    |                                           } |                                 S dS )N  )
isinstancestrresub	TABSSPACEjoin
splitlinesstrip)values    N/var/www/py-google-trends/myenv/lib/python3.11/site-packages/newspaper/text.py	innerTrimr      sW    % y#u--((**++{{}}2    c                   8    e Zd Zd Zd Zd Zd Zd Zd Zd Z	dS )		WordStatsc                 0    d| _         d| _        g | _        d S Nr   )stop_word_count
word_count
stop_wordsselfs    r   __init__zWordStats.__init__   s        r   c                     | j         S Nr   r   s    r   get_stop_wordszWordStats.get_stop_words(   
    r   c                     || _         d S r!   r"   )r   wordss     r   set_stop_wordszWordStats.set_stop_words+   s    r   c                     | j         S r!   r   r   s    r   get_stopword_countzWordStats.get_stopword_count.   s    ##r   c                     || _         d S r!   r)   )r   	wordcounts     r   set_stopword_countzWordStats.set_stopword_count1   s    (r   c                     | j         S r!   r   r   s    r   get_word_countzWordStats.get_word_count4   r$   r   c                     || _         d S r!   r/   )r   cnts     r   set_word_countzWordStats.set_word_count7   s    r   N)
__name__
__module____qualname__r   r#   r'   r*   r-   r0   r3    r   r   r   r      s}                 $ $ $) ) )      r   r   c                   X    e Zd Ze                    dd          Zi ZddZd Zd Z	d Z
dS )		StopWordsr	   enc                     || j         vr^t          j                            dd|z            }t	          t          j        |                                                    | j         |<   | j         |         | _        d S )Ntextzstopwords-%s.txt)	_cached_stop_wordsospathr   setr   loadResourceFiler   
STOP_WORDS)r   languager?   s      r   r   zStopWords.__init__@   sm    42227<<(:X(EFFDJ/55@@BBCC #H-1(;r   c                     t          |t                    }|r|                    d          }d t          j        D             }|                    d                              |          }|S )Nzutf-8c                 .    i | ]}t          |          d S r!   )ord).0cs     r   
<dictcomp>z0StopWords.remove_punctuation.<locals>.<dictcomp>M   s     @@@s1vvt@@@r   )r
   r   encodestringpunctuationdecode	translate)r   contentcontent_is_unicodetrans_tablestripped_inputs        r   remove_punctuationzStopWords.remove_punctuationG   sh     (55 	.nnW--G@@V-?@@@ 00::;GGr   c                 ,    |                     d          S )Nr   )split)r   rR   s     r   candidate_wordszStopWords.candidate_wordsR   s    ##C(((r   c                    |st                      S t                      }|                     |          }|                     |                                          }g }d}|D ]%}|dz  }|| j        v r|                    |           &|                    |           |                    t          |                     |	                    |           |S Nr   r   )
r   rS   rV   lowerrB   appendr3   r-   lenr'   )r   rO   wsrR   rV   overlapping_stopwordsrH   ws           r   r*   zStopWords.get_stopword_countU   s     	;;[[0099..~/C/C/E/EFF "  	0 	0AFADO##%,,Q///
!
c"788999
/000	r   N)r:   )r4   r5   r6   r   	maketransTRANS_TABLEr=   r   rS   rV   r*   r7   r   r   r9   r9   ;   sn        --B''K< < < <	 	 	) ) )    r   r9   c                   *     e Zd ZdZd fd	Zd Z xZS )StopWordsChinesezChinese segmentation
    zhc                 Z    t          t          |                               d           d S )Nrc   rC   )superrb   r   r   rC   	__class__s     r   r   zStopWordsChinese.__init__k   s+    %%...=====r   c                 8    dd l }|                    |d          S )Nr   T)cut_all)jiebacut)r   rR   rk   s      r   rV   z StopWordsChinese.candidate_wordsn   s#     	yyy666r   )rc   r4   r5   r6   __doc__r   rV   __classcell__rh   s   @r   rb   rb   h   sV         > > > > > >7 7 7 7 7 7 7r   rb   c                   0     e Zd ZdZd fd	Zd Zd Z xZS )StopWordsArabiczArabic segmentation
    arc                 Z    t          t          |                               d           d S )Nrs   re   )rf   rr   r   rg   s     r   r   zStopWordsArabic.__init__x   s*    ot$$--t-<<<<<r   c                     |S r!   r7   )r   rO   s     r   rS   z"StopWordsArabic.remove_punctuation|   s    r   c                     dd l }|j        j                                        }g }|j                            |          D ]*}|                    |                    |                     +|S r   )nltkstemisriISRIStemmertokenizewordpunct_tokenizerZ   )r   rR   rw   sr&   words         r   rV   zStopWordsArabic.candidate_words   sh    IN&&((M44^DD 	' 	'DLL&&&&r   )rs   )r4   r5   r6   rn   r   rS   rV   ro   rp   s   @r   rr   rr   u   se         = = = = = =        r   rr   c                   *     e Zd ZdZd fd	Zd Z xZS )StopWordsKoreanzKorean segmentation
    koc                 Z    t          t          |                               d           d S )Nr   re   )rf   r   r   rg   s     r   r   zStopWordsKorean.__init__   s*    ot$$--t-<<<<<r   c                    |st                      S t                      }|                     |          }|                     |          }g }d}|D ];}|dz  }| j        D ],}|                    |          r|                    |           -<|                    |           |                    t          |                     |	                    |           |S rX   )
r   rS   rV   rB   endswithrZ   r3   r-   r[   r'   )	r   rO   r\   rR   rV   r]   rH   r^   r}   s	            r   r*   z"StopWordsKorean.get_stopword_count   s     	;;[[0099..~>> "  	4 	4AFA_ 4 4::a== 4)003334 	!
c"788999
/000	r   )r   r4   r5   r6   rn   r   r*   ro   rp   s   @r   r   r      sV         = = = = = =      r   r   c                   *     e Zd ZdZd fd	Zd Z xZS )StopWordsHindizHindi segmentation
    hic                 Z    t          t          |                               d           d S )Nr   re   )rf   r   r   rg   s     r   r   zStopWordsHindi.__init__   s*    nd##,,d,;;;;;r   c                    |st                      S t                      }|                     |          }|                     |          }g }d}|D ]&}|dz  }| j        D ]}|                    |           '|                    |           |                    t          |                     |                    |           |S rX   )	r   rS   rV   rB   rZ   r3   r-   r[   r'   )	r   rO   r\   rR   rV   r]   rH   r^   	stop_words	            r   r*   z!StopWordsHindi.get_stopword_count   s     	;;[[0099..~>> "  	8 	8AFA!_ 8 8	%,,Y77778 	!
c"788999
/000	r   )r   r   rp   s   @r   r   r      sV         < < < < < <      r   r   c                   *     e Zd ZdZd fd	Zd Z xZS )StopWordsJapanesezJapanese segmentation
    jac                 Z    t          t          |                               d           d S )Nr   re   )rf   r   r   rg   s     r   r   zStopWordsJapanese.__init__   s+    &&///>>>>>r   c                 `    dd l }|                                }|                    |          }|S r   )tinysegmenterTinySegmenterr{   )r   rR   r   	segmentertokenss        r   rV   z!StopWordsJapanese.candidate_words   s7    !//11	##N33r   )r   rm   rp   s   @r   r   r      sV         ? ? ? ? ? ?      r   r   )rn   	__title__
__author____license____copyright__r>   r   rK   utilsr   compiler   r   objectr   r9   rb   rr   r   r   r   r7   r   r   <module>r      s    	
/ 				 				       BJz""	         >* * * * * * * *Z
7 
7 
7 
7 
7y 
7 
7 
7    i   &    i   4    Y   2
 
 
 
 
	 
 
 
 
 
r   