
    !}g                         d Z dZdZdZdZddlZddlZddlmZ ddl	m
Z
 d	d
lmZ dZ e            ad ZddZd Zd Zd Zd Zd Zd Zd Zd Zd ZdS )zH
Anything natural language related should be abstracted into this file.
	newspaperzLucas Ou-YangMITzCopyright 2014, Lucas Ou-Yang    N)path)Counter   )settingsg      4@c                 l   | dk    rt           j        }n2t          j        t           j        d                    |                     }t          |dd          5 }t                              t          d |
                                D                                  ddd           dS # 1 swxY w Y   dS )zB 
    Loads language-specific stopwords for keyword selection
    enzstopwords-{}.txtrzutf-8)encodingc                 6    g | ]}|                                 S  )strip).0ws     M/var/www/py-google-trends/myenv/lib/python3.11/site-packages/newspaper/nlp.py
<listcomp>z"load_stopwords.<locals>.<listcomp>%   s     ???Aaggii???    N)r   NLP_STOPWORDS_ENr   joinSTOPWORDS_DIRformatopen	stopwordsupdateset	readlines)languagestopwordsFilefs      r   load_stopwordsr!      s     4 1	("8"4";";H"E"EG G	mS7	3	3	3 Bq?????@@AAAB B B B B B B B B B B B B B B B B Bs   AB))B-0B-    c                 F   |r|r|dk    rg S g }t          |          }t          |          }t          |          }t          |||                              |          }|D ]}	|                    |	d                    |                    d            d |D             S )Nr   c                     | d         S )Nr   r   )summarys    r   <lambda>zsummarize.<locals>.<lambda>5   s
    wqz r   )keyc                     g | ]
}|d          S )r   r   )r   r&   s     r   r   zsummarize.<locals>.<listcomp>6   s    0007GAJ000r   )split_sentenceskeywordssplit_wordsscoremost_commonappendsort)
urltitletext	max_sents	summaries	sentenceskeys
titleWordsranksranks
             r   	summarizer;   (   s     u 	Q	I%%ID>>DU##J )Z..::9EEE " "a!!!!NN11N22200i0000r   c                    t          |           }t                      }t          |           D ]\  }}t          |          }t	          ||          }t          t          |                    }	t          |dz   |          }
t          ||          }t          ||          }||z   dz  dz  }|dz  |dz  z   |	dz  z   |
dz  z   dz  }||||f<   |S )z0Score sentences based on different features
    r   g       @      $@      ?      ?g      @)	lenr   	enumerater,   title_scorelength_scoresentence_positionsbsdbs)r6   r8   r+   senSizer9   issentencetitleFeaturesentenceLengthsentencePosition
sbsFeature
dbsFeature	frequency
totalScores                  r   r-   r-   9   s     )nnGIIE)$$ # #1q>>":x88%c(mm44,QUG<<8,,
8,,
*,3d:	"3&36$S()+;C+?@ADE
"q!fLr   c                     d}t          |           dk    rdS | D ]}||v r|||         z  }dt          j        t          |                     z  |z  dz  S )N        r   r?   r=   )r@   mathfabs)wordsr+   r-   words       r   rE   rE   M   si    EE

aq $ $8Xd^#E$)CJJ'''%/477r   c                    t          |           dk    rdS d}g }g }t          |           D ]M\  }}||v rD||         }|g k    r||g}|}||g}|d         |d         z
  }||d         |d         z  |dz  z  z  }Nt          t          |                                                              t          |                               dz   }	d|	|	dz   z  z  |z  S )Nr   r      r?   )r@   rA   r   r7   intersection)
rV   r+   summfirstsecondrH   rW   r-   difks
             r   rF   rF   W   s    E

aqDEFU## 	< 	<48TNE{{E
E
Ah*qF1I-#(;;C  --c%jj99::Q>Aa#g$&'r   c                     	 t          j        dd|           } d |                                 D             S # t          $ r Y dS w xY w)z'Split a string into array of words
    z[^\w ]r"   c                 \    g | ])}|                     d                                           *S ).)r   lowerr   xs     r   r   zsplit_words.<locals>.<listcomp>r   s.    ;;;""$$;;;r   N)resubsplit	TypeError)r3   s    r   r,   r,   m   sX    viT**;;djjll;;;;   tts   36 
AAc                    d}t          |           } | rt          |           }d | D             } i }| D ]}||v r||xx         dz  cc<   d||<   t          |t          |                    }t          |                                d d          }|d|         }t          d |D                       }|D ])}||         d	z  t          |d          z  }|d
z  dz   ||<   *t          |          S t                      S )zGet the top 10 keywords and their frequency scores ignores blacklisted
    words in stopwords, counts the number of occurrences of each word, and
    sorts them in reverse natural order (so descending) by number of
    occurrences.
    
   c                 $    g | ]}|t           v|S r   r   rd   s     r   r   zkeywords.<locals>.<listcomp>   s"    666a1I#5#5#5#5#5r   r   c                 "    | d         | d         fS )Nr   r   r   )re   s    r   r'   zkeywords.<locals>.<lambda>   s    1qt r   T)r(   reverseNc              3   $   K   | ]\  }}||fV  d S )Nr   )r   re   ys      r   	<genexpr>zkeywords.<locals>.<genexpr>   s*      4441aA444444r   r?   r>   )r,   r@   minsorteditemsdictmax)	r3   NUM_KEYWORDS	num_wordsfreqrW   min_sizer+   r_   articleScores	            r   r+   r+   w   s9    LtD II	664666 	 	Dt||T


a



T

|SYY//$**,,44"&( ( ( IXI&44844444 	1 	1A#A;,s9a/@/@@L&,q0HQKKH~~vvr   c                     ddl }|j                            d          }|                    |           }d |D             }|S )z(Split a large string into sentences
    r   Nztokenizers/punkt/english.picklec                 `    g | ]+}t          |          d k    |                    dd          ,S )rk   
r"   )r@   replacerd   s     r   r   z#split_sentences.<locals>.<listcomp>   s1    GGG3q66B;;4$$;;;r   )	nltk.datadataloadtokenize)r3   nltk	tokenizerr6   s       r   r*   r*      sP     	@AAI""4((IGGiGGGIr   c                 P    dt          j        t          | z
            t          z  z
  S )Nr   )rT   rU   ideal)sentence_lens    r   rC   rC      s!    ty-..666r   c                     | rEd | D             } d}|D ]}|t           vr	|| v r|dz  }|t          t          |           d          z  S dS )Nc                 $    g | ]}|t           v|S r   rm   rd   s     r   r   ztitle_score.<locals>.<listcomp>   s"    888qQi%7%7%7%7%7r   rS   r?   r   r   )r   rw   r@   )r2   rJ   countrW   s       r   rB   rB      sn     88E888 	 	DI%%$%--s3u::q))))qr   c                     | dz  |z  }|dk    rdS |dk    rdS |dk    rdS |dk    rdS |dk    rd	S |d
k    rdS |dk    rdS |dk    rdS |dk    rdS |dk    rdS |dk    rdS dS )zdDifferent sentence positions indicate different
    probability of being an important sentence.
    r?   r   g?g333333?g?g{Gz?gffffff?g333333?gQ?g      ?g?g?g333333?g{Gz?g?gQ?g?gq=
ףp?g(\?r   )rH   size
normalizeds      r   rD   rD      s     S4JSq
s

t
s

t
s

t
s

t
s

t
s

t
s

t
s

t
s

t
q..tqr   )r"   r"   r"   r#   )__doc__	__title__
__author____license____copyright__rf   rT   osr   collectionsr   r"   r   r   r   r   r!   r;   r-   rE   rF   r,   r+   r*   rC   rB   rD   r   r   r   <module>r      s5    	
/ 				                   CEE	B B B$1 1 1 1"  (8 8 8( ( (,    D  7 7 7	 	 	    r   