ó
¥dTc           @€  s>  d  d l  m Z d Z d Z d  d l Z d  d l Z d  d l Z d  d l Z d  d l Z d  d l	 Z	 d  d l
 Z
 d  d l m Z d  d l Z d  d l Z d  d l m Z d  d l Z d  d l m Z d a e j ƒ  a d a i  a d	 a d	 a i  a e a e j e j  ƒ Z! e j" e# ƒ a$ t$ j% e j& ƒ t$ j' e! ƒ d
 „  Z( d „  Z) d d „ Z* d „  Z+ d „  Z, d „  Z- e+ d „  ƒ Z. d „  Z/ d „  Z0 e e1 d „ a2 e1 d „ a3 e+ d „  ƒ Z4 e+ d d „ ƒ Z5 t2 Z6 t3 Z7 d „  Z8 d „  Z9 d „  Z: d „  Z; e+ d d „ ƒ Z< d „  Z= d „  Z> d „  Z? d e1 d  „ Z@ d S(!   iÿÿÿÿ(   t   with_statements   0.34t   MITN(   t   log(   t   wraps(   t   md5s   dict.txtg        c         C€  s   t  j |  ƒ d  S(   N(   t   loggert   setLevel(   t	   log_level(    (    s   ../jieba\__init__.pyt   setLogLevel!   s    c      
   C€  s!  i  } t  ƒ  } d } t |  d ƒ î } d } xÞ | j ƒ  j ƒ  j d ƒ j d ƒ D]¸ } | d 7} yo | j d ƒ d  \ } } t | ƒ } | | | <| | 7} x/ t t | ƒ ƒ D] }	 | j	 | |	 d  ƒ q± WWqR t
 k
 r	}
 t j d	 |  | | f ƒ t
 |
 ‚ qR XqR WWd  QX| | | f S(
   Ng        t   rbi    s   utf-8s   
i   t    i   s   %s at line %s %s(   t   sett   opent   readt   rstript   decodet   splitt   floatt   xranget   lent   addt
   ValueErrorR   t   debug(   t   f_namet   lfreqt   pfdictt   ltotalt   ft   linenot   linet   wordt   freqt   cht   e(    (    s   ../jieba\__init__.pyt
   gen_pfdict%   s$    	+


c   
      C€  sÐ  |  s t  }  n  t µt r  d  St r2 b d  a n  t j j t j j t j	 ƒ  t j j
 t ƒ ƒ ƒ } t j j | |  ƒ } t j d | ƒ t j ƒ  } | t j j | d ƒ k rÐ t j j t j ƒ  d ƒ } n7 t j j t j ƒ  d t | j d d ƒ ƒ j ƒ  ƒ } t } t j j | ƒ rŸt j j | ƒ t j j | ƒ k rŸt j d | ƒ y8 t j t | d ƒ ƒ \ a a a a t t t ƒ } WqŸt } qŸXn  | r˜t | ƒ \ a a a t  d	 „  t j! ƒ  Dƒ ƒ a t" t j# ƒ  ƒ a t j d
 | ƒ y… t j$ ƒ  \ } } t j% | d ƒ # } t j& t t t t f | ƒ Wd  QXt j' d k rgd d l( m) }	 n	 t j* }	 |	 | | ƒ Wq˜t j+ d ƒ q˜Xn  t a t j d t j ƒ  | ƒ t j d ƒ Wd  QXd  S(   Ns    Building prefix dict from %s ...s   dict.txts   jieba.caches   jieba.u%s.caches   utf-8t   replaces   Loading model from cache %sR	   c         s€  s1   |  ]' \ } } | t  t | ƒ t ƒ f Vq d  S(   N(   R   R   t   total(   t   .0t   kt   v(    (    s   ../jieba\__init__.pys	   <genexpr>Y   s    s   Dumping model to file cache %st   wbt   ntiÿÿÿÿ(   t   moves   Dump cache file failed.s   Loading model cost %s seconds.s'   Prefix dict has been built succesfully.(,   t
   DICTIONARYt	   DICT_LOCKt   initializedR   t   Nonet   ost   patht   normpatht   joint   getcwdt   dirnamet   __file__R   R   t   timet   tempfilet
   gettempdirR   t   encodet	   hexdigestt   Truet   existst   getmtimet   marshalt   loadR   t   FREQR$   t   min_freqt
   isinstanceR   R"   t   dictt	   iteritemst   mint
   itervaluest   mkstempt   fdopent   dumpt   namet   shutilR*   t   renamet	   exception(
   t
   dictionaryt   _curpatht   abs_patht   t1t
   cache_filet   load_from_cache_failt   fdt   fpatht   temp_cache_filet   replace_file(    (    s   ../jieba\__init__.pyt
   initialize9   sR    		376$"	c         €  s   t  ˆ  ƒ ‡  f d †  ƒ } | S(   Nc          €  s.   t  r ˆ  |  | Ž  St t ƒ ˆ  |  | Ž  Sd  S(   N(   R-   RX   R+   (   t   argst   kwargs(   t   fn(    s   ../jieba\__init__.pyt   wrappedp   s    
(   R   (   R[   R\   (    (   R[   s   ../jieba\__init__.pyt   require_initializedn   s    	c         c€  s¨   t  |  ƒ } d } x | j ƒ  D] \ } } t | ƒ d k rj | | k rj |  | | d d !V| d } q x3 | D]+ } | | k rq |  | | d !V| } qq qq Wq Wd  S(   Niÿÿÿÿi   i    (   t   get_DAGRD   R   (   t   sentencet   dagt   old_jR&   t   Lt   j(    (    s   ../jieba\__init__.pyt	   __cut_all|   s    c         €  se   t  ˆ ƒ } d ˆ | <xH t | d d d ƒ D]0 ‰  t ‡  ‡ ‡ f d †  | ˆ  Dƒ ƒ ˆ ˆ  <q- Wd  S(   Ng        t    i   iÿÿÿÿc         3€  sB   |  ]8 } t  j ˆ ˆ  | d  !t ƒ ˆ | d  d | f Vq d S(   i   i    N(   R@   t   getRA   (   R%   t   x(   t   idxt   routeR_   (    s   ../jieba\__init__.pys	   <genexpr>Ž   s    (   g        Re   (   R   R   t   max(   R_   t   DAGRh   Ri   t   N(    (   Rh   Ri   R_   s   ../jieba\__init__.pyt   calcŠ   s    
c         C€  s¹   i  } t  |  ƒ } x  t | ƒ D]’ } g  } | } |  | } xS | | k  r | t k r | t k rr | j | ƒ n  | d 7} |  | | d !} q> W| s§ | j | ƒ n  | | | <q W| S(   Ni   (   R   R   R   R@   t   append(   R_   Rk   Rl   R&   t   tmplistt   it   frag(    (    s   ../jieba\__init__.pyR^      s    

c   	      c€  sï   t  j d t  j ƒ } t |  ƒ } i  } t |  | d | ƒ d } t |  ƒ } d } x‚ | | k  rÖ | | d d } |  | | !} | j | ƒ r´ t | ƒ d k r´ | | 7} | } qU | rÈ | Vd } n  | V| } qU W| rë | Vd } n  d  S(   Nu   [a-zA-Z0-9]i    u    i   (   t   ret   compilet   UR^   Rm   R   t   match(	   R_   t   re_engRk   Ri   Rg   Rl   t   buft   yt   l_word(    (    s   ../jieba\__init__.pyt   __cut_DAG_NO_HMM£   s*    !
		
c         c€  s„  t  |  ƒ } i  } t |  | d d | ƒd } d } t |  ƒ } xÐ | | k  r| | d d } |  | | !} | | d k r‹ | | 7} n~ | rt | ƒ d k r± | Vd } q| t k rå t j | ƒ } x, | D] }	 |	 VqÓ Wn x | D] }
 |
 Vqì Wd } n  | V| } qC W| r€t | ƒ d k r3| Vq€| t k rgt j | ƒ } x, | D] }	 |	 VqUWq€x | D] }
 |
 VqnWn  d  S(   Ni    Ri   u    i   (   R^   Rm   R   R@   t   finalsegt   cut(   R_   Rk   Ri   Rg   Rw   Rl   Rx   Ry   t
   recognizedt   tt   elem(    (    s   ../jieba\__init__.pyt	   __cut_DAG»   sB    			
c         c€  s‡  t  |  t ƒ sK y |  j d ƒ }  WqK t k
 rG |  j d d ƒ }  qK Xn  | r t j d t j ƒ t j d t j ƒ } } n+ t j d t j ƒ t j d t j ƒ } } | j |  ƒ } | rÈ t } n | r× t	 } n t
 } x£ | D]› } | sö qä n  | j | ƒ r$xw | | ƒ D] } | VqWqä | j | ƒ }	 xI |	 D]A }
 | j |
 ƒ rW|
 Vq:| svx |
 D] } | VqdWq:|
 Vq:Wqä Wd S(	   s?  The main function that segments an entire sentence that contains
    Chinese characters into seperated words.
    Parameter:
        - sentence: The str/unicode to be segmented.
        - cut_all: Model type. True for full pattern, False for accurate pattern.
        - HMM: Whether to use the Hidden Markov Model.
    s   utf-8t   gbkt   ignoreu   ([ä¸€-é¾¥]+)u   [^a-zA-Z0-9+#\n]u   ([ä¸€-é¾¥a-zA-Z0-9+#&\._]+)u	   (\r\n|\s)N(   RB   t   unicodeR   t   UnicodeDecodeErrorRr   Rs   Rt   R   Rd   R€   Rz   Ru   (   R_   t   cut_allt   HMMt   re_hant   re_skipt   blockst	   cut_blockt   blkR   t   tmpRg   t   xx(    (    s   ../jieba\__init__.pyR|   ã   s8    .+		c         c€  sâ   t  |  d | ƒ} xÉ | D]Á } t | ƒ d k rz xF t t | ƒ d ƒ D]+ } | | | d !} | t k rH | VqH qH Wn  t | ƒ d k rÕ xF t t | ƒ d ƒ D]+ } | | | d !} | t k r£ | Vq£ q£ Wn  | Vq Wd  S(   NR†   i   i   i   (   R|   R   R   R@   (   R_   R†   t   wordst   wRp   t   gram2t   gram3(    (    s   ../jieba\__init__.pyt   cut_for_search  s    c         C€  sã   t  |  t t f ƒ r' t |  d ƒ }  n  |  j ƒ  j d ƒ } d } xš | j d ƒ D]‰ } | d 7} | j ƒ  st qR n  | j d ƒ } | d | d } } | j ƒ  t	 k r° qR n  | d k rÑ | j
 d d ƒ } n  t | Œ  qR Wd	 S(
   sÿ    Load personalized dict to improve detect rate.
    Parameter:
        - f : A plain text file contains words and their ocurrences.
    Structure of dict file:
    word1 freq1 word_type1
    word2 freq2 word_type2
    ...
    Word type may be ignored
    R	   s   utf-8i    s   
i   R
   u   ï»¿u    N(   RB   t   strRƒ   R   R   R   R   R   t   isdigitt   FalseR#   t   add_word(   R   t   contentt   line_noR   t   tupR   R   (    (    s   ../jieba\__init__.pyt   load_userdict  s    
c         C€  so   t  t | ƒ t ƒ t |  <| d  k	 r9 | j ƒ  t |  <n  x/ t t |  ƒ ƒ D] } t	 j
 |  | d  ƒ qL Wd  S(   Ni   (   R   R   R$   R@   R.   t   stript   user_word_tag_tabR   R   R   R   (   R   R   t   tagR    (    (    s   ../jieba\__init__.pyR–   :  s
    c         C€  s   t  t |  t ƒ ƒ S(   N(   t   listt	   __ref_cutR•   (   R_   (    (    s   ../jieba\__init__.pyt   __lcutF  s    c         C€  s   t  t |  t t ƒ ƒ S(   N(   Rž   RŸ   R•   (   R_   (    (    s   ../jieba\__init__.pyt   __lcut_no_hmmH  s    c         C€  s   t  t |  t ƒ ƒ S(   N(   Rž   RŸ   R;   (   R_   (    (    s   ../jieba\__init__.pyt
   __lcut_allJ  s    c         C€  s   t  t |  ƒ ƒ S(   N(   Rž   t   __ref_cut_for_search(   R_   (    (    s   ../jieba\__init__.pyt   __lcut_for_searchL  s    c         C€  sµ   t  j d k r t d ƒ ‚ n  t j d d k rS t j d d k  rS t d ƒ ‚ n  d d	 l m } m } |  d  k r | ƒ  }  n  | |  ƒ a	 t
 t d
 „ } d „  } | a | a d  S(   NR)   s/   jieba: parallel mode only supports posix systemi    i   i   i   s4   jieba: the parallel feature needs Python version>2.5iÿÿÿÿ(   t   Poolt	   cpu_countc         s€  s‹   t  j d ƒ j |  ƒ } | r3 t j t | ƒ } n- | rN t j t | ƒ } n t j t | ƒ } x$ | D] } x | D] } | Vqt Wqg Wd  S(   Ns   ([
]+)(   Rr   Rs   R   t   poolt   mapR¢   R    R¡   (   R_   R…   R†   t   partst   resultt   rR   (    (    s   ../jieba\__init__.pyt   pcut\  s    c         s€  sU   t  j d ƒ j |  ƒ } t j t | ƒ } x$ | D] } x | D] } | Vq> Wq1 Wd  S(   Ns   ([
]+)(   Rr   Rs   R   R§   R¨   R¤   (   R_   R©   Rª   R«   R   (    (    s   ../jieba\__init__.pyt   pcut_for_searchh  s
    (   R/   RJ   t	   Exceptiont   syst   version_infot   multiprocessingR¥   R¦   R.   R§   R•   R;   R|   R’   (   t
   processnumR¥   R¦   R¬   R­   (    (    s   ../jieba\__init__.pyt   enable_parallelP  s    &	c           C€  s2   d t  ƒ  k r" t j ƒ  d  a n  t a t a d  S(   NR§   (   t   globalsR§   t   closeR.   RŸ   R|   R£   R’   (    (    (    s   ../jieba\__init__.pyt   disable_parallelr  s
    
	c         C€  si   t  ] t j j t j j t j ƒ  |  ƒ ƒ } t j j | ƒ sS t d | ƒ ‚ n  | a t	 a
 Wd  QXd  S(   Ns   jieba: path does not exist: (   R,   R/   R0   R1   R2   R3   R<   R®   R+   R•   R-   (   t   dictionary_pathRP   (    (    s   ../jieba\__init__.pyt   set_dictionaryz  s    'c          C€  sL   t  j j t  j j t  j ƒ  t  j j t ƒ ƒ ƒ }  t  j j |  t ƒ } | S(   N(   R/   R0   R1   R2   R3   R4   R5   R+   (   RO   RP   (    (    s   ../jieba\__init__.pyt   get_abs_path_dictƒ  s    3t   defaultc   	      c€  s¡  t  |  t ƒ s t d ƒ ‚ n  d } | d k rx xjt |  d | ƒD]. } t | ƒ } | | | | f V| | 7} qC Wn%x"t |  d | ƒD]} t | ƒ } t | ƒ d k rx[ t t | ƒ d ƒ D]@ } | | | d !} | t k rÆ | | | | | d f VqÆ qÆ Wn  t | ƒ d k r}x[ t t | ƒ d ƒ D]@ } | | | d !} | t k r6| | | | | d f Vq6q6Wn  | | | | f V| | 7} q‹ Wd S(	   s  Tokenize a sentence and yields tuples of (word, start, end)
    Parameter:
        - sentence: the unicode to be segmented.
        - mode: "default" or "search", "search" is for finer segmentation.
        - HMM: whether to use the Hidden Markov Model.
    s-   jieba: the input parameter should be unicode.i    Rº   R†   i   i   i   N(   RB   Rƒ   R®   R|   R   R   R@   (	   t   unicode_sentencet   modeR†   t   startR   t   widthRp   R   R‘   (    (    s   ../jieba\__init__.pyt   tokenizeˆ  s,    $$(A   t
   __future__R    t   __version__t   __license__Rr   R/   R¯   R{   R6   R7   R>   t   mathR   t   randomt	   threadingt	   functoolsR   t   loggingt   hashlibR   R+   t   RLockR,   R.   R   R@   RA   R$   Rœ   R•   R-   t   StreamHandlert   stderrt   log_consolet	   getLoggert   __name__R   R   t   DEBUGt
   addHandlerR   R"   RX   R]   Rd   Rm   R^   Rz   R€   R;   R|   R’   Rš   R–   RŸ   R£   R    R¡   R¢   R¤   R³   R¶   R¸   R¹   R¿   (    (    (    s   ../jieba\__init__.pyt   <module>   sj   		5					(-				!				