
    %$}g%              
          d Z ddlZddlZddlmZ ddlmZ ddlmZ ddl	m
Z
 ddlmZ ddlmZmZmZmZmZ dd	lmZ d
dlmZ d
dlmZ  ej        d          Z eedd           Z ed          d*dededede fd            Z!d+dededee         fdZ"d,dedede#fdZ$ G d d          Z%d edefd!Z&d"\  Z'Z(Z)Z* G d# d$          Z+ e+e          Z,d%eddfd&Z-d'ed(ede fd)Z.dS )-z>Code parts dedicated to duplicate removal and text similarity.    N)SequenceMatcher)	lru_cache)blake2b)add)RLock)AnyDictListOptionalUnion)_Element   )LRU_SIZE)trimz\.[^/?#]{2,63}$	bit_countc                 F    t          |                               d          S )N1)bincount)xs    Y/var/www/py-google-trends/myenv/lib/python3.11/site-packages/trafilatura/deduplication.py<lambda>r      s    SVV\\#5F5F     i   maxsize      ?	reference
new_string	thresholdreturnc                     t                               d|           } t                               d|          }t          d| |                                          |k    S )zIReturn the similarity ratio between two short strings, here domain names. N)STRIP_EXTENSIONsubr   ratio)r   r   r   s      r   is_similar_domainr&      sQ      ##B	22I $$R44J4J77==??9LLr   @   inputstringlengthc                 F   g }|                                  D ]J}|                    t          j                  }|                                r|                    |           Kg }t          ddd          D ]*fd|D             }t          |          |dz  k    r|c S +|S )zbSplit input into list of tokens and adjust length threshold to make sure
    there is enough data.   c                 :    g | ]}t          |          k    |S  )len).0tis     r   
<listcomp>z!sample_tokens.<locals>.<listcomp>-   s$    222s1vvzz!zzzr      )splitstripstringpunctuationisalnumappendranger/   )r(   r)   tokenstokensampler2   s        @r   sample_tokensr?   #   s     F""$$ ! !F.//==?? 	!MM%   F1b"  2222V222v;;&1*$$MMM %Mr      c                     d                     t          |                                                     }t          |                                |                                          S )z=Create a bag of words and generate a hash for a given string. digest_size)joinr?   r6   r   encodedigest)r(   r)   
teststrings      r   generate_bow_hashrI   3   sR    -4455;;==J:$$&&F;;;BBDDDr   c            	       &   e Zd ZdZddgZ	 	 	 ddededee         d	dfd
Zded	efdZ	 e
d          ded	ee         fd            Zded	efdZd	efdZded	ee         fdZdeeeef                  d	ee         fdZded	efdZded	efdZdS )SimhashzAImplement a basic Charikar hashing approach of string similarity.hashr)   r"   r'   Nr(   existing_hashr    c                 r    || _         |                     |          p|                     |          | _        dS )z&Store length and existing or new hash.N)r)   validatecreate_hashrL   )selfr(   r)   rM   s       r   __init__zSimhash.__init__>   s5     MM-00QD4D4D[4Q4Q			r   c                     t                               t          |                                d                                          d          S )z&Return a numerical hash of the string.   rC   big)int
from_bytesr   rF   rG   )rQ   r(   s     r   _hashzSimhash._hashH   sB    ~~K&&((a888??AA5
 
 	
r   i @  r   r=   c                 H      fdt           j                  D             S )z2Create vector to add to the existing string vectorc                 N    g | ]!}                               d |z  z  rd nd"S )r   r,   )rX   )r0   r2   rQ   r=   s     r   r3   z*Simhash._vector_to_add.<locals>.<listcomp>]   s7    VVVaTZZ&&!q&19rVVVr   )r;   r)   )rQ   r=   s   ``r   _vector_to_addzSimhash._vector_to_addZ   s.     WVVVV5CUCUVVVVr   c           
         dg| j         z  t          || j                   D ]8}t          t          t          |                     |                              9t          fdt          | j                   D                       S )zCalculates a Charikar simhash. References used:
        https://github.com/vilda/shash/
        https://github.com/sean-public/python-hashes/blob/master/hashes/simhash.py
        Optimized for Python by @adbar.
        r   c              3   :   K   | ]}|         d k    d|z  V  dS )r   r   Nr.   )r0   r2   vectors     r   	<genexpr>z&Simhash.create_hash.<locals>.<genexpr>j   s/      HHaa16HHr   )r)   r?   listmapr   r[   sumr;   )rQ   r(   r=   r^   s      @r   rP   zSimhash.create_hash_   s     t{"";<< 	H 	HE#c64+>+>u+E+EFFGGFFHHHH5#5#5HHHHHHr   c                 :    t          | j                  dd         S )z3Convert the numerical hash to a hexadecimal string.r4   N)hexrL   rQ   s    r   to_hexzSimhash.to_hexl   s    49~~abb!!r   	inputhashc                 T    	 t          |d          S # t          t          f$ r Y dS w xY w)z2Convert the hexadecimal hash to a numerical value.   N)rV   	TypeError
ValueErrorrQ   rg   s     r   _hash_to_intzSimhash._hash_to_intp   s>    	y"%%%:& 	 	 	44	s    ''c                 \   t          |t                    r,dt          t          |                    cxk    rdk    rn n|S t          |t                    rU|                                r,dt          |          cxk    rdk    rn nt          |          S |                     |          S dS )z9Validate the input hash and return it, or None otherwise.      N)
isinstancerV   r/   strisdigitrm   rl   s     r   rO   zSimhash.validatew   s    i%% 	"C	NN0C0C*I*I*I*Ir*I*I*I*I*Ii%% 	0  "" &rS^^'A'A'A'Ar'A'A'A'A'A9~~%$$Y///tr   
other_hashc                 :    t          | j        |j        z            S )zJReturn distance between two hashes of equal length using the XOR operator.)BIN_COUNT_FUNCrL   rQ   rt   s     r   hamming_distancezSimhash.hamming_distance   s    di*/9:::r   c                 L    | j         |                     |          z
  | j         z  S )zjCalculate how similar this hash is from another simhash.
        Returns a float from 0.0 to 1.0.
        )r)   rx   rw   s     r   
similarityzSimhash.similarity   s&     d33J???4;NNr   )r"   r'   N)__name__
__module____qualname____doc__	__slots__rr   rV   r   rR   rX   r   r
   r[   rP   rf   rm   r   rO   r   rx   floatrz   r.   r   r   rK   rK   :   s       GG"I '+	R RR R  }	R
 
R R R R
 
 
 
 
 
$ YuWC WDI W W W WIs Is I I I I" " " " "c hsm    	(5c?"; 	 	 	 	 	;3 ;3 ; ; ; ;OS OU O O O O O Or   rK   contentc                 D    t          |                                           S )zACalculate a simhash hex value for meaningful bits of the content.)rK   rf   )r   s    r   content_fingerprintr      s    7""$$$r   )r   r   r4      c                   `    e Zd ZdZddeddfdZdedefdZd	edefd
Zd	e	deddfdZ
ddZdS )LRUCachea  
    Pure-Python Least Recently Used (LRU) cache using a circular doubly linked list
    Adapted from CPython functools.py lru_cache decorator implementation
    https://github.com/python/cpython/blob/3.9/Lib/functools.py#L524
    First adapted by https://github.com/vbarbaresi
       r   r    Nc                     t                      | _        || _        i | _        g | _        | j        | j        d d g| j        d d <   d| _        d S )NF)r   lockr   cacherootfull)rQ   r   s     r   rR   zLRUCache.__init__   sI    GG	+-
!		49dD9	!!!			r   linkc                     |\  }}}}||c|t           <   |t          <   | j        t                   }|x|t           <   | j        t          <   ||t          <   | j        |t           <   |S )N)NEXTPREVr   )rQ   r   	link_prev	link_next_keyresultlasts          r   
_move_linkzLRUCache._move_link   s\    -1*	9dF+4i(	$4y'++T
TYt_T
YT
r   keyc                     | j         5  | j                            |          }|r!|                     |          cddd           S 	 ddd           n# 1 swxY w Y   dS )zgTests if the key that is asked for is in the cache
        and retrieve its value from the linked list.Nr,   )r   r   getr   )rQ   r   r   s      r   r   zLRUCache.get   s     Y 	- 	-:>>#&&D -t,,	- 	- 	- 	- 	- 	- 	- 	--	- 	- 	- 	- 	- 	- 	- 	- 	- 	- 	- 	- 	- 	- 	- rs   1AAAvaluec                    | j         5  | j                            |          }|r+|                     |           || j        |         t          <   n| j        rq| j        }||c|t          <   |t          <   |t                   | _        | j        t                   }dx| j        t          <   | j        t          <   | j        |= || j        |<   nb| j        t                   }|| j        ||g}|x|t          <   x| j        t          <   | j        |<   t          | j                  | j        k    | _        ddd           dS # 1 swxY w Y   dS )z Stores a given key in the cache.N)r   r   r   r   RESULTr   r   KEYr   r   r/   r   )rQ   r   r   r   oldrootoldkeyr   s          r   putzLRUCache.put   s    Y  	@  	@:>>#&&D @%%%*/
3''9 @"iG471GCL'&/ !(DI!Ys^F9==DIcNTYv%6
6* '.DJsOO  9T?D $)S%8DEIIDJI44:c? !$DJ4< ?DIA 	@  	@  	@  	@  	@  	@  	@  	@  	@  	@  	@  	@  	@  	@  	@  	@  	@  	@s   D"D77D;>D;c                     | j         5  | j                                         | j        | j        ddg| j        dd<   d| _        ddd           dS # 1 swxY w Y   dS )zDelete all cache content.NF)r   r   clearr   r   re   s    r   r   zLRUCache.clear   s    Y 	 	J Ity$=DIaaaLDI	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	s   ;AAA)r   )r    N)r{   r|   r}   r~   rV   rR   r   r   r   rr   r   r   r.   r   r   r   r      s         	 	 	d 	 	 	 	s s    s s    #@s #@3 #@4 #@ #@ #@ #@J     r   r   rH   c                     t                               |           }|dk    r|dz   nd}t                               | |           dS )zImplement LRU cache.r,   r   N)LRU_TESTr   r   )rH   cachevalr   s      r   put_in_cacher      sB    ||J''H$NNHqLLELLU#####r   elementoptionsc                 F   t          d                    |                                                     }t          |          |j        k    rEt
                              |          }||j        k    r t
                              ||dz              dS t          |           dS )z(Check for duplicate text with LRU cache.rB   r   TF)
r   rE   itertextr/   min_duplcheck_sizer   r   max_repetitionsr   r   )r   r   rH   r   s       r   duplicate_testr      s    chhw//112233J
:333<<
++g---LLX\22245r   )r   )r'   )r@   )/r~   rer7   difflibr   	functoolsr   hashlibr   operatorr   	threadingr   typingr   r	   r
   r   r   
lxml.etreer   settingsr   utilsr   compiler#   getattrrV   rv   rr   r   boolr&   r?   bytesrI   rK   r   r   r   r   r   r   r   r   r   r.   r   r   <module>r      s   @ @
 
			  # # # # # #                         3 3 3 3 3 3 3 3 3 3 3 3 3 3                   "*/00k+F+FGG 4M M M# M% MRV M M M M s C c     E E3 E EU E E E EPO PO PO PO PO PO PO POf% % % % % %
 % dCP P P P P P P Pf 8H%%%$S $T $ $ $ $H s t      r   