
    %$}g0                        d Z ddlmZ ddlmZ ddlmZ ddlmZmZm	Z	m
Z
mZ 	 ddlmZ  e ed                    Zn# e$ r ddlmZ  e            pdZY nw xY wdd	lmZ dd
lmZmZmZ ddlmZ g dZ ee          dhz  Z	 dade
e         de
e         defdZ e            ZdddddddddZ  G d d          Z!dbdede
e         de!fd Z"dcd"e#deeef         fd#Z$ G d$ d%          Z% e&ed&          Z'd'Z(d(Z)d)Z*d*Z+d+Z,h d,Z-g d-Z.g d.Z/ ed/          Z0 e1g d0          Z2i d1d2d3d4d5d6d7d8d9d:d;d<d=d>d?d@dAdBdCdDdEdFdGdHdIdJdKdLdMdNdOdPdQdRdSdTdUdVdWdXdYdZd[d\d]d^d_d`Z3dS )dz<
Listing a series of settings that are applied module-wide.
    )ConfigParser)datetime)unescape)AnyDictListOptionalSet)sched_getaffinity)	cpu_count   )Path)_ElementElementXPath)line_processing)csvjsonhtmlmarkdowntxtxmlxmlteipythonNfilenameconfigreturnc                    ||S | *t          t          t                    j        dz            } n0t          |                                           st          d          t                      }|                    |            |S )zE
    Use configuration object or read and parse a settings file.
    Nzsettings.cfgz$The given config file does not exist)strr   __file__parentis_fileFileNotFoundErrorr   read)r   r   s     T/var/www/py-google-trends/myenv/lib/python3.11/site-packages/trafilatura/settings.py
use_configr&      s|     tH~~,~=>>(^^##%% H FGGG^^F
KKM    MIN_EXTRACTED_SIZEMIN_OUTPUT_SIZEMIN_OUTPUT_COMM_SIZEMIN_EXTRACTED_COMM_SIZEMIN_DUPLCHECK_SIZEMAX_REPETITIONSMAX_FILE_SIZEMIN_FILE_SIZE)min_extracted_sizemin_output_sizemin_output_comm_sizemin_extracted_comm_sizemin_duplcheck_sizemax_repetitionsmax_file_sizemin_file_sizec            ,       Z   e Zd ZdZg dZeddddddddddddddddddddded	ed
ededededededededede	e         de	e         de	e         dededede	e
e                  de	e
e                  de	eeef                  f(dZde	e         de	e         ddfdZdeddfd Zdeddfd!ZdS )"	Extractorz0Defines a class to store all extraction options.)r   formatfastfocuscomments
formattinglinksimagestablesdeduplangr0   r1   r2   r3   r4   r5   r6   r7   max_tree_sizesourceurlwith_metadataonly_with_metadatatei_validationdate_paramsauthor_blacklisturl_blacklistr   FTN)r   output_formatr;   	precisionrecallr=   r>   r?   r@   rA   rB   rC   rF   rE   rG   rH   rI   rK   rL   rJ   r   rM   r;   rN   rO   r=   r>   r?   r@   rA   rB   rC   rF   rE   rG   rH   rI   rK   rL   rJ   c                P   |                      ||           |                     |           |                     |           || _        |rdn|rdnd| _        || _        |p
| j        dk    | _        || _        |	| _	        |
| _
        || _        || _        || _        || _        || _        |pt!                      | _        |pt!                      | _        |p|pt'          |          p|dk    | _        |p't+          | j                            dd                    | _        d | _        d S )NrO   rN   balancedr   r   DEFAULTEXTENSIVE_DATE_SEARCH)_set_source_set_format_add_configr;   r<   r=   r:   r>   r?   r@   rA   rB   rC   rF   rH   rI   setrK   rL   boolrG   set_date_paramsr   
getbooleanrJ   rD   )selfr   rM   r;   rN   rO   r=   r>   r?   r@   rA   rB   rC   rF   rE   rG   rH   rI   rK   rL   rJ   s                        r%   __init__zExtractor.__init__e   sP   0 	f%%%'''   	LHH9$LKK* 	
 ' * GdkZ.G 
"" 
#'	"%(:$2*:*Ccee'4'= )!)M"") (	 	 ,7 ,
/K""9.EFF;
 ;
 "r'   r   c                 n    |p|}|o(|                     dd                              d          | _        dS )z)Set the source attribute in a robust way.zutf-8replaceN)encodedecoderE   )r[   rF   rE   s      r%   rT   zExtractor._set_source   s7    Rw	!B!B!I!I'!R!Rr'   chosen_formatc                     |t           vr7t          dd                    t          t                                          || _        dS )z;Store the format if supported and raise an error otherwise.z#Cannot set format, must be one of: z, N)SUPPORTED_FORMATSAttributeErrorjoinsortedr:   )r[   ra   s     r%   rU   zExtractor._set_format   sM     111 \diiGX@Y@Y6Z6Z\\   $r'   c           	          t                                           D ]*\  }}t          | ||                    d|                     +|| _        dS )z&Store options loaded from config file.rR   N)CONFIG_MAPPINGitemssetattrgetintr   )r[   r   keyvalues       r%   rV   zExtractor._add_config   sQ    (..00 	@ 	@JCD#v}}Y>>????r'   )__name__
__module____qualname____doc__	__slots__DEFAULT_CONFIGr   r   rX   r	   r
   r   r\   rT   rU   rV    r'   r%   r9   r9   ?   s       66" " "IN  ." "! $##($/3,004-4" 4" 4" 4" 	4"
 4" 4" 4" 4" 4" 4" 4" 4" 4" sm4" c]4"  !4"" #4"$ !%4"& '4"( #3s8,)4"*  C)+4", d38n--4" 4" 4" 4"lSx} Shsm S S S S S
$ $ $ $ $ $, 4      r'   r9   argsrF   c                    t          t          | j                  | j        | j        | j        | j        | j        | j        | j	        | j
        || j        | j        | j                  }dD ]!}t          ||t          | |                     "|S )z-Derive extractor configuration from CLI args.)r   )r   rM   r>   rN   rO   r=   rA   rB   rC   rF   rG   rH   rI   )r;   r@   r?   )r9   r&   config_filerM   r>   rN   rO   no_comments	no_tablesdeduplicatetarget_languagerG   rH   validate_teirj   getattr)ru   rF   optionsattrs       r%   args_to_extractorr      s    4#3444(?.{!~!(2(  G , 4 4wtT223333Nr'   T	extensivec                 V    d| t          j                                        d          dS )z/Provide default parameters for date extraction.Tz%Y-%m-%d)original_dateextensive_searchmax_date)r   nowstrftime)r   s    r%   rY   rY      s0     %LNN++J77  r'   c            ,       $   e Zd ZdZg dZdddddddddddd ed          d ed          ddddddddee         dee         dee         d	ee         d
ee         dee         dee         deee                  deee                  dee         dee         dee         de	dee         de	dee         dee         dee         dee         dee         dee         f*dZ
edeeef         dd fd            Zd dZdeeee         f         fdZdS )!DocumentzZDefines a class to store all necessary data and metadata fields for extracted information.titleauthorrF   hostnamedescriptionsitenamedate
categoriestagsfingerprintidlicensebodyr=   commentsbodyraw_texttextlanguageimagepagetypefiledateNr   )r   r   rF   r   r   r   r   r   r   r   idvallicense_valr   r=   r   r   r   r   r   r   r   r   r   rF   r   r   r   r   r   r   r   r   r   r=   r   r   r   r   r   r   r   c                ,   || _         || _        || _        || _        || _        || _        || _        || _        |	| _        |
| _	        || _
        || _        || _        || _        || _        || _        || _        || _        || _        || _        || _        d S Nr   )r[   r   r   rF   r   r   r   r   r   r   r   r   r   r   r=   r   r   r   r   r   r   r   s                         r%   r\   zDocument.__init__   s    2 %*
%+"%'/*5'/#'	/9)-	*5!&&1"	'/&2'/#'	'/$)
'/'/r'   datar   c                 p     |             }|                                 D ]\  }}t          |||           |S )z.Set a series of attributes using a dictionary.)ri   rj   )clsr   docrl   rm   s        r%   	from_dictzDocument.from_dict  sD     cee**,, 	% 	%JCCe$$$$
r'   c                     | j         D ]t}t          | |          }t          |t                    rMt	          |          dk    r|dd         dz   }t          t          |                    }t          | ||           udS )z*Limit text length and trim the attributes.'  Ni'  u   …)rr   r}   
isinstancer   lenr   r   rj   )r[   slotrm   s      r%   clean_and_trimzDocument.clean_and_trim!  s    N 	+ 	+DD$''E%%% +u::%%!%4%L50E'88dE***	+ 	+r'   c                 *      fd j         D             S )z%Convert the document to a dictionary.c                 4    i | ]}|t          |d           S r   )r}   ).0r   r[   s     r%   
<dictcomp>z$Document.as_dict.<locals>.<dictcomp>/  s'    KKKDgdD$//KKKr'   )rr   )r[   s   `r%   as_dictzDocument.as_dict-  s    KKKKDNKKKKr'   )r   N)rn   ro   rp   rq   rr   r   r	   r   r   r   r\   classmethodr   r   r   r   r   rt   r'   r%   r   r      s=       ``  I8  $ $!"&%)"&"*.$(%)#%) "&!("&""&#"&"&/-0 -0 -0 }-0 	-0
 c]-0 3--0 c]-0 3--0 sm-0 T#Y'-0 tCy!-0 c]-0 }-0 c]-0 -0  3-!-0" #-0$ 3-%-0& sm'-0( 3-)-0* }+-0, 3---0. 3-/-0 -0 -0 -0^ T#s(^ 
    [
+ 
+ 
+ 
+Lc8C=01 L L L L L Lr'   r      i   i     i@B r   >   bipqdddtemh1h2h3h4h5h6lidivpremainspanstrongarticlesection
blockquote)3asideembedfooterformheadiframemenuobjectscriptappletaudiocanvasfiguremappicturesvgvideoareablinkbuttondatalistdialogframeframesetfieldsetlinkinputinslabellegendmarqueemathmenuitemnavnoindexnoscriptoptgroupoptionoutputparamprogressrprtrtcselectrE   styletracktextareatimeuse)abbracronymaddressbdibdobigciter   dfnfonthgroupimgr   markmetarubysmalltbodytemplatetfoottheadzL.//aside|.//div[contains(@class|@id, 'footer')]|.//footer|.//script|.//style)
r   codedelr   hilblistr   r   quotearArabicbg	BulgarianczCzechdaDanishdeGermanenEnglishelGreekesSpanishfaPersianfiFinnishfrFrenchhrCroatianhu	HungariankoKoreanr   
IndonesianitItaliannoNorwegian_NynorskDutchPolish
PortugueseRomanianRussianSlovak	SlovenianSerbianSwedishTurkish	UkrainianUrdu
Vietnamese)nlplptroruskslsrsvtrukurvi)NNr   )T)4rq   configparserr   r   r   r   typingr   r   r   r	   r
   osr   r   	CPU_COUNTImportErrorr   pathlibr   
lxml.etreer   r   r   utilsr   SUPPORTED_FMT_CLIrW   rc   r   r&   rs   rh   r9   r   rX   rY   r   minPARALLEL_CORESLRU_SIZEMAX_FILES_PER_DIRECTORYFILENAME_LEN	MAX_LINKSMAX_SITEMAPS_SEENCUT_EMPTY_ELEMSMANUALLY_CLEANEDMANUALLY_STRIPPEDBASIC_CLEAN_XPATH	frozensetTAG_CATALOGJUSTEXT_LANGUAGESrt   r'   r%   <module>rh     s:    & % % % % %             1 1 1 1 1 1 1 1 1 1 1 1 1 1!$$$$$$%%a(())II ! ! !	 qIII!       / / / / / / / / / / " " " " " " POO C)**hZ7  FJ sm,4\,B   &  /(28.($$	 	m m m m m m m m` C hsm y    , t tCH~    `L `L `L `L `L `L `L `LH Y##   	   :7 7 7 t   2 ER   iRRR !(!+! 	'! 	(	!
 	(! 	)! 	'! 	)! 	)! 	)! 	(! 	*! 	+! 	(!  	,!!" 	)#!$ 	
%!& 











?! ! !   s   ? AA