
    %$}gH              8       0   d Z ddlZddlZddlmZmZ ddlmZmZmZm	Z	m
Z
mZ ddlmZmZmZmZ ddlmZ ddlmZ dd	lmZmZ dd
lmZ ddlmZmZmZmZ ddlmZm Z  ddl!m"Z"m#Z# ddl$m%Z%m&Z&m'Z' ddl(m)Z)m*Z*m+Z+m,Z,m-Z- ddl.m/Z/m0Z0m1Z1m2Z2 ddl3m4Z4  ej5        e6          Z7ddhZ8de"de&de9fdZ:dededede&de
ee9e;f         f
dZ<ddddddddddddddddddddde%dfded ee9         d!e=d"e=d#e=d$e=d%e=d&e9d'ee9         d(e=d)e=d*e=d+e=d,e=d-eee9ef                  d.e=d/e=d0ee;         d1ee	e9                  d2ee	e9                  d3e=d4ee         d5edee&         deee"ee9ef         f                  f2d6Z>ddddddddddddddddddddddde%dfded ee9         d7ee9         d!e=d"e=d#e=d$e=d%e=d&e9d8e=d'ee9         d(e=d)e=d*e=d+e=d,e=d-eee9ef                  d.e=d/e=d0ee;         d1ee	e9                  d2ee	e9                  d9ee9         d4ee         d5edee&         dee9         f6d:Z?dS );z4
Extraction configuration and processing functions.
    N)copydeepcopy)AnyDictOptionalSetTupleUnion)_ElementElementXPath
strip_tags)HtmlElement   )baseline)content_fingerprintduplicate_test)compare_extraction)build_html_outputconvert_tagsprune_unwanted_nodestree_cleaning)extract_commentsextract_content)Documentextract_metadata)DEFAULT_CONFIG	Extractor
use_config)LANGID_FLAGcheck_html_langlanguage_filter	load_htmlnormalize_unicode)build_json_outputcontrol_xml_outputxmltotxtxmltocsv)REMOVE_COMMENTS_XPATHmarkdowntxtdocumentoptionsreturnc           
      ^   d|j         v r| j                            d          D ]d}|j        dk    rWt	          |          dk    rD|j        s=|j        s6|                                }| |j        dk    r|                    |           et          | |          }n|j         dk    rt          | |j                  }n|j         dk    rt          | |j                  }n|j         d	k    rt          | |j                  }n|j        rCd
}dD ]8}t          | |          r&|| dt!          t          | |                     dz  }9|d
z  }nd}| t#          | j        |j                   }| j        1| dt#          | j        |j                                                   }t)          |          S )zMConvert XML tree to chosen format, clean the result and output it as a stringxml*graphicr   Ncodecsvjsonhtmlz---
)titleauthorurlhostnamedescriptionsitenamedate
categoriestagsfingerprintidlicensez: 
 )formatbodyitertaglentexttail	getparentremover&   r(   
formattingr%   with_metadatar   getattrstrr'   commentsbodystripr$   )r,   r-   elementparentreturnstringheaderattrs          P/var/www/py-google-trends/myenv/lib/python3.11/site-packages/trafilatura/core.pydetermine_returnstringrZ   ,   s    }))#.. 
	+ 
	+Gy((LLA%% & & !**,,%&**>*>MM'***)(G<<	5	 	 '*<==	6	!	!(73HII	6	!	!(73HII   	F J J 8T** JIIWXt-D-D)E)EIIIIFgFFF O(8=':L"M"MOO ,*cchx7LgN`.a.acciikkL\***    cleaned_treecleaned_tree_backuptree_backupc                 .   t          | |          \  }}}|j        s%t          |t          |          ||||          \  }}}||j        k     rF|j        dk    s;t          t          |                    \  }}}t                              d|           |||fS )z?Execute the standard cascade of extractors used by Trafilatura.	precisionz+non-clean extracted length: %s (extraction))	r   fastr   r   min_extracted_sizefocusr   LOGGERdebug)r\   r]   r^   r-   postbody	temp_textlen_texts          rY   trafilatura_sequenceri   e   s     %4L'$J$J!Hi < 
(:[!!)
 )
%)X ',,,W]k5Q5Q(0+1F1F(G(G%)XBHMMMY((r[   FTpythonfilecontentr9   ra   no_fallbackfavor_precisionfavor_recallinclude_commentsoutput_formattarget_languageinclude_tablesinclude_imagesinclude_formattinginclude_linksdeduplicatedate_extraction_paramsrO   only_with_metadatamax_tree_sizeurl_blacklistauthor_blacklistas_dictprune_xpathconfigc                 H
   |r|}t          j        dt                     |rt          j        dt                     |rt          d          |rt	          |t
                    s@t          d*i d|d|d|d|d|d	|d
|d|d|
d|	d|d|d|d|d|d|d|d|}	 t          |           }|"t                              d|           t          |j	        rL|j
        st          s>t          ||j	                  du r't                              d|j                   t          |j        rt          ||j        |j        |j
        |j                  }|j        |j        v r't                              d|j                   t          |j        r<|j        r|j        r|j        s't                              d|j                   t          nt1                      }|2t	          |t2                    r|g}t5          |d |D                       }t7          t9          |          |          }t9          |          }t;          |||j        p|j                  }|j        rt?          ||          \  }}}}ntA          d          dd}}}|j!        dk    rt5          |tD                    }tG          ||||          \  }} }!|j$        rtK          |          |j$        k    r8t          &                    d tK          |                     tO          |d!           tK          |          |j$        k    r5t          &                    d"tK          |          |j                   t          |j        r+||j(        k     r t          &                    d#|j                   |!|j)        k     r4||j*        k     r)t          &                    d$|!||j                   t          |j+        r9tY          ||          d%u r't          &                    d&|j                   t          |j	        rEt[          | ||j	        |          \  }"}|"d%u r't          &                    d'|j                   t          n8# t\          t          f$ r$ t                              d(|j                   Y dS w xY w|j/        d)k    rOta          ||j1                  |_2        |j        r!ta          ||j1                  |_        ||_3        |j2        |_4        n| |c|_4        |_3        ||_5        |s|n|6                                S )+al  Internal function for text extraction returning bare Python variables.

    Args:
        filecontent: HTML code as string.
        url: URL of the webpage.
        fast: Use faster heuristics and skip backup extraction.
        no_fallback: Will be deprecated, use "fast" instead.
        favor_precision: prefer less text but correct extraction.
        favor_recall: prefer more text even when unsure.
        include_comments: Extract comments along with the main text.
        output_format: Define an output format, Python being the default
            and the interest of this internal function.
            Other values: "csv", "html", "json", "markdown", "txt", "xml", and "xmltei".
        target_language: Define a language to discard invalid documents (ISO 639-1 format).
        include_tables: Take into account information within the HTML <table> element.
        include_images: Take images into account (experimental).
        include_formatting: Keep structural elements related to formatting
            (present in XML format, converted to markdown otherwise).
        include_links: Keep links along with their targets (experimental).
        deduplicate: Remove duplicate segments and documents.
        date_extraction_params: Provide extraction parameters to htmldate as dict().
        with_metadata: Extract metadata fields and add them to the output.
        only_with_metadata: Only keep documents featuring all essential metadata
            (date, title, url).
        url_blacklist: Provide a blacklist of URLs as set() to filter out documents.
        author_blacklist: Provide a blacklist of Author Names as set() to filter out authors.
        as_dict: Will be deprecated, use the .as_dict() method of the document class.
        prune_xpath: Provide an XPath expression to prune the tree before extraction.
            can be str or list of str.
        config: Directly provide a configparser configuration.
        options: Directly provide a whole extractor configuration.

    Returns:
        A Python dict() containing all the extracted information or None.

    Raises:
        ValueError: Extraction problem.
    H"no_fallback" will be deprecated in a future version, use "fast" insteadzR"as_dict" will be deprecated, use the .as_dict() method on bare_extraction results:max_tree_size is deprecated, use settings.cfg file insteadr~   rp   ra   r`   recallcommentsrN   linksimagestablesdeduplangr9   rO   rx   r{   rz   date_paramsNzempty HTML tree: %sFzwrong HTML meta language: %szblacklisted URL: %szno metadata: %sc                 ,    g | ]}t          |          S  )r   ).0xs     rY   
<listcomp>z#bare_extraction.<locals>.<listcomp>  s    .M.M.MAuQxx.M.M.Mr[   rF   rD   r   zoutput tree too long: %shiz'output tree too long: %s, discarding %sznot enough comments: %sz+text and comments not long enough: %s %s %sTz!discarding duplicate document: %szwrong language: %szdiscarding data: %srj   r   )7warningswarnPendingDeprecationWarning
ValueError
isinstancer   r#   rd   errorr   ra   r    r!   sourcerO   r   r9   r   r{   rz   warningrx   r=   r7   r   rQ   r   r   r   r   r   r   r   rc   r)   ri   ry   rI   re   r   min_extracted_comm_sizemin_output_sizemin_output_comm_sizer   r   r"   	TypeErrorrE   r'   rN   rJ   rR   raw_textrF   r|   )#rk   r9   ra   rl   rm   rn   ro   rp   rq   rr   rs   rt   ru   rv   rw   rO   rx   ry   rz   r{   r|   r}   r~   r-   treer,   r\   r]   rR   temp_commentslen_commentsrf   rg   rh   is_not_target_langs#                                      rY   bare_extractionr      s   D  
V%	
 	
 	
  
`%	
 	
 	
  WUVVV  
*Wi88 
 
 
 
6
'-
 
 &o	

  <
 &%
 *)
  -
 ">
 ">
 +
 !
 
 (-
  21
  .-!
" (-#
$ /.%
*p%%<LL.444 < 	!W\ 	! 	!tW\22e;;;W^LLL     	"'#( H |w4444hlCCC   ) !!"*.!5=\! .???    zzH "+s++ ,*m'.M.M.M.M.MNND %T$ZZ99"<00 $L'7;;V(,WW  	OFVgG GCL-|| 9@Q-L=K''/>STTL(<-tW)
 )
%)X
   	!8}}w4447XGGG8T***8}}w444=MMN  
 !  	Dw/N N NLL2GNCCCw...w;;;LL=	    = 	^Hg>>$FFLL<gnMMM < 	!+:=',, ,( "T))17>BBB  z"   ,gn===tt
 ~!! 7+=>> 	1 (w7I J JH$0H!$M3<l080HM":88(8(8(:(::s   $N?Q$ $1RR	record_idtei_validationsettingsfilec           	         |r|}t          j        dt                     |rt          d          |rt	          |t
                    sQt          di dt          ||          d|d|d|d|d|d	|d
|d|d|d|d|
d|d|d|d|	d|d|d|}t          | |d|          }|rt	          |t                    sdS |j	        t          vrf|j	        dk    rt          d          ||_        |j        >t          t          |j                  dz   t          |j                  z             |_        t#          ||          S )a[  Main function exposed by the package:
       Wrapper for text extraction and conversion to chosen output format.

    Args:
        filecontent: HTML code as string.
        url: URL of the webpage.
        record_id: Add an ID to the metadata.
        fast: Use faster heuristics and skip backup extraction.
        no_fallback: Will be deprecated, use "fast" instead.
        favor_precision: prefer less text but correct extraction.
        favor_recall: when unsure, prefer more text.
        include_comments: Extract comments along with the main text.
        output_format: Define an output format:
            "csv", "html", "json", "markdown", "txt", "xml", and "xmltei".
        tei_validation: Validate the XML-TEI output with respect to the TEI standard.
        target_language: Define a language to discard invalid documents (ISO 639-1 format).
        include_tables: Take into account information within the HTML <table> element.
        include_images: Take images into account (experimental).
        include_formatting: Keep structural elements related to formatting
            (only valuable if output_format is set to XML).
        include_links: Keep links along with their targets (experimental).
        deduplicate: Remove duplicate segments and documents.
        date_extraction_params: Provide extraction parameters to htmldate as dict().
        with_metadata: Extract metadata fields and add them to the output.
        only_with_metadata: Only keep documents featuring all essential metadata
            (date, title, url).
        url_blacklist: Provide a blacklist of URLs as set() to filter out documents.
        author_blacklist: Provide a blacklist of Author Names as set() to filter out authors.
        settingsfile: Use a configuration file to override the standard settings.
        prune_xpath: Provide an XPath expression to prune the tree before extraction.
            can be str or list of str.
        config: Directly provide a configparser configuration.
        options: Directly provide a whole extractor configuration.

    Returns:
        A string in the desired format or None.

    r   r   r~   rp   ra   r`   r   r   rN   r   r   r   r   r   r9   rO   rx   r   r{   rz   r   F)r-   r|   r}   Nrj   z9'python' format only usable in bare_extraction() function r   )r   r   r   r   r   r   r   r   r   rE   TXT_FORMATSrA   r   r   rQ   r7   r@   rZ   )rk   r9   r   ra   rl   rm   rn   ro   rp   r   rq   rr   rs   rt   ru   rv   rw   rO   rx   ry   rz   r{   r   r}   r~   r-   r,   s                              rY   extractr   i  s.   D  
V%	
 	
 	

  WUVVV  
*Wi88 
 
 
 
lF333
'-
 
 &o	

  <
 &%
 *)
  -
 ">
 ">
 +
 !
 
 (-
  21
  *>!
" .-#
$ (-%
& /.'
. 	  H  :h99 t~[((>X%%K    (#6HN##c)C0A,B,BB$ $H 
 "(G444r[   )@__doc__loggingr   r   r   typingr   r   r   r   r	   r
   
lxml.etreer   r   r   r   	lxml.htmlr   r   deduplicationr   r   externalr   htmlprocessingr   r   r   r   main_extractorr   r   metadatar   r   settingsr   r   r   utilsr    r!   r"   r#   r$   r0   r%   r&   r'   r(   xpathsr)   	getLogger__name__rd   r   rQ   rZ   intri   boolr   r   r   r[   rY   <module>r      s              9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 ; ; ; ; ; ; ; ; ; ; ; ; ! ! ! ! ! !       > > > > > > > > ( ( ( ( ( (            > = = = = = = = 0 0 0 0 0 0 0 0 ; ; ; ; ; ; ; ; ; ;              K J J J J J J J J J J J ) ) ) ) ) ) 
	8	$	$5!6+X 6+	 6+c 6+ 6+ 6+ 6+r))$) ) 	)
 8S#) ) ) )> !!!%) $7;$#'(,+/!% #'1d; d;d;	#d; d; 	d;
 d; d; d; d; c]d; d; d; d; d; d; %T#s(^4d;  !d;" #d;$ C=%d;& CH%'d;( s3x()d;* +d;, #-d;. /d;0 i 1d;2 eHd38n,-.3d; d; d; d;R #!! %) $7;$#'(,+/"&!% #'55 55	#5 }5 	5
 5 5 5 5 5 5 c]5 5 5 5 5  !5" %T#s(^4#5$ %5& '5( C=)5* CH%+5, s3x(-5. 3-/50 #152 354 i 556 c]75 5 5 5 5 5r[   