
    %$}g[                        d Z ddlZddlZddlmZ ddlmZ ddlmZ ddl	m
Z ddlmZ ddlmZmZ dd	lmZmZmZmZmZmZmZ d
dlmZmZ d
dlmZmZmZ  ej         e!          Z" ed          Z# e$ ee%          j&        dz  dz            Z'h dZ(h dZ)da*ddhZ+h dZ, ed          Z-h dZ.h dZ/h dZ0h dZ1g dZ2ddddd Z3d!Z4dOd"ed#e5d$dfd%Z6dPd"ed'e5d$dfd(Z7d)ed$efd*Z8d)ed$efd+Z9dOd,ed-e5d$e$fd.Z:d)ed$efd/Z;d,ed$efd0Z<d1ed2ed$e$fd3Z=d4ed,ed$dfd5Z>d,ed$efd6Z?d7ed8ee$         d$efd9Z@d7ed$e5fd:ZAd"ed'e5d$e$fd;ZBd"ed<ee$         d'e5d$dfd=ZCd>ee         d'e5d$e$fd?ZDd@dAdBd1ed'e5dCe$dAe$d$e$f
dDZEd,ed$efdEZFd,ed$e$fdFZGdGed,ed$efdHZHd"ed$dfdIZId"ed$dfdJZJd"ed$efdKZKdLed$dfdMZLd"ed$dfdNZMdS )QzE
All functions related to XML generation, processing and validation.
    N)unescape)version)StringIO)dumps)Path)ListOptional)_ElementElement
SubElement	XMLParser
fromstringtostringDTD   )Document	Extractor)sanitizesanitize_treetext_chars_testtrafilaturadataztei_corpus.dtd>   pabhilbdeldivrefrowbodycellcodeheaditemlistquotetablegraphic>   rendroletypetarget	renditionr   r   >   r   r   r&   r'   r(   T)remove_blank_text>	   r   r   r    r#   r$   r&   r'   r(   r)   >   r   r   r   r$   >	   r   r   r   r    r"   r$   r%   r&   r)   >   r"   r%   noter'   figure)sitenametitleauthordateurlhostnamedescription
categoriestagslicenseidfingerprintlanguagez***__`)z#bz#iz#uz#ti  element	keep_tailreturnc                     |                                  }|dS |rJ| j        rC|                                 }||j        pd| j        z   |_        n|j        pd| j        z   |_        |                    |            dS )z
    Removes this element from the tree, including its children and
    text. The tail text is joined to the previous element or parent.
    N )	getparenttailgetprevioustextremove)rB   rC   parentpreviouss       O/var/www/py-google-trends/myenv/lib/python3.11/site-packages/trafilatura/xml.pydelete_elementrO   6   s    
   F~ AW\ A&&((!;,"<FKK%]0bGL@HM
MM'    Finclude_formattingc                 D   |                                  }|dS t          | |          }| j        
|| j        z  }|                                 }||j        r|j         d| n||_        n |j        |j         d| |_        n||_        |                    |            dS )zAMerge element with its parent and convert formatting to markdown.N )rG   replace_element_textrH   rI   rJ   rK   )rB   rQ   rL   	full_textrM   s        rN   merge_with_parentrV   I   s      F~$W.@AAI|W\!	""$$H:B-V8=669666Y		 22y22
MM'rP   treec                 6   |                      d          D ]}t          |          dk    rmt          |j                  du rWt          |j                  du rA|                                }|+|j        dk    r |j        dk    r|                    |           | S )z"Remove text elements without text.r?   r   FNr)   r#   )iterlenr   rJ   rH   rG   tagrK   )rW   rB   rL   s      rN   remove_empty_elementsr\   ^   s    99S>> ' 'w<<1!>!>%!G!GO\c\hLiLimrLrLr&&((F !gkY&>&>6:QWCWCWg&&&KrP   c                    t          |                     d                    D ][}|                    ddd          D ]A}|j        |j        k    r/|                                j        t
          vrt          |           B\| S )z/Prevent nested tags among a fixed list of tags.z.//head | .//code | .//pr#   r$   r   )reversedxpathiterdescendantsr[   rG   NESTING_WHITELISTrV   )rW   elemsubelems      rN   strip_double_tagsrd   j   s    $>??@@ + +++FFC@@ 	+ 	+G{dh&&7+<+<+>+>+BJ[+[+[!'***	+ KrP   docmetawith_metadatac                 p    |r fd j         D             }|                    |                    d          |                    d          |                    d          d                    |                    d          pg           d                    |                    d          pg           t	          |                    d          d	
          d           |                    d          }ndt	           j        d	
          i} j        }t	          |d	
          |d<   t          |d	          S )z0Build JSON output based on extracted informationc                 4    i | ]}|t          |d           S )N)getattr).0slotre   s     rN   
<dictcomp>z%build_json_output.<locals>.<dictcomp>v   s'    WWWTdGGT488WWWrP   r6   r2   r8   ;r9   r:   r!   F)rQ   )sourcezsource-hostnameexcerptr9   r:   rJ   commentsbodyrJ   comments)ensure_ascii)	__slots__updatepopjoinxmltotxtr!   rp   
json_dumps)re   rf   
outputdictrp   s   `   rN   build_json_outputrz   s   s2    ,WWWWWEVWWW
 nnU++)~~j99!~~m44((:>>,#?#?#E2FFHHZ^^F339r::Z^^F33NNN
 
 	 	 	 "~~n55hw|NNNO
+%luMMMJzju5555rP   c                     |                      d          D ])}|j        t          vr|j                                         *| S )zRemove unnecessary attributes.r?   )rY   r[   WITH_ATTRIBUTESattribclear)rW   rb   s     rN   clean_attributesr      sB    		#    8?**KKrP   c                    t          d          }t          ||            d| j        _        |                    t          | j                             d| j        _        |                    t          | j                             |S )z4Build XML output tree based on extracted informationdocmainrq   )r   add_xml_metar!   r[   appendr   rp   re   outputs     rN   build_xml_outputr      st    U^^F!!!GL MM"7<00111)G
MM"7#788999MrP   documentoptionsc                    t          | j                   t          | j                   |j        dk    rt          nt
          } ||           }t          |          }t          t          |d          t                    }|j        dk    r5|j
        r.t                              dt          |          |j                   t          |dd                                          S )z9Make sure the XML output is conform and valid if requiredxmlunicode)encodingxmlteizTEI validation result: %s %sT)pretty_printr   )rd   r!   r\   formatr   build_tei_outputr   r   r   CONTROL_PARSERtei_validationLOGGERdebugvalidate_teirn   strip)r   r   funcoutput_trees       rN   control_xml_outputr      s    hm$$$(-(((&~66<LD$x..K,,KXkIFFFWWK ~!!g&<!3\+5N5NPWP^___KdYGGGMMOOOrP   r   c                     t           D ]U}t          ||d          }|r@|                     |t          |t                    r|nd                    |                     VdS )z-Add extracted metadata to the XML output treeNrm   )META_ATTRIBUTESri   set
isinstancestrrv   )r   re   	attributevalues       rN   r   r      sl    $ X X	D11 	XJJy:eS+A+A"V%%sxxPUWWWX XrP   c                 N    t          |           }t          || j                  }|S )z8Build TEI-XML output tree based on extracted information)write_teitree	check_teir6   r   s     rN   r   r      s)     7##F vw{++FMrP   xmldocr6   c                    |                      d          D ]}d|_        |                    dd           |                                }|6t	          |          dk    r't          |          }|                    ||           |}|j        dk    rt          |           |                     d          D ]=}|j	        r4|j	        
                                rd|j	        dc|_        |_        |_	        >|                     d	          D ]}|j        t          vr1t                              d
|j        |           t          |           A|j        t           v rt#          |           n)|j        dk    rt%          |           t'          |           d |j        D             D ]>}t                              d||j        |           |j                            |           ?| S )zCCheck if the resulting XML file is conform and scrub remaining tagsr$   r   r,   headerNr   r   z.//text/body//div/lbz.//text/body//*z"not a TEI element, removing: %s %sr   c                 $    g | ]}|t           v|S  )TEI_VALID_ATTRS)rj   as     rN   
<listcomp>zcheck_tei.<locals>.<listcomp>   s"    MMMA_4L4L!4L4L4LrP   z0not a valid TEI attribute, removing: %s in %s %s)rY   r[   r   rG   rZ   _tei_handle_complex_headreplace_move_element_one_level_upfindallrH   r   rJ   TEI_VALID_TAGSr   warningrV   TEI_REMOVE_TAIL_handle_unwanted_tails!_handle_text_content_of_div_nodes_wrap_unwanted_siblings_of_divr}   ru   )r   r6   rb   rL   new_elemr   s         rN   r   r      s    F## - -"""!!>t99q==/55HNN4***D:&t,,,566 B B9 	B** 	B-0$)T*DHdi011 ' '8>)) NN?3OOOd###8&&"4((((X-d333*4000 NMT[MMM 	' 	'INNMyZ^ZbdghhhKOOI&&&&	' MrP   c                     t           t          t                    a t                               |           }|du r*t                              dt           j        j                   |S )zUCheck if an XML document is conform to the guidelines of the Text Encoding InitiativeNFznot a valid TEI document: %s)TEI_DTDr   
TEI_SCHEMAvalidater   r   	error_log
last_error)r   results     rN   r   r      sP     j//f%%F5w7H7STTTMrP   c                    | j         pd}|r| j         r| j        dk    rN	 t          |                     d          d                   }n# t          t
          f$ r d}Y nw xY wd|z   d| }n{| j        dk    rd	| d	}ni| j        d
k    r=|                     d          }|t          v rt          |          | t          |          }n!| j        dk    rd| j         v rd| d}nd| d}| j        dk    rm|rJd| d}|                     d          }|r	| d| d}nEt                              d|| j	                   |}n!t                              d|| j	                   | j        dk    rF|rDt          |           dk    r1| d         j        dk    r|                                 | dnd| d}n=| j        dk    r|r|                                 | nd| }n| j        dk    r|rd| d}|S )zeDetermine element text based on just the text of the element. One must deal with the tail separately.rF   r$   r*   r      #rS   r   z~~r   r#   
z```
z
```rA   r   []r-   ()zmissing link attribute: %s %s'zempty link: %s %sr"   r   r   Nz| r%   z- )rJ   r[   intget	TypeError
ValueErrorHI_FORMATTINGr   r   r}   rZ   rI   )rB   rQ   	elem_textnumberr*   	link_textr-   s          rN   rT   rT      s   "I -gl -;&  W[[00344z*   <55)55II[E!!*Y***II[D  ;;v&&D}$$,T2TIT}T?RTT	[F""w|##4I444		,	,,,	{e 		K(I(((I[[**F &(446444		?GN[[[%		NN.	7>JJJ{fs7||a/?/?1:>S  +2+>+>+@+@+L9RcW`RcRcRcI			9	&-&9&9&;&;&GyNNM]R[M]M]					9	&&&&	s   (A AA
returnlistc           	      2   | j         r#|                    t          | |                     | D ]}t          |||           | j         s| j        s| j        dk    rs|                     dd           d|                     dd           }|                    d|                                 d|                     dd           d	           n| j        t          v r| j        d
k    rt          | 
                    d                    }|                     d          p|                     d          }|r|                                sd}n"t          t          |          t                    }||k     r|                    d||z
  z   d           | 
                    d          r|                    dd|z   d           n#|                    d           n| j        dk    rdS | j        t          v r:| 
                    d          s%|                    |r| j        d
k    rdnd           nD| j        dk    r|                    d           n#| j        t          vr|                    d           | j        r|                    | j                   dS dS )zYRecursively convert a LXML element and its children to a flattened string representation.r)   r3   rF   rS   altz![z](srcr   r    z.//cellcolspanspanr   |r   z./cell[@role='head']z
|z---|r"   Nzancestor::cellu   
␤
z | )rJ   r   rT   process_elementrH   r[   r   r   NEWLINE_ELEMSrZ   r_   isdigitminr   MAX_TABLE_WIDTHSPECIAL_FORMATTING)rB   r   rQ   childrJ   
cell_count	span_infomax_spans           rN   r   r   ,  s   | M.w8JKKLLL ? ?z+=>>>><  ;)##kk'2..IIUB1G1GIIDL4::<<LL7;;ub3I3ILLLMMMM[M)){e## y!9!9::
#KK	22Igkk&6I6I	  D	(9(9(;(; D HH"3y>>?CCH((%%:0E)F&J&J&JKKK==!788 C%%&AFX,=&A&A&ABBB!!$''''[F"" F
 {m##GMM:J,K,K#*<_PUAUAU,,[_````			%    	.	.	.# | (','''''( (rP   	xmloutputc                     | dS g }t          | ||           t          t          d                    |                    pd          S )zLConvert to plain text format and optionally preserve formatting as markdown.NrF   )r   r   r   rv   )r   rQ   r   s      rN   rw   rw   b  sL    rJIz+=>>>HRWWZ00117R888rP   	null)delimr   r   c                   t          | j        |          p}t          | j        |          p}t                      }t	          j        ||t          j                  }|                    fd| j        | j	        | j
        | j        | j        | j        | j        ||| j        | j        fD                        |                                S )zAConvert the internal XML document representation to a CSV string.)	delimiterquotingc                     g | ]}|r|n	S r   r   )rj   dr   s     rN   r   zxmltocsv.<locals>.<listcomp>y  s.       +11t   rP   )rw   r!   rp   r   csvwriterQUOTE_MINIMALwriterowr6   r<   r=   r7   r3   imager5   r;   pagetypegetvalue)r   rQ   r   r   posttextcommentstextr   outputwriters      `    rN   xmltocsvr   n  s     '9::BdHH13EFFN$L ZZF:fs?PQQQL     6       ??rP   c                    t          dd          }t          ||            t          |d          }t          |d          }t          | j                  }d|_        |                    dd           |                    |           t          | j                  }d|_        |                    dd	           |                    |           |S )
z6Bundle the extracted post and comments into a TEI treeTEIzhttp://www.tei-c.org/ns/1.0)xmlnsrJ   r!   r   r,   entryrq   )	r   write_fullheaderr   r   r!   r[   r   r   rp   )re   teidoctextelemtextbodypostbodyrp   s         rN   r   r     s    U"?@@@FVW%%%&&))H(F++H--HHLLL!!!OOH#G$899LLVZ(((OOL!!!MrP   c                     | j         r,| j        r%| j                                         d| j          d}nZ| j         p| j        pd}t                              t
          j                  r&|dk    r t                              d| j                   |S )z5Construct a publisher string to include in TEI headerz (r   zN/Azno publisher for URL %s)	r7   r2   r   r   isEnabledForloggingWARNINGr   r6   )re   	publishers     rN   _define_publisher_stringr    s     CG, C'--//FF73CFFF		$A(8AE	w// 	CI4F4FNN4gkBBBrP   r   c                    t          | d          }t          |d          }t          |d          }|j        t          |dd          _        |j        r|j        t          |d          _        t          |d          }t	          |          }|j        r@|t          |d	          _        t          |d
          }|j        t          |d          _        nt          |d           t          |d          }|j        r|j        t          |dd          _        |j        t          |dd          _        t          |d          }	t          |	d          }
d                    t          d|j
        |j        g                    }|s t                              d|j                   d                    t          d|j        |g                    |
_        |t          |	dd          _        t          |	d          }t          |d          }|j        t          |dd          _        |j        r|j        t          |d          _        t          |d          }|t          |d	          _        |j        rt          |dd|j                   |j        t          |d          _        t          |d          }t          |d          }|j        t          |d          _        |j        s|j        rt          |d          }t          |d          }|j        r/d                    |j                  t          |d d!          _        |j        r/d                    |j                  t          |d d"          _        t          |d#          }|j        t          |dd$          _        t          |d%          }t          |d&          }t          |d't&          d()          }d(t          |d*          _        t          |dd+,           |S )-z+Write TEI header based on gathered metadata	teiHeaderfileDesc	titleStmtr3   r   )r,   r4   publicationStmtr  availabilityr   	notesStmtr0   r<   r=   
sourceDescbiblz, Nzno sigle for URL %ssiglebiblFullptrURL)r,   r-   r5   profileDescabstract	textClasskeywords,termr9   r:   creationdownloadencodingDescappInfoapplicationTrafilatura)r   identlabelz$https://github.com/adbar/trafilatura)r-   )r   r3   rJ   r4   r  r;   r<   r=   rv   filterr2   r5   r   r   r6   r8   r9   r:   filedatePKG_VERSION)r   re   r   filedescbib_titlestmtpublicationstmt_apublisher_stringr	  	notesstmt
sourcedescsource_biblr  biblfullpublicationstmtprofiledescr  	textclassr  r  encodingdescappinfor  s                         rN   r   r     s    ,,F&*--Hx55M;B=J}gF3338~ B3:>
=(++0"8->??/88 +:J
$k227!"3^DD-4_
<%%** 	$c***8[11Iz C8?

9f40005=D=PJy&}555:Hl33JZ00KIIfTG$4gl#CDDEEE ;,gk:::yyw}e.D!E!EFFK8=Jz60005*j11Hx55M;B=J}gF3338~ B3:>
=(++0 +<==O4DJ,,1{ K?EgkJJJJ/6|J'',V]33K+z22H%,%8Jx" TW\ T{K88	i44 	`CF88GL^C_C_Jxl;;;@< 	T=@XXgl=S=SJxf555:+z22H9@9IJxj1116fn55Ly11GWm[P]^^^K,9J{G$$){E*PQQQQMrP   c                    | j         r| j                                         rt          |           dk    rH| d         j        dk    r7| j          d| d         j         pd                                 | d         _         n1t	          d          }| j         |_         |                     d|           d| _         | j        r| j                                        rt          |           dk    rH| d         j        dk    r7| d         j         pd d| j                                         | d         _         n0t	          d          }| j        |_         |                     |           d| _        dS dS dS )z@Wrap loose text in <div> within <p> elements for TEI conformity.r   r   rS   rF   N)rJ   r   rZ   r[   r   insertrH   r   )rB   	new_childs     rN   r   r     sa   | **,, w<<!
# 5 5!(GG
0E2GGMMOOGAJOOI$\INNN1i(((| **,, w<<!3 6 6")"+"2"8bII7<IIOOQQGBKI$\INNN9%%%   rP   c                    | j         r| j                                         nd| _         | j         sdS | j        dk    r5d                    t	          d| j        | j         g                    | _        n]t          d          }| j         |_        |                                 }|,|                    |	                    |           dz   |           d| _         dS )z Handle tail on p and ab elementsNr   rS   r   )
rH   r   r[   rv   r  rJ   r   rG   r1  index)rB   new_siblingrL   s      rN   r   r     s    +2<A7<%%'''TGL< {cxxtglGL-I J JKKcll"<""$$MM&,,w//!3kBBBGLLLrP   c                    t          d| j                  }| j        r| j                                        nd|_        |                                 D ]}|j        dk    rjt          |          dk    s|j        rCt          |          dk    s|d         j        rt          |d           |j        |d         _        j|j        |_        w|	                    |           | j        r| j                                        nd}|r||_        |S )z0Convert certain child elements to <ab> and <lb>.r   )r}   Nr   r   r0  r   )
r   r}   rJ   r   iterchildrenr[   rZ   rH   r   r   )rB   new_elementr   rH   s       rN   r   r     s   $w~666K/6|Ew|))+++K%%'' 
& 
&9;!##{'7#{##q((KO,@({D111',zB$$#(:  u%%%%#*<97<TD  rP   div_elementc                    t          d          }d}|                                 }|dS |                                 D ]}|j        dk    r nx|j        t          v r-|p|                    |          }|                    |           J|r:t          |          dk    r'|                    ||           t          d          }d}|r+t          |          dk    r|                    ||           dS dS dS )z=Wrap unwanted siblings of a div element in a new div element.r   Nr   )	r   rG   itersiblingsr[   TEI_DIV_SIBLINGSr4  r   rZ   r1  )r9  r5  new_sibling_indexrL   siblings        rN   r   r   )  s   %..K""$$F~++-- ) );%E;*** 1 JV\\'5J5Jw'''' ! )S%5%5%9%9/===%enn$(! 6S--22'555556 622rP   c                    |                                  }||                                 nd}||dS t          d          }|                    t          |                                                      |                    |                    |          dz   |            | j        r| j                                        nd}|r||_	        d| _        |j        r|j                                        nd}|r||_        d|_        t          |          dk    s|j	        s|j        r,|                    |                    |           dz   |           t          |          dk    r|j	        s|                    |           dS dS dS )z
    Fix TEI compatibility issues by moving certain p-elems up in the XML tree.
    There is always a n+2 nesting for p-elements with the minimal structure ./TEI/text/body/p
    Nr   r   r   )rG   r   extendr&   r;  r1  r4  rH   r   rJ   rZ   rK   )rB   rL   grand_parentr   rH   s        rN   r   r   B  s   
   F)/);6##%%%L~-s||HOOD--//00111**622Q6@@@#*<97<TD "(+76;4D 
8}}qHMX]L..w77!;XFFF
6{{aF##### rP   )T)F)N__doc__r   r   htmlr   importlib.metadatar   ior   jsonr   rx   pathlibr   typingr   r	   
lxml.etreer
   r   r   r   r   r   r   settingsr   r   utilsr   r   r   	getLogger__name__r   r!  r   __file__rL   r   r   r   r   r   r<  r   r   r   r|   ra   r   r   r   boolrO   rV   r\   rd   rz   r   r   r   r   r   r   r   rT   r   rw   r   r   r  r   r   r   r   r   r   r   rP   rN   <module>rP     s    


        & & & & & &       $ $ $ $ $ $       ! ! ! ! ! ! ! !3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 * ) ) ) ) ) ) ) ; ; ; ; ; ; ; ; ; ; 
	8	$	$gm$$ Sh&/2BBCC
M M MAAA
+888 T222WWW111 XXX???    3d#>> H      & x T d    *	 	X 	 	 	 	H     6 6x 6 6 6 6 6 6,8     h 8    P PI P# P P P P&X XH X X X X Xh 8    'h 'Xc] 'x ' ' ' 'T d    ,( , , , , , ,^3(X 3(49 3(RV 3([_ 3( 3( 3( 3(l	9* 	9 	9 	9 	9 	9 	9 LP]c   x T S WZ hk    68     &h 3    DX D DX D D D DNx D    *H     "h 8    *6 6T 6 6 6 62$ $T $ $ $ $ $ $rP   