
    %$}g2              	          d Z ddlZddlZddlmZ ddlmZmZmZm	Z	m
Z
mZ ddlmZ ddlmZmZ h dZh d	Zh d
Z ej        dej                  Z ej        dej                  Z ej        d          Z ej        dej                  Z ej        dej                  Z ej        dej                  Z ej        dej                  Z ej        d          Z ej        dej                  Z ej        d          Z dZ! ej        dej                  Z" ej        dej                  Z#de"fde#fgZ$ ej        dej                  Z% ej        d          Z& ej        d          Z' ej        d          Z( ej        d          Z) ej        d           Z* ej        d!ej                  Z+ ej        d"          Z, ej        d#ej                  Z- ej        d$ej.                  Z/d8d%ed&ed'e	e0         d(e1fd)Z2d*ed%ed(efd+Z3d,eee         ee0e0f         f         d%ed(efd-Z4d.e0d/e
e0         d(e	e0         fd0Z5d1e0d%ed(efd2Z6d3e0d(e0fd4Z7d5e	e0         d6e0d(e	e0         fd7Z8dS )9z
Functions needed to scrape metadata from JSON-LD format.
For reference, here is the list of all JSON-LD types: https://schema.org/docs/full.html
    N)unescape)AnyDictListOptionalPatternUnion   )Document)HTML_STRIP_TAGStrim>
   articleblogpostingnewsarticleliveblogpostingscholarlyarticleopinionnewsarticlesocialmediapostingreportagenewsarticlebackgroundnewsarticlemedicalscholarlyarticle>!   blogqapagereportr   faqpagewebpagewebsiteitempage	aboutpage
jobpostingr   contactpager   profilepagetecharticlecheckoutpagecollectionpagemedicalwebpager   satiricalarticler   realestatelistingreviewnewsarticlesearchresultspager   r   analysisnewsarticleaskpublicnewsarticler   r   discussionforumpostingr   advertisercontentarticle>   r   r   organizationnewsmediaorganizationzM"author":[^}[]+?"name?\\?": ?\\?"([^"\\]+)|"author"[^}[]+?"names?".+?"([^"]+)z$"[Pp]erson"[^}]+?"names?".+?"([^"]+)z`,?(?:"\w+":?[:|,\[])?{?"@type":"(?:[Ii]mageObject|[Oo]rganization|[Ww]eb[Pp]age)",[^}[]+}[\]|}]?z,"publisher":[^}]+?"name?\\?": ?\\?"([^"\\]+)z"@type"\s*:\s*"([^"]*)"z"articleSection": ?"([^"\\]+)z"author":|"person":)flagsz<[^>]+>z^https?://schema\.orgz\\u([0-9a-fA-F]{4}))	givenNameadditionalName
familyNamez*"@type":"[Aa]rticle", ?"name": ?"([^"\\]+)z"headline": ?"([^"\\]+)z"name"z
"headline"uB   ^([a-zäöüß]+(ed|t))? ?(written by|words by|words|by|von|from) z\d.+?$z@[\w]+z[._+]u$   ["‘({\[’\'][^"]+?[‘’"\')\]}]u   [^\w]+$|[:()?*$#!%/<>{}~¿]u;   \b\s+(am|on|for|at|in|to|from|of|via|with|—|-|–)\s+(.*)z3\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Z|a-z]{2,}\bz"/|;|,|\||&|(?:^|\W)[u|a]nd(?:$|\W)u>   [✀-➾😀-🙏☀-⛿🌀-🗿🤀-🧿🩰-🫿🚀-🛿]+metadata	candidatecontent_typereturnc                    |rt          |t                    rl| j        r+t          | j                  t          |          k     r|dk    rdS | j        r1| j                            d          r|                    d          sdS dS )z6Determine if the candidate should be used as sitename.r   ThttpF)
isinstancestrsitenamelen
startswith)r5   r6   r7   s      Y/var/www/py-google-trends/myenv/lib/python3.11/site-packages/trafilatura/json_metadata.pyis_plausible_sitenamerA   9   s     Z	3//   	S):%;%;c)nn%L%LQ]ajQjQj4 	!2!=!=f!E!E 	iNbNbciNjNj 	45    parentc                    t          d|           D ]W}d|v rd|d         v r|d         d         |_        d|vs|d         s1t          |d         t                    r|d         d         n|d         }|                                }|t
          v r|j        st          |          |_        |t          v rX|	                    d          p)|	                    d          p|	                    d          }t          |||          r||_        |dk    rR|	                    d          r;|d                             d	          s t          |j        |d                   |_        S|t          v rd
|v rg|d
         }t          |t                    rE	 t!          j        |          }n/# t           j        $ r t          |j        |          |_        Y nw xY wt          |t                    s|g}|D ]dvsd         dk    rd}dv ru	                    d          }t          |t                    r)d                    |                              d          }nPt          |t*                    rd|v r|d         }n.dv r*dv r&d                    fdt,          D                       }t          |t                    rt          |j        |          |_        |j        sVd|v rRt          |d         t                    r|d         g|_        n(t          t          d|d                             |_        |j        s*d|v r|dk    r|d         |_        Fd|v r|d         |_        Y|S )z3Find and extract selected metadata from JSON parts.N	publishername@typer   	legalNamealternateNamepersonr:   authorPerson; r2   r4    c              3   0   K   | ]}|v |         V  d S N ).0xrK   s     r@   	<genexpr>z!process_parent.<locals>.<genexpr>x   s3      2b2bVW[aVaVa6!9VaVaVaVa2b2brB   articleSectionr   headline)filterr=   r;   listlowerJSON_OGTYPE_SCHEMApagetypenormalize_jsonJSON_PUBLISHER_SCHEMAgetrA   r?   normalize_authorsrK   JSON_ARTICLE_SCHEMAr<   jsonloadsJSONDecodeErrorjoinstripdictAUTHOR_ATTRS
categoriestitle)rC   r5   contentr7   r6   list_authorsauthor_namerK   s          @r@   process_parentrm   C   s   $'' D9 D9'!!f0D&D&D ' 4V <H'!!)9! /99I4.P.Pfww'**V]^eVf#))++ ---h6G- .| < <H000F++gw{{;/G/Gg7;;WfKgKgI$Xy,GG .$-!X%%{{6"" V76?+E+Ef+M+M V"3HOWV_"U"U0007""&x0lC00 [['+z,'?'?/ [ [ [*;HO\*Z*Z[ ",55 2$0>L* ^ ^Ff,,w80K0K&*!V++*0**V*<*<K)+t<< B.2ii.D.D.J.J4.P.P!+K!>!> B6[CXCX.9&.A(F22|v7M7M*-((2b2b2b2bl2b2b2b*b*bK%k377 ^.?Q\.].]HO & X+;w+F+Fg&67== X+23C+D*EH''*.vdGDT<U/V/V*W*WH' > 9W$$)B)B%,V_HNN7**%,Z%8HNOs   F'')GGschemac                 *   t          | t                    r| g} | D ]}|                    d          }|rt          |t                    rt                              |          rd|v r-t          |d         t                    r|d         n|d         g}nnd|v rht          |d         t                    rMd|d                                         v r1d|v r-t          |d         t                    r|d         n|d         g}n| }t          ||          }|S )z,Parse and extract metadata from JSON-LD dataz@contextz@graphrG   r   liveBlogUpdate)	r;   rf   r^   r<   JSON_SCHEMA_ORGmatchrX   rY   rm   )rn   r5   rC   contexts       r@   extract_jsonrt      sG   &$  8 8**Z(( 	8z'3// 	8O4I4I'4R4R 	86!!-7x8H$-O-Og))V\]eVfUgF""z&/3'G'G"L]aghoapavavaxaxLxLx  ~N  RX  ~X  ~X5?GW@XZ^5_5_ 011flm}f~e%fh77HOrB   elemtextregular_expressionc                     d}|                     |           }|rYd|d         v rOt          ||d                   }|                    d| d          } |                     |           }|r
d|d         v O|pdS )z.Crudely extract author names from JSON-LD dataNrN   r
    )count)searchr_   sub)ru   rv   authorsmymatchs       r@   extract_json_authorr~      s    G ''11G
 6cWQZ''#GWQZ88%))#xq)AA$++H55  6cWQZ'' ?drB   elemc                    t                               d|           }t          |t                    pt          |t                    }|r||_        d| v rSt                              |           }|r7t          |d         	                                          }|t          v r||_        d| v rRt                              |           }|r6d|d         vr,t          |d                   }t          ||          r||_        d| v r7t                              |           }|rt          |d                   g|_        t"          D ]C\  }}|| v r:|j        s3|                    |           }|rt          |d                   |_         nD|S )z*Crudely extract metadata from JSON-LD datarx   rG   r
   z"publisher",z"articleSection")JSON_AUTHOR_REMOVEr{   r~   JSON_AUTHOR_1JSON_AUTHOR_2rK   	JSON_TYPErz   r\   rY   rZ   r[   JSON_PUBLISHERrA   r=   JSON_CATEGORYrh   JSON_SEQri   )r   r5   element_text_authorrK   r}   r6   keyregexs           r@   extract_json_parse_errorr      s    -00T:: !4mDD E !4mDD  !  $""4(( 	.&wqz'7'7'9'9::I...$-!  ''-- 	.s'!*,,&wqz22I$Xy99 .$-! T!!&&t,, 	?#1'!*#=#=">H   
U$;;x~;ll4((G !/
!;!;OrB   stringc                 j   d| v r|                      dd                               dd                               dd          } t                              d |           } d                    d | D                       } t	          |           } t          t                              d|                     S )z-Normalize unicode strings and trim the output\z\nrx   z\rz\tc                 H    t          t          | d         d                    S )Nr
      )chrint)rr   s    r@   <lambda>z normalize_json.<locals>.<lambda>   s    Ca"<M<M8N8N rB   c              3   f   K   | ],}t          |          d k     st          |          dk    (|V  -dS )i   i  N)ordrR   cs     r@   rT   z!normalize_json.<locals>.<genexpr>   s:      QQqc!ffvooQ&QQrB   )replaceJSON_UNICODE_REPLACEr{   rd   r   r   JSON_REMOVE_HTML)r   s    r@   r\   r\      s    v~~r**225"==EEeRPP%))*N*NPVWWQQFQQQQQ&!! $$R00111rB   current_authorsauthor_stringc                 l   g }|                                                     d          st                              |          r| S | |                     d          }d|v r'|                                                    d          }d|v sd|v rt          |          }t          j	        d|          }t                              |          D ]t                    t          	                    d          t          	                    d          t          t          	                    d	                    t          	                    d          t           	                    d          t"          	                    d          t$          	                    d          t&          	                    d          rt)                    d
k    r
d	vrdvrd                                         rt-          d D                       dk     r                                |vrCt)          |          dk    st1          fd|D                       r|                               t)          |          dk    r| S d                    |                              d          S )z3Normalize author info to focus on author names onlyr:   NrM   z\uunicode_escapez&#z&amp;rx   rN   2   -r   c              3   B   K   | ]}|                                 d V  dS )r
   N)isupperr   s     r@   rT   z$normalize_authors.<locals>.<genexpr>  s/      )K)Kqyy{{)K!)K)K)K)K)K)KrB   r
   c              3       K   | ]}|vV  	d S rP   rQ   )rR   
new_authorrK   s     r@   rT   z$normalize_authors.<locals>.<genexpr>  s*      F~F~dnzY_G_F~F~F~F~F~F~rB   )rY   r?   AUTHOR_EMAILrr   splitencodedecoder   r   r{   AUTHOR_SPLITr   AUTHOR_EMOJI_REMOVEAUTHOR_TWITTERAUTHOR_REPLACE_JOINAUTHOR_REMOVE_NICKNAMEAUTHOR_REMOVE_SPECIALAUTHOR_PREFIXAUTHOR_REMOVE_NUMBERSAUTHOR_REMOVE_PREPOSITIONr>   r   sumri   allappendrd   re   )r   r   new_authorsrK   s      @r@   r_   r_      s   K''// <3E3Em3T3T "%++D11%,,..556FGG}= 8 8 //#'M::M$$]33 ' 'f$((V44##B//)--c6::;;'++B77&**2v66""2v..&**2v66*..r6::  	#f++++60A0AcQWFWFWay  "" 	$c)K)KV)K)K)K&K&Ka&O&O\\^^F$$#k*:*:a*?*?3F~F~F~F~r}F~F~F~C~C~*?v&&&
;199[!!''---rB   rP   )9__doc__ra   rehtmlr   typingr   r   r   r   r   r	   settingsr   utilsr   r   r`   rZ   r]   compileDOTALLr   r   r   r   r   r   
IGNORECASE
JSON_MATCHr   rq   r   rg   	JSON_NAMEJSON_HEADLINEr   r   r   r   r   r   r   r   r   r   UNICODEr   r<   boolrA   rm   rt   r~   r   r\   r_   rQ   rB   r@   <module>r      s5   
  				       < < < < < < < < < < < < < < < <       ( ( ( ( ( ( ( ( _  _  _  [	  [	  [	 WWW 
kmomvww
BBINNRZ  !D  E  E KRYWWBJ129==	
;RYGGRZ.bmDDD
2:j)) "*5R]KKK!rz"899 <BJDbiPP	
5ryAAy!L-#@A
`hjhuvvv"
9-- I&& bj** #$KLL "
#ABB &BJ'emomz{{{ rzPQQrz?r}UUU bj	 
	 	 	  H  HUXM ei    G3 G( Gx G G G GTtCy$sCx.89 X RZ    ,# 73< HUXM    '3 '( 'x ' ' ' 'T23 23 2 2 2 2*.x} *.S *.XVY] *. *. *. *. *. *.rB   