
    %$}gJ                        d Z ddlZddlZddlZddlmZ ddlmZ ddlm	Z	m
Z
mZmZmZmZmZ ddlmZmZmZmZmZ ddlmZ ddlmZ dd	lmZmZ d
dlmZ d
dlm Z m!Z!m"Z"m#Z# d
dl$m%Z%m&Z& d
dl'm(Z(m)Z)m*Z*m+Z+ d
dl,m-Z-m.Z.m/Z/m0Z0m1Z1 dgZ2 ej3        e4          Z5 ej3        d          6                    ej7                    ej8        d          Z9 ej8        d          Z: ej8        d          Z; ej8        d          Z< ej8        d          Z= ej8        dej>                  Z?h dZ@h dZAh dZBh dZCh dZDddhZEh dZFd d!hZGd"d#hZHh d$ZId%d&d'd(d(d(d)d*ZJd+d,hZKg d-ZLd.eMd/eMfd0ZNd1eMd2eeM         d/eeM         fd3ZOd4ed5e%d/e%fd6ZPd4ed/e
eMeeM         f         fd7ZQd4ed/e%fd8ZR	 dOd4ed:ee         d;eSd/eeM         fd<ZTd4ed/eeMeeM         eeM         f         fd=ZUd4ed/eeM         fd>ZVd4ed/eeM         fd?ZWdPd4ed@eeM         d/eeM         fdAZXd4ed/eeM         fdBZYdCeMd4ed/eeM         fdDZZdQdFedGe[d/eeM         fdHZ\d4ed/eeM         fdIZ]	 	 	 	 dRdKeeeMf         d@eeM         dLee	         dMe[d2eeeM                  d/e%fdNZ^dS )SzH
Module bundling all functions needed to scrape metadata from webpages.
    N)deepcopy)unescape)AnyDictListOptionalSetTupleUnion)extract_domainget_base_urlis_valid_urlnormalize_urlvalidate_url)	find_date)XPath)HtmlElementtostring   )prune_unwanted_nodes)extract_jsonextract_json_parse_errornormalize_authorsnormalize_json)Documentset_date_params)HTML_STRIP_TAGSline_processing	load_htmltrim)AUTHOR_DISCARD_XPATHSAUTHOR_XPATHSCATEGORIES_XPATHSTAGS_XPATHSTITLE_XPATHSr   htmldatez$https?://(?:www\.|w[0-9]+\.)?([^/]+)z("(?:\\"|[^"])*")|\su5   ^(.+)?\s+[–•·—|⁄*⋆~‹«<›»>:-]\s+(.+)$z["\']z=/(by-nc-nd|by-nc-sa|by-nc|by-nd|by-sa|by|zero)/([1-9]\.[0-9])zT(cc|creative commons) (by-nc-nd|by-nc-sa|by-nc|by-nd|by-sa|by|zero) ?([1-9]\.[0-9])?>   
dc.creator
dc:creatordcsext.authoratc-metaauthordc.creator.autparsely-authordcterms.creatorsailthru.authordcterms.creator.autshareaholic:article_author_namebylauthorauthorscreator	rbauthorscitation_authorarticle:author>   dc.descriptiondc:descriptiondcterms.abstractdcterms.descriptiontwitter:descriptionsailthru.descriptiondescription>
   dc.publisherdc:publisherdcterms.publishersailthru.publisher	copyright	publisher	rbpubnamecitation_journal_titletwitter:sitearticle:publisher>   parsely-tagsdcterms.subjectshareaholic:keywordstagskeywordscitation_keywords>   dc.titledcterms.titleparsely-titletwitter:titlesailthru.titleshareaholic:titletitlerbtitlefb_titleheadlinecitation_title	rbmainurltwitter:url>   twitter:imagetwitter:image:srcimageog:imageog:image:urlog:image:secure_urlr2   r7   rG   zapplication-name>   
http-equivcharsetpropertyrU   r>   sitenamer^   pagetype)zog:titlezog:descriptionzog:site_namer_   r`   ra   zog:typez	og:authorzog:article:author)z.//head//link[@rel="canonical"]z.//head//basez6.//head//link[@rel="alternate"][@hreflang="x-default"]rL   returnc                     t          t          |                     }|sdS t                              d|          } d                    t          d|                     d                              S )z!Remove special characters of tags z, N)r    r   CLEAN_META_TAGSsubjoinfiltersplit)rL   trimmeds     T/var/www/py-google-trends/myenv/lib/python3.11/site-packages/trafilatura/metadata.pynormalize_tagsrq      s_    8D>>""G rsG,,D99VD$**T"2"233444    r3   author_blacklistc                     d D             fd|                      d          D             }|r(d                    |                              d          S dS )z:Check if the authors string correspond to expected values.c                 6    h | ]}|                                 S  )lower).0as     rp   	<setcomp>z check_authors.<locals>.<setcomp>   s     <<<a		<<<rr   c                     g | ]>}|                                                                 v*|                                 ?S rv   )striprw   )rx   r2   rs   s     rp   
<listcomp>z!check_authors.<locals>.<listcomp>   sK       <<>>!!)999 	999rr   ;z; N)rn   rl   r|   )r3   rs   new_authorss    ` rp   check_authorsr      s{    <<+;<<<   mmC((  K
  2yy%%++D1114rr   treemetadatac                 4   |                      d          D ]}|j        s
t          t                              d|j                            }	 t          j        |          }t          ||          }]# t
          j        $ r t          ||          }Y ~w xY w|S )z,Parse and extract metadata from JSON-LD datazK.//script[@type="application/ld+json" or @type="application/settings+json"]z\1)
xpathtextr   JSON_MINIFYrk   jsonloadsr   JSONDecodeErrorr   )r   r   elemelement_textschemas        rp   extract_meta_jsonr      s    

U  
H 
H y 	%kooeTY&G&GHH	HZ--F#FH55HH# 	H 	H 	H/hGGHHH	HOs   $A33BBc                    t                               d          }|                     d          D ]}|                    d          |                    d          }}|re|                                sQ|t
          v r||t
          |         <   \|dk    rt          |          r||d<   w|t          v rt          d|          |d<   |S )	zESearch meta tags following the OpenGraph guidelines (https://ogp.me/))rU   r2   urlr>   re   r^   rf   z+.//head/meta[starts-with(@property, "og:")]rd   contentzog:urlr   Nr2   )	dictfromkeysr   getisspaceOG_PROPERTIESr   	OG_AUTHORr   )r   resultr   property_namer   s        rp   extract_opengraphr      s    ]]R F
 

HII 	D 	D!%*!5!5txx	7J7Jw 	D7??,, 	D--7>}]344(**|G/D/D* 'u)++#4T7#C#Cx  Mrr   c           	         t                                          t          |                     }t          |j        |j        |j        |j        |j        |j	        f          r|S g d}}| 
                    d          D ]t          j        d                    dd                                                    }|sAdj        v r                    dd                                          }|                    d          r|dk    r#|                    t'          |                     |t(          v rt+          |j        |          |_        |dk    r|j        p||_        |t,          v r|j	        p||_	        d	j        v r                    d	d                                          }|t.          v rt+          |j        |          |_        Y|t0          v r|j        p||_        r|t2          v r|j        p||_        |t4          v r|j        p||_        |t6          v sd
|v r|}|dk    r|j        st9          |          r	||_        |t:          v r"|                    t'          |                     dj        v rv                    dd                                          }|dk    rt+          |j        |          |_        Z|dk    r|j        p||_        p|dk    r|j        p||_        t          fdt<          D                       r=t>                               dtC          dd                                                     |j        p||_        ||_"        |S )z)Search meta tags for relevant informationNz.//head/meta[@content]ri   r   rd   og:zarticle:tagrH   nameztwitter:app:namer[   itempropr2   r>   rX   c              3   *   K   | ]}|j         vV  d S N)attrib)rx   keyr   s     rp   	<genexpr>zexamine_meta.<locals>.<genexpr>0  s*      >>CDK'>>>>>>rr   zunknown attribute: %sFunicode)pretty_printencoding)#r   	from_dictr   allrU   r2   r   r>   re   r^   iterfindr   rk   r   r|   r   rw   
startswithappendrq   PROPERTY_AUTHORr   METANAME_IMAGEMETANAME_AUTHORMETANAME_TITLEMETANAME_DESCRIPTIONMETANAME_PUBLISHERTWITTER_ATTRSr   METANAME_TAG
EXTRA_METALOGGERdebugr   rL   )	r   r   rL   backup_sitenamecontent_attrproperty_attr	name_attritemprop_attrr   s	           @rp   examine_metar      s    zz##$5d$;$;<<H NOL N	
	 	 
 /D 677 B B&*2txx	2/F/FGGMMOO 	 $$ HHZ44::<<M''.. --N<889999/11"3HO\"R"R"555$,$5$E!!.00!)!?<t{"",,2244IO++"3HO\"R"Rn,,!)!?<222'/';'K|$$000$,$5$E!!m++/AY/N/N". ]**  + .. +  ,l**N<889994;&& HHZ44::<<M(("3HO\"R"R-//'/';'K|$$*,,!)!?< >>>>:>>>>> 	LL'EIFFFLLNN   !)<_HHMOrr      expressions	len_limitc                 Z   |D ]} ||           }|D ][}t          d                    |                                                    }|r#dt          |          cxk     r|k     r	n U|c c S \t          |          dk    r)t                              d|t          |                     dS )zExtract meta information    r   z#more than one invalid result: %s %sN)r    rl   itertextlenr   r   )r   r   r   
expressionresultsr   r   s          rp   extract_metainfor   >  s    
 " 
 

*T"" 	 	D388DMMOO4455G 1s7||7777i77777w<<!LL5z3w<<   4rr   c                    d}|                      d          }|Nt          |                                          }t                              |          x}r||d         |d         fS t
                              d           |ddfS )z2Extract text segments out of main <title> element.ri   z.//head//titleNr   r   zno main title found)findr    text_contentHTMLTITLE_REGEXmatchr   r   )r   rU   title_elementr   s       rp   examine_title_elementr   Q  s     EII.//M ]//1122#))%0005 	-%(E!H,,
LL&'''$rr   c                    |                      d          }t          |          dk    r+t          |d                                                   }|r|S t	          | t
                    pd}|r|S t          |           \  }}}||fD ]}|rd|vr|c S |r|d                                         S 	 |                     d          d                                         }n*# t          $ r t          
                    d           Y nw xY w|S )zExtract the document titlez.//h1r   r   ri   .z.//h2zno h2 title found)findallr   r    r   r   r%   r   r   
IndexErrorr   r   )r   
h1_resultsrU   firstsecondts         rp   extract_titler   _  s2    g&&J
:!Z]//1122 	LT<006BE 066E5&V_   	AHHH ,!}))+++*

7##A&3355 * * *()))))*Ls   0-C $DDc                     t          t          |           t                    }t          |t          d          }|rt          d|          }|S )zExtract the document author(s)x   )r   N)r   r   r!   r   r"   r   )r   subtreer2   s      rp   extract_authorr   {  sH    "8D>>3HIIGg}DDDF 1"400Mrr   default_urlc                 *   t           D ]9}|                     |          }||j                            d          nd}|r n:|r|                    d          r|                     d          D ]{}|                    d          p|                    d          pd}|                    d          s|                    d	          r#t          |j        d
                   }|r||z   } n||r%t          |          \  }}|rt          |          nd}|p|S )z'Extract the URL from the canonical linkNhref/z.//head//meta[@content]r   rd   ri   r   ztwitter:r   )	URL_SELECTORSr   r   r   r   r   r   r   r   )	r   r   selectorelementr   attrtypebase_urlvalidation_result
parsed_urls	            rp   extract_urlr     sI   !  ))H%%,3,?gn  (((T 	E	  s~~c"" }}%>?? 	 	G{{6**Kgkk*.E.EKH""5)) X-@-@-L-L 'y(ABB "S.CE  G(4S(9(9%:+<FmJ'''$+rr   c                 X    t          |           ^}}t          d |D             d          S )z=Extract the name of a site from the main title (if it exists)c              3   &   K   | ]}|d |v |V  dS )r   Nrv   )rx   parts     rp   r   z#extract_sitename.<locals>.<genexpr>  s+      @@$4@C4KKKKKK@@rr   N)r   next)r   _partss      rp   extract_sitenamer     s2    %d++IA@@%@@@$GGGrr   metatypec                    g }d| z   dz   | dk    rt           nt          }|D ]0}|                    fd ||          D                        |r n1| dk    r:|s8|                    d          D ]"}|                    |j        d                    #d t                              d |D                       D             S )	z!Find category and tag informationr   z	[s|ies]?/categoryc              3      K   | ]8}t          j        |j        d                    "|                                V  9dS )r   N)researchr   r   )rx   r   regexprs     rp   r   z#extract_catstags.<locals>.<genexpr>  s\       
 
y$+f"566

 
 
 
 
 
rr   zR.//head//meta[@property="article:section" or contains(@name, "subject")][@content]r   c                     g | ]}||S rv   rv   )rx   rs     rp   r}   z$extract_catstags.<locals>.<listcomp>  s    TTT!RSTATTTrr   c              3   8   K   | ]}|t          |          V  d S r   )r   )rx   xs     rp   r   z#extract_catstags.<locals>.<genexpr>  s/      $N$NAA$N_Q%7%7$N$N$N$N$N$Nrr   )r#   r$   extendr   r   r   r   r   )r   r   r   xpath_expressioncatexprr   r   s         @rp   extract_catstagsr     s   GHn{*G,4
,B,B((#   
 
 
 

 
 
 	
 	
 	

  	E	 :gzz`
 
 	6 	6G NN7>)45555
 UTt}}$N$N$N$N$NNNTTTTrr   Fr   strictc                 B   t                               |                     dd                    }|r&d|d                                          d|d          S | j        rA|r+t
                              | j                  }|r|d         ndS t          | j                  S dS )	zkProbe a link for identifiable free license cues.
    Parse the href attribute first and then the link text.r   ri   zCC r   r   r   r   N)LICENSE_REGEXr   r   upperr   TEXT_LICENSE_REGEXr    )r   r   r   s      rp   parse_license_elementr    s       VR!8!899E 32U1X^^%%22a222| " 	/&--gl;;E$.588$.GL!!!4rr   c                     |                      d          D ]}t          |d          }||c S |                     d          D ]}t          |d          }||c S dS )z:Search the HTML code for license information and parse it.z.//a[@rel="license"][@href]F)r   Nz[.//footer//a[@href]|.//div[contains(@class, "footer") or contains(@id, "footer")]//a[@href]T)r   r  r   )r   r   r   s      rp   extract_licenser    s     << =>>  &wu===MMM  ::e    'wt<<<MMM 4rr   Tfilecontentdate_config	extensivec                 (   |pt                      }|pt          |          }t          |           }|t                      S t	          |          }|j        rd|j        vrd|_        	 t          ||          }n2# t          $ r%}t          	                    d|           Y d}~nd}~ww xY w|j
        st          |          |_
        |j        r|rt          |j        |          |_        |j        st          |          |_        |j        r|rt          |j        |          |_        |j        st          ||          |_        |j        rt!          |j        d          |_        |j        |d<   t%          |fi ||_        |j        st+          |          |_        |j        rt-          |j        t.                    r|j        d         |_        n3t-          |j        t0                    rt3          |j                  |_        |j                            d          |_        |j        rFd	|j        vr=|j        d                                         s|j        
                                |_        n5|j        r.t8                              |j                  }|r|d
         |_        |j        st?          d|          |_        |j         st?          d|          |_         tC          |          |_"        |d         |_#        |$                                 |S )a  Main process for metadata extraction.

    Args:
        filecontent: HTML code as string or parsed tree.
        default_url: Previously known URL of the downloaded document.
        date_config: Provide extraction parameters to htmldate as dict().
        author_blacklist: Provide a blacklist of Author Names as set() to filter out authors.

    Returns:
        A trafilatura.settings.Document containing the extracted metadata information or None.
        The Document class has .as_dict() method that will return a copy as a dict.
    Nr   z%error in JSON metadata extraction: %sT)fastr   r   @r   r   r   tagmax_date)%setr   r   r   r   r2   r   	Exceptionr   warningrU   r   r   r   r   r   r   hostnamer   datere   r   
isinstancelistr   strlstripisupperMETA_URLr   
categoriesr   rL   r  licensefiledateclean_and_trim)	r  r   r  r  rs   r   r   errmymatchs	            rp   extract_metadatar    sC   ( (0355;!;!;K [!!D|zz D!!H  3ho55E$T844 E E E>DDDDDDDDE > -&t,,  K+ K'9IJJ? /(.. K+ K'9IJJ < 6"455 | D*8<dCCC "Kd22k22HM  3,T22 +h'.. 	7 ( 1! 4H)400 	7 #H$5 6 6H$-44S99 	:8,,,%a(0022 - !) 1 7 7 9 9H	 +.... 	+ '
H  A.z4@@ = 6(55 't,,H $J/HOs   (A9 9
B(B##B()r   r   )F)NNTN)___doc__r   loggingr   copyr   htmlr   typingr   r   r   r   r	   r
   r   courlanr   r   r   r   r   r&   r   
lxml.etreer   	lxml.htmlr   r   htmlprocessingr   json_metadatar   r   r   r   settingsr   r   utilsr   r   r   r    xpathsr!   r"   r#   r$   r%   __all__	getLogger__name__r   setLevelWARNINGcompiler  r   r   rj   r   Ir  r   r   r   r   r   METANAME_URLr   r   r   r   r   r   r   r  rq   r   r   r   r   intr   r   r   r   r   r   r   boolr  r  r  rv   rr   rp   <module>r6     s      				             ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ?                          + + + + + + + + 0 0 0 0 0 0            0 / / / / / / / D D D D D D D D D D D D              ,		8	$	$  *   & &w 7 7 72:=>>bj011"*<  "*X&&
D   RZ[D  
  &           ]+   -.!34 322
 #"  -.	  5 5 5 5 5 5
3 
#c( 
x} 
 
 
 
K 8      K Dhsm1C,D    .^{ ^x ^ ^ ^ ^D CF 
$(K<?c]   &

3x},-        8 #     k  RU    6H; H8C= H H H HUs U+ U$s) U U U U6 ;  RU     + (3-    & "&!%+/k k{C'(k#k #k 	k
 s3x(k k k k k k krr   