
    %$}ghz                        d Z ddlZddlZddlmZ ddlmZ ddlmZ ddlm	Z	m
Z
 ddlmZmZmZmZmZ ddlmZmZ d	d
lmZmZmZmZmZmZmZmZmZmZmZm Z m!Z!m"Z"m#Z#m$Z$m%Z%m&Z&m'Z'm(Z(m)Z)m*Z*m+Z+m,Z,m-Z-m.Z.m/Z/m0Z0m1Z1m2Z2m3Z3m4Z4m5Z5m6Z6m7Z7m8Z8m9Z9 d	dl:m;Z;m<Z<m=Z= d	dl>m?Z?m@Z@mAZAmBZB d	dlCmDZDmEZEmFZFmGZGmHZHmIZImJZJmKZKmLZL  ejM        eN          ZOdedePfdZQh dZRh dZSh dZTh dZUddhZVeUW                    eV          ZXh dZY ejZ        d          Z[e'e(fe)e*ffZ\dePde?deeP         fdZ]dedePde?deeP         fdZ^dede?deeP         fdZ_d eeP         d!eeP         d"eeP         de?deeeP                  f
d#Z`d$ePd%eeP         d!eeP         d"eeP         de?deeeP                  fd&Za e	e;'          d(ebdePde?debfd)            Zcdede?deeP         fd*Zddede?deeP         fd+Zed,eeeP                  dePfd-Zfd$ePde?deeP         fd.Zg	 	 	 	 	 	 	 	 d<d2eehePef         d3eid4eid5ePd6eeP         d7eid8eeeePf                  d9eeeePf                  d:eideeP         fd;ZjdS )=zZModule bundling all functions needed to determine the date of HTML strings
or LXML trees.
    N)Counter)deepcopy)datetime)	lru_cachepartial)MatchOptionalPatternUnionr   )HtmlElementtostring   )%discard_unwantedextract_url_dateidiosyncrasies_search
img_searchjson_searchregex_parsepattern_searchtry_date_exprDATE_EXPRESSIONSFAST_PREPENDSLOW_PREPENDFREE_TEXT_EXPRESSIONSMAX_SEGMENT_LENMIN_SEGMENT_LENYEAR_PATTERNYMD_PATTERNCOPYRIGHT_PATTERNTIMESTAMP_PATTERNTHREE_PATTERNTHREE_CATCHTHREE_LOOSE_PATTERNTHREE_LOOSE_CATCHSELECT_YMD_PATTERNSELECT_YMD_YEARYMD_YEARDATESTRINGS_PATTERNDATESTRINGS_CATCHSLASHES_PATTERNSLASHES_YEARYYYYMM_PATTERNYYYYMM_CATCHMMYYYY_PATTERNMMYYYY_YEARSIMPLE_PATTERNTHREE_COMP_REGEX_ATHREE_COMP_REGEX_BTWO_COMP_REGEX)
CACHE_SIZECLEANING_LISTMAX_POSSIBLE_CANDIDATES)	Extractor
clean_html	load_html	trim_text)	check_extracted_referencecompare_valuesfilter_ymd_candidateget_min_dateget_max_dateis_valid_dateis_valid_formatplausible_year_filtervalidate_and_convertelementreturnc                 J    t          | dd                                          S )z,Format the element to be logged to a string.Funicodepretty_printencoding)r   strip)rD   s    M/var/www/py-google-trends/myenv/lib/python3.11/site-packages/htmldate/core.py	logstringrM   K   s#    G%)DDDJJLLL    >E   dc.datedc:date
bt:pubdate
dc.created
dc:created
og:pubdate
og:regdatedcterms.datepublish-datesailthru.datedc.date.issueddcterms.issuedpublished-datearticle.createddc.date.createddcterms.createdog:publish_dateog:datepublishedparsely-pub-datetwt-published-atarticle.publishedarticle:post_datearticle:publishedog:published_timevr:published_timevideo:release_datedc.date.publicationrnews:datepublishedfield-name-post-dateog:article:publishedarticle:published_datearticle:published_timearticle:publicationdateog:article:published_timeog:question:published_timecxenseparse:recs:publishtimedcsext.articlefirstpublishedanalyticsattributes.articledate"shareaholic:article_published_timedatemetapdateptimecreatedgentimepubdatedoc_date	rbpubdate	timestamp
dateposteddatecreateddisplaydatepublishdatedate_createdpublish_datepublish_timerelease_datecitation_datedatepublishedpublisheddatedate_publishedpublished_datepublished_timepublication_datecontent_create_datearticle_date_originalmediator_published_timeoriginalpublicationdatecitation_publication_date>   utimelastmodlastdatemodifiedlastmodifiedlast-modified>   
bt:moddatedc.modifiedog:updated_timearticle:modifieddcterms.modifiedog:modified_timearticle:modified_datearticle:modified_timearticle:post_modifiedog:article:modified_timedatemodifiedr   r   updated_timemodified_timerevision_datemodificationdate>   pubyearr   r   r   
dateupdate>   date-publishedtime published	publishedz\D+$textoptionsc                     t          |           } t          |           t          k    rdS t                              d| dt
                             } t          | |j        |j        |j	        |j
                  S )z'Prepare text and try to extract a date.N )r:   lenr   NON_DIGITS_REGEXsubr   r   format	extensiveminmax)r   r   s     rL   examine_textr      sk    
 T??D
4yyO##tD)9/)9$:;;Dgng/gk  rN   tree
expressionc                     |                      |          }|rt          |          t          k    rdS |D ]G}|                                |                    dd          fD ]}t          ||          }|r|c c S HdS )z3Check HTML elements one by one for date expressionsNtitler   )xpathr   r6   text_contentgetr   )r   r   r   elementselemr   attempts          rL   examine_date_elementsr      s     zz*%%H s8}}'>>>t  &&(($((7B*?*?@ 	 	D"411G 	
 4rN   c                 t
   d\  }}t          t          |j        |j        |j        |j                  }|                     d          D ]}|j        rd|j        vr
d|j        vrd|j        v r|                    dd          	                                }|dk    r%t          |                    d          |          }nP|t          v rHt                              d	t          |                      ||                    d                    }n|t          v rmt                              d	t          |                     |j        s  ||                    d                    }n ||                    d                    }nd
|j        v r|                    d
d          	                                }|t          v s	|t"          v rnt                              dt          |                      ||                    d                    }|&|t          v r|j        s|t"          v r|j        s|}n|}nd|j        v r6|                    dd          	                                }|t$          v rt                              dt          |                      ||                    d          p|                    d                    }|"|t&          v r|j        s|t(          v r	|j        s|}n|dk    r|t                              dt          |                     d|j        v rKd                    |                    dd          ddg          }t-          |d|j        |j                  r|}nd|j        v rt|                    dd          	                                dk    rFt                              dt          |                      ||                    d                    }nd|j        v r|                    dd          	                                }|dk    rmt                              dt          |                     |j        r ||                    d                    }n ||                    d                    }nr|dk    rlt                              dt          |                     |j        s ||                    d                    }n ||                    d                    }| n||t                              d           |}|S )a  
    Parse header elements to find date cues

    :param tree:
        LXML parsed tree object
    :type tree: LXML tree
    :param options:
        Options for extraction
    :type options: Extractor
    :return: Returns a valid date expression as a string, or None

    )NN)outputformatextensive_searchmin_datemax_datez.//metacontentr   namer   zog:urlzexamining meta name: %spropertyzexamining meta property: %sNitempropzexamining meta itemprop: %scopyrightyear-01%Y-%m-%dearliestlatestr|   zexamining meta pubdate: %sz
http-equivrv   zexamining meta http-equiv: %sr   z-opting for reserve date with less granularity)r   r   r   r   r   r   iterfindattribr   lowerr   DATE_ATTRIBUTESLOGGERdebugrM   NAME_MODIFIEDoriginalPROPERTY_MODIFIEDITEMPROP_ATTRSITEMPROP_ATTRS_ORIGINALITEMPROP_ATTRS_MODIFIEDjoinr@   )r   r   
headerdatereservetryfuncr   	attributer   s           rL   examine_headerr      s     %J^ *  G i(( V V 	++$+--T[  ,,2244IH$$*488I+>+>HHo--6	$HHH$WTXXi%8%899

m++6	$HHH' ;!())<)<!=!=JJ%gdhhy&9&9::G4;&&R006688IO++y<M/M/M:IdOOLLL!'$((9"5"566&!_449I4!%666w?O6%,

 #*4;&&R006688IN**:IdOOLLL!'$((:"6"6"M$((9:M:MNN&!%<<<AQ<!%<<<WEU<%,

 o--:IdOOLLL++!hhB(?(?t'LMMG$gk'+   * #*$+%%xx	2&&,,..);;99T??KKK$WTXXi%8%899
T[((r2288::IF""<iooNNN# ;!())<)<!=!=JJ%gdhhy&9&9::GGo--<iooNNN' ;!())<)<!=!=JJ%gdhhy&9&9::G!E " g1DEEE
rN   occurrencescatchyearpatc                 F   | rt          |           t          k    rdS t          |           dk    r/|                    t          t	          |                               S |                     d          }t                              d|           t          |j	                   dd         }t                              d|           t          | \  }}g }|D ]4}	|                    |	          }
|
r|                    |
d                    5fd|D             }t          |          r|d	         |d         k    r|                    |d	                   }n|d         |d	         k    r1|d         |d	         z  d
k    r|                    |d                   }n|                    |d	                   }nht          |          r/|                    ||                    d                             }n*t                              d|d	         |d                    d}|S )z2Select a candidate among the most frequent matchesNr   
   zfirstselect: %s)reverse   zbestones: %sc           	          g | ];}t          t          t          |          d d           dj        j                  <S )r   %Yr   )r@   r   intr   r   ).0yearr   s     rL   
<listcomp>z$select_candidate.<locals>.<listcomp>  sY         	SYY1%%tgk'+	
 	
 	
  rN   r   g      ?Tzno suitable candidate: %s %s)r   r6   searchnextitermost_commonr   r   sortedr   zipappendallanyindex)r   r   r   r   firstselectbestonespatternscountsyearspattern
year_match
validationmatchs      `         rL   select_candidater  c  s(     #k**-DDDt
;1||Dk!2!233444 ))"--K
LL"K000kw/?+?@@@!DH
LL*** H~HfE ( (^^G,,
 	(LLA'''    	  J : !9q	!!LL!--EE1Xq!!fQi&)&;c&A&ALL!--EE LL!--EE	Z Xj&6&6t&<&<=>>3U1XuQxHHHLrN   
htmlstringr  c                 b    t          | |||j        |j                  }t          ||||          S )z)Chained candidate filtering and selectionr  r   r   r   )rB   r   r   r  )r  r  r   r   r   
candidatess         rL   search_patternr    sA     '{  J Jw@@@rN   )maxsize	referencec                 z    t          ||j        |j        |j        |j                  }|t          | ||          S | S )z[Compare candidate to current date reference (includes date validation and older/newer test))r   r   r   r   r   r<   )r  r   r   r   s       rL   compare_referencer    sG     GNG$5w{GK G i':::rN   c                    |                      d          }dt          |          cxk     rt          k     rn nd}|D ]}}d|j        v rv	 t	          |                    dd                    }n# t          $ r Y =w xY wt                              d|           |j	        r|dk    s||k     r|}r|j	        s||k    r|}|                    d          t          v rd|j        v r|                    d          }t                              d|           |j	        r/t          ||j        |j        |j        |j                  }||c S t!          |||          }|dk    r nY(|j        rNt          |j                  d
k    r6t                              d|j                   t!          ||j        |          }t%          ||          pt'          | d|          S d	S )zTScan the page for abbr elements and check if their content contains an eligible datez.//abbrr   z
data-utimer   zdata-utime found: %sclassr   zabbr published-title found: %sNr   zabbr published found: %s)findallr   r6   r   r   r   
ValueErrorr   r   r   CLASS_ATTRSr   r   r   r   r   r  r   r;   r   )r   r   r   r  r   	candidatetrytextr   s           rL   examine_abbr_elementsr    s/   
 ||I&&H3x==2222222222	 '	Q '	QDt{** #DHH\2$>$> ? ?II!   H3Y???# *a9y;P;P )II ) *i).C.C )I'""k11dk))"hhw//GLL!A7KKK' ""/##N#-#K#K# # #.#*NNN / %6i'$R$R	$q==!E ) Y Q3ty>>B#6#6LL!;TYGGG 1)TY P PI(G<< 
@UA
 A
 	

 4s   #A//
A<;A<c                 t   |                      d          }dt          |          cxk     rt          k     rn nd}|D ]}d}|                    dd          }t          |          dk    rdd|j        v r>|                    d          dk    r%|j        rd}t                              d	|           nd
|j        v r|j        rp|                    d
d                              d          s)|                    d
d                              d          rd}t                              d|           nY|j        s6|                    d
          dk    rd}t                              d|           nt                              d|           |r/t          ||j
        |j        |j        |j                  }||c S t          |||          }|j        Nt          |j                  dk    r6t                              d|j                   t          ||j        |          }t!          ||          S dS )zTScan the page for time elements and check if their content contains an eligible datez.//timer   Fr   r      r|   Tz#shortcut for time pubdate found: %sr  z
entry-datez
entry-timez$shortcut for time/datetime found: %supdatedz,shortcut for updated time/datetime found: %sztime/datetime found: %sNztime/datetime found in text: %s)r  r   r6   r   r   r   r   r   
startswithr   r   r   r   r   r  r   r;   )r   r   r   r  r   shortcut_flagdatetime_attrr   s           rL   examine_time_elementsr    si   
 ||I&&H3x==2222222222	 1	M 1	MD!M HHZ44M=!!A%% ,,++y88( 9 %)MLL!FVVVV++' "--88FF88GR00;;LII )-BM    %- $((72C2Cy2P2P(,J)   LL!:MJJJ  U+%) G *& + !2)]G T TII&3ty>>A+=+=>	JJJ-iGLL	 )G<<<4rN   r  c                     d |                                  D             \  }}}t          |          dk    r|d         dk    rd| nd| }| d| d| S )zoNormalize string output by adding "0" if necessary,
    and optionally expand the year from two to four digits.c              3   D   K   | ]}||                     d           V  dS )r   N)zfill)r   gs     rL   	<genexpr>z"normalize_match.<locals>.<genexpr>8  s1      @@qa@

@@@@@@rN   r   r   91920r   )groupsr   )r  daymonthr   s       rL   normalize_matchr+  5  ss     A@ELLNN@@@C
4yyA~~"1gnn{D{{{+t++""U""S"""rN   c           
      6   t                               d           d}t          | t          t          t          |          }|^t          |d                   }t          t          |dd          d|j        |j	                  rt                               d|           |}t                               d           t          D ][}t          | |d         |d         t          |          }t          ||d         |j        ||j        |j        |j	                  }||c S \t          | t          t           |j        |j	        	          }i }|D ]0}	t#          j        |	          }
t'          |
          }||	         ||<   1t)          |          }t+          |t,          t.          |          }t          |t          |j        ||j        |j        |j	                  }||S t          | t0          t2          t          |          }t          |t0          |j        ||j        |j        |j	                  }||S t          | t4          t6          |j        |j	        d
          }i }|D ]0}	t9          j        |	          }
t'          |
          }||	         ||<   1t)          |          }t+          |t,          t.          |          }t          |t4          |j        ||j        |j        |j	                  }||S t                               d           t          | t:          t<          t          |          }|t          t          |d                   t          |d                   d          }|dk    s|j        |k    rTtA          ||j        |j        |j	                  }|0t                               dt:          |d         |d                    |S t          | tB          tD          |j        |j	        |j                  }i }|D ]_}	tG          j        |	          }
|
d         }tI          |          dk    rd| }d%                    |
d         |dg          }||	         ||<   `t)          |          }t+          |t,          t.          |          }t          |tB          |j        ||j        |j        |j	                  }||S tM          |           }|dk    s|r1|j        |k    r&tA          ||j        |j        |j	                  }||S |dk    rRt                               d           t          t          |          dd          }|'                    |j                  S t                               d           t          | tP          t          t          |          }|t          t          |d                   dd          }t          |d|j        |j	                  rL|j        |k    rAt                               dtP          |d                    |'                    |j                  S dS )a  
    Opportunistically search the HTML text for common text patterns

    :param htmlstring:
        The HTML document in string format, potentially cleaned and stripped to
        the core (much faster)
    :type htmlstring: string
    :param options:
        Define extraction options
    :type options: Extractor
    :return: Returns a valid date expression as a string, or None

    z(looking for copyright/footer informationr   Nr   r   r   z'copyright year/footer pattern found: %sz3 componentsr
  T)r  r   r   r   
incompletezswitching to two componentsr   z#date found for pattern "%s": %s, %s0r   r   zusing copyright year as defaultzswitching to one componentr   zdate found for pattern "%s": %s))r   r   r  r   r   r   r@   r   r   r   THREE_COMP_PATTERNSr=   r   r   rB   r%   r&   r1   r  r+  r   r  r   r'   r(   r)   r*   r+   r2   r,   r-   r   rC   r.   r/   r3   r   r   r   strftimer0   )r  r   copyear	bestmatchr   r   resultr  replacementitemr  r  
dateobjectr*  s                 rL   search_pager7  >  sc     LL;<<<G I 9Q<  T1a  $W[
 
 
 	 LLBDIIIG LL    (  "QKQK
 
	 &QKNKK
 
 MMM  '"{  J K 2 2"(..#E**	!+D!1I%%J [(GLLI! F   I " F  '{  J K 2 2"(..#E**	!+D!1I%%J [(GLLI! F  LL./// I c)A,//Yq\1B1BAFF
a<<:?g55)GNW[  F !9"aLaL	    '{#  J K 2 2$T**au::??KKEHHeAht455	!+D!1I%%J [(GLLI! F  Z((J!||
|z''A'A%W[
 
 
 M !||6777c'llAq11
""7>222 LL-... I c)A,//A66
JW[  		7 7**LL1>9Q<   &&w~6664rN   TFr   
htmlobjectr   original_dater   urlverboser   r   deferred_url_extractorc	                    |rt          j        t           j                   t          |           }	|	dS |dk    rt	          |          sdS t          |t          |          t          |          ||          }
d}|,|	                    d          }||	                    d          }t          ||
          }||s|S t          |	|
          pt          |	|
          }||S |r||S t          |	|
          }||S 	 t          t          t!          |	          t"                              \  }}n,# t$          $ r |	}t&                              d           Y nw xY w|rt*          t,          z   }nt.          t,          z   }t1          |||
          p t1          |d|
          pt3          ||
          }||S 	 t5          |dd	
          }n5# t6          $ r( t5          |d                              dd          }Y nw xY wt;          |t<          |
          pt?          ||
          ptA          ||
          }||S |rt&          !                    d           d}tE          |          D ]N}|#                                }tH          tK          |          cxk     rtL          k     sn =tO          |||
          }OtQ          ||
          }|ptS          ||
          S dS )a  
    Extract dates from HTML documents using markup analysis and text patterns

    :param htmlobject:
        Two possibilities: 1. HTML document (e.g. body of HTTP request or .html-file) in text string
        form or LXML parsed tree or 2. URL string (gets detected automatically)
    :type htmlobject: string or lxml tree
    :param extensive_search:
        Activate pattern-based opportunistic text search
    :type extensive_search: boolean
    :param original_date:
        Look for original date (e.g. publication date) instead of most recent
        one (e.g. last modified, updated time)
    :type original_date: boolean
    :param outputformat:
        Provide a valid datetime format for the returned string
        (see datetime.strftime())
    :type outputformat: string
    :param url:
        Provide an URL manually for pattern-searching in URL
        (in some cases much faster)
    :type url: string
    :param verbose:
        Set verbosity level for debugging
    :type verbose: boolean
    :param min_date:
        Set the earliest acceptable date manually (ISO 8601 YMD format)
    :type min_date: datetime, string
    :param max_date:
        Set the latest acceptable date manually (ISO 8601 YMD format)
    :type max_date: datetime, string
    :param deferred_url_extractor:
        Use url extractor as backup only to prioritize full expressions,
        e.g. of the type `%Y-%m-%d %H:%M:%S`
    :type deferred_url_extractor: boolean
    :return: Returns a valid date expression as a string, or None
    )levelNr   z.//link[@rel="canonical"]hrefzlxml cleaner errorz.//title|.//h1FrG   rH   )rI   zutf-8ignorezextensive search startedr   )*loggingbasicConfigDEBUGr9   rA   r7   r?   r>   findr   r   r   r   r  r   r8   r   r5   r  r   errorr   r   r   r   r  r   UnicodeDecodeErrordecoder   r    r   r   r   r   rK   r   r   r   r  r;   r7  )r8  r   r9  r   r:  r;  r   r   r<  r   r   
url_resulturlelemr3  abbr_resultsearch_tree	discarded	date_exprr  r  segment	converteds                         rL   	find_daterP  (  s   d  1'-0000Z  D |tz!!/,*G*G!t XX G J
{))788++f%%C "#w//J&< D'**Hk$.H.HF  *"8 ( K +!1x~~}55"
 "
YY  + + +)*****+
  4 #33		 #33	
 		
 	
 
	7
 !
 

	7 !g66  Yk	RRR

 Y Y Yk>>>EEgxXX


Y 	z#4g>> 	6k7++	6 W55 
   =/000	,[99 	G 	GGmmooG"S\\CCCCOCCCC))WgFFII-iAA	<K
G<<<4s$   >2D1 1&EE5G /G:9G:)TFr   NFNNF)k__doc__rA  recollectionsr   copyr   r   	functoolsr   r   typingr   r	   r
   r   Counter_Type	lxml.htmlr   r   
extractorsr   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r    r!   r"   r#   r$   r%   r&   r'   r(   r)   r*   r+   r,   r-   r.   r/   r0   r1   r2   r3   settingsr4   r5   r6   utilsr7   r8   r9   r:   
validatorsr;   r<   r=   r>   r?   r@   rA   rB   rC   	getLogger__name__r   strrM   r   r   r   r   r   unionr   r  compiler   r/  r   r   r   r  r  r   r  r  r  r+  r7  bytesboolrP   rN   rL   <module>re     s)     				                   ( ( ( ( ( ( ( ( K K K K K K K K K K K K K K + + + + + + + +& & & & & & & & & & & & & & & & & & & & & & & & & & & & & & & & & & & & & & & & & & & & & & & & & & & & & & & & & & & & & & & & & & & & & & & & & & & & & &N I H H H H H H H H H > > > > > > > > > > > >
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
	8	$	$M{ Ms M M M M
H H HV     * FEE )<8 (../FGG???2:g&&  K +, 
 c]    
  c]	   (u
uu c]u u u up4c"43<4 S\4 	4
 eCj4 4 4 4nAAS\A 3<A S\	A
 A eCjA A A A$ :  		   6
66 c]6 6 6 6r>
>> c]> > > >B#8E#J/ #C # # # #gC g) g g g g gX ""/3/3#(o oeS+-.oo o 	o
 
#o o uXs]+,o uXs]+,o !o c]o o o o o orN   