
    %$}gd/                        d Z ddlZddlmZ ddlmZ ddlmZmZm	Z	 ddl
mZ ddlmZmZmZmZmZmZ 	 ddlZn# e$ r Y nw xY wddlmZmZ d	d
lmZmZ d	dlmZmZmZ d	dlm Z  d	dl!m"Z"m#Z#m$Z$  ej%        e&          Z' edd          Z(dZ)dZ*dZ+ G d d          Z,de-de-de	ee-         ee-         f         fdZ.de-de	ee-         ee-         ee-         f         fdZ/de-de-dee         fdZ0de-dee         fdZ1de-dee-         de2fd Z3d!ee-         de2fd"Z4	 d6de-d$e,d%ee-         ddfd&Z5d'ee         d$e,ddfd(Z6	 	 	 	 	 d7d)e-d*ee-         d+ee         d!eee-                  d,eee-                  d-ee-         de,fd.Z7	 d8d$e,d/e2de,fd0Z8e*e+ddde ddfde-d1e9d2e9d!eee-                  d3eee-                  d*ee-         d4ed+ee         d-ee-         de	ee-         ee-         f         fd5Z:dS )9zC
Functions dedicated to website navigation and crawling/spidering.
    N)ConfigParser)sleep)ListOptionalTuple)RobotFileParser)UrlStoreextract_linksfix_relative_urlsget_base_urlis_navigation_pageis_not_crawlable)XPathtostring   )baselineprune_unwanted_nodes)Responsefetch_response	fetch_url)DEFAULT_CONFIG)LANGID_FLAGdecode_file	load_htmlF)
compressedstrictz/robots.txt
   i c                       e Zd ZdZg dZ	 	 	 ddedee         dee         dee         ddf
d	Zdedefd
Z	dedefdZ
deddfdZdeee                  dee         fdZdedefdZdS )CrawlParametersz6Store necessary information to manage a focused crawl.)	startbaselangrulesrefi	known_numis_onprune_xpathNr    r"   r#   r(   returnc                     || _         |                     |          | _        |                     |          | _        || _        |pt          | j                  | _        d| _        d| _	        d| _
        || _        d S )Nr   T)r    _get_base_urlr!   _get_referencer$   r"   	get_rulesr#   r%   r&   r'   r(   )selfr    r"   r#   r(   s        R/var/www/py-google-trends/myenv/lib/python3.11/site-packages/trafilatura/spider.py__init__zCrawlParameters.__init__0   su      
++E22	++E22#'	050M499M9M

*5    c                 L    t          |          }|st          d|           |S )z#Set reference domain for the crawl.zcannot start crawl: )r   
ValueError)r.   r    r!   s      r/   r+   zCrawlParameters._get_base_urlA   s3     '' 	=;E;;<<<r1   c                 p    |                     d          dk    r|                    dd          d         n|S )zDetermine the reference URL./   r   r   )countrsplit)r.   r    s     r/   r,   zCrawlParameters._get_referenceH   s6    */++c*:*:a*?*?u||C##A&&UJr1   	url_storec                     t          |                    | j                            | _        t	          |                    | j                            | _        dS )z*Adjust crawl data based on URL store info.N)boolfind_unvisited_urlsr!   r'   lenfind_known_urlsr&   )r.   r9   s     r/   update_metadatazCrawlParameters.update_metadataL   sD    )77	BBCC
Y66tyAABBr1   todoc                 (     |sg S  fd|D             S )z.Prepare the todo list, excluding invalid URLs.c                 <    g | ]}|j         k    j        |v |S  )r    r$   ).0ur.   s     r/   
<listcomp>z/CrawlParameters.filter_list.<locals>.<listcomp>U   s*    EEEa1
??tx1}}}}}r1   rC   )r.   r@   s   ` r/   filter_listzCrawlParameters.filter_listQ   s*     	IEEEE4EEEEr1   linkc                 z    | j          s| j                             d|          o| j        |v ot          |           S )z9Run checks: robots.txt rules, URL type and crawl breadth.*)r#   	can_fetchr$   r   )r.   rH   s     r/   is_valid_linkzCrawlParameters.is_valid_linkW   sI     ^>tz33C>> +D +$T***	
r1   NNN)__name__
__module____qualname____doc__	__slots__strr   r   r0   r+   r,   r	   r?   r   rG   r;   rL   rC   r1   r/   r   r   ,   sO       <<cccI
 #+/%)6 66 sm6 (	6
 c]6 
6 6 6 6"3 3    KC KC K K K KC Cd C C C C
Fc 3 FS	 F F F F
# 
$ 
 
 
 
 
 
r1   r   
htmlstringhomepager)   c                 T   d| vrd| vr| |fS t          |           }|| |fS |                    d          }|r|d         nd}|rd|vrt          j        d|           | |fS |                    d          d	                                                                                             d
d          }|                    d          st          |          }t          ||          }t          |          }|t          j        d|           dS t          j        d|           ||fS )z:Check if there could be a redirection by meta-refresh tag.z	"refresh"z	"REFRESH"Nz@.//meta[@http-equiv="refresh" or @http-equiv="REFRESH"]/@contentr    ;zno redirect found: %sr   zurl=httpzfailed redirect: %s)NNzsuccessful redirect: %s)r   xpathlogginginfosplitstriplowerreplace
startswithr   r   r   warning)rT   rU   	html_treeresultsresulturl2base_urlnewhtmlstrings           r/   refresh_detectionri   `   sS    *$$J)F)F8##*%%I8## ooJ G #*WQZZF $S&&,h7778##<<Q%%''--//77CCD??6"" 1%% 400dOOM-t444zL*D111$r1   c                 6   t          | d          }|r|j        sdS |j        | dfvr!t          j        d|j                   |j        } t          |j                  }t          ||           \  }}|dS t          j        d|           ||t          |          fS )zBCheck if the homepage is redirected and return appropriate values.FdecoderM   r5   zfollowed homepage redirect: %sNzfetching homepage OK: %s)	r   dataurlr[   r\   r   ri   debugr   )rU   responserT   new_htmlstringnew_homepages        r/   probe_alternative_homepagers      s     hu555H  8=   |Hc?**5x|DDD< X]++J $5Z#J#J NLM,l;;;<l)C)CCCr1   
robots_urlrm   c                    t                      }|                    |            	 |                    |                                           n3# t          $ r&}t
                              d|           Y d}~dS d}~ww xY w|S )zEParse a robots.txt file with the standard library urllib.robotparser.zcannot read robots.txt: %sN)r   set_urlparse
splitlines	ExceptionLOGGERerror)rt   rm   r#   excs       r/   parse_robotsr}      s     E	MM*DOO%%&&&&   13777ttttt Ls   'A 
A=A88A=rg   c                 \    | t           z   }t          |          }|rt          ||          ndS )z?Attempt to fetch and parse robots.txt file for a given website.N)ROBOTS_TXT_URLr   r}   )rg   rt   rm   s      r/   r-   r-      s3    N*JZ  D-1;<
D)))t;r1   languagec                     | rF|rDt           r=t          |           \  }}}t          j        |          \  }}t	          ||k              S dS )zRun a baseline extraction and use a language detector to
    check if the content matches the target language.
    Return True if language checks are bypassed.T)r   r   	py3langidclassifyr;   )rT   r   _textre   s        r/   is_target_languager      sY      (h (; (j))
4&t,,	Fh&'''4r1   r@   c                 4    t          d | D                       S )z6Probe if there are still navigation URLs in the queue.c              3   4   K   | ]}t          |          V  d S )N)r   )rD   rn   s     r/   	<genexpr>z&is_still_navigation.<locals>.<genexpr>   s+      773!#&&777777r1   )any)r@   s    r/   is_still_navigationr      s    77$777777r1   rW   paramsrn   c                 ^   t          | |j                  sdS | r|j        xt          |j        t                    r|j        g|_        t          |           }|@t          |d |j        D                       }t          |                                          } g g }}t          | |p|j
        d|j        dd          D ]R}|                    |          st          |          r|                    |           =|                    |           St                              ||           dS )zExamine the HTML code and process the retrieved internal links.
    Extract and filter new internal links after an optional language check.
    Store the links in todo-list while prioritizing the navigation ones.Nc                 ,    g | ]}t          |          S rC   )r   )rD   xs     r/   rF   z!process_links.<locals>.<listcomp>   s    .T.T.TAuQxx.T.T.Tr1   FT)pagecontentrn   external_boolr   with_navr   )urls
appendleft)r   r"   r(   
isinstancerS   r   r   r   rl   r
   r!   rL   r   append	URL_STOREadd_urls)rT   r   rn   treelinkslinks_priorityrH   s          r/   process_linksr      sS    j&+66  1f(4f(#.. 	6"("4!5F$$'.T.TAS.T.T.TUUD!$..00J>E6;     ##D)) 	d## 	!!$''''LLEn=====r1   rp   c                     | | j         sdS t                              | j        gd           t	          t          | j                   ||j                   dS )z2Convert urllib3 response object and extract links.NT)visited)rm   r   r   rn   r   r   r!   )rp   r   s     r/   process_responser      sY    
 x}~t444 +hm,,ffkBBBBBr1   r    r"   r#   knownr(   c                    t          | |||          }t                              |pg d           t                              |                    |                     t                              |j        |j                   |s4t                              |j        gd           t          |d          }n|	                    t                     |S )zInitialize crawl by setting variables, copying values to the
    URL store and retrieving the initial page if the crawl starts.T)r   r   )r   F)initial)
r   r   r   rG   store_rulesr!   r#   r    
crawl_pager?   )r    r"   r#   r@   r   r(   r   s          r/   
init_crawlr      s     UD%==F EKR666F..t44555&+v|444  *>>>FD111y)))Mr1   r   c                    t                               | j                  }|s:d| _        t	          t                               | j                            | _        | S | xj        dz  c_        |rGt          |          \  }}}|r1|r/|r-t           	                    |g           t          || |           n!t          |d          }t          ||            |                     t                      | S )z6Examine a webpage, extract navigation links and links.Fr   )rn   rk   )r   get_urlr!   r'   r=   r>   r&   r%   rs   r   r   r   r   r?   )r   r   rn   rT   rU   new_base_urlrp   s          r/   r   r     s     

FK
(
(C y88EEFF
HHMHH 
+-G-L-L*
Hl 	7( 	7| 	7z****f#6666!#e4446*** 9%%%Mr1   max_seen_urlsmax_known_urlsknown_linksconfigc	                 \   t          | |||||          }	t                              |	j        |                    dd                    }
|	j        rQ|	j        |k     rF|	j        |k     r;t          |	          }	t          |
           |	j        r|	j        |k     r|	j        |k     ;t          t                              t                              |	j                                      }t          t                              t                              |	j                                      }||fS )a  Basic crawler targeting pages of interest within a website.

    Args:
        homepage: URL of the page to first page to fetch, preferably the homepage of a website.
        max_seen_urls: maximum number of pages to visit, stop iterations at this number or at the exhaustion of pages on the website, whichever comes first.
        max_known_urls: stop if the total number of pages "known" exceeds this number.
        todo: provide a previously generated list of pages to visit / crawl frontier.
        known_links: provide a list of previously known pages.
        lang: try to target links according to language heuristics.
        config: use a different configuration (configparser format).
        rules: provide politeness rules (urllib.robotparser.RobotFileParser() format).
        prune_xpath: remove unwanted elements from the HTML pages using XPath.

    Returns:
        List of pages to visit, deque format, possibly empty if there are no further pages to visit.
        Set of known links.

    DEFAULT
SLEEP_TIME)default)r   r   get_crawl_delayr!   getfloatr'   r%   r&   r   r   listdictfromkeysr<   r>   )rU   r   r   r@   r   r"   r   r#   r(   r   
sleep_times              r/   focused_crawlerr   2  s   : $t[+NNF**V__YEE +  J 	M11f6F6W6WF##j 	M11f6F6W6W i;;FKHHIIJJDt}}Y%>%>v{%K%KLLMMKr1   )rW   )NNNNN)F);rQ   r[   configparserr   timer   typingr   r   r   urllib.robotparserr   courlanr	   r
   r   r   r   r   r   ImportError
lxml.etreer   r   corer   r   	downloadsr   r   r   settingsr   utilsr   r   r   	getLoggerrN   rz   r   r   MAX_SEEN_URLSMAX_KNOWN_URLSr   rS   ri   rs   r}   r-   r;   r   r   r   r   r   r   intr   rC   r1   r/   <module>r      s     % % % % % %       ( ( ( ( ( ( ( ( ( ( . . . . . .               	 	 	 	D	 ' & & & & & & & 0 0 0 0 0 0 0 0 : : : : : : : : : : $ $ $ $ $ $ 6 6 6 6 6 6 6 6 6 6 
	8	$	$He444	1
 1
 1
 1
 1
 1
 1
 1
h##"#
8C=(3-'(# # # #LDD
8C=(3-#67D D D D2S  0I    < < 9 < < < <3 (3- D    8d3i 8D 8 8 8 8 #> #>#>#> 
##> 
	#> #> #> #>LCx CC 
C C C C  '+ $!%!% 
3- O$ 49
	
 DI #    :      B '( $'+)'+!%. ... . 49
	.
 $s)$. 3-. . O$. #. 49d3i . . . . . .s   9 A A