
    !}g                        d Z dZdZdZdZddlZddlZddlZddlZddlm	Z	 ddl
mZ dd	lmZ dd
lmZmZmZ ddlmZ ddlmZmZ  ej        e          Z edd          Z edd          Z edd          Z ed          Z ed          Z ed          Z ed          Z  ed          Z! ed          Z" ed          Z# e$            Z%dZ&dZ'dZ(g dZ)g dZ*g d Z+ G d! d"e,          Z-dS )#a  
Newspaper uses much of python-goose's extraction code. View their license:
https://github.com/codelucas/newspaper/blob/master/GOOSE-LICENSE.txt

Keep all html page extraction code within this file. Abstract any
lxml or soup parsing code in the parsers.py file!
	newspaperzLucas Ou-YangMITzCopyright 2014, Lucas Ou-Yang    N)defaultdict)parse)
tldextract)urljoinurlparse
urlunparse   )urls)StringReplacementStringSplitterz&#65533; z#!z?_escaped_fragment_=z&raquo;   »z\|z - _/    » : z
a[rel=tag]zMa[href*='/tag/'], a[href*='/tags/'], a[href*='/topic/'], a[href*='?keyword=']z^[A-Za-z]{2}$)storyarticlefeaturefeaturedslides	slideshowgallerynewsvideomediavradiopress)careerscontactaboutfaqtermsprivacyadvertpreferencesfeedbackinfobrowsehowtoaccount	subscribedonateshopadmin)amazondoubleclicktwitterc                       e Zd Zd Zd Zd Zd Zd Zd(dZd Z	d	 Z
d
 Zd Zd Zd Zd Zd Zd Zd Zd Zd Zd Zd)dZd Zd Zd Zd Zd Zd Zd Zd Zd Zd  Z d! Z!d" Z"d# Z#d$ Z$d% Z%d& Z&d' Z'dS )*ContentExtractorc                     || _         | j                                         | _        |j        | _        |j        | _        d S N)config
get_parserparserlanguagestopwords_class)selfr;   s     T/var/www/py-google-trends/myenv/lib/python3.11/site-packages/newspaper/extractors.py__init__zContentExtractor.__init__8   s8    k,,..%5    c                 Z    |r(|| _         | j                            |          | _        dS dS )zRequired to be called before the extraction process in some
        cases because the stopwords_class has to set incase the lang
        is not latin based
        N)r>   r;   get_stopwords_classr?   )r@   	meta_langs     rA   update_languagez ContentExtractor.update_language>   s>    
  	;%DM//	::    	; 	;rC   c                    t          j        d          fdd }fd}g d}g d}g }g }|D ]9}|D ]4}	| j                            |||	          }
|                    |
           5:|D ]z}d}|j        d	k    r1|                    d
          }t          |          dk    r|d         }n	|j        pd}t          |          dk    r|                     ||                     { ||          S )zcFetch the authors of the article, return as a list
        Only works for english articles
        z\dc                 H    t                              |                     S r:   )boolsearch)d_digitss    rA   contains_digitsz5ContentExtractor.get_authors.<locals>.contains_digitsN   s    q))***rC   c                     i }g }| D ]W}|                                 |v rd||                                 <   |                    |                                           X|S )zRemove duplicates from provided list but maintain original order.
              Derived from http://www.peterbe.com/plog/uniqifiers-benchmark
            r   )lowerappendtitle)lstseenresultitems       rA   uniqify_listz2ContentExtractor.get_authors.<locals>.uniqify_listQ   sg     DF , ,::<<4''%&TZZ\\"djjll++++MrC   c                 "   t          j        dd|           } t          j        dd|           } |                                 } t          j        d|           }d |D             }g }g }g d}|D ]d}||v r>t	          |          dk    r*|                    d                    |                     g }D |          s|                    |           et	          |          d	k    }|r(|                    d                    |                     |S )
a  
            Takes a candidate line of html or text and
            extracts out the name(s) in list form:
            >>> parse_byline('<div>By: <strong>Lucas Ou-Yang</strong>,<strong>Alex Smith</strong></div>')
            ['Lucas Ou-Yang', 'Alex Smith']
            <[^<]+?>r   z[bB][yY][\:\s]|[fF]rom[\:\s]z
[^\w'\-\.]c                 6    g | ]}|                                 S  strip).0ss     rA   
<listcomp>zFContentExtractor.get_authors.<locals>.parse_byline.<locals>.<listcomp>q        :::17799:::rC   )and,r   r   r      )resubr]   splitlenrQ   join)
search_strname_tokens_authorscurname
delimiterstoken
valid_namerN   s          rA   parse_bylinez2ContentExtractor.get_authors.<locals>.parse_byline^   s+    
B
;;J  >JOOJ#))++J
 (=*==K::k:::KHG)))J$ * *J&&7||a'' (9(9:::"$(// *NN5))) g,,!+J 3 1 1222OrC   )namerelitempropclassid)authorbylinez
dc.creatorbylattrvaluer   metaz@contentr   )	re   compiler=   getElementsByTagextendtagxpathrh   text)r@   docrW   rq   ATTRSVALSmatchesauthorsr{   valfoundmatchcontentmmrM   rN   s                 @@rA   get_authorszContentExtractor.get_authorsH   sd    *T""	+ 	+ 	+ 	+ 	+	 	 	(	 (	 (	 (	 (	X ;::888 	& 	&D & &44St34OOu%%%%&
  		6 		6EGyF""[[,,r77Q;; eG**7||a||G44555|G$$$rC   c                    d }t          j        t          j        |          }|r$|                    d          } ||          }|r|S dddddddddd	ddd
dddddddddddddddddddddddddddg
}|D ]e}| j                            ||d         |d                   }	|	r8| j                            |	d         |d                   } ||          }|r|c S fdS )a,  3 strategies for publishing date extraction. The strategies
        are descending in accuracy and the next strategy is only
        attempted if a preferred one fails.

        1. Pubdate from URL
        2. Pubdate from metadata
        3. Raw regex searches in the HTML + added heuristics
        c                 r    | r4	 t          |           S # t          t          t          t          f$ r Y d S w xY wd S r:   )date_parser
ValueErrorOverflowErrorAttributeError	TypeError)date_strs    rA   parse_date_strz<ContentExtractor.get_publishing_date.<locals>.parse_date_str   sT       &x000"M>9M        44    s    44r   propertyzrnews:datePublishedr   )	attributer|   r   zarticle:published_timerr   OriginalPublicationDatert   datePublisheddatetimezog:published_timearticle_date_originalpublication_datezsailthru.datePublishDatepubdater   r|   rz   N)re   rK   r   STRICT_DATE_REGEXgroupr=   r   getAttribute)
r@   urlr   r   
date_matchr   datetime_objPUBLISH_DATE_TAGSknown_meta_tag	meta_tagss
             rA   get_publishing_datez$ContentExtractor.get_publishing_date   s   	  	  	  Yt5s;;
 	$!''**H)>(33L $## %/D!# #$/G!# # +D!# #$"$ $$/B!# # +B!# # +=!# # ?!# # =!# ##i"$ $'
, 0 	( 	(N44#K0$W- 5 / /I  (;33aL"9-/ /  .~h77 (''''trC   c                 P    d} j                             |d          }|t          |          dk    r|S  j                             |d                   }d}d} j                             |d          pg } fd|D             }|r}|                    t          d	
           |d         }t          |                    d                    dk    rd}d                    d |                                D                       }                     |d          p                     |d          pd}	t          j	        d          }
|

                    d|                                          }|

                    d|                                          }|

                    d|	                                          }||k    rd	}n_|r||k    r|}d	}nR|r/||v r+|r)||v r%t          |          t          |	          k    r|}d	}n!|r||k    r|                    |          r|	}d	}|s"d|v r                     |t          |          }d	}|s"d|v r                     |t          |          }d	}|s"d|v r                     |t           |          }d	}|s"d|v r                     |t"          |          }d	}|s"d|v r                     |t$          |          }d	}t&                              |          }|

                    d|                                          }||k    r|}|S )a  Fetch the article title and analyze it

        Assumptions:
        - title tag is the most reliable (inherited from Goose)
        - h1, if properly detected, is the best (visible to users)
        - og:title and h1 can help improve the title extraction
        - python == is too strict, often we need to compare filtered
          versions, i.e. lowercase and ignoring special chars

        Explicit rules:
        1. title == h1, no need to split
        2. h1 similar to og:title, use h1
        3. title contains h1, title contains og:title, len(h1) > len(og:title), use h1
        4. title starts with og:title, use og:title
        5. use title, after splitting
        r   rR   r   Nr   Fh1c                 D    g | ]}j                             |          S r[   )r=   getText)r^   r   r@   s     rA   r`   z.ContentExtractor.get_title.<locals>.<listcomp>  s5     5 5 53dk11#66 5 5 5rC   T)keyreverser   rd   c                     g | ]}||S r[   r[   r^   xs     rA   r`   z.ContentExtractor.get_title.<locals>.<listcomp>  s    %L%L%LA!%La%L%L%LrC   zmeta[property="og:title"]zmeta[name="og:title"]z[^\u4e00-\u9fa5a-zA-Z0-9\ ]|-r   r   r   )r=   r   rh   r   sortrg   ri   get_meta_contentre   r~   rf   rP   
startswithsplit_titlePIPE_SPLITTERDASH_SPLITTERUNDERSCORE_SPLITTERSLASH_SPLITTERARROWS_SPLITTERMOTLEY_REPLACEMENT
replaceAll)r@   r   rR   title_element
title_textused_delimetertitle_text_h1title_element_h1_listtitle_text_h1_listtitle_text_fbfilter_regexfilter_title_textfilter_title_text_h1filter_title_text_fbfilter_titles   `              rA   	get_titlezContentExtractor.get_title   s   " 44Sg4FF C$6$6!$;$;L [((q)9::
  $ < <SAE != !G !G !MJL 	5 5 5 535 5 5 	N##T#:::.q1M=&&s++,,11 "HH%L%L1D1D1F1F%L%L%LMMM 	c#>?? 	Bc#:;;	B?A 	 z"@AA(,,R<<BBDD+//MBBHHJJ+//MBBHHJJ J&&!NN! 	"&:>R&R&R&J!NN! 	"&:>O&O&O( 'P-AEV-V-V&&]););;;&J!NN! 	"&:>O&O&O%001EFF 'P&J!N  	"#"3"3))*m*79 9J!N  	"#"3"3))*m*79 9J!N  	"#"3"3))*6I*79 9J!N  	"#"3"3))*n*79 9J!N  	"&J"6"6))*o*79 9J!N"--j99
 $''E2288::<//!ErC   Nc                    d}d}|                     |          }|r<t          j        d          }|                    d|                                          }t          |          D ]m\  }}	|	                                }
|r.||                    d|
                                          v r|} n%t          |
          |k    rt          |
          }|}n||         }t          	                    |                                          S )z.Split the title to best part possible
        r   z[^a-zA-Z0-9\ ]r   )
rg   re   r~   rf   rP   	enumerater]   rh   TITLE_REPLACEMENTSr   )r@   rR   splitterhintlarge_text_lengthlarge_text_indextitle_piecesr   ititle_piececurrents              rA   r   zContentExtractor.split_title^  s    ~~e,, 	6:&788L##B--3355D (55 	% 	%NA{!''))G  0 0W = = C C E EEE#$ 7||///$'LL!#$  -.!,,U3399;;;rC   c                     g }|D ]@}ddd} | j         j        |j        fi |}d |D             }|                    |           A|dd         }fd|D             }t	          t          |                    }|S )zbTakes a source url and a list of category objects and returns
        a list of feed urls
        typezapplication\/rss\+xmlrz   c                 b    g | ],}|                     d           |                     d           -S hrefget)r^   es     rA   r`   z2ContentExtractor.get_feed_urls.<locals>.<listcomp>  s1    OOO1vOvOOOrC   N2   c                 :    g | ]}t          j        |          S r[   r   prepare_url)r^   f
source_urls     rA   r`   z2ContentExtractor.get_feed_urls.<locals>.<listcomp>  s6     5 5 5   +Az:: 5 5 5rC   )r=   r   r   r   listset)r@   r   
categoriestotal_feed_urlscategorykwargsfeed_elements	feed_urlss    `      rA   get_feed_urlszContentExtractor.get_feed_urlsw  s     " 	. 	.H$/FGGF8DK8( ( &( (MOOOOOI""9----)#2#.5 5 5 5$35 5 5s?3344rC   c                     dddd} | j         j        |fi |}|r#| j                             |d         d          }|S dS )zExtract the favicon from a website http://en.wikipedia.org/wiki/Favicon
        <link rel="shortcut icon" type="image/png" href="favicon.png" />
        <link rel="icon" type="image/png" href="favicon.png" />
        linkrs   iconr   r{   r|   r   r   r   )r=   r   r   )r@   r   r   r}   favicons        rA   get_faviconzContentExtractor.get_favicon  s\    
  @@+t{+C::6:: 	k..tAw??GNrrC   c                 J   | j                             |d          }|Jddddddddg}|D ];} | j         j        |fi |}|r$| j                             |d	         d
          } n<|r8|dd         }t          j        t
          |          r|                                S dS )z+Extract content language from meta
        lang)r{   Nr}   z
http-equivzcontent-languager   rr   r   r   rd   )r=   r   r   re   rK   RE_LANGrP   )r@   r   r{   itemsrV   r}   r|   s          rA   get_meta_langzContentExtractor.get_meta_lang  s     {''&'99< ,. .@@E
   3t{3C@@4@@ ;33Qi 4 1 1DE  	%!HEy%(( %{{}}$trC   c                     | j                             ||          }d}|4t          |          dk    r!| j                             |d         d          }|r|                                S dS )zExtract a given meta content form document.
        Example metaNames:
            "meta[name=description]"
            "meta[name=keywords]"
            "meta[property=og:type]"
        Nr   r   r   )r=   
css_selectrh   r   r]   )r@   r   metanamer}   r   s        rA   r   z!ContentExtractor.get_meta_content  sj     {%%c844D		Ak..tAw	BBG 	#==??"rrC   c                    dgdz  \  }}}}}|                      |d          }|sdddd} | j        j        |fdd	i|}	|	r|	d
                             d          nd}|sP|                      |d          }|s8dddd}
 | j        j        |fi |
}	|	r|	d
                             d          nd}|p|p|p|}|rt	          ||          S dS )z:Returns the 'top img' as specified by the website
        N   zmeta[property="og:image"]r   rs   zimg_src|image_srcr   	use_regexTr   r   zmeta[name="og:image"]r   r   )r   r=   r   r   r   )r@   article_urlr   top_meta_imagetry_onetry_two	try_threetry_fourlink_img_src_kwargselemslink_icon_kwargss              rA   get_meta_img_urlz!ContentExtractor.get_meta_img_url  s4    BF
=)X''-HII 	G8KLL  0DK0\\\H[\\E.3=eAhll6***G G 11#7NOO	  G/5uv'V'V$8DK8QQ@PQQE7<FuQx||F333$H DGDyDH 	8;777rrC   c                 .    |                      |d          S )z:Returns meta type of article, open graph protocol
        zmeta[property="og:type"]r   r@   r   s     rA   get_meta_typezContentExtractor.get_meta_type  s     $$S*DEEErC   c                 .    |                      |d          S )zHIf the article has meta description set in the source, use that
        zmeta[name=description]r  r  s     rA   get_meta_descriptionz%ContentExtractor.get_meta_description  s     $$S*BCCCrC   c                 .    |                      |d          S )zEIf the article has meta keywords set in the source, use that
        zmeta[name=keywords]r  r  s     rA   get_meta_keywordsz"ContentExtractor.get_meta_keywords  s     $$S*?@@@rC   c                 H   t          t                    }| j                            |d          }|D ]}|j                            d          p|j                            d          }|j                            d          p|j                            d          }|r|sp|                                |                                }}|                                rt          |          }d|vr|||<   |	                    d          }|
                    d          }||         }t          |t                    st          |t                    r||i||<   ||         }t          |          D ]\  }	}
|	t          |          dz
  k    r|||
<    n|                    |
          st                      ||
<   n]t          |                    |
          t                    s(t          |                    |
          t                    rd	||
         i||
<   ||
         }|S )
Nr}   r   rr   r   r|   r   r   r   
identifier)r   dictr=   r   attribr   r]   isdigitintrg   pop
isinstancestrr   rh   )r@   r   data
propertiespropr   r|   key_headrefidxparts              rA   get_meta_datazContentExtractor.get_meta_data  s   4  [++C88
 !	  !	 D+//*--H1H1HCKOOI..J$+//'2J2JE e ekkmmC}} #E

#~~!S	))C..CwwqzzHx.C#s## %z#s';'; %"*CX8n&s^^ 
  
 	T#c((Q,&& %CIEwwt}} : $CIIs33 :z#''$--QT7U7U : ".s4y 9CI$irC   c                 8   | j                             |ddd          }|r!| j                             |d         d          nd}|                     |d          }|p|pd}|r|                                }t          |          }|j        st          |          }t          j        d		                    |j                  |j
                  }		 |	                    d
          }
n# t          $ r
 |j
        }
Y nw xY wt          |j        |j        |
dddf          }|S )z
        Return the article's canonical URL

        Gets the first available value of:
        1. The rel=canonical tag
        2. The og:url tag
        r   rs   	canonicalr   r   r   r   zmeta[property="og:url"]z.*{}(?=/)/(.*)r   )r=   r   r   r   r]   r	   hostnamere   r   formatpathr   r   r
   scheme)r@   r  r   linksr&  og_urlmeta_urlparsed_meta_urlparsed_article_urlstrip_hostname_in_meta_path	true_paths              rA   get_canonical_linkz#ContentExtractor.get_canonical_link  sX    ,,Sf53> - @ @ CHODK,,U1Xv>>>R	&&s,EFF,," 	4~~''H&x00O"+ 4 &.k%:%:".0* &!3!<==)./0 /0+5 ; A A! D DII% 5 5 5 / 4III5 &'9'@'9'BI')2r'3 4 4 s   C$ $C87C8c                     ddi} | j         j        |fi |}d |D             }t          fd|D                       }|S )z<Return all of the images on an html page, lxml root
        r   imgc                 b    g | ],}|                     d           |                     d           -S )srcr   )r^   img_tags     rA   r`   z1ContentExtractor.get_img_urls.<locals>.<listcomp>7  sD     ? ? ?7;;u+=+=?E"" ? ? ?rC   c                 0    g | ]}t          |          S r[   )r   )r^   r   r  s     rA   r`   z1ContentExtractor.get_img_urls.<locals>.<listcomp>9  s3     * * *  !c22 * * *rC   )r=   r   r   )r@   r  r   
img_kwargsimg_tagsr   	img_linkss    `     rA   get_img_urlszContentExtractor.get_img_urls2  s     U^
/4;/BBzBB? ?'? ? ? * * * *$(* * * + +	rC   c                     |                      ||          }t          |          }|rt          ||d                   S dS )zRetrieves the first image in the 'top_node'
        The top node is essentially the HTML markdown where the main
        article lies and the first image in that area is probably signifigcant.
        r   r   )r<  r   r   )r@   r  top_nodenode_imagess       rA   get_first_img_urlz"ContentExtractor.get_first_img_url=  sG    
 ''X>>;'' 	8;A777rrC   c                 l    |g S ddi} | j         j        |fi |}|rd |D             S d |D             S )zZReturn a list of urls or a list of (url, title_text) tuples
        if specified.
        Nr   ac                 p    g | ]3}|                     d           |                     d           |j        f4S r   )r   r   r^   rB  s     rA   r`   z.ContentExtractor._get_urls.<locals>.<listcomp>U  s8    MMMquuV}}MQUU6]]AF+MMMrC   c                 b    g | ],}|                     d           |                     d           -S r   r   rD  s     rA   r`   z.ContentExtractor._get_urls.<locals>.<listcomp>V  s1    ???!v?f???rC   r=   r   )r@   r   titlesa_kwargsa_tagss        rA   	_get_urlszContentExtractor._get_urlsH  sf     ;I3<--c>>X>>  	NMMVMMMM??v????rC   Fc                 b   |t                               d           g S |rHt          j        ddt	          |                    }t          j        d|          }d |D             }|pg S t          |t                    r| j                            |          }n|}| 	                    ||          S )z`doc_or_html`s html page or doc and returns list of urls, the regex
        flag indicates we don't parse via lxml and just search the html.
        Nz0Must extract urls from either html, text or doc!rY   r   zNhttp[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+c                 6    g | ]}|                                 S r[   r\   )r^   r   s     rA   r`   z-ContentExtractor.get_urls.<locals>.<listcomp>e  ra   rC   )
logcriticalre   rf   r  findallr  r=   
fromstringrJ  )r@   doc_or_htmlrG  regexr   s        rA   get_urlszContentExtractor.get_urlsX  s     LLKLLLI 	%&S#k2B2BCCK*01<> >K ;:k:::K$"$k3'' 	+((55CCC~~c6***rC   c           
         |                      |          }g }|D ]|}t          j        |d          }t          j        |d          }t          j        |d          }|s!|s| j        j        rt          d|z             h|r4|                    d          r| j        j        rt          d|z             |r+|dk    r%|dk    r| j        j        rt          d|z             |rt          j
        |          }	t          j
                  }
|	j                            d	          }d}|D ]Q}||
j        k    rD| j        j        r4t          d
t          |          dt          |
j                             d} nR|s2|	j        |
j        k    r"| j        j        rt          d|z             |	j        dv r | j        j        rt          d|z             |                    |dz   |z              d |                    d          D             }d|v r|                    d           t#          |          dk    r3t#          |d                   dk     r|                    ||z              ^| j        j        rt          d|z             ~g d}g }|D ]}t          j        |          }t          j
        |          j        }|dz   |z   }d}|D ]L}|                                |                                v r"| j        j        rt          d|z             d} nM|s|                    |           |                    d           t'          |          D ]h\  }}|                    d          rd|z   }|||<   n|                    d          r
d|z   }|||<   |                    d          r|dd         }|||<   it+          t-          |                    }fd|D             }d  |D             }|S )!zInputs source lxml root and source url, extracts domain and
        finds all of the top level urls, we are assuming that these are
        the category urls.
        cnn.com --> [cnn.com/latest, world.cnn.com, cnn.com/asia]
        F)allow_fragmentsz+elim category url %s for no domain and path#z'elim category url %s path starts with #httphttpsz7elim category url %s for bad scheme, not http nor https.zsubdomain contains at z and Tz(elim category url %s for domain mismatch)mr   z)elim category url %s for mobile subdomainz://c                 8    g | ]}t          |          d k    |S )r   )rh   r   s     rA   r`   z6ContentExtractor.get_category_urls.<locals>.<listcomp>  s#    HHHQSVVaZZqZZZrC   r   z
index.htmlr   r      z;elim category url %s for >1 path chunks or size path chunks)Br%   helpr(   legalr+   sitemapprofiler/   mobiler_  facebookmyspacer6   linkedinbebo
friendsterstumbleuponyoutubevimeostoremailr*   mapspasswordimgurflickrrK   subscriptionitunes	siteindexeventsstopjobsr#   
newsletterr0   academyshoppingpurchasezsite-mapr2   r1   rv  productr)   r,   ticketscouponsforumboardarchiver-   r.   zhow tor&   r'   chartsservicesr$   plusr3   loginsignupregister	developerproxyr   z4elim category url %s for subdomain contain stopword!z//zhttp:Nc                 :    g | ]}t          j        |          S r[   r   )r^   p_urlr   s     rA   r`   z6ContentExtractor.get_category_urls.<locals>.<listcomp>  s6     9 9 9" )%<< 9 9 9rC   c                     g | ]}||S r:   r[   )r^   cs     rA   r`   z6ContentExtractor.get_category_urls.<locals>.<listcomp>  s    CCCqQ]]]]rC   )rS  r   
get_scheme
get_domainget_pathr;   verboseprintr   r   extract	subdomainrg   domainr  rQ   removerh   rP   r   endswithr   r   )r@   r   r   	page_urlsvalid_categoriesr  r*  r  r)  	child_tld
domain_tldchild_subdomain_partssubdomain_containsr#  path_chunks	stopwords_valid_categoriesr  conjunctionbadbadwordr   category_urlss    `                     rA   get_category_urlsz"ContentExtractor.get_category_urlsn  s:    MM#&&	 >	? >	?E_UEBBBF_UEBBBF=>>>D $ ;& #G!" # # # ,, ;& MCeKLLL 6V++'0A0A;& : 0278 : : : *?&.u55	'/
;;
(1(;(A(A#(F(F%%*"1  Dz000;. I!E$'IIIIs:3D/E/E/E$G I I I-1* 1 * E")Z->>>{* !  *,1 2 4 4 4 ! (J66{* 5  +-2 3 5 5 5$++FUNV,CDDDD IH$**S//HHH;..&&|444{##q((SQ-@-@2-E-E$++FTM::::{* ?  57< = ? ? ?" " "	  & 	0 	0E=''D"*511;I*y0KC$  ==??k&7&7&9&999{* =  35: ; = = =CE :  0!((///  %%%!"344 	- 	-HAu&& -',!!$$!!$'' -%',!!$~~c"" -crc
',!!$ %6!7!7889 9 9 9&79 9 9CCMCCCrC   c                 t   t          t          |                    dk    rt          S | j                            |t
                    }|s)| j                            |t                    }|st          S g }|D ]3}| j                            |          }|r|                    |           4t          |          S Nr   )
rh   r   
NO_STRINGSr=   r   A_REL_TAG_SELECTORA_HREF_TAG_SELECTORr   rQ   r   )r@   r   elementstagselr   s         rA   extract_tagszContentExtractor.extract_tags  s    tCyy>>Q;))#% % 	"{--(* *H "!! 	! 	!B+%%b))C !C   4yyrC   c           	         d }|                      |          }t          d          }d}d}g }g }|D ]}	| j                            |	          }
|                     | j                                      |
          }|                     |	          }|                                dk    r|s|                    |	           t          |          }d}t          |          dz  }|D ]}	t          d          }| 
                    |	          r |dk    rt          d|z  dz            }|dz  }|dk    rp||z
  |k    rgt          |||z
  z
            }t          t          |t          d                               }t          |          |z   }|d	k    rt          d
          }| j                            |	          }
|                     | j                                      |
          }t          |                                |z             }| j                            |	          }|                     ||           |                     |d           ||vr|                    |           | j                            |          }|H|                     |d           |                     ||dz             ||vr|                    |           |dz  }|dz  }d}|D ]%}|                     |          }||k    r|}|}||}&|S )N      ?r   r>   rd   g      ?r   r      (   r   )nodes_to_checkfloatr=   r   r?   r>   get_stopword_countis_highlink_densityrQ   rh   is_boostablepowabsr  	getParentupdate_scoreupdate_node_count	get_score)r@   r   r>  r  starting_boostcntr   parent_nodesnodes_with_textnode	text_node
word_statshigh_link_densitynodes_numbernegative_scoringbottom_negativescore_nodesboost_scoreboosternegscoreupscoreparent_nodeparent_parent_nodetop_node_scorer   scores                            rA   calculate_best_nodez$ContentExtractor.calculate_best_node  sX   ,,S11s" 	- 	-D++D11I--t}-EE""9--  $ 8 8 > >,,..22;L2&&t,,,?++%*<%8%84%?"# %	 %	D((K  && (!88"'~)=(C"D"DK"a'Nb   1$)CCC#2lQ6FGI IG"'WeAhh)?)?(?"@"@K";//2BBH"}}&+Ahh++D11I--t}-EE""9-- *7799KGHHG+//55Kk7333"";222,..##K000 "&!6!6{!C!C!-&&'91===!!"4gkBBB%\99 ''(:;;;1HCFAA 	 	ANN1%%E~%%!&rC   c                 l   d}d}d}d}|                      |          }|D ]}| j                            |          }||k    rq||k    r dS | j                            |          }	|                     | j                                      |	          }
|
                                |k    r dS |dz  }dS )	au  A lot of times the first paragraph might be the caption under an image
        so we'll want to make sure if we're going to boost a parent node that
        it should be connected to other paragraphs, at least for the first n
        paragraphs so we'll want to make sure that the next sibling is a
        paragraph and has at least some substantial weight to it.
        pr   r      Fr  Tr   )walk_siblingsr=   getTagr   r?   r>   r  )r@   r  para
steps_awayminimum_stopword_countmax_stepsaway_from_nodenodescurrent_nodecurrent_node_tagparagraph_textr  s              rA   r  zContentExtractor.is_boostableF  s     
!""#""4((! 	  	 L#{11,??4''!888 55!%!4!4\!B!B!114=1II&&~66 00225KKK44a
urC   c                 6    | j                             |          S r:   )r=   previousSiblingsr@   r  s     rA   r  zContentExtractor.walk_siblingsa  s    {++D111rC   c                     |                      |          }|                     |          }|D ]3}|                     ||          }|D ]}|                    d|           4|S r  )get_siblings_scorer  get_siblings_contentinsert)r@   r>  baseline_score_siblings_pararesultsr  psr  s          rA   add_siblingszContentExtractor.add_siblingsd  s    '+'>'>x'H'H$$$X..# 	& 	&L**:< <B & &1%%%%&rC   c                    |j         dk    rRt          | j                            |                    dk    r'|}|j        rt          j        |          }d|_        |gS | j                            |d          }|dS g }|D ]}| j                            |          }t          |          dk    r|                     | j	                  
                    |          }|
                                }	t          d          }
|                     |          }t          ||
z            }||	k     r4|s2| j                            d|d          }|                    |           |S )	zDAdds any siblings that may have a decent score to this node
        r  r   r   r   Nr  g333333?)r   r   tail)r   rh   r=   r   r  copydeepcopyr   r?   r>   r  r  r  createElementrQ   )r@   current_siblingr  e0potential_paragraphsr  first_paragraphr   r  paragraph_scoresibling_baseline_scorer  r  r  s                 rA   r  z%ContentExtractor.get_siblings_contentn  s    #%%DK//@@AAAEE Bw ]2&&4K#';#?#?S $@ $* $* #+t'; ) )O;..??D4yy1}}%)%9%9%)] &: &4 &4..t44 # +5*G*G*I*I16s.,0,D,D+-- --) %&B&<'= !> !> ?22;L2 $ 9 9$'d !: !? !?AIIaLLL	rC   c                    d}d}d}| j                             |d          }|D ]}| j                             |          }|                     | j                                      |          }|                     |          }	|                                dk    r|	s|dz  }||                                z  }|dk    r||z  }|S )a  We could have long articles that have tons of paragraphs
        so if we tried to calculate the base score against
        the total text score of those paragraphs it would be unfair.
        So we need to normalize the score based on the average scoring
        of the paragraphs within the top node.
        For example if our total score of 10 paragraphs was 1000
        but each had an average value of 100 then 100 should be our base.
        i r   r  r   r  rd   r   )r=   r   r   r?   r>   r  r  )
r@   r>  baseparagraphs_numberparagraphs_scorer  r  r  r  r  s
             rA   r  z#ContentExtractor.get_siblings_score  s     55hC5HH" 	D 	DD++D11I--t}-EE""9--  $ 8 8 > >,,..22;L2!Q&! J$A$A$C$CC q  #&77DrC   c                     d}| j                             |d          }|rt          |          }||z   }| j                             |dt	          |                     dS )zAdds a score to the gravityScore Attribute we put on divs
        we'll get the current score then add the score we're passing
        in to the current.
        r   gravityScoreN)r=   r   r  setAttributer  )r@   r  add_to_scorecurrent_scorescore_string	new_scores         rA   r  zContentExtractor.update_score  se    
 {//nEE 	0!,//M!L0	  ~s9~~FFFFFrC   c                     d}| j                             |d          }|rt          |          }||z   }| j                             |dt	          |                     dS )z=Stores how many decent nodes are under a parent node
        r   gravityNodesN)r=   r   r  r  r  )r@   r  add_to_countr  count_stringr  s         rA   r  z"ContentExtractor.update_node_count  se     {//nEE 	.--M!L0	  ~s9~~FFFFFrC   c                 x   | j                             |d          }|sdS | j                             |          }d |                                D             }|sdS t	          t          |                    }g }|D ]/}|                    | j                             |                     0d                    |          }|                                }	t	          t          |	                    }
t	          t          |                    }t	          |
|z            }t	          ||z            }|dk    rdS dS )zChecks the density of links within a node, if there is a high
        link to text ratio, then the text is less likely to be relevant
        rB  r   Fc                 :    g | ]}|                                 |S r[   )isalnum)r^   words     rA   r`   z8ContentExtractor.is_highlink_density.<locals>.<listcomp>  s%    AAA$$,,..AAAArC   Tr   r  )r=   r   r   rg   r  rh   rQ   ri   )r@   r   r+  r   wordswords_numbersbr   	link_text
link_wordsnum_link_words	num_linkslink_divisorr  s                 rA   r  z$ContentExtractor.is_highlink_density  s/    ,,QC,88 	5{""1%%AA$**,,AAA 	4SZZ(( 	1 	1DIIdk))$//0000GGBKK	__&&
s://#e**%%	^l:;;lY.//C<<4urC   c                 0    |                      |          pdS )z>Returns the gravityScore as an integer from this node
        r   )get_node_gravity_scorer  s     rA   r  zContentExtractor.get_score  s     **4005A5rC   c                 ^    | j                             |d          }|sd S t          |          S )Nr  )r=   r   r  )r@   r  gravity_scores      rA   r  z'ContentExtractor.get_node_gravity_score  s4    00~FF 	4]###rC   c                 V    g }dD ]#}| j                             ||          }||z  }$|S )zXReturns a list of nodes we want to search
        on like paragraphs and tables
        )r  pretdr   rF  )r@   r   r  r   r   s        rA   r  zContentExtractor.nodes_to_check  sD     % 	$ 	$CK00#0>>Ee#NNrC   c                 N   | j                             |d          }|D ]I}| j                             |          }t          |          dk     r| j                             |           J| j                             |d          }t          |          dk    r|j        dk    rdS dS )Nr  r      r   r  TF)r=   r   r   rh   r  r   )r@   r   sub_paragraphsr  txtsub_paragraphs_2s         rA   is_table_and_no_para_existz+ContentExtractor.is_table_and_no_para_exist  s    55aS5AA 	& 	&A+%%a((C3xx"}}""1%%%;77s7CC  A%%!%4--4urC   c                     |                      |          }|                      |          }t          |dz            }||k     r|j        dk    rdS dS )Ng{Gz?r  FT)r  r  r   )r@   r  r   r  current_node_score	thresholds         rA   is_nodescore_threshold_metz+ContentExtractor.is_nodescore_threshold_met  sV    --!^^A...3.//	**5trC   c                    |                      |          }| j                            |          D ]Q}| j                            |          }|dk    r/|                     |          r| j                            |           R|S )zRemove any divs that looks like non-content, clusters of links,
        or paras with no gusto; add adjacent nodes which look contenty
        r  )r  r=   getChildrenr  r  r  )r@   r>  r  r   e_tags        rA   post_cleanupzContentExtractor.post_cleanup  s       **((.. 	* 	*AK&&q))E||++A.. *K&&q)))rC   r:   )FF)(__name__
__module____qualname__rB   rG   r   r   r   r   r   r   r   r   r  r  r  r  r$  r2  r<  r@  rJ  rS  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r   r$  r[   rC   rA   r8   r8   7   so       6 6 6; ; ;X% X% X%H= = =~q q qf< < < <2  $
 
 
  2    2F F F
D D D
A A A
% % %N$ $ $L	 	 		 	 	@ @ @ + + + +,| | ||  $F F FP  62 2 2  " " "H  8G G G	G 	G 	G  86 6 6
$ $ $  
 
 
  
 
 
 
 
rC   r8   ).__doc__	__title__
__author____license____copyright__r  loggingre   collectionsr   dateutil.parserr   r   r   urllib.parser   r	   r
   r   r   utilsr   r   	getLoggerr%  rM  r   ESCAPED_FRAGMENT_REPLACEMENTr   r   r   r   r   r   COLON_SPLITTERSPACE_SPLITTERr   r  r  r  r   
good_paths
bad_chunksbad_domainsobjectr8   r[   rC   rA   <module>r:     s    	
/   				 				 # # # # # # 0 0 0 0 0 0 ! ! ! ! ! ! 6 6 6 6 6 6 6 6 6 6       4 4 4 4 4 4 4 4g!!&&z266 00
  "  " &&y$77 u%%u%%$nS)) $$ .(($$$$SUU
! B 
% % %
A A A
 322_ _ _ _ _v _ _ _ _ _rC   