
    !}g{                         d Z ddlZddlZddlZddlZddlZddlmZ ddl	Z	ddl
mZ ddlmZ ddlmZ  ej        e          Z G d d	e          ZdS )
a  
Newspaper uses a lot of python-goose's parsing code. View theirlicense:
https://github.com/codelucas/newspaper/blob/master/GOOSE-LICENSE.txt

Parser objects will only contain operations that manipulate
or query an lxml or soup dom object generated from an article's html.
    N)unescape)UnicodeDammit)deepcopy   )textc                      e Zd Zed             Zed             Zed             Zed             Zed             Zed             Z	ed             Z
ed             Zed	             Zed
             Ze	 d$defd            Zed             Zed             Zed             Zed             Zed             Zed             Zed%d            Zed             Zed             Zed             Zed             Zed             Zed             Zed             Zed             Zed             Zed&d             Z ed&d!            Z!ed'd"            Z"ed#             Z#dS )(Parserc                 <    d}|                     |d|i          }|S )N$http://exslt.org/regular-expressionsre
namespacesxpath)clsnode
expressionregexp_namespaceitemss        Q/var/www/py-google-trends/myenv/lib/python3.11/site-packages/newspaper/parsers.pyxpath_rezParser.xpath_re   s(    A

:49I2J
KK    c                     t          |t                    r|D ]}|                                 d S |                                 d S N)
isinstancelistdrop_tag)r   nodesr   s      r   r   zParser.drop_tag!   sT    eT"" 	        NNr   c                 ,    |                     |          S r   )	cssselect)r   r   selectors      r   
css_selectzParser.css_select)   s    ~~h'''r   c                     t          |t                    r|S |s|S t          |d          }|j        s*t	          dd                    |j                  z            |j        }|S )NT)is_htmlz4Failed to detect encoding of article HTML, tried: %sz, )r   strr   unicode_markup	Exceptionjointried_encodings)r   html	converteds      r   get_unicode_htmlzParser.get_unicode_html-   s~    dC   	K 	K!$555	' 	6F		)34456 6 6 'r   c                 Z   |                      |          }	 |                    d          r"t          j        dd|t          j                  }t
          j                            |          | _        | j        S # t          $ r' t                              d|d d                    Y d S w xY w)Nz<?z^\<\?.*?\?\> )flagsz.fromstring() returned an invalid string: %s...   )r,   
startswithr   subDOTALLlxmlr*   
fromstringdocr'   logwarn)r   r*   s     r   r5   zParser.fromstring;   s    ##D))	t$$ Jvor4ryIIIi**400CG7N 	 	 	HHEtCRCyQQQFF	s   A!A9 9-B*)B*c                     t           j        j                                        }d|_        d|_        g d|_        d|_        |                    |          S )NT)aspanpbrstrongbemittcodepre
blockquoteimgh1h2h3h4h5h6ulollidldtddF)	r4   r*   cleanCleaner
javascriptstyle
allow_tagsremove_unknown_tags
clean_html)r   r   article_cleaners      r   clean_article_htmlzParser.clean_article_htmlJ   sY    )/1133%)" $&0 &0 &0"
 /4+))$///r   c                 h    t           j                            |d                                          S )zc`decode` is needed at the end because `etree.tostring`
        returns a python bytestring
        r*   )method)r4   etreetostringdecoder   r   s     r   nodeToStringzParser.nodeToStringW   s+    
 z""4"77>>@@@r   c                     ||_         d S r   tag)r   r   re   s      r   
replaceTagzParser.replaceTag^   s    r   c                 4    t          j        j        |g|R   d S r   )r4   r^   
strip_tags)r   r   tagss      r   	stripTagszParser.stripTagsb   s#    
d*T******r   c                 N    d|z  }|                     |          }|r|d         S d S )Nz//*[@id="%s"]r   r   )r   r   iddr!   elemss        r   getElementByIdzParser.getElementByIdf   s3    "S(

8$$ 	8Otr   NFreturnc                 &   d }d|pdz  }|rQ|rO|rddi}|d|d|d}n=d|dt           j        d	t           j        d
}	|d|	d|                                d}|                    ||          }
||
v r|s|r|
                    |           |
S )Nzdescendant-or-self::%s*r   r   z
[re:test(@z, "z", "i")]ztranslate(@z", "z")z
[contains(z")]r   )stringascii_uppercaseascii_lowercaselowerr   remove)r   r   re   attrvaluechilds	use_regexNSr!   transrm   s              r   getElementsByTagzParser.getElementsByTagn   s     +szc: 	WE 	W WBC<DHHdddEEERW 9=f>T>T>TV\VlVlVlm7?xxV

8
33 5==c=V=LLr   c                 0    |                     |           d S r   )append)r   r   childs      r   appendChildzParser.appendChild   s    Er   c                      t          |          S r   )r   ra   s     r   
childNodeszParser.childNodes   s    Dzzr   c                    |}|j         rNt          j                                        }|j         |_         d|_        d |_         |                    d|           t          t          |                    D ]c\  }}|                    |          }|j        dk    r&|j	        r6| 
                    d|j	        d           }|                    |dz   |           dt          |          S )Nr   r   )re   r   tailr   )r   r4   r*   HtmlElementre   insert	enumerater   indexr   createElement)r   r   roottcnidxs          r   childNodesWithTextzParser.childNodesWithText   s     9 		%%''AYAFAEDIKK1d4jj)) 	( 	(DAq**Q--Cuv (%%&qvD%IIC!GQ'''Dzzr   c                 ,    |                      |          S r   )r5   )r   r   s     r   
textToParazParser.textToPara   s    ~~d###r   c                 *    |                                 S r   )getchildrenra   s     r   getChildrenzParser.getChildren       !!!r   c                 t    dd                     d |D                       z  }|                    |          }|S )Nzdescendant::*[%s]z or c              3       K   | ]	}d |z  V  
dS )zself::%sN ).0re   s     r   	<genexpr>z+Parser.getElementsByTags.<locals>.<genexpr>   s'      99S
S(999999r   )r(   r   )r   r   ri   r!   rm   s        r   getElementsByTagszParser.getElementsByTags   s@    &KK99D99999;

8$$r   r<   c                 l    t           j                                        }||_        ||_        ||_        |S r   )r4   r*   r   re   r   r   )r   re   r   r   r   s        r   r   zParser.createElement   s/    I!!##r   c                 ,    |                     d          S )Nz//comment()r   ra   s     r   getCommentszParser.getComments   s    zz-(((r   c                 *    |                                 S r   )	getparentra   s     r   	getParentzParser.getParent   s    ~~r   c                 \   |                                 }||j        rc|                                }|'|j        sd|_        |xj        d|j        z   z  c_        n&|j        sd|_        |xj        d|j        z   z  c_        |                                 |                    |           d S d S )Nr.    )r   r   getpreviousr   clearrv   )r   r   parentprevs       r   rv   zParser.remove   s    !!y 	1''))<!; )&(KK3?2KKK9 '$&	IIty0IIJJLLLMM$ r   c                     |j         S r   rd   ra   s     r   getTagzParser.getTag   s	    xr   c                     d |                                 D             }t          j        d                    |                                                    S )Nc                     g | ]}|S r   r   )r   rA   s     r   
<listcomp>z"Parser.getText.<locals>.<listcomp>   s    +++a+++r   r   )itertextr   	innerTrimr(   strip)r   r   txtss      r   getTextzParser.getText   sC    ++4==??+++~chhtnn2244555r   c                 B    d |                     d          D             S )z\
            returns preceding siblings in reverse order (nearest sibling is first)
        c                     g | ]}|S r   r   )r   r   s     r   r   z+Parser.previousSiblings.<locals>.<listcomp>   s    ===a===r   T)	preceding)itersiblingsra   s     r   previousSiblingszParser.previousSiblings   s)    
 >=4,,t,<<====r   c                 *    |                                 S r   )r   ra   s     r   previousSiblingzParser.previousSibling   r   r   c                 *    |                                 S r   )getnextra   s     r   nextSiblingzParser.nextSibling   s    ||~~r   c                      |j         dk    rdndS )Nr   TFrd   ra   s     r   
isTextNodezParser.isTextNode   s    x6))ttu4r   c                 b    |r|j                             |d           }|rt          |          }|S r   )attribgetr   )r   r   rw   s      r   getAttributezParser.getAttribute   s7     	/;??4..D 	"D>>Dr   c                 \    |r'|j                             |d           }|r|j         |= d S d S d S r   )r   r   )r   r   rw   _attrs       r   delAttributezParser.delAttribute   sK     	&KOOD$//E &K%%%	& 	&& &r   c                 B    |r|r|                     ||           d S d S d S r   )set)r   r   rw   rx   s       r   setAttributezParser.setAttribute   s?     	"E 	"HHT5!!!!!	" 	" 	" 	"r   c                 j    |}|j         rt          |          }d |_         |                     |          S r   )r   r   rb   )r   r   e0s      r   	outerHtmlzParser.outerHtml  s7    7 	"BBG###r   )NNNFF)r<   NNr   )NN)$__name__
__module____qualname__classmethodr   r   r"   r,   r5   r[   rb   rf   rj   rn   r   r}   r   r   r   r   r   r   r   r   r   rv   r   r   r   r   r   r   r   r   r   r   r   r   r   r	   r	      s         [
   [ ( ( [(   [   [ 
0 
0 [
0 A A [A   [ + + [+   [ PU Z^   [&   [   [   [, $ $ [$ " " ["   [    [ ) ) [)     [      [     [ 6 6 [6 > > [> " " ["   [ 5 5 [5    [ & & & [& " " " [" $ $ [$ $ $r   r	   )__doc__logging
lxml.etreer4   	lxml.htmllxml.html.cleanr   r*   r   rr   bs4r   copyr   r.   r   	getLoggerr   r7   objectr	   r   r   r   <module>r      s                  				                         g!!n$ n$ n$ n$ n$V n$ n$ n$ n$ n$r   