1717from .htmls import get_title
1818from .htmls import shorten_title
1919
20- zlog = logging .getLogger ('zenya ' )
20+ zlog = logging .getLogger ('econtext.text ' )
2121
# Python 2.7 compatibility: rebind ``str`` to ``unicode`` when running on a
# Python 2 interpreter.  The check uses the numeric ``sys.version_info`` tuple
# instead of comparing the free-form ``sys.version`` string lexicographically,
# which is not a reliable way to test the interpreter version.
if sys.version_info[0] < 3:
    str = unicode  # noqa: F821 -- ``unicode`` only exists on Python 2
2525
2626REGEXES = {
27- 'unlikelyCandidatesRe' : re .compile ('ad-break|agegate|cart|combx|comment|community|disclaimer|disqus|extra|foot|header|hidden|legal|menu|modal|nav|pager|pagination|polic|popup|reference|remark|review|rss|shoutbox|sidebar|slideshow|sponsor|toc|tweet|twitter|video|warranty' , re .I ),
28- 'okMaybeItsACandidateRe' : re .compile ('econtextmax|and|article|body|column|content|main|shadow|product|feature|detail|spec|about' , re .I ),
29- 'positiveRe' : re .compile ('econtextmax|and|article|body|column|content|main|shadow|product|feature|detail|spec|about|itemprop|text' , re .I ),
30- 'negativeRe' : re .compile ('ad|ad-break|agegate|cart|citation|combx|comment|community|disclaimer|disqus|extra|feedback|foot|form|fulfillment|header|hidden|legal|menu|modal|nav|pager|pagination|placeholder|polic|popup|qa|question|reference|remark|return|review|rss|shoutbox|sidebar|slideshow|small|sponsor|toc|tweet|twitter|video|warranty' , re .I ),
31- 'divToPElementsRe' : re .compile ('<(a|blockquote|dl|div|img|ol|p|pre|table|ul)' , re .I ),
32- 'negativeStyles' : re .compile ('display:.?none|visibility:.?hidden' , re .I )
27+ 'unlikelyCandidatesRe' : re .compile ('ad-break|agegate|cart|combx|comment|community|disclaimer|disqus|extra|foot|header|hidden|legal|menu|modal|nav|pager|pagination|polic|popup|reference|remark|review|rss|shoutbox|sidebar|slideshow|sponsor|toc|tweet|twitter|video|warranty' , re .I ),
28+ 'okMaybeItsACandidateRe' : re .compile ('econtextmax|and|article|body|column|content|main|shadow|product|feature|detail|spec|about|text|story ' , re .I ),
29+ 'positiveRe' : re .compile ('econtextmax|and|article|body|column|content|main|shadow|product|feature|detail|spec|about|itemprop|text|story|story-content ' , re .I ),
30+ 'negativeRe' : re .compile ('ad|ad-break|agegate|cart|citation|combx|comment|community|disclaimer|disqus|extra|feedback|foot|form|fulfillment|header|hidden|item |legal|menu|modal|nav|pager|pagination|placeholder|polic|popup|qa|question|reference|remark|return|review|rss|shoutbox|sidebar|slideshow|small|sponsor|toc|tweet|twitter|video|warranty' , re .I ),
31+ 'divToPElementsRe' : re .compile ('<(a|article| blockquote|dl|div|img|ol|p|pre|table|ul|main )' , re .I ),
32+ 'negativeStyles' : re .compile ('display:.?none|visibility:.?hidden' , re .I )
3333 #'replaceBrsRe': re.compile('(<br[^>]*>[ \n\r\t]*){2,}',re.I),
3434 #'replaceFontsRe': re.compile('<(\/?)font[^>]*>',re.I),
3535 #'trimRe': re.compile('^\s+|\s+$/'),
@@ -100,7 +100,7 @@ class Document:
100100
101101 METAPROPS = ['description' , 'title' , 'keywords' , 'og:title' , 'og:description' , 'twitter:description' , 'twitter:title' ]
102102 ITEMPROPS = ['model' , 'brand' , 'description' , 'name' ]
103- BADTAGS = ['nav ' , 'footer ' , 'header ' , 'aside' ]
103+ BADTAGS = ['footer ' , 'header ' , 'nav ' , 'aside' , 'script' , 'style ' ]
104104
105105 def __init__ (self , input , ** options ):
106106 """Generate the document
@@ -211,6 +211,9 @@ def addProps(self, dedupe, base=None):
211211 base = self .html .find (".//body" )
212212 for elem in self .html .xpath (".//*[@itemprop]" ):
213213 if elem .attrib .get ('itemprop' ) in self .ITEMPROPS :
214+ ancestors = set (a .tag for a in elem .iterancestors ())
215+ if len (ancestors .intersection (set (Document .BADTAGS ))) > 0 :
216+ continue
214217 metacontent = elem .attrib .get ('content' , elem .text_content ().strip ())
215218 if dedupe .get (elem .attrib .get ('itemprop' )) != metacontent :
216219 meta = fragment_fromstring (u'<p class="econtextmax itemprop {}">{}</p>' .format (elem .attrib .get ('itemprop' ), re .sub ("<.*?>" , '' , metacontent )))
@@ -231,7 +234,7 @@ def summary(self, html_partial=False):
231234 while True :
232235 self ._html (True )
233236 to_drop = []
234- for i in self .tags (self .html , 'script' , 'style' ):
237+ for i in self .tags (self .html , * self . BADTAGS ):
235238 to_drop .append (i )
236239 for i in to_drop :
237240 i .drop_tree ()
@@ -651,10 +654,10 @@ def sanitize(self, node, candidates):
651654 if el .getparent () is not None :
652655 el .drop_tree ()
653656
654- for el in ([node ] + [n for n in node .iter ()]):
655- if not self .options .get ('attributes' , None ):
656- #el.attrib = {} #FIXME:Checkout the effects of disabling this
657- pass
657+ # for el in ([node] + [n for n in node.iter()]):
658+ # if not self.options.get('attributes', None):
659+ # #el.attrib = {} #FIXME:Checkout the effects of disabling this
660+ # pass
658661
659662 self .html = node
660663 return self .get_clean_html ()
0 commit comments