diff --git a/fixtures/test_tags_kexp.json b/fixtures/test_tags_kexp.json index c006c31..0c7deb4 100644 --- a/fixtures/test_tags_kexp.json +++ b/fixtures/test_tags_kexp.json @@ -1,11 +1,12 @@ { - "url": "http://blogs.kusp.org/filmgang/2013/02/08/stand-up-guys/", + "url": "http://blogs.kusp.org/filmgang/2013/02/08/stand-up-guys/", "expected": { "tags": [ - "kusp film review", - "Stand Up Guys", - "film", + "kusp film review", + "Stand Up Guys", + "film", "Dennis Morton" - ] + ], + "cleaned_text": "Listen to the review by Dennis Morton, above." } -} \ No newline at end of file +} diff --git a/lib/cleaner.js b/lib/cleaner.js index 4305509..91d8d92 100644 --- a/lib/cleaner.js +++ b/lib/cleaner.js @@ -118,28 +118,28 @@ void function () { kidTextNode = kid; kidText = kid.text(); replaceText = kidText.replace(/\n/g, '\n\n').replace(/\t/g, '').replace(/^\s+$/g, ''); - if (replaceText.length > 1) { - previousSiblingNode = kidTextNode.prev(); - while (previousSiblingNode[0] && previousSiblingNode[0].name === 'a' && previousSiblingNode.attr('grv-usedalready') !== 'yes') { - outer = ' ' + doc.html(previousSiblingNode) + ' '; + if (!(replaceText.length > 1)) + return; + previousSiblingNode = kidTextNode.prev(); + while (previousSiblingNode[0] && previousSiblingNode[0].name === 'a' && previousSiblingNode.attr('grv-usedalready') !== 'yes') { + outer = ' ' + doc.html(previousSiblingNode) + ' '; + replacementText.push(outer); + nodesToRemove.push(previousSiblingNode); + previousSiblingNode.attr('grv-usedalready', 'yes'); + previousSiblingNode = previousSiblingNode.prev(); + } + replacementText.push(replaceText); + nextSiblingNode = kidTextNode.next(); + return function (accum$) { + while (nextSiblingNode[0] && nextSiblingNode[0].name === 'a' && nextSiblingNode.attr('grv-usedalready') !== 'yes') { + outer = ' ' + doc.html(nextSiblingNode) + ' '; replacementText.push(outer); - nodesToRemove.push(previousSiblingNode); - previousSiblingNode.attr('grv-usedalready', 'yes'); - previousSiblingNode = previousSiblingNode.prev(); + nodesToRemove.push(nextSiblingNode); + nextSiblingNode.attr('grv-usedalready', 'yes'); + accum$.push(previousSiblingNode = nextSiblingNode.next()); } - replacementText.push(replaceText); - nextSiblingNode = kidTextNode.next(); - return function (accum$) { - while (nextSiblingNode[0] && nextSiblingNode[0].name === 'a' && nextSiblingNode.attr('grv-usedalready') !== 'yes') { - outer = ' ' + doc.html(nextSiblingNode) + ' '; - replacementText.push(outer); - nodesToRemove.push(nextSiblingNode); - nextSiblingNode.attr('grv-usedalready', 'yes'); - accum$.push(previousSiblingNode = nextSiblingNode.next()); - } - return accum$; - }.call(this, []); - } + return accum$; + }.call(this, []); } else { return nodesToReturn.push(doc(kid).html()); } @@ -180,17 +180,17 @@ void function () { div = doc(this); items = div.find(tags.join(', ')); if (items.length === 0) { - return replaceWithPara(doc, this); - } else { - replaceNodes = getReplacementNodes(doc, div); - html = ''; - _.each(replaceNodes, function (node) { - if (node !== '') - return html += '
' + node + '
'; - }); - div.empty(); - return doc(div).replaceWith('' + html); + replaceWithPara(doc, this); + return; } + replaceNodes = getReplacementNodes(doc, div); + html = ''; + _.each(replaceNodes, function (node) { + if (node !== '') + return html += '' + node + '
'; + }); + div.empty(); + return doc(div).replaceWith('' + html); }); }; cleanErrantLinebreaks = function (doc) { diff --git a/lib/extractor.js b/lib/extractor.js index 58212cc..4ceb469 100644 --- a/lib/extractor.js +++ b/lib/extractor.js @@ -240,19 +240,19 @@ void function () { var currentNode, currentNodeTag, paraText, wordStats; currentNode = doc(this); currentNodeTag = currentNode[0].name; - if (currentNodeTag === 'p') { - if (stepsAway >= maxStepsawayFromNode) { - boostable = false; - return false; - } - paraText = currentNode.text(); - wordStats = stopwords(paraText, lang); - if (wordStats.stopwordCount > minimumStopwordCount) { - boostable = true; - return false; - } - return stepsAway += 1; + if (!(currentNodeTag === 'p')) + return; + if (stepsAway >= maxStepsawayFromNode) { + boostable = false; + return false; + } + paraText = currentNode.text(); + wordStats = stopwords(paraText, lang); + if (wordStats.stopwordCount > minimumStopwordCount) { + boostable = true; + return false; } + return stepsAway += 1; }); return boostable; }; @@ -272,31 +272,27 @@ void function () { }; getSiblingsContent = function (doc, lang, currentSibling, baselinescoreSiblingsPara) { var potentialParagraphs, ps; - if (currentSibling[0].name === 'p' && currentSibling.text().length > 0) { + if (currentSibling[0].name === 'p' && currentSibling.text().length > 0) return [currentSibling]; - } else { - potentialParagraphs = currentSibling.find('p'); - if (potentialParagraphs === null) { - return null; - } else { - ps = []; - potentialParagraphs.each(function () { - var firstParagraph, highLinkDensity, paragraphScore, score, siblingBaselineScore, txt, wordStats; - firstParagraph = doc(this); - txt = firstParagraph.text(); - if (txt.length > 0) { - wordStats = stopwords(txt, lang); - paragraphScore = wordStats.stopwordCount; - siblingBaselineScore = .3; - highLinkDensity = isHighlinkDensity(doc, firstParagraph); - score = baselinescoreSiblingsPara * siblingBaselineScore; - if (score < paragraphScore && !highLinkDensity) - return ps.push(txt); - } - }); - return ps; - } - } + potentialParagraphs = currentSibling.find('p'); + if (potentialParagraphs === null) + return null; + ps = []; + potentialParagraphs.each(function () { + var firstParagraph, highLinkDensity, paragraphScore, score, siblingBaselineScore, txt, wordStats; + firstParagraph = doc(this); + txt = firstParagraph.text(); + if (!(txt.length > 0)) + return; + wordStats = stopwords(txt, lang); + paragraphScore = wordStats.stopwordCount; + siblingBaselineScore = .3; + highLinkDensity = isHighlinkDensity(doc, firstParagraph); + score = baselinescoreSiblingsPara * siblingBaselineScore; + if (score < paragraphScore && !highLinkDensity) + return ps.push(txt); + }); + return ps; }; getSiblingsScore = function (doc, topNode, lang) { var base, nodesToCheck, paragraphsNumber, paragraphsScore; @@ -377,22 +373,14 @@ void function () { return doc(p).remove(); }); subParagraphs2 = e.find('p'); - if (subParagraphs2.length === 0 && e[0].name !== 'td') { - return true; - } else { - return false; - } + return subParagraphs2.length === 0 && e[0].name !== 'td'; }; isNodescoreThresholdMet = function (doc, node, e) { var currentNodeScore, thresholdScore, topNodeScore; topNodeScore = getScore(node); currentNodeScore = getScore(e); thresholdScore = topNodeScore * .08; - if (currentNodeScore < thresholdScore && e[0].name !== 'td') { - return false; - } else { - return true; - } + return currentNodeScore >= thresholdScore || e[0].name === 'td'; }; postCleanup = function (doc, targetNode, lang) { var node; @@ -401,10 +389,10 @@ void function () { var e, eTag; e = doc(this); eTag = e[0].name; - if (!(eTag === 'p' || eTag === 'a')) - if (isHighlinkDensity(doc, e) || isTableAndNoParaExist(doc, e) || !isNodescoreThresholdMet(doc, node, e)) { - return doc(e).remove(); - } + if (eTag === 'p' || eTag === 'a') + return; + if (isHighlinkDensity(doc, e) || isTableAndNoParaExist(doc, e) || !isNodescoreThresholdMet(doc, node, e)) + return doc(e).remove(); }); return node; }; diff --git a/lib/unfluff.js b/lib/unfluff.js index a724ad5..816c7b7 100644 --- a/lib/unfluff.js +++ b/lib/unfluff.js @@ -96,6 +96,7 @@ void function () { return this.cleanedDoc_; doc = getParsedDoc.call(this, html); this.cleanedDoc_ = cleaner(doc); + this.doc_ = cheerio.load(html); return this.cleanedDoc_; }; }.call(this); diff --git a/src/cleaner.coffee b/src/cleaner.coffee index f2dd49d..df05db6 100644 --- a/src/cleaner.coffee +++ b/src/cleaner.coffee @@ -105,26 +105,26 @@ getReplacementNodes = (doc, div) -> kidText = kid.text() replaceText = kidText.replace(/\n/g, "\n\n").replace(/\t/g, "").replace(/^\s+$/g, "") - if(replaceText.length) > 1 - previousSiblingNode = kidTextNode.prev() + return unless replaceText.length > 1 + previousSiblingNode = kidTextNode.prev() - while previousSiblingNode[0] && previousSiblingNode[0].name == "a" && previousSiblingNode.attr('grv-usedalready') != 'yes' - outer = " " + doc.html(previousSiblingNode) + " " - replacementText.push(outer) - nodesToRemove.push(previousSiblingNode) - previousSiblingNode.attr('grv-usedalready', 'yes') - previousSiblingNode = previousSiblingNode.prev() + while previousSiblingNode[0] && previousSiblingNode[0].name == "a" && previousSiblingNode.attr('grv-usedalready') != 'yes' + outer = " " + doc.html(previousSiblingNode) + " " + replacementText.push(outer) + nodesToRemove.push(previousSiblingNode) + previousSiblingNode.attr('grv-usedalready', 'yes') + previousSiblingNode = previousSiblingNode.prev() - replacementText.push(replaceText) + replacementText.push(replaceText) - nextSiblingNode = kidTextNode.next() + nextSiblingNode = kidTextNode.next() - while nextSiblingNode[0] && nextSiblingNode[0].name == "a" && nextSiblingNode.attr('grv-usedalready') != 'yes' - outer = " " + doc.html(nextSiblingNode) + " " - replacementText.push(outer) - nodesToRemove.push(nextSiblingNode) - nextSiblingNode.attr('grv-usedalready', 'yes') - previousSiblingNode = nextSiblingNode.next() + while nextSiblingNode[0] && nextSiblingNode[0].name == "a" && nextSiblingNode.attr('grv-usedalready') != 'yes' + outer = " " + doc.html(nextSiblingNode) + " " + replacementText.push(outer) + nodesToRemove.push(nextSiblingNode) + nextSiblingNode.attr('grv-usedalready', 'yes') + previousSiblingNode = nextSiblingNode.next() # otherwise else @@ -158,16 +158,17 @@ divToPara = (doc, domType) -> if items.length == 0 replaceWithPara(doc, this) - else - replaceNodes = getReplacementNodes(doc, div) + return + + replaceNodes = getReplacementNodes(doc, div) - html = "" - _.each replaceNodes, (node) -> - if node != '' - html += "#{node}
" + html = "" + _.each replaceNodes, (node) -> + if node != '' + html += "#{node}
" - div.empty() - doc(div).replaceWith("#{html}") + div.empty() + doc(div).replaceWith("#{html}") # For plain text nodes directly inside of p tags that contain random single # line breaks, remove those junky line breaks. They would never be rendered diff --git a/src/extractor.coffee b/src/extractor.coffee index 2589c95..b62cd23 100644 --- a/src/extractor.coffee +++ b/src/extractor.coffee @@ -259,22 +259,22 @@ isBoostable = (doc, node, lang) -> nodes.each () -> currentNode = doc(this) currentNodeTag = currentNode[0].name + return unless currentNodeTag == 'p' + + # Make sure the node isn't more than 3 hops away + if stepsAway >= maxStepsawayFromNode + boostable = false + return false - if currentNodeTag == "p" - # Make sure the node isn't more than 3 hops away - if stepsAway >= maxStepsawayFromNode - boostable = false - return false + paraText = currentNode.text() + wordStats = stopwords(paraText, lang) - paraText = currentNode.text() - wordStats = stopwords(paraText, lang) + # Check if the node contains more than 5 common words + if wordStats.stopwordCount > minimumStopwordCount + boostable = true + return false - # Check if the node contains more than 5 common words - if wordStats.stopwordCount > minimumStopwordCount - boostable = true - return false - - stepsAway += 1 + stepsAway += 1 boostable @@ -290,30 +290,27 @@ addSiblings = (doc, topNode, lang) -> return topNode getSiblingsContent = (doc, lang, currentSibling, baselinescoreSiblingsPara) -> - - if currentSibling[0].name == 'p' && currentSibling.text().length > 0 - return [currentSibling] - else - potentialParagraphs = currentSibling.find("p") - if potentialParagraphs == null - return null - else - ps = [] - potentialParagraphs.each () -> - firstParagraph = doc(this) - txt = firstParagraph.text() - - if txt.length > 0 - wordStats = stopwords(txt, lang) - paragraphScore = wordStats.stopwordCount - siblingBaselineScore = 0.30 - highLinkDensity = isHighlinkDensity(doc, firstParagraph) - score = baselinescoreSiblingsPara * siblingBaselineScore - - if score < paragraphScore && !highLinkDensity - ps.push(txt) - - return ps + return [currentSibling] if currentSibling[0].name == 'p' && currentSibling.text().length > 0 + + potentialParagraphs = currentSibling.find("p") + return null if potentialParagraphs == null + + ps = [] + potentialParagraphs.each () -> + firstParagraph = doc(this) + txt = firstParagraph.text() + return unless txt.length > 0 + + wordStats = stopwords(txt, lang) + paragraphScore = wordStats.stopwordCount + siblingBaselineScore = 0.30 + highLinkDensity = isHighlinkDensity(doc, firstParagraph) + score = baselinescoreSiblingsPara * siblingBaselineScore + + if score < paragraphScore && !highLinkDensity + ps.push(txt) + + return ps getSiblingsScore = (doc, topNode, lang) -> base = 100000 @@ -400,20 +397,14 @@ isTableAndNoParaExist = (doc, e) -> doc(p).remove() subParagraphs2 = e.find("p") - if subParagraphs2.length == 0 && e[0].name != "td" - return true - else - return false + return subParagraphs2.length == 0 && e[0].name != "td" isNodescoreThresholdMet = (doc, node, e) -> topNodeScore = getScore(node) currentNodeScore = getScore(e) thresholdScore = topNodeScore * 0.08 - if (currentNodeScore < thresholdScore) && e[0].name != 'td' - return false - else - return true + return (currentNodeScore >= thresholdScore) || e[0].name == 'td' # Remove any remaining trash nodes (clusters of nodes with little/no content) postCleanup = (doc, targetNode, lang) -> @@ -422,8 +413,8 @@ postCleanup = (doc, targetNode, lang) -> node.children().each () -> e = doc(this) eTag = e[0].name - if eTag not in ['p', 'a'] - if isHighlinkDensity(doc, e) || isTableAndNoParaExist(doc, e) || !isNodescoreThresholdMet(doc, node, e) - doc(e).remove() + return if eTag in ['p', 'a'] + if isHighlinkDensity(doc, e) || isTableAndNoParaExist(doc, e) || !isNodescoreThresholdMet(doc, node, e) + doc(e).remove() return node diff --git a/src/unfluff.coffee b/src/unfluff.coffee index 9d5a20e..e834ae8 100644 --- a/src/unfluff.coffee +++ b/src/unfluff.coffee @@ -87,4 +87,5 @@ getCleanedDoc = (html) -> return @cleanedDoc_ if @cleanedDoc_? doc = getParsedDoc.call(this, html) @cleanedDoc_ = cleaner(doc) + @doc_ = cheerio.load(html) @cleanedDoc_ diff --git a/test/unfluff.coffee b/test/unfluff.coffee index 2879be4..9ff5cfd 100644 --- a/test/unfluff.coffee +++ b/test/unfluff.coffee @@ -94,6 +94,9 @@ suite 'Unfluff', -> checkFixture('tags_cnet' , ['tags']) checkFixture('tags_abcau' , ['tags']) + test 'cleaner side effects ignored', -> + checkFixture('tags_kexp', ['cleaned_text', 'tags']) + test 'reads videos', -> checkFixture('embed' , ['videos']) checkFixture('iframe' , ['videos'])