Readability, a web body extraction tool

What is Readability?

If you see a good article on the Internet and want to save it for later reading, but don’t want to save a lot of green ads or irrelevant web elements at the same time, then you can try Readability!

Readability is a unique “read later” web favorites service that not only allows you to bookmark articles you like, but also has the best feature of automatically and intelligently eliminating some unimportant elements on the web page and reformatting them to present you with a clean and tidy body part, making your better reading experience! In addition to plug-ins for mainstream browsers, it also provides mobile versions of the app for iOS/Android/Kindle, which can be synced to your phone for efficient and comfortable reading anytime, anywhere ……

Implementation principle of Readability

Extracting the main content from a web page has always been a more challenging algorithm.

Readability was previously open source, but is no longer publicly available. Here is the original open source version for reference: arc90labs-readability - Readability cleans up hard-to-read articles on the Web.

Readability reconsolidates the content of a page by traversing the Dom object and weighting tags and common text. Next, we take a brief look at how this algorithm is implemented. First, it defines a set of orthographies.

regexps: {
        unlikelyCandidates:    /combx|comment|community|disqus|extra|foot|header|menu|remark|rss|shoutbox|sidebar|sponsor|ad-break|agegate|pagination|pager|popup|tweet|twitter/i,
        okMaybeItsACandidate:  /and|article|body|column|main|shadow/i,
        positive:              /article|body|content|entry|hentry|main|page|pagination|post|text|blog|story/i,
        negative:              /combx|comment|com-|contact|foot|footer|footnote|masthead|media|meta|outbrain|promo|related|scroll|shoutbox|sidebar|sponsor|shopping|tags|tool|widget/i,
        extraneous:            /print|archive|comment|discuss|e[\-]?mail|share|reply|all|login|sign|single/i,
        divToPElements:        /<(a|blockquote|dl|div|img|ol|p|pre|table|ul)/i,
        replaceBrs:            /(<br[^>]*>[ \n\r\t]*){2,}/gi,
        replaceFonts:          /<(\/?)font[^>]*>/gi,
        trim:                  /^\s+|\s+$/g,
        normalize:             /\s{2,}/g,
        killBreaks:            /(<br\s*\/?>(\s|&nbsp;?)*){1,}/g,
        videos:                /http:\/\/(www\.)?(youtube|vimeo)\.com/i,
        skipFootnoteLink:      /^\s*(\[?[a-z0-9]{1,2}\]?|^|edit|citation needed)\s*$/i,
        nextLink:              /(next|weiter|continue|>([^\|]|$)|»([^\|]|$))/i, // Match: next, continue, >, >>, » but not >|, »| as those usually mean last.
        prevLink:              /(prev|earl|old|new|<|«)/i
},

As you can see, tags and text are grouped with weighted or downgraded groups. The whole content analysis is done by the grabArticle function. First start traversing the nodes.

`1`	`for(var nodeIndex = 0; (node = allElements[nodeIndex]); nodeIndex+=1)`

The elements that do not resemble content are then removed.

if (stripUnlikelyCandidates) 
{
    var unlikelyMatchString = node.className + node.id;
    if (
        (
            unlikelyMatchString.search(readability.regexps.unlikelyCandidates) !== -1 &&
            unlikelyMatchString.search(readability.regexps.okMaybeItsACandidate) === -1 &&
            node.tagName !== "BODY"
        )
    )
    {
        dbg("Removing unlikely candidate - " + unlikelyMatchString);
        node.parentNode.removeChild(node);
        nodeIndex-=1;
        continue;
    }               
}

After replacing the DIV with a P-tag, the target node is then traversed for scoring.

var candidates = [];
for (var pt=0; pt < nodesToScore.length; pt+=1) {
    var parentNode      = nodesToScore[pt].parentNode;
    var grandParentNode = parentNode ? parentNode.parentNode : null;
    var innerText       = readability.getInnerText(nodesToScore[pt]);

    if(!parentNode || typeof(parentNode.tagName) === 'undefined') {
        continue;
    }

    /* If this paragraph is less than 25 characters, don't even count it. */
    if(innerText.length < 25) {
        continue; }

    /* Initialize readability data for the parent. */
    if(typeof parentNode.readability === 'undefined') {
        readability.initializeNode(parentNode);
        candidates.push(parentNode);
    }

    /* Initialize readability data for the grandparent. */
    if(grandParentNode && typeof(grandParentNode.readability) === 'undefined' && typeof(grandParentNode.tagName) !== 'undefined') {
        readability.initializeNode(grandParentNode);
        candidates.push(grandParentNode);
    }

    var contentScore = 0;

    /* Add a point for the paragraph itself as a base. */
    contentScore+=1;

    /* Add points for any commas within this paragraph */
    contentScore += innerText.split(',').length;
    
    /* For every 100 characters in this paragraph, add another point. Up to 3 points. */
    contentScore += Math.min(Math.floor(innerText.length / 100), 3);
    
    /* Add the score to the parent. The grandparent gets half. */
    parentNode.readability.contentScore += contentScore;

    if(grandParentNode) {
        grandParentNode.readability.contentScore += contentScore/2;             
    }
}

Finally, according to the score, the content is reassembled.

var articleContent        = document.createElement("DIV");
if (isPaging) {
    articleContent.id     = "readability-content";
}
var siblingScoreThreshold = Math.max(10, topCandidate.readability.contentScore * 0.2);
var siblingNodes          = topCandidate.parentNode.childNodes;


for(var s=0, sl=siblingNodes.length; s < sl; s+=1) {
    var siblingNode = siblingNodes[s];
    var append      = false;

    /**
     * Fix for odd IE7 Crash where siblingNode does not exist even though this should be a live nodeList.
     * Example of error visible here: http://www.esquire.com/features/honesty0707
    **/
    if(!siblingNode) {
        continue;
    }

    dbg("Looking at sibling node: " + siblingNode + " (" + siblingNode.className + ":" + siblingNode.id + ")" + ((typeof siblingNode.readability !== 'undefined') ? (" with score " + siblingNode.readability.contentScore) : ''));
    dbg("Sibling has score " + (siblingNode.readability ? siblingNode.readability.contentScore : 'Unknown'));

    if(siblingNode === topCandidate)
    {
        append = true;
    }

    var contentBonus = 0;
    /* Give a bonus if sibling nodes and top candidates have the example same classname */
    if(siblingNode.className === topCandidate.className && topCandidate.className !== "") {
        contentBonus += topCandidate.readability.contentScore * 0.2;
    }

    if(typeof siblingNode.readability !== 'undefined' && (siblingNode.readability.contentScore+contentBonus) >= siblingScoreThreshold)
    {
        append = true;
    }
    
    if(siblingNode.nodeName === "P") {
        var linkDensity = readability.getLinkDensity(siblingNode);
        var nodeContent = readability.getInnerText(siblingNode);
        var nodeLength  = nodeContent.length;
        
        if(nodeLength > 80 && linkDensity < 0.25)
        {
            append = true;
        }
        else if(nodeLength < 80 && linkDensity === 0 && nodeContent.search(/\.( |$)/) !== -1)
        {
            append = true;
        }
    }

    if(append) {
        dbg("Appending node: " + siblingNode);

        var nodeToAppend = null;
        if(siblingNode.nodeName !== "DIV" && siblingNode.nodeName !== "P") {
            /* We have a node that isn't a common block level element, like a form or td tag. Turn it into a div so it doesn't get filtered out later by accident. */
            
            dbg("Altering siblingNode of " + siblingNode.nodeName + ' to div.');
            nodeToAppend = document.createElement("DIV");
            try {
                nodeToAppend.id = siblingNode.id;
                nodeToAppend.innerHTML = siblingNode.innerHTML;
            }
            catch(er) {
                dbg("Could not alter siblingNode to div, probably an IE restriction, reverting back to original.");
                nodeToAppend = siblingNode;
                s-=1;
                sl-=1;
            }
        } else {
            nodeToAppend = siblingNode;
            s-=1;
            sl-=1;
        }
        
        /* To ensure a node does not interfere with readability styles, remove its classnames */
        nodeToAppend.className = "";

        /* Append sibling and subtract from our list because it removes the node when you append to another node */
        articleContent.appendChild(nodeToAppend);
    }
}

As you can see, there are a lot of trick tricks used, such as not scoring paragraphs under 25 words. There are already many versions in other languages, but considering that the original function was designed for English pages. In the porting process, we need to consider

Support GBK, GB2312 and other encoding

Support more video sites, such as Youku, Tudou, etc.

Support extracting article titles

Support converting image relative path to absolute path

…

How to use Readability quickly?

The better ported versions collected so far are.

nodejs version: https://github.com/luin/readability
php version: https://github.com/feelinglucky/php-readability
Python version: https://github.com/timbertson/python-readability
JS version: https://github.com/mozilla/readability
Swift version: https://github.com/exyte/ReadabilityKit
java version: https://github.com/karussell/snacktory
Ruby version: https://github.com/cantino/ruby-readability

Table of Contents

What is Readability?

Implementation principle of Readability

How to use Readability quickly?