美文网首页
Chrome阅读模式

Chrome阅读模式

作者: 蒂卡波牧羊犬 | 来源:发表于2018-08-17 17:25 被阅读0次

    提取特征:
    extract_features.js 会根据 urls 为每个页面提取 HTML 特征以及 Chrome distilled 特征,分别为 name.feature 和 name.dfeature。
    features:

     'opengraph': hasOGArticle(),
     'url': document.location.href,
     'title': document.title,
     'numElements': body.querySelectorAll('*').length,
     'numAnchors': body.querySelectorAll('a').length,
     'numForms': body.querySelectorAll('form').length,
     'numTextInput': body.querySelectorAll('input[type="text"]').length,
     'numPasswordInput': body.querySelectorAll('input[type="password"]').length,
     'numPPRE': body.querySelectorAll('p,pre').length,
     'innerText': body.innerText,
     'textContent': body.textContent,
     'innerHTML': body.innerHTML,
     'mozScore': Math.min(6 * Math.sqrt(1000 - 140), _mozScore(false, 0.5, 140, true, 1000)),
     'mozScoreAllSqrt': Math.min(6 * Math.sqrt(1000), _mozScore(false, 0.5, 0, true, 1000)),
     'mozScoreAllLinear': Math.min(6 * 1000, _mozScore(false, 1, 0, true, 1000)),
     'visibleElements': countVisible(body.querySelectorAll('*')),
     'visiblePPRE': countVisible(body.querySelectorAll('p,pre')),
    

    native:

    derived features:

      # Flat list that alternates a feature name (str) with its value:
      # [name0, value0, name1, value1, ...].
      features = [
        'id', index,
        'sin', math.sin(index),
        'openGraph', opengraph,
    
        # Features computed from `path` (presumably the URL path -- confirm
        # against the caller): substring checks, length, and regex match counts.
        'forum', 'forum' in path,
        'index', 'index' in path,
        'search', 'search' in path,
        'view', 'view' in path,
        'archive', 'archive' in path,
        'asp', '.asp' in path,
        'phpbb', 'phpbb' in path,
        'php', path.endswith('.php'),
        'pathLength', len(path),
        # NOTE(review): 'domain' is a boolean, true when `path` has fewer than
        # two characters.
        'domain', len(path) < 2,
        'pathComponents', CountMatches(path, r'\/.'),
        'slugDetector', CountMatches(path, r'[^\w/]'),
        'pathNumbers', CountMatches(path, r'\d+'),
        'lastSegmentLength', len(GetLastSegment(path)),
    
        # Element-count ratios; max(1, ...) in every denominator prevents a
        # division by zero when the count is 0.
        'visibleRatio', float(visibleElements) / max(1, numElements),
        'visiblePPRERatio', float(visiblePPRE) / max(1, numPPRE),
        'PPRERatio', float(numPPRE) / max(1, numElements),
        'anchorPPRERatio', float(numAnchors) / max(1, numPPRE),
    
        # Character lengths of the three text representations and their ratios.
        'innerTextLength', len(innerText),
        'textContentLength', len(textContent),
        'innerHtmlLength', len(innerHTML),
        'innerTextLengthRatio', float(len(innerText)) / max(1, len(innerHTML)),
        'textContentLengthRatio', float(len(textContent)) / max(1, len(innerHTML)),
        'innerTexttextContentLengthRatio',float(len(innerText)) / max(1, len(textContent)),
    
        # Word counts (computed outside this snippet) and their ratios.
        'innerTextWordCount', innerTextWords,
        'textContentWordCount', textContentWords,
        'innerhtmlWordCount', innerHTMLWords,
        'innerTextWordCountRatio', float(innerTextWords) / max(1, innerHTMLWords),
        'textContentWordCountRatio', float(textContentWords) / max(1, innerHTMLWords),
        'innerTexttextContentWordCountRatio', float(innerTextWords) / max(1, textContentWords),
    
        # Plain element counts plus the anchor-to-element ratio.
        'textCount', numText,
        'passwordCount', numPassword,
        'formCount', numForms,
        'anchorCount', numAnchors,
        'elementCount', numElements,
        'anchorRatio', float(numAnchors) / max(1, numElements),
      ]
    
      # Append every entry of `raw` whose key contains 'mozScore' or 'num',
      # iterating keys in sorted order so the feature order is deterministic.
      for k in sorted(raw):
        if 'mozScore' in k or 'num' in k:
          features += [k, raw[k]]
    

    mozScore

      /**
       * Accumulates a text-length score over every <p> and <pre> element in
       * the document.
       *
       * An element is skipped when it is not visible (per isVisible), when its
       * "className id" string matches unlikelyCandidates without also matching
       * okMaybeItsACandidate, when excludeLi is set and it matches "li p", or
       * when its capped text length is below `cut`.
       *
       * Each remaining element contributes
       *   Math.pow(Math.min(saturate, length) - cut, power).
       *
       * @param {boolean} trim      Trim textContent before measuring its length.
       * @param {number}  power     Exponent applied to each element's contribution.
       * @param {number}  cut       Minimum capped length; also subtracted from it.
       * @param {boolean} excludeLi Skip paragraphs nested inside list items.
       * @param {number}  saturate  Upper bound applied to each element's length.
       * @return {number} Sum of the contributions of all counted elements.
       */
      function _mozScore(trim, power, cut, excludeLi, saturate) {
        var paragraphs = document.querySelectorAll('p,pre');

        // querySelectorAll returns an array-like list, so borrow Array's reduce.
        return Array.prototype.reduce.call(paragraphs, function (total, el) {
          if (!isVisible(el)) {
            return total;
          }

          // Same short-circuit order as a plain &&: the second pattern is only
          // tested when the first one matched.
          var hint = el.className + " " + el.id;
          var rejected = unlikelyCandidates.test(hint) &&
              !okMaybeItsACandidate.test(hint);
          if (rejected) {
            return total;
          }

          var insideListItem = excludeLi && el.matches && el.matches("li p");
          if (insideListItem) {
            return total;
          }

          var text = trim ? el.textContent.trim() : el.textContent;
          var cappedLength = Math.min(saturate, text.length);
          if (cappedLength < cut) {
            return total;
          }

          return total + Math.pow(cappedLength - cut, power);
        }, 0);
      }
    

    分类算法

    OG_ARTICLE
    检查页面的 meta 标签中是否包含 og:type。

    AdaBoost
    原理:https://blog.csdn.net/v_JULY_v/article/details/40718799

    AdaBoost

    相关文章

      网友评论

          本文标题:Chrome阅读模式

          本文链接:https://www.haomeiwen.com/subject/xgbdbftx.html