美文网首页
有人采用这样一种方式来检测html中的图片?实在是匪夷所思啊!

有人采用这样一种方式来检测html中的图片?实在是匪夷所思啊!

作者: brzhang | 来源:发表于2017-11-28 18:10 被阅读14次
    /**
     * Collects the image URLs referenced by an HTML fragment while converting the fragment to
     * plain text. The work is delegated to {@code Html.fromHtml}: a custom {@code Html.ImageGetter}
     * records every image source it is asked to load and hands back a placeholder drawable.
     */
    public class HtmlScraper {
      /**
       * Scrapes an HTML page for &lt;img&gt; tags.
       *
       * @param htmlText the HTML markup to scan
       * @param originUrl base URL prepended to image sources that carry no scheme; when
       *     {@code null}, sources are recorded exactly as they appear in the markup
       * @param outImageUrls output list; every image source found is appended to it in the order
       *     the image getter is invoked
       * @return Scraped plain text, trimmed of leading and trailing whitespace
       */
      public static String parseWithImageTags(
          String htmlText,
          @Nullable String originUrl,
          List<String> outImageUrls) {
        ExtractImageGetter imageGetter = new ExtractImageGetter(originUrl, outImageUrls);
        String strippedText = Html.fromHtml(
            htmlText,
            imageGetter,
            null /* tagHandler */)
            .toString();

        return strippedText.trim();
      }

      /** Image getter that never loads anything; it only records the sources it is asked for. */
      private static class ExtractImageGetter implements Html.ImageGetter {
        @Nullable private final String mOriginUrl;
        private final List<String> mSources;

        public ExtractImageGetter(@Nullable String originUrl, List<String> outSources) {
          mOriginUrl = originUrl;
          mSources = outSources;
        }

        @Override
        public Drawable getDrawable(String source) {
          // The source may be null or empty when an <img> tag has no usable src attribute
          // (NOTE(review): confirm against the platform's Html implementation). Such a tag does
          // not reference an image, and Uri.parse(null) would throw, so skip it.
          if (!TextUtils.isEmpty(source)) {
            mSources.add(resolveAgainstOrigin(source));
          }

          // Dummy drawable.
          return new ColorDrawable(Color.TRANSPARENT);
        }

        /**
         * Prefixes a scheme-less source with the origin URL, making sure exactly one "/" separates
         * the two parts. Sources that already carry a scheme are returned unchanged, as is every
         * source when no origin URL was supplied.
         */
        private String resolveAgainstOrigin(String source) {
          if (mOriginUrl == null || !TextUtils.isEmpty(Uri.parse(source).getScheme())) {
            return source;
          }
          boolean originEndsWithSlash = mOriginUrl.endsWith("/");
          boolean sourceStartsWithSlash = source.startsWith("/");
          if (originEndsWithSlash && sourceStartsWithSlash) {
            // Drop one of the two slashes so the result is not "http://host//path".
            return mOriginUrl + source.substring(1);
          }
          if (!originEndsWithSlash && !sourceStartsWithSlash) {
            return mOriginUrl + "/" + source;
          }
          return mOriginUrl + source;
        }

        public List<String> getSources() {
          return mSources;
        }
      }
    }
    

    最后图片链接将会被提取到outImageUrls中,但是,这种方式的效率真的比正则匹配要好吗?

    /**
     * Walks the XML subtree rooted at the parser's current START_TAG and forwards what it reads
     * as SAX-style callbacks: {@code startElement} for start tags, and
     * {@code contentHandler.characters}/{@code endElement} for text and end tags. When the parser
     * is namespace aware, prefix mappings are also announced via
     * {@code startPrefixMapping}/{@code endPrefixMapping}.
     *
     * <p>The loop stops once the parser depth drops back to the level enclosing the starting
     * element, or when END_DOCUMENT is reached.
     *
     * @param pp pull parser that must currently be positioned on a START_TAG; it is also stored
     *     in {@code this.pp}
     * @throws SAXException if the parser is not positioned on a START_TAG, or if a handler
     *     callback throws it
     * @throws IOException declared for I/O failures raised while reading from the parser
     */
    public void parseSubTree(XmlPullParser pp) throws SAXException, IOException {
            this.pp = pp;
            final boolean namespaceAware = pp.getFeature(XmlPullParser.FEATURE_PROCESS_NAMESPACES);
            try {
                if(pp.getEventType() != XmlPullParser.START_TAG) {
                    throw new SAXException(
                        "start tag must be read before skiping subtree"+pp.getPositionDescription());
                }
                // Filled by getTextCharacters(): [0] = start offset, [1] = length.
                final int[] holderForStartAndLength = new int[2];
                // Scratch buffer reused to build "prefix:name" qualified names.
                final StringBuilder rawName = new StringBuilder(16);
                String prefix = null;
                String name = null;
                // Depth of the parent of the starting element; the loop runs while deeper than this.
                int level = pp.getDepth() - 1;
                int type = XmlPullParser.START_TAG;

                LOOP:
                do {
                    switch(type) {
                        case XmlPullParser.START_TAG:
                            if(namespaceAware) {
                                final int depth = pp.getDepth() - 1;
                                // NOTE(review): inside this loop depth is never below level, so this
                                // looks like it always evaluates to 0 - confirm that is intended.
                                final int countPrev =
                                    (level > depth) ? pp.getNamespaceCount(depth) : 0;
                                //int countPrev = pp.getNamespaceCount(pp.getDepth() - 1);
                                final int count = pp.getNamespaceCount(depth + 1);
                                for (int i = countPrev; i < count; i++)
                                {
                                    contentHandler.startPrefixMapping(
                                        pp.getNamespacePrefix(i),
                                        pp.getNamespaceUri(i)
                                    );
                                }
                                name = pp.getName();
                                prefix = pp.getPrefix();
                                if(prefix != null) {
                                    rawName.setLength(0);
                                    rawName.append(prefix);
                                    rawName.append(':');
                                    rawName.append(name);
                                }
                                // Qualified name is the bare local name when there is no prefix,
                                // otherwise the "prefix:name" string built above.
                                startElement(pp.getNamespace(),
                                             name,
                                             prefix == null ? name : rawName.toString());
                            } else {
                                startElement(pp.getNamespace(),
                                             pp.getName(),
                                             pp.getName());
                            }
                            //++level;

                            break;
                        case XmlPullParser.TEXT:
                            final char[] chars = pp.getTextCharacters(holderForStartAndLength);
                            contentHandler.characters(chars,
                                                      holderForStartAndLength[0], //start
                                                      holderForStartAndLength[1] //len
                                                     );
                            break;
                        case XmlPullParser.END_TAG:
                            //--level;
                            if(namespaceAware) {
                                name = pp.getName();
                                prefix = pp.getPrefix();
                                if(prefix != null) {
                                    rawName.setLength(0);
                                    rawName.append(prefix);
                                    rawName.append(':');
                                    rawName.append(name);
                                }
                                // Must mirror the START_TAG branch: the condition used to be
                                // inverted, which reported the local name for prefixed elements
                                // and a stale/empty rawName for unprefixed ones.
                                contentHandler.endElement(pp.getNamespace(),
                                                          name,
                                                          prefix == null ? name : rawName.toString()
                                                         );
                                // when entering show prefixes for all levels!!!!
                                final int depth = pp.getDepth();
                                // NOTE(review): same observation as in START_TAG - level > depth
                                // does not appear reachable inside the loop; verify.
                                final int countPrev =
                                    (level > depth) ? pp.getNamespaceCount(pp.getDepth()) : 0;
                                int count = pp.getNamespaceCount(pp.getDepth() - 1);
                                // undeclare them in reverse order
                                for (int i = count - 1; i >= countPrev; i--)
                                {
                                    contentHandler.endPrefixMapping(
                                        pp.getNamespacePrefix(i)
                                    );
                                }
                            } else {
                                contentHandler.endElement(pp.getNamespace(),
                                                          pp.getName(),
                                                          pp.getName()
                                                         );

                            }
                            break;
                        case XmlPullParser.END_DOCUMENT:
                            break LOOP;
                    }
                    type = pp.next();
                } while(pp.getDepth() > level);
            } catch (XmlPullParserException ex)  {
                // Wrap the pull-parser failure (keeping it as the cause) and report it as fatal.
                final SAXParseException saxException = new SAXParseException("parsing error: "+ex, this, ex);
                ex.printStackTrace();
                errorHandler.fatalError(saxException);
            }
        }
    

    反正就是去解析xml

    /**
     * Dispatches an opening HTML tag to the matching span/paragraph handler. Tag names are
     * compared case-insensitively. Tags not recognised here are passed to {@code mTagHandler}
     * when one is set; otherwise they are ignored.
     *
     * @param tag name of the element being opened
     * @param attributes attributes of the element; only consulted for font, a and img
     */
    private void handleStartTag(String tag, Attributes attributes) {
        if (tag.equalsIgnoreCase("br")) {
            // Nothing to do on open: TagSoup guarantees a </br> for every <br>, so the
            // line break is emitted when the close tag is handled.
            return;
        }

        // Block-level containers only start a new paragraph.
        if (tag.equalsIgnoreCase("p") || tag.equalsIgnoreCase("div")) {
            handleP(mSpannableStringBuilder);
            return;
        }

        // Bold variants.
        if (tag.equalsIgnoreCase("strong") || tag.equalsIgnoreCase("b")) {
            start(mSpannableStringBuilder, new Bold());
            return;
        }

        // Italic variants.
        if (tag.equalsIgnoreCase("em")
                || tag.equalsIgnoreCase("cite")
                || tag.equalsIgnoreCase("dfn")
                || tag.equalsIgnoreCase("i")) {
            start(mSpannableStringBuilder, new Italic());
            return;
        }

        if (tag.equalsIgnoreCase("big")) {
            start(mSpannableStringBuilder, new Big());
            return;
        }
        if (tag.equalsIgnoreCase("small")) {
            start(mSpannableStringBuilder, new Small());
            return;
        }
        if (tag.equalsIgnoreCase("font")) {
            startFont(mSpannableStringBuilder, attributes);
            return;
        }
        if (tag.equalsIgnoreCase("blockquote")) {
            handleP(mSpannableStringBuilder);
            start(mSpannableStringBuilder, new Blockquote());
            return;
        }
        if (tag.equalsIgnoreCase("tt")) {
            start(mSpannableStringBuilder, new Monospace());
            return;
        }
        if (tag.equalsIgnoreCase("a")) {
            startA(mSpannableStringBuilder, attributes);
            return;
        }
        if (tag.equalsIgnoreCase("u")) {
            start(mSpannableStringBuilder, new Underline());
            return;
        }
        if (tag.equalsIgnoreCase("sup")) {
            start(mSpannableStringBuilder, new Super());
            return;
        }
        if (tag.equalsIgnoreCase("sub")) {
            start(mSpannableStringBuilder, new Sub());
            return;
        }

        // h1..h6 (either case of the leading letter): new paragraph plus a header span whose
        // argument is the zero-based heading level.
        final boolean isHeading = tag.length() == 2
                && Character.toLowerCase(tag.charAt(0)) == 'h'
                && tag.charAt(1) >= '1'
                && tag.charAt(1) <= '6';
        if (isHeading) {
            handleP(mSpannableStringBuilder);
            start(mSpannableStringBuilder, new Header(tag.charAt(1) - '1'));
            return;
        }

        if (tag.equalsIgnoreCase("img")) {
            startImg(mSpannableStringBuilder, attributes, mImageGetter);
            return;
        }

        // Anything else goes to the optional custom tag handler.
        if (mTagHandler != null) {
            mTagHandler.handleTag(true, tag, mSpannableStringBuilder, mReader);
        }
    }
    
    image.png

    其实吧,如果让我来实现,我肯定想不到使用这个方法。我大概会这么想:引入一个类似jquery的解析库,
    用$("<img>").foreach({$0.src})这样的写法,就把链接拿出来了。

    相关文章

      网友评论

          本文标题:有人采用这样一种方式来检测html中的图片?实在是匪夷所思啊!

          本文链接:https://www.haomeiwen.com/subject/qppebxtx.html