美文网首页
2019-07-17

2019-07-17

作者: 程序学习er | 来源:发表于2019-07-17 14:57 被阅读0次
    
    import java.util.ArrayList;
    import java.util.List;
    import javax.xml.parsers.ParserConfigurationException;
    import javax.xml.xpath.XPath;
    import javax.xml.xpath.XPathConstants;
    import javax.xml.xpath.XPathExpressionException;
    import javax.xml.xpath.XPathFactory;
    
    import com.juxinli.jobscrawler.service.CleanWebService;
    import lombok.extern.slf4j.Slf4j;
    import org.htmlcleaner.CleanerProperties;
    import org.htmlcleaner.DomSerializer;
    import org.htmlcleaner.HtmlCleaner;
    import org.htmlcleaner.TagNode;
    import org.springframework.stereotype.Service;
    import org.w3c.dom.Document;
    import org.w3c.dom.Node;
    import org.w3c.dom.NodeList;
    
    /**
     * {@link CleanWebService} implementation that cleans raw HTML with HtmlCleaner, converts the
     * result into a W3C DOM document and extracts values from it with XPath expressions.
     */
    @Slf4j
    @Service
    public class CleanWebServiceImpl implements CleanWebService {

        /** Placeholder stored in results when a matched node has no usable value. */
        private static final String NULL_PLACEHOLDER = "Null";

        /** Minimum length of the array returned by {@link #getNodeArray(String, String)}. */
        private static final int MIN_NODE_ARRAY_LENGTH = 6;

        /**
         * Cleans the given HTML, builds a DOM from it and evaluates the XPath expression as a
         * node set.
         *
         * @param pageString raw HTML of the page
         * @param xpath XPath expression to evaluate against the cleaned page
         * @return the evaluation result (requested as {@link XPathConstants#NODESET}), or
         *         {@code null} when an input is {@code null}, the DOM cannot be built, or the
         *         expression cannot be evaluated
         */
        @Override
        public Object fetchNode(String pageString, String xpath) {
            if (pageString == null || xpath == null) {
                return null;
            }

            TagNode cleanedRoot = new HtmlCleaner().clean(pageString);
            Document dom;
            try {
                dom = new DomSerializer(new CleanerProperties()).createDOM(cleanedRoot);
            } catch (ParserConfigurationException e) {
                log.error(e.getLocalizedMessage(), e);
                // Without a document there is nothing meaningful to evaluate against.
                return null;
            }

            XPath evaluator = XPathFactory.newInstance().newXPath();
            try {
                return evaluator.evaluate(xpath, dom, XPathConstants.NODESET);
            } catch (XPathExpressionException e) {
                log.error("xpath提取出错", e);
                return null;
            }
        }

        /**
         * Collects the value of one attribute from every node matched by the XPath expression.
         *
         * @param pageString raw HTML of the page
         * @param xpath XPath expression selecting the nodes
         * @param attr name of the attribute to read from each matched node
         * @return one entry per matched node, in match order: the attribute's text, or
         *         {@code "Null"} when the node has no such attribute; empty when nothing matched
         */
        @Override
        public List<String> getNodeListByAttr(String pageString, String xpath, String attr) {
            List<String> attrContentList = new ArrayList<>();
            NodeList matches = selectNodes(pageString, xpath);
            if (matches == null) {
                return attrContentList;
            }
            for (int i = 0; i < matches.getLength(); i++) {
                Node node = matches.item(i);
                // getAttributes() is null for non-element nodes (e.g. text nodes).
                Node attribute = (node == null || node.getAttributes() == null)
                        ? null
                        : node.getAttributes().getNamedItem(attr);
                // Exactly one entry per node keeps the result aligned with the matches.
                attrContentList.add(attribute == null ? NULL_PLACEHOLDER : attribute.getTextContent());
            }
            return attrContentList;
        }

        /**
         * Collects the text content of every node matched by the XPath expression.
         *
         * @param pageString raw HTML of the page
         * @param xpath XPath expression selecting the nodes
         * @return one entry per matched node, in match order: the node's text, or {@code "Null"}
         *         when the text is {@code null} or empty; empty when nothing matched
         */
        @Override
        public List<String> getNodeList(String pageString, String xpath) {
            List<String> contentList = new ArrayList<>();
            NodeList matches = selectNodes(pageString, xpath);
            if (matches == null) {
                return contentList;
            }
            for (int i = 0; i < matches.getLength(); i++) {
                Node node = matches.item(i);
                String text = node == null ? null : node.getTextContent();
                // Compare by value; a reference comparison against "" never detects empty text.
                contentList.add(text == null || text.isEmpty() ? NULL_PLACEHOLDER : text);
            }
            return contentList;
        }

        /**
         * Collects the text content of the matched nodes into an array. Per the original author's
         * note, this is only used to read page numbers.
         *
         * @param pageString raw HTML of the page
         * @param xpath XPath expression selecting the nodes
         * @return an array of at least six slots, grown when more nodes match; slot {@code i}
         *         holds the text of match {@code i} ({@code "Null"} when that text is
         *         {@code null}), and slots without a node stay {@code null}
         */
        @Override
        public String[] getNodeArray(String pageString, String xpath) {
            NodeList matches = selectNodes(pageString, xpath);
            int matchCount = matches == null ? 0 : matches.getLength();
            // Size to fit every match so more than six results cannot overflow the array.
            String[] contentArray = new String[Math.max(MIN_NODE_ARRAY_LENGTH, matchCount)];
            for (int i = 0; i < matchCount; i++) {
                Node node = matches.item(i);
                if (node == null) {
                    continue;
                }
                String text = node.getTextContent();
                contentArray[i] = text != null ? text : NULL_PLACEHOLDER;
            }
            return contentArray;
        }

        /** Runs {@link #fetchNode(String, String)} and returns its result only if it is a node list. */
        private NodeList selectNodes(String pageString, String xpath) {
            Object result = fetchNode(pageString, xpath);
            return result instanceof NodeList ? (NodeList) result : null;
        }
    }
    

    相关文章

      网友评论

          本文标题:2019-07-17

          本文链接:https://www.haomeiwen.com/subject/zewqlctx.html