import java.util.ArrayList;
import java.util.List;
import javax.xml.parsers.ParserConfigurationException;
import javax.xml.xpath.XPath;
import javax.xml.xpath.XPathConstants;
import javax.xml.xpath.XPathExpressionException;
import javax.xml.xpath.XPathFactory;
import com.juxinli.jobscrawler.service.CleanWebService;
import lombok.extern.slf4j.Slf4j;
import org.htmlcleaner.CleanerProperties;
import org.htmlcleaner.DomSerializer;
import org.htmlcleaner.HtmlCleaner;
import org.htmlcleaner.TagNode;
import org.springframework.stereotype.Service;
import org.w3c.dom.Document;
import org.w3c.dom.Node;
import org.w3c.dom.NodeList;
/**
 * {@link CleanWebService} implementation that cleans raw HTML with HtmlCleaner,
 * converts it to a W3C DOM and extracts values from it with XPath expressions.
 */
@Slf4j
@Service
public class CleanWebServiceImpl implements CleanWebService {

    /** Placeholder stored in results when a matched node has no usable value. */
    private static final String NULL_PLACEHOLDER = "Null";

    /** Minimum length of the array returned by {@link #getNodeArray(String, String)}. */
    private static final int MIN_NODE_ARRAY_LENGTH = 6;

    /**
     * Cleans the given HTML and evaluates an XPath expression against it.
     *
     * @param pageString raw HTML of the page
     * @param xpath      XPath expression to evaluate
     * @return the result of the evaluation requested as a node set, or {@code null}
     *         when an argument is {@code null} or the expression cannot be evaluated
     */
    @Override
    public Object fetchNode(String pageString, String xpath) {
        if (pageString == null || xpath == null) {
            // Nothing sensible can be parsed or evaluated; report "no result" the same
            // way the XPath-error path below does instead of failing inside the parser.
            log.warn("fetchNode called with null page content or null xpath");
            return null;
        }

        HtmlCleaner hc = new HtmlCleaner();
        TagNode tn = hc.clean(pageString);

        Document dom = null;
        try {
            dom = new DomSerializer(new CleanerProperties()).createDOM(tn);
        } catch (ParserConfigurationException e) {
            // dom stays null; the evaluation below is still attempted, as before.
            log.error(e.getLocalizedMessage(), e);
        }

        XPath xPath = XPathFactory.newInstance().newXPath();
        Object rootNode = null;
        try {
            rootNode = xPath.evaluate(xpath, dom, XPathConstants.NODESET);
        } catch (XPathExpressionException e) {
            log.error("xpath提取出错", e);
        }
        return rootNode;
    }

    /**
     * Collects the value of one attribute from every node matched by an XPath expression.
     *
     * @param pageString raw HTML of the page
     * @param xpath      XPath expression selecting the nodes
     * @param attr       name of the attribute to read from each node
     * @return one entry per matched node: the attribute's text content, or
     *         {@code "Null"} when the node does not carry that attribute; an empty
     *         list when nothing matched
     */
    @Override
    public List<String> getNodeListByAttr(String pageString, String xpath, String attr) {
        Object rootNode = fetchNode(pageString, xpath);
        List<String> attrContentList = new ArrayList<>();
        if (rootNode instanceof NodeList) {
            NodeList nodeList = (NodeList) rootNode;
            for (int i = 0; i < nodeList.getLength(); i++) {
                // Exactly one entry per node, so indices stay aligned with the matches.
                attrContentList.add(attributeValue(nodeList.item(i), attr));
            }
        }
        return attrContentList;
    }

    /**
     * Collects the text content of every node matched by an XPath expression.
     *
     * @param pageString raw HTML of the page
     * @param xpath      XPath expression selecting the nodes
     * @return one entry per matched node: its text content, or {@code "Null"} when
     *         the text content is {@code null} or empty; an empty list when nothing
     *         matched
     */
    @Override
    public List<String> getNodeList(String pageString, String xpath) {
        Object rootNode = fetchNode(pageString, xpath);
        List<String> contentList = new ArrayList<>();
        if (rootNode instanceof NodeList) {
            NodeList nodeList = (NodeList) rootNode;
            for (int i = 0; i < nodeList.getLength(); i++) {
                String text = nodeList.item(i).getTextContent();
                // Compare by value: a reference comparison against "" never detects
                // an empty string produced at runtime.
                contentList.add(text == null || text.isEmpty() ? NULL_PLACEHOLDER : text);
            }
        }
        return contentList;
    }

    /**
     * Collects the text content of the matched nodes into an array. Only used to
     * fetch page numbers.
     *
     * @param pageString raw HTML of the page
     * @param xpath      XPath expression selecting the nodes
     * @return an array of at least six slots, larger when more nodes match; slot
     *         {@code i} holds the text content of the i-th matched node
     *         ({@code "Null"} when that content is {@code null}); slots without a
     *         corresponding node remain {@code null}
     */
    @Override
    public String[] getNodeArray(String pageString, String xpath) {
        Object rootNode = fetchNode(pageString, xpath);
        if (!(rootNode instanceof NodeList)) {
            return new String[MIN_NODE_ARRAY_LENGTH];
        }

        NodeList nodeList = (NodeList) rootNode;
        int matched = nodeList.getLength();
        // Grow past the historical fixed size rather than overflowing it when the
        // expression matches more nodes than expected.
        String[] contentArray = new String[Math.max(MIN_NODE_ARRAY_LENGTH, matched)];
        for (int i = 0; i < matched; i++) {
            Node node = nodeList.item(i);
            if (node == null) {
                continue;
            }
            String text = node.getTextContent();
            contentArray[i] = text != null ? text : NULL_PLACEHOLDER;
        }
        return contentArray;
    }

    /**
     * Reads one attribute from a node, falling back to the placeholder when the node
     * has no attribute map or does not carry the attribute.
     */
    private static String attributeValue(Node node, String attr) {
        if (node == null || attr == null || node.getAttributes() == null) {
            return NULL_PLACEHOLDER;
        }
        Node attrNode = node.getAttributes().getNamedItem(attr);
        return attrNode == null ? NULL_PLACEHOLDER : attrNode.getTextContent();
    }
}
网友评论