昨天用 Java 爬取了欣欣旅游的邮编信息,并生成了 XML 文件。过程中用到了 JDOM 的相关知识,这里记录一下爬取过程。
生成的 XML 效果如下所示:
<?xml version="1.0" encoding="utf-8"?>
<postcodes name="恩施市邮编信息">
<no-address>
<no>445003</no>
<address>新建街二巷,巴公路一巷,民族西路二巷,新建街一巷</address>
</no-address>
<no-address>
<no>445014</no>
<address>龙凤镇大市场,龙凤镇龙凤村,龙凤镇双堰村,龙凤镇向家村</address>
</no-address>
<no-address>
<no>445016</no>
<address>白杨坪乡鲁竹坝村,白杨乡白杨街,白杨乡朝阳坡村,白杨乡董家店村</address>
</no-address>
.............................
代码如下:
import org.jdom2.Document;
import org.jdom2.Element;
import org.jdom2.input.SAXBuilder;
import org.jdom2.output.Format;
import org.jdom2.output.XMLOutputter;
import java.io.*;
import java.net.URL;
import java.net.URLConnection;
import java.util.ArrayList;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
/**
 * Downloads the postcode listing page for Enshi from tool.cncn.com, extracts the
 * postcode / address pairs with regular expressions and writes them into an XML
 * document using JDOM.
 *
 * <p>The XML template {@code post.xml} is loaded from the classpath; the result is
 * written to {@code <user.dir>/src/post.xml}.
 */
public class UrlSpider {
    // Postcode link; group 1 is the postcode digits.
    static String regexNo = "<a href=\"/youbian/[\\d]+\">([\\d]+)</a>";
    // Address paragraph that ends with a postcode link; group 1 is the text before the link.
    static String regexAddress = "<p>([^>]+)<a href=\"/youbian/[\\d]+\">[^<]+</a></p>";
    static Pattern patternNo = Pattern.compile(regexNo);
    static Pattern patternAddr = Pattern.compile(regexAddress);
    // Accumulates the whole downloaded page (line terminators are dropped by readLine()).
    static StringBuffer sb = new StringBuffer("");
    // NOTE(review): addrList and noList are never read or written in this class;
    // kept only in case other code in the package references them.
    static ArrayList addrList = new ArrayList();
    static ArrayList noList = new ArrayList();

    /**
     * Entry point: fetches the page, builds the XML document and writes it to disk.
     *
     * @param args unused
     * @throws Exception if the page cannot be fetched, or the template cannot be
     *                   found or parsed
     */
    public static void main(String[] args) throws Exception {
        // Read the whole page into sb and close the connection stream afterwards.
        try (BufferedReader reader = getMsg()) {
            String line;
            while ((line = reader.readLine()) != null) {
                sb.append(line);
            }
        }

        String filepath = System.getProperty("user.dir") + File.separator + "src" + File.separator + "post.xml";

        // Parse the XML template from the classpath into a JDOM document.
        SAXBuilder saxBuilder = new SAXBuilder();
        Document document;
        try (InputStream inputStream =
                     Thread.currentThread().getContextClassLoader().getResourceAsStream("post.xml")) {
            if (inputStream == null) {
                // Fail with a clear message instead of handing null to the parser.
                throw new FileNotFoundException("post.xml not found on the classpath");
            }
            document = saxBuilder.build(inputStream);
        }

        Element rootElement = document.getRootElement();
        rootElement.setName("postcodes");
        rootElement.setAttribute("name", "恩施市邮编信息");

        appendEntries(rootElement, sb);

        // Write the document to disk, pretty-printed as UTF-8.
        Format format = Format.getPrettyFormat();
        format.setEncoding("utf-8");
        XMLOutputter xmlOutputter = new XMLOutputter(format);
        try (OutputStream out = new FileOutputStream(filepath)) {
            xmlOutputter.output(document, out);
        } catch (IOException e) {
            System.err.println("Failed to write " + filepath);
            e.printStackTrace();
        }
    }

    /**
     * Appends one {@code <no-address>} element to {@code root} per postcode/address
     * pair found in {@code html}.
     *
     * <p>Both matchers are advanced exactly once per iteration, so the i-th postcode
     * match is paired with the i-th address match and no match is skipped. An entry
     * is still emitted when only one of the two is found.
     * NOTE(review): this pairing assumes postcode links and address paragraphs occur
     * in the same order and number on the page — verify against the live markup.
     */
    private static void appendEntries(Element root, CharSequence html) {
        Matcher noMatcher = patternNo.matcher(html);
        Matcher addrMatcher = patternAddr.matcher(html);
        while (true) {
            // Evaluate both finds unconditionally; a short-circuiting || would let
            // the two matchers drift out of step.
            boolean hasNo = noMatcher.find();
            boolean hasAddr = addrMatcher.find();
            if (!hasNo && !hasAddr) {
                break;
            }
            Element noAddress = new Element("no-address");
            if (hasNo) {
                Element no = new Element("no");
                no.setText(noMatcher.group(1));
                noAddress.addContent(no);
            }
            if (hasAddr) {
                Element address = new Element("address");
                address.setText(addrMatcher.group(1));
                noAddress.addContent(address);
            }
            root.addContent(noAddress);
        }
    }

    /**
     * Opens the postcode listing page and returns a reader over its body, decoded as GBK.
     *
     * @return a buffered reader over the HTTP response; the caller must close it
     * @throws IOException if the connection cannot be opened or read
     */
    public static BufferedReader getMsg() throws IOException {
        URL url = new URL("https://tool.cncn.com/youbian/enshi-enshi");
        URLConnection connection = url.openConnection();
        // Send a browser-like User-Agent with the request.
        connection.addRequestProperty("User-Agent", "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.119 Safari/537.36");
        return new BufferedReader(new InputStreamReader(connection.getInputStream(), "GBK"));
    }
}
网友评论