美文网首页那些年敲过的JAVA代码
JAVA爬取邮编信息,生成XML

JAVA爬取邮编信息,生成XML

作者: 我想专心学习 | 来源:发表于2019-02-28 09:55 被阅读0次

    昨天用 Java 爬取了欣欣旅游的邮编信息,并生成 XML 文件。其中用到了 JDOM 的相关知识,在此记录一下爬取过程。

    生成的 XML 效果如下所示:

    <?xml version="1.0" encoding="utf-8"?>
    <postcodes name="恩施市邮编信息">
      <no-address>
        <no>445003</no>
        <address>新建街二巷,巴公路一巷,民族西路二巷,新建街一巷</address>
      </no-address>
      <no-address>
        <no>445014</no>
        <address>龙凤镇大市场,龙凤镇龙凤村,龙凤镇双堰村,龙凤镇向家村</address>
      </no-address>
      <no-address>
        <no>445016</no>
        <address>白杨坪乡鲁竹坝村,白杨乡白杨街,白杨乡朝阳坡村,白杨乡董家店村</address>
      </no-address>
      <!-- ……其余 no-address 节点省略…… -->
    </postcodes>
    

    代码如下:

    import org.jdom2.Document;
    import org.jdom2.Element;
    import org.jdom2.input.SAXBuilder;
    import org.jdom2.output.Format;
    import org.jdom2.output.XMLOutputter;
    
    import java.io.*;
    import java.net.URL;
    import java.net.URLConnection;
    import java.util.ArrayList;
    import java.util.regex.Matcher;
    import java.util.regex.Pattern;
    
    /**
     * Downloads a postcode listing page, extracts postcode / address pairs with
     * regular expressions, and writes them to {@code src/post.xml} as
     * {@code <no-address>} elements under a {@code <postcodes>} root using JDOM2.
     */
    public class UrlSpider {

        // Patterns for the postcode link and for the address paragraph.
        static String regexNo = "<a href=\"/youbian/[\\d]+\">([\\d]+)</a>";
        static String regexAddress = "<p>([^>]+)<a href=\"/youbian/[\\d]+\">[^<]+</a></p>";
        static Pattern patternNo = Pattern.compile(regexNo);
        static Pattern patternAddr = Pattern.compile(regexAddress);
        // Holds the downloaded page; lines are appended without separators.
        static StringBuffer sb = new StringBuffer("");
        // Not referenced anywhere in this class; kept so existing users still compile.
        static ArrayList<String> addrList = new ArrayList<>();
        static ArrayList<String> noList = new ArrayList<>();

        /**
         * Entry point: fetches the page, builds the XML document and writes it to
         * {@code <user.dir>/src/post.xml}.
         *
         * @param args unused
         * @throws Exception if the download, the template parse, or the file write fails
         */
        public static void main(String[] args) throws Exception {
            // Close the network reader as soon as the page has been read.
            try (BufferedReader reader = getMsg()) {
                String line;
                while ((line = reader.readLine()) != null) {
                    sb.append(line);
                }
            }

            String filepath = System.getProperty("user.dir") + File.separator + "src" + File.separator + "post.xml";

            Document document = loadTemplate();
            Element rootElement = document.getRootElement();
            rootElement.setName("postcodes");
            rootElement.setAttribute("name", "恩施市邮编信息");

            appendEntries(rootElement, sb);

            // Write the document to disk, pretty-printed as UTF-8.
            Format format = Format.getPrettyFormat();
            format.setEncoding("utf-8");
            XMLOutputter xmlOutputter = new XMLOutputter(format);
            // A write failure now propagates to the caller instead of being
            // hidden behind printStackTrace(); the stream is always closed.
            try (OutputStream out = new FileOutputStream(filepath)) {
                xmlOutputter.output(document, out);
            }
        }

        /**
         * Loads {@code post.xml} from the context class loader as the starting
         * document. When the resource is not on the classpath, an empty document
         * with a {@code postcodes} root is returned instead of failing on a null
         * stream.
         */
        private static Document loadTemplate() throws Exception {
            // try-with-resources skips close() when the resource is null.
            try (InputStream in = Thread.currentThread().getContextClassLoader().getResourceAsStream("post.xml")) {
                if (in == null) {
                    return new Document(new Element("postcodes"));
                }
                return new SAXBuilder().build(in);
            }
        }

        /**
         * Appends one {@code <no-address>} element per extracted entry to
         * {@code rootElement}. The i-th postcode match is paired with the i-th
         * address match.
         */
        private static void appendEntries(Element rootElement, CharSequence html) {
            // NOTE(review): pairing by position assumes the page lists postcode
            // links and address paragraphs in the same order and number - confirm
            // against the live page.
            Matcher noMatcher = patternNo.matcher(html);
            Matcher addrMatcher = patternAddr.matcher(html);

            // Each matcher is advanced exactly once per entry. The previous loop
            // called find() both in the loop condition and in the body, which
            // threw away a postcode match on every iteration.
            boolean hasNo = noMatcher.find();
            boolean hasAddr = addrMatcher.find();
            while (hasNo || hasAddr) {
                Element noaddress = new Element("no-address");
                if (hasNo) {
                    Element no = new Element("no");
                    no.setText(noMatcher.group(1));
                    noaddress.addContent(no);
                    // Only advance a matcher that is not yet exhausted.
                    hasNo = noMatcher.find();
                }
                if (hasAddr) {
                    Element address = new Element("address");
                    address.setText(addrMatcher.group(1));
                    noaddress.addContent(address);
                    hasAddr = addrMatcher.find();
                }
                // Attach the entry even when only one of the two parts was found,
                // so an unpaired postcode is no longer silently dropped.
                rootElement.addContent(noaddress);
            }
        }

        /**
         * Opens the postcode page and returns a reader that decodes it as GBK.
         * The caller is responsible for closing the returned reader.
         *
         * @return a buffered reader over the HTTP response body
         * @throws IOException if the connection cannot be opened or read
         */
        public static BufferedReader getMsg() throws IOException {
            URL url = new URL("https://tool.cncn.com/youbian/enshi-enshi");
            URLConnection connection = url.openConnection();
            // Send a browser-like User-Agent header with the request.
            connection.addRequestProperty("User-Agent", "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.119 Safari/537.36");
            return new BufferedReader(new InputStreamReader(connection.getInputStream(), "GBK"));
        }

    }
    
    
    

    相关文章

      网友评论

        本文标题:JAVA爬取邮编信息,生成XML

        本文链接:https://www.haomeiwen.com/subject/fsncuqtx.html