美文网首页程序员
java爬虫之webmagic实战

java爬虫之webmagic实战

作者: GG_lyf | 来源:发表于2021-01-12 09:13 被阅读0次

    前言

      在学了关于webmagic的东西之后,就想搞点事情,于是乎就又去挖起点的墙角了


    开搞

    1.要先添加依赖并找到一个要爬的目标

    <dependency>
       <groupId>us.codecraft</groupId>
       <artifactId>webmagic-core</artifactId>
       <version>0.7.3</version>
    </dependency>
    <dependency>
       <groupId>us.codecraft</groupId>
       <artifactId>webmagic-extension</artifactId>
       <version>0.7.3</version>
    </dependency>
    

    2.使用xpath语法找这个cf

    Selectable div = page.getHtml().xpath("//div[@class=\"focus-wrap box-center mb40 cf\"]");
    

    3.再使用Css选择器选择获取classify-list及以下的dd

    Selectable div2 = div.$(".classify-list").css(".cf");
    

    4.使用Css选择器选择所有的dd,之后遍历

    List<String> all = div2.css("dd").all();
    

    5.再使用xpath和css选择器获取里面的链接和文本

    6.代码如下:
    6.1 爬取分类名和页面路径

    public class QidianCategoryProcessor implements PageProcessor, Serializable {
      private static List<Map<Selectable, Map<Selectable, String>>> out = new ArrayList<>();
    
      public void process(Page page) {
        Selectable div = page.getHtml().xpath("//div[@class=\"focus-wrap box-center mb40 cf\"]");
        Selectable div2 = div.$(".classify-list").css(".cf");
        List<String> all = div2.css("dd").all();
        for (int i = 1; i <= all.size(); i++) {
          Selectable xpath = div2.xpath("dd[" + i + "]");
          Selectable a = xpath.xpath("a[@href]");
          Selectable $ = a.$("a", "href");//获取属性
          String url = "";
          if (i != all.size() - 1) {
            url = "https://www.qidian.com/" + $;
          } else {
            url = "https:" + $;
          }
          Selectable name = a.xpath("i/text()");//通过包裹有文本的最近的标签获取文本
          EveryCategoryProcessor.in(url, name);
        }
        out = EveryCategoryProcessor.out();
      }
    
      public Site getSite() {
        return Site.me().setTimeOut(5000).setCharset("utf-8").setUserAgent("Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.66 Safari/537.36").setSleepTime(100).setRetryTimes(3);
      }
    
      @Bean
      public String run() {
        Spider.create(new QidianCategoryProcessor()).addUrl("https://www.qidian.com/").thread(5).run();
        return "ok";
      }
    
      public static List<Map<Selectable, Map<Selectable, String>>> get() {
        return out;
      }
    }
    

    6.2 爬取每一个分类下的一部分数据

    // Scrapes one category page: collects book title -> URL pairs from the
    // ranking table and files them under the category name supplied via in().
    public class EveryCategoryProcessor implements PageProcessor, Serializable {
    
      // category name -> (book title -> book URL); shared across spider runs
      private static Map<Selectable, Map<Selectable, String>> map = new HashMap<Selectable, Map<Selectable, String>>();
      private Map<Selectable, String> map2 = new HashMap<Selectable, String>();
      private static Selectable name;
    
      @Override
      public void process(Page page) {
        Selectable tbody = page.getHtml().xpath("//tbody");
        int rowCount = tbody.css("tr").all().size();
        // XPath indices are 1-based; visit every <tr> of the table in order.
        for (int row = 1; row <= rowCount; row++) {
          Selectable tr = tbody.xpath("tr[" + row + "]");
          // A missing row stringifies to "null" — skip it.
          if ("null".equals(tr + "")) {
            continue;
          }
          Selectable cell = tr.css("td:nth-child(2)"); // second column holds the book link
          Selectable href = cell.$("a", "href");
          Selectable title = cell.xpath("a/text()");
          map2.put(title, "https:" + href);
        }
        map.put(name, map2);
      }
    
      @Override
      public Site getSite() {
        return Site.me().setTimeOut(5000).setCharset("utf-8").setUserAgent("Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.66 Safari/537.36").setSleepTime(100).setRetryTimes(3);
      }
    
      // Records the category name, then runs a blocking crawl of the given URL.
      static void in(String url, Selectable name) {
        EveryCategoryProcessor.name = name;
        Spider.create(new EveryCategoryProcessor()).addUrl(url).thread(5).run();
      }
    
      // Wraps the accumulated mapping in a single-element list for the caller.
      static List<Map<Selectable, Map<Selectable, String>>> out() {
        List<Map<Selectable, Map<Selectable, String>>> list = new ArrayList<>();
        list.add(map);
        return list;
      }
    }
    
    结果

    相关文章

      网友评论

        本文标题:java爬虫之webmagic实战

        本文链接:https://www.haomeiwen.com/subject/wbqooktx.html