Java Crawler Case Study

Author: 这里是廖同学 | Published 2019-12-08 23:51

    This is a simple crawler built with WebMagic and Spring Boot. It scrapes job-listing data from 前程无忧 (51job) and saves it to a MySQL database.

    (Screenshot of the scraped data)

    About WebMagic
    WebMagic is a simple, flexible Java crawler framework. With WebMagic you can quickly build an efficient, maintainable crawler.
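
    Before the full project, here is a minimal sketch of the core WebMagic API; the MinimalProcessor class, start URL, and selector below are illustrative placeholders, not part of this project:

    import us.codecraft.webmagic.Page;
    import us.codecraft.webmagic.Site;
    import us.codecraft.webmagic.Spider;
    import us.codecraft.webmagic.processor.PageProcessor;

    public class MinimalProcessor implements PageProcessor {

        // Site holds per-crawl settings: charset, timeout, retries, etc.
        private final Site site = Site.me().setCharset("utf-8").setTimeOut(10 * 1000);

        @Override
        public void process(Page page) {
            // Extract data with a CSS selector and hand it to the pipelines
            page.putField("title", page.getHtml().css("title", "text").get());
            // Queue any discovered links as new crawl targets
            page.addTargetRequests(page.getHtml().links().all());
        }

        @Override
        public Site getSite() {
            return site;
        }

        public static void main(String[] args) {
            // With no pipeline configured, results are printed to the console
            Spider.create(new MinimalProcessor())
                    .addUrl("https://example.com")  // placeholder start URL
                    .thread(1)
                    .run();
        }
    }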

    MySQL table schema

    /*
    Navicat MySQL Data Transfer
    
    Source Server         : local database
    Source Server Version : 80017
    Source Host           : localhost:3306
    Source Database       : crawler
    
    Target Server Type    : MYSQL
    Target Server Version : 80017
    File Encoding         : 65001
    
    Date: 2019-12-08 23:36:09
    */
    
    SET FOREIGN_KEY_CHECKS=0;
    
    -- ----------------------------
    -- Table structure for jobs_item
    -- ----------------------------
    DROP TABLE IF EXISTS `jobs_item`;
    CREATE TABLE `jobs_item` (
      `id` bigint(20) NOT NULL AUTO_INCREMENT COMMENT 'primary key ID',
      `company_name` varchar(100) DEFAULT NULL COMMENT 'company name',
      `company_addr` varchar(255) DEFAULT NULL COMMENT 'company address',
      `company_info` text COMMENT 'company profile',
      `job_name` varchar(100) DEFAULT NULL COMMENT 'job title',
      `job_num` int(11) DEFAULT '0' COMMENT 'number of openings',
      `job_addr` varchar(255) DEFAULT NULL COMMENT 'job location',
      `job_info` text COMMENT 'job description',
      `diploma` varchar(20) DEFAULT NULL COMMENT 'required education',
      `salary_min` bigint(10) DEFAULT NULL COMMENT 'minimum monthly salary',
      `salary_max` bigint(10) DEFAULT NULL COMMENT 'maximum monthly salary',
      `url` varchar(100) DEFAULT NULL COMMENT 'job detail page URL',
      `time` varchar(20) DEFAULT NULL COMMENT 'most recent posting time',
      `created` datetime DEFAULT NULL COMMENT 'record creation time',
      `updated` datetime DEFAULT NULL ON UPDATE CURRENT_TIMESTAMP COMMENT 'record update time',
      PRIMARY KEY (`id`)
    ) ENGINE=InnoDB AUTO_INCREMENT=2844 DEFAULT CHARSET=utf8;
    
    

    Project directory layout


    pom.xml dependencies

    <?xml version="1.0" encoding="UTF-8"?>
    <project xmlns="http://maven.apache.org/POM/4.0.0"
             xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
             xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 https://maven.apache.org/xsd/maven-4.0.0.xsd">
        <modelVersion>4.0.0</modelVersion>
    
        <parent>
            <groupId>org.springframework.boot</groupId>
            <artifactId>spring-boot-starter-parent</artifactId>
            <version>2.2.1.RELEASE</version>
            <relativePath/> <!-- lookup parent from repository -->
        </parent>
    
        <groupId>club.studycode</groupId>
        <artifactId>qcwy-crawler</artifactId>
        <version>1.0.0-SNAPSHOT</version>
        <name>qcwy-crawler</name>
        <description>JOBS</description>
    
        <properties>
            <java.version>1.8</java.version>
            <mapper.version>2.1.5</mapper.version>
            <webmagic.version>0.7.3</webmagic.version>
        </properties>
    
        <dependencies>
    
            <dependency>
                <groupId>org.springframework.boot</groupId>
                <artifactId>spring-boot-starter-test</artifactId>
                <scope>test</scope>
                <exclusions>
                    <exclusion>
                        <groupId>org.junit.vintage</groupId>
                        <artifactId>junit-vintage-engine</artifactId>
                    </exclusion>
                </exclusions>
            </dependency>
    
            <dependency>
                <groupId>org.springframework.boot</groupId>
                <artifactId>spring-boot-starter-web</artifactId>
            </dependency>
    
            <dependency>
                <groupId>org.projectlombok</groupId>
                <artifactId>lombok</artifactId>
                <optional>true</optional>
            </dependency>
    
            <dependency>
                <groupId>tk.mybatis</groupId>
                <artifactId>mapper-spring-boot-starter</artifactId>
                <version>${mapper.version}</version>
            </dependency>
    
            <dependency>
                <groupId>mysql</groupId>
                <artifactId>mysql-connector-java</artifactId>
                <scope>runtime</scope>
            </dependency>
    
        <!-- WebMagic core dependency; slf4j-log4j12 is excluded to avoid clashing with Spring Boot's default Logback logging -->
            <dependency>
                <groupId>us.codecraft</groupId>
                <artifactId>webmagic-core</artifactId>
                <exclusions>
                    <exclusion>
                        <groupId>org.slf4j</groupId>
                        <artifactId>slf4j-log4j12</artifactId>
                    </exclusion>
                </exclusions>
                <version>${webmagic.version}</version>
            </dependency>
    
        <!-- WebMagic extension dependency -->
            <dependency>
                <groupId>us.codecraft</groupId>
                <artifactId>webmagic-extension</artifactId>
                <version>${webmagic.version}</version>
            </dependency>
    
            <dependency>
                <groupId>com.google.guava</groupId>
                <artifactId>guava</artifactId>
                <version>28.1-jre</version>
            </dependency>
        </dependencies>
    
        <build>
            <plugins>
                <plugin>
                    <groupId>org.springframework.boot</groupId>
                    <artifactId>spring-boot-maven-plugin</artifactId>
                </plugin>
            </plugins>
        </build>
    
    </project>
    
    

    MyMapper: a shared base interface for tk.mybatis mappers

    package club.studycode.mapper;
    
    
    import tk.mybatis.mapper.common.Mapper;
    import tk.mybatis.mapper.common.MySqlMapper;
    
    /**
     * Combines tk.mybatis's generic CRUD interface (Mapper) with its
     * MySQL-specific helpers (MySqlMapper). Note that it lives outside the
     * @MapperScan package, so it is not registered as a mapper bean itself.
     */
    public interface MyMapper<T> extends Mapper<T>, MySqlMapper<T> {
    
    }
    
    

    The JobsItemDao interface extends MyMapper, which gives it the basic CRUD methods out of the box (see the usage sketch after the listing).

    package club.studycode.qcwy.crawler.dao;
    
    import club.studycode.mapper.MyMapper;
    import club.studycode.qcwy.crawler.entity.JobsItem;
    import org.springframework.stereotype.Repository;
    
    
    /**
     *  @ClassName: JobsItemDao.java
     *  @Author: Slayer
     *  @Date: 2019/11/16 0:59
     *  @Description:
     */
    @Repository
    public interface JobsItemDao extends MyMapper<JobsItem> {
    
    }
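
    The inherited tk.mybatis methods are what the service layer calls later. A brief illustrative sketch (jobsItem and example are placeholder variables):

    // A few of the generic CRUD methods JobsItemDao inherits (not exhaustive):
    jobsItemDao.insert(jobsItem);              // INSERT INTO jobs_item ...
    jobsItemDao.updateByPrimaryKey(jobsItem);  // UPDATE jobs_item ... WHERE id = ?
    jobsItemDao.selectByPrimaryKey(1L);        // SELECT ... WHERE id = 1
    jobsItemDao.selectOneByExample(example);   // SELECT one row matching an Example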
    
    

    JobsItem entity class

    package club.studycode.qcwy.crawler.entity;
    
    import java.io.Serializable;
    import java.util.Date;
    
    import lombok.Data;
    
    import javax.persistence.Column;
    import javax.persistence.Id;
    import javax.persistence.Table;
    
    
    /**
     * @ClassName: JobsItem.java
     * @Author: Slayer
     * @Date: 2019/11/16 0:51
     * @Description:
     */
    @Data
    @Table(name = "jobs_item")
    public class JobsItem implements Serializable {
    
        private static final long serialVersionUID = -1274246480063610692L;
    
        // Primary key ID
        @Id
        @Column(name = "id")
        private Long id;

        // Company name
        @Column(name = "company_name")
        private String companyName;

        // Company address
        @Column(name = "company_addr")
        private String companyAddr;

        // Company profile
        @Column(name = "company_info")
        private String companyInfo;

        // Job title
        @Column(name = "job_name")
        private String jobName;

        // Number of openings
        @Column(name = "job_num")
        private Integer jobNum;

        // Job location
        @Column(name = "job_addr")
        private String jobAddr;

        // Job description
        @Column(name = "job_info")
        private String jobInfo;

        // Required education
        @Column(name = "diploma")
        private String diploma;

        // Minimum monthly salary
        @Column(name = "salary_min")
        private Integer salaryMin;

        // Maximum monthly salary
        @Column(name = "salary_max")
        private Integer salaryMax;

        // Job detail page URL
        @Column(name = "url")
        private String url;

        // Most recent posting time
        @Column(name = "time")
        private String time;

        // Record creation time
        @Column(name = "created")
        private Date created;

        // Record update time
        @Column(name = "updated")
        private Date updated;
    
    }
    
    

    JobsItemService: the service-layer interface

    package club.studycode.qcwy.crawler.service;
    
    import club.studycode.qcwy.crawler.entity.JobsItem;
    
    /**
     *  @ClassName: JobsItemService.java
     *  @Author: Slayer
     *  @Date: 2019/12/8 23:42
     *  @Description: 
     */
    public interface JobsItemService {
    
        void save(JobsItem jobsItem);
    
    
        JobsItem getByCompanyName(String companyName);
    
    }
    
    

    JobsItemServiceImpl: the service implementation

    package club.studycode.qcwy.crawler.service.impl;
    
    import club.studycode.qcwy.crawler.dao.JobsItemDao;
    import club.studycode.qcwy.crawler.entity.JobsItem;
    import club.studycode.qcwy.crawler.service.JobsItemService;
    import org.springframework.stereotype.Service;
    import org.springframework.transaction.annotation.Transactional;
    import tk.mybatis.mapper.entity.Example;
    
    import javax.annotation.Resource;
    
    
    @Transactional(readOnly = true)
    @Service
    public class JobsItemServiceImpl implements JobsItemService {
    
        @Resource
        private JobsItemDao jobsItemDao;
    
        /**
         * Save a record: insert when it has no ID yet, otherwise update it.
         *
         * @param jobsItem the record to persist
         */
        @Override
        @Transactional(readOnly = false)
        public void save(JobsItem jobsItem) {
            // New record: insert
            if (jobsItem.getId() == null) {
                jobsItemDao.insert(jobsItem);
            }
            // Existing record: update
            else {
                jobsItemDao.updateByPrimaryKey(jobsItem);
            }
        }
    
    
        /**
         * Look up a record by company name. The Example query below generates
         * roughly "SELECT ... FROM jobs_item WHERE company_name = ?".
         *
         * @param companyName the company name to match
         * @return the matching record, or null when none exists
         */
        @Override
        public JobsItem getByCompanyName(String companyName) {
            Example example = new Example(JobsItem.class);
            example.createCriteria().andEqualTo("companyName", companyName);
            return jobsItemDao.selectOneByExample(example);
        }
    
    }
    
    

    JobProcessor: the WebMagic core class; it defines the rules for extracting the data

    package club.studycode.qcwy.crawler.task;
    
    import club.studycode.qcwy.crawler.entity.JobsItem;
    import org.apache.commons.lang3.StringUtils;
    import org.jsoup.Jsoup;
    import org.springframework.beans.factory.annotation.Autowired;
    import org.springframework.scheduling.annotation.Scheduled;
    import org.springframework.stereotype.Component;
    import us.codecraft.webmagic.Page;
    import us.codecraft.webmagic.Site;
    import us.codecraft.webmagic.Spider;
    import us.codecraft.webmagic.processor.PageProcessor;
    import us.codecraft.webmagic.scheduler.BloomFilterDuplicateRemover;
    import us.codecraft.webmagic.scheduler.QueueScheduler;
    import us.codecraft.webmagic.selector.Selectable;
    
    import java.util.Date;
    import java.util.List;
    
    
    /**
     *  @ClassName: JobProcessor.java
     *  @Author: Slayer
     *  @Date: 2019/12/8 23:43
     *  @Description: 
     */
    @Component
    public class JobProcessor implements PageProcessor {
    
        private static final String URL_CRAWLER = "https://search.51job.com/list/000000,000000,0000,00,9,99,java,2,1.html?lang=c&postchannel=0000&workyear=99&cotype=99&degreefrom=99&jobterm=99&companysize=99&ord_field=0&dibiaoid=0&line=&welfare=";
        // Alternate start URL (searches for 销售 instead of java):
        // private static final String URL_CRAWLER = "https://search.51job.com/list/000000,000000,0000,00,9,99,%25E9%2594%2580%25E5%2594%25AE,2,1.html?lang=c&stype=1&postchannel=0000&workyear=99&cotype=99&degreefrom=99&jobterm=99&companysize=99&lonlat=0%2C0&radius=-1&ord_field=0&confirmdate=9&fromType=&dibiaoid=0&address=&line=&specialarea=00&from=&welfare=";

        // Counter of detail pages scraped so far
        private long num = 1;
    
        @Override
        public void process(Page page) {
            List<Selectable> selectables = page.getHtml().css("div#resultList div.el").nodes();

            // Check whether the search-result list is empty
            if (selectables.size() == 0) {
                // Empty: this is a job detail page, so extract and save its data
                this.saveJobInfo(page);
            }
            // Not empty: this is the result-list page, so pull out each
            // detail-page URL and queue it for crawling
            else {
                selectables.forEach(selectable -> {
                    // Extract the detail-page URL
                    String jobInfoUrl = selectable.css("p.t1 > span > a[href]").links().get();
                    // Queue it as a new crawl target
                    page.addTargetRequest(jobInfoUrl);
                });

                // Extract the next-page URL (the second li.bk element) and
                // queue it; guard against pages where it is missing
                List<Selectable> pagerNodes = page.getHtml().css("li.bk").nodes();
                if (pagerNodes.size() > 1) {
                    page.addTargetRequest(pagerNodes.get(1).links().get());
                }
            }
        }
    
        /**
         * Extract the detail-page content and store it in a JobsItem.
         *
         * @param page page data
         */
        private void saveJobInfo(Page page) {
            JobsItem jobsItem = new JobsItem();
            // Company name
            String companyName = page.getHtml().css("p.cname > a", "text").get();
            jobsItem.setCompanyName(companyName);

            // Combined field: address, headcount, education, and posting time
            String info = page.getHtml().css("p.msg", "text").get();
            this.saveCompanyInfo(info, jobsItem);

            // Company profile
            String companyInfo = page.getHtml().css("div.tmsg", "text").get();
            jobsItem.setCompanyInfo(companyInfo);

            // Job title
            String jobName = page.getHtml().css("div.cn > h1", "text").get();
            jobsItem.setJobName(jobName);

            // Job location
            List<Selectable> jobAddrSelectables = page.getHtml().css("p.fp").nodes();
            if (jobAddrSelectables.size() > 1) {
                String jobAddrHtml = jobAddrSelectables.get(1).css("p.fp", "text").get();
                jobsItem.setJobAddr(jobAddrHtml);
            }

            // Job description (use Jsoup to strip the HTML tags)
            List<Selectable> jobInfoSelectables = page.getHtml().css("div.bmsg").nodes();
            if (jobInfoSelectables.size() >= 1) {
                String jobInfoHtml = jobInfoSelectables.get(0).get();
                String jobInfo = Jsoup.parse(jobInfoHtml).text();
                jobsItem.setJobInfo(jobInfo);
            }

            // Minimum and maximum monthly salary
            String salary = page.getHtml().css("div.cn > strong", "text").get();
            this.saveSalary(salary, jobsItem);

            // Detail-page URL
            jobsItem.setUrl(page.getUrl().get());

            jobsItem.setCreated(new Date());
            jobsItem.setUpdated(jobsItem.getCreated());

            // Hand the result to the pipelines
            page.putField("jobsItem", jobsItem);

            System.out.println("Pages scraped: " + this.num++);
        }
    
        /**
         * Parse a salary range string such as "1-1.5万/月" into minimum and
         * maximum monthly amounts.
         */
        private void saveSalary(String salary, JobsItem jobsItem) {
            if (!StringUtils.isBlank(salary)) {
                String[] split = salary.split("-");

                // The unit character ('千' or '万') sits three characters from
                // the end of the second part, e.g. "1.5万/月"
                char unit = split[1].charAt(split[1].length() - 3);

                double num = 0;

                switch (unit) {
                    case '千':
                        num = 1000;
                        break;
                    case '万':
                        num = 10000;
                        break;
                    default:
                        break;
                }

                // Minimum monthly salary
                int salaryMin = (int) (Double.parseDouble(split[0]) * num);
                jobsItem.setSalaryMin(salaryMin);

                // Maximum monthly salary (strip the trailing unit before parsing)
                int salaryMax = (int) (Double.parseDouble(split[1].substring(0, split[1].length() - 3)) * num);
                jobsItem.setSalaryMax(salaryMax);
            }
        }
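
        // Worked example: for a hypothetical input "1-1.5万/月",
        //   salary.split("-")        -> ["1", "1.5万/月"]
        //   charAt(length() - 3)     -> '万', so num = 10000
        //   salaryMin = 1.0 * 10000  -> 10000
        //   salaryMax = 1.5 * 10000  -> 15000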
    
    
        /**
         * Parse the combined "p.msg" text (address / headcount / education /
         * posting time), whose parts are separated by runs of spaces.
         */
        private void saveCompanyInfo(String companyInfo, JobsItem jobsItem) {
            if (!StringUtils.isBlank(companyInfo)) {
                String[] companyInfos = companyInfo.split("    ");
                // Company address
                jobsItem.setCompanyAddr(companyInfos[0].trim());

                // Number of openings; "招若干人" ("several openings") is stored as 9999
                if ("招若干人".equals(companyInfos[3].trim())) {
                    jobsItem.setJobNum(9999);
                } else {
                    String num = companyInfos[3].replaceAll("[^0-9]", "");
                    if (!StringUtils.isBlank(num)) {
                        jobsItem.setJobNum(Integer.parseInt(num));
                    }
                }

                // Education requirement; when the third field is actually the
                // headcount ("招...人"), no education requirement was listed
                if (companyInfos[2].contains("招")) {
                    jobsItem.setDiploma("无学历");
                    String num = companyInfos[2].replaceAll("[^0-9]", "");
                    if (!StringUtils.isBlank(num)) {
                        jobsItem.setJobNum(Integer.parseInt(num));
                    }
                } else {
                    jobsItem.setDiploma(companyInfos[2].trim());
                }

                // Most recent posting time (the part ending in "发布")
                for (String time : companyInfos) {
                    if (!StringUtils.isBlank(time)) {
                        if (time.indexOf("发布") > 0) {
                            time = time.replace("发布", "");
                            jobsItem.setTime(time);
                        }
                    }
                }
            }
        }
    
        private Site site = Site.me()
                // Page encoding (51job serves GBK)
                .setCharset("gbk")
                // Request timeout
                .setTimeOut(10 * 1000)
                // Pause between retries
                .setRetrySleepTime(3 * 1000)
                // Retry count
                .setRetryTimes(3);
    
    
        @Override
        public Site getSite() {
            return site;
        }
    
    
        @Autowired
        private SaveDataPipeline saveDataPipeline;
    
        /**
         * initialDelay: how long to wait after startup before the first run.
         * fixedDelay: interval between runs, measured from the end of the
         * previous one; run() blocks until the crawl queue is drained, so the
         * next crawl starts 10 seconds after the previous one finishes.
         */
        @Scheduled(initialDelay = 1000, fixedDelay = 10 * 1000)
        public void process() {
            Spider.create(new JobProcessor())
                    .addUrl(URL_CRAWLER)
                    // Deduplicate with a BloomFilter: low memory use, but a few
                    // pages may be skipped; 100000 is the estimated page count
                    .setScheduler(new QueueScheduler().setDuplicateRemover(new BloomFilterDuplicateRemover(100000)))
                    .thread(1)
                    // Output destination: the database pipeline
                    .addPipeline(this.saveDataPipeline)
                    .run();
        }
    }
    
    

    SaveDataPipeline: persists the scraped data to the database

    package club.studycode.qcwy.crawler.task;
    
    import club.studycode.qcwy.crawler.entity.JobsItem;
    import club.studycode.qcwy.crawler.service.JobsItemService;
    import org.springframework.beans.factory.annotation.Autowired;
    import org.springframework.stereotype.Component;
    import us.codecraft.webmagic.ResultItems;
    import us.codecraft.webmagic.Task;
    import us.codecraft.webmagic.pipeline.Pipeline;
    
    
    /**
     *  @ClassName: SaveDataPipeline.java
     *  @Author: Slayer
     *  @Date: 2019/12/8 23:46
     *  @Description: 
     */
    @Component
    public class SaveDataPipeline implements Pipeline {
    
        @Autowired
        private JobsItemService jobsItemService;
    
        @Override
        public void process(ResultItems resultItems, Task task) {
            // Fetch the item stored earlier via page.putField("jobsItem", ...)
            JobsItem jobsItem = resultItems.get("jobsItem");
            // Only persist items that actually carry a company name
            if (jobsItem != null && jobsItem.getCompanyName() != null) {
                // Check for an existing record with the same company name
                JobsItem resultJobsItem = jobsItemService.getByCompanyName(jobsItem.getCompanyName());
                // If one exists, reuse its ID so save() performs an update
                if (resultJobsItem != null) {
                    jobsItem.setId(resultJobsItem.getId());
                    System.out.println("----------------------------- updating existing record -----------------------------");
                }
                jobsItemService.save(jobsItem);
            }
        }
    }
    
    

    QcwyCrawlerApplication: the Spring Boot entry point

    package club.studycode.qcwy.crawler;
    
    import org.springframework.boot.SpringApplication;
    import org.springframework.boot.autoconfigure.SpringBootApplication;
    import org.springframework.scheduling.annotation.EnableScheduling;
    import org.springframework.transaction.annotation.EnableTransactionManagement;
    import tk.mybatis.spring.annotation.MapperScan;
    
    @SpringBootApplication
    @MapperScan(basePackages = "club.studycode.qcwy.crawler.dao")
    @EnableScheduling
    @EnableTransactionManagement
    public class QcwyCrawlerApplication {
        public static void main(String[] args) {
            SpringApplication.run(QcwyCrawlerApplication.class, args);
        }
    
    }
    
    

    application.yaml configuration

    spring:
      datasource:
        type: com.zaxxer.hikari.HikariDataSource
        driver-class-name: com.mysql.cj.jdbc.Driver
        url: jdbc:mysql://localhost:3306/crawler?useUnicode=true&characterEncoding=utf-8&useSSL=false&serverTimezone=Asia/Shanghai
        username: root
        password: "020822"
        hikari:
          minimum-idle: 5
          idle-timeout: 600000
          maximum-pool-size: 10
          auto-commit: true
          pool-name: MyHikariCP
          max-lifetime: 1800000
          connection-timeout: 30000
          connection-test-query: SELECT 1
    
    mybatis:
      type-aliases-package: club.studycode.qcwy.crawler.entity
    
    

    This case study demonstrates the basic usage of WebMagic.
