基于Node.js和Cheerio的校园通知爬虫

作者: 爱上落入尘世间的你 | 来源:发表于2017-10-29 21:51 被阅读0次

基于Node.js和Cheerio的校园通知爬虫
Node.js Request+Cheerio实现一个小爬虫-基
Node.js Request+Cheerio实现一个小爬虫-番
Node.js Request+Cheerio实现一个小爬虫-基
Node.js Request+Cheerio实现一个小爬虫-基
Node.js爬虫初体验
04_简单的爬虫cheerio基于node.js
2018-12-05爬虫
Node.js小爬虫
使用superagent 和cheerio完成简单点爬虫

我们学校的所有通知都发布在同一个网站上，每天都会有很多条。我这个人比较懒，不喜欢经常翻看校内通知网，因此经常会错过某些重要的校内通知。最近突发奇想：干脆做一个爬虫，每天帮我看看都有哪些对我有用的通知吧。

const utils = require('../../myUtils')
const Crawler = require("crawler")
const oa = require('../../mongodb/jlu.edu.cn/oa')

// Base URL that relative notification links are appended to.
const rootUrl = 'https://oa.jlu.edu.cn/defaultroot/'
// Notification list page; it lives directly under rootUrl.
const homepageUrl = `${rootUrl}PortalInformation!jldxList.action`
// Running totals reported after each crawled page.
let numOfPages = 0
let numOfNotifications = 0

/**
 * Builds the URL of one page of the notification list.
 *
 * @param {number} num - value placed in the `startPage` query parameter
 * @returns {string} homepageUrl followed by the paging query string
 */
function getPageUrl(num)
{
    const query = `1=1&startPage=${num}`
    return `${homepageUrl}?${query}`
}

/**
 * Crawler callback for one list page: extracts every notification entry,
 * passes each one to handleNotification, then updates and logs the counters.
 *
 * `done` is called exactly once on every path (request error, parse error,
 * storage error, success), and only after all notifications of the page
 * have been processed.
 *
 * @param {Error|null} error - request error reported by the crawler, if any
 * @param {object} res - crawler response; `res.$` is used as the page's selector function
 * @param {Function} done - completion callback supplied by the crawler
 * @returns {Promise<void>} settles once the page has been fully handled
 */
async function handlePage(error, res, done)
{
    if(error)
    {
        console.log(error)
        // A failed request is still a finished task: report completion.
        done()
        return
    }
    try
    {
        const $ = res.$
        const notifications = []
        // Parsing is synchronous; collect the entries first, store them afterwards.
        $("#itemContainer>div").each(function(index, element)
        {
            const $item = $.load($(element).html())
            const notification = {}
            notification.href = rootUrl + $item(".font14").attr("href")
            notification.title = $item(".font14").text().replace('[置顶]', '')
            notification.releaser = $item(".column").text()
            notification.date = $item(".time").text().replace(/今天*/, utils.date())
            notifications.push(notification)
        })
        // An async callback given to `.each` is never awaited, so wait explicitly.
        await Promise.all(notifications.map(async function(notification)
        {
            try
            {
                await handleNotification(notification)
                numOfNotifications ++
            }
            catch(err)
            {
                // One failing notification must not abort the rest of the page.
                console.log(err)
            }
        }))
        numOfPages ++
        console.log(`crawled ${numOfPages} pages, ${numOfNotifications} notifications`)
    }
    catch(err)
    {
        // e.g. res.$ is missing because the response could not be parsed
        console.log(err)
    }
    finally
    {
        done()
    }
}

/**
 * Looks the notification up through `oa.findOneNotification` and, when the
 * lookup yields a falsy result, forwards it to handleNewNotification.
 *
 * @param {object} notification - notification record built by handlePage
 * @returns {Promise<void>}
 */
async function handleNotification(notification)
{
    const existing = await oa.findOneNotification(notification)
    if(existing)
    {
        // Lookup found something: nothing further to do.
        return
    }
    await handleNewNotification(notification)
}

/**
 * Handles a notification that handleNotification did not find: stores it
 * through `oa.upsertNotification`.
 *
 * @param {object} notification - notification record to store
 * @returns {Promise<*>} whatever `oa.upsertNotification` resolves to
 */
async function handleNewNotification(notification)
{
    // todo: send some message to me
    const upsertResult = await oa.upsertNotification(notification)
    return upsertResult
}

// Crawler instance used for every request; each fetched page is passed to handlePage.
const crawler = new Crawler({
    maxConnections: 20,
    callback: handlePage,
})

/**
 * Queues the first page of the notification list on the crawler and logs
 * the URL being requested.
 */
function homepage()
{
    const firstPageUrl = getPageUrl(1)
    console.log(`crawling page: ${firstPageUrl}`)
    crawler.queue(firstPageUrl)
}

// Crawl once at startup, then again every hour (3600 * 1000 ms).
homepage()
setInterval(homepage, 3600 * 1000)