代码
const http = require('http')
const path = require('path')
const fs = require('fs')
const cheerio = require('cheerio')
/**
 * Download a page over HTTP, save the raw HTML to disk, then extract text from it.
 *
 * The response body is streamed into `./datas/${dst}.html` (resolved against this
 * file's directory). Once that file has been completely flushed it is read back,
 * parsed with cheerio, and the trimmed text of every `findSelector` descendant of
 * `selectorPath` is written as a JSON array to `./datas/${dst}.json`.
 * No JSON file is written when nothing matches. Failures are logged to the
 * console ('request-err' for network problems, 'fs-err' for file problems);
 * nothing is returned and there is no completion callback.
 *
 * @param {string} url - Address to request; only the `http` module is used.
 * @param {string} dst - Base name (without extension) of the output files.
 * @param {string} selectorPath - Selector of the container elements to search in.
 * @param {string} findSelector - Selector, relative to `selectorPath`, of the
 *   elements whose text is collected.
 * @returns {void}
 */
const getDataAndSave = (url, dst, selectorPath, findSelector) => {
  const aHtml = path.resolve(__dirname, `./datas/${dst}.html`)
  const aJson = path.resolve(__dirname, `./datas/${dst}.json`)

  // Make sure the output directory exists; otherwise the write stream fails with ENOENT.
  fs.mkdirSync(path.dirname(aHtml), { recursive: true })

  // Remove results of a previous run so that a failed or empty run never leaves
  // stale output behind. `force: true` makes a missing file a no-op.
  for (const stale of [aHtml, aJson]) {
    fs.rmSync(stale, { force: true })
  }

  // Parse the saved HTML and store the extracted texts as JSON.
  const parseAndSave = () => {
    console.log('开始解析')
    fs.readFile(aHtml, { encoding: 'utf-8' }, (readError, datas) => {
      if (readError) {
        console.log('fs-err', readError)
        return
      }
      const $ = cheerio.load(datas)
      const result = $(selectorPath)
        .find(findSelector)
        .toArray()
        .map(item => $(item).text().trim())
      if (result.length === 0) {
        return
      }
      fs.writeFile(aJson, JSON.stringify(result, null, 2), err => {
        // Only report real failures; `err` is null on success.
        if (err) {
          console.log('fs-err', err)
        }
      })
    })
  }

  // `http.get` calls `req.end()` itself, so no explicit `end()` is needed.
  http
    .get(url, res => {
      const { statusCode } = res
      if (statusCode < 200 || statusCode >= 300) {
        console.log('request-err', `unexpected status code ${statusCode}`)
        // Drain the response so the socket is released.
        res.resume()
        return
      }

      const writerStream = fs.createWriteStream(aHtml)

      // 'finish' fires only after every chunk has been flushed to the file,
      // so the HTML is complete by the time it is read back.
      writerStream.on('finish', parseAndSave)
      writerStream.on('error', e => {
        console.log('fs-err', e)
        // Nothing can be saved any more; stop downloading.
        res.destroy()
      })

      res.on('data', () => {
        console.log('请求')
      })
      res.on('error', e => {
        console.log('request-err', e.message)
      })
      res.on('close', () => {
        // The connection dropped before the full body arrived: release the
        // file handle without emitting 'finish', so no partial page is parsed.
        if (!res.complete) {
          writerStream.destroy()
        }
      })

      // `pipe` handles backpressure and ends the writer when the response ends.
      res.pipe(writerStream)
    })
    .on('error', e => {
      console.log('request-err', e.message)
    })
}
// Run the scraper once: request the page below, use '社康' as the base name of
// the output files, and collect the text of every `a` under `.sk-box .sk-item`.
getDataAndSave(
  'http://www.lg.gov.cn/zwfw/zdfw/yl/fwsm/yljg/sqjkfwzx/',
  '社康',
  '.sk-box .sk-item',
  'a'
)
要提取的内容
网友评论