本文主要介绍在神箭手上开发“药智网保健食品采集爬虫”的过程,爬虫主要实现按关键字批量爬取保健食品数据的功能,爬取字段包括产品名称、保健功能和适宜人群等30个字段。
你还可根据需求调整scanUrl中的“typeid”,采集药品、中药材、医疗器械等类别的数据。
药智网是全国最大的生物医药、化工在线技术交易平台。药智网专门针对生物医药、化工领域推出专业性的知识、技术交易平台,技术需求者可以通过本平台发布技术需求、寻求技术解决方案。
采集网站URL:https://db.yaozh.com/baojian
使用功能点:initCrawl、onProcessScanPage、onProcessHelperPage和afterExtractPage回调函数
下面,给你详细说明“药智网保健食品采集爬虫”在神箭手上的开发过程:
步骤1 创建爬虫
注册并登录神箭手,进入神箭手控制台。

点击“新建应用”,选择“爬虫”,点击“下一步”。

输入爬虫名称“药智网保健食品采集爬虫”,选择“编辑模式”,点击“创建”,神箭手爬虫创建成功。

步骤2 分析网页&开发爬虫
本文使用Chrome浏览器分析药智网保健食品的网页,按“F12”可打开浏览器“开发者工具”,对网页请求进行仔细分析。

由于保健食品网页中“下一页”的内容是通过JS渲染得到的,无法直接从网页源码中提取下一页链接,所以此处可以直接将下一个列表页url拼出,并添加到神箭手爬虫的待爬队列中。

在浏览器上右击鼠标点击“查看网页源码”选项,可从网页源码中分析得出下一个列表页url的获取方法。

分析完列表页url的获取和内容页各字段数据的获取逻辑后,便可结合“神箭手开发文档”,在神箭手爬虫编辑页开发爬虫代码。

步骤3 测试并运行爬虫
爬虫开发完成后,点击“测试”按钮,检查爬虫的爬取结果是否正确。

测试通过后,进入“爬虫设置”页,设置“代理IP”、“文件云托管”等服务,点击“保存”。

返回爬虫总览页,点击“启动爬虫”,稍等片刻,爬虫就会爬到数据了。如果觉得爬虫的爬取速度较慢,建议在神箭手后台给爬虫增加节点,或者优化爬虫代码。

步骤4 数据发布与导出
爬虫爬到数据后,可以选择将数据“发布到网站或数据库”中,点此查看神箭手数据发布详细教程。

此外,还可选择将数据“导出”,点此查看神箭手数据导出详细教程。

神箭手上开发的“药智网保健食品采集爬虫”导出数据示例,如下图所示:

“药智网保健食品采集爬虫”完整示例代码:
/**
药智网保健食品采集爬虫源码
建议给爬虫配置神箭手代理IP,可有效解决药智网反爬问题
对于爬虫代码有不懂的地方,请参考神箭手开发文档(http://docs.shenjian.io/develop/crawler/quick-start.html)
**/
// Search keywords; initCrawl turns each entry into one scan (search) URL.
// The trailing @tags annotation is kept verbatim.
// NOTE(review): presumably it exposes this variable as a user-editable setting on the platform — confirm.
var keywords = ["纽崔莱"];//@tags(keywords, 关键字, 请输入保健食品名称或拼音首字母或完整受理号/批准文)
// Site origin that the page handlers prepend to the hrefs they extract.
var domain = "https://db.yaozh.com";
/**
 * Crawler configuration: allowed domain, URL patterns for content and helper
 * (list) pages, request pacing, and the output fields.
 *
 * The fields declare only a name and a display alias; their values are
 * filled in by the afterExtractPage callback rather than by per-field
 * selectors.
 */
var configs = {
  domains: ["db.yaozh.com"],
  // Detail pages, e.g. https://db.yaozh.com/<category>/<digits>.html
  // The dot before "html" is escaped so it only matches a literal ".".
  contentUrlRegexes: [
    /http[\w:\/]+db\.yaozh\.com\/[^\/]+\/\d+\.html/
  ],
  // List pages reached through the comprehensive-search query parameter.
  // The dot in "db.yaozh" is escaped so it only matches a literal ".".
  helperUrlRegexes: [
    /http[\w:\/]+db\.yaozh\.com\/\w+\?comprehensivesearchcontent=.*/
  ],
  // Links are queued explicitly by the page callbacks.
  autoFindUrls: false,
  // NOTE(review): presumably the delay between requests in milliseconds — confirm against the platform docs.
  interval: 10000,
  fields: [
    { name: "id", alias: "产品编号" },
    { name: "name", alias: "产品名称" },
    { name: "cn_name", alias: "产品中文名称" },
    { name: "en_name", alias: "产品英文名称" },
    { name: "p_cn_name", alias: "申请人中文名称" },
    { name: "p_en_name", alias: "申请人英文名称" },
    { name: "p_address", alias: "申请人地址" },
    { name: "healthy_function", alias: "保健功能" },
    { name: "component_content", alias: "功效成分/标志性成分含量" },
    { name: "main_material", alias: "主要原料" },
    { name: "suit_crowds", alias: "适宜人群" },
    { name: "not_suit_crowds", alias: "不适宜人群" },
    { name: "food_methods", alias: "食用方法及食用量" },
    { name: "sku", alias: "产品规格" },
    { name: "life", alias: "保质期" },
    { name: "storage_condition", alias: "贮藏方法" },
    { name: "attentions", alias: "注意事项" },
    { name: "approve_date", alias: "批准日期" },
    { name: "approve_change_date", alias: "批准变更日期" },
    { name: "change_content", alias: "变更内容" },
    { name: "record_date", alias: "备案日期" },
    { name: "record_content", alias: "备案内容" },
    { name: "approve_id", alias: "批准文号" },
    { name: "valid_till", alias: "有效期至" },
    { name: "packing_pic", alias: "包装图片" },
    { name: "company_cn_name", alias: "生产企业中文名称" },
    { name: "company_en_name", alias: "生产企业英文名称" },
    { name: "country", alias: "生产国" },
    { name: "address", alias: "地址" },
    { name: "remarks", alias: "备注" }
  ]
};
/**
 * Seeds the crawl with one search (scan) URL per configured keyword.
 *
 * Calls system.exit with a prompt when no keyword is configured.
 *
 * @param {Object} site - Site handle supplied by the crawler framework; only
 *     its addScanUrl method is used here.
 */
configs.initCrawl = function(site) {
  // Treat a missing list like an empty one; the previous check let
  // null/undefined through and the crawl then ran with nothing queued.
  if (!keywords || keywords.length === 0) {
    system.exit("请输入关键字!");
    return;
  }
  // typeid=8456 is the category used by this crawler's search URL.
  var searchBase = domain + "/Search?typeid=8456&content=";
  for (var i = 0; i < keywords.length; i++) {
    site.addScanUrl(searchBase + encodeURIComponent(keywords[i]));
  }
};
/**
 * Handles a search-result (scan) page by queueing every category link found
 * in its result pane.
 *
 * @param {Object} page - Downloaded page; its raw property is searched via XPath.
 * @param {*} content - Unused here.
 * @param {Object} site - Site handle; its addUrl method queues each link.
 * @returns {boolean} Always false.
 *     NOTE(review): presumably this tells the framework to skip its own link
 *     discovery for the page — confirm against the platform docs.
 */
configs.onProcessScanPage = function(page, content, site) {
  // Fall back to an empty list so a page without matches queues nothing.
  var helperUrls = extractList(page.raw, "//div[contains(@class,'ui-pane')]//a[contains(@class,'item')]/@href") || [];
  // Index loop instead of for...in, which would also walk any enumerable
  // properties added to the array or its prototype.
  for (var i = 0; i < helperUrls.length; i++) {
    site.addUrl(domain + helperUrls[i]);
  }
  return false;
};
/**
 * Builds the URL of another list page from the current list-page URL.
 *
 * Any existing "p" / "pageSize" parameters and empty segments (such as the
 * one left by a trailing "&") are dropped before the new paging parameters
 * are appended, so repeated calls never accumulate stale parameters.
 *
 * @param {string} url - URL of the current list page.
 * @param {number} pageNum - Page number to request.
 * @param {number} pageSize - Number of rows per page.
 * @returns {string} URL carrying exactly one p and one pageSize parameter.
 */
function buildNextHelperUrl(url, pageNum, pageSize) {
  // NOTE(review): the original call was replace("&", "&"), a no-op that looks
  // like an HTML-unescaped "&amp;" — decoding the entity here is an assumption.
  var decoded = url.replace(/&amp;/g, "&");
  var queryStart = decoded.indexOf("?");
  var path = queryStart === -1 ? decoded : decoded.substring(0, queryStart);
  var params = queryStart === -1 ? [] : decoded.substring(queryStart + 1).split("&");
  var kept = [];
  for (var i = 0; i < params.length; i++) {
    if (params[i] !== "" && !/^(?:p|pageSize)=/.test(params[i])) {
      kept.push(params[i]);
    }
  }
  kept.push("p=" + pageNum, "pageSize=" + pageSize);
  return path + "?" + kept.join("&");
}

/**
 * Handles a list (helper) page: queues every detail-page link on it and,
 * when more pages remain, the next list page.
 *
 * @param {Object} page - Downloaded page; its raw and url properties are read.
 * @param {*} content - Unused here.
 * @param {Object} site - Site handle; its addUrl method queues each URL.
 * @returns {boolean} Always false.
 *     NOTE(review): presumably this tells the framework to skip its own link
 *     discovery for the page — confirm against the platform docs.
 */
configs.onProcessHelperPage = function(page, content, site) {
  var PAGE_SIZE = 20;
  var contentUrls = extractList(page.raw, "//th/a[contains(@class,'cl-blue')]/@href") || [];
  for (var i = 0; i < contentUrls.length; i++) {
    site.addUrl(domain + contentUrls[i]);
  }

  // The pager element carries the current page number and a total.
  // NOTE(review): data-total is treated as a row count (it is divided by the
  // page size, as in the original) — confirm against the live markup.
  var pagerXpath = "//div[contains(@class,'tr offset-top')]";
  var curPageNum = parseInt(extract(page.raw, pagerXpath + "/@data-page"), 10);
  var totalCount = parseInt(extract(page.raw, pagerXpath + "/@data-total"), 10);
  // Without usable pager data, stop instead of queueing a "p=NaN" URL.
  if (isNaN(curPageNum) || isNaN(totalCount)) {
    return false;
  }

  var totalPageNum = Math.ceil(totalCount / PAGE_SIZE);
  if (curPageNum >= totalPageNum) {
    return false;
  }

  site.addUrl(buildNextHelperUrl(page.url, curPageNum + 1, PAGE_SIZE));
  return false;
};
/**
 * Fills the output record from the detail table of a content page.
 *
 * Each table row is expected to hold a header cell with a label and a span
 * with the value; the label selects which field of data receives the value.
 * Rows with an unknown or missing label are ignored.
 *
 * @param {Object} page - Downloaded page; its raw property is searched via XPath.
 * @param {Object} data - Record being built; known fields are assigned on it.
 * @param {Object} site - Unused here.
 * @returns {Object} The same data object, with the matched fields set.
 */
configs.afterExtractPage = function(page, data, site) {
  var VALUE_XPATH = "//span[contains(@class,'toFindImg')]";
  // Row label as shown on the page -> field name declared in configs.fields.
  var fieldByLabel = {
    "产品编号": "id",
    "产品名称": "name",
    "产品中文名称": "cn_name",
    "产品英文名称": "en_name",
    "申请人中文名称": "p_cn_name",
    "申请人英文名称": "p_en_name",
    "申请人地址": "p_address",
    "保健功能": "healthy_function",
    "功效成分/标志性成分含量": "component_content",
    "主要原料": "main_material",
    "适宜人群": "suit_crowds",
    "不适宜人群": "not_suit_crowds",
    "食用方法及食用量": "food_methods",
    "产品规格": "sku",
    "保质期": "life",
    "贮藏方法": "storage_condition",
    "注意事项": "attentions",
    "批准日期": "approve_date",
    "批准变更日期": "approve_change_date",
    "变更内容": "change_content",
    "备案日期": "record_date",
    "备案内容": "record_content",
    "批准文号": "approve_id",
    "有效期至": "valid_till",
    "包装图片": "packing_pic",
    "生产企业中文名称": "company_cn_name",
    "生产企业英文名称": "company_en_name",
    "生产国": "country",
    "地址": "address",
    "备注": "remarks"
  };

  var rows = extractList(page.raw, "//table[contains(@class,'table')]//tr") || [];
  for (var i = 0; i < rows.length; i++) {
    var label = extract(rows[i], "//th[contains(@class,'detail-table-th')]");
    // Skip rows without a header cell instead of failing on .trim().
    // NOTE(review): assumes extract yields null/undefined on no match — confirm.
    if (label === null || label === undefined) {
      continue;
    }
    label = label.trim();
    // Own-property check so page text like "constructor" cannot hit
    // inherited object members.
    if (Object.prototype.hasOwnProperty.call(fieldByLabel, label)) {
      data[fieldByLabel[label]] = extract(rows[i], VALUE_XPATH);
    }
  }
  return data;
};
// Create the crawler from the configuration object (including the callbacks
// attached to it) and start it.
var crawler = new Crawler(configs);
crawler.start();
网友评论