关于WebMagic
WebMagic是一个简单灵活的Java爬虫框架。基于WebMagic,你可以快速开发出一个高效、易维护的爬虫。
特性:
简单的API,可快速上手
模块化的结构,可轻松扩展
提供多线程和分布式支持
官网:http://webmagic.io/
中文: http://webmagic.io/docs/zh/
English: http://webmagic.io/docs/en
Javadocs: http://webmagic.io/apidocs/
WebMagic由四个组件(PageProcessor、Pipeline、Downloader、Scheduler)构成
- PageProcessor
PageProcessor负责解析页面,抽取有用信息,以及发现新的链接。
- Pipeline
Pipeline负责抽取结果的处理,包括计算、持久化到文件、数据库等。
- Downloader
Downloader负责从互联网上下载页面,以便后续处理。WebMagic默认使用了Apache HttpClient作为下载工具。
- Scheduler
Scheduler负责管理待抓取的URL,以及一些去重的工作。WebMagic默认提供了JDK的内存队列来管理URL,并用集合来进行去重。也支持使用Redis进行分布式管理。
项目结构
项目结构代码实现
Maven依赖
<dependency>
<groupId>us.codecraft</groupId>
<artifactId>webmagic-core</artifactId>
<version>0.7.2</version>
</dependency>
<dependency>
<groupId>us.codecraft</groupId>
<artifactId>webmagic-extension</artifactId>
<version>0.7.2</version>
</dependency>
配置数据库
配置数据库具体代码
@Component
public class NetEaseMusicPageProcessor implements PageProcessor {
/*
 * About the URL patterns below: "\\." is written with two backslashes because the first one
 * escapes the backslash inside the Java string literal, and the resulting "\." escapes the
 * dot for the regular expression.
 */

/** Root URL of the site; also sent as the Referer header by NetEaseMusicUtils.crawlAjaxUrl. */
public static final String BASE_URL = "http://music.163.com/";

/** Regular expression for playlist page URLs (this class calls them "album" pages). */
public static final String ALBUM_URL = "http://music\\.163\\.com/playlist\\?id=\\d+";

/** Regular expression for song page URLs. */
public static final String MUSIC_URL = "http://music\\.163\\.com/song\\?id=\\d+";

/** Seed URL of the crawl: the "favourite music" playlist of user 褐言, playlist id 148174530. */
public static final String START_URL = "http://music.163.com/playlist?id=148174530";

/** Step by which getComment advances the comment offset from one request to the next. */
public static final int ONE_PAGE = 20;

/** Crawl settings handed to WebMagic through getSite(). */
private Site site = Site.me()
        .setDomain("http://music.163.com")
        .setCharset("utf-8")
        .setUserAgent("Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_2) AppleWebKit/537.31 (KHTML, like Gecko) Chrome/26.0.1410.65 Safari/537.31")
        .setSleepTime(1000)
        .setTimeOut(30000)
        .setRetryTimes(30);
/**
 * Supplies the crawl settings held in the {@code site} field.
 *
 * @return the {@code Site} configuration of this processor
 */
@Override
public Site getSite() {
    return this.site;
}
// Injected service; process() hands Music objects to addMusic() and getComment() hands
// Comment objects to addComment().
@Autowired
MusicService mMusicService;
/**
 * Handles one downloaded page.
 *
 * <p>A page whose URL matches {@code ALBUM_URL} is treated as a playlist: its track count is
 * printed and every link matching {@code MUSIC_URL} is queued. Every other page is treated as
 * a song page: its comments are fetched via {@code getComment} and a {@code Music} object is
 * filled in and passed to {@code mMusicService.addMusic}.
 *
 * @param page the page delivered by the downloader
 */
@Override
public void process(Page page) {
    // The URL alone decides which kind of page this is.
    boolean isPlaylistPage = page.getUrl().regex(ALBUM_URL).match();
    if (isPlaylistPage) {
        String trackCount = page.getHtml().xpath("//span[@id='playlist-track-count']/text()").toString();
        System.out.println("歌曲总数----->" + trackCount);

        // Queue the song links found in the playlist's song list container.
        List<String> songLinks = page.getHtml()
                .xpath("//div[@id=\"song-list-pre-cache\"]")
                .links()
                .regex(MUSIC_URL)
                .all();
        page.addTargetRequests(songLinks);
        return;
    }

    String pageUrl = page.getUrl().toString();
    Music song = new Music();

    // The song id is everything after "id=" in the URL.
    int idStart = pageUrl.indexOf("id=") + 3;
    String songId = pageUrl.substring(idStart);

    // Comments are requested separately (AJAX endpoint) and the JSON answer is parsed there.
    int commentCount = getComment(page, songId, 0);

    song.setSongId(songId);
    song.setCommentCount(commentCount);
    song.setTitle(page.getHtml().xpath("//em[@class='f-ff2']/text()").toString());
    song.setAuthor(page.getHtml().xpath("//p[@class='des s-fc4']/span/a/text()").toString());
    song.setAlbum(page.getHtml().xpath("//p[@class='des s-fc4']/a/text()").toString());
    song.setURL(pageUrl);

    // The result goes straight to the service instead of being put on the page as a field.
    mMusicService.addMusic(song);
}
/**
 * Downloads the comments of one song page by page, hands every comment to
 * {@code mMusicService.addComment} and reports the total announced by the server.
 *
 * <p>The response used to read {@code $.total} is also the first page that gets stored, so
 * that page is requested only once. Paging stops early (keeping what was stored so far) when a
 * later page cannot be fetched or when the thread is interrupted while pausing between pages.
 *
 * @param page   the song page being processed (not used by this method)
 * @param songId id of the song whose comments are fetched
 * @param offset index of the first comment to fetch; advanced by {@code ONE_PAGE} per request
 * @return the value of {@code $.total} in the first response, or {@code -1} when that response
 *         is empty, is a 503 error page, or carries no numeric total
 */
private int getComment(Page page, String songId, int offset) {
    JSONObject pageJson = fetchCommentPage(songId, offset);
    if (pageJson == null) {
        return -1;
    }
    Object total = JSONPath.eval(pageJson, "$.total");
    if (!(total instanceof Number)) {
        return -1;
    }
    int commentCount = ((Number) total).intValue();

    while (offset < commentCount) {
        storeComments(songId, pageJson);
        offset += ONE_PAGE;
        if (offset >= commentCount) {
            break;
        }
        try {
            // Pause between requests to keep the request rate low.
            Thread.sleep(1000);
        } catch (InterruptedException e) {
            // Keep the interrupt visible to the caller and stop paging.
            Thread.currentThread().interrupt();
            break;
        }
        pageJson = fetchCommentPage(songId, offset);
        if (pageJson == null) {
            System.err.println("Comment paging stopped for song " + songId + " at offset " + offset);
            break;
        }
    }
    return commentCount;
}

/**
 * Requests one comment page and parses it.
 *
 * @return the parsed response, or {@code null} when the body is empty or is a 503 error page
 */
private JSONObject fetchCommentPage(String songId, int offset) {
    String body = NetEaseMusicUtils.crawlAjaxUrl(songId, offset);
    if (body == null || body.isEmpty() || body.contains("503 Service Temporarily Unavailable")) {
        return null;
    }
    return JSON.parseObject(body);
}

/** Turns the comments of one parsed page into Comment objects and passes each to the service. */
@SuppressWarnings("unchecked")
private void storeComments(String songId, JSONObject pageJson) {
    // Each JSONPath yields one list per attribute, aligned by comment index.
    List<Integer> commentIds = (List<Integer>) JSONPath.eval(pageJson, "$.comments.commentId");
    List<String> contents = (List<String>) JSONPath.eval(pageJson, "$.comments.content");
    List<Integer> likedCounts = (List<Integer>) JSONPath.eval(pageJson, "$.comments.likedCount");
    List<String> nicknames = (List<String>) JSONPath.eval(pageJson, "$.comments.user.nickname");
    List<Long> times = (List<Long>) JSONPath.eval(pageJson, "$.comments.time");
    if (contents == null) {
        return;
    }
    for (int i = 0; i < contents.size(); i++) {
        Comment comment = new Comment();
        comment.setCommentId(commentIds.get(i));
        comment.setSongId(songId);
        comment.setContent(NetEaseMusicUtils.filterEmoji(contents.get(i)));
        comment.setLikedCount(likedCounts.get(i));
        comment.setNickname(nicknames.get(i));
        comment.setTime(NetEaseMusicUtils.stampToDate(times.get(i)));
        mMusicService.addComment(comment);
    }
}
/**
 * Crawls from {@code START_URL} with the given processor and prints the elapsed time once
 * {@code Spider.run()} has returned.
 *
 * @param processor            the page processor the spider is created with
 * @param netEaseMusicPipeline not registered with the spider; results are stored directly by
 *                             the processor
 */
public void start(NetEaseMusicPageProcessor processor, NetEaseMusicPipeline netEaseMusicPipeline) {
    final long startedAt = System.currentTimeMillis();

    Spider spider = Spider.create(processor).addUrl(START_URL);
    spider.run();

    final long elapsed = System.currentTimeMillis() - startedAt;
    System.out.println("爬虫结束,耗时--->" + NetEaseMusicUtils.parseMillisecone(elapsed));
}
}
遇到的一个主要问题是评论的获取:网易对评论接口的请求参数进行了加密,这里参考了平胸小仙女在知乎回答中给出的Python版本来获取评论数据。具体实现在NetEaseMusicUtils.java中。
public class NetEaseMusicUtils {
/**
 * Posts the encrypted comment request for one song and returns the raw response body.
 *
 * <p>The request parameters are AES-encrypted twice (first with a fixed key, then with
 * {@code secKey}) and sent as {@code params}; {@code encSecKey} comes from
 * {@link #rsaEncrypt()}. The HTTP client and the response are closed automatically, also when
 * the request itself fails.
 *
 * @param songId id of the song whose comments are requested
 * @param offset index of the first comment to return; the page size is fixed at 20
 * @return the response body decoded as UTF-8, or an empty string when the request fails or
 *         the response has no entity
 */
public static String crawlAjaxUrl(String songId, int offset) {
    String plainParams = "{rid:\"\", offset:\"" + offset + "\", total:\"true\", limit:\"20\", csrf_token:\"\"}";
    try (CloseableHttpClient httpclient = HttpClients.createDefault()) {
        // A 16-character key that would normally be random; it is fixed here so that the
        // matching encSecKey can be a constant.
        String secKey = "FFFFFFFFFFFFFFFF";
        // Two AES passes: fixed key first, then secKey.
        String encText = aesEncrypt(aesEncrypt(plainParams, "0CoJUm6Qyw8W8jud"), secKey);
        String encSecKey = rsaEncrypt();

        HttpPost httpPost = new HttpPost("http://music.163.com/weapi/v1/resource/comments/R_SO_4_" + songId + "/?csrf_token=");
        httpPost.addHeader("Referer", NetEaseMusicPageProcessor.BASE_URL);
        List<NameValuePair> form = new ArrayList<NameValuePair>();
        form.add(new BasicNameValuePair("params", encText));
        form.add(new BasicNameValuePair("encSecKey", encSecKey));
        httpPost.setEntity(new UrlEncodedFormEntity(form, "utf-8"));

        try (CloseableHttpResponse response = httpclient.execute(httpPost)) {
            HttpEntity entity = response.getEntity();
            if (entity != null) {
                return EntityUtils.toString(entity, "utf-8");
            }
        }
    } catch (Exception e) {
        // Best effort: report the failure and let the caller see an empty body.
        e.printStackTrace();
    }
    return "";
}
/**
 * Encrypts a string with AES in CBC mode (PKCS5 padding, fixed IV "0102030405060708") and
 * returns the ciphertext as a single-line Base64 string.
 *
 * <p>The output contains no line breaks. This matters because the result of one call is
 * encrypted again by {@code crawlAjaxUrl}; a line-wrapping encoder would put line separators
 * into that second plaintext.
 *
 * @param src text to encrypt; encoded as UTF-8
 * @param key AES key; the callers in this class pass 16-character ASCII keys (AES-128)
 * @return Base64 encoding of the ciphertext, without line separators
 * @throws Exception if the cipher cannot be created or initialised, e.g. for a key whose
 *                   UTF-8 length is not a valid AES key size
 */
public static String aesEncrypt(String src, String key) throws Exception {
    String encodingFormat = "UTF-8";
    String iv = "0102030405060708";
    Cipher cipher = Cipher.getInstance("AES/CBC/PKCS5Padding");
    // Key and IV are converted with an explicit charset so the bytes do not depend on the
    // platform default.
    SecretKeySpec secretKeySpec = new SecretKeySpec(key.getBytes(encodingFormat), "AES");
    IvParameterSpec ivParameterSpec = new IvParameterSpec(iv.getBytes(encodingFormat));
    cipher.init(Cipher.ENCRYPT_MODE, secretKeySpec, ivParameterSpec);
    byte[] encrypted = cipher.doFinal(src.getBytes(encodingFormat));
    // java.util.Base64 replaces the JDK-internal sun.misc.BASE64Encoder, which is not
    // available on current JDKs and wraps its output at 76 characters.
    return java.util.Base64.getEncoder().encodeToString(encrypted);
}
/**
 * Returns the constant sent as the {@code encSecKey} form field by {@code crawlAjaxUrl}.
 *
 * <p>No encryption is performed here; the value is a fixed hex string (presumably the RSA
 * encryption of the fixed secKey "FFFFFFFFFFFFFFFF", which would explain the method name;
 * not verifiable from this code).
 *
 * @return the hard-coded encSecKey value
 */
public static String rsaEncrypt() {
    return "257348aecb5e556c066de214e531faadd1c55d814f9be95fd06d6bff9f4c7a41f831f6394d5a3fd2e3881736d94a02ca919d952872e7d0a50ebfa1769a7a62d512f5f1ca21aec60bc3819a9c3ffca5eca9a0dba6d6f7249b06f5965ecfff3695b54e1c28f3f624750ed39e7de08fc8493242e26dbc4484a01c76f739e135637c";
}
/**
 * Formats a duration in milliseconds as days, hours, minutes and seconds.
 *
 * <p>The day count is always present. Smaller units are appended from hours down to seconds,
 * and the text ends at the first unit after which nothing remains: exactly one day gives
 * "1天", exactly one hour gives "0天1时". Any remainder below one second still produces a
 * seconds field (500 ms gives "0天0时0分0秒").
 *
 * @param millisecond duration in milliseconds
 * @return the formatted duration
 */
public static String parseMillisecone(long millisecond) {
    final long perSecond = 1000;
    final long perMinute = perSecond * 60;
    final long perHour = perMinute * 60;
    final long perDay = perHour * 24;

    // What is left after removing whole days, then whole hours, then whole minutes.
    final long belowDay = millisecond % perDay;
    final long belowHour = belowDay % perHour;
    final long belowMinute = belowHour % perMinute;

    StringBuilder text = new StringBuilder();
    text.append(millisecond / perDay).append("天");
    if (belowDay == 0) {
        return text.toString();
    }
    text.append(belowDay / perHour).append("时");
    if (belowHour == 0) {
        return text.toString();
    }
    text.append(belowHour / perMinute).append("分");
    if (belowMinute == 0) {
        return text.toString();
    }
    text.append(belowMinute / perSecond).append("秒");
    return text.toString();
}
/**
 * Converts an epoch timestamp in milliseconds to text in the form {@code yyyy-MM-dd HH:mm:ss}.
 *
 * <p>The formatter is created per call with the JVM's default time zone and locale.
 *
 * @param s milliseconds since 1970-01-01T00:00:00Z
 * @return the formatted date and time
 */
public static String stampToDate(long s) {
    Date moment = new Date(s);
    return new SimpleDateFormat("yyyy-MM-dd HH:mm:ss").format(moment);
}
/**
 * Matches every character outside the Basic Multilingual Plane (U+10000 to U+10FFFF, which
 * is where most emoji live) as well as any unpaired surrogate. Compiled once because
 * filterEmoji is called for every comment.
 */
private static final java.util.regex.Pattern EMOJI_PATTERN =
        java.util.regex.Pattern.compile("[\\ud800\\udc00-\\udbff\\udfff\\ud800-\\udfff]");

/**
 * Replaces each emoji (any supplementary-plane character) and each unpaired surrogate with
 * a single {@code *}.
 *
 * @param source text to filter; may be {@code null}
 * @return the filtered text; {@code null} and empty input are returned unchanged
 */
public static String filterEmoji(String source) {
    // Whitespace-only text contains nothing to replace, so a null/empty guard is all that
    // is needed here.
    if (source == null || source.isEmpty()) {
        return source;
    }
    return EMOJI_PATTERN.matcher(source).replaceAll("*");
}
}
展示
爬取过程中1 爬取过程中2最后
运行NetEaseMusicApplication.java
localhost:8888/...
运行Log
之前评论是按每页50条爬取的,爬了4万多条就出现503了,于是改成每页20条并配置了代理。配置代理:ProxyProvider
网友评论
msg 上写着Cheating,是不是猜出你是爬虫了,把你封了,要不先换个网络(比如现在是家的IP,换到公司的IP,记得讯代理加白名单),然后每页爬取数据量变少 ,再试试
public static final String START_URL = "http://music.163.com/playlist?id=148174530";
可换成任何歌单的id
localhost:8888/...
嗯,还没下班= =