美文网首页
node 爬虫

node 爬虫

作者: 秋天de童话 | 来源:发表于2018-09-03 23:40 被阅读20次

clawer.js

const fs=require('fs');
const Mysql=require('mysql-pro');

// Database handle used for the bulk import.
// Every connection setting can be overridden through an environment variable
// so that credentials do not have to be edited in (or committed with) the
// source; when a variable is unset or empty the original hard-coded value is
// used, so the default behavior is unchanged.
const db=new Mysql({
  mysql: {
    host: process.env.MYSQL_HOST || 'localhost',
    // Environment values are strings; fall back to 3306 if unset or not numeric.
    port: Number(process.env.MYSQL_PORT) || 3306,
    user: process.env.MYSQL_USER || 'root',
    password: process.env.MYSQL_PASSWORD || 'admin',
    database: process.env.MYSQL_DATABASE || 'zhihu'
  }
});


// Crawled input: the `.topics` file is read synchronously and parsed as JSON.
const arr=JSON.parse(fs.readFileSync('.topics', 'utf8'));

// Lookup tables filled while the questions are normalized, each declared next
// to a numeric ID counter that starts at 1.
let topics={};
let topic_ID=1;

let authors={};
let author_ID=1;

let questions={};
let question_ID=1;

let answers={};
let answer_ID=1;

// Normalize every crawled question in place and register its topics, authors
// and answers in the shared lookup tables (`topics`, `authors`, `questions`,
// `answers`), handing out sequential numeric IDs from the matching counters.
arr.forEach(question=>{
  // Own-property test, so that a key such as "constructor" or "toString" is
  // not mistaken for an existing entry inherited from Object.prototype.
  const seen=(table, key)=>Object.prototype.hasOwnProperty.call(table, key);

  //topic
  // Replace the topic objects by a comma-separated list of topic IDs; a topic
  // title gets a new ID the first time it is encountered.
  // NOTE(review): the input key is spelled `topices`; presumably that matches
  // the crawler output — confirm before renaming.
  question.topics=question.topices.map(({title})=>{
    const name=title.trim();

    if(!seen(topics, name)){
      topics[name]=topic_ID++;
    }

    return topics[name];
  }).join(',');

  //question
  const ID=question_ID++;
  questions[ID]=question;

  //authors + answers (the best answer first, then the remaining answers)
  [question.bestAnswer, ...question.answers].forEach(answer=>{
    const {author}=answer;
    const old_id=author.id;

    // `authors` is keyed by the crawled id; the stored object has its `id`
    // overwritten with the new numeric ID taken from the author counter
    // (not the question counter).
    if(!seen(authors, old_id)){
      authors[old_id]=author;
      author.id=author_ID++;
    }

    // Replace the embedded author object by a reference. The ID is read from
    // the registered entry so that an author seen before resolves to the
    // numeric ID assigned on first sight rather than to the crawled id.
    delete answer.author;
    answer.author_ID=authors[old_id].id;

    answer.id=answer_ID;
    answer.question_ID=ID;
    answers[answer_ID++]=answer;
  });
});

// Build one bulk INSERT statement per table from the in-memory lookup tables
// (`topics`, `authors`, `questions`, `answers`) and run them inside a single
// transaction on `db`.
(async()=>{
  /**
   * Render the given values as one SQL row literal: ('v1','v2',...).
   *
   * null/undefined become an empty string; every other value is stringified,
   * so a legitimate 0 is kept as '0'. Backslashes and single quotes are
   * backslash-escaped so that text cannot terminate the quoted literal early
   * (a value ending in "\" would otherwise swallow the closing quote).
   *
   * @param {...*} args column values, in table column order
   * @returns {string} the parenthesized, quoted, comma-separated row
   */
  function dataJoin(...args){
    return "('"+args.map(item=>{
      const text=item==null ? '' : String(item);

      return text.replace(/[\\']/g, '\\$&');
    }).join("','")+"')";
  }

  // An empty string is stored as -1. The comparison is strict on purpose:
  // with `==` a real 0 would also compare equal to '' and be rewritten.
  const orUnknown=value=>value==='' ? -1 : value;

  // Bulk INSERT for one table, or null when there are no rows (an INSERT
  // with an empty VALUES list is not valid SQL).
  const insertSql=(table, rows)=>rows.length ? `INSERT INTO ${table} VALUES${rows.join(',')}` : null;

  //topics
  const aTopics=Object.keys(topics).map(title=>dataJoin(topics[title], title));

  //authors
  const aAuthors=Object.keys(authors).map(key=>{
    const author=authors[key];

    return dataJoin(author.id, author.type, author.name, orUnknown(author.gender), author.userType, author.img_url, author.headline, orUnknown(author.followerCount));
  });

  //questions
  const aQuestions=Object.keys(questions).map(ID=>{
    const question=questions[ID];

    return dataJoin(ID, question.title, question.question_content, question.topics, question.attention_count, question.view_count, question.bestAnswer.id);
  });

  //answers
  const aAnswers=Object.keys(answers).map(ID=>{
    const answer=answers[ID];

    return dataJoin(ID, answer.question_ID, answer.author_ID, answer.content, answer.createdTime);
  });

  // Same table order as before: topics, authors, questions, answers.
  const statements=[
    insertSql('topic_table', aTopics),
    insertSql('author_table', aAuthors),
    insertSql('question_table', aQuestions),
    insertSql('answer_table', aAnswers)
  ].filter(Boolean);

  try{
    await db.startTransaction();
    for(const sql of statements){
      await db.executeTransaction(sql);
    }
    await db.stopTransaction();

    console.log('完成');
  }catch(err){
    // Report the failure instead of leaving an unhandled promise rejection.
    // NOTE(review): no explicit rollback is issued here; verify how mysql-pro
    // treats a transaction whose statement failed.
    console.error(err);
    process.exitCode=1;
  }
})();

相关文章

  • node爬虫之路(一)

    最近对爬虫很感兴趣,而用 node.js 也是可以写爬虫的。所以写一个node爬虫系列,记录我的爬虫之路,感兴趣的同学可...

  • node爬虫快速入门

    node爬虫 初入前端,刚刚接触node,对于耳闻已久的node爬虫非常神往,所以有了这篇文章,项目代码在文章末尾...

  • node入门场景之——爬虫

    边做边学效率更高,爬虫是node的适用场景之一,关于爬虫的另一篇文章node爬虫进阶之——登录为了验证“经验总结、...

  • node 爬虫

    clawer.js

  • node爬虫

    node爬虫用到的第三方模块 Cheerio 服务端的jQuery https://segmentfault.c...

  • Node爬虫

    使用cheerio爬虫模块抓取页面后获取元素信息跟jQuery基本一样

  • node爬虫

    /** 教程:https://blog.csdn.net/Qc1998/article/details/83154...

  • node 爬虫

  • node爬虫

    声明:所有文章都是转载整理的,只是为了自己学习,方便自己观看,如有侵权,请立即联系我,谢谢~ Node.js的学习...

  • node爬虫

    以下代码爬取豆瓣电影网的数据并且写入数据库首先安装cheerio和mysql

网友评论

      本文标题:node 爬虫

      本文链接:https://www.haomeiwen.com/subject/ycswwftx.html