美文网首页
Pyspider和pymysql简单使用实例

Pyspider和pymysql简单使用实例

作者: Yuu_CX | 来源:发表于2018-01-08 17:55 被阅读0次
    #!/usr/bin/env python
    # -*- encoding: utf-8 -*-
    # Created on 2018-01-08 10:21:36
    # Project: newv2ex
    
    from pyspider.libs.base_handler import *
    import pymysql
    import random
    
    
    class Handler(BaseHandler):
        """pyspider handler: crawl v2ex.com topic pages and save them to MySQL."""

        crawl_config = {
        }

        def add_question(self, title, content, comment_count=2):
            """Insert one row into the `question` table.

            comment_count defaults to 2, preserving the previously hard-coded
            value, so existing two-argument callers keep working. Rolls back
            (and stays silent) on database errors, matching the original
            best-effort behavior.
            """
            db = pymysql.connect(host="localhost", user="root", password="root",
                                 db="club", charset="utf8")
            try:
                cursor = db.cursor()
                # Driver-side parameter binding: the old %-formatted SQL was
                # injectable and broke whenever title/content contained a quote.
                sql = ('INSERT INTO question (title, content, user_id, created_date, comment_count) '
                       'VALUES (%s, %s, %s, now(), %s)')
                cursor.execute(sql, (title, content, random.randint(1, 10), comment_count))
                print(cursor.lastrowid)
                db.commit()
            except pymysql.MySQLError:
                db.rollback()
            finally:
                db.close()  # the original never closed the connection (one leak per call)

        @every(minutes=24 * 60)
        def on_start(self):
            """Entry point: seed the crawl with the v2ex front page once a day."""
            self.crawl('https://www.v2ex.com/', callback=self.index_page, validate_cert=False)

        @config(age=10 * 24 * 60 * 60)
        def index_page(self, response):
            """Follow every tab link on the front page."""
            for each in response.doc('a[href^="https://www.v2ex.com/?tab="]').items():
                self.crawl(each.attr.href, callback=self.tab_page, validate_cert=False)

        @config(priority=2)
        def tab_page(self, response):
            """Follow every node ("go") link on a tab page."""
            for each in response.doc('a[href^="https://www.v2ex.com/go/"]').items():
                self.crawl(each.attr.href, callback=self.board_page, validate_cert=False)

        @config(priority=2)
        def board_page(self, response):
            """Queue every topic on a node page, plus its pagination links."""
            for each in response.doc('a[href^="https://www.v2ex.com/t/"]').items():
                url = each.attr.href
                if url.find('#reply') > 0:
                    # Strip the "#replyN" fragment so the same topic is not
                    # queued once per reply count.
                    url = url[0:url.find('#')]
                # Bug fix: the original crawled each.attr.href here, discarding
                # the de-duplicated `url` computed above.
                self.crawl(url, callback=self.detail_page, validate_cert=False)
            for each in response.doc('a.page_normal').items():
                self.crawl(each.attr.href, callback=self.board_page, validate_cert=False)

        @config(priority=2)
        def detail_page(self, response):
            """Extract title/content from a topic page, store them, return a summary."""
            title = response.doc('h1').text()
            content = response.doc('div.topic_content').text()
            # insert into MySQL
            self.add_question(title, content)
            return {
                "url": response.url,
                "title": title,
                "content": content,
            }
    
    #!/usr/bin/env python
    # -*- encoding: utf-8 -*-
    # Created on 2018-01-08 20:58:58
    # Project: newzhihu
    
    from pyspider.libs.base_handler import *
    import pymysql
    import random
    
    class Handler(BaseHandler):
        """pyspider handler: crawl Zhihu top answers, save question + answers to MySQL."""

        crawl_config = {
            'headers': {
                'User-Agent': 'GoogleBot',
            }
        }

        def add_question(self, title, content, comment_count):
            """Insert one row into `question` and return its auto-increment id.

            Returns 0 when the insert fails (the transaction is rolled back).
            """
            db = pymysql.connect(host="localhost", user="root", password="root",
                                 db="club", charset="utf8")
            try:
                cursor = db.cursor()
                # Driver-side parameter binding: the old %-formatted SQL was
                # injectable and broke on quotes inside the scraped text.
                sql = ('INSERT INTO question (title, content, user_id, created_date, comment_count) '
                       'VALUES (%s, %s, %s, now(), %s)')
                cursor.execute(sql, (title, content, random.randint(1, 10), comment_count))
                qid = cursor.lastrowid
                db.commit()
                print(qid)
                return qid
            except pymysql.MySQLError:
                db.rollback()
                return 0
            finally:
                db.close()  # the original never closed the connection

        def add_comment(self, qid, comment):
            """Insert one answer as a comment row attached to question `qid`."""
            db = pymysql.connect(host="localhost", user="root", password="root",
                                 db="club", charset="utf8")
            try:
                cursor = db.cursor()
                # Parameterized for the same injection/quoting reasons as add_question.
                sql = ('insert into comment(content, entity_type, entity_id, user_id, created_date) '
                       'values (%s, %s, %s, %s, now())')
                cursor.execute(sql, (comment, 1, qid, random.randint(1, 10)))
                db.commit()
            except pymysql.MySQLError:
                db.rollback()
            finally:
                db.close()

        @every(minutes=24 * 60)
        def on_start(self):
            """Entry point: seed the crawl with a Zhihu topic's top answers once a day."""
            self.crawl('https://www.zhihu.com/topic/19550517/top-answers', callback=self.index_page, validate_cert=False)

        @config(age=10 * 24 * 60 * 60)
        def index_page(self, response):
            """Queue every question link on the listing page, plus pagination links."""
            for each in response.doc('a.question_link').items():
                self.crawl(each.attr.href, callback=self.detail_page, validate_cert=False)
            for each in response.doc('div.zm-invite-pager span a').items():
                self.crawl(each.attr.href, callback=self.index_page, validate_cert=False)

        @config(priority=2)
        def detail_page(self, response):
            """Extract the question and all answers, store them, return a summary."""
            # Materialize the answers once: the original consumed a generator
            # just to count it, then re-queried the same selector to iterate.
            answers = list(response.doc('span.RichText.CopyrightRichText-richText').items())
            title = response.doc('h1.QuestionHeader-title').text()
            content = response.doc('div.QuestionHeader-detail').text()

            qid = self.add_question(title, content, len(answers))
            for answer in answers:
                self.add_comment(qid, answer.text())

            return {
                "url": response.url,
                "title": title,
                "content": content,
            }
    

    相关文章

      网友评论

          本文标题:Pyspider和pymysql简单使用实例

          本文链接:https://www.haomeiwen.com/subject/cwnbnxtx.html