美文网首页python爬虫呆鸟的Python数据分析
小猪的Python学习之旅 —— 20.抓取Gank.io所有

小猪的Python学习之旅 —— 20.抓取Gank.io所有

作者: coder_pig | 来源:发表于2018-05-17 14:15 被阅读41次

    一句话概括本文

    内容较多,建议先mark后看,讲解了一波MySQL安装,基本操作,语法速成,DataGrip使用,
    连接远程数据库问题,爬取Gank.io API接口,存储到数据库,还有遇到的三个问题。

    image
    # 抓取Gank.io所有文章的爬虫
    
    import pymysql
    import requests as rq
    import urllib
    import coderpig_n as cn
    
    # Base URL that every crawl request in this script is built from.
    gank_api = "http://gank.io/api/data/"

    # Table name of each category. Entry i here pairs with entry i of type_list:
    # Android, iOS, rest videos, pictures, extra resources, front end,
    # random recommendations, App.
    category_list = [
        "android",
        "ios",
        "video",
        "meizi",
        "other",
        "fed",
        "random",
        "app",
    ]
    # Category names as they appear in the request URL and in each item's "type".
    type_list = [
        "Android",
        "iOS",
        "休息视频",
        "福利",
        "拓展资源",
        "前端",
        "瞎推荐",
        "App",
    ]
    # Column names, in the order used when building INSERT statements.
    column_list = (
        '_id',
        'dsec',
        'images',
        'url',
        'type',
    )
    
    
    def init_db():
        """Create one table per entry of ``category_list`` if it is missing.

        Every table gets the columns ``_id`` (primary key), ``dsec``,
        ``images``, ``url`` and ``type``.  Table creation stays best-effort:
        a failure is printed instead of raised, and the connection is closed
        whether or not an error occurred.
        """
        db = pymysql.connect(host='localhost', user='root', password='zpj12345', port=3306, db='gank', charset="utf8")
        try:
            cursor = db.cursor()
            for category in category_list:
                # A table name cannot be bound as a query parameter, so it is
                # formatted in; the names come from the fixed category_list.
                sql = "CREATE TABLE IF NOT EXISTS %s (" \
                      "_id  VARCHAR(50) NOT NULL," \
                      "dsec TEXT," \
                      "images  TEXT," \
                      "url  TEXT," \
                      "type VARCHAR(50)  DEFAULT ''," \
                      "PRIMARY KEY (_id))" % category
                cursor.execute(sql)
        except Exception as e:
            # Report the problem rather than silently discarding it.
            print(str(e))
        finally:
            db.close()
    
    
    class Gank:
        """Record with five fields: ``_id``, ``dsec``, ``images``, ``url``, ``type``."""

        # Class-level defaults: each field is an empty string.
        _id = ''
        dsec = ''
        images = ''
        url = ''
        type = ''

        def __init__(self, _id, dsec, images, url, type):
            """Store the five given values on the instance."""
            (self._id, self.dsec, self.images,
             self.url, self.type) = (_id, dsec, images, url, type)

        def to_value_tuple(self):
            """Return the fields as the tuple (_id, dsec, images, url, type)."""
            values = (self._id, self.dsec, self.images, self.url, self.type)
            return values
    
    
    def insert_db(gank_list):
        """Insert a batch of records, each into the table of its category.

        Records whose ``type`` is not in ``type_list`` are skipped.  The batch
        is committed as a whole; on any error the message is printed and the
        whole batch is rolled back.  The connection is closed in every case.

        :param gank_list: iterable of objects exposing a ``type`` attribute and
            a ``to_value_tuple()`` method whose values follow ``column_list``.
        """
        db = pymysql.connect(host='localhost', user='root', password='zpj12345', port=3306, db='gank', charset="utf8")
        try:
            cursor = db.cursor()
            try:
                for data in gank_list:
                    if data.type not in type_list:
                        continue
                    # type_list and category_list are parallel lists.
                    category = category_list[type_list.index(data.type)]
                    data_tuple = data.to_value_tuple()
                    # Only the table and column names are formatted in; the
                    # values are passed separately as bound parameters.
                    sql = 'INSERT INTO {table}({keys}) VALUES ({values})'.format(
                        table=category,
                        keys=','.join(column_list),
                        values=','.join(['%s'] * len(data_tuple)))
                    cursor.execute(sql, data_tuple)
                    print(data_tuple)
                db.commit()
            except Exception as e:
                print(str(e))
                db.rollback()
        finally:
            # Runs even when cursor creation or the rollback itself fails.
            db.close()
    
    
    def spider_data(pos):
        """Crawl every page of one category and store the results.

        Pages of 50 items are requested starting at page 1.  Crawling stops
        when a response is not HTTP 200 or a page has no results; each
        non-empty page is converted to ``Gank`` objects and passed to
        ``insert_db``.

        :param pos: index into ``type_list`` selecting the category to crawl.
        """
        # Imported explicitly: a bare ``import urllib`` does not guarantee that
        # the ``urllib.parse`` submodule has been loaded.
        from urllib.parse import quote

        count = 1
        while True:
            resp = rq.get(gank_api + quote(type_list[pos]) + "/50/" + str(count), proxies=cn.get_proxy_ip())
            print(resp.url)
            # Check the status before decoding: an error page need not be JSON.
            if resp.status_code != 200:
                break
            results = resp.json().get('results')
            if not results:
                break
            json_list = []
            for result in results:
                # Only the first image is kept; a missing, None or empty
                # "images" value becomes an empty string.
                images = result.get('images')
                first_image = images[0] if images else ''
                gank = Gank(result['_id'], result['desc'], first_image, result.get('url', ''),
                            result['type'])
                json_list.append(gank)
            insert_db(json_list)
            count += 1
    
    
    if __name__ == '__main__':
        # Create the tables, then crawl every category in type_list order.
        init_db()
        for i in range(len(type_list)):
            spider_data(i)

        # Read back the android table and print its row count and rows.
        db = pymysql.connect(host='localhost', user='root', password='zpj12345', port=3306, db='gank', charset="utf8")
        try:
            cursor = db.cursor()
            try:
                cursor.execute('SELECT * FROM android')
                print(cursor.rowcount)
                for result in cursor.fetchall():
                    print(result)
            finally:
                cursor.close()
        finally:
            # The connection is released even if the query fails.
            db.close()
    
    

    来啊,Py交易啊

    想加群一起学习Py的可以加下,智障机器人小Pig,验证信息里包含:
    Python、python、py、Py、加群、交易、屁眼 中的一个关键词即可通过;

    image

    验证通过后回复 加群 即可获得加群链接(不要把机器人玩坏了!!!)~~~
    欢迎各种像我一样的Py初学者,Py大神加入,一起愉快地交流学♂习,van♂转py。


    相关文章

      网友评论

        本文标题:小猪的Python学习之旅 —— 20.抓取Gank.io所有

        本文链接:https://www.haomeiwen.com/subject/ivybdftx.html