5 - Case Study: Crawling Taobao Product Pages

Author: 撸撸很乖张 | Published 2018-08-10 02:51
import requests
import re
import pymysql
import time


def getHTMLText(url):
    # Fetch a page and return its text, or an empty string on any request error.
    try:
        r = requests.get(url, timeout=30)
        r.raise_for_status()
        r.encoding = r.apparent_encoding
        return r.text
    except requests.RequestException:
        return ""


def parsePage(ilt, html):
    # Prices and titles sit in a JSON blob embedded in the search page.
    # Capture the values directly so no eval() is run on scraped text.
    try:
        plt = re.findall(r'"view_price":"([\d.]*)"', html)
        tlt = re.findall(r'"raw_title":"(.*?)"', html)
        for price, title in zip(plt, tlt):
            ilt.append([price, title])
    except Exception as e:
        print("parse error:", e)


def deposit(ilt, cursor, db):
    # Insert every (price, title) pair into the `computer` table,
    # committing once per page and rolling back on any database error.
    try:
        sql = "INSERT INTO `computer` (`title`, `price`) VALUES (%s, %s)"
        for price, title in ilt:
            cursor.execute(sql, (title, price))
        db.commit()
    except pymysql.MySQLError:
        db.rollback()


def main():
    # Connect to the local MySQL `spider` database (keyword arguments are
    # required by current pymysql; charset replaces the old set_charset call).
    db = pymysql.connect(host='127.0.0.1', user='root', password='123456',
                         database='spider', charset='utf8')
    cursor = db.cursor()
    goods = '台式机'   # search keyword: desktop PCs
    depth = 99          # number of result pages to crawl
    start_url = 'https://s.taobao.com/search?q=' + goods
    infoList = []
    for i in range(depth):
        try:
            # Taobao pages results 44 items at a time via the `s` offset parameter.
            url = start_url + '&s=' + str(44 * i)
            html = getHTMLText(url)
            parsePage(infoList, html)
            deposit(infoList, cursor, db)
            infoList = []
        except Exception:
            continue
        time.sleep(3)   # pause between requests
        print(i)
    db.close()


if __name__ == '__main__':
    main()
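The script above walks Taobao search result pages for the keyword '台式机' (44 items per page via the `s` offset parameter), extracts each item's view_price and raw_title with regular expressions, and writes the pairs into a MySQL table named `computer`. That table has to exist before the first run; the post does not show its schema, so here is a minimal setup sketch (column names follow the INSERT statement above, while the types, sizes, and the auto-increment id are assumptions):

import pymysql

# Assumed schema for the `computer` table used by deposit();
# adjust column types and sizes to your needs.
db = pymysql.connect(host='127.0.0.1', user='root', password='123456',
                     database='spider', charset='utf8')
cursor = db.cursor()
cursor.execute("""
    CREATE TABLE IF NOT EXISTS `computer` (
        `id`    INT AUTO_INCREMENT PRIMARY KEY,
        `title` VARCHAR(255) NOT NULL,
        `price` VARCHAR(32)  NOT NULL
    )
""")
db.commit()
db.close()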
