5 - Case Study: Crawling Taobao Product Pages

Author: 撸撸很乖张 | Published 2018-08-10 02:51
import requests
import re
import pymysql
import time


def getHTMLText(url):
    # Fetch a page and return its text, or an empty string on any request error.
    try:
        r = requests.get(url, timeout=30)
        r.raise_for_status()
        r.encoding = r.apparent_encoding
        return r.text
    except requests.RequestException:
        return ""


def parsePage(ilt, html):
    # Prices and titles sit in a JSON blob embedded in the search page.
    # Capture the values directly so no eval() is run on scraped text.
    try:
        plt = re.findall(r'"view_price":"([\d.]*)"', html)
        tlt = re.findall(r'"raw_title":"(.*?)"', html)
        for price, title in zip(plt, tlt):
            ilt.append([price, title])
    except Exception as e:
        print("parse error:", e)


def deposit(ilt, cursor, db):
    # Insert every (price, title) pair into the `computer` table,
    # committing once per page and rolling back on any database error.
    try:
        sql = "INSERT INTO `computer` (`title`, `price`) VALUES (%s, %s)"
        for price, title in ilt:
            cursor.execute(sql, (title, price))
        db.commit()
    except pymysql.MySQLError:
        db.rollback()


def main():
    # Connect to the local MySQL `spider` database (keyword arguments are
    # required by current pymysql; charset replaces the old set_charset call).
    db = pymysql.connect(host='127.0.0.1', user='root', password='123456',
                         database='spider', charset='utf8')
    cursor = db.cursor()
    goods = '台式机'   # search keyword: desktop PCs
    depth = 99          # number of result pages to crawl
    start_url = 'https://s.taobao.com/search?q=' + goods
    infoList = []
    for i in range(depth):
        try:
            # Taobao pages results 44 items at a time via the `s` offset parameter.
            url = start_url + '&s=' + str(44 * i)
            html = getHTMLText(url)
            parsePage(infoList, html)
            deposit(infoList, cursor, db)
            infoList = []
        except Exception:
            continue
        time.sleep(3)   # pause between requests
        print(i)
    db.close()


if __name__ == '__main__':
    main()
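The script above walks Taobao search result pages for the keyword '台式机' (44 items per page via the `s` offset parameter), extracts each item's view_price and raw_title with regular expressions, and writes the pairs into a MySQL table named `computer`. That table has to exist before the first run; the post does not show its schema, so here is a minimal setup sketch (column names follow the INSERT statement above, while the types, sizes, and the auto-increment id are assumptions):

import pymysql

# Assumed schema for the `computer` table used by deposit();
# adjust column types and sizes to your needs.
db = pymysql.connect(host='127.0.0.1', user='root', password='123456',
                     database='spider', charset='utf8')
cursor = db.cursor()
cursor.execute("""
    CREATE TABLE IF NOT EXISTS `computer` (
        `id`    INT AUTO_INCREMENT PRIMARY KEY,
        `title` VARCHAR(255) NOT NULL,
        `price` VARCHAR(32)  NOT NULL
    )
""")
db.commit()
db.close()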
