
Packet Capture: Scraping Category Data

By LuckTime | Published 2016-10-25 11:27

https://www.hardtofind.com.au/


1. Collect the category links, put them into an array, and save them to a file.
2. Fetch each category's page data and save it to a CSV file.
3. Read back each category's product links and fetch the products under each link (see the sketch after this list).
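
Step 3 can be driven from the link file produced in step 2. Below is a minimal sketch (not part of the original script) of reading the saved page URLs back with unicodecsv, assuming the two-column layout (page index, page URL) that the scraper below writes to hrefhardtofind.csv:

import unicodecsv

# read back the (page index, category page URL) rows written by the scraper
f = open('hrefhardtofind.csv', 'rb')
reader = unicodecsv.reader(f, encoding='utf-8')
category_pages = [row[1] for row in reader if len(row) >= 2]
f.close()

for url in category_pages:
    print url  # fetch and parse each category page from here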

Import the required modules.

from selenium import webdriver
# base64 for encoding / decoding strings if needed
import base64

# BeautifulSoup for parsing the downloaded HTML
from bs4 import BeautifulSoup

import time
# sleep() for throttling requests, e.g. time.sleep(0.1)
from time import sleep

import random
# re for regex matching, e.g. match = re.search(r'[\w.-]+@[\w.-]+', Fi_Email)
import re
import csv
import unicodecsv
from cStringIO import StringIO
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities

import json

import urllib2

from lxml import etree

Image URL analysis:
Large image: https://res.cloudinary.com/hardtofind/image/upload/c_pad,h_580,w_580/b_rgb:ffffff,h_580,w_580/cs_srgb,f_auto,fl_lossy/v1/product_image/santa_friends
Small image: https://res.cloudinary.com/hardtofind/image/upload/c_pad,h_235,w_235/b_rgb:ffffff,h_235,w_235/cs_srgb,f_auto,fl_lossy/v1/product_image/santa_friends
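
The two addresses differ only in the Cloudinary size parameters (h_235,w_235 vs. h_580,w_580), so the large image URL can be derived from the small one. A minimal sketch, based only on the two example URLs above (other products may use different transform parameters):

# derive the 580x580 image URL from the 235x235 one by swapping the size parameters
def to_large(small_url):
    return small_url.replace("h_235,w_235", "h_580,w_580")

small = ("https://res.cloudinary.com/hardtofind/image/upload/"
         "c_pad,h_235,w_235/b_rgb:ffffff,h_235,w_235/"
         "cs_srgb,f_auto,fl_lossy/v1/product_image/santa_friends")
print to_large(small)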

Implementation

#!/usr/local/bin/python
# -*- coding: utf8 -*-
# Filename: hard.py
# Purpose: scrape image, title, price and category


from selenium import webdriver
from bs4 import BeautifulSoup
import urllib2
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities

import time
import csv
import unicodecsv
import codecs
dcap = dict(DesiredCapabilities.PHANTOMJS)
dcap["phantomjs.page.settings.userAgent"] = (
    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/53 "
    "(KHTML, like Gecko) Chrome/15.0.87"
)
driver = webdriver.PhantomJS(desired_capabilities=dcap)

# step 1 get all class
hrefarray = ['gifts','home-garden','prints-art','fashion','jewellery','men','kids','weddings']
for href in hrefarray:
    page = -1
    while page < 200:
        page = page + 1
        strPageIndex = str(page)
# step 2 get all href
        href_a = "https://www.hardtofind.com.au/categories/"+href+"?page="+strPageIndex
        print "======"+strPageIndex+"==="+href
        csvPageindexfile = open('hrefhardtofind.csv', 'a+')
        writer = unicodecsv.writer(csvPageindexfile, encoding='utf-8')
        hrefdata = [(strPageIndex, href_a)]
        writer.writerows(hrefdata)
        csvPageindexfile.close()
        content = urllib2.urlopen(href_a).read()
        s = BeautifulSoup(content,"html5lib")
        nav_Data = s.find("div", {"id": "products"})
        roothref_data = nav_Data.findAll("div", {"class": "sale-item-text-wrap"})
        index = 0
        maxindex = len(roothref_data) - 1

        for webhref in roothref_data:
            if (index >= maxindex):
                break
            index = index + 1
            lablehref = webhref.find("a",href=True)
            page_href =  lablehref["href"]
            threecontent = urllib2.urlopen(page_href).read()
            s = BeautifulSoup(threecontent,"html5lib")
            page_form = s.find("form",{"id":"form_add_to_cart"})
            page_title = page_form.find("div",{"class":"title"})
            title = page_title.h1.string
            print title
            price = page_title.find("span",{"itemprop":"price"}).string
            print price
            page_image = s.find("div",{"class":"galleryContainer"})
            page_image_a = page_image.findAll("ul",{"id":"imageGallery"})
            print page_image_a[0].findAll("a")
            page_class = s.find("div",{"class":"pagination"})
            classname = page_class.get_text().strip().encode('utf-8')
            classname = classname.replace(' ', '').replace('\n','>')
            print classname
# step 3 get all content
            csvfile = open('hardtofind_com_au.csv', 'a+')
            # use unicodecsv so non-ASCII titles are written out as UTF-8
            writer = unicodecsv.writer(csvfile, encoding='utf-8')
# step 4  save to csv
            db = (title, price, classname, page_image_a)
            data = [db]
            writer.writerows(data)
            csvfile.close()
            # break



    # pause briefly before moving on to the next category
    time.sleep(3)
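
The script above writes page_image_a (a list of BeautifulSoup tags) straight into the CSV, so the image column ends up holding raw HTML. A possible refinement (a sketch, not part of the original post) is to pull the href values out of the gallery links first, assuming the image links sit on the <a> tags inside ul#imageGallery, as the lookups above suggest:

# collect plain image URLs from the product gallery instead of raw tags
def gallery_urls(soup):
    gallery = soup.find("ul", {"id": "imageGallery"})
    if gallery is None:
        return []
    return [a["href"] for a in gallery.findAll("a", href=True)]

# inside the product loop:
#   image_urls = gallery_urls(s)
#   db = (title, price, classname, "|".join(image_urls))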


