# 爬取网址: https://www.taobao.com/
# 爬取信息: 商品价格、付款人数、商品名称、商家名称和地址
# 存储方式: MongoDB 存储
# 淘宝的商品信息是采用 Ajax 动态加载的, 所以使用 PhantomJS 能自动加载内容, 省去了分析构造网页的步骤。
from selenium import webdriver
from bs4 import BeautifulSoup
import pymongo
import time
# MongoDB setup: connect to a local server (default port 27017) and use
# the "taobao_renaiping" collection of database "mydb" to store the
# scraped listings (one document per item, inserted in get_info).
client = pymongo.MongoClient('localhost',27017)
mydb = client['mydb']
taobao_rnp = mydb['taobao_renaiping']
def search_goods(word):
    """Open the Taobao home page, search for *word* and return the URL
    of the first result page.

    Relies on the module-level ``driver`` created in the ``__main__``
    block.  ``implicitly_wait`` gives the Ajax-loaded page time to
    settle before elements are queried.
    """
    url = "https://www.taobao.com/"
    driver.get(url)
    driver.implicitly_wait(10)
    # "q" is the search input box; look it up once instead of twice.
    search_box = driver.find_element_by_id("q")
    search_box.clear()
    search_box.send_keys(word)
    driver.find_element_by_class_name("btn-search").click()
    return driver.current_url
def get_info(url):
    """Scrape one search-result page at *url* and store each listing
    in MongoDB.

    For every item container (``div.item.J_MouserOnverReq``) the goods
    title, price, deal count, shop name and shop location are extracted
    and inserted as one document into the ``taobao_renaiping``
    collection via the module-level ``taobao_rnp`` handle.
    """
    driver.get(url)
    driver.implicitly_wait(10)
    soup = BeautifulSoup(driver.page_source, "lxml")
    infos = soup.select("div.item.J_MouserOnverReq")
    for info in infos:
        goods = info.select("div.row a")[0].text.strip()
        price = info.select("div.price > strong")[0].text
        sell = info.select("div.deal-cnt")[0].text
        # [1]: the first span inside a.shopname is an icon, the second
        # holds the shop name — presumably; verify against live markup.
        shop = info.select("a.shopname > span")[1].text
        city = info.select("div.location")[0].text
        detail = {
            '商品': goods,
            '价格': price,
            '付款人数': sell,
            '商店名': shop,
            '城市': city,
        }
        taobao_rnp.insert_one(detail)
def get_nextpage(url):
    """Load *url*, click the "next page" link and return the new URL.

    Uses the module-level ``driver``; the short sleep gives the next
    page time to load before its URL is read back.
    """
    driver.get(url)
    driver.implicitly_wait(10)
    driver.find_element_by_css_selector("a[trace=srp_bottom_pagedown]").click()
    time.sleep(2)  # wait for navigation so current_url is the new page
    return driver.current_url
if __name__ == "__main__":
    # NOTE(review): PhantomJS support was removed from recent Selenium
    # releases; a headless Chrome/Firefox driver is the modern choice.
    driver = webdriver.PhantomJS()
    # driver = webdriver.Chrome()
    driver.maximize_window()
    url = search_goods("热奶瓶")
    get_info(url)
    # Bug fix: advance from the page just scraped instead of always
    # passing the original search URL — the old code re-fetched page 2
    # on every one of the 99 iterations.
    for _ in range(99):
        url = get_nextpage(url)
        get_info(url)
# 网友评论 (blog-page residue from the original article; not part of the script)