一,结果
image.png
二,思路
三,上源码
from selenium import webdriver
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
import time
from bs4 import BeautifulSoup
import requests
import re
import pymysql
# Connect to MySQL and make sure the destination table exists.
# NOTE(review): every 'x' below (and the bare name x for the port) is a
# redacted placeholder and must be replaced with real connection settings.
conn = pymysql.Connect(host='x',user='x',password='x',port=x,database='x',charset='x')
# The prompt asks for a "database name", but the value is used as the name
# of the table that the scraped rows are written to.
dataname = input('请输入数据库名称:')
# Identifiers cannot be bound as query parameters, so the name has to be
# spliced into the statement text.  Restrict it to word characters (and '$')
# so that user input can never inject additional SQL.
if not re.fullmatch(r'[\w$]+', dataname):
    conn.close()
    raise ValueError('invalid table name: {!r}'.format(dataname))
cursor = conn.cursor()
sql = "CREATE TABLE IF NOT EXISTS %s(ID INT(10) NOT NULL PRIMARY KEY AUTO_INCREMENT,TIME_DATA TIMESTAMP DEFAULT CURRENT_TIMESTAMP," \
      "A VARCHAR(255),B VARCHAR(255),Cc varchar(10),D VARCHAR(255),E VARCHAR(255), F VARCHAR(255),G VARCHAR(255))ENGINE=INNODB DEFAULT CHARSET=UTF8"
cursor.execute(sql % dataname)
print('创建成功!!')
# Start Firefox, open the shop's home page and submit a first search.
url = 'http://www.jiuxian.com/'
browser = webdriver.Firefox()
browser.set_window_size(900,900)
# Despite its name, `timeout` is a WebDriverWait helper (10 second limit)
# that is reused for every explicit wait further down in the script.
timeout = WebDriverWait(browser,10)
browser.get(url)
# Fixed pause after navigation, in addition to the explicit wait below.
time.sleep(3)
# Block until the element with id "wd" (the box the search term is typed
# into) is present in the DOM.
input_a =timeout.until(EC.presence_of_element_located((By.ID,'wd')))
#input_a = browser.find_element_by_id('wd')
print('一个a通过,下一步')
# Replace whatever is in the box with the search term and submit with Enter.
input_a.clear()
input_a.send_keys('白酒')
input_a.send_keys(Keys.ENTER)
# html_apage = browser.page_source
# soupa = BeautifulSoup(html_apage,'lxml')
# html_aprice = [html_aprice.get_text() for html_aprice in soupa.find_all('p',class_='price')]
# html_aid = [html_aid['proimgid'] for html_aid in soupa.find_all('img',attrs={'proimgid':re.compile('\d+')})]
# for aprice,aid in zip(html_aprice,html_aid):
# print(aprice,aid)
# aids = aid #商品ID
# aprices =aprice #商品价钱
# ---------------------------------------------------------------------------
# Main crawl loop.
#
# For every result page: scroll so the page is fully rendered, read the
# price and product id of each listed item, fetch the item's detail page
# with `requests`, extract a few fields and store them in the MySQL table
# named by `dataname`.  Then click through to the next result page.
#
# The loop has no explicit exit; it ends when a wait for the next page link
# times out and the resulting exception terminates the script.
#
# NOTE(review): the original paste had lost all indentation, so the nesting
# below is a reconstruction -- please confirm it against the intended flow.
# ---------------------------------------------------------------------------

PRODUCT_URL = 'http://www.jiuxian.com/goods-{}.html'
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:57.0) Gecko/20100101 Firefox/57.0'
}
# Patterns are compiled once (raw strings avoid invalid-escape warnings).
PRODUCT_ID_RE = re.compile(r'\d+')
VOLUME_RE = re.compile(r'\d+ml|\d+mL')
DEGREE_RE = re.compile(r'(\d+.){1}\W')


def _scrape_product(aid, aprice):
    """Fetch one product detail page and extract the fields to store.

    Args:
        aid: product id taken from the result list (the ``proimgid``
            attribute of the product image).
        aprice: price text taken from the result list.

    Returns:
        A tuple ``(winery, wine_name, ingredients, degree, volume, extra)``
        or ``None`` when the page could not be fetched (network error or a
        non-200 status).  Fields that are missing on the page are empty
        instead of silently keeping the previous product's values.
    """
    url = PRODUCT_URL.format(aid)
    try:
        # NOTE(review): the original issued a POST (twice, via duplicated
        # code); the verb is kept, the duplicate request is dropped.
        response_three = requests.post(url, headers=headers, timeout=10)
    except requests.RequestException as exc:
        print('请求失败: %s (%s)' % (url, exc))
        return None
    if response_three.status_code != 200:
        return None
    soup = BeautifulSoup(response_three.text, 'lxml')

    winery = ''
    ingredients = []
    # Column F falls back to the price/id summary when the attribute list
    # has no fifth entry, exactly as the original `a6` variable did.
    extra = '\n\n价钱:%s\nID:%s' % (aprice, aid)
    for attr_list in soup.find_all('ul', class_='intrList clearfix'):
        ingredients = [nested.get_text()
                       for nested in attr_list.find_all('ul', class_='intrList clearfix')]
        cells = attr_list.find_all('li')
        if len(cells) > 3:
            winery = cells[3].get_text()   # labelled "winery name" in the original
        if len(cells) > 4:
            extra = cells[4].get_text()    # labelled "capacity" in the original

    wine_name = degree = volume = ''
    for name_div in soup.find_all('div', class_='comName'):
        wine_name = name_div.get_text().replace('\n', '')
        volume = ''.join(VOLUME_RE.findall(wine_name))
        degree = ''.join(DEGREE_RE.findall(wine_name))
    return winery, wine_name, ingredients, degree, volume, extra


def _store_product(fields):
    """Insert one scraped product into the table named by ``dataname``.

    Values are passed as bound parameters so quotes in scraped text can
    neither break the statement nor inject SQL.  If the full insert is
    rejected by MySQL (for example the ingredients text does not fit the
    10-character ``Cc`` column) the row is retried without ``Cc``.  The
    transaction is committed in every case; an error from the retry
    propagates to the caller.
    """
    winery, wine_name, ingredients, degree, volume, extra = fields
    # Table names cannot be bound parameters: quote as a MySQL identifier
    # and escape '%' so the driver's own %-formatting leaves it alone.
    table = '`{}`'.format(dataname.replace('`', '``')).replace('%', '%%')
    row = (
        '1.酒厂名称:%s' % (winery,),
        '2.酒名:%s' % (wine_name,),
        '3.原配料:%s' % (ingredients,),
        '4.度数:%s' % (degree,),
        '5.容量:%s' % (volume,),
        '6.%s' % (extra,),
    )
    try:
        try:
            cursor.execute(
                'INSERT INTO {}(A,B,Cc,D,E,F) VALUES (%s,%s,%s,%s,%s,%s)'.format(table),
                row)
            print('sql1')
        except pymysql.MySQLError:
            cursor.execute(
                'INSERT INTO {}(A,B,D,E,F) VALUES (%s,%s,%s,%s,%s)'.format(table),
                row[:2] + row[3:])
            print('sql2')
    finally:
        conn.commit()
    print('导入成功!')


p = 346   # page counter for the first search ('白酒')
p2 = 1    # page counter for the second search ('葡萄酒')
while True:
    if p == 346:
        # Only true on the first pass (p is incremented below): switch the
        # search from the first keyword to the second one.
        time.sleep(4)
        input_a = timeout.until(EC.presence_of_element_located((By.ID, 'wd')))
        print('正在跳转到葡萄酒')
        input_a.clear()
        input_a.send_keys('葡萄酒')
        input_a.send_keys(Keys.ENTER)
    time.sleep(3)

    # Scroll down in three relative steps, pausing after each one.  The
    # original iterated range(0, 10000, 2500) and broke out after 5000.
    for apage in (0, 2500, 5000):
        browser.execute_script('window.scrollBy(0,{})'.format(apage))
        time.sleep(3)
        print(apage)

    soupa = BeautifulSoup(browser.page_source, 'lxml')
    html_aprice = [tag.get_text() for tag in soupa.find_all('p', class_='price')]
    html_aid = [tag['proimgid']
                for tag in soupa.find_all('img', attrs={'proimgid': PRODUCT_ID_RE})]
    # NOTE(review): prices and ids are paired purely by position.
    for aprice, aid in zip(html_aprice, html_aid):
        print('\n\n价钱:%s\nID:%s' % (aprice, aid))
        print('正在进行requests请求!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!')
        fields = _scrape_product(aid, aprice)
        if fields is None:
            continue
        print('名称:{}\n容量:{}\n酒精度:{}度,{}{}{}'.format(*fields))
        _store_product(fields)

    # Advance to the next result page by clicking the link whose text is the
    # page number.  (The original guarded these steps with the always-true
    # conditions `p2 == p2` and `p == p`.)
    p2 += 1
    next_element = timeout.until(EC.presence_of_element_located((By.LINK_TEXT, '{}'.format(p2))))
    next_element.click()
    print('下一页')
    print('葡萄酒第%s页' % p2)

    # NOTE(review): the original also advances the first search's page
    # counter and clicks that page number in the same pass.  With the lost
    # indentation it is unclear whether this was meant to run together with
    # the step above; kept in the original order -- confirm.
    p += 1
    next_element = timeout.until(EC.presence_of_element_located((By.LINK_TEXT, '{}'.format(p))))
    next_element.click()
    print('下一页')
    print('白酒第%s页'%p)
# nextpage = browser.find_element_by_xpath('//*[@class="clearfix"]/*/*/a[@class="nextpage"]')
# nextpage.click()
# time.sleep(20)
# browser.quit()
# print('关闭浏览器')
网友评论