12_selenium基础/01_链家网.py:
"""
https://cs.lianjia.com/ershoufang/pg3/
"""
import aiohttp
import asyncio
from lxml import etree
import time
async def requestDef(url):
    session = aiohttp.ClientSession()  # roughly the aiohttp counterpart of requests_html's HTMLSession
    response = await session.get(url=url)
    result = await response.text()  # text() returns the HTML source
    await session.close()
    return result
def parse(html):
    tree = etree.HTML(html)
    for div in tree.xpath('//div[@class="info clear"]'):
        title = div.xpath('./div[@class="title"]/a/text()')[0]  # title; ./ restricts the search to the current div
        position_Small = div.xpath('.//div[@class="positionInfo"]/a[1]/text()')[0].strip()  # e.g. 和美星城
        position_Big = div.xpath('.//div[@class="positionInfo"]/a[2]/text()')[0]
        position = '{}-{}'.format(position_Small, position_Big)  # location, e.g. 和美星城-暮云
        house = div.xpath('.//div[@class="houseInfo"]/text()')[0]  # house details
        follow = div.xpath('./div[@class="followInfo"]/text()')[0]  # e.g. 0人关注 / 7天以前发布
        followinfo = follow.split('/')  # e.g. ['0人关注 ', ' 7天以前发布']
        amount_of_attention = followinfo[0]  # follower count
        release_time = followinfo[1]  # listing date
        # Both halves of the union must stay relative (.//); an absolute //div would match the whole page
        house_price = div.xpath('.//div[@class="totalPrice totalPrice2"]/span/text()|.//div[@class="totalPrice totalPrice2"]/i[2]/text()')  # e.g. ['121', '万']
        house_price = house_price[0] + house_price[1]  # e.g. 121万
        per_yuan = div.xpath('.//div[@class="unitPrice"]/span/text()')[0]
        data = (title, position, house, amount_of_attention, release_time, house_price, per_yuan)
        print(data)
async def get_html(url):
    print("Requesting:", url)
    result = await requestDef(url)
    parse(result)
startTime = time.time()
ReqUrl = "https://cs.lianjia.com/ershoufang/pg%s/"
tasks = []
for page in range(1, 100):  # pages 1 through 99
    c = get_html(ReqUrl % page)
    task = asyncio.ensure_future(c)
    tasks.append(task)
loop = asyncio.get_event_loop()
loop.run_until_complete(asyncio.wait(tasks))
print(time.time() - startTime)
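A note on the event-loop setup above: creating tasks with asyncio.ensure_future before a loop is running, and calling asyncio.get_event_loop() at module level, is deprecated on Python 3.10+. A minimal modernized sketch of the same crawl (assuming the parse logic above), using asyncio.run, asyncio.gather, and one shared context-managed ClientSession:

import asyncio
import aiohttp

async def fetch(session, url):
    # Reuse a single ClientSession for all requests; text() returns the HTML source
    async with session.get(url) as response:
        return await response.text()

async def main():
    url_tpl = "https://cs.lianjia.com/ershoufang/pg%s/"
    async with aiohttp.ClientSession() as session:
        htmls = await asyncio.gather(*(fetch(session, url_tpl % page)
                                       for page in range(1, 100)))
    for html in htmls:
        parse(html)  # parse() as defined in 01_链家网.py

asyncio.run(main())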
12_selenium基础/02_selenium的基本使用.py:
from selenium import webdriver
browser = webdriver.Chrome()
# browser.maximize_window()  # maximize the browser window
# browser.minimize_window()  # minimize the browser window
browser.set_window_size(400, 800)  # set the window to 400 wide by 800 high
browser.get("http://www.baidu.com/")
browser.get("http://www.douban.com/")
browser.back()  # go back
browser.forward()  # go forward
# browser.quit()  # close every window and end the driver session
# browser.close()  # close only the current window
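The two commented-out calls at the end behave differently once more than one window is open: close() shuts only the current window, while quit() ends the whole WebDriver session. A minimal sketch (the second tab opened via execute_script is an illustration, not part of the original file):

from selenium import webdriver

browser = webdriver.Chrome()
browser.get("http://www.baidu.com/")
browser.execute_script("window.open('http://www.douban.com/');")  # open a second tab
print(len(browser.window_handles))  # 2

browser.close()  # closes only the currently focused window; the session stays alive
browser.switch_to.window(browser.window_handles[0])
print(len(browser.window_handles))  # 1

browser.quit()  # closes every remaining window and ends the driver session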
12_selenium基础/03_selenium元素定位.py:
from selenium import webdriver
from selenium.webdriver.common.by import By
browser = webdriver.Chrome()
browser.get("https://www.baidu.com/")
"""百度输入框的定位方式 元素定位"""
# # 通过ID方式定位
# browser.find_element(By.ID,'kw').send_keys("selenium")
# # 如果返回多个需要加索引,从0开始
# browser.find_element(By.CLASS_NAME,'s_ipt').send_keys("selenium")
# # 通过name方式定位
# browser.find_element(By.NAME,'wd').send_keys("selenium")
# # 通过css方式定位
# browser.find_element(By.CSS_SELECTOR,'#kw').send_keys("selenium")
# 通过xpath方式定位
browser.find_element(By.XPATH,'//input[@id="kw"]').send_keys("selenium")
browser.find_element(By.ID,'su').click() # click点击
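As the class-name comment above hints, a locator can match several elements; find_element returns only the first, while find_elements returns a list you index from 0. A minimal sketch using a deliberately broad locator (the tag-name query is an illustration, not part of the original file):

from selenium import webdriver
from selenium.webdriver.common.by import By

browser = webdriver.Chrome()
browser.get("https://www.baidu.com/")

links = browser.find_elements(By.TAG_NAME, "a")  # list of every <a> on the page
print(len(links))
if links:
    print(links[0].get_attribute("href"))  # pick one match by index, starting from 0

browser.quit()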
12_selenium基础/04_京东.py:
from selenium import webdriver
from selenium.webdriver.common.by import By
from lxml import etree
import time
browser = webdriver.Chrome()
browser.get("https://www.jd.com/")
browser.find_element(By.ID, 'key').send_keys('笔记本')  # search for "laptop"
browser.find_element(By.XPATH, '//button[@class="button"]').click()
time.sleep(2)  # wait for the result page to render
html = browser.page_source  # source of the current page
tree = etree.HTML(html)
# print(tree.xpath('//title/text()'))  # page title; without the delay above this would still be the previous page
for li in tree.xpath('//ul[@class="gl-warp clearfix"]/li'):
    print(li.xpath('.//div[@class="p-price"]//i/text()'), end='---')  # price
    print(li.xpath('.//div[@class="p-name p-name-type-2"]//em//text()'), end='---')  # product name
    print(li.xpath('.//div[@class="p-commit"]//a/text()'), end='---')  # review count
    print(li.xpath('.//div[@class="p-shop"]//a/text()'))  # shop name
browser.find_element(By.XPATH, '//a[@class="pn-next"]').click()  # turn to the next page
"""
JD loads only 30 items at a time while a page holds 60 in total, so a
page-turn operation is needed.
"""
Editor:Lonelyroots