一、爬取Ajax动态加载的数据(requests库)
美团网(Ajax动态加载,数据一次性加载完毕;可在开发者工具Network面板的XHR标签下查看Ajax的响应数据)
import requests
import re
import json
def get_page(url='http://cd.meituan.com/meishi/b6119/', timeout=10):
    """Download a web page and return its body as text.

    Args:
        url: Address to fetch; defaults to the Meituan listing URL used by
            this example.
        timeout: Seconds to wait for the server. Without it a stalled
            connection would block the script indefinitely.

    Returns:
        The response body decoded as UTF-8, or None when the server answers
        with a status code other than 200.

    Raises:
        requests.RequestException: If the request itself fails (connection
            error, timeout, ...).
    """
    headers = {
        "User-Agent": "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; 360SE)"
    }
    response = requests.get(url, headers=headers, timeout=timeout)
    if response.status_code == 200:
        return response.content.decode('utf-8')
    return None
def main():
    """Fetch the listing page and print the title of every embedded entry.

    The entries are read from the JSON text that sits between
    '"poiInfos":' and '},"comHeader"' in the page source, so that span is
    cut out with a regex and handed to json.loads.
    """
    html = get_page()
    print(html)
    if html is None:
        # get_page() returns None on a non-200 response; running the regex
        # on None would raise TypeError.
        print('failed to download the page')
        return

    pattern = re.compile('"poiInfos":(.*?)},"comHeader"', re.S)
    match = pattern.search(html)
    if match is None:
        # Page layout changed or an anti-crawler page was served: nothing
        # to parse, so stop instead of failing on an empty result list.
        print('no "poiInfos" data found in the page')
        return

    meituan_result = json.loads(match.group(1))
    print(len(meituan_result))
    for item in meituan_result:
        print(item['title'])


if __name__ == '__main__':
    main()
二、使用Ajax接口爬取蘑菇街的动态数据
import requests
from urllib.parse import urlencode
import json
from spider_save_helper import save_item
'''
https://list.mogujie.com/search?callback=jQuery21109528018020686176_1536678057418&_version=8193&ratio=3%3A4&cKey=15&page=1&sort=pop&ad=0&fcid=10059141&action=sports
'''
def get_page(page, timeout=10):
    """Request one page of the Mogujie search endpoint and return the body.

    Args:
        page: Page number sent as the ``page`` query parameter.
        timeout: Seconds to wait for the server. Without it a stalled
            connection would block the crawl indefinitely.

    Returns:
        The response body decoded as UTF-8 (a JSONP string wrapped in the
        ``callback`` function name), or None when the status code is not
        200.

    Raises:
        requests.RequestException: If the request itself fails (connection
            error, timeout, ...).
    """
    params = {
        'callback': 'jQuery21109528018020686176_1536678057418',
        '_version': 8193,
        # Given unencoded on purpose: urlencode() turns ':' into '%3A'.
        # Passing the pre-encoded '3%3A4' would be escaped a second time
        # and sent as '3%253A4'.
        'ratio': '3:4',
        'cKey': 15,
        'page': page,
        'sort': 'pop',
        'ad': 0,
        'fcid': '10059141',
        'action': 'sports',
    }
    # urlencode serialises the params dict into a URL query string.
    url = 'https://list.mogujie.com/search?' + urlencode(params)
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:2.0b13pre) Gecko/20110307 Firefox/4.0b13pre"
    }
    response = requests.get(url, headers=headers, timeout=timeout)
    print(response.status_code)
    if response.status_code == 200:
        return response.content.decode('utf-8')
    return None
def get_images(json_dict):
    """Yield one trimmed-down dict per entry of result -> wall -> docs.

    Args:
        json_dict: Parsed response; must contain a list at
            ``json_dict['result']['wall']['docs']`` whose entries carry the
            keys 'img', 'title', 'orgPrice' and 'price'.

    Yields:
        A dict with the keys 'img', 'title', 'org_price' and 'price' copied
        from each entry ('orgPrice' is renamed to 'org_price').
    """
    for doc in json_dict['result']['wall']['docs']:
        yield {
            'img': doc['img'],
            'title': doc['title'],
            'org_price': doc['orgPrice'],
            'price': doc['price'],
        }
def parse_json(text):
    """Unwrap a JSONP response and parse the JSON payload inside it.

    The wrapper is located by its parentheses rather than by a fixed
    callback name and a fixed two-character tail, so a different callback,
    a missing trailing ';' or trailing whitespace no longer corrupts the
    payload. Text that is already plain JSON is parsed as is.

    Args:
        text: Response body, e.g. '/**/callbackName({...});'.

    Returns:
        The decoded JSON value.

    Raises:
        ValueError: If no JSONP wrapper can be found or the payload is not
            valid JSON (json.JSONDecodeError is a ValueError subclass).
    """
    payload = text.strip()
    # Plain JSON starts with an object or array; only unwrap otherwise, so
    # parentheses inside JSON strings are never mistaken for the wrapper.
    if not payload.startswith(('{', '[')):
        start = payload.find('(')
        end = payload.rfind(')')
        if start == -1 or end < start:
            raise ValueError('response is not a JSONP document')
        payload = payload[start + 1:end]
    return json.loads(payload)
def get_pages():
    """Walk the result pages from page 1, printing and saving every item.

    The loop stops when a page reports ``isEnd`` under result -> wall, or
    when a page cannot be downloaded.
    """
    page = 1
    while True:
        text = get_page(page)
        if text is None:
            # get_page() returns None on a non-200 response; parse_json()
            # cannot handle None, so end the crawl here.
            print('stopping: page %s could not be downloaded' % page)
            return

        json_dict = parse_json(text)
        # NOTE(review): the page flagged isEnd is dropped without saving its
        # docs - confirm that the final page never carries items.
        if json_dict['result']['wall']['isEnd']:
            return

        for item in get_images(json_dict):
            print(item)
            save_item(item)
        page += 1
def main():
    """Script entry point: crawl all result pages via get_pages()."""
    get_pages()


if __name__ == '__main__':
    main()
三、使用Selenium实现自动化的网页请求
from selenium import webdriver
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver import ActionChains
def f1(browser):
    """Open a detail page, look up one node by id, print its position and size.

    Args:
        browser: Selenium WebDriver instance used to load the page.
    """
    # Load the page.
    browser.get('https://www.mkv99.com/vod-detail-id-9462.html')

    # Look up a single node by id. find_element(By.ID, ...) is used because
    # the find_element_by_* helpers were removed in Selenium 4.
    input1 = browser.find_element(By.ID, '1thUrlid第01集')
    # Coordinates of the node on the page (its top-left corner).
    print(input1.location)
    # Width and height of the node.
    print(input1.size)

    # Further lookups, kept as a reference for this tutorial:
    #   browser.page_source                    -> page content after rendering
    #   browser.current_url                    -> current address
    #   browser.get_cookies()                  -> cookies of the browser
    #   input1.get_attribute('href')           -> value of a node attribute
    #   input1.tag_name / input1.text          -> tag name / text of a node
    #   browser.find_elements(By.CSS_SELECTOR, '.dwon2')
    #                                          -> all nodes matching a CSS selector
    #   browser.find_element(By.XPATH, '//*[@class="dwon2"]')
    #                                          -> single node via XPath
    #   browser.find_element(By.NAME, 'CopyAddr1')
    #                                          -> single node by name
    #   browser.find_element(By.LINK_TEXT, '今日更新')
    #                                          -> single node by exact link text
    #   browser.find_elements(By.PARTIAL_LINK_TEXT, '下载')
    #                                          -> all links whose text contains it
def f2(browser):
    """Drag the '#draggable' node onto '#droppable' on the demo page.

    Args:
        browser: Selenium WebDriver instance used to load the page.
    """
    browser.get('http://www.runoob.com/try/try.php?filename=jqueryui-api-droppable')
    # Both nodes are looked up after switching into the 'iframeResult'
    # frame (a page embedded inside the page).
    browser.switch_to.frame('iframeResult')
    # find_element(By.CSS_SELECTOR, ...) replaces the
    # find_element_by_css_selector helper that Selenium 4 removed.
    source = browser.find_element(By.CSS_SELECTOR, '#draggable')
    target = browser.find_element(By.CSS_SELECTOR, '#droppable')
    # An action chain queues a series of actions and runs them on
    # perform(); here it drags the source node onto the target node.
    ActionChains(browser).drag_and_drop(source, target).perform()
def main():
    """Start a Chrome session, run the drag-and-drop demo, then shut down."""
    # Chrome is used here. webdriver.Firefox(), webdriver.Edge() and
    # webdriver.Safari() can be substituted; PhantomJS is obsolete.
    browser = webdriver.Chrome()
    try:
        f2(browser)
    finally:
        # quit() ends the whole WebDriver session; close() would only
        # close the current window and leave the session running.
        browser.quit()


if __name__ == '__main__':
    main()
网友评论