
118. Ways to Fetch Web Pages

Author: 羽天驿 | Published 2020-02-15 19:41

1. Fetching Ajax-loaded data (with the requests library)

Meituan (the data is loaded dynamically via Ajax in a single request; inspect the Ajax response under Network → XHR in the browser's developer tools)

import requests
import re
import json


# Fetch the raw HTML of the listing page
def get_page():
    url = 'http://cd.meituan.com/meishi/b6119/'
    headers = {
        "User-Agent": "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; 360SE)" 
    }
    response = requests.get(url, headers=headers)
    if response.status_code == 200:
        return response.content.decode('utf-8')
    return None


def main():
    html = get_page()
    if html is None:
        print('request failed')
        return
    # The restaurant list is embedded in the page as a JSON array under
    # the "poiInfos" key; extract it with a non-greedy regex.
    pattern = re.compile('"poiInfos":(.*?)},"comHeader"', re.S)
    meituan_list = re.findall(pattern, html)
    meituan_result = json.loads(meituan_list[0])
    print(len(meituan_result))
    for item in meituan_result:
        print(item['title'])


if __name__ == '__main__':
    main()
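
The regex above only works while the array is still followed by '},"comHeader"'. A sturdier sketch (assuming the page still embeds a "poiInfos" JSON array; extract_poi_infos is a hypothetical helper, not part of the original code) lets the standard library's json.JSONDecoder.raw_decode consume exactly one JSON value, so no closing delimiter is needed:

def extract_poi_infos(html):
    # Find where the embedded array starts, then let raw_decode() parse
    # a single complete JSON value from that point onward.
    key = '"poiInfos":'
    start = html.index(key) + len(key)
    obj, _ = json.JSONDecoder().raw_decode(html[start:].lstrip())
    return obj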

2. Crawling Mogujie's dynamically loaded data via Ajax

import requests
from urllib.parse import urlencode
import json
from spider_save_helper import save_item  # the author's local persistence helper

'''
https://list.mogujie.com/search?callback=jQuery21109528018020686176_1536678057418&_version=8193&ratio=3%3A4&cKey=15&page=1&sort=pop&ad=0&fcid=10059141&action=sports
'''

def get_page(page):
    params = {
        'callback': 'jQuery21109528018020686176_1536678057418',
        '_version': 8193,
        'ratio': '3:4',  # pass the raw value; urlencode percent-encodes the colon ('3%3A4' would be double-encoded)
        'cKey': 15,
        'page': page,
        'sort': 'pop',
        'ad': 0,
        'fcid': '10059141',
        'action': 'sports'
    }
    # urlencode serializes the params dict into a URL-encoded query string
    url = 'https://list.mogujie.com/search?' + urlencode(params)

    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:2.0b13pre) Gecko/20110307 Firefox/4.0b13pre" 
    }
    response = requests.get(url, headers=headers)
    print(response.status_code)
    if response.status_code == 200:
        text = response.content.decode('utf-8')
        return text
    return None


def get_images(json_dict):
    # Yield one dict per product on the "wall"
    docs = json_dict['result']['wall']['docs']
    for item in docs:
        item_dict = {}
        item_dict['img'] = item['img']
        item_dict['title'] = item['title']
        item_dict['org_price'] = item['orgPrice']
        item_dict['price'] = item['price']
        yield item_dict

def parse_json(text):
    # Strip the JSONP wrapper: remove the '/**/callback(' prefix and the trailing ');'
    text = text.replace('/**/jQuery21109528018020686176_1536678057418(', '')[:-2]
    json_dict = json.loads(text)
    return json_dict

def get_pages():
    page = 1
    while True:
        text = get_page(page)
        json_dict = parse_json(text)
        # the response's 'isEnd' flag marks the last page
        is_end = json_dict['result']['wall']['isEnd']
        if is_end:
            return
        result = get_images(json_dict)
        for item in result:
            print(item)
            save_item(item)
        page += 1

def main():
    get_pages()

if __name__ == '__main__':
    main()
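
The listing imports save_item from spider_save_helper, which is the author's own module and is not shown. A minimal hypothetical stand-in (the real helper may persist items differently) appends each item as one JSON line:

# spider_save_helper.py -- hypothetical stand-in for the author's module
import json

def save_item(item, path='mogujie_items.jsonl'):
    # Append one product per line; ensure_ascii=False keeps Chinese titles readable
    with open(path, 'a', encoding='utf-8') as f:
        f.write(json.dumps(item, ensure_ascii=False) + '\n')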

3. Automating page requests with Selenium

from selenium import webdriver
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver import ActionChains

def f1(browser):
    # Visit the page
    browser.get('https://www.mkv99.com/vod-detail-id-9462.html')
    # # Get the rendered page source
    # print(browser.page_source)
    # # Get the current URL
    # print(browser.current_url)
    # # Get the browser's cookies
    # print(browser.get_cookies())

    # Find a single node by id
    input1 = browser.find_element_by_id('1thUrlid第01集')
    # print(input1)
    # # Read a node attribute
    # print(input1.get_attribute('href'))

    # # Find a list of nodes with a CSS selector
    # input_list = browser.find_elements_by_css_selector('.dwon2')
    # for item in input_list:
    #     print(item.get_attribute('href'))
    # Coordinates of the node on the page (its top-left corner)
    print(input1.location)
    # Width and height of the node
    print(input1.size)

    # # Find a single node by XPath
    # input3 = browser.find_element_by_xpath('//*[@class="dwon2"]')
    # print(input3.get_attribute('id'))

    # # Find a single node by name
    # input4 = browser.find_element_by_name('CopyAddr1')
    # print(input4.tag_name)

    # # Find a single node by its link text
    # input5 = browser.find_element_by_link_text('今日更新')
    # # Find all nodes whose link text contains '下载' (download)
    # input6 = browser.find_elements_by_partial_link_text('下载')
    # # Read a node's text content
    # print(input5.text)
    # print(input6)

def f2(browser):
    browser.get('http://www.runoob.com/try/try.php?filename=jqueryui-api-droppable')
    # Switch into the iframe (a page embedded inside the page; common on
    # older sites, rarely seen now)
    browser.switch_to.frame('iframeResult')
    source = browser.find_element_by_css_selector('#draggable')
    target = browser.find_element_by_css_selector('#droppable')
    # Action chain: queue a sequence of actions and run them together
    actions = ActionChains(browser)
    # Drag the source node onto the target node
    actions.drag_and_drop(source, target)
    actions.perform()


def main():
    # Use Chrome
    browser = webdriver.Chrome()
    # Use Firefox
    # browser = webdriver.Firefox()
    # Use Edge
    # browser = webdriver.Edge()
    # Use PhantomJS (headless; the project is discontinued)
    # browser = webdriver.PhantomJS()
    # Use Safari
    # browser = webdriver.Safari()

    try:
        f2(browser)
    finally:
        # Shut down the browser and end the WebDriver session
        browser.quit()

if __name__ == '__main__':
    main()
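
The listing imports WebDriverWait, expected_conditions, and TimeoutException at the top but never uses them. As a minimal sketch of what they are for (an explicit wait against the same runoob demo page; wait_for_droppable is a name invented here, not part of the original code):

def wait_for_droppable(browser, timeout=10):
    # Block until #droppable is present in the DOM, rather than sleeping
    # for a fixed interval; reuses the imports at the top of the listing.
    try:
        return WebDriverWait(browser, timeout).until(
            EC.presence_of_element_located((By.CSS_SELECTOR, '#droppable'))
        )
    except TimeoutException:
        return None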
