根据excel的URL获取网页title.py

作者: Aedda | 来源:发表于2020-03-17 10:08 被阅读0次

根据excel的URL获取网页title.py
我的第一个爬虫——获取国防科技大学历年录取分数
Java开源爬虫框架WebCollector图片抓取教程
Python爬虫：Urllib库的基本使用
用Python实现一个简单的爬虫
(一) 爬虫的基本概念和抓包工具的使用
爬虫常用库总结
AFNetworking的用法
php获取当前域名、主机、URL、端口、参数、网址、路径、代理等
EXCEL VBA 获取指定URL网页的源码资源

from selenium.webdriver.chrome.options import Options
from selenium import webdriver
import re
from xlrd import open_workbook
from openpyxl import load_workbook
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
import time
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities


def _host_to_url(host):
    """Turn one ``http.host`` pattern into a URL to visit.

    Rules, applied in this order:
      * everything from the first ``:`` on (the port) is dropped;
      * a host starting with ``.`` gets ``www`` put in front of it;
      * a host containing the ``{@0}`` placeholder loses its first four
        characters;
      * any other host is used unchanged.

    Returns:
        The ``http://`` URL, or ``None`` when nothing is left to visit
        (empty pattern, or a pattern that was only a port).
    """
    host = host.partition(':')[0]  # drop the port information
    if not host:
        return None
    if host.startswith('.'):
        return 'http://www' + host
    if '{@0}' in host:
        # NOTE(review): slicing off four characters assumes the placeholder
        # sits at the very start of the host -- confirm against the data.
        return 'http://' + host[4:]
    return 'http://' + host


def q1(file_name, target_file):
    """Visit every host named in the rule sheet and record the page titles.

    Reads the sheet at index 2 of ``file_name``. For every row after the
    first whose cell at column index 5 is text containing
    ``http.host matches`` and no ``&&``, each quoted host pattern is turned
    into a URL and opened in headless Chrome. The rule text, the URL, the
    cells at column indexes 1 and 3, and the page title are then written to
    columns 1-5 of row ``i`` in the active sheet of ``target_file``, which is
    saved after every URL so finished work survives an interruption.

    Args:
        file_name: Path of the workbook to read with xlrd.
        target_file: Path of an existing workbook to update with openpyxl.

    Returns:
        None. Results are written to ``target_file`` and echoed to stdout.

    Raises:
        Whatever xlrd / openpyxl raise when a workbook cannot be opened or
        saved, and whatever Selenium raises when Chrome cannot be started.
        Failures while loading an individual page are reported and skipped.
    """
    # Imported here so the block stays self-contained; TimeoutException (raised
    # by WebDriverWait) is a subclass of WebDriverException.
    from selenium.common.exceptions import WebDriverException

    # Open both workbooks before launching Chrome, so a bad path does not
    # leave a browser process behind.
    data = open_workbook(file_name)
    table = data.sheet_by_index(2)
    row_count = table.nrows
    xlsx = load_workbook(target_file)  # loaded once instead of once per URL
    sheet = xlsx.active

    category = table.cell(1, 3).value if row_count > 1 else ''
    print('大类：', category, '行数:', row_count)
    if row_count < 2:
        return  # header only (or empty sheet): nothing to visit

    chrome_options = webdriver.ChromeOptions()
    chrome_options.add_argument("--headless")
    # With the "none" strategy driver.get() returns without waiting for the
    # page load; the explicit wait below decides how long to wait. Setting it
    # on the options object avoids mutating the shared
    # DesiredCapabilities.CHROME dict.
    chrome_options.set_capability("pageLoadStrategy", "none")
    driver = webdriver.Chrome(options=chrome_options)

    host_pattern = re.compile(r'http\.host matches "(.*?)"')
    try:
        wait = WebDriverWait(driver, timeout=5)
        for i in range(1, row_count):
            rule = table.cell(i, 5).value
            if not isinstance(rule, str):
                continue  # numeric / date cells cannot hold a rule
            if '&&' in rule or 'http.host matches' not in rule:
                continue

            app_name = table.cell(i, 1).value
            app_class = table.cell(i, 3).value

            # NOTE(review): every host of row i is written to output row i, so
            # when a rule names several hosts only the last one remains in the
            # workbook -- confirm that this is intended.
            for host in host_pattern.findall(rule):
                url = _host_to_url(host)
                if url is None:
                    continue

                started = time.time()
                print(url)
                try:
                    driver.get(url)
                    wait.until(EC.presence_of_element_located((By.XPATH, "//title")))
                    # The title is available; stop loading the rest of the page.
                    driver.execute_script("window.stop();")
                except WebDriverException as exc:
                    # Best effort: report the failure and still record the row.
                    print('load failed:', url, type(exc).__name__)
                elapsed = time.time() - started

                try:
                    title = driver.title
                except WebDriverException as exc:
                    print('title unavailable:', url, type(exc).__name__)
                    title = ''
                print(i, elapsed, rule, url, app_name, app_class, title)

                sheet.cell(row=i, column=1, value=rule)
                sheet.cell(row=i, column=2, value=url)
                sheet.cell(row=i, column=3, value=app_name)
                sheet.cell(row=i, column=4, value=str(app_class))
                sheet.cell(row=i, column=5, value=str(title))
                xlsx.save(target_file)
    finally:
        # Always shut the browser down, even when a row fails unexpectedly.
        driver.quit()


if __name__ == '__main__':
    # Script entry point: read the rules from the task workbook and write the
    # collected page titles into the existing result workbook.
    q1(file_name='./任务.xlsx', target_file='./mb.xlsx')