美文网首页
2018-01-18

2018-01-18

作者: Viemax | 来源:发表于2018-01-18 02:43 被阅读0次
    from selenium import webdriver
    from selenium.webdriver.firefox.options import Options
    import requests
    from bs4 import BeautifulSoup
    import csv
    import ssl
    import re
    import time
    from prettyprinter import cpprint
    
    # Disable HTTPS certificate verification globally (the target site's
    # certificate chain fails validation); NOTE(review): this weakens security
    # for every HTTPS request made by this process.
    ssl._create_default_https_context = ssl._create_unverified_context
    
    def get_newURL(surname):
        """Search the Shanghai Library genealogy catalog for *surname*.

        Launches a headless Firefox, submits *surname* in the search form,
        waits briefly for the result window to open, and stores the result
        page's URL in the module-level global ``newurl`` (the driver handle
        lives in the global ``browser`` while the search runs).

        Returns the result URL as well, so new callers need not rely on the
        global.
        """
        # BUG FIX: the original wrapped this body in `if __name__ == "__main__":`,
        # which silently turned the function into a no-op whenever the module
        # was imported.  That guard belongs at module level, not inside a function.
        options = Options()
        options.add_argument('-headless')
        global browser
        browser = webdriver.Firefox(executable_path=r"/Users/viemaxwei/Downloads/geckodriver", firefox_options=options)
        browser.get('http://search.library.sh.cn/jiapu/bSearch.htm')
        input_str = browser.find_element_by_name('expr')
        input_str.send_keys(surname)
        browser.find_element_by_xpath("//*[@value='检索']").click()
        # Crude wait for the result window to open; TODO(review): replace
        # with WebDriverWait for reliability on slow connections.
        time.sleep(1.5)
        browser.switch_to.window(browser.window_handles[1])
        global newurl
        newurl = browser.current_url
        browser.quit()
        return newurl
    
    
    def get_next_page(i, new_url):
        """Recursively follow the '下页' (next page) link starting at *new_url*.

        Each recursion opens a fresh headless Firefox, clicks the next-page
        button, records the resulting URL via ``single_url_collector`` and
        stores it in the global ``url_new``.  Recursion stops when the button
        is no longer present (any exception), which signals the last page.

        :param i: 1-based index of the page *new_url* points at.
        :param new_url: URL of the current result page.
        """
        global url_new
        global browser_1
        try:
            # BUG FIX: the original wrapped this in `if __name__ == "__main__":`,
            # making the function a no-op when the module is imported.
            options = Options()
            options.add_argument('-headless')
            browser_1 = webdriver.Firefox(executable_path=r"/Users/viemaxwei/Downloads/geckodriver", firefox_options=options)
            browser_1.get(new_url)
            browser_1.find_element_by_xpath("//*[@value='下页']").click()
            browser_1.switch_to.window(browser_1.window_handles[0])
            url_new = browser_1.current_url
            i += 1
            print("检索第%d页的url" % i)
            single_url_collector(url_new)
            browser_1.quit()
            return get_next_page(i, url_new)
        except Exception:
            # BUG FIX: was a bare `except:`, which also swallowed
            # KeyboardInterrupt/SystemExit.  Reaching here normally means the
            # next-page button was absent, i.e. the last page was processed.
            try:
                browser_1.quit()
            except Exception:
                pass  # driver may already be closed or never started
            print("<---检索完成--->")
    
    
    
    # Load the surname table (key -> surname) from a two-column CSV.
    # FIX: open with an explicit UTF-8 encoding — the platform default would
    # garble the Chinese surnames on non-UTF-8 locales — and with newline=""
    # as the csv module documentation requires for reader/writer files.
    with open("/Users/viemaxwei/Downloads/surname_1.csv", "rt", encoding="utf-8", newline="") as sur:
        cin = csv.reader(sur)
        surname = [i for i in cin]
        surname_dict = dict(surname)
    
    # Shared accumulator of result-page URLs for the surname currently
    # being crawled.
    single_url_set = []


    def single_url_collector(url):
        """Record *url* in the shared collection and return that collection."""
        single_url_set.extend([url])
        return single_url_set
    
    # Maps each surname to the list of result-page URLs gathered for it.
    total_url = {}


    def get_single_url_set():
        """Crawl every surname in ``surname_dict`` and fill ``total_url``.

        For each surname: run the initial search (``get_newURL``), reset the
        shared ``single_url_set`` to hold that first page's URL, walk the
        remaining pages with ``get_next_page``, then snapshot the collected
        URLs into ``total_url``.
        """
        for key, name in surname_dict.items():
            print("<---现在自动检索_" + name + "氏_数据--->")
            print("检索第1页的url")
            get_newURL(name + "氏")
            single_url_set.clear()
            single_url_set.append(newurl)
            get_next_page(1, newurl)
            # Snapshot before the next surname clears the shared list.
            total_url[key] = list(single_url_set)
    # Run the full crawl and report the wall-clock time in minutes.
    start_time = time.time()
    get_single_url_set()
    minutes_used = (time.time() - start_time) / 60
    print("====================")
    print("全部完成!!! 共用时__%f__分钟" % minutes_used)
    
    

    相关文章

      网友评论

          本文标题:2018-01-18

          本文链接:https://www.haomeiwen.com/subject/flwpoxtx.html