05 How to Auto-Follow Back on Zhihu, Get Product Restock Alerts, and Lagou Job Notifications


Author: 夏威夷的芒果 | Published 2018-08-04 18:42

    Task


    Problem breakdown
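
    In outline, the steps the script below automates are roughly:

    # launch Chrome -> log in manually -> open the followers page
    # -> find the blue "Follow" buttons -> click each one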



    # The Windows version of the code can be downloaded here: https://video.mugglecode.com/net5.py
    # The following is the Mac/Linux version (it can also be run online at 麻瓜编程):
    
    from selenium import webdriver
    import time
    
    # Before running, download ChromeDriver from https://sites.google.com/a/chromium.org/chromedriver/downloads
    # (click [Latest Release: ChromeDriver x.xx] to download)
    
    def start_chrome():
        driver = webdriver.Chrome(executable_path='./chromedriver') # on Windows use './chromedriver.exe'
        return driver
    
    def find_strangers(driver):
        # the blue "Follow" button next to each follower you don't follow back yet
        btn_sel = 'div.ContentItem-extra > button.Button--blue'
        elems = driver.find_elements_by_css_selector(btn_sel)
        return elems
    
    
    url = 'https://www.zhihu.com/'
    follower_url = 'https://www.zhihu.com/people/xxx/followers' # replace with your own Zhihu URL; click [My profile] -> [Followers] to reach this page
    
    driver = start_chrome()
    driver.get(url)
    if not driver.get_cookies():
        print('Please log in') # no cookies means we are not logged in yet
    time.sleep(20) # wait for the manual login
    
    while True:
        driver.get(follower_url)
        time.sleep(6) # wait for the page & user list to load
        strangers = find_strangers(driver)
        for s in strangers:
            s.click() # follow back
            time.sleep(3)
        print('Done!')
        time.sleep(3000) # check again later
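
    The fixed time.sleep() calls are fragile: on a slow connection the buttons may not exist yet when the script looks for them. A minimal sketch of the same step using Selenium's explicit waits (same selector as above; the 30-second timeout is an assumption):

    from selenium.webdriver.common.by import By
    from selenium.webdriver.support.ui import WebDriverWait
    from selenium.webdriver.support import expected_conditions as EC
    
    # wait up to 30 s for at least one follow button to appear,
    # instead of sleeping a fixed 6 s after loading follower_url
    btn_sel = 'div.ContentItem-extra > button.Button--blue'
    strangers = WebDriverWait(driver, 30).until(
        EC.presence_of_all_elements_located((By.CSS_SELECTOR, btn_sel))
    )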
    

    For the restock alert, the point to work out is the class name that marks the product as sold out on the page. The alert() method from the pyautogui library can then pop up a notification.

    from selenium.webdriver import Chrome
    import pyautogui
    import webbrowser
    import time
    
    
    class PageObserver:
    
        def __init__(self, url, target_sel):
            self.driver = Chrome(executable_path='./chromedriver')
            self.url = url
            self.target_sel = target_sel
            self.request_time = 20 # seconds to let the page load after each request
    
        def is_changed(self):
            self.driver.get(self.url)
            time.sleep(self.request_time)
            oos_el = self.driver.find_elements_by_css_selector(self.target_sel) # the "out of stock" marker
            buy_button = self.driver.find_elements_by_id('update-cart')
            print(oos_el)    # -> [] once the out-of-stock marker is gone
            print(buy_button)
            # available = the out-of-stock marker is gone and the buy button exists
            return not oos_el and bool(buy_button)
    
    
    def alert():
        pyautogui.alert('The bag is available')
    
    
    
    # watch the page for changes
    url = 'https://www.strathberry.com/products/east-west-mini-tri-colour-navy-ruby-vanilla'
    target_sel = 'div.oos.swatch-container.swatch-3-colours.active-colour'
    
    # a test page plus a selector that matches nothing, so the "available" branch can be exercised
    fake_url = 'https://www.strathberry.com/products/east-west-mini-black-with-eyelets'
    fake_target_sel = 'oss.swatch-container.swatch-1-colours.active-colour'
    
    target = PageObserver(url=fake_url, target_sel=fake_target_sel)
    while True:
        if target.is_changed():
            alert()
            webbrowser.open(fake_url)
            target.driver.close()
            break # stop polling once the product is available
        else:
            print('Nope!')
    
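    If the watcher crashes halfway (network error, page layout change), the Chrome window is left open. A minimal sketch of the same loop with a guaranteed shutdown and an explicit pause between checks (the 60-second interval is an assumption):

    poll_interval = 60 # seconds between checks
    target = PageObserver(url=fake_url, target_sel=fake_target_sel)
    try:
        while True:
            if target.is_changed():
                alert()
                webbrowser.open(fake_url)
                break
            print('Nope!')
            time.sleep(poll_interval)
    finally:
        target.driver.quit() # always release the browser, even on errors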

    Lagou job notifications

    Should scraping code be written procedurally or with objects? The two versions in the code below are worth comparing.
    First, the procedural version:


    Procedural

    Object-oriented code can be seen as an extension of procedural code, and it is usually the friendlier form.


    Object-oriented
    # get_page() -> parse_page() -> filter_job() -> send()
    '''
    raw_html = []
    for i in range(1, 30):
        page = get_page(i)
        raw_html.append(page)
    
    all_jobs = []
    for html in raw_html:
        jobs = parse(html)
        all_jobs.extend(jobs) # parse() returns a list per page; extend keeps all_jobs flat
    
    for job in all_jobs:
        result = filter_job(job)
        if result:
            send(job)
    '''
    
    # Spider -> Parser -> Job
    
    '''
    s = Spider()
    raw_pages = s.crawl(url)
    p = Parser(raw_pages)
    jobs = p.get_jobs()
    
    for j in jobs:
        if j.is_today():
            j.send() # matches the Job.send() method defined below
    '''
    from selenium.webdriver import Chrome
    from bs4 import BeautifulSoup
    import time
    # https://www.lagou.com/zhaopin/qukuailian/12/
    class Spider:
        def __init__(self, index_url, page_range):
            self.page_range = page_range
            self.index_url = index_url
            self.raw_pages = []
            self.boot()
    
        def boot(self):
            self.chrome = Chrome(executable_path='./chromedriver')
    
        def crawl(self):
            for num in range(1, self.page_range + 1):
                full_url = f'{self.index_url}{num}/'
                self.chrome.get(full_url)
                print('Wait for loading page')
                time.sleep(3)
                single_html = self.chrome.page_source
                # each page could be processed as soon as it loads, but it also works to load all the
                # pages first, store the raw HTML, and process them one by one afterwards
                self.raw_pages.append(single_html)
                print('Done')
    
    
    class Parser:
    
        def __init__(self, raw_pages):
            self.raw_pages = raw_pages
            self.jobs = []
    
            self.parse()
    
        def parse(self):
            for html in self.raw_pages:
                soup = BeautifulSoup(html, 'html.parser') # note the parser here is 'html.parser'
                time_sel = 'ul span.format-time'
                comp_sel = 'ul .company_name > a'
                link_sel = 'ul a.position_link'
    
                time_els = soup.select(time_sel) # list
                comp_els = soup.select(comp_sel)  # list
                link_els = soup.select(link_sel)  # list
                for t, c, l in zip(time_els, comp_els, link_els):
                    cell = { # a dict makes each field easy to look up
                        'time': t.text,
                        'comp': c.text,
                        'link': l.get('href')
                    }
                    self.jobs.append(cell)
                    # [{},{}]
        def get_jobs(self):
            return [Job(j) for j in self.jobs]
    
    
    
    class Job:
    
        def __init__(self,data):
    
            self.time = data.get('time')
            self.comp = data.get('comp')
            self.link = data.get('link')
    
    
        def is_today(self):
            # jobs posted today show a clock time such as '14:32'; older ones show a date
            return ':' in self.time # -> True or False
    
        def send(self):
            pass
    
        def save_into_csv(self):
            pass
    
    
    s = Spider(
        index_url='https://www.lagou.com/zhaopin/qukuailian/',
        page_range=2
    )
    s.crawl()
    p = Parser(s.raw_pages)
    jobs = p.get_jobs()
    for j in jobs:
        if j.is_today():
            print(j.comp, j.link)
    
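    The Job class leaves save_into_csv() as a stub. A minimal sketch of one way to fill that gap with Python's standard csv module (the hypothetical save_jobs_to_csv helper, the file name, and the column order are all assumptions):

    import csv
    
    def save_jobs_to_csv(jobs, path='jobs.csv'):
        # write one row per Job, with a header row first
        with open(path, 'w', newline='', encoding='utf-8') as f:
            writer = csv.writer(f)
            writer.writerow(['time', 'comp', 'link'])
            for j in jobs:
                writer.writerow([j.time, j.comp, j.link])
    
    save_jobs_to_csv(jobs)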
