Python crawler: fetching email addresses from web pages with concurrent multithreading


Author: FengZai8 | Published 2016-12-23 14:17 · 228 reads

Reads company data from an Excel sheet and writes the scraped results back to the same sheet.
About 20,000 records take roughly 15 minutes.
Every page may use a different layout, so inspect the target site and adjust the parsing accordingly (see the sketch after the code listing).

```python
# -*- coding: utf-8 -*-
import re

import requests
import openpyxl
from bs4 import BeautifulSoup
from multiprocessing.dummy import Pool as ThreadPool

# Source spreadsheet: company names live in column F, emails are written to column L.
wb = openpyxl.load_workbook('company_database.xlsx')
ws = wb.active

start = 10000                           # first row to process
f = 'F'                                 # column that holds the company name
urlstr = 'http://directory...任意url/'   # directory site base URL (placeholder left as in the original)
query = '/q/'

def getEmail(url):
    """Fetch the page for (url, row) and return (email, row); email is '' if none is found."""
    print('grab email....')
    html = requests.get(url[0])
    soup = BeautifulSoup(html.text, "lxml")
    # Only extract when the page actually shows an "Email" label.
    pudge = soup.find_all("p", text="Email")
    if pudge:
        email = re.findall(r"[a-z0-9\.\-+_]+@[a-z0-9\.\-+_]+\.[a-z]+", html.text, re.I)[0]
        return email, url[1]
    else:
        return '', url[1]

def writeExcel(mailCount):
    """Write one (email, row) pair into column L of the worksheet."""
    ws['L' + str(mailCount[1])].value = mailCount[0]
    print('%d save ok: ' % mailCount[1] + mailCount[0])

def geturl(num):
    """Build the directory search URL for the company name stored in row num."""
    print('Grabbing url')
    celstr = f + str(num)
    compname = ws[celstr].value
    url = urlstr + compname.replace(' ', '+') + query
    return url, num

def main(start):
    # Stage 1: build all search URLs.
    pool1 = ThreadPool(16)
    urlCount = pool1.map(geturl, range(start, 17722))
    pool1.close()
    pool1.join()

    # Stage 2: fetch every page and pull out the email address.
    pool2 = ThreadPool(16)
    mailCount = pool2.map(getEmail, urlCount)
    pool2.close()
    pool2.join()

    # Stage 3: write the results back into the workbook.
    pool3 = ThreadPool(16)
    pool3.map(writeExcel, mailCount)
    pool3.close()
    pool3.join()

    wb.save('company_database.xlsx')
    print(' ok!')

if __name__ == '__main__':
    main(start)
```
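As noted above, every directory lays out its contact details differently, so getEmail usually has to be adapted per site. Below is a minimal sketch of one common variation, assuming a hypothetical page that exposes the address as a mailto: link rather than next to an "Email" paragraph; the getEmailFromMailto helper and the markup it expects are illustrative, not part of the original script.

```python
import re
import requests
from bs4 import BeautifulSoup

def getEmailFromMailto(url):
    """Variant worker for a (hypothetical) page that publishes the address as a mailto: link."""
    html = requests.get(url[0], timeout=10)   # a timeout keeps one slow page from stalling a worker thread
    soup = BeautifulSoup(html.text, "lxml")
    link = soup.find("a", href=re.compile(r"^mailto:", re.I))
    if link:
        # Strip the "mailto:" scheme and any "?subject=..." suffix.
        email = link["href"].split(":", 1)[1].split("?")[0]
        return email, url[1]
    return '', url[1]
```

Because each stage only passes (value, row) tuples between the pools, a variant like this can be swapped in as pool2's worker in place of getEmail without touching geturl or writeExcel.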


Original article: https://www.haomeiwen.com/subject/udekvttx.html