爬虫-抓取手机号

作者: 旻璿 | 来源:发表于2019-01-31 11:31 被阅读0次

爬虫-抓取手机号
spider(爬虫)
Python 爬虫协议及建议
无标题文章
Python爬虫入门--了解爬虫---什么是爬虫？
爬虫基础
R爬虫实践—抓取国自然基金信息【下篇】
人人都会数据分析大纲
基于Python的豆瓣影评分析——数据预处理
全程干货 | 解密爬虫抓取、更新网页的策略方法

某网站论坛上有大量用户留下手机号，写个简单的爬虫就可以获取。
⚠️友情提醒：个人信息需注意保密，否则很容易被不法分子窃取。

crawler.py

import requests
import urllib2
import urllib
import hashlib
import json
import re
import sys
import getopt
import time

# Compiled once at import time; both patterns are reused on every call.
_EMAIL_RE = re.compile(r"\b[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,4}\b", re.IGNORECASE)
# Eleven digits starting with 13/14/15/17/18.  The character class used to be
# written "[3|4|5|7|8]", which also accepted a literal "|" as second character.
# The leading (?<!\d) keeps the pattern from matching the tail of a longer
# digit run.
_PHONE_RE = re.compile(r"(?<!\d)1[34578]\d{9}\b")


def getInfoByInput(input):
    """Extract e-mail addresses and phone-number-like strings from text.

    Args:
        input: The text to scan.  The name shadows the builtin but is kept
            so that existing keyword callers continue to work.

    Returns:
        A dict with exactly two keys, ``'email'`` and ``'phone'``.  Each maps
        to the list of matches in order of appearance; duplicates are kept
        and the list is empty when nothing matches.
    """
    return {
        'email': _EMAIL_RE.findall(input),
        'phone': _PHONE_RE.findall(input),
    }

def write_to_file(out_file_path,content):
    """Append ``content`` to the file at ``out_file_path``.

    Args:
        out_file_path: Path of the file to append to; it is created if it
            does not exist yet.
        content: A string or an iterable of strings.  It is handed to
            ``writelines`` unchanged, so no line separators are added.
    """
    # The context manager closes (and therefore flushes) the handle even when
    # writelines raises; previously the handle was never closed.
    with open(out_file_path, 'a') as out_file:
        out_file.writelines(content)

# HTTP headers attached to every request issued by the loop below.
# NOTE(review): the "authorization" value is a credential hard-coded in
# source. It should be loaded from the environment or a config file, and this
# value should be treated as leaked and rotated.
headers = {"User-Agent":"Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.103 Safari/537.36",
"authorization":"d8E9N3o5dlkmz1AOEt2jSTqQRMlRp8nTrm3tBpXeDQenRIQXgzze4yI4f9lDUwSF"}


# Requests the posts endpoint once for each x in 1..999 and prints the e-mail
# addresses and phone numbers found in each raw response body.
# NOTE(review): urllib2 and xrange exist only on Python 2; this loop does not
# run on Python 3 as written.
# NOTE(review): this collects other people's contact details from a
# third-party service. Confirm there is authorisation and a lawful basis for
# doing so before running or extending it.
for x in xrange(1,1000):
    # The query string is URL-encoded JSON: filter={"skip":x,"type":1}.
    # presumably "skip" is a record offset, so consecutive requests may return
    # overlapping posts -- TODO confirm against the API.
    url = 'https://lkd.yooyuu.com.cn/api/posts?filter=%7B%22skip%22:'+str(x)+',%22type%22:1%7D'
    req = urllib2.Request(url, headers=headers)
    # No explicit timeout and no exception handling: any error raised by
    # urlopen propagates and ends the whole loop. The response object is not
    # closed explicitly.
    response = urllib2.urlopen(req)
    content = response.read()
    # Results are only printed; write_to_file is never called in this script.
    print(getInfoByInput(content))

filter.py

import io
def filter(infile,outfile):
    """Copy ``infile`` to ``outfile``, dropping lines that already appeared.

    Lines are compared exactly, including their line terminator, so a final
    line without a trailing newline is distinct from the same text followed
    by one.  The first occurrence of each line is kept and the original
    order is preserved.  Both files are handled as UTF-8 text and
    ``outfile`` is overwritten.

    Args:
        infile: Path of the text file to read.
        outfile: Path of the text file to write.
    """
    # A set gives constant-time membership tests; a list made this quadratic.
    seen = set()
    # The context managers close both handles even if reading or writing
    # raises part-way through.
    with io.open(infile, 'r', encoding='utf-8') as src, \
            io.open(outfile, 'w', encoding='utf-8') as dst:
        for line in src:
            if line not in seen:
                seen.add(line)
                dst.write(line)
# Script entry: de-duplicate the lines of crawl.txt into result.txt.
# NOTE(review): nothing in the crawler script shown with this file writes
# crawl.txt; presumably its printed output is redirected there -- confirm.
filter("crawl.txt","result.txt")