A certain website's forum has a large number of users who have left their phone numbers; a simple crawler is enough to collect them.
⚠️ Friendly reminder: personal information needs to be kept private, otherwise it is easily harvested by bad actors.
crawler.py
import re
import requests

def getInfoByInput(text):
    # Pull email addresses and Chinese mobile numbers out of a blob of text.
    regex_email = re.compile(r"\b[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,4}\b", re.IGNORECASE)
    regex_phone = re.compile(r"\b1[34578]\d{9}\b")
    result = {}
    result['email'] = regex_email.findall(text)
    result['phone'] = regex_phone.findall(text)
    return result

def write_to_file(out_file_path, content):
    # Append extracted results to a local file; `content` is a list of lines.
    with open(out_file_path, 'a') as f:
        f.writelines(content)

headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.103 Safari/537.36",
    "authorization": "d8E9N3o5dlkmz1AOEt2jSTqQRMlRp8nTrm3tBpXeDQenRIQXgzze4yI4f9lDUwSF",
}

# Walk the paginated post API and print any emails/phone numbers found on each page.
for x in range(1, 1000):
    url = 'https://lkd.yooyuu.com.cn/api/posts?filter=%7B%22skip%22:' + str(x) + ',%22type%22:1%7D'
    response = requests.get(url, headers=headers)
    print(getInfoByInput(response.text))
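crawler.py as posted only prints its matches, while filter.py below reads them from crawl.txt; the original does not show how that file gets written. A minimal sketch of the missing step, reusing getInfoByInput, write_to_file and headers from above (the crawl.txt name is assumed from the filter("crawl.txt", "result.txt") call further down):

# Glue step (an assumption, not part of the original post): persist each page's
# phone matches to crawl.txt, the file that filter.py below deduplicates.
for x in range(1, 1000):
    url = 'https://lkd.yooyuu.com.cn/api/posts?filter=%7B%22skip%22:' + str(x) + ',%22type%22:1%7D'
    info = getInfoByInput(requests.get(url, headers=headers).text)
    write_to_file('crawl.txt', [phone + '\n' for phone in info['phone']])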
filter.py
import io

def filter(infile, outfile):
    # Copy infile to outfile, keeping only the first occurrence of each line
    # (deduplicates the numbers collected by the crawler).
    infopen = io.open(infile, 'r', encoding='utf-8')
    outopen = io.open(outfile, 'w', encoding='utf-8')
    lines = infopen.readlines()
    seen = []
    for line in lines:
        if line not in seen:
            seen.append(line)
            outopen.write(line)
    infopen.close()
    outopen.close()

filter("crawl.txt", "result.txt")