import re
# 正则表达式模块,用来匹配图片地址
import urllib.request
# 用来获取HTML源码
import sys
import os
import re
def geturls(path):
    """Read the list of page URLs stored in the text file at ``path``.

    The first three characters of every line are dropped before the rest
    is treated as a URL (presumably a list marker in front of each URL --
    confirm against the input file).  Surrounding whitespace, including
    the trailing newline, is stripped, and lines with nothing left after
    that are skipped so no empty URL is handed to the downloader.

    :param path: path of a UTF-8 encoded text file, one entry per line
    :return: list of URL strings, in file order
    """
    urls = []
    with open(path, 'r', encoding='utf-8') as f1:
        for line in f1:
            # Drop the fixed-width prefix first, then tidy what remains.
            url = line[3:].strip()
            if url:
                urls.append(url)
    print('get urls list ready!')
    return urls
def getHtml(urls):
    """Lazily download the documents behind ``urls``.

    This is a generator: a URL is only fetched when the consumer asks for
    the next item.  The zero-based index of each URL is printed before it
    is fetched, and a final message is printed once all URLs are done.

    :param urls: iterable of URL strings accepted by ``urllib.request.urlopen``
    :return: generator yielding the raw response body (``bytes``) of each
        URL, in input order
    """
    for num, url in enumerate(urls):
        print(num)
        # Close the response as soon as the body is read so the connection
        # is not held open while the consumer processes the page.
        with urllib.request.urlopen(url) as page:
            html = page.read()
        yield html
    print('get htmls list ready!')
def getpic(htmls, dir):
    """Download every matching JPEG referenced by the given pages.

    Each page is decoded as UTF-8 and searched for
    ``src="https://imgsa....jpg"`` attributes.  Every match is downloaded
    and stored as ``<dir>/<n>.jpg``, where ``n`` is a counter that runs
    across all pages.  The list of matches for each page is printed,
    followed by the URL of every image that was saved or an ``error``
    line for every image that could not be saved.

    :param htmls: iterable (for example a generator) of page bodies as
        ``bytes``
    :param dir: output directory; created when it does not exist
    :return: None
    """
    os.makedirs(dir, exist_ok=True)
    # Compile once; the pattern is the same for every page.
    imgre = re.compile(r'src="(https://imgsa.*?\.jpg)"')
    imgName = 0
    # A plain for-loop visits every page exactly once and ends cleanly
    # when the iterator is exhausted.
    for html in htmls:
        imList = imgre.findall(html.decode('utf-8'))
        print(imList)
        # Download the images.
        # NOTE: downloads run sequentially; a thread pool would be faster.
        for imgPath in imList:
            try:
                # Fetch first, so a failed download leaves no empty file.
                with urllib.request.urlopen(imgPath) as response:
                    data = response.read()
                with open(os.path.join(dir, str(imgName) + ".jpg"), 'wb') as f:
                    f.write(data)
                print(imgPath)
            except Exception:
                # Best effort: one broken image must not stop the run.
                print(imgPath + " error")
            # The counter advances on failure too, so file numbers always
            # match the position of the image in the overall sequence.
            imgName += 1
def main():
    """Run the pipeline: read the URL list, fetch the pages, save the images.

    The URL list is read from ``_防诈骗.txt`` and the images are written
    to the ``pic`` directory, both relative to the working directory.
    """
    page_stream = getHtml(geturls('_防诈骗.txt'))
    getpic(page_stream, 'pic')
# Run the scraper only when this file is executed as a script, not when it
# is imported as a module.
if __name__ == '__main__':
    main()
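# 网友评论 ("Reader comments") -- stray text from the web page this script was copied from; not part of the program.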