"""Download the images listed on paginated sc.chinaz.com gallery pages.

The script asks for a start and end page number, fetches each listing page,
extracts image URLs and titles with XPath, and saves every image into a
local directory.
"""
import os
import urllib.request

from lxml import etree


def download_img(src_list, name_list, dir_path="./bf-girl"):
    """Download each image URL to ``dir_path`` under its matching name.

    Args:
        src_list: Image URLs.
        name_list: Base file names (without extension), paired by position
            with ``src_list``.
        dir_path: Target directory; created if it does not exist.

    The two lists are paired with ``zip``, so surplus entries in the longer
    list are ignored. A failed download is reported on stdout and does not
    stop the remaining downloads.
    """
    # urlretrieve opens the target file directly and does not create missing
    # parent directories, so make sure the directory exists first.
    os.makedirs(dir_path, exist_ok=True)

    for src, name in zip(src_list, name_list):
        # Reuse the extension found in the URL.
        suffix = os.path.splitext(src)[-1]
        # The name comes from scraped page content: neutralise path
        # separators so it cannot point outside dir_path.
        safe_name = name.replace("/", "_").replace("\\", "_")
        file_path = os.path.join(dir_path, safe_name + suffix)
        try:
            urllib.request.urlretrieve(src, file_path)
        except Exception as exc:
            # Best effort: report the failure (with its reason) and move on.
            print("%s---file missing!" % file_path, exc)
        else:
            print("%s---download finsh" % file_path)


def get_data(req):
    """Fetch one listing page, extract image URLs and titles, download them.

    Args:
        req: A URL string or ``urllib.request.Request`` accepted by
            ``urllib.request.urlopen``.

    The response body is decoded as UTF-8. Image URLs are read from the
    ``src2`` attribute and names from the ``alt`` attribute of the ``<img>``
    elements under ``div#container``.
    """
    # The context manager closes the HTTP response even if read/decode fails.
    with urllib.request.urlopen(req) as res:
        html = res.read().decode("utf-8")

    html_etree = etree.HTML(html)
    src_list = html_etree.xpath("//div[@id='container']/div/div/a/img/@src2")
    name_list = html_etree.xpath("//div[@id='container']/div/div/a/img/@alt")
    download_img(src_list, name_list)


def build_req(url):
    """Return a ``urllib.request.Request`` for ``url`` with a browser User-Agent."""
    headers = {
        'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36'
    }
    return urllib.request.Request(url=url, headers=headers)


def main():
    """Prompt for a page range and download the images of every page in it."""
    start_page = int(input("请输入起始页码:"))
    end_page = int(input("请输入结束页码:"))
    # URL stem shared by all listing pages.
    tem_url = "http://sc.chinaz.com/tupian/rentixiezhen"
    for page in range(start_page, end_page + 1):
        # Page 1 has no numeric suffix; later pages use "_<n>.html".
        if page != 1:
            url = tem_url + "_" + str(page) + ".html"
        else:
            url = tem_url + ".html"
        get_data(build_req(url))


if __name__ == "__main__":
    main()
# 网友评论 ("Reader comments" heading left over from the source web page; not part of the script.)