get_all_url

Author: 程序里的小仙女 | Published 2020-10-29 17:52
# -*- coding: utf-8 -*-
"""
 @Time     : 2020/10/29 13:47
 @Author   : LinXiao
 @Function : get the per-city page totals and build every listing-page URL
"""
# ------------------------------
# Get the total number of result pages for each city
import random
import time

import requests
from loguru import logger
from lxml import etree
from redis import Redis

from spider.alifapai_pc import broswer_head_city_frist, pagination
from tools.city_name import city_to_gb2312, hanzi_to_pinyin

# Redis db 10 holds the generated listing-page URLs, keyed by city pinyin
redis_url = Redis(db=10)

CITY_NAMES = ['成都', '西安', '重庆', '武汉', '青岛', '广州', '长沙', '兰州']


# Get the total number of result pages for one city
def get_page_total(first_page):
    time.sleep(random.random())
    ip, headers, payload, files, proxies = broswer_head_city_frist()
    sleeptime = random.randint(15, 35)
    print(f'Random sleep {sleeptime}s')
    time.sleep(sleeptime)
    print('Requesting page.......')
    try:
        start = time.time()
        response = requests.request("GET", first_page, headers=headers, data=payload,
                                    files=files, proxies=proxies, timeout=40)
        end = time.time()
        print(f'Page request success! Took {end - start:.1f}s')
    except Exception as e:
        logger.error(f'Request failed!....{e}, removing dead proxy!')
        redis_ip = Redis(db=8)
        redis_ip.lrem("proxy_ip", 0, ip)  # remove every list entry equal to this ip
        return None

    try:
        # The site serves GBK content; re-decode the body accordingly
        html_content = response.text.encode(response.encoding).decode("gbk")
        # Parse with xpath: the <em> under #J_LimitFixed holds the listing count
        tree_html = etree.HTML(html_content)
        page_total_str = tree_html.xpath('//*[@id="J_LimitFixed"]/ul/li[1]/em/text()')[0]
    except Exception:
        print('Failed to parse the listing count!')
        return None

    items_count = int(page_total_str)
    # The site shows 40 listings per page
    if items_count <= 40:
        page_total = 1
    else:
        page_total = items_count // 40 + 1
    print(f'page_total is: {page_total}')
    return page_total
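
# Added sketch (not called by the script): the arithmetic above over-counts by
# one page whenever items_count is an exact multiple of 40 (80 items -> 3
# pages instead of 2). Ceiling division avoids that edge case:
def count_pages(items_count, page_size=40):
    # -(-a // b) is ceiling division without importing math
    return max(1, -(-items_count // page_size))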





# Build the URL of every result page for one city and push them into Redis
def get_all_page(page_total, cityname):
    for page_num in range(1, int(page_total) + 1):  # page 32 is 2020-08-29 (only pre-September data is wanted)
        parm = pagination()  # e.g. spm=a213w.7398504.pagination.8.6NzcEktGwdiVP0
        pre_url = "https://sf.taobao.com/item_list.htm?"
        city_pinyin = hanzi_to_pinyin(cityname)
        city_code = city_to_gb2312(cityname)  # GB2312 percent-encoding, e.g. 成都 -> %B3%C9%B6%BC
        suffix = f"&category=50025969&auction_source=0&city={city_code}&st_param=-1&sorder=0&auction_start_seg=-1&page={page_num}"

        url = pre_url + parm + suffix
        print(url)
        redis_url.lpush(str(city_pinyin), url)
    logger.info(f"All URLs for {cityname} fetched and saved")
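
# Hypothetical consumer sketch (not part of the original post): get_all_page()
# lpush-es URLs into Redis db 10, so a downstream worker could drain them in
# FIFO order with rpop:
def consume_city_urls(city_pinyin):
    while True:
        url = redis_url.rpop(str(city_pinyin))
        if url is None:  # list exhausted
            break
        yield url.decode('utf-8')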


if __name__ == '__main__':
    pre_url = "https://sf.taobao.com/item_list.htm?"
    for cityname in CITY_NAMES:
        print(cityname)

        city_code = city_to_gb2312(cityname)
        parm = pagination()  # e.g. spm=a213w.7398504.pagination.8.6NzcEktGwdiVP0

        suffix = f'&category=50025969&auction_source=0&city={city_code}&sorder=0&st_param=-1&auction_start_seg=-1&page=1'
        url = pre_url + parm + suffix
        print(url)
        page_total = get_page_total(url)  # total number of pages of ongoing auctions for this city
        if page_total is None:  # request or parse failed, skip this city
            continue
        print(f'{cityname} has {page_total} pages of ongoing auction listings')

        # build and save every ongoing-listing page URL for this city
        get_all_page(page_total, cityname)
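
The city=%B3%C9%B6%BC value seen in the URLs above is just the city name percent-encoded as GB2312 bytes. A minimal sketch of what a helper like city_to_gb2312 presumably does (the real implementation lives in tools.city_name and is not shown in this post):

from urllib.parse import quote

def city_to_gb2312_sketch(cityname):
    # '成都'.encode('gb2312') -> b'\xb3\xc9\xb6\xbc', quoted to '%B3%C9%B6%BC'
    return quote(cityname.encode('gb2312'))

print(city_to_gb2312_sketch('成都'))  # %B3%C9%B6%BC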
