美文网首页
Python文轩网爬虫

Python文轩网爬虫

作者: peng_js | 来源:发表于2017-07-02 23:07 被阅读0次

    # encoding=utf8

    import pymysql
    import time
    import sys
    import requests
    import os

    # 捕获错误

    import traceback
    import types

    # 将html实体化

    import cgi
    import warnings
    # Python 2 only: force the interpreter-wide default string encoding to
    # UTF-8 so implicit str <-> unicode conversions in the scraper below use
    # UTF-8. reload(sys) is called first because, on Python 2, the
    # setdefaultencoding attribute is normally not available on the
    # already-imported sys module (see the Python 2 `sys` documentation).
    reload(sys)
    sys.setdefaultencoding('utf-8')
    # Third-party HTML parsing helpers. Only `pq` is referenced by the
    # functions visible in this file.
    from pyquery import PyQuery as pq
    from lxml import etree
    # NOTE(review): this repeats the call made above with the same argument.
    sys.setdefaultencoding('utf-8')

    # 屏蔽错误

    warnings.filterwarnings("ignore")

    # 下载图片

    def dowloadPic(imageUrl,filePath):
        """Fetch ``imageUrl`` and store the response body at ``filePath``.

        The request is made with a 60-second timeout. When the server answers
        with HTTP 404 nothing is written and the integer ``404`` is returned.
        For every other status code the raw response body is written to
        ``filePath`` in binary mode and the function returns ``None``.
        """
        response = requests.get(imageUrl, timeout=60)
        if response.status_code != 404:
            with open(filePath, "wb") as out_file:
                out_file.write(response.content)
            return None
        return 404

    # 根据详情页地址抓取数据并插入数据库

    def getData(final_url):
        """Scrape one book detail page and insert it into the ``bi_book`` table.

        Steps: record ``final_url`` in ``./url.txt``, open a MySQL connection,
        parse the page with PyQuery, skip the book if its ISBN is already
        stored, download the cover images, then insert one row.

        Args:
            final_url: URL of the detail page to scrape.

        Returns:
            ``'back'`` when the page is skipped or the insert fails (page could
            not be loaded, blacklisted ``xmlns`` value, no ISBN, ISBN already
            present, no cover ``<img>``, no breadcrumb, DB error). ``None``
            after the insert has been committed.

        Raises:
            Exceptions from the DB connection or image download are not caught
            here and propagate to the caller.
        """
        # Remember which URL is being processed (the file is overwritten on
        # every call).
        with open('./url.txt', 'w') as file_open:
            file_open.write(final_url)

        conn = pymysql.connect(host='127.0.0.1', port=3306, user='root', passwd='root', db='bookinfo', charset='utf8')
        try:
            cursor = conn.cursor(cursor=pymysql.cursors.DictCursor)

            # Load and parse the detail page; any failure means "skip".
            try:
                c = pq(final_url)
                head = c('html').attr('xmlns')
            except Exception:
                return 'back'
            # NOTE(review): pages whose <html xmlns> equals one of these two
            # values are skipped -- presumably they are not regular book
            # pages; confirm against the site.
            if head in ('http://www.w3.org/1999/xhtml',
                        'http://www.winxuan.com/cms/2016db_sh'):
                return 'back'

            info = _parseAttributes(c)
            isbn = info['isbn']
            if not isbn:
                # Without an ISBN there is neither a duplicate key nor an
                # image directory name; skip instead of failing later.
                return 'back'
            if _isbnExists(cursor, isbn):
                print(u'已存在')
                return 'back'

            # Cover image: the page shows the large image; the small variant
            # is derived by swapping the '_16' marker for '_11'.
            none_img = 'http://static.winxuancdn.com/goods/sml_blank.jpg'
            big_path = c('.info-side').find('.img').find('a').find('img').attr('src')
            if big_path is None:
                return 'back'
            if big_path == none_img:
                big_path = ''
                small_path = ''
            else:
                small_path = big_path.replace('_16', '_11')

            # Category: text of the last link in the first breadcrumb block.
            ahtml = c('#page').find('.base-nav').eq(0).html()
            if ahtml is None:
                return 'back'
            category = pq(ahtml)('a:last').text()

            name = c('.info-main').find('.name').eq(0).find('h1').eq(0).text().strip()
            price = c('.info-main').find('.attr').eq(0).find('.price-n').eq(0).find('b').text()
            price = price.replace('¥', '')

            content, directory = _parseSections(c)
            details = '内容简介<br>' + content + '<br><br>目录<br>' + directory
            details = cgi.escape(details)

            add_time = time.strftime('%Y-%m-%d')
            img_big, img_small = _saveImages(isbn, big_path, small_path)
            source_type = 3

            try:
                # NOTE(review): book_id is sent as 0 -- presumably so MySQL
                # assigns the auto-increment value; confirm against the schema.
                li = [0, source_type, category, details, final_url, price, add_time,
                      info['packaging'], info['print_time'], info['impressions'], name,
                      info['author'], info['press_name'], isbn, info['edition'],
                      info['size'], info['press_time'], info['page_num'],
                      info['word_num'], img_big, img_small]
                sql = "insert into bi_book (book_id,source_type,category,details,detail_url,price,add_time,packaging,print_time,impressions,name,author,press_name,isbn,edition,size,press_time,page_num,word_num,img_big,img_small) values(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)"
                inserted = cursor.execute(sql, li)
                if inserted == 1:
                    print(u'插入成功')
                conn.commit()
            except Exception:
                # Report the failure instead of hiding it, and discard the
                # half-done transaction before giving up on this book.
                traceback.print_exc()
                conn.rollback()
                return 'back'
        finally:
            # Every exit path (early 'back' returns included) releases the
            # connection.
            conn.close()


    def _isbnExists(cursor, isbn):
        """Return True when ``bi_book`` already contains a row with ``isbn``.

        The value comes from scraped HTML, so it is passed as a bound
        parameter rather than concatenated into the SQL text.
        """
        cursor.execute('select count(*) as cnt from bi_book where isbn = %s', (isbn,))
        return cursor.fetchone()['cnt'] != 0


    def _parseAttributes(c):
        """Collect the book attributes listed under ``#page .cont li``.

        Only the first 12 list items are inspected. Returns a dict holding
        ``isbn`` (``None`` when absent), the free-text fields (default ``''``)
        and ``press_time`` / ``print_time`` (default ``'1970-01-01'``, also
        used when the page shows '无').
        """
        info = {
            'isbn': None,
            'author': '',
            'press_name': '',
            'edition': '',
            'impressions': '',
            'packaging': '',
            'size': '',
            'page_num': '',
            'word_num': '',
            'press_time': '1970-01-01',
            'print_time': '1970-01-01',
        }
        text_labels = (
            ('author', '作者:'),
            ('press_name', '出版社:'),
            ('edition', '版次:'),
            ('impressions', '印次:'),
            ('packaging', '装帧:'),
            ('size', '开本:'),
            ('page_num', '页数:'),
            ('word_num', '字数:'),
        )
        date_labels = (
            ('press_time', '出版时间:'),
            ('print_time', '印刷时间:'),
        )
        items = c('#page').find('.cont').find('li')
        for i in range(12):
            text = items.eq(i).text()
            text = text.replace(' ', '')
            # NOTE(review): with plain spaces stripped above, the spaced
            # 'I S B N' label can only match if the page uses a different
            # whitespace character; kept as in the original -- confirm.
            if 'I S B N' in text:
                info['isbn'] = text.replace('I S B N:', '').strip()
            if 'isbn:' in text:
                info['isbn'] = text.replace('isbn:', '').strip()
            for key, label in text_labels:
                if label in text:
                    info[key] = text.replace(label, '')
            for key, label in date_labels:
                if label in text:
                    value = text.replace(label, '').strip()
                    info[key] = '1970-01-01' if value == '无' else value
        return info


    def _parseSections(c):
        """Return ``(content, directory)`` HTML of the summary and contents tabs.

        Looks at the ``.title`` elements with index 5..11 under ``#page``; a
        section that is missing, or whose body is empty, yields ``''``.
        """
        content = ''
        directory = ''
        titles = c('#page').find('.title')
        for k in range(5, 12):
            block = titles.eq(k)
            title = block.find('.tab').find('h4').text()
            if '内容简介' in title:
                html = pq(block.nextAll())('.text-words-1').html()
                if html is not None:
                    content = html.encode("utf8", "ignore")
            if '目录' in title:
                html = pq(block.nextAll())('.text-words-1').html()
                if html is not None:
                    directory = html.encode("utf8", "ignore")
        return content, directory


    def _saveImages(isbn, big_path, small_path):
        """Download the cover images and return ``(img_big, img_small)``.

        The returned values are the relative paths stored in the database.
        Placeholder paths are returned when the book has no cover or a
        download answers 404. When the per-ISBN directory already exists
        nothing is downloaded and the regular paths are returned.
        """
        if big_path == '' or small_path == '':
            return 'none-picture/none-big.jpg', 'none-picture/none-small.jpg'

        # Images live in <script dir>/download/<isbn>/.
        root_path = sys.path[0].replace('\\', '/')
        isbn_path = root_path + '/download/' + isbn
        img_small = 'download/' + isbn + '/small' + isbn + '.jpg'
        img_big = 'download/' + isbn + '/big' + isbn + '.jpg'
        if os.path.isdir(isbn_path):
            return img_big, img_small

        # makedirs also creates the parent 'download' directory on first use.
        os.makedirs(isbn_path)
        small_res = dowloadPic(small_path, isbn_path + "/small" + isbn + ".jpg")
        big_res = dowloadPic(big_path, isbn_path + '/big' + isbn + ".jpg")
        if small_res == 404:
            img_small = 'none-picture/none-small.jpg'
        if big_res == 404:
            img_big = 'none-picture/none-big.jpg'
        return img_big, img_small
    

    def winxuan(n):
        """Crawl the category behind the ``n``-th main-menu link of winxuan.com.

        Loads the home page, follows the ``n``-th link found under
        ``.mod-mainmenu dd a``, collects the ``href`` of up to the first 248
        links under ``.main`` on that category page, and calls ``getData`` on
        each distinct URL.

        Args:
            n: Zero-based index of the main-menu link to follow.

        Returns:
            ``'backs'`` when the category link is missing or its page cannot be
            loaded; otherwise ``None``.
        """
        home_url = 'http://www.winxuan.com/'
        h = pq(home_url)
        menu = h('.mod-mainmenu').find('dd').find('a').eq(n).attr('href')
        if menu is None:
            return 'backs'
        try:
            mh = pq(menu)
        except Exception:
            return 'backs'

        # Gather distinct detail URLs. Indexes past the last link yield None,
        # which is dropped here rather than being handed to getData.
        anchors = mh('.main').find('a')
        seen = set()
        detail_urls = []
        for u in range(248):
            href = anchors.eq(u).attr('href')
            if href is None or href in seen:
                continue
            seen.add(href)
            detail_urls.append(href)

        for final_url in detail_urls:
            # Best effort: one failing page must not stop the whole category,
            # but the error is reported so it can be diagnosed.
            try:
                getData(final_url)
            except Exception:
                traceback.print_exc()
                continue
        # NOTE(review): the original indentation was lost; this message is
        # assumed to be printed once per category, after the loop -- confirm.
        print('OK,finished')

    # Driver: walk main-menu categories 0..57 in order.
    n=0
    while n<58:
        print(n)
        # Write the index about to be processed to ./number.txt (overwritten
        # each round) -- presumably a checkpoint for manual resume; confirm.
        with open('./number.txt', 'w') as file_open:
            file_open.write(str(n))
        # winxuan returns 'backs' when a category cannot be loaded; either
        # way the loop simply moves on to the next index.
        winxuan(n)
        n+=1

    相关文章

      网友评论

          本文标题:Python文轩网爬虫

          本文链接:https://www.haomeiwen.com/subject/npiocxtx.html