
Web scraping: saving pages locally and parsing them

Author: AlastairYuan | Published 2018-11-21 23:00

    Scraping web pages and saving them locally

    savedata_Chrome_byurl.py
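
    savedata_Chrome_byurl.py reads (url, pagecode) pairs from a MySQL table, opens each URL in Chrome via Selenium, and saves the rendered HTML under a local 10004 folder as <pagecode>.html; it then clicks the pagehero__button element and saves the resulting page as <pagecode>_nextpage.html. The parse_data_page_step1 helper (defined but not called in the main loop below) extracts each card's detail link and appends url, href and urlname rows to fix10004.csv.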


    from selenium import webdriver

    import time

    import io

    import csv

    import pymysql

    import os

    import re

    from lxml import etree

    import codecs

    def savepage(browser, filepath, pagename):

        # save the current page's rendered HTML to <filepath>/<pagename>.html

        try:

            if not os.path.exists(filepath):

                os.mkdir(filepath)

            textContent = browser.find_element_by_xpath('//html').get_attribute('outerHTML')

            pagepath = filepath + '//' + pagename + '.html'

            fp = open(pagepath, "w", encoding='UTF-8')

            fp.write(textContent)

            fp.close()

        except Exception as excpt:

            print(excpt)

    def getDbConn(db):

        isonserver = True

        osname = os.name

        if osname == 'nt':

            isonserver = False

            print('windows')

        else:

            isonserver = True

            print(os.name)

        isonserver = False  # hard-coded override: the credentials in the else branch below are always used

        if isonserver:

            host = 'localhost'

            user = 'root'

            passwd = '123456'

        else:

            host = ''

            user = ''

            passwd = ''

        # db = 'couponcategory'

        port = 3306

        conn = pymysql.connect(host=host, port=port, user=user, password=passwd, db=db)

        return conn

    def parse_data_page_step1(browser, url, urlname):

        print('doing.......')

        creditcard__items = browser.find_elements_by_xpath('//div[@class="creditcard__item"]')

        for creditcard__item in creditcard__items:

            try:

                # title = creditcard__item.find_element_by_xpath('.//h2[@class="creditcard__heading"]').get_attribute('textContent')

                article = creditcard__item.find_element_by_xpath('./article')

                href = article.find_element_by_xpath('./div[@class="compare"]').find_element_by_xpath('./div[last()]/a').get_attribute('href')

                # .get_attribute('href')

                item = {}

                item['url'] = url

                item['url2'] = href

                item['info0'] = urlname

                # item['info1'] = title

                print(urlname)

                print(url)

                print(href)

                stu1 = [url, href, urlname, '']

                out = open('fix10004.csv', 'a', newline='')

                # out = open('d:/data_source10004_v1.csv', 'a', newline='')

                # create the CSV writer (Excel dialect)

                csv_write = csv.writer(out, dialect='excel')

                # write the row

                csv_write.writerow(stu1)

                out.close()

            except Exception as aas:

                print(aas)

            # print('write item.............................................')

            # print(item)

            # dbname = 'brcardsdata'

            # dbtablename = 'data_source10004_url_v2'

            # updateToDatabase(dbname, dbtablename, item)

            # print('write item..............................................')

    def get_key_url_map(dbname, tablename):

        conn = getDbConn(dbname)

        cursor = conn.cursor()

        print("mysql connect success")

        sql = "select url,pagecode from " + tablename

        cursor.execute(sql)

        dataresult = cursor.fetchall()

        conn.close()

        return dataresult

    def scrapyStart1(browser, url, pagecode):

        # reminder: element content can be read with one of the following:

        # get_attribute('textContent')

        # get_attribute('innerHTML')

        # get_attribute('outerHTML')

        print('4')

        time.sleep(1)

        print('6')

        browser.get(url)

        print('7')

        time.sleep(5)

        print('8')

        try:

            savepage(browser, '10004', pagecode)

        except Exception as errr:

            print('........currpage....error......................')

            print(errr)

        try:

            targetElem = browser.find_element_by_xpath('//div[@class="pagehero__button"]')

            browser.execute_script("arguments[0].focus();", targetElem)

            time.sleep(0.5)

            targetElem.click()

            time.sleep(1.8)

            print('8')

            pagecode2 = pagecode + '_nextpage'

            savepage(browser, '10004', pagecode2)

        except Exception as eerr:

            print('........nextpage....error......................')

            print(eerr)

    # re.sub(r'\?.*','',url)

    browser = webdriver.Chrome()

    time.sleep(0.5)

    browser.maximize_window()

    time.sleep(1)

    key_url_map = get_key_url_map('pagedata', 'data_source10004_url')

    # key_url_map = [['https://www.foregon.com/solicitar/cartaodecredito/agillitas/fgn/cartao-pre-pago-agillitas-mundo-livre-visa/1028','1']]

    for key_url in key_url_map:

        url = key_url[0]

        pagecode = key_url[1]

        pagecode = str(pagecode)

        print(url)

        scrapyStart1(browser, url, pagecode)

    time.sleep(100)

    browser.close()
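
    Note: the find_element_by_xpath / find_elements_by_xpath helpers used above were deprecated and then removed in Selenium 4. A minimal sketch of the equivalent calls, assuming a current Selenium install:

    from selenium.webdriver.common.by import By

    # Selenium 4 equivalent of browser.find_element_by_xpath('//html')

    textContent = browser.find_element(By.XPATH, '//html').get_attribute('outerHTML')

    # Selenium 4 equivalent of browser.find_elements_by_xpath(...)

    creditcard__items = browser.find_elements(By.XPATH, '//div[@class="creditcard__item"]')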

    parsepagedata.py
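
    parsepagedata.py re-opens the HTML files saved under the 10004 folder, parses them with lxml/XPath, strips attributes, images, links and buttons from the extracted fragments via trimDataHtmlProAndImg, and collects the fields into an item dict; the updateToDatabase call that would write info1/info5/info6/info7/info8/info10 back to MySQL is left commented out.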


    from selenium import webdriver

    import time

    import io

    import csv

    import pymysql

    import os

    import re

    from lxml import etree

    from bs4 import BeautifulSoup

    import numpy as np

    import codecs

    def etreeWebElemToOuterHtml(webitem):

        outerHtml = etree.tostring(webitem)

        outerHtml = outerHtml.decode('utf-8')

        return outerHtml

    def trimDataHtmlProAndImg(divstr):

        divstr = re.sub(r' href=".*?"', "", divstr)

        divstr = re.sub(r' class=".*?"', "", divstr)

        divstr = re.sub(r' target=".*?"', "", divstr)

        divstr = re.sub(r' align=".*?"', "", divstr)

        divstr = re.sub(r' rel=".*?"', "", divstr)

        divstr = re.sub(r'<img.*?>', "", divstr)

        divstr = re.sub(r' data-cfemail=".*?"', "", divstr)

        divstr = re.sub(r' id=".*?"', "", divstr)

        divstr = re.sub(r' name=".*?"', "", divstr)

        divstr = re.sub(r' style=".*?"', "", divstr)

        divstr = re.sub(r' src=".*?"', "", divstr)

        divstr = re.sub(r' dir=".*?"', "", divstr)

        divstr = re.sub(r'<div .*?>', "<p>", divstr)

        divstr = re.sub(r'<strong .*?>', "<p>", divstr)

        divstr = re.sub(r'<a .*?</a>', "", divstr)

        divstr = re.sub(r'<p .*?>', "<p>", divstr)

        divstr = re.sub(r'<button .*?</button>', "", divstr)

        divstr = divstr.replace('<div>', '<p>')

        divstr = divstr.replace('<strong>', '<p>')

        divstr = divstr.replace('</div>', '</p>')

        divstr = divstr.replace('</strong>', '</p>')

        return divstr

    def loadpage(filepath, pagename):

        try:

            pagepath = filepath + '//' + pagename + '.html'

            htmlf = open(pagepath,'r',encoding="utf-8")

            htmlContent = htmlf.read()

            return htmlContent

        except Exception as excpt:

            print(excpt)

        return ''

    def parseWithBeautifulSoup(htmlContent):

        soup = BeautifulSoup(htmlContent, 'lxml')

        mululist = soup.find_all(class_='mulu')

        return mululist

    def parseWithXpath(htmlContent):

        html = etree.HTML(htmlContent)

        mululist = html.xpath('.//*[@class="mulu"]')

        return mululist

    def getDbConn(db):

        isonserver = True

        osname = os.name

        if osname == 'nt':

            isonserver = False

            print('windows')

        else:

            isonserver = True

            print(os.name)

        isonserver = False  # hard-coded override: the credentials in the else branch below are always used

        if isonserver:

            host = 'localhost'

            user = 'root'

            passwd = '123456'

        else:

            host = ''

            user = ''

            passwd = ''

        port = 3306

        conn = pymysql.connect(host=host, port=port, user=user, password=passwd, db=db)

        return conn

    def updateToDatabase(dbname, tablename, item):

        url2 = item['url2']

        updatevalue = {'url2': url2}

        setsqllist = []

        collist = ['info0', 'info1', 'info2', 'info3', 'info4', 'info5', 'info6', 'info7', 'info8', 'info9', 'info10', 'url']

        for idx in range(len(collist)):

            colname = collist[idx]

            if colname in item:

                if item[colname]:

                    updatevalue[colname] = item[colname]

                    setsqllist.append(colname + '=%(' + colname + ')s')

        setsqllistlen = len(setsqllist)

        if setsqllistlen > 0:

            updatesql = 'update ' + tablename + ' set '

            setsqlliststr = ','.join(setsqllist)

            wherestr = ' where url2=%(url2)s'

            updatesql = updatesql + setsqlliststr + wherestr

            print(updatesql)

            # print(updatevalue)

            conn = getDbConn(dbname)

            cursor = conn.cursor()

            try:

                cursor.execute(updatesql, updatevalue)

            except Exception as e:

                print('Insert Error1', e)

                conn.rollback()

            else:

                conn.commit()

            conn.close()

    def parse_data_page_step1(htmlContent, pageid):

        print('doing.......')

        html = etree.HTML(htmlContent)

        divcon = html.xpath('//div[@class="pagehero__content"]')[0]

        str1 = divcon.xpath('./div[@class="pagehero__wrapper"]/h1[@class="pagehero__heading"]')[0].text

        str2 = divcon.xpath('./div[@class="pagehero__wrapper"]/strong[@class="pagehero__description"]')[0].text

        item = {}

        item['url2'] = url

        item['info1'] = str1

        item['info8'] = str2

        print('write item.............................................')

        print(item)

        # dbname = 'brcardsdata'

        # dbtablename = 'data_source10004_url'

        # updateToDatabase(dbname, dbtablename, item)

        print('write item..............................................')

    def parse_data_page_step2(htmlContent, pageid):

        print('doing.......')

        html = etree.HTML(htmlContent)

        itemlist = html.xpath('//div[@class="box--list"]/div[@class="box--list-item"]')

        info5 = ''

        info6 = ''

        info7 = ''

        info10 = ''

        for item in itemlist:

            itemcon = item.xpath('./div[@class="box--container"]')[0]

            str1 = itemcon.xpath('./div[@class="box--header"]/h3')[0].text

            print(str1)

            itemconbody = itemcon.xpath('./div[@class="box--body"]')[0]

            str1 = str1.lower()

            str1 = str1.strip()

            # print(str1)

            if str1 == 'online':

                str2item = itemconbody.xpath('./div[contains(@class,"notsignedin")]')[0]

                str2 = etreeWebElemToOuterHtml(str2item)

                # print(str2)

                str2 = trimDataHtmlProAndImg(str2)

                str2 = str2.replace('<a></a>', '')

                info5 = '<p>' + str2 + '</p>'

                print('info5')

                print(info5)

            if str1 == 'no local':

                str2item = itemconbody

                str2 = etreeWebElemToOuterHtml(str2item)

                str2 = trimDataHtmlProAndImg(str2)

                info6 = '<p>' + str2 + '</p>'

            if str1 == 'por telefone':

                str2item = itemconbody

                str2 = etreeWebElemToOuterHtml(str2item)

                str2 = trimDataHtmlProAndImg(str2)

                info7 = '<p>' + str2 + '</p>'

            if str1 == 'online':

                try:

                    info10 = itemconbody.xpath('./div[contains(@class,"notsignedin")]')[0].attrib['data-redirect']  # application link

                except Exception as exx:

                    print('....................errr1.......................')

                    print(exx)

                    try:

                        info10 = itemconbody.xpath('./div[contains(@class,"notsignedin")]/button')[0].attrib['data-redirect']  # application link

                    except Exception as exx:

                        print('....................errr2.......................')

                        print(exx)

                info10 = 'https://www.foregon.com' + info10

        item = {}

        item['url2'] = url

        item['info5'] = info5

        item['info6'] = info6

        item['info7'] = info7

        item['info10'] = info10

        print('write item.............................................')

        print(item)

        # dbname = 'brcardsdata'

        # dbtablename = 'data_source10004_url'

        # updateToDatabase(dbname, dbtablename, item)

        print('write item.................................................')

    def get_key_url_map(dbname, tablename):

        conn = getDbConn(dbname)

        cursor = conn.cursor()

        print("mysql connect success")

        sql = "select url,pagecode from " + tablename

        cursor.execute(sql)

        dataresult = cursor.fetchall()

        conn.close()

        return dataresult

    def scrapyStart1(url, pagecode):

        htmlContent = loadpage('10004', pagecode)

        parse_data_page_step1(htmlContent, pagecode)

        pagecode2 = pagecode + '_nextpage'

        htmlContent = loadpage('10004', pagecode2)

        parse_data_page_step2(htmlContent, pagecode2)

    # key_url_map = get_key_url_map('pagedata', 'data_source10004_url')

    key_url_map = [['https://www.foregon.com/solicitar/cartaodecredito/agillitas/fgn/cartao-pre-pago-agillitas-mundo-livre-visa/1028','1']]

    for key_url in key_url_map:

        url = key_url[0]

        pagecode = key_url[1]

        pagecode = str(pagecode)

        print(url)

        scrapyStart1(url, pagecode)
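
    A minimal sketch of what trimDataHtmlProAndImg does to one saved fragment (the sample HTML below is made up for illustration, not taken from the site):

    sample = '<div class="box--body"><img src="x.png"><strong>Taxa</strong> R$ 0 <a href="/x" class="link">mais</a></div>'

    print(trimDataHtmlProAndImg(sample))

    # prints: <p><p>Taxa</p> R$ 0 <a>mais</a></p>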
