
Web scraping: saving pages locally and parsing them

Author: AlastairYuan | Published 2018-11-21 23:00

    Scraping web pages and saving them locally

    savedata_Chrome_byurl.py
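
    savedata_Chrome_byurl.py reads (url, pagecode) pairs from a MySQL table, opens each URL in Chrome via Selenium, and saves the rendered HTML under a local 10004 folder as <pagecode>.html; it then clicks the pagehero__button element and saves the resulting page as <pagecode>_nextpage.html. The parse_data_page_step1 helper (defined but not called in the main loop below) extracts each card's detail link and appends url, href and urlname rows to fix10004.csv.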


    from selenium import webdriver

    import time

    import io

    import csv

    import pymysql

    import os

    import re

    from lxml import etree

    import codecs

    def savepage(browser, filepath, pagename):

        # save the current page's rendered HTML to <filepath>/<pagename>.html

        try:

            if not os.path.exists(filepath):

                os.mkdir(filepath)

            textContent = browser.find_element_by_xpath('//html').get_attribute('outerHTML')

            pagepath = filepath + '//' + pagename + '.html'

            fp = open(pagepath, "w", encoding='UTF-8')

            fp.write(textContent)

            fp.close()

        except Exception as excpt:

            print(excpt)

    def getDbConn(db):

        isonserver = True

        osname = os.name

        if osname == 'nt':

            isonserver = False

            print('windows')

        else:

            isonserver = True

            print(os.name)

        isonserver = False  # hard-coded override: the credentials in the else branch below are always used

        if isonserver:

            host = 'localhost'

            user = 'root'

            passwd = '123456'

        else:

            host = ''

            user = ''

            passwd = ''

        # db = 'couponcategory'

        port = 3306

        conn = pymysql.connect(host=host, port=port, user=user, password=passwd, db=db)

        return conn

    def parse_data_page_step1(browser, url, urlname):

        print('doing.......')

        creditcard__items = browser.find_elements_by_xpath('//div[@class="creditcard__item"]')

        for creditcard__item in creditcard__items:

            try:

                # title = creditcard__item.find_element_by_xpath('.//h2[@class="creditcard__heading"]').get_attribute('textContent')

                article = creditcard__item.find_element_by_xpath('./article')

                href = article.find_element_by_xpath('./div[@class="compare"]').find_element_by_xpath('./div[last()]/a').get_attribute('href')

                # .get_attribute('href')

                item = {}

                item['url'] = url

                item['url2'] = href

                item['info0'] = urlname

                # item['info1'] = title

                print(urlname)

                print(url)

                print(href)

                stu1 = [url, href, urlname, '']

                out = open('fix10004.csv', 'a', newline='')

                # out = open('d:/data_source10004_v1.csv', 'a', newline='')

                # create the CSV writer (Excel dialect)

                csv_write = csv.writer(out, dialect='excel')

                # write the row

                csv_write.writerow(stu1)

                out.close()

            except Exception as aas:

                print(aas)

            # print('write item.............................................')

            # print(item)

            # dbname = 'brcardsdata'

            # dbtablename = 'data_source10004_url_v2'

            # updateToDatabase(dbname, dbtablename, item)

            # print('write item..............................................')

    def get_key_url_map(dbname, tablename):

        conn = getDbConn(dbname)

        cursor = conn.cursor()

        print("mysql connect success")

        sql = "select url,pagecode from " + tablename

        cursor.execute(sql)

        dataresult = cursor.fetchall()

        conn.close()

        return dataresult

    def scrapyStart1(browser, url, pagecode):

        # reminder: element content can be read with one of the following:

        # get_attribute('textContent')

        # get_attribute('innerHTML')

        # get_attribute('outerHTML')

        print('4')

        time.sleep(1)

        print('6')

        browser.get(url)

        print('7')

        time.sleep(5)

        print('8')

        try:

            savepage(browser, '10004', pagecode)

        except Exception as errr:

            print('........currpage....error......................')

            print(errr)

        try:

            targetElem = browser.find_element_by_xpath('//div[@class="pagehero__button"]')

            browser.execute_script("arguments[0].focus();", targetElem)

            time.sleep(0.5)

            targetElem.click()

            time.sleep(1.8)

            print('8')

            pagecode2 = pagecode + '_nextpage'

            savepage(browser, '10004', pagecode2)

        except Exception as eerr:

            print('........nextpage....error......................')

            print(eerr)

    # re.sub(r'\?.*','',url)

    browser = webdriver.Chrome()

    time.sleep(0.5)

    browser.maximize_window()

    time.sleep(1)

    key_url_map = get_key_url_map('pagedata', 'data_source10004_url')

    # key_url_map = [['https://www.foregon.com/solicitar/cartaodecredito/agillitas/fgn/cartao-pre-pago-agillitas-mundo-livre-visa/1028','1']]

    for key_url in key_url_map:

        url = key_url[0]

        pagecode = key_url[1]

        pagecode = str(pagecode)

        print(url)

        scrapyStart1(browser, url, pagecode)

    time.sleep(100)

    browser.close()
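
    Note: the find_element_by_xpath / find_elements_by_xpath helpers used above were deprecated and then removed in Selenium 4. A minimal sketch of the equivalent calls, assuming a current Selenium install:

    from selenium.webdriver.common.by import By

    # Selenium 4 equivalent of browser.find_element_by_xpath('//html')

    textContent = browser.find_element(By.XPATH, '//html').get_attribute('outerHTML')

    # Selenium 4 equivalent of browser.find_elements_by_xpath(...)

    creditcard__items = browser.find_elements(By.XPATH, '//div[@class="creditcard__item"]')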

    parsepagedata.py
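
    parsepagedata.py re-opens the HTML files saved under the 10004 folder, parses them with lxml/XPath, strips attributes, images, links and buttons from the extracted fragments via trimDataHtmlProAndImg, and collects the fields into an item dict; the updateToDatabase call that would write info1/info5/info6/info7/info8/info10 back to MySQL is left commented out.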


    from selenium import webdriver

    import time

    import io

    import csv

    import pymysql

    import os

    import re

    from lxml import etree

    from bs4 import BeautifulSoup

    import numpy as np

    import codecs

    def etreeWebElemToOuterHtml(webitem):

        outerHtml = etree.tostring(webitem)

        outerHtml = outerHtml.decode('utf-8')

        return outerHtml

    def trimDataHtmlProAndImg(divstr):

        divstr = re.sub(r' href=".*?"', "", divstr)

        divstr = re.sub(r' class=".*?"', "", divstr)

        divstr = re.sub(r' target=".*?"', "", divstr)

        divstr = re.sub(r' align=".*?"', "", divstr)

        divstr = re.sub(r' rel=".*?"', "", divstr)

        divstr = re.sub(r'<img.*?>', "", divstr)

        divstr = re.sub(r' data-cfemail=".*?"', "", divstr)

        divstr = re.sub(r' id=".*?"', "", divstr)

        divstr = re.sub(r' name=".*?"', "", divstr)

        divstr = re.sub(r' style=".*?"', "", divstr)

        divstr = re.sub(r' src=".*?"', "", divstr)

        divstr = re.sub(r' dir=".*?"', "", divstr)

        divstr = re.sub(r'<div .*?>', "<p>", divstr)

        divstr = re.sub(r'<strong .*?>', "<p>", divstr)

        divstr = re.sub(r'<a .*?</a>', "", divstr)

        divstr = re.sub(r'<p .*?>', "<p>", divstr)

        divstr = re.sub(r'<button .*?</button>', "", divstr)

        divstr = divstr.replace('<div>', '<p>')

        divstr = divstr.replace('<strong>', '<p>')

        divstr = divstr.replace('</div>', '</p>')

        divstr = divstr.replace('</strong>', '</p>')

        return divstr

    def loadpage(filepath, pagename):

        try:

            pagepath = filepath + '//' + pagename + '.html'

            htmlf = open(pagepath,'r',encoding="utf-8")

            htmlContent = htmlf.read()

            return htmlContent

        except Exception as excpt:

            print(excpt)

        return ''

    def parseWithBeautifulSoup(htmlContent):

        soup = BeautifulSoup(htmlContent, 'lxml')

        mululist = soup.find_all(class_='mulu')

        return mululist

    def parseWithXpath(htmlContent):

        html = etree.HTML(htmlContent)

        mululist = html.xpath('.//*[@class="mulu"]')

        return mululist

    def getDbConn(db):

        isonserver = True

        osname = os.name

        if osname == 'nt':

            isonserver = False

            print('windows')

        else:

            isonserver = True

            print(os.name)

        isonserver = False  # hard-coded override: the credentials in the else branch below are always used

        if isonserver:

            host = 'localhost'

            user = 'root'

            passwd = '123456'

        else:

            host = ''

            user = ''

            passwd = ''

        port = 3306

        conn = pymysql.connect(host=host, port=port, user=user, password=passwd, db=db)

        return conn

    def updateToDatabase(dbname, tablename, item):

        url2 = item['url2']

        updatevalue = {'url2': url2}

        setsqllist = []

        collist = ['info0', 'info1', 'info2', 'info3', 'info4', 'info5', 'info6', 'info7', 'info8', 'info9', 'info10', 'url']

        for idx in range(len(collist)):

            colname = collist[idx]

            if colname in item:

                if item[colname]:

                    updatevalue[colname] = item[colname]

                    setsqllist.append(colname + '=%(' + colname + ')s')

        setsqllistlen = len(setsqllist)

        if setsqllistlen > 0:

            updatesql = 'update ' + tablename + ' set '

            setsqlliststr = ','.join(setsqllist)

            wherestr = ' where url2=%(url2)s'

            updatesql = updatesql + setsqlliststr + wherestr

            print(updatesql)

            # print(updatevalue)

            conn = getDbConn(dbname)

            cursor = conn.cursor()

            try:

                cursor.execute(updatesql, updatevalue)

            except Exception as e:

                print('Insert Error1', e)

                conn.rollback()

            else:

                conn.commit()

            conn.close()

    def parse_data_page_step1(htmlContent, pageid):

        print('doing.......')

        html = etree.HTML(htmlContent)

        divcon = html.xpath('//div[@class="pagehero__content"]')[0]

        str1 = divcon.xpath('./div[@class="pagehero__wrapper"]/h1[@class="pagehero__heading"]')[0].text

        str2 = divcon.xpath('./div[@class="pagehero__wrapper"]/strong[@class="pagehero__description"]')[0].text

        item = {}

        item['url2'] = url

        item['info1'] = str1

        item['info8'] = str2

        print('write item.............................................')

        print(item)

        # dbname = 'brcardsdata'

        # dbtablename = 'data_source10004_url'

        # updateToDatabase(dbname, dbtablename, item)

        print('write item..............................................')

    def parse_data_page_step2(htmlContent, pageid):

        print('doing.......')

        html = etree.HTML(htmlContent)

        itemlist = html.xpath('//div[@class="box--list"]/div[@class="box--list-item"]')

        info5 = ''

        info6 = ''

        info7 = ''

        info10 = ''

        for item in itemlist:

            itemcon = item.xpath('./div[@class="box--container"]')[0]

            str1 = itemcon.xpath('./div[@class="box--header"]/h3')[0].text

            print(str1)

            itemconbody = itemcon.xpath('./div[@class="box--body"]')[0]

            str1 = str1.lower()

            str1 = str1.strip()

            # print(str1)

            if str1 == 'online':

                str2item = itemconbody.xpath('./div[contains(@class,"notsignedin")]')[0]

                str2 = etreeWebElemToOuterHtml(str2item)

                # print(str2)

                str2 = trimDataHtmlProAndImg(str2)

                str2 = str2.replace('<a></a>', '')

                info5 = '<p>' + str2 + '</p>'

                print('info5')

                print(info5)

            if str1 == 'no local':

                str2item = itemconbody

                str2 = etreeWebElemToOuterHtml(str2item)

                str2 = trimDataHtmlProAndImg(str2)

                info6 = '<p>' + str2 + '</p>'

            if str1 == 'por telefone':

                str2item = itemconbody

                str2 = etreeWebElemToOuterHtml(str2item)

                str2 = trimDataHtmlProAndImg(str2)

                info7 = '<p>' + str2 + '</p>'

            if str1 == 'online':

                try:

                    info10 = itemconbody.xpath('./div[contains(@class,"notsignedin")]')[0].attrib['data-redirect']  # application link

                except Exception as exx:

                    print('....................errr1.......................')

                    print(exx)

                    try:

                        info10 = itemconbody.xpath('./div[contains(@class,"notsignedin")]/button')[0].attrib['data-redirect']  # application link

                    except Exception as exx:

                        print('....................errr2.......................')

                        print(exx)

                info10 = 'https://www.foregon.com' + info10

        item = {}

        item['url2'] = url

        item['info5'] = info5

        item['info6'] = info6

        item['info7'] = info7

        item['info10'] = info10

        print('write item.............................................')

        print(item)

        # dbname = 'brcardsdata'

        # dbtablename = 'data_source10004_url'

        # updateToDatabase(dbname, dbtablename, item)

        print('write item.................................................')

    def get_key_url_map(dbname, tablename):

        conn = getDbConn(dbname)

        cursor = conn.cursor()

        print("mysql connect success")

        sql = "select url,pagecode from " + tablename

        cursor.execute(sql)

        dataresult = cursor.fetchall()

        conn.close()

        return dataresult

    def scrapyStart1(url, pagecode):

        htmlContent = loadpage('10004', pagecode)

        parse_data_page_step1(htmlContent, pagecode)

        pagecode2 = pagecode + '_nextpage'

        htmlContent = loadpage('10004', pagecode2)

        parse_data_page_step2(htmlContent, pagecode2)

    # key_url_map = get_key_url_map('pagedata', 'data_source10004_url')

    key_url_map = [['https://www.foregon.com/solicitar/cartaodecredito/agillitas/fgn/cartao-pre-pago-agillitas-mundo-livre-visa/1028','1']]

    for key_url in key_url_map:

        url = key_url[0]

        pagecode = key_url[1]

        pagecode = str(pagecode)

        print(url)

        scrapyStart1(url, pagecode)
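
    A minimal sketch of what trimDataHtmlProAndImg does to one saved fragment (the sample HTML below is made up for illustration, not taken from the site):

    sample = '<div class="box--body"><img src="x.png"><strong>Taxa</strong> R$ 0 <a href="/x" class="link">mais</a></div>'

    print(trimDataHtmlProAndImg(sample))

    # prints: <p><p>Taxa</p> R$ 0 <a>mais</a></p>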
