美文网首页我爱编程
python爬虫笔记-wiki数据传入数据库

python爬虫笔记-wiki数据传入数据库

作者: SWJTU_CC | 来源:发表于2018-04-14 18:06 被阅读0次

#from urllib import request
from urllib.request import urlopen
#from urllib.request import Request
from urllib import parse
from bs4 import BeautifulSoup
import re
import pymysql.cursors

# Fetch the Wikipedia main page and decode the response as UTF-8 text.
resp = urlopen("https://en.wikipedia.org/wiki/Main_Page").read().decode("utf-8")

soup = BeautifulSoup(resp, "html.parser")

# Key step of the crawl: collect every <a> whose href starts with /wiki/.
listUrls = soup.find_all("a", href=re.compile(r"^/wiki/"))

# print(listUrls)

# Connect to the database ONCE, before the loop. (The original code called
# pymysql.connect() inside the per-link loop, opening a new connection for
# every link and leaking all but the last one.)
connection = pymysql.connect(
    host='localhost',
    user='root',
    password='123456',
    db='wikiurl',
    charset='utf8mb4')  # utf8mb4: full UTF-8, including 4-byte characters

try:
    # Acquire a cursor; the `with` block closes it automatically.
    with connection.cursor() as cursor:
        # Parameterized INSERT — placeholders keep us safe from SQL injection.
        sql = "insert into `urls`(`urlname`,`urlhref`)values(%s,%s)"
        for url in listUrls:
            # Skip image links (.jpg / .JPG) so they are not stored.
            if not re.search(r"\.(jpg|JPG)$", url["href"]):
                print(url.get_text(), "-------",
                      "https://en.wikipedia.org" + url["href"])
                cursor.execute(sql, (url.get_text(),
                                     "https://en.wikipedia.org" + url["href"]))
    # Commit once after all rows are inserted (per-row commit is wasteful).
    connection.commit()
finally:
    # Always release the connection, even if an insert fails.
    connection.close()


创建表的时候注意 urlhref 字段的长度设置:255 个字符可能不够,建议设置为 1000。

主键和自增的设置。


修改后。不会把.JPG的条目存入数据库中。

#from urllib import request
from urllib.request import urlopen
#from urllib.request import Request
from urllib import parse
from bs4 import BeautifulSoup
import re
import pymysql.cursors

# Fetch the Wikipedia main page and decode the response as UTF-8 text.
resp = urlopen("https://en.wikipedia.org/wiki/Main_Page").read().decode("utf-8")

soup = BeautifulSoup(resp, "html.parser")

# Collect every <a> whose href starts with /wiki/.
listUrls = soup.find_all("a", href=re.compile(r"^/wiki/"))

# print(listUrls)

# Connect once, outside the loop (the original reconnected for every link).
connection = pymysql.connect(
    host='localhost',
    user='root',
    password='123456',
    db='wikiurl',
    charset='utf8mb4')  # utf8mb4: full UTF-8, including 4-byte characters

try:
    # Acquire a cursor; the `with` block closes it automatically.
    with connection.cursor() as cursor:
        # Parameterized INSERT — placeholders keep us safe from SQL injection.
        sql = "insert into `urls`(`urlname`,`urlhref`)values(%s,%s)"
        for url in listUrls:
            # Revised behavior: filter here so .jpg/.JPG entries are never
            # written to the database.
            if not re.search(r"\.(jpg|JPG)$", url["href"]):
                print(url.get_text(), "-------",
                      "https://en.wikipedia.org" + url["href"])
                cursor.execute(sql, (url.get_text(),
                                     "https://en.wikipedia.org" + url["href"]))
    # Commit once after all rows are inserted.
    connection.commit()
finally:
    # Always release the connection, even if an insert fails.
    connection.close()

相关文章

网友评论

    本文标题:python爬虫笔记-wiki数据传入数据库

    本文链接:https://www.haomeiwen.com/subject/plglkftx.html