Python crawler notes: writing wiki data to a database

Author: SWJTU_CC | Published 2018-04-14 18:06

    #from urllib import request
    from urllib.request import urlopen
    #from urllib.request import Request
    from urllib import parse
    from bs4 import BeautifulSoup
    import re
    import pymysql.cursors

    resp = urlopen("https://en.wikipedia.org/wiki/Main_Page").read().decode("utf-8")
    soup = BeautifulSoup(resp, "html.parser")
    # the key step: collect every link whose href starts with /wiki/
    listUrls = soup.find_all("a", href=re.compile("^/wiki/"))
    # print(listUrls)
    for url in listUrls:
        if not re.search(r"\.(jpg|JPG)$", url["href"]):
            print(url.get_text(), "-------", "https://en.wikipedia.org" + url["href"])
        # connect to the database
        connection = pymysql.connect(
            host='localhost',
            user='root',
            password='123456',
            db='wikiurl',
            charset='utf8mb4')  # utf8mb4 covers the full UTF-8 range
        try:
            # get a cursor for this session
            with connection.cursor() as cursor:
                # build the SQL statement
                sql = "insert into `urls`(`urlname`,`urlhref`) values(%s,%s)"
                # execute it with the link text and the absolute URL
                cursor.execute(sql, (url.get_text(), "https://en.wikipedia.org" + url["href"]))
                # commit the insert
                connection.commit()
        finally:
            connection.close()


When creating the table, note that 255 characters may not be enough for the `urlhref` column; set it to 1000. Also remember to set the primary key and auto-increment (a sketch of the setup follows).
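A minimal sketch of that table setup, assuming the `wikiurl` database already exists; the column names `urlname` and `urlhref` come from the insert statement above, while the `id` auto-increment primary key and the 1000-character `urlhref` length follow the note rather than the original code:

    import pymysql

    connection = pymysql.connect(host='localhost', user='root', password='123456',
                                 db='wikiurl', charset='utf8mb4')
    try:
        with connection.cursor() as cursor:
            # id is the auto-increment primary key; urlhref gets 1000 characters
            # because 255 may be too short for some Wikipedia paths
            cursor.execute("""
                create table if not exists `urls` (
                    `id` int not null auto_increment,
                    `urlname` varchar(255),
                    `urlhref` varchar(1000),
                    primary key (`id`)
                ) character set utf8mb4
            """)
        connection.commit()
    finally:
        connection.close()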


Modified version below: .JPG entries are no longer stored in the database.

    #from urllib import request
    from urllib.request import urlopen
    #from urllib.request import Request
    from urllib import parse
    from bs4 import BeautifulSoup
    import re
    import pymysql.cursors

    resp = urlopen("https://en.wikipedia.org/wiki/Main_Page").read().decode("utf-8")
    soup = BeautifulSoup(resp, "html.parser")
    listUrls = soup.find_all("a", href=re.compile("^/wiki/"))
    # print(listUrls)
    for url in listUrls:
        if not re.search(r"\.(jpg|JPG)$", url["href"]):
            print(url.get_text(), "-------", "https://en.wikipedia.org" + url["href"])
        # connect to the database
        connection = pymysql.connect(
            host='localhost',
            user='root',
            password='123456',
            db='wikiurl',
            charset='utf8mb4')  # utf8mb4 covers the full UTF-8 range
        try:
            # get a cursor for this session
            with connection.cursor() as cursor:
                # the changed part: skip .JPG entries so they are not stored in the database
                if not re.search(r"\.(jpg|JPG)$", url["href"]):
                    # build the SQL statement
                    sql = "insert into `urls`(`urlname`,`urlhref`) values(%s,%s)"
                    # execute it with the link text and the absolute URL
                    cursor.execute(sql, (url.get_text(), "https://en.wikipedia.org" + url["href"]))
                    # commit the insert
                    connection.commit()
        finally:
            connection.close()
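The version above opens and closes a new database connection for every single link. A minimal alternative sketch, assuming the same `wikiurl` database and `urls` table, that connects once, inserts all non-.JPG links through one cursor, and commits a single time:

    from urllib.request import urlopen
    from bs4 import BeautifulSoup
    import re
    import pymysql

    resp = urlopen("https://en.wikipedia.org/wiki/Main_Page").read().decode("utf-8")
    soup = BeautifulSoup(resp, "html.parser")
    listUrls = soup.find_all("a", href=re.compile("^/wiki/"))

    # one connection for the whole batch
    connection = pymysql.connect(host='localhost', user='root', password='123456',
                                 db='wikiurl', charset='utf8mb4')
    try:
        with connection.cursor() as cursor:
            sql = "insert into `urls`(`urlname`,`urlhref`) values(%s,%s)"
            for url in listUrls:
                if not re.search(r"\.(jpg|JPG)$", url["href"]):
                    cursor.execute(sql, (url.get_text(),
                                         "https://en.wikipedia.org" + url["href"]))
        connection.commit()  # one commit for all rows
    finally:
        connection.close()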
