#from urllib import request
from urllib.request import urlopen
#from urllib.request import Request
from urllib import parse
from bs4 import BeautifulSoup
import re
import pymysql.cursors
resp = urlopen("https://en.wikipedia.org/wiki/Main_Page").read().decode("utf-8")
soup = BeautifulSoup(resp, "html.parser")
listUrls = soup.find_all("a", href=re.compile("^/wiki/"))  # key step of the crawl: collect the links starting with /wiki/
# print(listUrls)
for url in listUrls:
    if not re.search(r"\.(jpg|JPG)$", url["href"]):  # skip image links
        print(url.get_text(), "-------", "https://en.wikipedia.org" + url["href"])

# connect to the database
connection = pymysql.connect(
    host='localhost',
    user='root',
    password='123456',
    db='wikiurl',
    charset='utf8mb4')  # utf8mb4 is the full-range extension of UTF-8

try:
    # get a session cursor
    with connection.cursor() as cursor:
        # build the SQL statement
        sql = "insert into `urls`(`urlname`,`urlhref`) values(%s,%s)"
        # execute it (note: url still holds only the last link from the loop above)
        cursor.execute(sql, (url.get_text(), "https://en.wikipedia.org" + url["href"]))
        # commit the transaction
        connection.commit()
finally:
    connection.close()
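To make the image filter concrete, here is a standalone check with a few made-up hrefs showing which links the re.search(r"\.(jpg|JPG)$", ...) test keeps and which it drops:

import re

# hypothetical sample hrefs, just to illustrate the filter
samples = ["/wiki/Main_Page", "/wiki/Example.jpg", "/wiki/Photo.JPG"]
for href in samples:
    kept = not re.search(r"\.(jpg|JPG)$", href)
    print(href, "->", "kept" if kept else "dropped")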
When creating the table, note that 255 characters may not be enough for urlhref; set it to 1000. Also remember to set up the primary key and auto-increment.
The modified version below no longer stores .JPG entries in the database.
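A sketch of that table setup in Python (a minimal sketch, assuming the same local wikiurl database and credentials as the script; the column names come from the insert statement, while the id column and sizes follow the note above, so the exact DDL is an assumption):

import pymysql

connection = pymysql.connect(host='localhost', user='root',
                             password='123456', db='wikiurl',
                             charset='utf8mb4')
try:
    with connection.cursor() as cursor:
        # id is the auto-increment primary key; urlhref gets 1000 chars
        # because 255 may be too short for long Wikipedia paths
        cursor.execute("""
            create table if not exists `urls` (
                `id` int not null auto_increment,
                `urlname` varchar(255) not null,
                `urlhref` varchar(1000) not null,
                primary key (`id`)
            ) character set utf8mb4
        """)
    connection.commit()
finally:
    connection.close()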
#from urllib import request
from urllib.request import urlopen
#from urllib.request import Request
from urllib import parse
from bs4 import BeautifulSoup
import re
import pymysql.cursors
resp = urlopen("https://en.wikipedia.org/wiki/Main_Page").read().decode("utf-8")
soup = BeautifulSoup(resp, "html.parser")
listUrls = soup.find_all("a", href=re.compile("^/wiki/"))
# print(listUrls)
for url in listUrls:
    if not re.search(r"\.(jpg|JPG)$", url["href"]):
        print(url.get_text(), "-------", "https://en.wikipedia.org" + url["href"])
# connect to the database
connection = pymysql.connect(
    host='localhost',
    user='root',
    password='123456',
    db='wikiurl',
    charset='utf8mb4')  # utf8mb4 is the full-range extension of UTF-8

try:
    # get a session cursor
    with connection.cursor() as cursor:
        for url in listUrls:
            # the change: skip .JPG links so they are not stored in the database
            if not re.search(r"\.(jpg|JPG)$", url["href"]):
                sql = "insert into `urls`(`urlname`,`urlhref`) values(%s,%s)"
                # execute the SQL statement
                cursor.execute(sql, (url.get_text(), "https://en.wikipedia.org" + url["href"]))
        # commit once after all inserts
        connection.commit()
finally:
    connection.close()
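As a quick check, the stored rows can be read back with the same connection settings (a minimal sketch, assuming the table and credentials above):

import pymysql

connection = pymysql.connect(host='localhost', user='root',
                             password='123456', db='wikiurl',
                             charset='utf8mb4')
try:
    with connection.cursor() as cursor:
        # show a few of the stored links
        cursor.execute("select `urlname`, `urlhref` from `urls` limit 5")
        for urlname, urlhref in cursor.fetchall():
            print(urlname, "->", urlhref)
finally:
    connection.close()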