Day one of learning web scraping starts with internal and external links. Below is a simple program that collects a page's internal and external links. My current confusion: as far as I can tell, the program keeps searching by taking each external link it finds as the next page to start from, and I don't yet see what the point of that is (one possible use is sketched after the program).
from urllib.request import urlopen
from urllib.parse import urlparse
from bs4 import BeautifulSoup
import random
import datetime
import re
pages = set()
random.seed(datetime.datetime.now().timestamp())  # seed with the current time; newer Python versions expect a number here
# Collect all the internal links found on a page
def getInternalLinks(bsObj, includeUrl):
    includeUrl = urlparse(includeUrl).scheme + "://" + urlparse(includeUrl).netloc
    internalLinks = []
    # Match links that start with "/" or that contain the current domain
    for link in bsObj.findAll("a", href=re.compile("^(/|.*" + includeUrl + ")")):
        if link.attrs['href'] is not None:
            if link.attrs['href'] not in internalLinks:
                if link.attrs['href'].startswith("/"):
                    internalLinks.append(includeUrl + link.attrs['href'])
                else:
                    internalLinks.append(link.attrs['href'])
    return internalLinks
# Collect all the external links found on a page
def getExternalLinks(bsObj, excludeUrl):
    externalLinks = []
    # Match links that start with "http" or "www" and do not contain the current domain
    for link in bsObj.findAll("a", href=re.compile("^(http|www)((?!" + excludeUrl + ").)*$")):
        if link.attrs['href'] is not None:
            if link.attrs['href'] not in externalLinks:
                externalLinks.append(link.attrs['href'])
    return externalLinks
def getRandomExternalLink(startingPage):
    html = urlopen(startingPage)
    bsObj = BeautifulSoup(html, "html.parser")
    externalLinks = getExternalLinks(bsObj, urlparse(startingPage).netloc)
    if len(externalLinks) == 0:
        # No external links on this page: follow a random internal link and try again there
        print("No external links found; trying an internal link")
        domain = urlparse(startingPage).scheme + "://" + urlparse(startingPage).netloc
        internalLinks = getInternalLinks(bsObj, domain)
        return getRandomExternalLink(internalLinks[random.randint(0, len(internalLinks) - 1)])
    else:
        return externalLinks[random.randint(0, len(externalLinks) - 1)]
def followExternalOnly(startingPage):
    externalLink = getRandomExternalLink(startingPage)
    # externalLink = "https://en.wikipedia.org/wiki/Intelligence_agency"
    print("Random external link is " + externalLink)
    followExternalOnly(externalLink)
# def main():
#     followExternalOnly("http://en.wikipedia.org")
#     print('End')
# if __name__ == '__main__':
#     main()
followExtrenalOnly("https://en.wikipedia.org/wiki/Main_Page")
(Author: 语落心生, source: https://www.jianshu.com/p/ec0cbe424353)
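One possible answer to my own question: instead of hopping from one random external link to the next, the same two helpers can be used to record every external link a site points at, which gives the internal/external split a clearer purpose (mapping a site's outbound connections). This is only a minimal sketch that reuses getInternalLinks and getExternalLinks from the program above, with the same imports; the allIntLinks/allExtLinks sets and the getAllExternalLinks function are my own additions, not from the original article.

allIntLinks = set()
allExtLinks = set()

def getAllExternalLinks(siteUrl):
    # Assumes urlopen, BeautifulSoup, urlparse and the two helpers above are already defined
    html = urlopen(siteUrl)
    bsObj = BeautifulSoup(html, "html.parser")
    domain = urlparse(siteUrl).scheme + "://" + urlparse(siteUrl).netloc
    for link in getExternalLinks(bsObj, urlparse(siteUrl).netloc):
        if link not in allExtLinks:
            allExtLinks.add(link)
            print("External link found: " + link)
    for link in getInternalLinks(bsObj, domain):
        if link not in allIntLinks:
            allIntLinks.add(link)
            # Recurse into every internal page so the whole site's outbound links get collected
            getAllExternalLinks(link)

# Commented out because on a large site this would run for a very long time:
# getAllExternalLinks("https://en.wikipedia.org/wiki/Main_Page")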
Ctrl+Shift+I opens the Google Chrome developer tools.
Now starting to move toward stock and fund analysis:
from urllib.request import urlopen as uu
import re

url = ["http://fund.eastmoney.com/000051.html",
       "http://fund.eastmoney.com/213008.html",
       "http://fund.eastmoney.com/000173.html",
       "http://fund.eastmoney.com/000477.html"]

# The unit NAV is rendered inside <dd class="dataNums"><span ...>...</span>
find_re = re.compile(r'<dd class="dataNums"><span class=".+?">(.+?)</span>', re.DOTALL)
# The fund code is the numeric part of the page URL
html_re = re.compile(r'http://fund.eastmoney.com/(.+?).html', re.DOTALL)
# Timestamp of the latest figure
time_re = re.compile(r'<span id="gz_gztime">(.+?)</span>', re.DOTALL)

for ul in url:
    html = uu(ul).read()
    html = html.decode('utf-8')  # Python 3: decode bytes to str
    print("Fund code: " + str(html_re.findall(ul)))
    print("Unit NAV: " + str(find_re.findall(html)))
    print("Last updated: " + str(time_re.findall(html)))
    print('')
(Screenshot: run results.)
This program scrapes basic fund information with regular expressions alone: nothing dynamic, no BeautifulSoup. The next step is figuring out what to analyse; machine learning may get involved, and the end goal is a small program whose output has some statistical meaning.
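Before any statistics or machine learning can happen, the scraped figures need to be stored as numbers rather than printed as strings. A minimal sketch of that step, assuming the same eastmoney pages and the same NAV regex as above; the fetch_nav helper and the use of Python's statistics module are my own illustration, not part of the original program.

from urllib.request import urlopen
import re
import statistics

nav_re = re.compile(r'<dd class="dataNums"><span class=".+?">(.+?)</span>', re.DOTALL)

def fetch_nav(fund_url):
    # Download one fund page and return its first NAV figure as a float, or None if parsing fails
    html = urlopen(fund_url).read().decode('utf-8')
    values = nav_re.findall(html)
    try:
        return float(values[0]) if values else None
    except ValueError:
        return None

urls = ["http://fund.eastmoney.com/000051.html",
        "http://fund.eastmoney.com/213008.html"]
navs = [v for v in (fetch_nav(u) for u in urls) if v is not None]
if navs:
    print("Mean NAV across these funds:", statistics.mean(navs))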
Back to browsing Zhihu for inspiration!
from urllib.request import urlopen
from bs4 import BeautifulSoup
quote_page = 'http://www.bloomberg.com/quote/SPX:IND'
page = urlopen(quote_page)
soup = BeautifulSoup(page, "html.parser")
# The index name lives in an <h1 class="name"> element
name_box = soup.find("h1", attrs={"class": "name"})
name = name_box.text.strip()  # strip() removes leading and trailing whitespace
print(name)

# The current index level lives in a <div class="price"> element
price_box = soup.find("div", attrs={"class": "price"})
price = price_box.text
print(price)
This is a little BeautifulSoup script that scrapes static information from Bloomberg; the pressing question now is deciding what it should actually be used for!
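One natural next step would be to loop over several quote pages and append each reading to a CSV file with a timestamp, so data accumulates for later analysis. A minimal sketch under that assumption; the second ticker, the quote_pages list, and the index_prices.csv filename are placeholders I chose, and it relies on the page structure staying the same as above.

from urllib.request import urlopen
from bs4 import BeautifulSoup
import csv
import datetime

quote_pages = ['http://www.bloomberg.com/quote/SPX:IND',
               'http://www.bloomberg.com/quote/CCMP:IND']

rows = []
for page_url in quote_pages:
    soup = BeautifulSoup(urlopen(page_url), "html.parser")
    name = soup.find("h1", attrs={"class": "name"}).text.strip()
    price = soup.find("div", attrs={"class": "price"}).text.strip()
    rows.append([name, price, datetime.datetime.now()])

# Append mode, so every run adds one more snapshot to the same file
with open('index_prices.csv', 'a', newline='') as f:
    csv.writer(f).writerows(rows)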