美文网首页
python爬虫初步

python爬虫初步

作者: xyzjia | 来源:发表于2020-08-24 16:35 被阅读0次

    和http(超文本传输协议)

    爬虫步骤:确认需求 → 寻找数据来源 → 发送请求 → 解析数据 → 存储数据

    python爬虫初步

    编码规范

    # -*- coding: utf-8 -*-  or  # coding=utf-8

    python可以加入main函数来测试程序:if __name__ == "__main__": 当脚本直接执行时调用main函数(文件中要定义main函数)

    文件中如果不定义此函数,解释器按顺序执行顶层语句

from bs4 import BeautifulSoup  # HTML parsing
import re                      # regular expressions
import urllib.request          # build URL requests, fetch page data
import urllib.error            # URL error handling
import xlwt                    # Excel (.xls) writing
import sqlite3                 # SQLite database operations

    baseurl ="https://movie.douban.com/top250?start="# 链接、图片、

    findlink = re.compile(r'<a href="(.*?)">')#创建正则表达式对象,表示规则(字符串模式)

    findimg = re.compile(r'<img.*src="(.*?)"',re.S)

    findTitle = re.compile(r'<span class="title">(.*)</span>')

    findrating = re.compile(r'<span class="rating_num" property="v:average">(.*)</span>')

    findjudege = re.compile(r'<span>(\d*)人评价</span>')

    findinq = re.compile(r'<span class="inq">(.*)</span>')

    findbd = re.compile(r'<p class="">(.*?)</p>',re.S)

    # re.S表示忽视换行符,包括换行符

    savepath ="豆瓣电影top250.xls"

    datalist = []

    def main():

    # getdata(baseurl)

        datalist = getdata(baseurl)

    saveData(datalist,savepath)

    def getdata(baseurl):

    for iin range(0,10):

    url = baseurl+str(i*25)

    html = askurl(url)

    #逐一解析

            soup = BeautifulSoup(html,"html.parser")

    for itemin soup.find_all("div",class_="item"):

    data = []

    item =str(item)

    # print(item)

    # break

                link = re.findall(findlink,item)[0]

    data.append(link)

    img = re.findall(findimg,item)[0]

    data.append(img)

    title = re.findall(findTitle,item)

    if(len(title)==2):

    ctitle = title[0]

    data.append(ctitle)

    otitle = title[1].replace("/","")

    data.append(otitle)

    else:

    data.append(title[0])

    data.append(' ')

    rating = re.findall(findrating,item)[0]

    data.append(rating)

    jubge = re.findall(findjudege,item)[0]

    data.append(jubge)

    inq = re.findall(findinq,item)

    if(len(inq) !=0):

    inq = inq[0].replace("。","")

    data.append(inq)

    else:

    data.append(" ")

    bd = re.findall(findbd,item)[0]

    bd=re.sub('<br(\s+)?/>(\s+)?'," ",bd)

    bd = re.sub("/"," ",bd)

    data.append(bd.strip())

    datalist.append(data)

    # print(datalist)

        return datalist

    def askurl(url):

    head = {

    "User-Agent":"Mozilla/5.0(Windows NT 10.0;WOW64) AppleWebKit/537.36(KHTML,likeGecko)Chrome/78.0.3904.108Safari/537.36"

        }

    html =""

        try:

    request = urllib.request.Request(url,headers=head)

    response = urllib.request.urlopen(request)

    html = response.read().decode("utf-8")

    # print(html)

        except urllib.error.URLErroras e:

    if hasattr(e,"code"):

    print(e.code)

    if hasattr(e,"reason"):

    print(e.reason)

    return  html

    def saveData(datalist,savepath):

    book = xlwt.Workbook(encoding="utf-8",style_compression=0)

    sheet = book.add_sheet('豆瓣电影top250',cell_overwrite_ok=True)

    col = ("电影链接","1","1","2","1","2","1","2")

    for iin range(8):

    sheet.write(0,i,col[i])

    for iin range(250):

    print("第%d条"%i)

    data = datalist[i]

    for jin range(8):

    sheet.write(i+1, j, data[j])

    book.save(savepath)

    if __name__ =="__main__":

    main()


    相关文章

      网友评论

          本文标题:python爬虫初步

          本文链接:https://www.haomeiwen.com/subject/sliukktx.html