【Python】下载百度空间文章的python源码

作者: IT派森 | 来源:发表于2019-08-02 22:49 被阅读2次

纯 Python 新手写的下载百度空间文章的 Python 源码，代码写得不好，能用但不好看。大家看看效果就行，不必苛求代码的精简程度。大牛请飘过。

下载百度空间文章python源码使用方法：
在cmd中输入：> python "F:\Walkbox\Python\mywork\baidu\getArticleId - r1.py" bspeng922 6
命令格式：python 文件存放路径 [用户名] [下载页数]
下载页数可以不填，不填则为全部下载。如果大于实际总页数，则会重复下载第一页的内容

这段代码只适用于新版的百度空间，只测试了“低调优雅”模板，生成的是 html 文件；
同时我突然发现一个奇特的功能，这段代码竟然可以用来刷百度空间的访问量，不错哦。

下载百度空间文章python源码，如下：

`# -*- coding: utf8 -*-`

`import` `urllib`

`import` `re,os,sys,time`

# NOTE: this script targets Python 2 (urllib.urlopen / urllib.urlretrieve).
# The syntax below is deliberately kept parseable by Python 3 as well.

# Finds every <img ...> tag on a line and captures the value of its src attribute.
_IMG_SRC_RE = re.compile(r'<img[^>]*?src="(.*?)"[^>]*>')

# The four <img> tag layouts the original script knew how to rewrite.
# %(name)s is replaced by the regex-escaped image file name.  [^"] keeps a
# match inside one src attribute so it cannot swallow a neighbouring tag.
_IMG_TAG_TEMPLATE = (
    r'<img width="\d{1,4}" height="\d{1,4}" src="https?://[^"]*?/%(name)s" />'
    r'|<img src="https?://[^"]*?/%(name)s" small="0" />'
    r'|<img src="https?://[^"]*?/%(name)s" />'
    r'|<img small="0" src="https?://[^"]*?/%(name)s" />'
)


def _resolve_page_limit(pageCount):
    """Turn the caller's page count into an exclusive range bound (0 = all pages)."""
    if pageCount is None:
        return 0
    if hasattr(pageCount, "strip"):
        pageCount = pageCount.strip()
    if pageCount == "":
        return 0
    requested = int(pageCount)
    if requested <= 0:
        return 0
    # Pages are numbered from 1, so range(1, requested + 1) covers them all.
    return requested + 1


def _quoted_int(line, default):
    """Return the integer between the first and last single quote of line, else default."""
    try:
        return int(line[line.index("'") + 1:line.rindex("'")])
    except ValueError:
        return default


def _count_pages(username):
    """Read allCount/pageSize from the blog front page; return total pages (0 if unknown)."""
    totalRecord, pagesize = 0, 0
    frontPage = urllib.urlopen("http://hi.baidu.com/new/%s" % username)
    try:
        for fstline in frontPage:
            if "allCount" in fstline:
                totalRecord = _quoted_int(fstline, totalRecord)
            if "pageSize" in fstline:
                pagesize = _quoted_int(fstline, pagesize)
    finally:
        frontPage.close()
    if not pagesize or not totalRecord:
        return 0
    # Ceiling division: a partially filled last page still counts as a page.
    return (totalRecord + pagesize - 1) // pagesize


def _localize_images(html, imgPath, imgDir):
    """Download absolute-URL images found in html and point their tags at the local copy."""
    for imglink in _IMG_SRC_RE.findall(html):
        if imglink.find("://") <= 0:
            continue  # relative or malformed link: leave the tag alone
        imageName = imglink[imglink.rindex("/") + 1:]
        if not imageName:
            continue  # URL ends with "/": there is no file name to save under
        try:
            urllib.urlretrieve(imglink, os.path.join(imgPath, imageName))
        except (IOError, OSError):
            print("cannot download this image: " + imageName)
            continue  # keep the remote link rather than point at a missing file
        imageNewPath = """<img src="%s/%s" />""" % (imgDir, imageName)
        tagPattern = re.compile(_IMG_TAG_TEMPLATE % {"name": re.escape(imageName)})
        # A callable replacement keeps any backslash in the name literal.
        html = tagPattern.sub(lambda match: imageNewPath, html)
    return html


def _save_article(username, articleId, mydrive, imgPath, imgDir):
    """Fetch one article and write its body line(s) to <mydrive>/<articleId>.html."""
    thispath = os.path.join(mydrive, "%s.html" % articleId)
    thisArticle = urllib.urlopen("http://hi.baidu.com/%s/item/%s" % (username, articleId))
    try:
        with open(thispath, "w") as thisfile:
            for thisline in thisArticle:
                # Only the line carrying the article body is kept.
                if "content-head clearfix" not in thisline:
                    continue
                thisline = _localize_images(thisline, imgPath, imgDir)
                # Drop everything from the post-info block onwards; the 12
                # characters before the marker are its opening '<div class="'.
                pos = thisline.find("mod-post-info clearfix")
                if pos > 0:
                    thisline = thisline[:max(pos - 12, 0)]
                thisfile.write(thisline.strip())
    finally:
        thisArticle.close()


def articleDownload(username, pageCount, saveDrive="E:\\test"):
    """Download a Baidu Space blog's articles as local HTML files.

    Writes one ``<article id>.html`` per article under ``<saveDrive>/<username>``,
    saves downloaded images under ``<saveDrive>/<username>/img`` and writes an
    index of links to ``<saveDrive>/<username>.html``.

    Args:
        username: Blog owner's user name; an empty string falls back to
            "bspeng922".
        pageCount: Number of list pages to download, as an int or a numeric
            string. ``""``, ``None``, ``0`` or a negative value means "all
            pages", in which case the total is read from the blog front page.
        saveDrive: Root directory for the output; created if missing.

    Raises:
        ValueError: if pageCount is a non-empty, non-numeric string.
        IOError: if a page cannot be fetched or an output file cannot be written.
    """
    if username == "":
        username = "bspeng922"
    pageCount = _resolve_page_limit(pageCount)
    print("Blog: http://hi.baidu.com/new/%s" % username)

    # Output layout: <saveDrive>/<username>/ for articles, .../img for images.
    mydrive = os.path.join(saveDrive, username)
    imgDir = "img"
    imgPath = os.path.join(mydrive, imgDir)
    if not os.path.exists(imgPath):
        os.makedirs(imgPath)

    # A limit of 0 means "download everything": ask the site how many pages exist.
    if pageCount == 0:
        pageCount = _count_pages(username) + 1
    print("Page Count:  %d" % (pageCount - 1))

    articleCount = 0
    sumHtmlPath = os.path.join(saveDrive, "%s.html" % username)
    # Captures (article id, article title) from each title link on a list page.
    aTagCmp = re.compile(
        r'<a href="/%s/item/([\w]*?)" class="a-incontent a-title '
        r'cs-contentblock-hoverlink" target=_blank>(.*?)</a>' % re.escape(username)
    )
    with open(sumHtmlPath, "w") as sumfile:
        for page in range(1, pageCount):
            thisPageUrl = urllib.urlopen(
                "http://hi.baidu.com/new/%s?page=%d" % (username, page)
            )
            print("Page:  %d" % page)
            try:
                for line in thisPageUrl:
                    if "a-incontent a-title" not in line:
                        continue
                    for myurl, mytitle in aTagCmp.findall(line):
                        articleCount += 1
                        sumfile.write(
                            """<a href='%s\\%s.html' target='blank'>%s</a><br>"""
                            % (username, myurl, mytitle)
                        )
                        _save_article(username, myurl, mydrive, imgPath, imgDir)
            finally:
                thisPageUrl.close()
    print("Article Count:  %d" % articleCount)

if __name__ == "__main__":
    # Usage: python <script> [username] [pages]
    # Targets Python 2 (raw_input); the elapsed time is printed at the end.
    st = time.time()

    if len(sys.argv) == 2:
        uname = sys.argv[1]
        # No page count given: an empty value makes articleDownload fetch all pages.
        pages = ""
    elif len(sys.argv) > 2:
        uname = sys.argv[1]
        # Pass the requested count through unchanged; articleDownload converts
        # it and adds the +1 needed for its own range bound.
        pages = sys.argv[2]
    else:
        uname = raw_input("Username -> ")
        pages = raw_input("Page -> ")

    articleDownload(uname, pages)

    et = time.time()
    print("Time used: %0.2fs" % (et - st))
Python资源分享秋秋裙 784758214 ,内有安装包，学习视频资料，这里是Python学习者的聚集地，零基础，进阶，都欢迎

网友评论

本文标题：【Python】下载百度空间文章的python源码

本文链接：https://www.haomeiwen.com/subject/bedaqctx.html

延伸阅读

深度阅读

您也可以注册成为美文阅读网的作者，发表您的原创作品、分享您的心情！

【Python】下载百度空间文章的python源码

相关文章

网友评论

延伸阅读

深度阅读

栏目导航

热点阅读

大数据爬虫Python AI Sql

互联网科技

码农的世界

Python，web开发，前端技术分享

【Python】下载百度空间文章的python源码

相关文章

网友评论

延伸阅读

深度阅读

栏目导航

热点阅读

大数据 爬虫Python AI Sql

互联网科技

码农的世界

Python，web开发，前端技术分享

大数据爬虫Python AI Sql