A Simple XML Crawler Written in Python; Accessing Web Pages with Python

Author: 坤文 | Published 2018-09-17 18:23

    Accessing Web Pages with Python

    2011-09-15 15:21:21 | Category: Python programming tidbits

    There are three main ways to access web pages in Python: urllib, urllib2, and httplib.

    urllib is the simplest but also has relatively limited functionality; httplib is simple and powerful, but it does not seem to support sessions.
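    For comparison, here is a minimal httplib sketch (the hostname is a placeholder): the connection is managed by hand, which is also why cookies and sessions are left entirely to the caller.

    import httplib

    conn = httplib.HTTPConnection("www.example.com")
    conn.request("GET", "/")
    res = conn.getresponse()
    print res.status, res.reason
    print res.read()
    conn.close()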

    1. The simplest page access

    import urllib2

    try:
        res = urllib2.urlopen(url)
        print res.read()
    except urllib2.URLError, e:
        print e

    2. Adding GET or POST data

    import urllib
    import urllib2

    data = {"name": "hank", "passwd": "hjz"}
    urllib2.urlopen(url, urllib.urlencode(data))
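    Passing a data argument makes urlopen() issue a POST; to send the same fields with a GET request, encode them into the query string instead:

    # GET: the encoded fields travel in the URL, not the request body
    urllib2.urlopen(url + "?" + urllib.urlencode(data))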

    3. Adding HTTP headers

    header = {"User-Agent": "Mozilla-Firefox5.0"}
    # headers belong on a Request object: urlopen()'s third positional
    # argument is a timeout, not a header dict
    req = urllib2.Request(url, urllib.urlencode(data), header)
    res = urllib2.urlopen(req)

    Using openers and handlers:

    opener = urllib2.build_opener(handler)
    urllib2.install_opener(opener)
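    The handler above can be any urllib2 handler object. As one concrete example (my choice, not the original's), HTTPHandler(debuglevel=1) prints each request and response, and addheaders sets default headers for everything the opener fetches:

    import urllib2

    handler = urllib2.HTTPHandler(debuglevel=1)   # logs HTTP traffic to stdout
    opener = urllib2.build_opener(handler)
    opener.addheaders = [("User-Agent", "Mozilla-Firefox5.0")]
    urllib2.install_opener(opener)
    # every later urllib2.urlopen() call now goes through this opener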

    4. Adding a session

    import cookielib

    cj = cookielib.CookieJar()
    cjhandler = urllib2.HTTPCookieProcessor(cj)
    opener = urllib2.build_opener(cjhandler)
    urllib2.install_opener(opener)
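    With the cookie-aware opener installed, consecutive urlopen() calls share the same CookieJar, which is what makes them behave like a single browser session. A minimal sketch, with hypothetical URLs and form fields:

    import urllib

    # log in once; the server's session cookie lands in cj
    login_data = urllib.urlencode({"name": "hank", "passwd": "hjz"})
    urllib2.urlopen("http://example.com/login", login_data)
    # the cookie is sent back automatically on the next request
    print urllib2.urlopen("http://example.com/profile").read()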

    5. Adding Basic authentication

    # username and password are assumed to be defined elsewhere
    password_mgr = urllib2.HTTPPasswordMgrWithDefaultRealm()
    top_level_url = "http://www.163.com/"
    password_mgr.add_password(None, top_level_url, username, password)
    handler = urllib2.HTTPBasicAuthHandler(password_mgr)
    opener = urllib2.build_opener(handler)
    urllib2.install_opener(opener)
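    Once the opener is installed, any urlopen() call to a URL under top_level_url answers the server's HTTP 401 challenge with these credentials automatically. For illustration (assuming the page actually requires Basic auth):

    # the Authorization header is added for us when the server asks for it
    res = urllib2.urlopen("http://www.163.com/")
    print res.read()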

    6. Using a proxy

    proxy_support = urllib2.ProxyHandler({"http": "http://1.2.3.4:3128/"})
    opener = urllib2.build_opener(proxy_support)
    urllib2.install_opener(opener)
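    ProxyHandler maps each URL scheme to a proxy address. Called with no argument, it falls back to the environment (e.g. the http_proxy variable), so system-wide proxy settings can be reused:

    # pick up proxies from the environment instead of hard-coding them
    urllib2.install_opener(urllib2.build_opener(urllib2.ProxyHandler()))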

    7. Setting a timeout

    import socket

    socket.setdefaulttimeout(5)

    Reference: http://svn.python.org/projects/python/trunk/Doc/howto/urllib2.rst
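    Note that setdefaulttimeout() affects every socket in the process. Since Python 2.6, urlopen() also accepts a per-call timeout in seconds, which is usually the better-scoped option:

    # time out this one request after 5 seconds, leaving the global default alone
    res = urllib2.urlopen(url, timeout=5)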

    A Simple XML Crawler Written in Python

    http://www.veryhuo.com/a/view/11163.html

    # -*- coding: utf-8 -*-
    import codecs
    import sys
    import threading
    from urllib import urlencode
    from urllib2 import urlopen
    from xml.dom.minidom import parseString

    class Serach:
        def __init__(self, key=None):
            self.key = key

        def SendPy(self, key):
            # fetch the suggestion XML for self.key (the 'key' argument is
            # unused in the original listing)
            try:
                contentpy = urlopen("http://xxxx.com/ac_box?ac=" + self.key).read()
            except:
                print("failed to download the suggestion XML")
                return []
            try:
                xmldoc = parseString(contentpy)
            except:
                print("ill-formed xml file")
                return []
            root = xmldoc.documentElement
            # walk the XML structure and collect the suggestion nodes
            keyList = root.getElementsByTagName('SuggestWord')
            return keyList

        def SendKey(self, keyword):
            keyword = keyword.encode('gbk')
            tupleList = []
            try:
                # fetch the XML and transcode it from gbk to utf-8
                content = urlopen("http://xxxx.com/btinfo?keyword=" + keyword + "&num=1").read()
                content = unicode(content, "cp936").encode("utf-8")
            except:
                print("failed to download the keyword XML")
                return tupleList
            # fix the declaration so it matches the transcoded bytes
            content = content.replace('<?xml version="1.0" encoding="gbk"?>',
                                      '<?xml version="1.0" encoding="utf-8"?>')
            try:
                xmldoc = parseString(content)
            except:
                print("ill-formed xml file")
                return tupleList
            try:
                query = xmldoc.getElementsByTagName('Query')[0]
                tupleList = query.getAttribute('ErrorCode')
            except:
                tupleList = 104
            return tupleList

        def run(self):
            # self.conn (a database connection), self.savePath, and the
            # MySQLKey/MySQLPy helpers are referenced here but not defined
            # in the original listing
            ls = self.SendPy(self.key)
            count = len(self.key)
            cur = self.conn.cursor()
            str = ''  # note: 'str' and 'tuple' below shadow builtins, as in the original
            for doc in ls:
                tuple = doc.firstChild.data
                text = self.SendKey(tuple)
                if text == '0':
                    test = self.MySQLKey(tuple)
                    if test != '2':
                        str = str + tuple + '|' + test + ','
            if count > 3:
                sitetag = self.MySQLPy(self.key)
                if sitetag != ():
                    for x in sitetag:
                        tsql = "xxxx"
                        cur.execute(tsql)
                        #print(cur.fetchall())
                        for s in cur.fetchall():
                            if (s[0] == 'rmvb') or (s[0] == 'rm'):
                                r = '0'
                            else:
                                r = '1'
                            str = str + x[0] + '|' + r + ','
                str = str[:-1]  # drop the trailing comma
            else:
                str = str[:-1]
            # split, deduplicate, and rejoin the collected fields
            strtag = list(set(str.split(',')))
            sText = ','.join(strtag)
            file_object = codecs.open(self.savePath + self.key + '.txt', 'w', 'utf-8')
            file_object.write(sText)
            file_object.close()

    if __name__ == "__main__":
        if len(sys.argv) > 1:
            s = Serach(sys.argv[1])
            s.run()
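    For reference, the minidom pattern the crawler relies on can be tried on inline XML; the document below is hypothetical, shaped like what SendPy expects from the ac_box response:

    from xml.dom.minidom import parseString

    doc = parseString(
        '<?xml version="1.0" encoding="utf-8"?>'
        '<Root><SuggestWord>foo</SuggestWord><SuggestWord>bar</SuggestWord></Root>'
    )
    # same traversal as SendPy: documentElement -> SuggestWord nodes -> text
    for node in doc.documentElement.getElementsByTagName('SuggestWord'):
        print node.firstChild.data   # prints foo, then bar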
