这不是教程!!!!
HTMLParser
Introduction
This module defines a class HTMLParser which serves as the basis for parsing text files formatted in HTML (HyperText Mark-up Language) and XHTML.
常用方法
-
HTMLParser.feed(data)
Feed some text to the parser. It is processed insofar as it consists of complete elements; incomplete data is buffered until more data is fed or close() is called. data can be either unicode or str, but passing unicode is advised.
-
HTMLParser.handle_starttag(tag, attrs)
- 处理开始html标签;
- tag:标签名称;
- attrs:属性列表,由 (属性名称, 值) 形式的元组构成
-
HTMLParser.handle_data(data)
处理标签里的数据体;data:数据文本
# -*- coding: utf-8 -*-
# @Date : 2016-06-14 13:47:56
# @Author : Yuan Su (tomcatyuanshu@gmail.com)
# @Python Version : 2.7.9
"""
抓取豆瓣电影上的热映电影和即将上映电影
"""
import urllib2
from HTMLParser import HTMLParser
class MovieParser(HTMLParser):
    """Collect movie records from ``<li>`` start tags of a Douban movie page.

    A start tag is recorded when it is an ``li`` element with a non-empty
    ``data-title`` attribute and a ``data-category`` of ``nowplaying`` or
    ``upcoming``.  Each record is a dict with the keys ``title``, ``score``,
    ``director`` and ``actors``; an attribute missing from the tag is stored
    as ``None``.  Records are appended to ``self.movies`` in the order the
    tags are parsed, and each one is also printed as it is found.
    """

    # Element whose attributes describe a movie.
    MOVIE_TAG = 'li'
    # Accepted values of the ``data-category`` attribute.
    CATEGORIES = ('nowplaying', 'upcoming')

    def __init__(self):
        """Initialise the base parser and the empty result list."""
        HTMLParser.__init__(self)
        self.movies = []

    def handle_starttag(self, tag, attrs):
        """Record a movie when *tag* is a matching ``<li>`` element.

        tag: name of the start tag being handled.
        attrs: list of ``(name, value)`` tuples for the tag's attributes.
        """
        # The original check compared the tag name with itself, so every
        # element carrying the data-* attributes was accepted.  Compare the
        # actual tag name instead.
        if tag != self.MOVIE_TAG:
            return

        # Keep the first value when an attribute name is repeated.
        found = {}
        for name, value in attrs:
            found.setdefault(name, value)

        category = found.get('data-category')
        if not found.get('data-title') or category not in self.CATEGORIES:
            return

        movie = {
            'title': found.get('data-title'),
            'score': found.get('data-score'),
            'director': found.get('data-director'),
            'actors': found.get('data-actors'),
        }
        self.movies.append(movie)

        # NOTE(review): the original author reported that removing the
        # category line below caused some movies to be missed.  The reason is
        # not evident from this code - verify before dropping the output.
        # Single-argument print(...) parses on both Python 2 and Python 3.
        print('%s movies: ' % category)
        print('%s  |  %s  |  %s  |  %s' % (
            movie['title'], movie['score'],
            movie['director'], movie['actors']))
def movies(url):
    """Download *url* and return the movie records found on the page.

    url: address of the page to fetch.

    Returns the list of movie dicts collected by ``MovieParser``.  Errors
    raised while opening, reading or parsing the page are not caught here
    and propagate to the caller.
    """
    # Browser-like User-Agent header sent with the request.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.84 Safari/537.36'}
    req = urllib2.Request(url, headers=headers)
    parser = MovieParser()
    s = urllib2.urlopen(req)
    try:
        parser.feed(s.read())
    finally:
        # Close the response even when reading or parsing fails.
        s.close()
    return parser.movies
if __name__ == '__main__':
    url = 'http://movie.douban.com/nowplaying/chengdu/'
    # Store the result under its own name; assigning to ``movies`` would
    # replace the movies() function with the returned list.
    movie_list = movies(url)
    # Optional JSON dump of the collected records (left disabled, as in the
    # original script):
    # print movie_list
    # import json
    # print '%s' % json.dumps(movie_list, sort_keys=True, indent=4,
    # separators=(',', ':'))
网友评论