scrapy startproject baidunews
cd baidunews
scrapy genspider nl baidu.com
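genspider writes a skeleton spider to baidunews/spiders/nl.py; it looks roughly like this (the exact template varies by Scrapy version):

# -*- coding: utf-8 -*-
import scrapy

class NlSpider(scrapy.Spider):
    name = 'nl'
    allowed_domains = ['baidu.com']
    start_urls = ['http://baidu.com/']

    def parse(self, response):
        pass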
items.py
# -*- coding: utf-8 -*-
# Define here the models for your scraped items
#
# See documentation in:
# http://doc.scrapy.org/en/latest/topics/items.html
import scrapy

class BaidunewsItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    title = scrapy.Field()    # page title of the news article
    link = scrapy.Field()     # URL of the article page
    content = scrapy.Field()  # raw HTML body of the article page
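A BaidunewsItem behaves like a dict, so fields are read and written by key; for example:

>>> from baidunews.items import BaidunewsItem
>>> item = BaidunewsItem()
>>> item["title"] = ["Example title"]
>>> item["title"]
['Example title']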
Here abc.txt holds the widget URLs (presumably copied from the news.baidu.com page source); pat1 pulls the id parameter out of each line:

>>> fh = open("D:/python/abc.txt", "r", encoding="utf-8")
>>> import re
>>> pat1 = "id=(.*?)&"
>>> for i in fh:
...     thisdata = re.compile(pat1).findall(i)
...     if len(thisdata) != 0:
...         thisdata = thisdata[0]
...     else:
...         pass
...     print(thisdata)
Output (a [] means pat1 found no match on that line, e.g. the ad widget URL where id= comes last and has no trailing &):
GangAoTai
ShiZheng
LatestNews
LocalNews
LocalNews
[]
civilnews
InternationalNews
FinanceNews
EnterNews
SportNews
AutoNews
HouseNews
InternetNews
TechNews
LocalHouseNews
EduNews
GameNews
DiscoveryNews
HealthNews
LadyNews
SocialNews
MilitaryNews
PicWall
[]
>>> a = []
>>> fh = open("D:/python/abc.txt", "r", encoding="utf-8")
>>> for i in fh:
...     thisdata = re.compile(pat1).findall(i)
...     if len(thisdata) != 0:
...         a.append(thisdata[0])
...     else:
...         a.append(i)
>>> a
Output:
['LocalNews', 'GangAoTai', 'ShiZheng', 'LatestNews', 'LocalNews', 'LocalNews', 'http://news.baidu.com/widget?ajax=json&id=ad\n', 'civilnews', 'InternationalNews', 'FinanceNews', 'EnterNews', 'SportNews', 'AutoNews', 'HouseNews', 'InternetNews', 'TechNews', 'LocalHouseNews', 'EduNews', 'GameNews', 'DiscoveryNews', 'HealthNews', 'LadyNews', 'SocialNews', 'MilitaryNews', 'PicWall', '\n']
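These are exactly the ids hard-coded into allid in the spider below. If you want to regenerate abc.txt instead of copying the URLs by hand, a minimal sketch (assuming the widget URLs appear verbatim in the news.baidu.com homepage source; the regex is illustrative):

import re
import urllib.request

# Fetch the Baidu News homepage and keep every widget URL found in the
# source, one per line -- this reproduces the abc.txt used above.
# (Assumption: the widget URLs appear verbatim in the page source.)
data = urllib.request.urlopen("http://news.baidu.com").read().decode("utf-8", "ignore")
urls = re.findall(r"http://news\.baidu\.com/widget\?[^\"' ]+", data)
with open("D:/python/abc.txt", "w", encoding="utf-8") as fh:
    fh.write("\n".join(urls))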
nl.py
# -*- coding: utf-8 -*-
import scrapy
from baidunews.items import BaidunewsItem
from scrapy.http import Request
import re
import time

class NlSpider(scrapy.Spider):
    name = 'nl'
    allowed_domains = ['baidu.com']
    start_urls = ['http://news.baidu.com/widget?id=LocalHouseNews&ajax=json']
    # Category ids extracted above; each one maps to an AJAX widget URL.
    allid = ['LocalHouseNews', 'LocalNews', 'civilnews', 'InternationalNews', 'FinanceNews', 'EnterNews', 'SportNews',
             'AutoNews', 'HouseNews', 'InternetNews', 'InternetPlusNews', 'TechNews', 'EduNews', 'GameNews',
             'DiscoveryNews', 'HealthNews', 'LadyNews', 'SocialNews', 'MilitaryNews', 'PicWall']
    allurl = []
    for k in range(0, len(allid)):
        thisurl = "http://news.baidu.com/widget?id=" + allid[k] + "&ajax=json"
        allurl.append(thisurl)

    def parse(self, response):
        # Re-crawl every category every 300 seconds. Note that time.sleep()
        # blocks Scrapy's reactor; an external scheduler (cron etc.) is the
        # cleaner way to repeat a crawl.
        while True:
            for m in range(0, len(self.allurl)):
                print("Category no. " + str(m))
                # dont_filter=True so repeated rounds are not dropped by
                # Scrapy's duplicate filter.
                yield Request(self.allurl[m], callback=self.next, dont_filter=True)
            time.sleep(300)

    def next(self, response):
        # The widget endpoint returns JSON; pull the article URLs out of it.
        data = response.body.decode("utf-8", "ignore")
        pat1 = '"m_url":"(.*?)"'
        pat2 = '"url":"(.*?)"'
        url1 = re.compile(pat1, re.S).findall(data)
        url2 = re.compile(pat2, re.S).findall(data)
        if len(url1) != 0:
            url = url1
        else:
            url = url2
        print(url)
        for i in range(0, len(url)):
            # The JSON escapes slashes as \/, so unescape them first.
            thisurl = re.sub(r"\\/", "/", url[i])
            print(thisurl)
            yield Request(thisurl, callback=self.next2, dont_filter=True)

    def next2(self, response):
        item = BaidunewsItem()
        item["link"] = response.url
        item["title"] = response.xpath("/html/head/title/text()").extract()
        item["content"] = response.body
        yield item
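With the spider complete, run it from the project directory with Scrapy's standard crawl command:

scrapy crawl nl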
pipelines.py
# -*- coding: utf-8 -*-
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html

class BaidunewsPipeline(object):
    def process_item(self, item, spider):
        # For this demo, just print each crawled title.
        print(item["title"])
        return item
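Note that Scrapy only calls this pipeline if it is registered in settings.py. A minimal sketch of the relevant settings (disabling ROBOTSTXT_OBEY is an assumption here, so the widget requests are not filtered out by robots.txt):

# settings.py (relevant lines only)
# Register the pipeline so process_item() is called for every item.
ITEM_PIPELINES = {
    'baidunews.pipelines.BaidunewsPipeline': 300,
}
# Assumption: obeying robots.txt could block the widget requests.
ROBOTSTXT_OBEY = False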