scrapy startproject baidunews
cd baidunews
scrapy genspider nl baidu.com
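genspider writes a skeleton spider to baidunews/spiders/nl.py; it looks roughly like this (the exact template varies by Scrapy version):

# -*- coding: utf-8 -*-
import scrapy

class NlSpider(scrapy.Spider):
    name = 'nl'
    allowed_domains = ['baidu.com']
    start_urls = ['http://baidu.com/']

    def parse(self, response):
        pass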
items.py
# -*- coding: utf-8 -*-
# Define here the models for your scraped items
#
# See documentation in:
# http://doc.scrapy.org/en/latest/topics/items.html
import scrapy

class BaidunewsItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    title = scrapy.Field()    # page title of the news article
    link = scrapy.Field()     # URL of the article page
    content = scrapy.Field()  # raw HTML body of the article page
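A BaidunewsItem behaves like a dict, so fields are read and written by key; for example:

>>> from baidunews.items import BaidunewsItem
>>> item = BaidunewsItem()
>>> item["title"] = ["Example title"]
>>> item["title"]
['Example title']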
Here abc.txt holds the widget URLs (presumably copied from the news.baidu.com page source); pat1 pulls the id parameter out of each line:

>>> fh = open("D:/python/abc.txt", "r", encoding="utf-8")
>>> import re
>>> pat1 = "id=(.*?)&"
>>> for i in fh:
...     thisdata = re.compile(pat1).findall(i)
...     if len(thisdata) != 0:
...         thisdata = thisdata[0]
...     else:
...         pass
...     print(thisdata)
Output (a [] means pat1 found no match on that line, e.g. the ad widget URL where id= comes last and has no trailing &):
GangAoTai
ShiZheng
LatestNews
LocalNews
LocalNews
[]
civilnews
InternationalNews
FinanceNews
EnterNews
SportNews
AutoNews
HouseNews
InternetNews
TechNews
LocalHouseNews
EduNews
GameNews
DiscoveryNews
HealthNews
LadyNews
SocialNews
MilitaryNews
PicWall
[]
>>> a = []
>>> fh = open("D:/python/abc.txt", "r", encoding="utf-8")
>>> for i in fh:
...     thisdata = re.compile(pat1).findall(i)
...     if len(thisdata) != 0:
...         a.append(thisdata[0])
...     else:
...         a.append(i)
>>> a
Output:
['LocalNews', 'GangAoTai', 'ShiZheng', 'LatestNews', 'LocalNews', 'LocalNews', 'http://news.baidu.com/widget?ajax=json&id=ad\n', 'civilnews', 'InternationalNews', 'FinanceNews', 'EnterNews', 'SportNews', 'AutoNews', 'HouseNews', 'InternetNews', 'TechNews', 'LocalHouseNews', 'EduNews', 'GameNews', 'DiscoveryNews', 'HealthNews', 'LadyNews', 'SocialNews', 'MilitaryNews', 'PicWall', '\n']
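These are exactly the ids hard-coded into allid in the spider below. If you want to regenerate abc.txt instead of copying the URLs by hand, a minimal sketch (assuming the widget URLs appear verbatim in the news.baidu.com homepage source; the regex is illustrative):

import re
import urllib.request

# Fetch the Baidu News homepage and keep every widget URL found in the
# source, one per line -- this reproduces the abc.txt used above.
# (Assumption: the widget URLs appear verbatim in the page source.)
data = urllib.request.urlopen("http://news.baidu.com").read().decode("utf-8", "ignore")
urls = re.findall(r"http://news\.baidu\.com/widget\?[^\"' ]+", data)
with open("D:/python/abc.txt", "w", encoding="utf-8") as fh:
    fh.write("\n".join(urls))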
nl.py
# -*- coding: utf-8 -*-
import scrapy
from baidunews.items import BaidunewsItem
from scrapy.http import Request
import re
import time

class NlSpider(scrapy.Spider):
    name = 'nl'
    allowed_domains = ['baidu.com']
    start_urls = ['http://news.baidu.com/widget?id=LocalHouseNews&ajax=json']
    # Category ids extracted above; each one maps to an AJAX widget URL.
    allid = ['LocalHouseNews', 'LocalNews', 'civilnews', 'InternationalNews', 'FinanceNews', 'EnterNews', 'SportNews',
             'AutoNews', 'HouseNews', 'InternetNews', 'InternetPlusNews', 'TechNews', 'EduNews', 'GameNews',
             'DiscoveryNews', 'HealthNews', 'LadyNews', 'SocialNews', 'MilitaryNews', 'PicWall']
    allurl = []
    for k in range(0, len(allid)):
        thisurl = "http://news.baidu.com/widget?id=" + allid[k] + "&ajax=json"
        allurl.append(thisurl)

    def parse(self, response):
        # Re-crawl every category every 300 seconds. Note that time.sleep()
        # blocks Scrapy's reactor; an external scheduler (cron etc.) is the
        # cleaner way to repeat a crawl.
        while True:
            for m in range(0, len(self.allurl)):
                print("Category no. " + str(m))
                # dont_filter=True so repeated rounds are not dropped by
                # Scrapy's duplicate filter.
                yield Request(self.allurl[m], callback=self.next, dont_filter=True)
            time.sleep(300)

    def next(self, response):
        # The widget endpoint returns JSON; pull the article URLs out of it.
        data = response.body.decode("utf-8", "ignore")
        pat1 = '"m_url":"(.*?)"'
        pat2 = '"url":"(.*?)"'
        url1 = re.compile(pat1, re.S).findall(data)
        url2 = re.compile(pat2, re.S).findall(data)
        if len(url1) != 0:
            url = url1
        else:
            url = url2
        print(url)
        for i in range(0, len(url)):
            # The JSON escapes slashes as \/, so unescape them first.
            thisurl = re.sub(r"\\/", "/", url[i])
            print(thisurl)
            yield Request(thisurl, callback=self.next2, dont_filter=True)

    def next2(self, response):
        item = BaidunewsItem()
        item["link"] = response.url
        item["title"] = response.xpath("/html/head/title/text()").extract()
        item["content"] = response.body
        yield item
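With the spider complete, run it from the project directory with Scrapy's standard crawl command:

scrapy crawl nl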
pipelines.py
# -*- coding: utf-8 -*-
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html

class BaidunewsPipeline(object):
    def process_item(self, item, spider):
        # For this demo, just print each crawled title.
        print(item["title"])
        return item
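Note that Scrapy only calls this pipeline if it is registered in settings.py. A minimal sketch of the relevant settings (disabling ROBOTSTXT_OBEY is an assumption here, so the widget requests are not filtered out by robots.txt):

# settings.py (relevant lines only)
# Register the pipeline so process_item() is called for every item.
ITEM_PIPELINES = {
    'baidunews.pipelines.BaidunewsPipeline': 300,
}
# Assumption: obeying robots.txt could block the widget requests.
ROBOTSTXT_OBEY = False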