
[Crawler Series]: Scraping Ganji.com (赶集网)

Author: dataheart | Published 2016-05-31 14:37

Results:

(Screenshot of the results: Paste_Image.png)

Code:

1.channel_extract.py

from bs4 import BeautifulSoup
import requests

start_url = 'http://bj.ganji.com/wu/'
url_host = 'http://bj.ganji.com'

def get_channel_urls(url):
    # Fetch the category index page and print the URL of every channel under it
    wb_data = requests.get(url)
    soup = BeautifulSoup(wb_data.text, 'lxml')
    links = soup.select('dl > dd > a')
    for link in links:
        page_url = url_host + link.get('href')
        print(page_url)

channel_list = '''
http://bj.ganji.com/zhuanqu_jiaren/all/
http://bj.ganji.com/zhuanqu_shenghuo/all/
'''
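get_channel_urls only prints the channel links; channel_list above is the hand-pasted output (truncated here to two entries). A minimal sketch of regenerating that list programmatically, assuming the same selector still matches; the helper name collect_channel_urls is mine, not from the original:

def collect_channel_urls(url):
    # Same selector as get_channel_urls, but returns the links instead of printing them
    wb_data = requests.get(url)
    soup = BeautifulSoup(wb_data.text, 'lxml')
    return [url_host + link.get('href') for link in soup.select('dl > dd > a')]

if __name__ == '__main__':
    for channel in collect_channel_urls(start_url):
        print(channel)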

2.page_parsing.py

from bs4 import BeautifulSoup
import requests
import pymongo

# MongoDB connection: scraped listings go into the 'ganjiwang' collection of the 'ganji' database
client = pymongo.MongoClient('localhost', 27017)
ganji = client['ganji']
ganjiwang = ganji['ganjiwang']

headers = {'User-Agent':'Mozilla/5.0 (iPad; CPU OS 9_1 like Mac OS X) AppleWebKit/601.1.46 (KHTML, like Gecko) Version/9.0 Mobile/13B143 Safari/601.1','Connection':'keep-alive'}

def get_links_from(channel, pages, who_sell='o'):
    # A list page is channel + who_sell flag + page number, e.g. http://bj.ganji.com/ipodTouch/o1
    list_view = '{}{}{}'.format(channel, str(who_sell), str(pages))
    wb_data = requests.get(list_view)
    soup = BeautifulSoup(wb_data.text, 'lxml')
    for link in soup.select('dd.feature > div > ul > li > a'):
        data = {'url': link.get('href'), 'title': link.get_text().split()}
        ganjiwang.insert_one(data)
        print(data)

def get_information(url):
    # Fetch one listing detail page and extract its key fields
    wb_data = requests.get(url, headers=headers)
    soup = BeautifulSoup(wb_data.text, 'lxml')
    data = {
        'title': soup.title.text.strip(),
        'price': soup.select('.f22.fc-orange.f-type')[0].text.strip(),
        'pub_date': soup.select('.pr-5')[0].text.strip().split(' ')[0],
        'area': list(map(lambda x: x.text, soup.select('ul.det-infor > li:nth-of-type(3) > a'))),
        'cates': list(soup.select('ul.det-infor > li:nth-of-type(1) > span')[0].stripped_strings),
        'url': url
    }
    print(data)

# Quick test: scrape the first list page of one channel
get_links_from('http://bj.ganji.com/ipodTouch/', 1)
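get_links_from only covers a single list page. A minimal sketch of walking several pages of one channel, assuming the o1/o2/... numbering simply continues and that a fixed upper bound is acceptable (get_all_links_from and max_pages are my names, not from the original):

def get_all_links_from(channel, max_pages=10):
    # List pages are channel + 'o' + page number; stop after an assumed max_pages
    for page in range(1, max_pages + 1):
        get_links_from(channel, page)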

3.mainshop.py

from multiprocessing import Pool
from channel_extract import channel_list
from page_parsing import get_links_from, get_information, ganjiwang

# Collect every detail-page URL saved by get_links_from, dropping duplicates
ganjiwang_urls = [item['url'] for item in ganjiwang.find()]
shopurl = set(ganjiwang_urls)

def get_all_information(shop_url):
    get_information(shop_url)

if __name__ == '__main__':
    pool = Pool()
    pool.map(get_all_information, shopurl)
    pool.close()
    pool.join()
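channel_list and get_links_from are imported above but never used: the detail-page URLs must already be in MongoDB before this script runs. A hedged sketch of that first stage, assuming channel_list holds one URL per line and that the first ten list pages of each channel are enough (crawl_channel and the page range are my assumptions):

def crawl_channel(channel):
    # Fill the ganjiwang collection with listing URLs for one channel
    for page in range(1, 11):
        get_links_from(channel, page)

if __name__ == '__main__':
    pool = Pool()
    pool.map(crawl_channel, channel_list.split())
    pool.close()
    pool.join()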

4.count.py

import time
from page_parsing import ganjiwang

# Print how many listing URLs have been collected so far, refreshing once per second
while True:
    print(ganjiwang.find().count())
    time.sleep(1)
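Cursor.count() was still available in the pymongo versions current in 2016 but has since been removed. If you run this against a recent pymongo (3.7 or later), a drop-in alternative for that line is count_documents:

print(ganjiwang.count_documents({}))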

Reflections:

Practice a lot and Google a lot; beyond that nothing was particularly hard. The real pain: the company computer would not install the 32-bit Windows build of MongoDB, so I had to do all of this at home every evening...
