Results:

Code:
1.channel_extract.py
from bs4 import BeautifulSoup
import requests

start_url = 'http://bj.ganji.com/wu/'
url_host = 'http://bj.ganji.com'

def get_channel_urls(url):
    # fetch the category index page and print every channel link on it
    wb_data = requests.get(url)
    soup = BeautifulSoup(wb_data.text, 'lxml')
    links = soup.select('dl > dd > a')
    for link in links:
        page_url = url_host + link.get('href')
        print(page_url)

channel_list = '''
http://bj.ganji.com/zhuanqu_jiaren/all/
http://bj.ganji.com/zhuanqu_shenghuo/all/
'''
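The printed channel URLs were then pasted into channel_list by hand (only two are kept here). They can be regenerated by calling the function directly (a minimal sketch, assuming the category page layout has not changed):

if __name__ == '__main__':
    # print every second-hand channel URL found under /wu/
    get_channel_urls(start_url)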
2.page_parsing.py
from bs4 import BeautifulSoup
import requests
import pymongo

client = pymongo.MongoClient('localhost', 27017)
ganji = client['ganji']
ganjiwang = ganji['ganjiwang']

headers = {
    'User-Agent': 'Mozilla/5.0 (iPad; CPU OS 9_1 like Mac OS X) AppleWebKit/601.1.46 (KHTML, like Gecko) Version/9.0 Mobile/13B143 Safari/601.1',
    'Connection': 'keep-alive'
}

def get_links_from(channel, pages, who_sell='o'):
    # build the list page URL and store every item link in MongoDB
    list_view = '{}{}{}'.format(channel, str(who_sell), str(pages))
    wb_data = requests.get(list_view)
    soup = BeautifulSoup(wb_data.text, 'lxml')
    for link in soup.select('dd.feature > div > ul > li > a'):
        data = {'url': link.get('href'), 'title': link.get_text().split()}
        ganjiwang.insert_one(data)
        print(data)

def get_information(url):
    # example item page: http://bj.ganji.com/shuma/1872151635x.htm
    wb_data = requests.get(url, headers=headers)
    soup = BeautifulSoup(wb_data.text, 'lxml')
    data = {
        'title': soup.select('div.col-cont.title-box > h1')[0].get_text(),
        'time': soup.select('div > ul.title-info-l.clearfix > li > i')[0].get_text(),
        'cate': soup.select('ul.det-infor > li > span')[0].get_text(),
        'price': soup.select('li > i.f22.fc-orange.f-type')[0].get_text(),
        'place': list(map(lambda x: x.text, soup.select('div > ul.det-infor > li > a')[1:])),
        'url': url
    }
    print(data)
    # a second, slightly different set of fields parsed from the same page
    data = {
        'title': soup.title.text.strip(),
        'price': soup.select('.f22.fc-orange.f-type')[0].text.strip(),
        'pub_date': soup.select('.pr-5')[0].text.strip().split(' ')[0],
        'area': list(map(lambda x: x.text, soup.select('ul.det-infor > li:nth-of-type(3) > a'))),
        'cates': list(soup.select('ul.det-infor > li:nth-of-type(1) > span')[0].stripped_strings),
        'url': url
    }
    print(data)
    for place in data['area']:
        print(place)

if __name__ == '__main__':
    # quick test: crawl the first iPod Touch list page
    get_links_from('http://bj.ganji.com/ipodTouch/', 1)
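The list-page URL that get_links_from builds is just channel + who_sell + page number, e.g. 'http://bj.ganji.com/ipodTouch/' + 'o' + '3' gives 'http://bj.ganji.com/ipodTouch/o3', so walking several pages of one channel is a plain loop (a minimal sketch; get_pages_from and the 10-page cutoff are not part of the original code):

def get_pages_from(channel, page_count=10):
    # hypothetical helper: crawl the first page_count list pages of one channel
    for page in range(1, page_count + 1):
        get_links_from(channel, page)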
3.main.py
from multiprocessing import Pool
from channel_extract import channel_list
from page_parsing import get_links_from, get_information, ganjiwang

# de-duplicate the item URLs already collected by get_links_from
ganjiwang_urls = [item['url'] for item in ganjiwang.find()]
shopurl = set(ganjiwang_urls)

def get_all_information(shop):
    # Pool.map feeds the item URLs in one at a time
    get_information(shop)

if __name__ == '__main__':
    pool = Pool()
    pool.map(get_all_information, shopurl)
    pool.close()
    pool.join()
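channel_list is imported above but never used; the same Pool pattern can drive the listing stage as a separate first pass that fills the ganjiwang collection before the item pages are parsed (a minimal sketch, not the original code; crawl_listing and the 100-page cap per channel are assumptions):

def crawl_listing(channel):
    # hypothetical stage-one worker: store item links from every list page of one channel
    for page in range(1, 101):
        get_links_from(channel, page)

if __name__ == '__main__':
    listing_pool = Pool()
    listing_pool.map(crawl_listing, channel_list.split())
    listing_pool.close()
    listing_pool.join()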
4.count.py
import time
from page_parsing import ganjiwang

# print how many item links have been collected, once a second
while True:
    print(ganjiwang.find().count())
    time.sleep(1)
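Note: cursor.count() was removed in pymongo 4.x; if the counter script fails there, counting on the collection itself is the replacement (a sketch assuming a newer pymongo):

import time
from page_parsing import ganjiwang

while True:
    # count_documents({}) replaces the removed find().count()
    print(ganjiwang.count_documents({}))
    time.sleep(1)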
Reflections:
Practice more and Google more; everything else was not a big deal. The company computer could not get the win32 build working for MongoDB, what a pitfall! So I had to do the hands-on work at home every night...