Odds and Ends: Day Four

Author: 小喵周周 | Published 2017-12-30 11:12

Mengmeng has left, so it's back to hitting the books. Tonight I'm excerpting and annotating the code from my 51job spider analysis:

import os
from pprint import pprint
import csv
from collections import Counter

from bs4 import BeautifulSoup
import requests
import matplotlib.pyplot as plt
import jieba
from wordcloud import WordCloud


class JobSpider:
    """
    Crawler class for the 51job site
    """

    def __init__(self):
        self.company = []
        self.text = ""
        self.headers = {
            'X-Requested-With': 'XMLHttpRequest',
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36'
                          ' (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36'
        }

Beautiful Soup supports the HTML parser in the Python standard library as well as several third-party parsers. If no third-party parser is installed, Python falls back to the built-in one; the lxml parser is more powerful and faster, so installing it is recommended.
BeautifulSoup(markup, "html.parser") --- Python standard library; weaker error tolerance on older Python versions
BeautifulSoup(markup, "lxml") --- lxml library; requires installing its C dependencies
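A quick way to see the difference is to feed both parsers the same broken markup (a minimal sketch; the sample string here is made up):

from bs4 import BeautifulSoup

broken = "<p>Python<b>spider"  # deliberately unclosed tags
print(BeautifulSoup(broken, "html.parser").prettify())  # stdlib parser, nothing to install
print(BeautifulSoup(broken, "lxml").prettify())         # third-party: pip install lxml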

f = open("text.txt", 'r', encoding='utf8')  # 从文件中读取要处理的大段文字
lines = []
for line in f:
    rs = line.rstrip('\n')  # strip the trailing newline
    lines.append(rs)
print(lines)

Dealing with the newline characters when reading from a file
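The same result can be had in one step with splitlines(), which drops each line's trailing newline; a minimal equivalent sketch:

with open("text.txt", encoding='utf8') as f:
    lines = f.read().splitlines()  # every element already has its '\n' removed
print(lines)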
First draft of the job-posting processing script

# -*- coding: utf-8 -*-
import os
import sys
from pprint import pprint
import csv
from collections import Counter

from bs4 import BeautifulSoup
import requests
import matplotlib.pyplot as plt
import jieba
from wordcloud import WordCloud


class JobSpider:
    """Crawler class for the 51job site."""
    def __init__(self):
        self.company = []
        self.text = ""
        self.headers = {
            'X-Requested-With': 'XMLHttpRequest',
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36'
                          ' (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36'
        }

    def job_spider(self):
        """Crawl the search result pages and collect link / title / location / salary."""
        url = "http://search.51job.com/list/010000%252C020000%252C030200%252C" \
              "040000,000000,0000,00,9,99,Python,2,{}.html? lang=c&stype=1&" \
              "postchannel=0000&workyear=99&cotype=99&degreefrom=99&jobterm=99" \
              "&companysize=99&lonlat=0%2C0&radius=-1&ord_field=0&confirmdate=9" \
              "&fromType=1&dibiaoid=0&address=&line=&specialarea=00&from=&welfare="
        urls = [url.format(p) for p in range(2, 3)]  # page numbers to fetch; range(2, 3) is page 2 only
        for url in urls:
            r = requests.get(url, headers=self.headers).content.decode('gbk')
            bs = BeautifulSoup(r, 'lxml').find(
                "div", class_="dw_table").find_all("div", class_="el")
            for b in bs:
                try:
                    href, post = b.find('a')['href'], b.find('a')['title']
                    locate = b.find('span', class_='t3').text
                    salary = b.find('span', class_='t4').text
                    d = {
                        'href': href,
                        'post': post,
                        'locate': locate,
                        'salary': salary
                    }
                    self.company.append(d)
                except Exception:
                    pass
    '''def stopwordslist(stoppath): 
        stoppath='stop_words.txt'
        stopwords = [line.strip() for line in open(stoppath, 'r', encoding='utf-8').readlines()]  
        print(stopwords)
        return stopwords'''
    
    def post_require(self):
        """Fetch each job's detail page and accumulate the description text."""
        for c in self.company:
            r = requests.get(
                c.get('href'), headers=self.headers).content.decode('gbk')
            #print('href')
            bs = BeautifulSoup(r, 'lxml').find(
                'div', class_="bmsg job_msg inbox").text
            s = bs.replace("举报", "").replace("分享", "").replace("\t", "").strip()
            self.text += s
        # print(self.text)
        with open(os.path.join("post_require.txt"),
                  "w+", encoding="utf-8",newline='') as f:
            f.write(self.text)

    @staticmethod
    def post_desc_counter():
        """Tokenize the accumulated job descriptions and count word frequencies."""
        
        # import thulac
        
        post = open(os.path.join("post_require.txt"),
                    "r", encoding="utf-8").read()
        # tokenize with thulac
        # thu = thulac.thulac(seg_only=True)
        # thu.cut(post, text=True)

        # tokenize with jieba
        file_path = os.path.join("user_dict.txt")
        jieba.load_userdict(file_path)
        seg_list = jieba.cut(post, cut_all=False)
        #stopwords = spider.stopwordslist('stop_words.txt')  # path of the stop-word list loaded here
        stopwords = [line.strip() for line in open('stop_words.txt', 'r', encoding='utf-8').readlines()]  
      
        outstr = []  
        for word in seg_list:  
            
            if word not in stopwords:  
                if word != '\t':  
                    outstr.append(word)
                    #print(outstr)
                    #outstr += " "  
        
        counter = dict()
        
        if not outstr:  # outstr is a list, so test emptiness instead of comparing to ''
            print('bad result')
        
        for seg in outstr:
            
            counter[seg] = counter.get(seg, 0) + 1  # default 0 so the first occurrence counts as 1
        counter_sort = sorted(
            counter.items(), key=lambda value: value[1], reverse=True)
        pprint(counter_sort)
        with open(os.path.join("post_pre_desc_counter.csv"),
                  "w+", encoding="utf-8",newline='') as f:
            f_csv = csv.writer(f)
            f_csv.writerows(counter_sort)

    def post_counter(self):

        lst = [c.get('post') for c in self.company]
        counter = Counter(lst)
        counter_most = counter.most_common()
        pprint(counter_most)
        with open(os.path.join("post_pre_counter.csv"),
                  "w+", encoding="utf-8") as f:
            f_csv = csv.writer(f)
            f_csv.writerows(counter_most)

    def post_salary_locate(self):
        """Write (salary, post, locate) triples to CSV."""
        lst = []
        for c in self.company:
            lst.append((c.get('salary'), c.get('post'), c.get('locate')))
        #pprint(lst)
        file_path = os.path.join("post_salary_locate.csv")
        with open(file_path, "w+", encoding="utf-8",newline='') as f:
            f_csv = csv.writer(f)
            f_csv.writerows(lst)

    @staticmethod
    def post_salary():
        """Normalize every salary figure to 万/月 (10k RMB per month)."""
        month = []
        year = []
        thousand = []
        with open(os.path.join("post_salary_locate.csv"),
                  "r", encoding="utf-8") as f:
            f_csv = csv.reader(f)
            for row in f_csv:
                # keep the numeric range, dropping the 3-character unit suffix
                if "万/月" in row[0]:
                    month.append((row[0][:-3], row[2], row[1]))
                elif "万/年" in row[0]:
                    year.append((row[0][:-3], row[2], row[1]))
                elif "千/月" in row[0]:
                    thousand.append((row[0][:-3], row[2], row[1]))

        # point estimate for a "low-high" range: low + 0.4 * (high - low);
        # yearly figures are divided by 12 and 千/月 figures by 10 to get 万/月
        calc = []
        for m in month:
            s = m[0].split("-")
            calc.append(
                (round(
                    (float(s[1]) - float(s[0])) * 0.4 + float(s[0]), 1),
                 m[1], m[2]))
        for y in year:
            s = y[0].split("-")
            calc.append(
                (round(
                    ((float(s[1]) - float(s[0])) * 0.4 + float(s[0])) / 12, 1),
                 y[1], y[2]))
        for t in thousand:
            s = t[0].split("-")
            calc.append(
                (round(
                    ((float(s[1]) - float(s[0])) * 0.4 + float(s[0])) / 10, 1),
                 t[1], t[2]))
        pprint(calc)
        with open(os.path.join("post_salary.csv"),
                  "w+", encoding="utf-8", newline='') as f:
            f_csv = csv.writer(f)
            f_csv.writerows(calc)

    @staticmethod
    def post_salary_counter():
        """ 薪酬统计
        """
        with open(os.path.join("post_salary.csv"),
                  "r", encoding="utf-8") as f:
            f_csv = csv.reader(f)
            lst = [row[0] for row in f_csv]
        counter = Counter(lst).most_common()
        pprint(counter)
        with open(os.path.join("post_salary_counter1.csv"),
                  "w+", encoding="utf-8",newline='') as f:
            f_csv = csv.writer(f)
            f_csv.writerows(counter)

    @staticmethod
    def word_cloud():
        """Generate a word cloud from the word-frequency CSV."""
        counter = {}
        with open(os.path.join("post_pre_desc_counter.csv"),
                  "r", encoding="utf-8") as f:
            f_csv = csv.reader(f)
            for row in f_csv:
                counter[row[0]] = int(row[1])  # word -> frequency
            #pprint(counter)
        file_path = os.path.join("msyh.ttf")
        wc = WordCloud(font_path=file_path,
                       max_words=100,
                       height=600,
                       width=1200).generate_from_frequencies(counter)
        plt.imshow(wc)
        plt.axis('off')
        plt.show()
        wc.to_file(os.path.join("wc.jpg"))

    '''@staticmethod
    def insert_into_db():
        """ 插入数据到数据库
            create table jobpost(
                j_salary float(3, 1),
                j_locate text,
                j_post text
            );
        """
        import pymysql
        conn = pymysql.connect(host="localhost",
                               port=3306,
                               user="root",
                               passwd="0303",
                               db="chenx",
                               charset="utf8")
        cur = conn.cursor()
        with open(os.path.join("data", "post_salary.csv"),
                  "r", encoding="utf-8") as f:
            f_csv = csv.reader(f)
            sql = "insert into jobpost(j_salary, j_locate, j_post) values(%s, %s, %s)"
            for row in f_csv:
                value = (row[0], row[1], row[2])
                try:
                    cur.execute(sql, value)
                    conn.commit()
                except Exception as e:
                    print(e)
        cur.close()

    '''
if __name__ == "__main__":
    spider = JobSpider()
    spider.job_spider()
    #spider.stopwordslist()
    #spider.post_require()
    # enable the steps below as needed
    #spider.post_salary_locate()
    spider.post_desc_counter()
    #spider.post_salary()
    # spider.insert_into_db()
    #spider.post_salary_counter()
    #spider.post_counter()
    spider.word_cloud()
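
For reference, post_salary() turns a range like "1-1.5万/月" into the point estimate low + 0.4 * (high - low), normalized to 万/月. A standalone sketch of that conversion (estimate_monthly is a hypothetical helper, not part of the script above):

def estimate_monthly(raw):
    # same low + 0.4 * (high - low) weighting as post_salary(),
    # with 万/年 divided by 12 and 千/月 divided by 10
    for unit, divisor in (("万/月", 1), ("万/年", 12), ("千/月", 10)):
        if unit in raw:
            low, high = (float(x) for x in raw[:-3].split("-"))
            return round((low + 0.4 * (high - low)) / divisor, 1)
    return None

print(estimate_monthly("1-1.5万/月"))  # 1.2
print(estimate_monthly("10-20万/年"))  # 1.2
print(estimate_monthly("6-8千/月"))    # 0.7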
   
