
The Little Grove at the Foot of the Mountain and the Clumsy Lumberjack

Author: 骊殇 | Published 2018-03-24 22:26

    Yes, you read that right. This baffling title is simply the best metaphor my brain could come up with for my mood and my situation right now.

    What I did

    • Compiled statistics on the Student Affairs Department's 2017 news articles, covering:
    date, title, college, text length (characters), image count, and payment

    Note: the payment is computed from the text length and the image count together (see the sketch after this list).

    • Sorted the articles chronologically, then grouped and tallied them by college, and finally computed each college's total payment
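    For reference, the payment rule (as implemented in NewsPart2.py below) is: 50 RMB base, plus 15 RMB if the body runs over 500 characters, plus 15 RMB if the article carries more than 2 images. A minimal restatement:

    def news_money(text_num, img_num):
        # Payment rule from NewsPart2.py: 50 RMB base, +15 for long text,
        # +15 for image-heavy articles.
        money = 50
        if text_num > 500:
            money += 15
        if img_num > 2:
            money += 15
        return money

    print(news_money(620, 3))  # -> 80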

    How I implemented it


    1. Grab the URL of each individual article from the listing pages — NewsPart1.py
    2. Brew the pages into a tasty soup and skim out what we need with a strainer — NewsPart2.py (a small taste of this follows the list)
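    As a taste of step 2, here is a minimal sketch (not the project code itself) of how BeautifulSoup turns a page into "soup" and skims elements out of it. The HTML string is made up for illustration, and the stdlib 'html.parser' stands in for the 'lxml' parser the project uses:

    from bs4 import BeautifulSoup

    html = "<div class='wp_articlecontent'><p>hello</p><img src='a.png'/></div>"
    soup = BeautifulSoup(html, 'html.parser')  # brew the page into soup
    body = soup.find('div', attrs={'class': 'wp_articlecontent'})
    print(len(body.get_text()))       # character count of the body text
    print(len(body.find_all('img')))  # number of images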

    Looks fairly simple, right? That is exactly how simple I imagined it would be, back when "Python" and "web crawler" were just words I had heard somewhere. In practice, the college names in the articles were written so inconsistently that I ended up adding a step that resolves each one to a standard college name:


    3. Determine the standard college name — NewsPart3.py

    Standard college names versus the various ways they appear in the articles
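    NewsPart3.py reads this table from a workbook named StandardCollegeName.xlsx with a sheet 'Sheet1': column A of each row holds the standard name, and the cells after it hold the variants. A sketch of how such a workbook could be built; the two rows here are made-up placeholders, not the real table:

    from openpyxl import Workbook

    wb = Workbook()
    ws = wb.active
    ws.title = 'Sheet1'
    # Column A: standard name; following cells: variants seen in articles.
    ws.append(['信息科学技术学院', '信息学院', '信息科学与技术学院'])  # hypothetical row
    ws.append(['数学科学学院', '数学学院'])                           # hypothetical row
    wb.save('StandardCollegeName.xlsx')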

    Source code

    NewsPart1.py

    #!/usr/bin/python3
    import urllib.request
    import re


    def get_url(url):
        # Fetch a page, sending a browser User-Agent so the server serves us normally.
        req = urllib.request.Request(url)
        req.add_header('User-Agent',
                       'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 '
                       '(KHTML, like Gecko) Chrome/65.0.3325.162 Safari/537.36')
        response = urllib.request.urlopen(req)
        html = response.read().decode('utf-8')
        return html


    def get_news_url(url):
        # Extract the relative address of every 2017 article on one listing page.
        html = get_url(url)
        page_re = r"<a href='(/2017/\d{4}/\w+/page.htm)'"
        page_addrs_ = re.findall(page_re, html)
        return page_addrs_


    def save_txt(addrs, url_home):
        # Optionally dump the full article URLs to a text file.
        with open('page_list.txt', 'a') as file:
            for item in addrs:
                file.write(url_home + item + '\n')


    def sort_news():
        # Walk listing pages 10 through 43 and collect every article address.
        url = 'http://stuhome.ustc.edu.cn'
        num_news = 0
        page_addrs = []

        for page_list in range(10, 44):
            url_page = url + '/2314/list' + str(page_list) + '.htm'
            page_addrs_temp = get_news_url(url_page)
            num_news += len(page_addrs_temp)
            for each in page_addrs_temp:
                page_addrs.append(each)
            #save_txt(page_addrs, url)
        print('page numbers:     ' + str(num_news))

        # Turn the relative addresses into full URLs.
        for i in range(0, len(page_addrs)):
            page_addrs[i] = url + page_addrs[i]
        return page_addrs


    if __name__ == '__main__':
        sort_news()
    
    

    NewsPart2.py

    import urllib.request
    import sys
    import re
    import os
    import time
    import NewsPart1
    import NewsPart3
    import openpyxl
    from openpyxl import load_workbook
    from bs4 import BeautifulSoup


    def get_data(html, url):
        soup = BeautifulSoup(html, 'lxml')

        # Get the news title ('窗口1' is the title fragment of the page template).
        soup_title = soup.find('span', attrs={'frag': '窗口1'})
        news_title = soup_title.get_text()
        print('news_title:  ' + news_title)

        # Get the article body.
        news_body = soup.find('div', attrs={'class': 'wp_articlecontent'})

        # Count the characters of body text.
        news_text = news_body.get_text()
        news_text_num = len(news_text)
        print('news_text_num:   ' + str(news_text_num))

        # Count the images.
        img_num = len(news_body.find_all('img'))
        print('img_num: ' + str(img_num))

        # Get the publication date ('窗口4' is the date fragment).
        date_num = soup.find('span', attrs={'frag': '窗口4'}).get_text()
        print('date_num:    ' + date_num)

        # Compute the payment: 50 RMB base, +15 for more than 500 characters,
        # +15 for more than 2 images.
        news_money = 50
        if news_text_num > 500:
            news_money += 15
        if img_num > 2:
            news_money += 15
        print('news_money:  ' + str(news_money) + ' RMB')

        # Guess the college: split the body by <p> tags and take the last
        # paragraph (searching back at most three) that contains Chinese text.
        p_college = []
        for p_content in news_body.find_all('p'):
            p_college.append(p_content.get_text())
        try:
            i = -1
            pattern = re.compile(r'[\u4e00-\u9fa5]+')
            while i >= -3 and abs(i) < len(p_college):
                if re.search(pattern, p_college[i]) is None:
                    i = i - 1
                else:
                    break
            news_college = p_college[i]
        except IndexError as e:
            print(e)
            news_college = ' '

        # Strip stray brackets, then map the guess to a standard college name.
        news_college = news_college.strip('  )(() ')
        news_college = NewsPart3.Standardcollege(
            news_college + '\n' + news_title + '\n' + news_text)
        print('news_college:    ' + news_college)

        result = [date_num, news_title, news_college, news_text_num, img_num,
                  news_money, url]
        return result


    # ... (lines 97-135 of the original listing are not shown in the post)

    def count_num(url_page):
        # Create a workbook with one header row.
        wb = openpyxl.Workbook()
        ws = wb.active
        ws['A1'] = 'date'
        ws['B1'] = 'title'
        ws['C1'] = 'college'
        ws['D1'] = 'text_num'
        ws['E1'] = 'img_num'
        ws['F1'] = 'money'
        ws['G1'] = 'news_url'
        # ... (lines 148-153 of the original listing are not shown in the post)

        # Scrape each article and append one row per article, saving as we go
        # so a crash loses little work.
        for each in url_page:
            print('\n---------------------')
            each = each.strip('\n')
            print(each)
            html = NewsPart1.get_url(each)
            result = get_data(html, each)
            ws.append(result)
            wb.save('news_info.xlsx')


    if __name__ == '__main__':
        # time.clock() was removed in Python 3.8; perf_counter() replaces it.
        start = time.perf_counter()
        page_addrs = NewsPart1.sort_news()
        count_num(page_addrs)
        end = time.perf_counter()
        print('Running time:  %s seconds' % (end - start))
    
    

    NewsPart3.py

    # pip install openpyxl
    # Chinese docs: https://blog.csdn.net/hunter_wyh/article/details/78498323
    from openpyxl import load_workbook


    def Standardcollege(text):
        # The input is 'college guess\ntitle\nbody text'; the pieces are
        # searched in that order, so the college guess is tried first.
        str_list = text.split('\n')

        # Open the lookup table: column A of each row holds the standard name,
        # the following cells hold the variant spellings seen in articles.
        wb = load_workbook('StandardCollegeName.xlsx')
        ws = wb['Sheet1']

        # Look for any known variant inside the input text.
        for each in str_list:
            i = 0
            for row in ws.iter_rows():
                i = i + 1
                for cell in row:
                    if cell.value is None:
                        break
                    cell.value = cell.value.strip()
                    flag = each.find(cell.value)
                    if flag != -1:
                        # Matched a variant: return the standard name in column A.
                        return ws.cell(row=i, column=1).value

        # Nothing matched: list the standard names and let the user pick one.
        for column_1 in ws['A1':'A15']:
            for cell in column_1:
                print(cell.row, cell.value)
        print("0  : i can't choose")
        serial_num = int(input('your order you choose is:'))
        if 1 <= serial_num <= 15:
            # Rows A1..A15 hold the standard names, so the chosen number
            # maps straight to a row in column A.
            return ws.cell(row=serial_num, column=1).value
        return 'ZERO'
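    A quick way to try the resolver by hand, assuming a workbook like the sketch earlier exists and lists '信息学院' as a variant (a hypothetical call, not from the post):

    import NewsPart3

    # Should print the standard name from column A of the matching row.
    print(NewsPart3.Standardcollege('信息学院\nsome title\nsome body text'))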
    
    

    Output
