Yes, you read that right: a title that makes no sense at all. But of all the metaphors that popped into my head to describe my mood and my situation at the time, this one fit best.
What I Did
- Compiled statistics on the Student Affairs Department (学工部) news articles of 2017, covering:

| Date | Title | College | Character count | Image count | Fee |
| --- | --- | --- | --- | --- | --- |

  Note: the fee is computed from the character count and the image count together (per get_data in NewsPart2.py: a base of 50 RMB, plus 15 RMB if the text exceeds 500 characters and another 15 RMB if there are more than two images; e.g. an 800-character article with 3 images earns 50 + 15 + 15 = 80 RMB).
- Sorted the articles by date, then grouped them by college, and finally computed each college's total fee (a minimal sketch of this last step follows the list).
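The per-college aggregation is not part of the scripts below, which stop at writing news_info.xlsx. A minimal sketch of that last step with openpyxl, assuming the column layout that count_num in NewsPart2.py writes, might look like this (aggregate_fees.py is not one of the original scripts):

```python
# aggregate_fees.py -- a sketch, not one of the original scripts:
# sum the fee column of news_info.xlsx per college.
from collections import defaultdict

from openpyxl import load_workbook

wb = load_workbook('news_info.xlsx')
ws = wb.active  # columns: date, title, college, text_num, img_num, money, news_url

totals = defaultdict(int)
for row in ws.iter_rows(min_row=2, values_only=True):  # skip the header row
    date, title, college, text_num, img_num, money, news_url = row
    totals[college] += money

# print colleges from highest total fee to lowest
for college, total in sorted(totals.items(), key=lambda kv: kv[1], reverse=True):
    print(college, str(total) + ' RMB')
```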
How It Was Implemented
- Fetch the URL of every individual article from the listing pages: NewsPart1.py
- Simmer each page into a tasty soup, then fish out what we need with a strainer (BeautifulSoup): NewsPart2.py

Looks pretty simple, right? To someone who had only ever heard the terms "Python" and "web crawler", that is exactly how simple I imagined it would be. In actual practice, however, the college names in the articles were written so inconsistently that I ended up adding a step to resolve each of them to a standard college name:

- Resolve the standard college name: NewsPart3.py

(Figure: standard college names alongside the various ways they appear in the articles)
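The layout of StandardCollegeName.xlsx can be inferred from NewsPart3.py below: column A of each row holds the standard name, and the cells after it hold the variants seen in articles. A hypothetical row (the variant spellings here are made up for illustration):

| Column A (standard name) | Column B (variant) | Column C (variant) |
| --- | --- | --- |
| 信息科学技术学院 | 信息学院 | 信息科学学院 |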
Source Code
NewsPart1.py
```python
#!/usr/bin/python3
import re
import urllib.request


def get_url(url):
    """Fetch a page and return its decoded HTML, sending a browser User-Agent."""
    req = urllib.request.Request(url)
    req.add_header('User-Agent',
                   'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 '
                   '(KHTML, like Gecko) Chrome/65.0.3325.162 Safari/537.36')
    response = urllib.request.urlopen(req)
    html = response.read().decode('utf-8')
    return html


def get_news_url(url):
    """Extract the relative links to individual articles from one listing page."""
    html = get_url(url)
    page_re = r"<a href='(/2017/\d{4}/\w+/page\.htm)'"
    page_addrs_ = re.findall(page_re, html)
    return page_addrs_


def save_txt(addrs, url_home):
    """Append the absolute article URLs to page_list.txt, one per line."""
    with open('page_list.txt', 'a') as file:
        for item in addrs:
            file.write(url_home + item + '\n')


def sort_news():
    """Walk the listing pages list10.htm .. list43.htm and collect article URLs."""
    url = 'http://stuhome.ustc.edu.cn'
    num_news = 0
    page_addrs = []

    for page_list in range(10, 44):
        url_page = url + '/2314/list' + str(page_list) + '.htm'
        page_addrs_temp = get_news_url(url_page)
        num_news += len(page_addrs_temp)
        for each in page_addrs_temp:
            page_addrs.append(each)
    #save_txt(page_addrs, url)
    print('page numbers: ' + str(num_news))
    # prepend the site root so every entry is an absolute URL
    for i in range(0, len(page_addrs)):
        page_addrs[i] = url + page_addrs[i]
    return page_addrs


if __name__ == '__main__':
    sort_news()
```
NewsPart2.py
```python
import re
import time

import NewsPart1
import NewsPart3
import openpyxl
from bs4 import BeautifulSoup


def get_data(html, url):
    """Parse one article page and return one spreadsheet row."""
    soup = BeautifulSoup(html, 'lxml')

    # get news_title ('窗口1' is the frag attribute used by the site template)
    soup_title = soup.find('span', attrs={'frag': '窗口1'})
    news_title = soup_title.get_text()
    print('news_title: ' + news_title)

    # get news_body
    news_body = soup.find('div', attrs={'class': 'wp_articlecontent'})

    # get text length
    news_text = news_body.get_text()
    news_text_num = len(news_text)
    print('news_text_num: ' + str(news_text_num))

    # count the images in the article body
    img_num = len(news_body.find_all('img'))
    print('img_num: ' + str(img_num))

    # get the publication date
    date_num = soup.find('span', attrs={'frag': '窗口4'}).get_text()
    print('date_num: ' + date_num)

    # fee: 50 RMB base, +15 for more than 500 characters, +15 for more than 2 images
    news_money = 50
    if news_text_num > 500:
        news_money += 15
    if img_num > 2:
        news_money += 15
    print('news_money: ' + str(news_money) + ' RMB')

    # get news_college: split the body into <p> paragraphs, then walk backwards
    # from the last paragraph (at most three) until one containing Chinese
    # characters is found -- the signature line is usually at the bottom
    p_college = []
    for p_content in news_body.find_all('p'):
        p_college.append(p_content.get_text())
    try:
        i = -1
        pattern = re.compile(r'[\u4e00-\u9fa5]+')
        while i >= -3 and abs(i) < len(p_college):
            if re.search(pattern, p_college[i]) is None:
                i = i - 1
            else:
                break
        news_college = p_college[i]
    except IndexError as e:
        print(e)
        news_college = ' '

    # strip surrounding brackets (both full- and half-width), then map the
    # college line, title and body text to a standard college name
    news_college = news_college.strip(' )(() ')
    news_college = NewsPart3.Standardcollege(
        news_college + '\n' + news_title + '\n' + news_text)
    print('news_college: ' + news_college)

    result = [date_num, news_title, news_college, news_text_num, img_num,
              news_money, url]
    return result


def count_num(url_page):
    # create a xlsx with a header row
    wb = openpyxl.Workbook()
    ws = wb.active
    ws['A1'] = 'date'
    ws['B1'] = 'title'
    ws['C1'] = 'college'
    ws['D1'] = 'text_num'
    ws['E1'] = 'img_num'
    ws['F1'] = 'money'
    ws['G1'] = 'news_url'

    for each in url_page:
        print('\n---------------------')
        each = each.strip('\n')
        print(each)
        html = NewsPart1.get_url(each)
        result = get_data(html, each)
        # append one row per article and save as we go,
        # so a crash does not lose everything
        ws.append(result)
        wb.save('news_info.xlsx')


if __name__ == '__main__':
    start = time.perf_counter()  # time.clock() was removed in Python 3.8
    page_addrs = NewsPart1.sort_news()
    count_num(page_addrs)
    end = time.perf_counter()
    print('Running time: %s seconds' % (end - start))
```
NewsPart3.py
```python
# pip install openpyxl
# docs in Chinese: https://blog.csdn.net/hunter_wyh/article/details/78498323
from openpyxl import load_workbook


def Standardcollege(text):
    """Map a college mention to its standard name.

    `text` is the candidate college line, the article title and the body
    text joined by newlines; earlier pieces are tried first.
    """
    str_list = text.split('\n')

    # open StandardCollegeName.xlsx: column A holds the standard name,
    # the rest of each row holds the variants seen in articles
    wb = load_workbook('StandardCollegeName.xlsx')
    ws = wb['Sheet1']

    # look for any known variant inside each piece of the input
    for each in str_list:
        i = 0
        for row in ws.iter_rows():
            i = i + 1
            for cell in row:
                if cell.value is None:
                    break
                cell.value = cell.value.strip()
                if each.find(cell.value) != -1:
                    # matched: return the standard name in column A of this row
                    return ws.cell(row=i, column=1).value

    # nothing matched: list the standard names and let me choose by hand
    for column_1 in ws['A1':'A15']:
        for cell in column_1:
            print(cell.row, cell.value)
    print("0 : i can't choose")
    serial_num = int(input('the number you choose is: '))
    if 1 <= serial_num <= 15:
        return ws.cell(row=serial_num, column=1).value
    return 'ZERO'
```
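For reference, a hypothetical call, in the shape NewsPart2.py uses (college line, title and body joined by newlines); whether it matches or falls back to the manual prompt depends entirely on the contents of StandardCollegeName.xlsx:

```python
# hypothetical usage: if '信息学院' appears as a variant in the spreadsheet,
# this returns the standard name from column A of that row without prompting
name = Standardcollege('信息学院\n某新闻标题\n……正文……')
print(name)
```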