I. Characteristics of CSV files
1. The file structure is simple; it is essentially plain text.
2. It can be opened and converted by Microsoft Excel, which is a big advantage and makes it easy to switch how you view the data; if you compare the size of the same data as a csv file and after converting it to an xls file, the size advantage becomes even clearer.
3. Because of this simple storage format, the data takes up less space, which helps with network transfer and with further processing on the client side.
4. Since the file is just a pile of data without any accompanying description, it provides a basic degree of security through obscurity.
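For example, the names.csv file created in the next section is, when opened in an ordinary text editor, nothing more than the following lines (the first line is the header, each later line is one record):

first_name,last_name
Baked,Beans
Lovely,Spam
Wonderful,Spam
钟,史俊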
II. Creating a CSV file
import csv
from csv import DictWriter

# newline='' avoids blank lines between rows on Windows; encoding='utf-8'
# is needed because the last row contains Chinese characters.
with open('names.csv', 'w', newline='', encoding='utf-8') as csvfile:
    fieldnames = ['first_name', 'last_name']
    # writer = csv.DictWriter(csvfile, fieldnames=fieldnames)   # equivalent, via the module path
    writer = DictWriter(csvfile, fieldnames=fieldnames)
    writer.writeheader()
    writer.writerow({'first_name': 'Baked', 'last_name': 'Beans'})
    writer.writerow({'first_name': 'Lovely', 'last_name': 'Spam'})
    writer.writerow({'first_name': 'Wonderful', 'last_name': 'Spam'})
    writer.writerow({'first_name': '钟', 'last_name': '史俊'})
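DictWriter maps column names to values. As a minimal alternative sketch (not from the original post), csv.writer takes each row as a plain list in column order:

import csv

# Sketch: csv.writer writes rows given as lists, so the header is just another row.
with open('names.csv', 'w', newline='', encoding='utf-8') as csvfile:
    writer = csv.writer(csvfile)
    writer.writerow(['first_name', 'last_name'])   # header row
    writer.writerow(['Baked', 'Beans'])
    writer.writerow(['Lovely', 'Spam'])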
III. Reading a CSV file
import csv

# Use the same newline and encoding options as when the file was written.
with open('names.csv', newline='', encoding='utf-8') as csvfile:
    reader = csv.DictReader(csvfile)
    for row in reader:
        print(row['first_name'], row['last_name'])
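For comparison, a minimal sketch (assuming the same names.csv as above) using csv.reader, which yields each row as a list of strings instead of a dict keyed by the header:

import csv

with open('names.csv', newline='', encoding='utf-8') as csvfile:
    reader = csv.reader(csvfile)
    header = next(reader)        # skip the header row
    for row in reader:
        print(row[0], row[1])    # first_name, last_name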
IV. Example
Building on the earlier Requests + regular-expression crawler for the Maoyan movie board, we now store the scraped data as a csv file and then read part of it back. The code is as follows:
import requests
from requests.exceptions import RequestException
import re
import json
import csv
import pandas as pd                  # imported in the original post; not used below
from multiprocessing import Pool     # process pool (multiprocessing, not multithreading)

def get_one_page(url):
    # Fetch one page of the Maoyan Top 100 board; return None on any failure.
    try:
        response = requests.get(url)
        if response.status_code == 200:
            return response.text
        return None
    except RequestException:
        return None

def parse_one_page(html):
    # Extract index, image, title, actors, release time and score with a regex.
    pattern = re.compile(r'<dd>.*?board-index.*?>(\d+)</i>.*?data-src="(.*?)".*?name"><a'
                         r'.*?>(.*?)</a>.*?star">(.*?)</p>.*?releasetime">(.*?)</p>'
                         r'.*?integer">(.*?)</i>.*?fraction">(.*?)</i>.*?</dd>', re.S)
    items = re.findall(pattern, html)
    for item in items:
        yield {
            'index': item[0],
            'image': item[1],
            'title': item[2],
            'actor': item[3].strip()[3:],   # drop the leading "主演:" label
            'time': item[4].strip()[5:],    # drop the leading "上映时间:" label
            'score': item[5] + item[6]      # integer part + fractional part
        }

def write_to_file(content):
    # Also keep a plain-text JSON copy of every record.
    with open('result.text', 'a', encoding='utf-8') as f:
        f.write(json.dumps(content, ensure_ascii=False) + '\n')

def main():
    # 'w' so the header is written exactly once per run; newline='' avoids blank lines.
    with open('top100.csv', 'w', newline='', encoding='utf-8') as csvfile:
        fieldnames = ['title', 'actor', 'index', 'score', 'time', 'image']
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        writer.writeheader()
        for i in range(10):
            url = 'http://maoyan.com/board/4?offset=' + str(i * 10)   # 10 movies per page
            html = get_one_page(url)
            for item in parse_one_page(html):
                writer.writerow(item)
                print(item)
                write_to_file(item)
    # encoding='utf-8' is required here, otherwise reading fails with a decode error.
    with open('top100.csv', 'r', newline='', encoding='utf-8') as csvfile:
        reader = csv.DictReader(csvfile)
        for row in reader:
            print(row['title'], row['time'])

if __name__ == '__main__':
    main()
    # Left commented out in the original post: a multiprocessing version
    # (main would need to accept an offset argument for this to work).
    # pool = Pool()
    # pool.map(main, [i * 10 for i in range(10)])
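pandas is imported above but never used. As a hedged sketch (assuming top100.csv was produced by the run above), the same file could also be read back with pandas:

import pandas as pd

# Load the CSV written by main() into a DataFrame and show a couple of columns.
df = pd.read_csv('top100.csv', encoding='utf-8')
print(df[['title', 'time']].head())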