下面是自己在处理数据时用到的同一个代码的两个版本:其中第一个版本预计需要超过1小时处理时间,且速度越来越慢;第二个需1分钟。
结论:尽量不要在循环里面使用df = df.append(info_dct, ignore_index=True),虽然自己之前经常这么用。原因是每次append都会返回一个新的DataFrame并复制已有的全部数据,所以数据越多越慢(另外,DataFrame.append已在pandas 2.0中被移除)。正确的做法是把row function的返回值用list接收起来,最后统一构造一次DataFrame(如代码二所示)。
代码一:
# coding: utf-8
import pandas as pd
from tqdm import tqdm
from os.path import join
import os
def rearrange(row, df_new):
    """Add one record per frame of ``row`` to ``df_new`` and return the result.

    For every frame index in ``range(row.left_frame, row.right_frame)`` a
    record with the keys ``"image_path"`` and ``"gaze_region"`` is appended.
    ``df_new.append`` is called once per frame and its return value replaces
    ``df_new``, so the object handed back is the one produced by the last
    call (or the argument itself when the range is empty).

    NOTE(review): ``DataFrame.append`` was deprecated in pandas 1.4 and
    removed in pandas 2.0, so this variant only runs on older pandas.
    """
    for frame_idx in range(row.left_frame, row.right_frame):
        # Frame images are named "<video_name>_<frame index>.jpg".
        file_name = row.video_name + "_{}.jpg".format(frame_idx)
        record = {
            "image_path": join(os.getcwd(), row.file_path, row.video_name, file_name),
            "gaze_region": row.facing_region,
        }
        df_new = df_new.append(record, ignore_index=True)
    return df_new
if __name__ == "__main__":
    # Start from an empty frame that already has the two output columns;
    # rearrange() hands back a grown frame for every input row.
    df_rearranged = pd.DataFrame(columns=["image_path", "gaze_region"])

    df = pd.read_csv('./gaze_region_gt.csv', delimiter=',', index_col=None)
    for _idx, segment in tqdm(df.iterrows()):
        df_rearranged = rearrange(segment, df_rearranged)

    # Write the per-frame table without the index column.
    df_rearranged.to_csv("gaze_region_gt_rearranged_tmp.csv", index=False)
代码二:
# coding: utf-8
import pandas as pd
from tqdm import tqdm
from os.path import join
import os
def rearrange(row):
    """Yield one ``(image_path, gaze_region)`` pair per frame of ``row``.

    Frame indices run over ``range(row.left_frame, row.right_frame)``. The
    image path is built from the current working directory,
    ``row.file_path``, ``row.video_name`` and a file name of the form
    ``"<video_name>_<frame index>.jpg"``; the second element of each pair is
    ``row.facing_region``. Nothing is computed until the generator is
    iterated.
    """
    for frame_idx in range(row.left_frame, row.right_frame):
        file_name = row.video_name + "_{}.jpg".format(frame_idx)
        full_path = join(os.getcwd(), row.file_path, row.video_name, file_name)
        yield full_path, row.facing_region
if __name__ == "__main__":
    df = pd.read_csv('./gaze_region_gt.csv', delimiter=',', index_col=None)

    # Collect every (image_path, gaze_region) pair in a plain list first;
    # the DataFrame is built exactly once after the loop.
    records = []
    # iterrows() is a generator with no length, so pass total explicitly to
    # let tqdm show a percentage and an ETA.
    for _idx, row in tqdm(df.iterrows(), total=len(df)):
        records.extend(rearrange(row))

    # Naming the columns explicitly keeps the header correct even when
    # no pairs were produced.
    df_rearranged = pd.DataFrame(records, columns=["image_path", "gaze_region"])
    df_rearranged.to_csv("tst.csv", index=False)
网友评论