```{python}
#https://www.stechies.com/read-file-line-by-line-python/
## https://blog.csdn.net/weixin_41578567/article/details/103262925
## https://www.statology.org/pandas-remove-duplicates-keep-max/
import os
import sys
import pandas as pd
input_file="alltransc.genome.sorted.uniq.gene+_gene-.bingji_145.sart1.haveid.2.genome.mRNA_CDS_1_head100.gff3"

# Positional labels V1..V9, one per tab-separated field of the input table.
GFF3_COLUMNS = ["V" + str(i) for i in range(1, 10)]


def load_gff3_table(source, chunksize=500):
    """Read a headerless, tab-separated 9-column table into one DataFrame.

    The file is parsed in chunks of ``chunksize`` rows and the chunks are
    concatenated; the resulting index is the 0-based row number in the file.

    Args:
        source: Path or file-like object accepted by ``pandas.read_csv``.
        chunksize: Number of rows parsed per chunk.

    Returns:
        A DataFrame whose columns are labelled ``V1`` .. ``V9``.

    Raises:
        ValueError: If the input does not have exactly nine columns
            (raised by pandas when the labels are assigned).
    """
    # NOTE: for very large inputs the ``datatable`` package (``dt.fread``) is
    # a faster alternative; install with
    # "conda install -c conda-forge datatable".
    chunks = pd.read_csv(source, header=None, sep="\t", chunksize=chunksize)
    table = pd.concat(chunks)
    table.columns = GFF3_COLUMNS
    return table


def keep_longest_per_start(table, key_columns=("V3", "V4"), rank_column="V5"):
    """Drop rows that repeat ``key_columns``, keeping the largest ``rank_column``.

    Rows are ordered by ``rank_column`` descending, the first row of every
    ``key_columns`` combination is kept, and the survivors are returned in
    their original (index) order.

    Args:
        table: DataFrame containing ``key_columns`` and ``rank_column``.
        key_columns: Columns that together identify duplicate rows.
        rank_column: Column whose largest value decides which duplicate stays.

    Returns:
        A new DataFrame with one row per ``key_columns`` combination.
    """
    # NOTE(review): in standard GFF3 column order V3/V4/V5 are
    # type/start/end, i.e. this keeps the longest feature per (type, start).
    # The original note labelled them "V5=mRNA; V3=start; V4=end" - confirm
    # which layout the input really uses.
    # NOTE(review): the key does not include V1 or V7, so rows that differ
    # only in those columns are treated as duplicates - verify this is
    # intended.
    # A stable sort makes ties deterministic: among rows with equal
    # rank_column the one that appears first in the input survives. The
    # default quicksort gives no such guarantee.
    ranked = table.sort_values(rank_column, ascending=False, kind="mergesort")
    return ranked.drop_duplicates(list(key_columns)).sort_index()


# Guarded so that importing this code (e.g. from a test) does not touch the
# file system; run as a script or notebook chunk it behaves as before and
# still leaves ``df`` and ``df_new`` defined at module level.
if __name__ == "__main__":
    df = load_gff3_table(input_file)
    df_new = keep_longest_per_start(df)
    df_new.to_csv("output.gff3", sep="\t", index=False, header=False)
```
网友评论