Python处理千万行数据的过程和思路.md

作者: 平凡的运维之路 | 来源:发表于2020-08-08 23:42 被阅读0次

Python处理千万行数据的过程和思路.md
Python数据分析与数据挖掘思路
每日计划详细版
python操作excel的实践
10个Python Pandas库学习案例
linux文件md5加密
Mac PostgreSQL和postgis环境配置
Linux | Python 命令行
2021-02-10 Xgboost超参调整
数据分析圣经《利用python进行数据分析第二版》

事情的来源

需要给客户提供录音文件和录音索引数据，两者之间存在差异，需要先做差异对比说明，然后再提供给客户

现有阶段实现方式

现阶段使用的方式是：把文件列表和录音索引列表都读取并转换成list，然后循环索引列表，截取关键字，判断其是否在文件列表中；再把该文件关键字存储在redis的set集合中，待下次有相同的关键字，则直接跳过不执行
现有代码

#!/bin/python
import redis

def FileToList(Filename):
    """Read a text file and return its lines with surrounding whitespace removed.

    The file name is echoed to stdout before the file is read.

    Args:
        Filename: Path of the text file to read.

    Returns:
        A list holding one stripped string per line, in file order.
    """
    print(Filename,'---->')
    with open(Filename,'r') as Handle:
        return [RawLine.strip() for RawLine in Handle]

def FileToSplistList(Filename):
    """Return the last "/"-separated component of every line of a text file.

    Each line is stripped of surrounding whitespace first; a line without
    any "/" is returned unchanged (after stripping).  The file name is
    echoed to stdout before the file is read.

    Args:
        Filename: Path of the text file to read.

    Returns:
        A list with one string per line, in file order.
    """
    print(Filename,'---->')
    with open(Filename,'r') as Handle:
        # rsplit with maxsplit=1 yields the same final component as split("/")[-1].
        return [RawLine.strip().rsplit("/", 1)[-1] for RawLine in Handle]

def HandleIndex(SqlList,RecordList):
    """Split index rows into matched rows and difference rows.

    For every row the first comma-separated field is split on "/": the last
    component is the recording name and the component at position 1 is used
    as a suffix for the Redis set key ("RecordName" + component).  A row
    whose name is not yet in that Redis set and is present in ``RecordList``
    is written to the global ``Writefile`` and its name is added to both the
    per-key set and the 'RecordList' set.  Every other row is written to the
    global ``ChayiWritefile``.

    Relies on the module-level globals ``client``, ``Writefile`` and
    ``ChayiWritefile``.

    Args:
        SqlList: Iterable of index rows (strings).
        RecordList: Container of recording names tested with ``in``.

    Raises:
        IndexError: If the first field of a row contains no "/".
    """
    for Row in SqlList:
        PathParts = Row.split(",")[0].split("/")
        Name = PathParts[-1]
        SetKey = "RecordName" + PathParts[1]
        AlreadySeen = client.sismember(SetKey, Name)
        # NOTE: when RecordList is a plain list (as the script's entry point
        # passes it) the membership test below is a linear scan per row.
        if AlreadySeen or Name not in RecordList:
            ChayiWritefile.write(Row + "\n")
            continue
        client.sadd(SetKey, Name)
        client.sadd('RecordList', Name)
        Writefile.write(Row + "\n")

if __name__ == '__main__':
    # HandleIndex reads `client`, `Writefile` and `ChayiWritefile` as
    # module-level globals, so all three are bound here at script level.
    client = redis.StrictRedis(host='localhost', port=6379, db=0)

    # Single identifier from which every input and output file name is built,
    # so switching data sets means editing one value only.
    Entid = "0211270039"
    recordfiles = './Xsfiles/' + Entid + '.txt'
    RecordIndexs = './DsRecordIndex/' + Entid + '.txt'
    filename = Entid + "_sql.txt"
    Chayifilename = Entid + "_ChaYi_sql.txt"

    # Context managers guarantee both output files are flushed and closed
    # even when reading the inputs or HandleIndex raises.
    with open(filename, 'a') as Writefile, open(Chayifilename, 'a') as ChayiWritefile:
        newSqlList =  FileToList(RecordIndexs)
        newRecordList =  FileToSplistList(recordfiles)
        HandleIndex(newSqlList,newRecordList)

改造实现方式

就是把上面的录音文件列表读取出来，放在redis库中，然后循环索引，判断其是否在redis库的set集合里面，再根据返回值判断是否能对应上
通过该方式，比在Python内部用两个list循环判断快了上百倍，情况如下

#使用的新的方式循环第一个文件列表去redis中判断，是否存在，1秒处理有4K多的数据
[root@xxxx NewRun]# wc -l 0211270052_sql.txt ; sleep 1 ; wc -l 0211270052_sql.txt 
293040 0211270052_sql.txt
297450 0211270052_sql.txt

#而使用Python两个文件list循环，每秒只能处理20多条数据，而这两个列表的数据量只有350W。
[root@xxxx XianShanData]# wc -l 0211270039_sql.txt   ; sleep 1;  wc -l 0211270039_sql.txt
3166737 0211270039_sql.txt
3166759 0211270039_sql.txt

redis存储数据，key中的个数查看

[root@xxxx NewRun]# redis-cli 
127.0.0.1:6379> scard  0211270052_RecordFile_List
(integer) 15619783

使用redis-cli命令查看redis 全部key和删除某个key

127.0.0.1:6379> KEYS *
1) "0211270052_RecordIndex_List"
2) "0211270052_RecordFile_List"
3) "test001"

127.0.0.1:6379> del test001
(integer) 1

查看redis key对应的值数据，默认查看10条数据

127.0.0.1:6379> sscan 0211270053_RecordFile_List 1
1) "2305"
2)  1) "TEL-18683047475_8000662250_20190108101557.wav"

代码详情

#!/bin/python
#-*- coding:utf-8 -*-

import redis

"""
1.新增：把上面的录音文件列表读取出来，放在redis库中，然后循环索引，判断其是否在redis库的set集合里面，再根据返回值判断是否能对应上
2.2020年8月8日16:42:18

"""

def FileToList(Filename):
    """Load a text file as a list of whitespace-stripped lines.

    Prints the file name to stdout, then reads the file line by line.

    Args:
        Filename: Path of the text file to read.

    Returns:
        A list containing every line of the file, stripped, in file order.
    """
    print(Filename,'---->')
    with open(Filename,'r') as Source:
        Stripped = [Text.strip() for Text in Source]
    return Stripped

def FileToSplistList(Filename):
    """Load a text file and keep only the part of each line after the last "/".

    Lines are stripped of surrounding whitespace before being split; a line
    with no "/" is kept whole.  Prints the file name to stdout first.

    Args:
        Filename: Path of the text file to read.

    Returns:
        A list with one string per line, in file order.
    """
    print(Filename,'---->')
    with open(Filename,'r') as Source:
        # rpartition returns the text after the last "/" (or the whole
        # string when there is none), matching split("/")[-1].
        Names = [Text.strip().rpartition("/")[2] for Text in Source]
    return Names


def FileToSplistToRedisSet(Filename,KeyName,BatchSize=1000):
    """Add the last "/"-separated component of every line of a file to a Redis set.

    The file is streamed line by line instead of being loaded whole, and the
    names are sent to Redis in batches (one SADD call with many members) so
    that a large file does not cost one network round trip per line.  The
    resulting set content is the same as adding the names one at a time.

    Relies on the module-level global ``client``.  Prints the file name to
    stdout before reading.

    Args:
        Filename: Path of the text file to read.
        KeyName: Name of the Redis set that receives the names.
        BatchSize: Maximum number of names sent per SADD call.
    """
    print(Filename,'---->')
    Batch = []
    with open(Filename,'r')  as  File:
        for line in File:
            Batch.append(line.strip().split("/")[-1])
            if len(Batch) >= BatchSize:
                client.sadd(KeyName,*Batch)
                Batch = []
    # SADD requires at least one member, so only flush a non-empty remainder.
    if Batch:
        client.sadd(KeyName,*Batch)

def HandleIndex(SqlList,RecordFileKeyName,RecordListKeyName):
    """Split index rows into matched rows and difference rows using Redis sets.

    For every row, the recording name is the last "/"-separated component of
    the first comma-separated field.  A row is written to the global
    ``Writefile`` when that name is a member of the Redis set
    ``RecordFileKeyName`` and has not been stored in the Redis set
    ``RecordListKeyName`` before; the name is stored there as part of the
    check.  Every other row (no matching file name, or a repeated name) is
    written to the global ``ChayiWritefile``.

    Relies on the module-level globals ``client``, ``Writefile`` and
    ``ChayiWritefile``.

    Args:
        SqlList: Iterable of index rows (strings).
        RecordFileKeyName: Redis set holding the known recording file names.
        RecordListKeyName: Redis set collecting the names already matched.
    """
    for Sqltxt in SqlList:
        RecordName = Sqltxt.split(",")[0].split("/")[-1]
        # SADD returns the number of members that were newly added, so a
        # single call both records the name and reports whether it was
        # already present.  It is only reached for names that exist in the
        # file set, so the index set ends up with the same content as a
        # separate SISMEMBER + SADD would produce.
        if client.sismember(RecordFileKeyName,RecordName) and client.sadd(RecordListKeyName,RecordName):
            Writefile.write(Sqltxt + "\n")
        else:
            ChayiWritefile.write(Sqltxt + "\n")

if __name__ == '__main__':
    # FileToSplistToRedisSet and HandleIndex read `client`, `Writefile` and
    # `ChayiWritefile` as module-level globals, so they are bound here at
    # script level.
    client = redis.StrictRedis(host='localhost', port=6379, db=0)

    # Single identifier from which every path, output name and Redis key is
    # built, so switching data sets means editing one value only.
    Entid = "0211270052"
    recordfiles = '/home/record/XianShanData/Xsfiles/' + Entid + '.txt'
    RecordIndexs = '/home/record/XianShanData/DsRecordIndex/' + Entid + '.txt'
    filename = Entid + "_sql.txt"
    Chayifilename = Entid + "_ChaYi_sql.txt"
    RecordFileKeyName =  Entid + "_RecordFile_List"
    RecordListKeyName =  Entid + "_RecordIndex_List"

    # Context managers guarantee both output files are flushed and closed
    # even when a later step raises.
    with open(filename, 'a') as Writefile, open(Chayifilename, 'a') as ChayiWritefile:
        newSqlList =  FileToList(RecordIndexs)
        # Drop both sets first so this run does not see members from an
        # earlier run.
        client.delete(RecordFileKeyName)
        client.delete(RecordListKeyName)
        # The file names live only in Redis; no in-memory copy of the file
        # list is built because nothing in this script reads one.
        FileToSplistToRedisSet(recordfiles,RecordFileKeyName)
        HandleIndex(newSqlList,RecordFileKeyName,RecordListKeyName)

总结分析

在处理这1000多万数据量的过程中，最开始只用Python的list处理时很无奈：List太大，执行速度太慢；网上搜索后尝试过 [i for i,x in enumerate(SqlList) if x.find(Record) !=-1] 的方式，实践中效率仍然太慢，后面又在网络的海洋中检索有帮助的信息文档。
最后想到可以把用于比对差异和去重的数据放在redis中，作为临时数据存储；检索了python对redis的操作之后，决定使用set集合的方式，适用于本问题。验证测试没问题之后，改造为把第一个文件内容读取处理后入库到redis当中，然后读取第二个文件，循环截取获取关键字，再到redis的set集合中去判断是否存在，对应返回True或者False，根据判断结果，把需要的数据写入文件中。
遇到暂时无法解决的问题，请不要放弃，可能前方就有解决的方法或者思路，相信自己，共勉。