数据修改子集
案例
- 测试集合修改设置session_detail字段为空
mongos> db.session_detail.updateMany( {}, { $set: { session_detail: null } } )
{ "acknowledged" : true, "matchedCount" : 1944, "modifiedCount" : 0 }
- 查看验证是否已删除
mongos> db.session_detail.find().pretty().limit(1)
{
"_id" : ObjectId("5c8af0cb85c1f416e07a85f1"),
"start_time" : "1552609482000",
"end_time" : "1552609482000",
"end_reason" : "2",
"session_detail" : null,
}
删除建议
- 由于有的集合数据量过大,需要根据时间段分批执行
db.session_detail.updateMany(
{ start_time: { $gte: "1672502400000", $lte: "1675180800000" } },
{ $set: { session_detail: null } }
)
python脚本批量执行修改
- python代码
#!/usr/bin/env python3
import os
import pymongo,configparser,logging
from logging.handlers import RotatingFileHandler
from pymongo import MongoClient
import time
from datetime import datetime,timedelta
def Auto_Mongo(MongoDBAuth):
    """Build a MongoClient from the module-level connection settings.

    Reads the globals ``User``, ``Passwd``, ``MongoHost``, ``Port``,
    ``mongodb_maxPoolSize`` and ``mongodb_minPoolSize`` and logs through the
    global ``logger``.

    Args:
        MongoDBAuth: The string "True" selects a username/password URI;
            any other value connects by host and port without credentials.

    Returns:
        The MongoClient instance, or False when building it raised.
    """
    # Local import so this block does not depend on the file-level imports.
    from urllib.parse import quote_plus

    use_auth = "True" == MongoDBAuth
    try:
        pool_options = {
            "maxPoolSize": int(mongodb_maxPoolSize),
            "minPoolSize": int(mongodb_minPoolSize),
        }
        if use_auth:
            # Credentials must be percent-escaped, otherwise characters such
            # as '@', ':' or '/' in the password corrupt the URI.
            conn_uri = (
                "mongodb://" + quote_plus(User) + ":" + quote_plus(Passwd)
                + "@" + MongoHost + ":" + Port + "/"
            )
            clients = MongoClient(conn_uri, **pool_options)
            logger.info("init mongodb conn ...")
        else:
            clients = MongoClient(MongoHost, int(Port), **pool_options)
            logger.info("init mongodb conn: " + MongoHost + " "+ Port + " ")
        # NOTE(review): MongoClient may connect lazily, in which case an
        # unreachable server is only reported on first use - confirm.
        return clients
    except Exception as e:
        # Report failures at ERROR level and keep the exception text in
        # both branches so the cause is visible in the log.
        if use_auth:
            logger.error("use mongodb uesr passwd error: " + str(e) + " check Exception")
        else:
            logger.error("Not auth mongodb check conn info: " + str(e))
        return False
def logger_func(log_level):
    """Configure and return the "my_logger" logger with a rotating file handler.

    Reads the module-level ``log_file``, ``max_log_size`` and ``backup_count``
    for the handler settings.

    Args:
        log_level: One of "DEBUG", "INFO", "WARNING", "ERROR", "CRITICAL";
            any other value falls back to DEBUG.

    Returns:
        The configured logging.Logger.
    """
    level_by_name = {
        "DEBUG": logging.DEBUG,
        "INFO": logging.INFO,
        "WARNING": logging.WARNING,
        "ERROR": logging.ERROR,
        "CRITICAL": logging.CRITICAL,
    }
    log = logging.getLogger("my_logger")
    log.setLevel(level_by_name.get(log_level, logging.DEBUG))

    # Size-based rotation; the rotated files are kept up to backup_count.
    file_handler = RotatingFileHandler(
        log_file, maxBytes=max_log_size, backupCount=backup_count
    )
    file_handler.setFormatter(
        logging.Formatter("%(asctime)s %(thread)d %(filename)s:%(lineno)d %(levelname)s %(message)s")
    )
    log.addHandler(file_handler)
    return log
def check_job_hour():
    """Block while the current hour is one of the configured paused hours.

    Reads the module-level ``run_non_work_time_list``, which may be the raw
    config string (e.g. '["08","09", "10"]') or an iterable of hour values.
    While the current local hour is in that set the function sleeps 3600
    seconds and checks again; otherwise it returns.
    """
    # Local import so this block does not depend on the file-level imports.
    import re

    if isinstance(run_non_work_time_list, str):
        raw_hours = run_non_work_time_list
    else:
        raw_hours = ",".join(str(item) for item in run_non_work_time_list)
    # Compare hours as integers. A substring test on the raw config string
    # would match hour 1 against "10", hour 2 against "12", and so on.
    blocked_hours = {int(token) for token in re.findall(r"\d+", raw_hours)}

    while True:
        current_hour = datetime.now().hour
        if current_hour not in blocked_hours:
            logger.info("In script job time " + str(run_non_work_time_list) + ", break run script")
            break
        logger.info("now Hour " + str(current_hour) + " in " + str(run_non_work_time_list) + ", sleep 3600s")
        time.sleep(3600)
def get_ent_id_list(mongodb_client):
    """Return the enterprise database names to process.

    When the module-level ``static_state_get_entid`` is true, the list comes
    from the comma-separated ``Get_ent_id_list`` config value. Otherwise it
    is every database name on the server that consists only of digits.

    Args:
        mongodb_client: Client used to list database names in dynamic mode.

    Returns:
        A list of database-name strings.
    """
    if static_state_get_entid:
        logger.debug("This join ent_id config file get")
        # Strip whitespace and drop empty entries so "a, b," still yields
        # valid database names.
        ent_id_list = [
            ent_id.strip()
            for ent_id in Get_ent_id_list.split(",")
            if ent_id.strip()
        ]
    else:
        # Use the client passed by the caller instead of the module global.
        ent_id_list = [
            db for db in mongodb_client.list_database_names() if db.isdigit()
        ]
    logger.info("获取 企业库列表:" + str(ent_id_list))
    return ent_id_list
# Report how much disk space a collection currently occupies.
def get_storage_Size(db_name, collection_name="session_detail"):
    """Return the ``storageSize`` reported by collstats for a collection.

    Uses the module-level ``mongodb_auth`` client and logs the value through
    the global ``logger``.

    Args:
        db_name: Name of the database holding the collection.
        collection_name: Collection to inspect; defaults to "session_detail".

    Returns:
        The ``storageSize`` field of the collstats result (logged as bytes).
    """
    db = mongodb_auth[db_name]
    storage_size = db.command("collstats", collection_name)["storageSize"]
    logger.info("当前 "+ str(db_name) + " " + collection_name + " 实际占用磁盘空间大小:" + str(storage_size) + " byte")
    return storage_size
def remove_session_detail(ent_id, start_timestamp, end_timestamp):
    """Set ``session_detail`` to null for documents in a start_time window.

    Uses the module-level ``mongodb_auth`` client and global ``logger``.

    Args:
        ent_id: Database name (string) whose ``session_detail`` collection
            is updated.
        start_timestamp: Inclusive lower bound; converted to a string for
            the query and divided by 1000 for the log line.
        end_timestamp: Inclusive upper bound, handled the same way.
    """
    collection = mongodb_auth[ent_id]['session_detail']
    # The bounds are sent as strings, matching how the query compares the
    # start_time field.
    time_filter = {
        "start_time": {
            "$gte": str(start_timestamp),
            "$lte": str(end_timestamp),
        }
    }
    logger.debug(ent_id + " update session_detail "+ str(time_filter))

    outcome = collection.update_many(
        time_filter, {"$set": {"session_detail": None}}
    )

    window_start = datetime.fromtimestamp(start_timestamp /1000)
    window_end = datetime.fromtimestamp(end_timestamp /1000)
    logger.info(f"{ent_id} update time {window_start} = {window_end} session_detail {outcome.modified_count} documents updated")
if __name__ == "__main__":
    # Script entry point: load cfg/config.ini from the current working
    # directory, set up logging, connect to MongoDB, then null out
    # session_detail one day at a time for every enterprise database.
    # The names assigned here are module-level globals read by the helper
    # functions above, so they must keep these exact names.
    cfgpath = os.path.join(os.getcwd(), "cfg/config.ini")
    conf = configparser.ConfigParser()
    print("config file ---> ",cfgpath)
    # ConfigParser.read() skips missing files silently; fail early with a
    # clear message instead of a later NoSectionError.
    if not conf.read(cfgpath):
        raise SystemExit("config file not found or unreadable: " + cfgpath)

    ##### logger init #####
    max_log_size = 100 * 1024 * 1024  # 100 MB
    backup_count = 50
    log_file = "./log/update_session.log"
    # Create the log directory synchronously so the file handler can open
    # the log file right away.
    os.makedirs(os.path.dirname(log_file), exist_ok=True)

    MongoDBAuth = conf.get("mongodb", "Auth")
    MongoHost = conf.get("mongodb", "MongoHost")
    User = conf.get("mongodb", "User")
    Passwd = conf.get("mongodb", "Passwd")
    Port = conf.get("mongodb", "Port")
    mongodb_maxPoolSize = conf.get("mongodb", "maxPoolSize")
    mongodb_minPoolSize = conf.get("mongodb", "minPoolSize")
    mongodb_mingxi_table = conf.get("mongodb", "mongodb_mingxi_table")
    start_date = conf.get("Base", "job_start_date")
    end_date = conf.get("Base", "job_end_date")
    CheckintervalDate = conf.get("Base", "CheckintervalDate")
    static_state_get_entid = conf.getboolean("Base", "static_state_get_entid")
    Get_ent_id_list = conf.get("Base", "Get_ent_id_list")
    run_non_work_time_list = conf.get("Base", "run_non_work_time_list")
    Log_Level = conf.get("Base", "Log_Level")
    logger = logger_func(Log_Level)

    # Parse the job window once; each enterprise restarts from job_start.
    job_start = datetime.strptime(start_date, "%Y-%m-%d")
    job_end = datetime.strptime(end_date, "%Y-%m-%d")
    pause_seconds = int(CheckintervalDate)
    one_day = timedelta(days=1)

    mongodb_auth = Auto_Mongo(MongoDBAuth)
    if mongodb_auth:
        ent_id_list = get_ent_id_list(mongodb_auth)
        for ent_id in ent_id_list:
            start_get_size = get_storage_Size(ent_id)
            current_day = job_start
            while current_day <= job_end:
                day_start_ms = int(time.mktime(current_day.timetuple())) * 1000
                # Inclusive upper bound is the last millisecond of the day,
                # so values after 23:59:59.000 are not skipped.
                next_day = current_day + one_day
                day_end_ms = int(time.mktime(next_day.timetuple())) * 1000 - 1
                remove_session_detail(ent_id, day_start_ms, day_end_ms)
                current_day = next_day
                time.sleep(pause_seconds)
                # Pause here while the current hour is a configured
                # paused hour.
                check_job_hour()
            end_get_size = get_storage_Size(ent_id)
            # storageSize is logged in bytes by get_storage_Size; convert
            # the difference to MB to match the unit in the message.
            del_storg_size = (start_get_size - end_get_size) / (1024 * 1024)
            del_num = round(del_storg_size, 3)
            logger.info(ent_id + " 运行去子集操作减少磁盘空间: " + str(del_num) + " MB")
            logger.debug(ent_id + " update session_detail data ok")
            logger.info("*" * 100)
    else:
        logger.error("init mongodb error")
- 脚本配置文件说明
[Base]
#每处理完一天的数据后的休眠间隔,单位为秒(s),建议设置为1秒。
CheckintervalDate=1
#需要修改session_detail子集的开始和截止时间
job_start_date = 2023-01-01
job_end_date = 2024-01-31
#是否使用静态配置企业id:配置为True则从配置文件读取企业id,配置为False则自动从mongodb获取企业id
static_state_get_entid = False
#配置静态企业id列表,上面的static_state_get_entid必须为True,该配置项才能生效。
Get_ent_id_list = 0101290030,0101290033,0103290011
#脚本暂停运行的时间段(按小时配置):当前小时在该列表中时脚本休眠等待,不在列表中时才继续执行
run_non_work_time_list = ["08","09", "10", "11", "12", "13","14", "15", "16", "17", "18", "19", "20"]
#日志输出级别: DEBUG 、INFO、WARNING、ERROR、CRITICAL
Log_Level = INFO
[mongodb]
#mongodb IP地址
MongoHost=127.0.0.1
#mongodb Port
Port=27017
#Mongodb 用户名
User=admin
#Mongodb 密码
Passwd=admin
#登陆库名
AdminDB=admin
#Mongodb集合
mongodb_mingxi_table=session_detail
#是否启用用户名密码认证: True启用, False不启用
Auth=False
#mongodb最大连接数
maxPoolSize= 5
#mongodb最小连接数
minPoolSize= 2
- 脚本运行要求
- 二进制文件脚本需要环境需求: 需要centos7 x64系统。
- 下载地址为内网IP,有需要请自行编译二进制,或执行源码。
[ccodsupport@localhost apex]$ cd /home/ccodsupport/apex/
[ccodsupport@localhost apex]$ wget http://1.1.1.1/Deploymentpackage/remove_session_detail.tar.gz
[ccodsupport@localhost apex]$ tar xvf remove_session_detail.tar.gz
[ccodsupport@localhost apex]$ cd remove_session_detail
[ccodsupport@localhost remove_session_detail]$ ./start.sh -d session
[ccodsupport@localhost remove_session_detail]$ ./start.sh list
modify_session (pid 9740 9743) [running]
[ccodsupport@localhost remove_session_detail]$ tail -1000f log/update_session.log
2024-01-02 15:38:45,747 140637576845120 modify_session_detail.py:25 INFO init mongodb conn: 127.0.0.1 27017
2024-01-02 15:38:45,750 140637576845120 modify_session_detail.py:77 INFO 获取 企业库列表:['01032900104', '1906100002']
2024-01-02 15:38:45,753 140637576845120 modify_session_detail.py:87 INFO 当前session_detail 实际占用磁盘空间大小:278528 byte
2024-01-02 15:38:45,759 140637576845120 modify_session_detail.py:101 INFO 01032900104 update time 2023-05-20 00:00:00 = 2023-05-20 23:59:59 session_detail 0 documents updated
2024-01-02 15:38:45,759 140637576845120 modify_session_detail.py:65 INFO In script job time ["08","09", "10", "11", "12", "13", "16", "16", "17", "18", "19", "20"], break run script
2024-01-02 15:38:45,761 140637576845120 modify_session_detail.py:101 INFO 01032900104 update time 2023-05-21 00:00:00 = 2023-05-21 23:59:59 session_detail 0 documents updated
2024-01-02 15:38:45,761 140637576845120 modify_session_detail.py:65 INFO In script job time ["08","09", "10", "11", "12", "13", "16", "16", "17", "18", "19", "20"], break run script
2024-01-02 15:38:45,784 140637576845120 modify_session_detail.py:87 INFO 当前session_detail 实际占用磁盘空间大小:278528 byte
2024-01-02 15:38:45,784 140637576845120 modify_session_detail.py:159 INFO 01032900104 运行去子集操作减少磁盘空间: 0 MB
2024-01-02 15:38:45,784 140637576845120 modify_session_detail.py:163 INFO ****************************************************************************************************
2024-01-02 15:38:45,787 140637576845120 modify_session_detail.py:87 INFO 当前session_detail 实际占用磁盘空间大小:36864 byte
2024-01-02 15:38:45,788 140637576845120 modify_session_detail.py:101 INFO 1906100002 update time 2023-05-20 00:00:00 = 2023-05-20 23:59:59 session_detail 0 documents updated
2024-01-02 15:38:45,788 140637576845120 modify_session_detail.py:65 INFO In script job time ["08","09", "10", "11", "12", "13", "16", "16", "17", "18", "19", "20"], break run script
2024-01-02 15:38:45,789 140637576845120 modify_session_detail.py:101 INFO 1906100002 update time 2023-05-21 00:00:00 = 2023-05-21 23:59:59 session_detail 0 documents updated
2024-01-02 15:38:45,789 140637576845120 modify_session_detail.py:65 INFO In script job time ["08","09", "10", "11", "12", "13", "16", "16", "17", "18", "19", "20"], break run script
2024-01-02 15:38:45,789 140637576845120 modify_session_detail.py:101 INFO 1906100002 update time 2023-05-22 00:00:00 = 2023-05-22 23:59:59 session_detail 0 documents updated
2024-01-02 15:38:45,790 140637576845120 modify_session_detail.py:65 INFO In script job time ["08","09", "10", "11", "12", "13", "16", "16", "17", "18", "19", "20"], break run script
2024-01-02 15:38:45,790 140637576845120 modify_session_detail.py:101 INFO 1906100002 update time 2023-05-23 00:00:00 = 2023-05-23 23:59:59 session_detail 0 documents updated
2024-01-02 15:38:45,800 140637576845120 modify_session_detail.py:87 INFO 当前session_detail 实际占用磁盘空间大小:36864 byte
2024-01-02 15:38:45,801 140637576845120 modify_session_detail.py:159 INFO 1906100002 运行去子集操作减少磁盘空间: 0 MB
2024-01-02 15:38:45,801 140637576845120 modify_session_detail.py:163 INFO ****************************************************************************************************
- 脚本运行成功后,登录mongodb查询,验证session_detail字段是否已被置空。
mongos> db.session_detail.find().pretty()
{
"_id" : ObjectId("5aab8efe6ee63530732bc78a"),
"start_time" : "1521192700000",
"end_time" : "1521192701000",
"agent_id" : "2001",
"session_detail" : null
}
把字段设置null后
-
MongoDB中,当一个字段的值被设置为null时,集合占用的磁盘空间并不会立即释放。这是因为存储引擎(WiredTiger)只是把更新后腾出的空间标记为可重用:之后在同一集合中插入或更新文档时,MongoDB会优先重用这些已分配的空间,而不会自动把它们归还给操作系统。如果需要真正回收磁盘空间,通常要在业务低峰期对该集合执行 compact 命令(或对节点重新同步数据),这个过程可能需要一些时间。
-
因此,如果您将一个字段的多个数据设置为null,实际集合存储大小不会立即减少。要查看实际磁盘空间占用的变化,可以使用
db.collection.stats()
命令来获取集合的统计信息。该命令会返回一个包含有关集合的详细信息的对象,其中包括集合的总大小、已使用的空间和空闲空间等信息。
网友评论