有一个中文词库,里面是一些违规词。要检测图片中是否包含这些违规词,需要先对图片进行文字检测(OCR),把检测出的文字保存到一个txt文件中,再用词库中的词逐个去匹配。
import sys
import io
import os
# Python 2 only: make UTF-8 the implicit str<->unicode codec. Python 3 has no
# reload() builtin and no sys.setdefaultencoding(), so the hack is skipped
# there instead of crashing.
if sys.version_info[0] == 2:
    reload(sys)  # noqa: F821 - builtin on Python 2
    sys.setdefaultencoding('utf8')

# io.open accepts the `encoding` argument on both Python 2 and Python 3
# (on Python 3 the builtin open() is the same function).
path = r'D:\img_program\ciku_test'

# The lexicon: a GBK-encoded txt holding the words to look for, one per line.
# readlines() returns every line with its trailing newline, e.g.
# [u'6.16\u5230\u624b\u4ef7\n', u'\u4f18\u60e0\n'].
with io.open(os.path.join(path, 'cuxiao_words_test.txt'), 'r', encoding='gbk') as f2:
    lines_b = f2.readlines()

# Strip once, up front. Without stripping, the trailing newline would be part
# of the word, so a lexicon entry could not match inside a longer word of the
# detected text.
words = [raw_word.strip() for raw_word in lines_b]

# Directory holding the OCR output, one txt per image.
detect_path = r'D:\img_program\ciku_test\detect'
# Directory the match results are written to.
path_result = r'D:\img_program\cuxiaoci_txt_result'

texts = os.listdir(detect_path)
for text in texts:
    print(text)
    shotname, extension = os.path.splitext(text)

    # Mode 'w' creates/truncates the result file even when nothing matches.
    # A single handle is used for all writes and is closed by the `with`.
    with io.open(os.path.join(path_result, shotname + '_result.txt'), 'w', encoding='utf-8') as fc:
        # read() returns the whole detected text as one string, e.g.
        # u'6.16\u5230\u624b\u4ef78\n\u4f18\u60e0\n'.
        with io.open(os.path.join(detect_path, text), 'r', encoding='utf-8') as f1:
            lines_a = f1.read()

        # Check every lexicon word against the detected text.
        for line_b in words:
            if not line_b:
                # A blank lexicon line strips to '', and '' is a substring of
                # every text; skip it rather than record a bogus empty hit.
                continue
            if line_b in lines_a:
                print(line_b)
                fc.write(line_b + u'\n')
除了用 if ... in ... 这种写法,也可以使用 re 模块的 findall。两者的区别是:当检测文本中多次出现同一个词时,if ... in ... 只会召回一次,而 findall 会把每一次出现都召回。注意:如果词语中含有正则表达式的特殊字符(例如"6.16"中的"."),需要先用 re.escape 转义,否则会被当作正则语法来解释。
import re

# Alternative to `if line_b in lines_a`: findall() returns one entry per
# occurrence of the word in the detected text instead of a single yes/no.
# This fragment reuses line_b, lines_a, path_result and shotname from the
# matching loop that precedes it.

# Escape the word so regex metacharacters in it (e.g. the "." in "6.16") are
# matched literally instead of being interpreted as pattern syntax.
pattern = re.compile(re.escape(line_b))
# An empty word would match at every position and yield only empty strings.
find = pattern.findall(lines_a) if line_b else []

# Append one match per line (same layout as the `if ... in ...` variant); the
# `with` block closes the handle once the matches are written.
with io.open(os.path.join(path_result, shotname + '_result.txt'), 'a', encoding='utf-8') as fc:
    fc.writelines(match + u'\n' for match in find)
如果检测文本的行数很多,建议用 readlines 代替 read,逐行进行匹配,例如:
# The lexicon: a GBK-encoded txt, one word per line, opened relative to the
# current working directory.
with io.open('cuxiao_words.txt', 'r', encoding='gbk') as f2:
    lines_b = f2.readlines()

# Strip the trailing newlines once instead of once per detected line.
words = [raw_word.strip() for raw_word in lines_b]

# Directory holding the OCR output, one txt per image.
detect_path = r'D:\img_program\cuxiaoci_txt'
# Directory the match results are written to.
path_result = r'D:\img_program\cuxiaoci_txt_result'

texts = os.listdir(detect_path)
for text in texts:
    print(text)
    shotname, extension = os.path.splitext(text)

    # Mode 'w' creates/truncates the result file even when nothing matches.
    # A single handle is used for all writes and is closed by the `with`.
    with io.open(os.path.join(path_result, shotname + '_result.txt'), 'w', encoding='utf-8') as fc:
        with io.open(os.path.join(detect_path, text), 'r', encoding='utf-8') as f1:
            lines_a = f1.readlines()

        for line_a in lines_a:
            line_a = line_a.strip()
            for line_b in words:
                if not line_b:
                    # '' is a substring of every line; ignore blank entries.
                    continue
                # Substring test against the current line. Testing against
                # the list `lines_a` would only succeed when a raw line,
                # newline included, equals the word exactly.
                if line_b in line_a:
                    # A word is recorded once for every line it occurs on.
                    fc.write(line_b + u'\n')
网友评论