通过姓名是否可以判断出性别呢?例如"小红"这个名字,我想应该是一位女性吧。为了验证姓名是否和性别有关,我借助网上提供的数据集,编写了以下代码进行测试。
代码如下:
import os

import numpy as np
import tensorflow as tf
from tensorflow.keras import Sequential, layers, losses, optimizers
# Enable on-demand GPU memory allocation when a GPU is present.
# TensorFlow requires memory growth to be configured identically on every
# visible GPU, so it is applied to all of them rather than only the first.
# A machine without a GPU falls back to the CPU instead of aborting.
gpu = tf.config.experimental.list_physical_devices(device_type='GPU')
for _device in gpu:
    tf.config.experimental.set_memory_growth(_device, True)

# Encoded names are truncated or zero-padded to this many characters.
NAME_MAX_LEN = 4
def get_word_index(ch: str):
    """Map a single character to its vocabulary index.

    Index layout:

    * ``0`` -- padding, and any character outside the two CJK ranges below.
    * ``1 .. 6582`` -- CJK Unified Ideographs Extension A, U+3400..U+4DB5.
    * ``6583 .. 27484`` -- CJK Unified Ideographs, U+4E00..U+9FA5.

    Args:
        ch: A string containing exactly one character.

    Returns:
        The integer index of ``ch``, or ``0`` when it is not in the vocabulary.

    Raises:
        ValueError: If ``ch`` is not exactly one character long.
    """
    if len(ch) != 1:
        raise ValueError("expected a single character, got {!r}".format(ch))

    ext_a_first, ext_a_last = 0x3400, 0x4DB5   # 13312 .. 19893
    cjk_first, cjk_last = 0x4E00, 0x9FA5       # 19968 .. 40869
    ext_a_size = ext_a_last - ext_a_first + 1  # 6582 code points

    # ord() works for every character; parsing the "unicode-escape" bytes
    # fails for ASCII and control characters, which carry no hex payload.
    code_point = ord(ch)
    if cjk_first <= code_point <= cjk_last:
        # Placed directly after the Extension A block.
        return code_point - cjk_first + 1 + ext_a_size
    if ext_a_first <= code_point <= ext_a_last:
        # Start at 1 so that U+3400 is not confused with the padding index 0.
        return code_point - ext_a_first + 1
    return 0
def get_word_text(idx: int):
    """Return the character whose Unicode code point is ``idx``.

    NOTE(review): this is a plain ``chr`` call and therefore not the inverse
    of ``get_word_index``, which shifts code points into a compact index
    range -- confirm whether an inverse mapping was intended.
    """
    character = chr(idx)
    return character
def encode_name(name):
    """Encode a name as a fixed-length list of character indices.

    Every character is converted with ``get_word_index``. The result always
    has ``NAME_MAX_LEN`` entries: longer names are cut off after
    ``NAME_MAX_LEN`` characters, shorter ones are right-padded with ``0``.
    """
    indices = list(map(get_word_index, name))
    # Slicing is a no-op for short names; the padding is empty for long ones.
    clipped = indices[:NAME_MAX_LEN]
    padding = [0] * (NAME_MAX_LEN - len(clipped))
    return clipped + padding
def get_name_sex(test_ratio=0.2, path='../nameSex.txt'):
    """Load the name/sex data set and split it into train and test parts.

    Each non-blank line of the file is expected to look like ``name:label``
    where ``label`` is an integer. Names are encoded with ``encode_name``.
    The split is positional: the leading ``1 - test_ratio`` share of the
    samples becomes the training set and the remainder the test set; the
    samples are not shuffled.

    Args:
        test_ratio: Fraction of the samples placed in the test set.
        path: Location of the data file. The default uses a forward slash so
            that it points at the parent directory on Windows and POSIX alike.

    Returns:
        ``(x_train, y_train), (x_test, y_test)`` where the ``x_*`` values are
        lists of encoded names and the ``y_*`` values are lists of int labels.

    Raises:
        OSError: If the file cannot be opened.
        IndexError: If a non-blank line contains no ``:`` separator.
        ValueError: If a label is not an integer.
    """
    samples = []
    labels = []
    with open(path, 'r', encoding='utf-8') as rf:
        for line in rf:
            if not line.strip():
                # Tolerate blank lines, e.g. an empty line at the end of file.
                continue
            fields = line.split(':')
            name = fields[0]
            sex = int(fields[1].strip())
            samples.append(encode_name(name))
            labels.append(sex)

    train_num = int(len(samples) * (1 - test_ratio))
    return ((samples[:train_num], labels[:train_num]),
            (samples[train_num:], labels[train_num:]))
# Load the encoded names and labels, then turn them into NumPy arrays.
# Keras interprets a plain Python list as a list of separate model inputs,
# so nested lists are not a reliable input format for fit()/evaluate().
(train_data, train_labels), (test_data, test_labels) = get_name_sex()
train_data = np.asarray(train_data, dtype=np.int32)
train_labels = np.asarray(train_labels, dtype=np.float32)
test_data = np.asarray(test_data, dtype=np.int32)
test_labels = np.asarray(test_labels, dtype=np.float32)

# Must exceed the largest index produced by get_word_index (27484).
vocab_size = 27500

# Embedding -> two stacked GRUs -> a single sigmoid output unit.
model = Sequential([
    layers.Embedding(vocab_size, 10),
    layers.GRU(20, return_sequences=True),
    layers.GRU(20, dropout=0.5),
    layers.Dense(1, activation=tf.nn.sigmoid)
])
model.summary()

optimizer = optimizers.Adam(1e-2)
model.compile(optimizer=optimizer,
              loss=losses.binary_crossentropy,
              metrics=['accuracy'])

# Hold out the first 1000 training samples for validation.
x_val = train_data[:1000]
partial_x_train = train_data[1000:]
y_val = train_labels[:1000]
partial_y_train = train_labels[1000:]
class ShowSaveCallback(tf.keras.callbacks.Callback):
    """Keras callback that prints the epoch logs and saves the model.

    After each epoch the model being trained is written to
    ``./model_back/<epoch>_name_sex_model.h5``.
    """

    def on_epoch_end(self, epoch, logs=None):
        """Print ``logs`` and save the model for the finished ``epoch``.

        Args:
            epoch: Index of the epoch that just ended; used in the file name.
            logs: Metric results for this epoch as passed in by Keras.
        """
        print("log", logs)
        # Create the target directory on demand so that saving does not
        # depend on the directory already existing.
        os.makedirs("./model_back", exist_ok=True)
        # self.model is the model this callback is attached to, so the
        # callback does not rely on a module-level ``model`` variable.
        self.model.save("./model_back/{}_name_sex_model.h5".format(epoch))
# Train for 10 epochs, validating on the held-out split and running
# ShowSaveCallback at the end of every epoch.
history = model.fit(
    x=partial_x_train,
    y=partial_y_train,
    epochs=10,
    validation_data=(x_val, y_val),
    callbacks=[ShowSaveCallback()],
)

# Evaluate the trained model on the test split.
model.evaluate(x=test_data, y=test_labels)
nameSex.txt 的数据格式如下(每行为"名字:标签"):
欧:0
倩:0
妍:0
文竹:0
璐:0
彤彤:0
祎祎:0
紫微:0
裕贤:0
钰雯:0
睿:0
淑云:0
婧祺:0
玉:0
遥:0
丽香:0
钰:0
泸:1
烨婷:0
宸:1
佳琪:0
靖博:1
博安:1
阳枝山:1
靖茜:0
测试发现姓名与性别确实有关系:准确率约为87%,说明大部分人的性别可以通过姓名判断出来,但仍有少部分不能。
网友评论