labels 必须是整数
参考这个分类模型:https://www.cnblogs.com/zhangxianrong/p/15067067.html
for batch in train_loader:
    # Forward pass: clear the optimizer's gradients before this batch.
    optim.zero_grad()
    # Move the three batch tensors to the target device, in the same order
    # as before: input_ids, attention_mask, labels.
    input_ids, attention_mask, labels = (
        batch[key].to(device)
        for key in ('input_ids', 'attention_mask', 'labels')
    )
    outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
    # The loss is taken from the first element of the model output and
    # accumulated as a Python float.
    loss = outputs[0]
    total_train_loss += loss.item()
总是提醒:
File "/opt/conda/lib/python3.7/site-packages/torch/nn/functional.py", line 2980, in binary_cross_entropy_with_logits
raise ValueError("Target size ({}) must be the same as input size ({})".format(target.size(), input.size()))
ValueError: Target size (torch.Size([16])) must be the same as input size (torch.Size([16, 2]))
也就是损失函数的 input 和 target 的形状对不上,即 label 一直对不上。我们这是一个二分类问题,输入的 label 是这样构造的:
# Buggy version: x[2] is used as-is, so float labels produce a float tensor.
labels = torch.tensor([x[2] for x in data])
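这里由于没有强制类型转换,这个 label 是 float 类型;
但是 label 必须是整数类型,不然无法自动处理,深层次的原因我还要再看看。先记录,花了很久才 debug 出来。(补充:transformers 的 BertForSequenceClassification 在未指定 problem_type 时,会根据 num_labels 和 labels 的 dtype 自动选择损失函数:num_labels > 1 且 labels 为整数类型(torch.long / torch.int)时使用 CrossEntropyLoss(单标签分类),否则使用 BCEWithLogitsLoss(多标签分类);后者要求 target 与 logits 形状一致,所以 float 类型的 label 会触发上面的报错。)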
# Fixed version: cast each label to int so the tensor is built from integers.
labels = torch.tensor([int(x[2]) for x in data])
解决 BertTokenizer 太慢的问题
- 使用 多线程,而不是多进程,多进程数据处理有问题
- 使用BertTokenizerFast ,相比BertTokenizer要快一些
def batch_encode_data(self, data, thread=8):
    """Encode ``data`` on a thread pool and merge the per-chunk results.

    ``data`` is cut into at most ``thread`` contiguous chunks. Each chunk is
    handed to ``self.sub_encode_data`` on a worker thread, and the chunk
    results are merged back in the original sample order.

    Args:
        data: Sliceable sequence of samples accepted by
            ``self.sub_encode_data``.
        thread: Maximum number of worker threads / chunks; must be >= 1.

    Returns:
        Tuple ``(input_ids, attention_masks, labels)``. The first two are
        the per-sample tensors concatenated along dim 0; ``labels`` is
        built with ``torch.tensor`` from the collected label list.

    Raises:
        ValueError: If ``thread`` is smaller than 1.

    Note:
        An empty ``data`` is not special-cased: ``torch.cat`` is then called
        with an empty list, exactly as before.
    """
    if thread < 1:
        raise ValueError("thread must be >= 1, got {}".format(thread))

    # Ceiling division: ``thread`` chunks of this size always cover every
    # sample. The previous ``len(data) / (thread - 1)`` divided by zero for
    # thread=1 and could silently drop the tail of ``data``.
    chunk_size = max(1, -(-len(data) // thread))

    input_ids = []
    attention_masks = []
    labels = []
    # The context manager shuts the pool down once all chunks are done.
    with ThreadPoolExecutor(thread) as pool:
        futures = [
            pool.submit(self.sub_encode_data, data[start:start + chunk_size])
            for start in range(0, len(data), chunk_size)
        ]
        # Collect in submission order (not completion order) so the output
        # rows line up with the input samples deterministically.
        for future in futures:
            chunk_ids, chunk_masks, chunk_labels = future.result()
            input_ids.extend(chunk_ids)
            attention_masks.extend(chunk_masks)
            labels.extend(chunk_labels)

    input_ids = torch.cat(input_ids, dim=0)
    attention_masks = torch.cat(attention_masks, dim=0)
    labels = torch.tensor(labels)
    return input_ids, attention_masks, labels
def sub_encode_data(self, data):
    """Tokenize every sample in ``data`` and collect ids, masks and labels.

    For each sample, the first 200 characters of ``sample[1]`` are
    prepended to ``sample[0]`` and the combined text is encoded with
    ``self.tokenizer.encode_plus``, padded/truncated to 512 tokens.
    ``sample[2]`` is the label and is cast to ``int``.
    (``sample[1]`` is presumably the theme/title text, judging by the
    original local name -- confirm against the caller.)

    Args:
        data: Sequence of indexable samples ``(sample[0], sample[1],
            sample[2])`` as described above.

    Returns:
        Tuple ``(input_ids, attention_masks, labels)`` of three lists with
        one entry per sample: the tokenizer's ``"input_ids"`` value, its
        ``"attention_mask"`` value, and the integer label.
    """
    input_ids = []
    attention_masks = []
    print("total sample szie {}".format(len(data)))
    for cnt, sample in enumerate(data, start=1):
        # Slicing already clamps to the string length, so no min() needed.
        text = sample[1][:200] + sample[0]
        # padding="max_length" is the replacement for the deprecated
        # pad_to_max_length=True and pads to max_length in the same way.
        encoded_dict = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=512,
            padding="max_length",
            return_attention_mask=True,
            return_tensors="pt",
            truncation=True,
        )
        input_ids.append(encoded_dict["input_ids"])
        attention_masks.append(encoded_dict["attention_mask"])
        # Progress report every 1000 encoded samples.
        if cnt % 1000 == 0:
            print("encoding process {}".format(cnt))
    # Cast to int so the downstream label tensor is integer-typed.
    labels = [int(x[2]) for x in data]
    return input_ids, attention_masks, labels
CUDA 和 GeForce RTX 3090 版本问题
RTX3090 需要 cuda 11.x 的runtime api,10.x 的runtime api 一直不能使用
image.png
网友评论