一、代码
# Toy corpus: every sentence has the form "<name> is <role>".
sentences = [
    "Kage is Teacher",
    "Mazong is Boss",
    "Niuzong is Boss",
    "Xiaobing is Student",
    "Xiaoxue is Student",
]

# Flatten the corpus into one list of whitespace-separated tokens.
words = " ".join(sentences).split()

# Sort the unique tokens so that the vocabulary order -- and therefore every
# word index used downstream -- is reproducible from run to run.  Iterating a
# bare set of strings can yield a different order on each interpreter start.
word_list = sorted(set(words))

# Forward and reverse lookup tables.  Both are derived from word_list so they
# are guaranteed to be exact inverses of each other.
word_to_idx = {word: idx for idx, word in enumerate(word_list)}
idx_to_word = dict(enumerate(word_list))
voc_size = len(word_list)

print("词汇表:", word_list)
print("词汇到索引的字典: ", word_to_idx)
print("索引到词汇的字典: ", idx_to_word)
print("词汇表大小: ", voc_size)
def create_cbow_dataset(sentences, window_size=2):
    """Build (target, context) training pairs for a CBOW model.

    Args:
        sentences: Iterable of whitespace-delimited sentence strings.
        window_size: Maximum number of tokens taken from each side of the
            target word.

    Returns:
        A list with one ``(target_word, context_words)`` tuple per token of
        every sentence, in corpus order.  ``context_words`` is a list holding
        the tokens before the target followed by the tokens after it; it is
        shorter near sentence boundaries and empty for a one-token sentence.
    """
    data = []
    for sentence in sentences:
        tokens = sentence.split()
        for idx, word in enumerate(tokens):
            # Clamp the left edge at 0: a negative start would wrap around
            # and pull tokens from the end of the sentence.
            left = tokens[max(idx - window_size, 0):idx]
            # No clamp needed on the right: slicing past the end is safe.
            right = tokens[idx + 1:idx + window_size + 1]
            data.append((word, left + right))
    return data
# Build the (target, context) pairs for the whole corpus with the default
# window size, and show them while they are still plain strings (before any
# one-hot encoding is applied).
cbow_data = create_cbow_dataset(sentences)
print("CBOW数据样例(未编码): ", cbow_data)
import torch
def one_hot_encoding(word, word_to_idx):
    """Return the one-hot vector for ``word``.

    Args:
        word: Token to encode; must be a key of ``word_to_idx``.
        word_to_idx: Mapping from token to its integer index.

    Returns:
        A 1-D tensor of length ``len(word_to_idx)`` that is zero everywhere
        except for a 1 at position ``word_to_idx[word]``.

    Raises:
        KeyError: If ``word`` is not present in ``word_to_idx``.
    """
    tensor = torch.zeros(len(word_to_idx))
    tensor[word_to_idx[word]] = 1
    return tensor
import torch.nn as nn
class CBOW(nn.Module):
    """Continuous bag-of-words model made of two bias-free linear layers.

    The first layer projects each input vector into the embedding space; the
    projected vectors are averaged and the second layer maps that average
    back to one score per input dimension.
    """

    def __init__(self, vec_size, embeding_size):
        """Create the two projection layers.

        Args:
            vec_size: Width of each input vector and of the output scores.
            embeding_size: Width of the hidden (embedding) layer.
        """
        super().__init__()
        self.input_to_hidden = nn.Linear(vec_size, embeding_size, bias=False)
        self.hidden_to_output = nn.Linear(embeding_size, vec_size, bias=False)

    def forward(self, X):
        """Score the target word from a stack of context vectors.

        Args:
            X: Tensor whose rows are the context vectors; for a 2-D input of
                shape ``(num_context, vec_size)`` the result has shape
                ``(1, vec_size)``.

        Returns:
            Unnormalised scores (logits) produced by the output layer.
        """
        embeddings = self.input_to_hidden(X)
        # Collapse the context rows into a single averaged embedding.
        hidden_layer = torch.mean(embeddings, dim=0)
        # Re-add a leading dimension of size 1 so the output is one row of
        # scores, matching the single-element target tensor used for the loss.
        output_layer = self.hidden_to_output(hidden_layer.unsqueeze(0))
        return output_layer
# Optimisation settings for the training run.
leaning_rate = 0.001
epochs = 1000
criterion = nn.CrossEntropyLoss()

# Two embedding dimensions, so each word vector can be drawn directly as a
# point on a 2-D scatter plot.
embedding_size = 2
cbow_model = CBOW(voc_size, embedding_size)
print("CBOW模型:", cbow_model)
import torch.optim as optim
# Plain stochastic gradient descent over every trainable model parameter.
optimizer = optim.SGD(cbow_model.parameters(), lr=leaning_rate)
loss_values = []  # mean loss, recorded once every 100 epochs

# The encoded inputs and targets are the same in every epoch, so build them
# once up front instead of re-encoding each sample on every pass.
encoded_samples = [
    (
        torch.stack(
            [one_hot_encoding(word, word_to_idx) for word in context_words]
        ).float(),
        torch.tensor([word_to_idx[target]], dtype=torch.long),
    )
    for target, context_words in cbow_data
]

for epoch in range(epochs):
    loss_sum = 0
    for X, y_true in encoded_samples:
        y_pred = cbow_model(X)
        loss = criterion(y_pred, y_true)
        loss_sum += loss.item()
        # One parameter update per (target, context) sample.
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
    if (epoch + 1) % 100 == 0:
        mean_loss = loss_sum / len(cbow_data)
        print(f"Epoch: {epoch+1}, Loss: {mean_loss}")
        loss_values.append(mean_loss)
import matplotlib.pyplot as plt
# Select a font that contains CJK glyphs so the Chinese labels render, and
# keep the minus sign as a plain hyphen, which that font can draw.
plt.rcParams["font.sans-serif"] = ['SimHei']
plt.rcParams["axes.unicode_minus"] = False
# One loss value is stored every 100 epochs, so plot against the real epoch
# numbers (100, 200, ...) to make the x-axis agree with its "epoch" label.
plt.plot(range(100, epochs + 1, 100), loss_values)
plt.title('训练损失曲线')
plt.xlabel('轮次')
plt.ylabel('损失')
plt.show()
print("CBOW词嵌入:")
# nn.Linear stores its weight as (out_features, in_features), so column idx
# of the input layer's weight is the embedding of the word with index idx.
# Detach once and reuse the array for every word.
embedding_matrix = cbow_model.input_to_hidden.weight.detach().numpy()
for word, idx in word_to_idx.items():
    print(f"{word}: {embedding_matrix[:, idx]}")
# Draw every word as a labelled point, using the first two embedding
# components as its x/y coordinates.
fig, ax = plt.subplots()
# Column idx of the input layer's weight is the embedding of word idx;
# detach once and reuse the array for every word.
embedding_matrix = cbow_model.input_to_hidden.weight.detach().numpy()
for word, idx in word_to_idx.items():
    vec = embedding_matrix[:, idx]
    ax.scatter(vec[0], vec[1])
    ax.annotate(word, (vec[0], vec[1]), fontsize=12)
plt.title('二维词嵌入')
plt.xlabel('向量维度1')
plt.ylabel('向量维度2')
plt.show()
二、截图

网友评论