GPT图解 (GPT Illustrated): Code Notes - Word2Vec CBOW Model

Author: 万州客 | Published 2024-01-02 12:37

1. Code

sentences = [
    "Kage is Teacher",
    "Mazong is Boss",
    "Niuzong is Boss",
    "Xiaobing is Student",
    "Xiaoxue is Student"
]
words = ' '.join(sentences).split()   # flatten the corpus into a flat word list
word_list = list(set(words))          # deduplicate to build the vocabulary
word_to_idx = {word: idx for idx, word in enumerate(word_list)}
idx_to_word = {idx: word for word, idx in word_to_idx.items()}
voc_size = len(word_list)             # vocabulary size

print("词汇表:", word_list)
print("词汇到索引的字典: ", word_to_idx)
print("索引到词汇的字典: ", idx_to_word)
print("词汇表大小: ", voc_size)

def create_cbow_dataset(sentences, window_size=2):
    data = []
    for sentence in sentences:
        sentence = sentence.split()
        for idx, word in enumerate(sentence):
            # context = up to window_size words on each side of the target word
            context_words = sentence[max(idx-window_size, 0):idx] \
                            + sentence[idx+1:min(idx+window_size+1, len(sentence))]
            data.append((word, context_words))  # (target, context) pair
    return data

cbow_data = create_cbow_dataset(sentences)
print("CBOW数据样例(未编码): ", cbow_data)

import torch
def one_hot_encoding(word, word_to_idx):
    # build a one-hot vector of length voc_size with a 1 at the word's index
    tensor = torch.zeros(len(word_to_idx))
    tensor[word_to_idx[word]] = 1
    return tensor
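
As a quick usage check (my addition), encoding any vocabulary word gives a 9-dimensional vector containing a single 1; where the 1 lands depends on that run's vocabulary ordering:

print(one_hot_encoding("Kage", word_to_idx))        # a 9-dim tensor with a single 1
print(one_hot_encoding("Kage", word_to_idx).shape)  # torch.Size([9])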


import torch.nn as nn
class CBOW(nn.Module):
    def __init__(self, voc_size, embedding_size):
        super(CBOW, self).__init__()
        # voc_size -> embedding_size: the columns of this weight matrix are the word embeddings
        self.input_to_hidden = nn.Linear(voc_size, embedding_size, bias=False)
        # embedding_size -> voc_size: scores over the vocabulary
        self.hidden_to_output = nn.Linear(embedding_size, voc_size, bias=False)
    def forward(self, X):
        # X: (num_context_words, voc_size) one-hot rows
        embeddings = self.input_to_hidden(X)          # (num_context_words, embedding_size)
        hidden_layer = torch.mean(embeddings, dim=0)  # average the context embeddings
        output_layer = self.hidden_to_output(hidden_layer.unsqueeze(0))  # (1, voc_size) logits
        return output_layer

embedding_size = 2
cbow_model = CBOW(voc_size, embedding_size)
print("CBOW模型:", cbow_model)


learning_rate = 0.001
epochs = 1000
criterion = nn.CrossEntropyLoss()  # applies log-softmax + NLL loss to the raw logits
import torch.optim as optim
optimizer = optim.SGD(cbow_model.parameters(), lr=learning_rate)
loss_values = []
for epoch in range(epochs):
    loss_sum = 0
    for target, context_words in cbow_data:
        # stack the one-hot context vectors into a (num_context_words, voc_size) matrix
        X = torch.stack([one_hot_encoding(word, word_to_idx) for word in context_words]).float()
        y_true = torch.tensor([word_to_idx[target]], dtype=torch.long)
        y_pred = cbow_model(X)
        loss = criterion(y_pred, y_true)
        loss_sum += loss.item()
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
    if (epoch+1) % 100 == 0:
        print(f"Epoch: {epoch+1}, Loss: {loss_sum/len(cbow_data)}")
        loss_values.append(loss_sum/len(cbow_data))
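
Once training converges, a quick sanity check (the helper below is my own hypothetical addition, not from the article) is to feed a context and take the argmax of the logits:

# hypothetical helper: predict the target word from a list of context words
def predict_target(context_words):
    X = torch.stack([one_hot_encoding(w, word_to_idx) for w in context_words]).float()
    with torch.no_grad():
        logits = cbow_model(X)  # (1, voc_size)
    return idx_to_word[logits.argmax(dim=1).item()]

print(predict_target(["Kage", "Teacher"]))  # ideally prints "is"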

import matplotlib.pyplot as plt
plt.plot(range(1, epochs//100 + 1), loss_values)
plt.title('Training Loss Curve')
plt.xlabel('Epoch (x100)')
plt.ylabel('Loss')
plt.show()

print("CBOW词嵌入:")
for word, idx in word_to_idx.items():
    print(f"{word}: {cbow_model.input_to_hidden.weight[:, idx].detach().numpy()}")

fig, ax = plt.subplots()
for word, idx in word_to_idx.items():
    vec = cbow_model.input_to_hidden.weight[:, idx].detach().numpy()
    ax.scatter(vec[0], vec[1])
    ax.annotate(word, (vec[0], vec[1]), fontsize=12)

plt.title('2D Word Embeddings')
plt.xlabel('Vector dimension 1')
plt.ylabel('Vector dimension 2')
plt.show()
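
Beyond eyeballing the scatter plot, similarity can be quantified. A minimal sketch (my addition, not the article's) using cosine similarity, under which words with the same role, such as the two Boss words, ought to score relatively high after successful training:

import torch.nn.functional as F

def word_vec(word):
    # hypothetical helper (not in the article): column idx of the input weights = word's embedding
    return cbow_model.input_to_hidden.weight[:, word_to_idx[word]].detach()

# cosine similarity near 1 means the two vectors point in nearly the same direction
print(F.cosine_similarity(word_vec("Mazong").unsqueeze(0), word_vec("Niuzong").unsqueeze(0)))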

2. Screenshot

(Placeholder: image.png, the original screenshot of the run output.)
