CBOW

Author: 王小鸟_wpcool | Published 2018-01-17 18:01

CBOW (continuous bag of words) is one of the two word2vec training architectures: it predicts a center word from the average of the one-hot vectors of its surrounding context words, and the trained hidden-layer weights serve as the word embeddings. The toy example below shows the kind of (context, target) pairs it learns from.
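As an illustration (not from the original post), with window_size = 2 the sentence "the quick brown fox" yields the following (context, target) training pairs; each network input is the average of the context words' one-hot vectors:

    # (context words, target word) pairs for window_size = 2:
    #   (['quick', 'brown'],       'the')
    #   (['the', 'brown', 'fox'],  'quick')
    #   (['the', 'quick', 'fox'],  'brown')
    #   (['quick', 'brown'],       'fox')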

    import tensorflow as tf
    import numpy as np
    from sklearn.manifold import TSNE
    import matplotlib.pyplot as plt
    
    
    
    
    def one_hot(ind, vocab_size):
        # Return a length-vocab_size one-hot vector with a 1 at index ind.
        rec = np.zeros(vocab_size)
        rec[ind] = 1
        return rec
    
    
    def create_training_data(corpus_raw, window_size=2):
        # Build the vocabulary from the distinct words in the corpus.
        words_list = []
        for sent in corpus_raw.split('.'):
            for w in sent.split():
                words_list.append(w)
        words_list = set(words_list)
    
        word2ind = {}
        ind2word = {}
    
        vocab_size = len(words_list)
    
        for i, w in enumerate(words_list):
            word2ind[w] = i
            ind2word[i] = w
    
        sentences = [sent.split() for sent in corpus_raw.split('.')]
    
        # For every word, collect its neighbors within the window as the
        # context: one (context, target) record per word, not per neighbor.
        data_recs = []
        for sent in sentences:
            for ind, w in enumerate(sent):
                rec = []
                for nb_w in sent[max(ind - window_size, 0): min(ind + window_size + 1, len(sent))]:
                    if nb_w != w:
                        rec.append(nb_w)
                data_recs.append([rec, w])
    
        # Each training input is the average of the one-hot vectors of all
        # context words; the label is the one-hot vector of the target word.
        x_train = []
        y_train = []
        for rec in data_recs:
            input_ = np.zeros(vocab_size)
            for i in range(len(rec[0])):
                input_ += one_hot(word2ind[rec[0][i]], vocab_size)
            input_ = input_ / len(rec[0])
            x_train.append(input_)
            y_train.append(one_hot(word2ind[rec[1]], vocab_size))
    
        return x_train, y_train, word2ind, ind2word, vocab_size
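A quick sanity check of create_training_data on a toy corpus (an illustrative sketch, not part of the original post):

    # Hypothetical sanity check on a five-word corpus.
    tx, ty, tw2i, ti2w, tv = create_training_data("the quick brown fox jumps.", 2)
    print(tv)                            # 5 distinct words
    print(tx[0].shape)                   # (5,) averaged one-hot context vector
    print(ti2w[int(np.argmax(ty[0]))])   # target word of the first record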
    
    
    
    
    corpus_raw = "Deep Learning has evolved from Artificial Neural Networks, which has been\
     there since the 1940s. Neural Networks are interconnected networks of processing units\
     called artificial neurons that loosely mimic axons in a biological brain. In a biological\
     neuron, the dendrites receive input signals from various neighboring neurons, typically\
     greater than 1000. These modified signals are then passed on to the cell body or soma of\
     the neuron, where these signals are summed together and then passed on to the axon of the\
     neuron. If the received input signal is more than a specified threshold, the axon will\
     release a signal which again will pass on to neighboring dendrites of other neurons. Figure\
     2-1 depicts the structure of a biological neuron for reference. The artificial neuron units\
     are inspired by the biological neurons with some modifications as per convenience. Much\
     like the dendrites, the input connections to the neuron carry the attenuated or amplified\
     input signals from other neighboring neurons. The signals are passed on to the neuron, where\
     the input signals are summed up and then a decision is taken what to output based on the\
     total input received. For instance, for a binary threshold neuron an output value of 1 is\
     provided when the total input exceeds a pre-defined threshold; otherwise, the output stays\
     at 0. Several other types of neurons are used in artificial neural networks, and their\
     implementation only differs with respect to the activation function on the total input to\
     produce the neuron output. In Figure 2-2 the different biological equivalents are tagged in\
     the artificial neuron for easy analogy and interpretation."
    
    
    corpus_raw = corpus_raw.lower()
    x_train, y_train, word2ind, ind2word, vocab_size = create_training_data(corpus_raw, 2)
    
    emb_dim = 128
    learning_rate = 0.001
    
    x = tf.placeholder(tf.float32, shape=[None, vocab_size])
    y = tf.placeholder(tf.float32, shape=[None, vocab_size])
    W = tf.Variable(tf.random_normal(shape=[vocab_size, emb_dim], mean=0.0, stddev=0.02, dtype=tf.float32))
    b = tf.Variable(tf.random_normal(shape=[emb_dim], mean=0.0, stddev=0.02, dtype=tf.float32))
    W_outer = tf.Variable(tf.random_normal(shape=[emb_dim, vocab_size], mean=0.0, stddev=0.02, dtype=tf.float32))
    b_outer = tf.Variable(tf.random_normal(shape=[vocab_size], mean=0.0, stddev=0.02, dtype=tf.float32))
    
    hidden = tf.add(tf.matmul(x, W), b)
    print(hidden.shape)
    logits = tf.add(tf.matmul(hidden, W_outer), b_outer)
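    # Shape walk-through (V = vocab_size, B = batch size), added for clarity:
    #   x:      [B, V]    averaged one-hot context vectors
    #   W:      [V, 128]  input embedding matrix (the embeddings we keep)
    #   hidden: [B, 128]  projected context
    #   logits: [B, V]    unnormalized scores over the vocabulary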
    
    cost = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits=logits, labels=y))
    optimizer = tf.train.AdamOptimizer(learning_rate).minimize(cost)
    
    
    epochs, batch_size = 100, 10
    n_batches = len(x_train) // batch_size
    init = tf.global_variables_initializer()
    with tf.Session() as sess:
        sess.run(init)
        for epoch in range(epochs):
            batch_index = 0
            for batch_num in range(n_batches):
                x_batch = x_train[batch_index: batch_index + batch_size]
                y_batch = y_train[batch_index: batch_index + batch_size]
                sess.run(optimizer, feed_dict={x: x_batch, y: y_batch})
                batch_index += batch_size  # advance to the next mini-batch
            print("loss:", sess.run(cost, feed_dict={x: x_batch, y: y_batch}))
        # Keep the trained input-embedding matrix; each row is a word vector.
        W_embed_trained = sess.run(W)
    
    
    # t-SNE reduces each 128-dimensional embedding vector to 2 dimensions
    # so the whole vocabulary can be plotted in the plane.
    W_embedded = TSNE(n_components=2).fit_transform(W_embed_trained)
    plt.figure(figsize=(10, 10))
    for i in range(len(W_embedded)):
        plt.text(W_embedded[i, 0], W_embedded[i, 1], ind2word[i])
    plt.xlim(-150, 150)
    plt.ylim(-150, 150)
    plt.show()
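With the trained embeddings in hand, another simple qualitative check (a sketch added here; cosine similarity is a common choice, not part of the original post) is to list the nearest neighbors of a word:

    # Hypothetical helper: k nearest words by cosine similarity in embedding space.
    def nearest(word, k=5):
        v = W_embed_trained[word2ind[word]]
        sims = W_embed_trained.dot(v) / (
            np.linalg.norm(W_embed_trained, axis=1) * np.linalg.norm(v) + 1e-8)
        best = np.argsort(-sims)[1:k + 1]  # index 0 is the query word itself
        return [ind2word[i] for i in best]

    print(nearest('neurons'))  # assumes 'neurons' survived tokenization intact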
    
    
    
