node2vec

作者: heishanlaoniu | 来源:发表于2021-05-19 11:17 被阅读0次
    
    # reference:
    # https://github.com/aditya-grover/node2vec/blob/master/src/node2vec.py
    
    import numpy as np
    import networkx as nx
    import random
    
    
    class Graph():
        def __init__(self, nx_G, is_directed, p, q):
            self.G = nx_G
            self.is_directed = is_directed
            self.p = p
            self.q = q
    
        def node2vec_walk(self, walk_length, start_node):
    
            # Simulate a random walk starting from start node.
    
            G = self.G
            alias_nodes = self.alias_nodes   # 用于首次转移
            alias_edges = self.alias_edges   # 用于第二次及以后转移
            # alias_nodes形式为{1:(J,q),2:(J,q)..} 1和2表示节点id key值是一个元素 即一个节点
            # alias_edges形式为{(1,2):(J,q),(2,1):(J,q),(1,3):(J,q)...}
            # (1,2)代表上一个节点是1,当前节点为2.也就是预测下一步转移需要前两步的节点id,key值是一个元组
    
            walk = [start_node]   # 游走出来的路径 用路径上的节点来表示
    
            while len(walk) < walk_length:
                cur = walk[-1]    # current节点是节点list的最后一个
                cur_nbrs = sorted(G.neighbors(cur))   # sorted 对list进行重新排序 默认升序 返回升序后的list
                if len(cur_nbrs) > 0:   # 如果存在邻居节点:
                    if len(walk) == 1:   # 如果序列中仅仅有一个节点 即第一次开始游走
                    # !!!第一次游走的时候只需要根据概率选择转移节点即可
                        walk.append(cur_nbrs[alias_draw(alias_nodes[cur][0], alias_nodes[cur][1])])
                        # alias_draw返回采样的节点索引(或者类似的意思)
                    else:
                        # !!!但是第二次转移的时候就需要考虑前两个节点 也就是需要跳转概率 即get_alias_edge里面写的
                        prev = walk[-2] # 当前节点的前一个节点
                        next = cur_nbrs[alias_draw(alias_edges[(prev, cur)][0],
                            alias_edges[(prev, cur)][1])]
                        walk.append(next)
                else:
                    break   #退出循环
    
            return walk
    
        def simulate_walks(self, num_walks, walk_length):
    
            # Repeatedly simulate random walks from each node.
            # 对每个节点,都得到其num_walks条随机游走路径
    
            G = self.G
            walks = []
            nodes = list(G.nodes())
            print('Walk iteration:')
            for walk_iter in range(num_walks):
                print(str(walk_iter+1), '/', str(num_walks))
                random.shuffle(nodes)
                for node in nodes:
                    walks.append(self.node2vec_walk(walk_length=walk_length, start_node=node))
    
            return walks
    
        #Sampling a new node in the walk can be efficiently done in O(1) time using alias sampling
        def get_alias_edge(self, src, dst):
    
            # src 上一个节点
            # dst 当前节点
            # Get the alias edge setup lists for a given edge.
            # 新的转移概率计算方法:
            # 正常加入当前节点dst转移到其他节点[1,2,3,4]的概率分布为[a,b,c,d]
            # 那么这时需要考虑上一个节点src
            # (1)若第一个节点1正好就是src,则新的转移概率为a×(1/p)
            # (2)若第二、三个节点2 3 也是src的邻居,则新的转移概率不变 仍分别为b c
            # (3)若第四个节点4和src没有连接,则新的转移概率为d×(1/q)
            
            G = self.G
            p = self.p
            q = self.q
    
            unnormalized_probs = []
            for dst_nbr in sorted(G.neighbors(dst)):
                if dst_nbr == src:
                    unnormalized_probs.append(1/p) # 1 modified
                elif G.has_edge(dst_nbr, src):
                    unnormalized_probs.append(1) # 1 modified
                else:
                    unnormalized_probs.append(1/q)  #1 modified
            norm_const = sum(unnormalized_probs)
            normalized_probs =  [float(u_prob)/norm_const for u_prob in unnormalized_probs]
    
            return alias_setup(normalized_probs)
    
        def preprocess_transition_probs(self):
    
            # 用来为随机游走准备马尔科夫转移矩阵
            # 用来得到输出alias_nodes和alias_edges
            # Preprocessing of transition probabilities for guiding the random walks.
    
            G = self.G
            is_directed = self.is_directed
    
            # step1 这个函数先返回alias_nodes 存储每个节点对应的两个采样list
            alias_nodes = {}
            for node in G.nodes():
                # 没有归一化的邻居节点的概率分布
                unnormalized_probs = [1 for nbr in sorted(G.neighbors(node))] #1 modified   # 认为有链接的权重都是1
                # 归一化常数
                norm_const = sum(unnormalized_probs)
                # 对概率进行归一化
                normalized_probs =  [float(u_prob)/norm_const for u_prob in unnormalized_probs]
                # 将一个node的经过normalize之后的权重送到alias_setup进行处理 返回每个节点的J,q向量 作为采样时候的输入
                alias_nodes[node] = alias_setup(normalized_probs)
    
            alias_edges = {}
            triads = {}
    
            if is_directed:
                for edge in G.edges():
                    alias_edges[edge] = self.get_alias_edge(edge[0], edge[1])
            else:
                for edge in G.edges():
                    alias_edges[edge] = self.get_alias_edge(edge[0], edge[1])
                    alias_edges[(edge[1], edge[0])] = self.get_alias_edge(edge[1], edge[0])
                    # 若是无向图 则是对称的alias_edges
    
            self.alias_nodes = alias_nodes
            self.alias_edges = alias_edges
    
            return
    
    # 以下都是alias采样
    def alias_setup(probs):
    
        # alias sampling 离散分布随机采样 算法复杂度为O(1)
        # 这是setup
        # Compute utility lists for non-uniform sampling from discrete distributions.
        # Refer to https://hips.seas.harvard.edu/blog/2013/03/03/the-alias-method-efficient-sampling-with-many-discrete-outcomes/
        # for details
        # input:某个概率分布probs
        # output:Alias数组和Prob数组
    
        K = len(probs)   # 类别数
        q = np.zeros(K)   # 对应prob数组
        J = np.zeros(K, dtype=np.int)   # 对应alias数组
    
        smaller = []   #存储扩充之后比1小的列
        larger = []    #存储扩充之后比1大的列
        for kk, prob in enumerate(probs):
            q[kk] = K*prob
            if q[kk] < 1.0:
                smaller.append(kk)
            else:
                larger.append(kk)
    
        #拼凑 将各个类别都凑为1
        while len(smaller) > 0 and len(larger) > 0:
            small = smaller.pop()
            large = larger.pop()
    
            J[small] = large
            q[large] = q[large] + q[small] - 1.0  #将大的分到小的上
            if q[large] < 1.0:
                smaller.append(large)
            else:
                larger.append(large)
    
        return J, q
    
    def alias_draw(J, q):
    
    
        # Draw sample from a non-uniform discrete distribution using alias sampling.
    
        K = len(J)
    
        kk = int(np.floor(np.random.rand()*K))   # 随机取一列
        if np.random.rand() < q[kk]:   # 再进行一次随机数 如果随机数小于当前这一列的prob 则返回这一列的采样索引,否则返回第二层
            return kk
        else:
            return J[kk]
    
    

    相关文章

      网友评论

          本文标题:node2vec

          本文链接:https://www.haomeiwen.com/subject/rfqpjltx.html