# reference:
# https://github.com/aditya-grover/node2vec/blob/master/src/node2vec.py
import numpy as np
import networkx as nx
import random
class Graph():
def __init__(self, nx_G, is_directed, p, q):
self.G = nx_G
self.is_directed = is_directed
self.p = p
self.q = q
def node2vec_walk(self, walk_length, start_node):
# Simulate a random walk starting from start node.
G = self.G
alias_nodes = self.alias_nodes # 用于首次转移
alias_edges = self.alias_edges # 用于第二次及以后转移
# alias_nodes形式为{1:(J,q),2:(J,q)..} 1和2表示节点id key值是一个元素 即一个节点
# alias_edges形式为{(1,2):(J,q),(2,1):(J,q),(1,3):(J,q)...}
# (1,2)代表上一个节点是1,当前节点为2.也就是预测下一步转移需要前两步的节点id,key值是一个元组
walk = [start_node] # 游走出来的路径 用路径上的节点来表示
while len(walk) < walk_length:
cur = walk[-1] # current节点是节点list的最后一个
cur_nbrs = sorted(G.neighbors(cur)) # sorted 对list进行重新排序 默认升序 返回升序后的list
if len(cur_nbrs) > 0: # 如果存在邻居节点:
if len(walk) == 1: # 如果序列中仅仅有一个节点 即第一次开始游走
# !!!第一次游走的时候只需要根据概率选择转移节点即可
walk.append(cur_nbrs[alias_draw(alias_nodes[cur][0], alias_nodes[cur][1])])
# alias_draw返回采样的节点索引(或者类似的意思)
else:
# !!!但是第二次转移的时候就需要考虑前两个节点 也就是需要跳转概率 即get_alias_edge里面写的
prev = walk[-2] # 当前节点的前一个节点
next = cur_nbrs[alias_draw(alias_edges[(prev, cur)][0],
alias_edges[(prev, cur)][1])]
walk.append(next)
else:
break #退出循环
return walk
def simulate_walks(self, num_walks, walk_length):
# Repeatedly simulate random walks from each node.
# 对每个节点,都得到其num_walks条随机游走路径
G = self.G
walks = []
nodes = list(G.nodes())
print('Walk iteration:')
for walk_iter in range(num_walks):
print(str(walk_iter+1), '/', str(num_walks))
random.shuffle(nodes)
for node in nodes:
walks.append(self.node2vec_walk(walk_length=walk_length, start_node=node))
return walks
#Sampling a new node in the walk can be efficiently done in O(1) time using alias sampling
def get_alias_edge(self, src, dst):
# src 上一个节点
# dst 当前节点
# Get the alias edge setup lists for a given edge.
# 新的转移概率计算方法:
# 正常加入当前节点dst转移到其他节点[1,2,3,4]的概率分布为[a,b,c,d]
# 那么这时需要考虑上一个节点src
# (1)若第一个节点1正好就是src,则新的转移概率为a×(1/p)
# (2)若第二、三个节点2 3 也是src的邻居,则新的转移概率不变 仍分别为b c
# (3)若第四个节点4和src没有连接,则新的转移概率为d×(1/q)
G = self.G
p = self.p
q = self.q
unnormalized_probs = []
for dst_nbr in sorted(G.neighbors(dst)):
if dst_nbr == src:
unnormalized_probs.append(1/p) # 1 modified
elif G.has_edge(dst_nbr, src):
unnormalized_probs.append(1) # 1 modified
else:
unnormalized_probs.append(1/q) #1 modified
norm_const = sum(unnormalized_probs)
normalized_probs = [float(u_prob)/norm_const for u_prob in unnormalized_probs]
return alias_setup(normalized_probs)
def preprocess_transition_probs(self):
# 用来为随机游走准备马尔科夫转移矩阵
# 用来得到输出alias_nodes和alias_edges
# Preprocessing of transition probabilities for guiding the random walks.
G = self.G
is_directed = self.is_directed
# step1 这个函数先返回alias_nodes 存储每个节点对应的两个采样list
alias_nodes = {}
for node in G.nodes():
# 没有归一化的邻居节点的概率分布
unnormalized_probs = [1 for nbr in sorted(G.neighbors(node))] #1 modified # 认为有链接的权重都是1
# 归一化常数
norm_const = sum(unnormalized_probs)
# 对概率进行归一化
normalized_probs = [float(u_prob)/norm_const for u_prob in unnormalized_probs]
# 将一个node的经过normalize之后的权重送到alias_setup进行处理 返回每个节点的J,q向量 作为采样时候的输入
alias_nodes[node] = alias_setup(normalized_probs)
alias_edges = {}
triads = {}
if is_directed:
for edge in G.edges():
alias_edges[edge] = self.get_alias_edge(edge[0], edge[1])
else:
for edge in G.edges():
alias_edges[edge] = self.get_alias_edge(edge[0], edge[1])
alias_edges[(edge[1], edge[0])] = self.get_alias_edge(edge[1], edge[0])
# 若是无向图 则是对称的alias_edges
self.alias_nodes = alias_nodes
self.alias_edges = alias_edges
return
# 以下都是alias采样
def alias_setup(probs):
# alias sampling 离散分布随机采样 算法复杂度为O(1)
# 这是setup
# Compute utility lists for non-uniform sampling from discrete distributions.
# Refer to https://hips.seas.harvard.edu/blog/2013/03/03/the-alias-method-efficient-sampling-with-many-discrete-outcomes/
# for details
# input:某个概率分布probs
# output:Alias数组和Prob数组
K = len(probs) # 类别数
q = np.zeros(K) # 对应prob数组
J = np.zeros(K, dtype=np.int) # 对应alias数组
smaller = [] #存储扩充之后比1小的列
larger = [] #存储扩充之后比1大的列
for kk, prob in enumerate(probs):
q[kk] = K*prob
if q[kk] < 1.0:
smaller.append(kk)
else:
larger.append(kk)
#拼凑 将各个类别都凑为1
while len(smaller) > 0 and len(larger) > 0:
small = smaller.pop()
large = larger.pop()
J[small] = large
q[large] = q[large] + q[small] - 1.0 #将大的分到小的上
if q[large] < 1.0:
smaller.append(large)
else:
larger.append(large)
return J, q
def alias_draw(J, q):
# Draw sample from a non-uniform discrete distribution using alias sampling.
K = len(J)
kk = int(np.floor(np.random.rand()*K)) # 随机取一列
if np.random.rand() < q[kk]: # 再进行一次随机数 如果随机数小于当前这一列的prob 则返回这一列的采样索引,否则返回第二层
return kk
else:
return J[kk]
网友评论