一、Python版
#!/usr/bin/python
# coding=utf-8
classsimhash:
#构造函数
def__init__(self, tokens='', hashbits=128):
self.hashbits = hashbits
self.hash =self.simhash(tokens);
#toString函数
def__str__(self):
returnstr(self.hash)
#生成simhash值
defsimhash(self, tokens):
v = [0] *self.hashbits
fortin[self._string_hash(x)forxintokens]:#t为token的普通hash值
foriinrange(self.hashbits):
bitmask =1<< i
ift & bitmask :
v[i] +=1#查看当前bit位是否为1,是的话将该位+1
else:
v[i] -=1#否则的话,该位-1
fingerprint =0
foriinrange(self.hashbits):
ifv[i] >=0:
fingerprint +=1<< i
returnfingerprint#整个文档的fingerprint为最终各个位>=0的和
#求海明距离
defhamming_distance(self, other):
x = (self.hash ^ other.hash) & ((1<
tot =0;
whilex :
tot +=1
x &= x -1
returntot
#求相似度
defsimilarity (self, other):
a = float(self.hash)
b = float(other.hash)
ifa > b :returnb / a
else:returna / b
#针对source生成hash值 (一个可变长度版本的Python的内置散列)
def_string_hash(self, source):
ifsource == "":
return0
else:
x = ord(source[0]) <<7
m =1000003
mask =2**self.hashbits -1
forcinsource:
x = ((x * m) ^ ord(c)) & mask
x ^= len(source)
ifx == -1:
x = -2
returnx
if__name__ =='__main__':
s ='This is a test string for testing'
hash1 = simhash(s.split())
s ='This is a test string for testing also'
hash2 = simhash(s.split())
s ='nai nai ge xiong cao'
hash3 = simhash(s.split())
print(hash1.hamming_distance(hash2) ," ", hash1.similarity(hash2))
print(hash1.hamming_distance(hash3) ," ", hash1.similarity(hash3))
二、Java版:
importjava.math.BigInteger;
importjava.util.StringTokenizer;
publicclassSimHash {
privateString tokens;
privateBigInteger strSimHash;
privateinthashbits =128;
publicSimHash(String tokens) {
this.tokens = tokens;
this.strSimHash =this.simHash();
}
publicSimHash(String tokens,inthashbits) {
this.tokens = tokens;
this.hashbits = hashbits;
this.strSimHash =this.simHash();
}
publicBigInteger simHash() {
int[] v =newint[this.hashbits];
StringTokenizer stringTokens =newStringTokenizer(this.tokens);
while(stringTokens.hasMoreTokens()) {
String temp = stringTokens.nextToken();
BigInteger t =this.hash(temp);
for(inti =0; i
BigInteger bitmask =newBigInteger("1").shiftLeft(i);
if(t.and(bitmask).signum() !=0) {
v[i] +=1;
}else{
v[i] -=1;
}
}
}
BigInteger fingerprint =newBigInteger("0");
for(inti =0; i
if(v[i] >=0) {
fingerprint = fingerprint.add(newBigInteger("1").shiftLeft(i));
}
}
returnfingerprint;
}
privateBigInteger hash(String source) {
if(source ==null|| source.length() ==0) {
returnnewBigInteger("0");
}else{
char[] sourceArray = source.toCharArray();
BigInteger x = BigInteger.valueOf(((long) sourceArray[0]) <<7);
BigInteger m =newBigInteger("1000003");
BigInteger mask =newBigInteger("2").pow(this.hashbits).subtract(
newBigInteger("1"));
for(charitem : sourceArray) {
BigInteger temp = BigInteger.valueOf((long) item);
x = x.multiply(m).xor(temp).and(mask);
}
x = x.xor(newBigInteger(String.valueOf(source.length())));
if(x.equals(newBigInteger("-1"))) {
x =newBigInteger("-2");
}
returnx;
}
}
publicinthammingDistance(SimHash other) {
BigInteger m =newBigInteger("1").shiftLeft(this.hashbits).subtract(
newBigInteger("1"));
BigInteger x =this.strSimHash.xor(other.strSimHash).and(m);
inttot =0;
while(x.signum() !=0) {
tot +=1;
x = x.and(x.subtract(newBigInteger("1")));
}
returntot;
}
publicstaticvoidmain(String[] args) {
String s ="This is a test string for testing";
SimHash hash1 =newSimHash(s,128);
System.out.println(hash1.strSimHash +" "+ hash1.strSimHash.bitLength());
s ="This is a test string for testing also";
SimHash hash2 =newSimHash(s,128);
System.out.println(hash2.strSimHash+" "+ hash2.strSimHash.bitCount());
s ="This is a test string for testing als";
SimHash hash3 =newSimHash(s,128);
System.out.println(hash3.strSimHash+" "+ hash3.strSimHash.bitCount());
System.out.println("============================");
System.out.println(hash1.hammingDistance(hash2));
System.out.println(hash1.hammingDistance(hash3));
}
}
结论:
python的计算能力确实很强,float可以表示任意长度的数字,而对应java、c++只能用其他办法来实现了,比如java的BigIneteger,对应的位操作也只能利用类方法。。。汗。。。
另外说明,位运算只适合整数哦。。。因为浮点的存储方案决定不能位运算,如果非要位运算,就需要Float.floatToIntBits,运算完,再通过Float.intBitsToFloat转化回去。(java默认的float,double的hashcode其实就是对应的floatToIntBits的int值)
java左移、右移: 移位运算符和气压的位运算符一样都是用来操作二进制位。
1)<< ,左移位:将操作符左侧的操作数向左移动操作数右侧指定的位数。移动的规则是在二进制的低位补0.
2)>> ,有符号右移位,将操作符左侧的操作数向右移动操作数右侧指定的位数。移动的规则是,如果被操作数的符号为正,则在二进制的高位补0;如果被操作数的符号为负,则在二进制的高位补1
3)>>> ,无符号右移位:将操作符左侧的操作数向右移动操作数右侧指定的位数。移动的对则是,无论被操作数的符号是正是负,都在二进制的高位补0.
网友评论