simhash算法实现（源码）

作者: 5f32fea06d2d | 来源:发表于2016-11-08 14:55 被阅读333次

一、Python版

#!/usr/bin/python

# coding=utf-8

classsimhash:

#构造函数

def__init__(self, tokens='', hashbits=128):

self.hashbits = hashbits

self.hash =self.simhash(tokens);

#toString函数

def__str__(self):

returnstr(self.hash)

#生成simhash值

defsimhash(self, tokens):

v = [0] *self.hashbits

fortin[self._string_hash(x)forxintokens]:#t为token的普通hash值

foriinrange(self.hashbits):

bitmask =1<< i

ift & bitmask :

v[i] +=1#查看当前bit位是否为1,是的话将该位+1

else:

v[i] -=1#否则的话,该位-1

fingerprint =0

foriinrange(self.hashbits):

ifv[i] >=0:

fingerprint +=1<< i

returnfingerprint#整个文档的fingerprint为最终各个位>=0的和

#求海明距离

defhamming_distance(self, other):

x = (self.hash ^ other.hash) & ((1<

tot =0;

whilex :

tot +=1

x &= x -1

returntot

#求相似度

defsimilarity (self, other):

a = float(self.hash)

b = float(other.hash)

ifa > b :returnb / a

else:returna / b

#针对source生成hash值 (一个可变长度版本的Python的内置散列)

def_string_hash(self, source):

ifsource == "":

return0

else:

x = ord(source[0]) <<7

m =1000003

mask =2**self.hashbits -1

forcinsource:

x = ((x * m) ^ ord(c)) & mask

x ^= len(source)

ifx == -1:

x = -2

returnx

if__name__ =='__main__':

s ='This is a test string for testing'

hash1 = simhash(s.split())

s ='This is a test string for testing also'

hash2 = simhash(s.split())

s ='nai nai ge xiong cao'

hash3 = simhash(s.split())

print(hash1.hamming_distance(hash2) ," ", hash1.similarity(hash2))

print(hash1.hamming_distance(hash3) ," ", hash1.similarity(hash3))

二、Java版：

importjava.math.BigInteger;

importjava.util.StringTokenizer;

publicclassSimHash {

privateString tokens;

privateBigInteger strSimHash;

privateinthashbits =128;

publicSimHash(String tokens) {

this.tokens = tokens;

this.strSimHash =this.simHash();

}

publicSimHash(String tokens,inthashbits) {

this.tokens = tokens;

this.hashbits = hashbits;

this.strSimHash =this.simHash();

}

publicBigInteger simHash() {

int[] v =newint[this.hashbits];

StringTokenizer stringTokens =newStringTokenizer(this.tokens);

while(stringTokens.hasMoreTokens()) {

String temp = stringTokens.nextToken();

BigInteger t =this.hash(temp);

for(inti =0; i

BigInteger bitmask =newBigInteger("1").shiftLeft(i);

if(t.and(bitmask).signum() !=0) {

v[i] +=1;

}else{

v[i] -=1;

}

BigInteger fingerprint =newBigInteger("0");

for(inti =0; i

if(v[i] >=0) {

fingerprint = fingerprint.add(newBigInteger("1").shiftLeft(i));

}

returnfingerprint;

}

privateBigInteger hash(String source) {

if(source ==null|| source.length() ==0) {

returnnewBigInteger("0");

}else{

char[] sourceArray = source.toCharArray();

BigInteger x = BigInteger.valueOf(((long) sourceArray[0]) <<7);

BigInteger m =newBigInteger("1000003");

BigInteger mask =newBigInteger("2").pow(this.hashbits).subtract(

newBigInteger("1"));

for(charitem : sourceArray) {

BigInteger temp = BigInteger.valueOf((long) item);

x = x.multiply(m).xor(temp).and(mask);

}

x = x.xor(newBigInteger(String.valueOf(source.length())));

if(x.equals(newBigInteger("-1"))) {

x =newBigInteger("-2");

}

returnx;

}

publicinthammingDistance(SimHash other) {

BigInteger m =newBigInteger("1").shiftLeft(this.hashbits).subtract(

newBigInteger("1"));

BigInteger x =this.strSimHash.xor(other.strSimHash).and(m);

inttot =0;

while(x.signum() !=0) {

tot +=1;

x = x.and(x.subtract(newBigInteger("1")));

}

returntot;

}

publicstaticvoidmain(String[] args) {

String s ="This is a test string for testing";

SimHash hash1 =newSimHash(s,128);

System.out.println(hash1.strSimHash +" "+ hash1.strSimHash.bitLength());

s ="This is a test string for testing also";

SimHash hash2 =newSimHash(s,128);

System.out.println(hash2.strSimHash+" "+ hash2.strSimHash.bitCount());

s ="This is a test string for testing als";

SimHash hash3 =newSimHash(s,128);

System.out.println(hash3.strSimHash+" "+ hash3.strSimHash.bitCount());

System.out.println("============================");

System.out.println(hash1.hammingDistance(hash2));

System.out.println(hash1.hammingDistance(hash3));

}

结论：

python的计算能力确实很强，float可以表示任意长度的数字，而对应java、c++只能用其他办法来实现了，比如java的BigIneteger,对应的位操作也只能利用类方法。。。汗。。。

另外说明，位运算只适合整数哦。。。因为浮点的存储方案决定不能位运算，如果非要位运算，就需要Float.floatToIntBits，运算完，再通过Float.intBitsToFloat转化回去。(java默认的float,double的hashcode其实就是对应的floatToIntBits的int值)

java左移、右移：移位运算符和气压的位运算符一样都是用来操作二进制位。

1）<< ,左移位:将操作符左侧的操作数向左移动操作数右侧指定的位数。移动的规则是在二进制的低位补0.

2）>> ,有符号右移位，将操作符左侧的操作数向右移动操作数右侧指定的位数。移动的规则是，如果被操作数的符号为正，则在二进制的高位补0；如果被操作数的符号为负，则在二进制的高位补1

3）>>> ，无符号右移位：将操作符左侧的操作数向右移动操作数右侧指定的位数。移动的对则是，无论被操作数的符号是正是负，都在二进制的高位补0.

网友评论

本文标题：simhash算法实现（源码）

本文链接：https://www.haomeiwen.com/subject/griauttx.html

延伸阅读

深度阅读

您也可以注册成为美文阅读网的作者，发表您的原创作品、分享您的心情！

simhash算法实现（源码）

相关文章

网友评论

延伸阅读

深度阅读

栏目导航

热点阅读